# Compare QTLs for tomato fruit shape and potato tuber shape

## Background

Tomato fruits have a round shape while potato tubers have an elongated shape. The candidate gene _Solyc10g076180_ (_SlOFP20_, a member of the OVATE Family Protein [OFP] family) on chromosome 10 of the reference tomato genome (Heinz 1706) is responsible for round fruits. However, this gene does not have an ortholog in the reference potato genome (DM), which results in elongated tubers (<a href="https://dx.doi.org/10.1038%2Fs41467-018-07216-8">Wu et al., 2018</a>). This notebook uses the <a href="http://grlc.io/">grlc</a>-based Web API of the <a href="https://doi.org/10.5281/zenodo.1458168">pbg-ld</a> platform to map QTL regions for the traits in both tomato and potato as well as to retrieve annotations for the genes in the QTLs.

In [1]:
import io
import requests
import ipywidgets
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

try:
    from networkx.drawing.nx_pydot import graphviz_layout
except ImportError:
    raise ImportError("This example needs Graphviz and either PyGraphviz or Pydot")

from ipywidgets import interactive

ModuleNotFoundError: No module named 'networkx'

In [None]:
# display-related options
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_columns', 20)
# None = never truncate cell contents; the former sentinel -1 was deprecated in
# pandas 1.0 and is rejected by current releases
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', 0)

In [None]:
# Web API
# endpoint prefix used by every request below; set later from the host selector
base_url = None
# selectable deployments of the Web API, keyed by the label shown in the selector
hosts = dict(
    remote='http://pbg-ld.candygene-nlesc.surf-hosted.nl:8088/api/candYgene/queries',
    local='http://localhost:8088/api/local/local',
)
# request content in CSV
headers = dict(accept='text/csv')

In [None]:
# create a widget with a drop-down list of hosts
def setWebAPI(host):
    """Echo the selected Web API base URL and return it unchanged.

    Serves as the callback of the `interactive` widget created right after
    this function; the notebook later reads the value back via `w.result`.
    """
    print(host)
    return host
# the `hosts` mapping supplies the selectable entries (label -> base URL)
w = interactive(setWebAPI, host=hosts)

In [None]:
# show the host selector widget
display(w) # default host 'remote'

In [None]:
# Take the base URL picked in the host selector. `w.result` is still None when
# the widget callback has not run (e.g. a non-interactive "Run All"), which
# would make every `base_url + '/...'` below fail, so fall back to the
# default host 'remote'.
base_url = w.result or hosts['remote']

In [None]:
# The fine-mapped locus sov1 (suppressor of ovate) on chr10 spans three genes:
# Solyc10g076170, Solyc10g076180 (SlOFP20) and Solyc10g076190.
# Only the two 'flanking' genes are used as input.
input_data = {
    'genes': ['Solyc10g076170', 'Solyc10g076190'],
    'taxon_id': 4081,
    'graph_iri': 'http://solgenomics.net/genome/Solanum_lycopersicum',
}

In [None]:
# retrieve genomic locations for the genes
feature = 'protein_coding_gene'
# one table per queried gene; combined once after the loop
# (DataFrame.append was removed in pandas 2.0 and re-copies the frame on every call)
gene_frames = []
try:
    for g in input_data['genes']:
        if g is None:
            continue
        with requests.get(base_url + '/getFeatureLocation',
                          params={'featureid': "'%s*'" % g}, # append wildcard '*'
                          headers=headers) as req:
            # fail on HTTP errors instead of parsing an error page as CSV
            req.raise_for_status()
            # keep only the rows describing the requested feature type
            gene_frames.append(pd.read_csv(io.StringIO(req.text)).query('feature_name==@feature'))
except Exception as err:
    # still best-effort, but report the cause and let KeyboardInterrupt through
    print('Failed fetch data from Web API!', repr(err))
# keep whatever was fetched before a failure; empty table if nothing was
genes = pd.concat(gene_frames, ignore_index=True) if gene_frames else pd.DataFrame()

In [None]:
# show the locations retrieved for the 'flanking' genes
display(genes)

In [None]:
# derive the QTL interval from the 'flanking' genes: pool all of their
# begin/end coordinates and summarise them (min/max are read from `pos` below)
gene_coords = pd.concat([genes[col] for col in ('begin_pos', 'end_pos')])
pos = gene_coords.describe()

In [None]:
# QTL interval: first chromosome / taxon reported for the flanking genes,
# bounded by the smallest and largest of their coordinates
qtl_inter = {
    'chrom': genes['chrom'].unique()[0],
    'taxon_id': genes['taxon_id'].unique()[0],
    'begin': int(pos['min']),
    'end': int(pos['max']),
}

In [None]:
# show the computed interval (chrom, taxon_id, begin, end)
display(qtl_inter)

In [None]:
# retrieve genes in the genomic interval
genes_inter = pd.DataFrame()  # stays empty if the request fails
params = {'feature': feature,
          'chrom': qtl_inter['chrom'],
          'graph': input_data['graph_iri'],
          'begin': qtl_inter['begin'],
          'end': qtl_inter['end']}
try:
    with requests.get(base_url + '/getFeaturesInInterval',
                      params=params,
                      headers=headers) as req:
        # fail on HTTP errors instead of parsing an error page as CSV
        req.raise_for_status()
        genes_inter = pd.read_csv(io.StringIO(req.text))
except Exception as err:
    # still best-effort, but report the cause and let KeyboardInterrupt through
    print('Failed fetch data from Web API!', repr(err))

In [None]:
# show the genes of the interval ordered by chromosome and start position
display(genes_inter.sort_values(['chrom', 'begin_pos']))

In [None]:
# for each gene retrieve annotations from SGN, Ensembl Plants and UniProt/GOA
# one table per gene; combined once after the loop
# (DataFrame.append was removed in pandas 2.0 and re-copies the frame on every call)
interval_annot_frames = []
try:
    for g in genes_inter['feature_id']:
        with requests.get(base_url + '/getGeneAnnotations',
                          params={'geneid': "'%s'" % g},
                          headers=headers) as req:
            # fail on HTTP errors instead of parsing an error page as CSV
            req.raise_for_status()
            interval_annot_frames.append(pd.read_csv(io.StringIO(req.text)))
except Exception as err:
    # still best-effort, but report the cause and let KeyboardInterrupt through
    print('Failed fetch data from Web API!', repr(err))
# keep whatever was fetched before a failure; missing values become '' in the combined table
genes_annot = (pd.concat(interval_annot_frames, ignore_index=True).fillna('')
               if interval_annot_frames else pd.DataFrame())

In [None]:
# show the annotations collected for the genes in the interval
display(genes_annot)

In [None]:
# get paralogs of the tomato candidate gene
gene_id = 'Solyc10g076180.1'
# edge tables of the homology graph: paralogous and orthologous gene pairs
edges = dict(para=pd.DataFrame(),
             ortho=pd.DataFrame())
try:
    with requests.get(base_url + '/getParalogs',
                      params={'geneid': "'%s'" % gene_id},
                      headers=headers) as req:
        # fail on HTTP errors instead of parsing an error page as CSV
        req.raise_for_status()
        edges['para'] = pd.read_csv(io.StringIO(req.text))
except Exception as err:
    # still best-effort, but report the cause and let KeyboardInterrupt through
    print('Failed fetch data from Web API!', repr(err))

In [None]:
# show the paralogs found for the candidate gene
display(edges['para'])

In [None]:
# get potato orthologs of the tomato homologs
# genes to look up: the candidate gene plus its paralogs (none if the paralog
# query failed); sorted so the request order and row order are reproducible
paralog_ids = edges['para'].get('paralog_id', pd.Series(dtype=object)).dropna()
query_genes = sorted(set(paralog_ids) | {gene_id})
# one table per gene; combined once after the loop
# (DataFrame.append was removed in pandas 2.0 and re-copies the frame on every call)
ortho_frames = []
try:
    for g in query_genes:
        with requests.get(base_url + '/getOrthologs',
                          params={'geneid': "'%s'" % g},
                          headers=headers) as req:
            # fail on HTTP errors instead of parsing an error page as CSV
            req.raise_for_status()
            df = pd.read_csv(io.StringIO(req.text))
        if df.empty:
            # keep genes without any ortholog as a placeholder row so that
            # they still show up (with a count of 0) in the per-gene summary
            df = pd.DataFrame([{'gene_id': g, 'ortholog_id': None}])
        ortho_frames.append(df)
except Exception as err:
    # still best-effort, but report the cause and let KeyboardInterrupt through
    print('Failed fetch data from Web API!', repr(err))
# rebuild the table from scratch so that re-running this cell does not
# accumulate duplicate rows; keep whatever was fetched before a failure
edges['ortho'] = pd.concat(ortho_frames, ignore_index=True) if ortho_frames else pd.DataFrame()

In [None]:
# show the gene/ortholog pairs (genes without an ortholog have no ortholog_id)
display(edges['ortho'])

In [None]:
# write results into a CSV file ('orthologs.csv', indexed by gene_id)
edges['ortho'].set_index('gene_id').to_csv('orthologs.csv')

In [None]:
# number of orthologs per gene (missing ortholog_id values are not counted),
# ordered by descending count and, within ties, descending gene_id
ortholog_counts = (
    edges['ortho']
    .groupby(['gene_id'])['ortholog_id']
    .agg(['count'])
    .sort_values(['count', 'gene_id'], ascending=False)
)
display(ortholog_counts.rename(columns={'count': 'n_orthologs'}).reset_index())

In [None]:
# drop the placeholder rows of genes without an ortholog; dropna() is used
# instead of a method call inside query(), whose support depends on the
# expression engine in use
edges['ortho'] = edges['ortho'].dropna(subset=['ortholog_id'])

### N.B.: The candidate gene _Solyc10g076180_ does not have an ortholog in potato.

In [None]:
# get additional paralogs of the tomato/potato homologs
# start from the paralogs already known and add one table per queried gene;
# combined once after the loop (DataFrame.append was removed in pandas 2.0)
para_frames = [edges['para']]
try:
    # sorted so the request order and row order are reproducible
    homolog_ids = sorted(set(edges['para']['paralog_id'].dropna())
                         | set(edges['ortho']['ortholog_id'].dropna()))
    for g in homolog_ids:
        with requests.get(base_url + '/getParalogs',
                          params={'geneid': "'%s'" % g},
                          headers=headers) as req:
            # fail on HTTP errors instead of parsing an error page as CSV
            req.raise_for_status()
            para_frames.append(pd.read_csv(io.StringIO(req.text)))
except Exception as err:
    # still best-effort, but report the cause and let KeyboardInterrupt through
    print('Failed fetch data from Web API!', repr(err))
# identical rows are dropped so that re-running this cell does not inflate the
# table (and the per-gene paralog counts derived from it)
edges['para'] = (pd.concat(para_frames, ignore_index=True)
                 .drop_duplicates()
                 .reset_index(drop=True))

In [None]:
# show the extended paralog table
display(edges['para'])

In [None]:
# write results into a CSV file ('paralogs.csv', rows sorted and indexed by gene_id)
edges['para'].sort_values(['gene_id']).set_index('gene_id').to_csv('paralogs.csv')

In [None]:
# number of paralogs per gene, ordered by descending count and, within ties,
# descending gene_id
paralog_counts = (
    edges['para']
    .groupby(['gene_id'])['paralog_id']
    .agg(['count'])
    .sort_values(['count', 'gene_id'], ascending=False)
)
display(paralog_counts.rename(columns={'count': 'n_paralogs'}).reset_index())

In [None]:
# visualize tomato/potato genes (nodes) and their ortho/paralogous relations (edges) in a graph
G = nx.Graph()
G.add_edges_from(edges['para'].values, style='-')   # add paralogous relations (solid edges)
G.add_edges_from(edges['ortho'].values, style='-.') # add orthologous relations (dash-dotted edges)
# per-edge line style, in the same order in which the edges are drawn
style_map = [G[u][v]['style'] for u, v in G.edges()]
# tomato genes (IDs containing 'Solyc') in red, all other (potato) genes in yellow
color_map = ['r' if 'Solyc' in n else 'y' for n in G.nodes()]
# named `layout` rather than `pos` so that the QTL coordinate summary `pos`
# computed earlier in the notebook is not overwritten by the node positions
layout = graphviz_layout(G, prog='neato')
plt.margins(0.2, 0.05)
nx.draw(G, layout, node_size=100, node_color=color_map, style=style_map, width=0.2, font_size=4, with_labels=True)
plt.savefig('homologs_graph.svg')

In [None]:
# for each gene in the graph retrieve its genomic location as well as its
# annotations from SGN, Ensembl Plants and UniProt/GOA
# one table per gene and query; combined once after the loop
# (DataFrame.append was removed in pandas 2.0 and re-copies the frame on every call)
homolog_loc_frames = []
homolog_annot_frames = []
try:
    for g in G.nodes():
        with requests.get(base_url + '/getFeatureLocation',
                          params={'featureid': "'%s'" % g},
                          headers=headers) as req:
            # fail on HTTP errors instead of parsing an error page as CSV
            req.raise_for_status()
            homolog_loc_frames.append(pd.read_csv(io.StringIO(req.text)))

        with requests.get(base_url + '/getGeneAnnotations',
                          params={'geneid': "'%s'" % g},
                          headers=headers) as req:
            req.raise_for_status()
            # missing annotation values are shown as empty strings
            homolog_annot_frames.append(pd.read_csv(io.StringIO(req.text)).fillna(''))
except Exception as err:
    # still best-effort, but report the cause and let KeyboardInterrupt through
    print('Failed fetch data from Web API!', repr(err))
# keep whatever was fetched before a failure; empty tables if nothing was
genes_loc = (pd.concat(homolog_loc_frames, ignore_index=True)
             if homolog_loc_frames else pd.DataFrame())
genes_annot = (pd.concat(homolog_annot_frames, ignore_index=True)
               if homolog_annot_frames else pd.DataFrame())

In [None]:
# inner join of the location and annotation tables on gene_id
# NOTE(review): `genes_annot` is both an input and the output here, so this
# cell is meant to run once after the retrieval cell that fills both tables
gene_locations = (genes_loc
                  .rename(columns={'feature_id': 'gene_id'})
                  .drop(['feature_name'], axis=1))
genes_annot = (gene_locations
               .merge(genes_annot, on='gene_id')
               .sort_values(['gene_id', 'chrom', 'begin_pos']))

In [None]:
# show the merged table with a fresh consecutive row index
display(genes_annot.reset_index(drop=True))

In [None]:
# write results into a CSV file ('homologs_annot.csv', indexed by gene_id)
genes_annot.set_index('gene_id').to_csv('homologs_annot.csv')