# Compare QTLs for tomato fruit shape and potato tuber shape

## Background

Tomato fruits have a round shape, whereas potato tubers have an elongated shape. The candidate gene _Solyc10g076180_ (_SlOFP20_, a member of the OVATE family) on chromosome 10 of the reference tomato genome (Heinz 1706) is responsible for round fruits. However, this gene does not have an ortholog in the reference potato genome (DM), which results in elongated tubers (<a href="https://dx.doi.org/10.1038%2Fs41467-018-07216-8">Wu et al., 2018</a>). This notebook uses the <a href="http://grlc.io/">grlc</a>-based Web API of the <a href="https://doi.org/10.5281/zenodo.1458168">pbg-ld</a> platform to map QTL regions for the traits in both tomato and potato, and to retrieve annotations for the genes in the QTLs.

In [1]:
import io
import requests
import ipywidgets
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

try:
    from networkx.drawing.nx_pydot import graphviz_layout
except ImportError:
    raise ImportError("This example needs Graphviz and either PyGraphviz or Pydot")

from ipywidgets import interactive

In [2]:
# display-related options
pd.set_option('display.max_rows', 300)
pd.set_option('display.width', 0)
# Do not truncate cell contents. pandas >= 1.0 spells "no limit" as None and
# deprecates the legacy sentinel -1; older releases only accept an integer,
# so -1 is kept as a fallback for them.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

In [3]:
# output files: one CSV per result table, keyed by the table's name
outfiles = {table: table + '.csv'
            for table in ('paralogs', 'orthologs', 'genes_annot', 'homologs_annot')}

In [4]:
# Web API
base_url = None  # placeholder; assigned once a host has been chosen
hosts = dict(
    remote='http://pbg-ld.candygene-nlesc.surf-hosted.nl:8088/api/candYgene/queries',
    local='http://localhost:8088/api/local/local',
)
headers = dict(accept='text/csv')  # request content in CSV

In [5]:
# create a widget with a drop-down list of hosts
def setWebAPI(host):
    """Print the selected host URL and return it unchanged.

    The returned value is what the interactive widget exposes as its result.
    """
    print(host)
    return host


w = interactive(setWebAPI, host=hosts)

In [6]:
# render the host drop-down so that a Web API endpoint can be picked
display(w) # default host 'remote'

interactive(children=(Dropdown(description='host', options={'remote': 'http://pbg-ld.candygene-nlesc.surf-host…

In [None]:
# Use the host picked in the drop-down. `w.result` may still be None when the
# widget has not been rendered or used (e.g. a headless "Run All"), which would
# make every later `base_url + '/...'` request fail; fall back to the default
# 'remote' host in that case.
base_url = w.result if w.result is not None else hosts['remote']

## Input genes

In [None]:
# tomato genes upstream/downstream of the candidate gene on chromosome 10
input_data = {
    'tomato': {
        'genes': ['Solyc10g075170.1', 'Solyc10g076240.1'],
        'taxon_id': 4081,
        'graph_iri': 'http://solgenomics.net/genome/Solanum_lycopersicum',
    },
}

In [None]:
# retrieve genomic locations for the genes
# Per-gene results are collected in a list and concatenated once at the end
# (DataFrame.append is gone in pandas >= 2.0 and is quadratic in a loop).
location_frames = []
for g in input_data['tomato']['genes']:
    if g is None:
        continue
    try:
        with requests.get(base_url + '/getFeatureLocation',
                          params={'featureid': "'%s'" % g},
                          headers=headers) as req:
            req.raise_for_status()  # do not parse an HTTP error page as CSV
            location_frames.append(pd.read_csv(io.StringIO(req.text)))
    except Exception as err:
        # Best effort: report the actual cause and stop querying. Catching
        # Exception (not a bare except) lets a kernel interrupt through.
        print('Failed to connect to the Web API!', repr(err))
        break
genes = pd.concat(location_frames, ignore_index=True) if location_frames else pd.DataFrame()

In [None]:
# show the genomic locations retrieved for the input genes
display(genes)

In [None]:
# compute the QTL interval given the start/end positions of the genes
# (smallest and largest coordinate over all begin/end positions)
pos = pd.concat([genes['begin_pos'], genes['end_pos']]).describe()
qtl_inter = {
    'chrom': genes['chrom'].unique()[0],  # first chromosome listed
    'taxon_id': int(genes['taxon_id'].unique()[0]),
    'begin': int(pos['min']),
    'end': int(pos['max']),
}

In [None]:
# show the computed QTL interval (chrom, taxon_id, begin, end)
display(qtl_inter)

In [None]:
# retrieve all genes in the genomic interval
genes_inter = pd.DataFrame()  # stays empty if the query fails
params = {'feature': 'protein_coding_gene',
          'chrom': qtl_inter['chrom'],
          'graph': input_data['tomato']['graph_iri'],
          'begin': qtl_inter['begin'],
          'end': qtl_inter['end']}
try:
    with requests.get(base_url + '/getFeaturesInInterval',
                      params=params,
                      headers=headers) as req:
        req.raise_for_status()  # do not parse an HTTP error page as CSV
        genes_inter = pd.read_csv(io.StringIO(req.text))
except Exception as err:
    # Best effort: report the actual cause. Catching Exception (not a bare
    # except) lets a kernel interrupt through.
    print('Failed to connect to the Web API!', repr(err))

In [None]:
# list the genes found in the interval, ordered by chromosome and start position
display(genes_inter.sort_values(['chrom', 'begin_pos']))

## Paralogs

In [None]:
# for each tomato gene retrieve its paralog(s)
# Each output row is: gene id/chrom/begin/end followed by the paralog's
# id/chrom/begin/end (all None when the gene has no paralog).
rows = []
for _, g in genes_inter.iterrows():
    gene_loc = [g['feature_id'], g['chrom'], g['begin_pos'], g['end_pos']]
    try:
        with requests.get(base_url + '/getParalogs',
                          params={'geneid': "'%s'" % g['feature_id']},
                          headers=headers) as req:
            req.raise_for_status()  # do not parse an HTTP error page as CSV
            df = pd.read_csv(io.StringIO(req.text))
        if df.size == 0:
            rows.append(gene_loc + [None] * 4)  # if paralog not found
            continue
        for paralog_id in df['paralog_id']:
            # look up where each paralog is located
            with requests.get(base_url + '/getFeatureLocation',
                              params={'featureid': "'%s'" % paralog_id},
                              headers=headers) as loc_req:
                loc_req.raise_for_status()
                locations = pd.read_csv(io.StringIO(loc_req.text))
            for _, o in locations.iterrows():
                rows.append(gene_loc + [o['feature_id'], o['chrom'], o['begin_pos'], o['end_pos']])
    except Exception as err:
        # Best effort: report the actual cause and stop querying. Catching
        # Exception (not a bare except) lets a kernel interrupt through.
        print('Failed to connect to the Web API!', repr(err))
        break

In [None]:
# create a dataframe from the query results
cols = ['gene_id', 'chrom', 'begin', 'end', 'para_id', 'para_chrom', 'para_begin', 'para_end']
paralogs = pd.DataFrame(rows, columns=cols)
# Missing positions are temporarily filled with -1 so the columns can be cast
# to integers; the placeholder is turned back into None afterwards.
for pos_col in ('para_begin', 'para_end'):
    paralogs[pos_col] = paralogs[pos_col].fillna(-1).astype(np.int64)
paralogs = paralogs.replace([-1], [None])
paralogs = paralogs.sort_values(['gene_id', 'para_chrom', 'para_begin'])

In [None]:
# show the gene/paralog table
display(paralogs)

In [None]:
# write the paralog table to the CSV file named in `outfiles`,
# using gene_id as the index column
paralogs.set_index('gene_id').to_csv(outfiles['paralogs'])

In [None]:
# count paralogs per gene (missing para_id values are not counted),
# listing the genes with the most paralogs first
n_paralogs = paralogs.groupby(['gene_id'])['para_id'].agg(['count'])
n_paralogs = n_paralogs.sort_values(['count'], ascending=False)
n_paralogs = n_paralogs.rename(columns={'count': 'n_paralogs'})
display(n_paralogs.reset_index())

### Note: The gene _Solyc10g076170_ does not have a paralog in tomato.

In [None]:
# create a new dataframe with all genes in the interval including their paralogs
# paralog rows are renamed to the same gene_id/chrom/begin/end layout as the genes
para_genes = (paralogs[['para_id', 'para_chrom', 'para_begin', 'para_end']]
              .dropna(subset=['para_id'])  # skip the "no paralog found" rows
              .rename(columns=dict(para_id='gene_id', para_chrom='chrom',
                                   para_begin='begin', para_end='end')))
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
genes = pd.concat([para_genes, paralogs[['gene_id', 'chrom', 'begin', 'end']]]).drop_duplicates()

In [None]:
# show the combined gene set ordered by gene id and position, with a fresh row index
display(genes.sort_values(['gene_id', 'chrom', 'begin']).reset_index(drop=True))

## Orthologs

In [None]:
# for each tomato gene retrieve its ortholog(s) in potato
# Each output row is: gene id/chrom/begin/end followed by the ortholog's
# id/chrom/begin/end (all None when the gene has no ortholog).
rows = []
for _, g in genes.iterrows():
    gene_loc = [g['gene_id'], g['chrom'], g['begin'], g['end']]
    try:
        with requests.get(base_url + '/getOrthologs',
                          params={'geneid': "'%s'" % g['gene_id']},
                          headers=headers) as req:
            req.raise_for_status()  # do not parse an HTTP error page as CSV
            df = pd.read_csv(io.StringIO(req.text))
        if df.size == 0:
            rows.append(gene_loc + [None] * 4)  # if ortholog not found
            continue
        for ortholog_id in df['ortholog_id']:
            # look up where each ortholog is located
            with requests.get(base_url + '/getFeatureLocation',
                              params={'featureid': "'%s'" % ortholog_id},
                              headers=headers) as loc_req:
                loc_req.raise_for_status()
                locations = pd.read_csv(io.StringIO(loc_req.text))
            for _, o in locations.iterrows():
                rows.append(gene_loc + [o['feature_id'], o['chrom'], o['begin_pos'], o['end_pos']])
    except Exception as err:
        # Best effort: report the actual cause and stop querying. Catching
        # Exception (not a bare except) lets a kernel interrupt through.
        print('Failed to connect to the Web API!', repr(err))
        break

In [None]:
# create a dataframe from the query results
cols = ['gene_id', 'chrom', 'begin', 'end', 'ortho_id', 'ortho_chrom', 'ortho_begin', 'ortho_end']
orthologs = pd.DataFrame(rows, columns=cols)
# Missing positions are temporarily filled with -1 so the columns can be cast
# to integers; the placeholder is turned back into None afterwards.
for pos_col in ('ortho_begin', 'ortho_end'):
    orthologs[pos_col] = orthologs[pos_col].fillna(-1).astype(np.int64)
orthologs = orthologs.replace([-1], [None])
orthologs = orthologs.sort_values(['gene_id', 'ortho_chrom', 'ortho_begin'])

In [None]:
# show the gene/ortholog table with a fresh row index
display(orthologs.reset_index(drop=True))

In [None]:
# write the ortholog table to the CSV file named in `outfiles`,
# using gene_id as the index column
orthologs.set_index('gene_id').to_csv(outfiles['orthologs'])

In [None]:
# count orthologs per gene (missing ortho_id values are not counted),
# listing the genes with the most orthologs first
n_orthologs = orthologs.groupby(['gene_id'])['ortho_id'].agg(['count'])
n_orthologs = n_orthologs.sort_values(['count'], ascending=False)
n_orthologs = n_orthologs.rename(columns={'count': 'n_orthologs'})
display(n_orthologs.reset_index())

### Note: Some tomato genes on chromosome 10, including the genes _Solyc10g076170_ and _Solyc10g076240_ surrounding the candidate gene  _Solyc10g076180_ , do not have an ortholog in potato.

## Gene annotations

In [None]:
# append potato orthologs to the tomato gene set (dataframe)
ortho_genes = (orthologs[['ortho_id', 'ortho_chrom', 'ortho_begin', 'ortho_end']]
               .dropna(subset=['ortho_id'])  # skip the "no ortholog found" rows
               .rename(columns=dict(ortho_id='gene_id', ortho_chrom='chrom',
                                    ortho_begin='begin', ortho_end='end')))
# pd.concat replaces DataFrame.append (removed in pandas 2.0). Duplicates are
# dropped so that an ortholog shared by several tomato genes is listed once and
# re-running this cell does not grow `genes` again.
genes = pd.concat([genes, ortho_genes]).drop_duplicates()

In [None]:
# for each gene retrieve annotations from SGN, Ensembl Plants and UniProt/GOA
# Each output row is: gene id/chrom/begin/end followed by the annotation
# fields below (all None when no annotation is returned for the gene).
annot_fields = ['gene_name', 'transcript_id', 'uniprot_acc', 'uniprot_reviewed',
                'uniprot_des', 'uniprot_goa', 'sgn_des']
rows = []
for _, g in genes.iterrows():
    gene_loc = [g['gene_id'], g['chrom'], g['begin'], g['end']]
    try:
        with requests.get(base_url + '/getGeneAnnotations',
                          params={'geneid': "'%s'" % g['gene_id']},
                          headers=headers) as req:
            req.raise_for_status()  # do not parse an HTTP error page as CSV
            df = pd.read_csv(io.StringIO(req.text))
        if df.size == 0:
            rows.append(gene_loc + [None] * len(annot_fields))  # if annotations not found
            continue
        for _, h in df.iterrows():
            rows.append(gene_loc + [h[field] for field in annot_fields])
    except Exception as err:
        # Best effort: report the actual cause and stop querying. Catching
        # Exception (not a bare except) lets a kernel interrupt through.
        print('Failed to connect to the Web API!', repr(err))
        break

In [None]:
# create a dataframe from the query results, ordered by gene id and position
cols = ['gene_id', 'chrom', 'begin', 'end',
        'gene_name', 'transcript_id', 'uniprot_acc', 'uniprot_reviewed',
        'uniprot_des', 'uniprot_goa', 'sgn_des']
genes_annot = pd.DataFrame(rows, columns=cols)
genes_annot = genes_annot.sort_values(['gene_id', 'chrom', 'begin'])

In [None]:
# show the annotation table with a fresh row index
display(genes_annot.reset_index(drop=True))

In [None]:
# write the annotation table to the CSV file named in `outfiles`,
# using gene_id as the index column
genes_annot.set_index('gene_id').to_csv(outfiles['genes_annot'])

## Visualize _Solyc10g076180_ homologs and their relationships in a graph

In [None]:
# get the initial set of paralogous relations of the tomato gene
gene_id = 'Solyc10g076180.1'
is_target_gene = paralogs['gene_id'] == gene_id
edges_para = paralogs.loc[is_target_gene, ['gene_id', 'para_id']]
display(edges_para.reset_index(drop=True))

In [None]:
# get orthologous relations of the tomato genes (homologs)
# start from the target gene plus the paralogs found for it
gene_ids = {gene_id}.union(edges_para['para_id'])
has_known_gene = orthologs['gene_id'].isin(gene_ids)
edges_ortho = orthologs.loc[has_known_gene, ['gene_id', 'ortho_id']]
display(edges_ortho.reset_index(drop=True))

In [None]:
# get an extended set of paralogous relations of the tomato/potato genes
gene_ids |= set(edges_ortho['ortho_id'])
# Drop every missing id (None or NaN), whether it came from the ortholog or
# the paralog column, so that no 'None'/'nan' gene is sent to the Web API.
gene_ids = {g for g in gene_ids if pd.notnull(g)}
edge_frames = [edges_para]
for g in sorted(gene_ids):  # sorted for a reproducible query order
    try:
        with requests.get(base_url + '/getParalogs',
                          params={'geneid': "'%s'" % g},
                          headers=headers) as req:
            req.raise_for_status()  # do not parse an HTTP error page as CSV
            # NOTE(review): assumes the response carries 'gene_id' and
            # 'paralog_id' columns - confirm against the getParalogs query.
            df = pd.read_csv(io.StringIO(req.text)).rename(columns={'paralog_id': 'para_id'})
        if df.size == 0:
            df = pd.DataFrame([{'gene_id': g, 'para_id': None}])  # gene without paralog
        edge_frames.append(df)
    except Exception as err:
        # Best effort: report the actual cause and stop querying. Catching
        # Exception (not a bare except) lets a kernel interrupt through.
        print('Failed to connect to the Web API!', repr(err))
        break
# One concat instead of DataFrame.append in a loop (removed in pandas 2.0).
# Relations already present in the initial set are fetched again above, so
# duplicates are dropped.
edges_para = pd.concat(edge_frames, ignore_index=True).drop_duplicates()

In [None]:
# show the paralogous relations ordered by gene id, with a fresh row index
display(edges_para.sort_values(['gene_id']).reset_index(drop=True))

In [None]:
# plot a graph of tomato/potato genes (nodes) and ortho/paralogous relations (edges)
# Only complete (gene, partner) pairs become edges: rows whose partner id is
# missing would otherwise add a None/NaN node and break the colouring below.
para_links = edges_para[['gene_id', 'para_id']].dropna()
ortho_links = edges_ortho[['gene_id', 'ortho_id']].dropna()
G = nx.Graph()
G.add_nodes_from(edges_para['gene_id'].dropna())  # keep genes without any paralog visible
G.add_edges_from(para_links.values, style='-')    # add paralogous relations (solid edges)
G.add_edges_from(ortho_links.values, style='-.')  # add orthologous relations (dash-dot edges)
style_map = [G[u][v]['style'] for u, v in G.edges()]
# tomato genes (ids containing 'Solyc') in red, all other (potato) genes in yellow
color_map = ['r' if 'Solyc' in n else 'y' for n in G.nodes()]
pos = graphviz_layout(G, prog='neato')
plt.margins(0.2, 0.05)
nx.draw(G, pos, node_size=100, node_color=color_map, style=style_map, width=0.2, font_size=4, with_labels=True)
plt.savefig('homologs_graph.svg')

In [None]:
# show genes annotations for the graph nodes
is_graph_node = genes_annot['gene_id'].isin(gene_ids)
homologs_annot = genes_annot[is_graph_node]
homologs_annot = homologs_annot.sort_values(['gene_id', 'chrom', 'begin'])
display(homologs_annot.reset_index(drop=True))

In [None]:
# write the homolog annotation table to the CSV file named in `outfiles`,
# using gene_id as the index column
homologs_annot.set_index('gene_id').to_csv(outfiles['homologs_annot'])