# Compare QTLs for tomato fruit shape and potato tuber shape

## Background

Tomato fruits have a round shape while the potato tubers have an elongated shape. The candidate gene _Solyc10g076180_ (_SlOFP20_, a member of the OVATE family) on the chromosome 10 of the reference tomato genome (Heinz 1706) is responsible for round fruits. However, this gene does not have an ortholog in the reference potato genome (DM), which results in elongated tuber (<a href="https://dx.doi.org/10.1038%2Fs41467-018-07216-8">Wu et al., 2018</a>). This notebook uses the <a href="http://grlc.io/">grlc</a>-based Web API of the <a href="https://doi.org/10.5281/zenodo.1458168">pbg-ld</a> platform to map QTL regions for the traits in both tomato and potato as well as to retrieve annotations for the genes in the QTLs.

### Initialization

In [1]:
import requests
import io
import pandas as pd
import numpy as np

In [27]:
pd.set_option('display.max_rows', 100)

In [2]:
url = dict(local='http://localhost:8088/api/local/local',
           remote='http://pbg-ld.candygene-nlesc.surf-hosted.nl:8088/api/candYgene/queries')
base_url = url['remote']
headers = {'accept': 'text/csv'} # request CSV output via Web API
#print(base_url)

In [3]:
# input genes 'flanking' the candidate gene in tomato
input_data = dict(tomato=dict(genes=['Solyc10g075170.1', 'Solyc10g076240.1'],
                              taxon_id=4081,
                              graph_iri='http://solgenomics.net/genome/Solanum_lycopersicum'))

In [4]:
# retrieve genomic locations for the genes
genes = pd.DataFrame()
for g in input_data['tomato']['genes']:
    try:
        if g is not None:
            with requests.get(base_url + '/getFeatureLocation',
                              params = {'featureid': "'%s'" % g},
                              headers = headers) as req:
                genes = genes.append(pd.read_csv(io.StringIO(req.text)), ignore_index=True)
    except:
        print('Failed to connect to the Web API!')
        break

In [5]:
genes.set_index('feature_id')

Unnamed: 0_level_0,feature_name,chrom,begin_pos,end_pos,taxon_id
feature_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Solyc10g075170.1,protein_coding_gene,chromosome 10,58891402,58895882,4081
Solyc10g076240.1,protein_coding_gene,chromosome 10,59082990,59084119,4081


In [6]:
# compute the QTL interval given the start/end positions of the genes
pos = pd.concat([genes['begin_pos'], genes['end_pos']]).describe()
qtl_interval = dict(chrom=genes['chrom'].unique()[0],
                    taxon_id=int(genes['taxon_id'].unique()[0]),
                    begin=int(pos['min']),
                    end=int(pos['max']))

In [7]:
qtl_interval

{'chrom': 'chromosome 10',
 'taxon_id': 4081,
 'begin': 58891402,
 'end': 59084119}

In [8]:
# retrieve all genes in the genomic interval
all_genes = pd.DataFrame()
params = {'feature': 'protein_coding_gene',
          'chrom': qtl_interval['chrom'],
          'graph': input_data['tomato']['graph_iri'],
          'begin': qtl_interval['begin'],
          'end': qtl_interval['end']}
try:
    with requests.get(base_url + '/getFeaturesInInterval',
                      params=params,
                      headers=headers) as req:
        all_genes = pd.read_csv(io.StringIO(req.text))
except:
    print('Failed to connect to the Web API!')

In [9]:
all_genes.set_index('feature_id')

Unnamed: 0_level_0,feature_name,chrom,begin_pos,end_pos
feature_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Solyc10g075170.1,protein_coding_gene,chromosome 10,58891402,58895882
Solyc10g076170.1,protein_coding_gene,chromosome 10,58981351,58981887
Solyc10g076180.1,protein_coding_gene,chromosome 10,59006329,59007294
Solyc10g076190.1,protein_coding_gene,chromosome 10,59045991,59047132
Solyc10g076200.1,protein_coding_gene,chromosome 10,59051770,59052216
Solyc10g076210.1,protein_coding_gene,chromosome 10,59054478,59055612
Solyc10g076220.1,protein_coding_gene,chromosome 10,59059828,59060961
Solyc10g076230.1,protein_coding_gene,chromosome 10,59074041,59074647
Solyc10g076240.1,protein_coding_gene,chromosome 10,59082990,59084119


In [10]:
# for each tomato gene retrieve its orthologous gene (s) in potato
rows = []
for index, g in all_genes.iterrows():
    try:
        with requests.get(base_url + '/getOrthologs',
                          params={'geneid': "'%s'" % g['feature_id']},
                          headers=headers) as req:
            df = pd.read_csv(io.StringIO(req.text))
            if df.size == 0:
                rows.append([g['feature_id'], g['chrom'], g['begin_pos'], g['end_pos'],
                             None, None, None, None]) # ortholog not found
                continue
            for index, j in df.iterrows():
                with requests.get(base_url + '/getFeatureLocation',
                    params={'featureid': "'%s'" % j['ortholog_id']},
                    headers=headers) as req:
                    for index, o in pd.read_csv(io.StringIO(req.text)).iterrows():
                        rows.append([g['feature_id'], g['chrom'], g['begin_pos'], g['end_pos'],
                                     o['feature_id'], o['chrom'], o['begin_pos'], o['end_pos']])
  
    except:
        print('Failed to connect to the Web API!')
        break

In [36]:
cols = ['gene_id', 'chrom', 'begin', 'end', 'ortho_id', 'ortho_chrom', 'ortho_begin', 'ortho_end']
orthologs = pd.DataFrame(rows, columns=cols)
orthologs['ortho_begin'] = orthologs['ortho_begin'].fillna(-1)
orthologs['ortho_end'] = orthologs['ortho_end'].fillna(-1)
orthologs['ortho_begin'] = orthologs['ortho_begin'].astype(np.int64)
orthologs['ortho_end'] = orthologs['ortho_end'].astype(np.int64)
orthologs = orthologs.replace([-1], [None])
orthologs.set_index('gene_id').sort_values(['gene_id', 'ortho_chrom', 'ortho_begin'])

Unnamed: 0_level_0,chrom,begin,end,ortho_id,ortho_chrom,ortho_begin,ortho_end
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Solyc10g075170.1,chromosome 10,58891402,58895882,Solyc07g032740.2,chromosome 7,40981937.0,40987085.0
Solyc10g075170.1,chromosome 10,58891402,58895882,Solyc07g055210.2,chromosome 7,63320669.0,63325039.0
Solyc10g075170.1,chromosome 10,58891402,58895882,Solyc08g041870.2,chromosome 8,25450517.0,25456311.0
Solyc10g075170.1,chromosome 10,58891402,58895882,Solyc08g068330.2,chromosome 8,57428127.0,57436252.0
Solyc10g076170.1,chromosome 10,58981351,58981887,,,,
Solyc10g076180.1,chromosome 10,59006329,59007294,Solyc01g007800.2,chromosome 1,1955327.0,1956217.0
Solyc10g076180.1,chromosome 10,59006329,59007294,Solyc10g082050.1,chromosome 10,62944463.0,62944999.0
Solyc10g076180.1,chromosome 10,59006329,59007294,Solyc11g068780.1,chromosome 11,53412570.0,53412983.0
Solyc10g076180.1,chromosome 10,59006329,59007294,Solyc02g072030.1,chromosome 2,41330364.0,41331497.0
Solyc10g076180.1,chromosome 10,59006329,59007294,Solyc02g085500.2,chromosome 2,48367481.0,48369215.0


In [12]:
# count orthologs per gene
display(orthologs.groupby(['gene_id'])['ortho_id']
        .agg(['count'])
        .sort_values(['count'], ascending=False)
        .rename(columns={'count':'n_orthologs'}))

Unnamed: 0_level_0,n_ortho
gene_id,Unnamed: 1_level_1
Solyc10g076200.1,2
Solyc10g076210.1,2
Solyc10g076220.1,2
Solyc10g075170.1,1
Solyc10g076190.1,1
Solyc10g076230.1,1
Solyc10g076170.1,0
Solyc10g076180.1,0
Solyc10g076240.1,0


### Note that neither the candidate gene _Solyc10g076180_ (_SlOFP20_) nor the flanking genes _Solyc10g076170_ and _Solyc10g076240_ have orthologs in potato.

In [19]:
# for each tomato gene retrieve its paralogous gene(s)
rows = []
for index, g in all_genes.iterrows():
    try:
        with requests.get(base_url + '/getParalogs',
                          params={'geneid': "'%s'" % g['feature_id']},
                          headers=headers) as req:
            df = pd.read_csv(io.StringIO(req.text))
            if df.size == 0:
                rows.append([g['feature_id'], g['chrom'], g['begin_pos'], g['end_pos'],
                             None, None, None, None]) # paralog not found
                continue
            for index, j in df.iterrows():
                with requests.get(base_url + '/getFeatureLocation',
                    params={'featureid': "'%s'" % j['paralog_id']},
                    headers=headers) as req:
                    for index, o in pd.read_csv(io.StringIO(req.text)).iterrows():
                        rows.append([g['feature_id'], g['chrom'], g['begin_pos'], g['end_pos'],
                                     o['feature_id'], o['chrom'], o['begin_pos'], o['end_pos']])
  
    except:
        print('Failed to connect to the Web API!')
        break

In [37]:
cols = ['gene_id', 'chrom', 'begin', 'end', 'para_id', 'para_chrom', 'para_begin', 'para_end']
paralogs = pd.DataFrame(rows, columns=cols)
paralogs['para_begin'] = paralogs['para_begin'].fillna(-1)
paralogs['para_end'] = paralogs['para_end'].fillna(-1)
paralogs['para_begin'] = paralogs['para_begin'].astype(np.int64)
paralogs['para_end'] = paralogs['para_end'].astype(np.int64)
paralogs = paralogs.replace([-1], [None])
paralogs.set_index('gene_id').sort_values(['gene_id', 'para_chrom', 'para_begin'])

Unnamed: 0_level_0,chrom,begin,end,para_id,para_chrom,para_begin,para_end
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Solyc10g075170.1,chromosome 10,58891402,58895882,Solyc07g032740.2,chromosome 7,40981937.0,40987085.0
Solyc10g075170.1,chromosome 10,58891402,58895882,Solyc07g055210.2,chromosome 7,63320669.0,63325039.0
Solyc10g075170.1,chromosome 10,58891402,58895882,Solyc08g041870.2,chromosome 8,25450517.0,25456311.0
Solyc10g075170.1,chromosome 10,58891402,58895882,Solyc08g068330.2,chromosome 8,57428127.0,57436252.0
Solyc10g076170.1,chromosome 10,58981351,58981887,,,,
Solyc10g076180.1,chromosome 10,59006329,59007294,Solyc01g007800.2,chromosome 1,1955327.0,1956217.0
Solyc10g076180.1,chromosome 10,59006329,59007294,Solyc10g082050.1,chromosome 10,62944463.0,62944999.0
Solyc10g076180.1,chromosome 10,59006329,59007294,Solyc11g068780.1,chromosome 11,53412570.0,53412983.0
Solyc10g076180.1,chromosome 10,59006329,59007294,Solyc02g072030.1,chromosome 2,41330364.0,41331497.0
Solyc10g076180.1,chromosome 10,59006329,59007294,Solyc02g085500.2,chromosome 2,48367481.0,48369215.0


In [38]:
# count paralogs per gene
display(paralogs.groupby(['gene_id'])['para_id']
        .agg(['count'])
        .sort_values(['count'], ascending=False)
        .rename(columns={'count':'n_paralogs'}))

Unnamed: 0_level_0,n_paralogs
gene_id,Unnamed: 1_level_1
Solyc10g076200.1,22
Solyc10g076190.1,11
Solyc10g076210.1,11
Solyc10g076220.1,11
Solyc10g076240.1,11
Solyc10g076180.1,10
Solyc10g075170.1,4
Solyc10g076230.1,1
Solyc10g076170.1,0


### Note that the gene _Solyc10g076170_ does not have a paralog in tomato.

### TODO

Number of orthologs for each gene

# Annotations
Compare GO annotatios in all three classes

Genes, orthologs for potato and tomato and annotations

In [16]:
#%matplotlib inline
#import matplotlib.pyplot as plt
#from matplotlib_venn import venn2

In [17]:
#setT = genesT_in_interval.index.unique()
#setP = genesP_in_interval.index.unique()
#setTP = setT & ortholog_genesP_for_interval["ortholog_gene_id"].dropna().unique()
#setPT = setP & ortholog_genesT_for_interval["ortholog_gene_id"].dropna().unique()
#setT = list(set(setT)-set(setTP))
#setP = list(set(setP)-set(setPT))
#venn2(subsets = (len(setT), len(setTP), len(setP)), set_labels = ("Tomato", "Potato"))
#venn2(subsets = (1,2,3), set_labels = ("Tomato", "Potato"))
#plt.show()

## Only in Tomato interval

In [18]:
print(*setT, sep = "\n")

NameError: name 'setT' is not defined

In [None]:
try:
    termsT = []
    for gene_id in setT:
        annotations = requests.get(url+"/getGeneAnnotations", 
                                   params={"geneid": "'"+gene_id+"'"}, 
                                   headers={"accept": "application/json"})        
        for annotation in annotations.json()["results"]["bindings"]:
            uniprot_goa = annotation["uniprot_goa"]["value"].strip()
            if uniprot_goa=="":
                termsT.append([gene_id, None])
            else:
                for go_id in uniprot_goa.split(";"):                    
                    termsT.append([gene_id, go_id.strip()])
            break        
    termsT = pd.DataFrame(termsT)
    termsT.columns = ["gene_id", "go_id"]
    termsT = termsT.set_index(["gene_id"])
except:
    raise Exception("couldn't get terms for genes") 

In [None]:
goT = set([go_id for go_id in termsT["go_id"] if not (go_id == None)])

In [None]:
len(goT)

In [None]:
print(*goT, sep="\n")

In [None]:
termsT.set_index(["go_id"], append=True)

## Only in Potato

In [None]:
print(*setP, sep = "\n")

In [None]:
try:
    termsP = []
    for gene_id in setP:
        annotations = requests.get(url+"/getGeneAnnotations", 
                                   params={"geneid": "'"+gene_id+"'"}, 
                                   headers={"accept": "application/json"})        
        for annotation in annotations.json()["results"]["bindings"]:
            uniprot_goa = annotation["uniprot_goa"]["value"].strip()
            if uniprot_goa=="":
                termsP.append([gene_id, None])
            else:
                for go_id in uniprot_goa.split(";"):                    
                    termsP.append([gene_id, go_id.strip()])
            break        
    termsP = pd.DataFrame(termsP)
    termsP.columns = ["gene_id", "go_id"]
    termsP = termsP.set_index(["gene_id"])
except:
    raise Exception("couldn't get terms for genes") 

In [None]:
goP = set([go_id for go_id in termsP["go_id"] if not (go_id == None)])

In [None]:
len(goP)

In [None]:
print(*goP, sep="\n")

In [None]:
termsP.set_index(["go_id"], append=True)

## Both in Tomato and Potato

In [None]:
print(*setTP, sep = "\n")
#print(*setPT, sep = "\n")

In [None]:
try:
    termsTP = []
    for gene_id in setTP:
        annotations = requests.get(url+"/getGeneAnnotations", 
                                   params={"geneid": "'"+gene_id+"'"}, 
                                   headers={"accept": "application/json"})        
        for annotation in annotations.json()["results"]["bindings"]:
            uniprot_goa = annotation["uniprot_goa"]["value"].strip()
            if uniprot_goa=="":
                termsTP.append([gene_id, None])
            else:
                for go_id in uniprot_goa.split(";"):                    
                    termsTP.append([gene_id, go_id.strip()])
            break        
    termsTP = pd.DataFrame(termsTP)
    termsTP.columns = ["gene_id", "go_id"]
    termsTP = termsTP.set_index(["gene_id"])
except:
    raise Exception("couldn't get terms for genes") 

In [None]:
print(*set([go_id for go_id in termsTP["go_id"] if not (go_id == None)]), sep="\n")

In [None]:
termsTP.set_index(["go_id"], append=True)

## Exploring the Annotations of Gene *Solyc10g076180.1*

- GO Annotation
- PPI 
- STRING
- KEGG
- ALL Orthologs
- Species with no Orthologs

In [None]:
try:
    ortholog_paralog_genesT_for_interval = []
    for gene_id,gene in genesT_in_interval.iterrows():
        #get paralogs, and then orthologs
        paralogs = requests.get(url+"/getParalogs", 
                        params={"geneid": "'"+gene_id+"'"}, 
                        headers={"accept": "application/json"})
        for paralog in paralogs.json()["results"]["bindings"]:
            paralog_id = paralog["paralog_id"]["value"]
            #get orthologs for paralog
            orthologs = requests.get(url+"/getOrthologs", 
                        params={"geneid": "'"+paralog_id+"'"}, 
                        headers={"accept": "application/json"})
            for ortholog in orthologs.json()["results"]["bindings"]:
                ortholog_id = ortholog["ortholog_id"]["value"]
                #get location
                location = requests.get(url+"/getFeatureLocation", 
                                params={"featureid": "'"+ortholog_id+"'"}, 
                                headers={"accept": "application/json"})
                chrom = location.json()["results"]["bindings"][0]["chrom"]["value"]
                taxon_id = location.json()["results"]["bindings"][0]["taxon_id"]["value"]
                begin_pos = int(location.json()["results"]["bindings"][0]["begin_pos"]["value"])
                end_pos = int(location.json()["results"]["bindings"][0]["end_pos"]["value"]) 
                ortholog_paralog_genesT_for_interval.append([gene_id, gene["chrom"],gene["begin_pos"],
                                                             gene["end_pos"],gene["taxon_id"],paralog_id,
                                                             "PARALOG", ortholog_id,
                                                             chrom, begin_pos, end_pos, taxon_id])
        #get direct orthologs
        orthologs = requests.get(url+"/getOrthologs", 
                        params={"geneid": "'"+gene_id+"'"}, 
                        headers={"accept": "application/json"})
        for ortholog in orthologs.json()["results"]["bindings"]:
            ortholog_id = ortholog["ortholog_id"]["value"]
            #get location
            location = requests.get(url+"/getFeatureLocation", 
                            params={"featureid": "'"+ortholog_id+"'"}, 
                            headers={"accept": "application/json"})
            chrom = location.json()["results"]["bindings"][0]["chrom"]["value"]
            taxon_id = location.json()["results"]["bindings"][0]["taxon_id"]["value"]
            begin_pos = int(location.json()["results"]["bindings"][0]["begin_pos"]["value"])
            end_pos = int(location.json()["results"]["bindings"][0]["end_pos"]["value"]) 
            ortholog_paralog_genesT_for_interval.append([gene_id, gene["chrom"],gene["begin_pos"],
                                                 gene["end_pos"],gene["taxon_id"],None, "ORTHOLOG", ortholog_id,
                                                 chrom, begin_pos, end_pos, taxon_id])    
    #create dataframe        
    ortholog_paralog_genesT_for_interval = pd.DataFrame(ortholog_paralog_genesT_for_interval)            
    ortholog_paralog_genesT_for_interval.columns = ["gene_id", "chrom", "begin_pos", "end_pos", "taxon_id", "paralog_gene_id",
                                            "path", "ortholog_gene_id", "ortholog_chrom", "ortholog_begin_pos",
                                            "ortholog_end_pos", "ortholog_taxon_id"]
    ortholog_paralog_genesT_for_interval = ortholog_paralog_genesT_for_interval.set_index(["gene_id"])
except:
    raise Exception("couldn't get orthologs")  

Number of orthologs for each gene

In [None]:
aggregations = { "ortholog_gene_id" : ["nunique"]}
display(ortholog_paralog_genesT_for_interval.groupby(["gene_id"]).agg(aggregations))

In [None]:
index=pd.MultiIndex.from_tuples([tuple(x) for x in ortholog_paralog_genesT_for_interval[["chrom", "begin_pos", "end_pos", "paralog_gene_id", "ortholog_gene_id"]].to_records()], names=["gene_id", "chrom", "begin_pos", "end_pos", "paralog_gene_id", "ortholog_gene_id"])
display(ortholog_paralog_genesT_for_interval.set_index(index)[["path", "ortholog_chrom", "ortholog_begin_pos", "ortholog_end_pos"]])      