- Description of Notebook: This notebook takes the SyntenyScore category slices and runs them through STRING (string-db.org) to see whether there is a functional enrichment.
- Date: 28th May 2018
- By: Natasha Glover

# Setup

In [4]:
# Scientific libraries
import numpy as np
from scipy import stats
import pandas as pd
from tables import *

#API libraries
import requests
import sys
from pandas.io.json import json_normalize
import time

# Pyoma libraries
from pyoma.browser import db
# from pyoma.browser import homoeologs

# Graphic libraries
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# Extra options
# Raise pandas' 'max_colwidth' display option to 200 so long text columns
# (e.g. annotation strings) are shown up to that width before being truncated.
pd.set_option('max_colwidth',200)

In [2]:
#load in hdf5 database
# The file is opened read-only (mode='r').
# NOTE(review): this is an absolute, machine-specific path -- point it at your
# own copy of the OMA HDF5 file before running. `open_file` presumably comes
# from the `from tables import *` star import above.
h5file = open_file('/Users/nglover/OmaServerDec2017.h5', mode='r')
#Make pyoma objects
# These module-level objects are used by the functions and cells below:
# omaIdObj for genome / entry-number lookups, xrefObj for cross-references.
dbObj = db.Database(h5file)
omaIdObj = db.OmaIdMapper(dbObj)
xrefObj = db.XrefIdMapper(dbObj)

be ready to see PyTables asking for *lots* of memory and possibly slow
I/O.  You may want to reduce the rowsize by trimming the value of
dimensions that are orthogonal (and preferably close) to the *main*
dimension of this leave.  Alternatively, in case you have specified a
very small/large chunksize, you may want to increase/decrease it.


# Functions for stringDB

It is highly recommended to read the STRING API tutorial first: http://version10.string-db.org/help/api/ (most of the functions below are based on it).

In [5]:
# Shared settings read by the STRING request functions defined below.
# Base URL; the output format and the method name are appended as path segments.
string_api_url = "https://string-db.org/api"
# Response format segment of the URL; the responses are parsed with response.json().
output_format = "json"
# Sent as the `caller_identity` query parameter of every request.
caller_identity  = "natasha.glover@unil.ch"
# Sent as the `echo_query` query parameter by construct_string_request.
echo_query = "1"


def get_string_ids_from_xrefs(list_of_uniprot_ids, genome):
    '''Map a list of xref ids to STRING ids via the STRING ``get_string_ids`` API.

    The ids are sent in batches of at most 500 per request, and the results of
    *all* batches are combined into one DataFrame.

    Parameters
    ----------
    list_of_uniprot_ids : iterable of str
        Cross-reference identifiers to look up (e.g. UniProt or source ids).
    genome : str
        Genome code passed to ``omaIdObj.genome_from_UniProtCode``; the
        resulting NCBI taxon id is sent to STRING as ``species``.

    Returns
    -------
    pandas.DataFrame
        The normalised JSON responses of every request, concatenated with a
        fresh index. Empty DataFrame when there were no ids to look up.
    '''
    max_ids_per_request = 500
    method = "get_string_ids"
    limit = 1  # forwarded as the `limit` query parameter
    ncbitaxid = str(omaIdObj.genome_from_UniProtCode(genome)['NCBITaxonId'])

    # Materialise the input so that len() and splitting behave the same for
    # lists, pandas Series, and other iterables.
    ids = list(list_of_uniprot_ids)
    if len(ids) > max_ids_per_request:
        # Integer ceiling division: the number of batches needed.
        n_chunks = (len(ids) + max_ids_per_request - 1) // max_ids_per_request
        chunks = np.array_split(ids, n_chunks)
    elif ids:
        chunks = [ids]
    else:
        chunks = []  # nothing to look up: do not send an empty request

    frames = []
    for chunk in chunks:
        request_url = construct_string_request(string_api_url, output_format, chunk,
                                     ncbitaxid, method, limit, echo_query, caller_identity)

        response = requests.get(request_url)
        time.sleep(1)  # pause between consecutive API calls
        # Collect every batch; appending only after the loop would keep just the last one.
        frames.append(json_normalize(response.json()))

    final_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    print("Number of query genes with stringid: "+ str(len(final_df)))

    return final_df


def get_string_interactors(list_of_string_ids, genome, limit=5):
    '''Fetch interaction partners from the STRING ``interaction_partners`` API.

    The ids are sent in batches of at most 200 per request, and the results of
    *all* batches are combined.

    Parameters
    ----------
    list_of_string_ids : iterable of str
        STRING identifiers to query.
    genome : str
        Genome code passed to ``omaIdObj.genome_from_UniProtCode``; the
        resulting NCBI taxon id is sent to STRING as ``species``.
    limit : int, optional
        Forwarded as the ``limit`` query parameter (default 5).

    Returns
    -------
    pandas.DataFrame or list
        The normalised JSON responses of every request, concatenated with a
        fresh index, or an empty list ``[]`` when no rows were obtained
        (the empty-list return is kept for backward compatibility).
    '''
    max_ids_per_request = 200
    method = "interaction_partners"
    ncbitaxid = str(omaIdObj.genome_from_UniProtCode(genome)['NCBITaxonId'])

    # Materialise the input so that len() and splitting behave the same for
    # lists, pandas Series, and other iterables.
    ids = list(list_of_string_ids)
    if len(ids) > max_ids_per_request:
        # Integer ceiling division: the number of batches needed.
        n_chunks = (len(ids) + max_ids_per_request - 1) // max_ids_per_request
        chunks = np.array_split(ids, n_chunks)
    elif ids:
        chunks = [ids]
    else:
        chunks = []  # nothing to query: do not send an empty request

    frames = []
    for chunk in chunks:
        request_url = construct_string_request(string_api_url, output_format, chunk,
                                     ncbitaxid, method, limit, echo_query, caller_identity)

        response = requests.get(request_url)
        time.sleep(1)  # pause between consecutive API calls
        frames.append(json_normalize(response.json()))

    # Decide on the return value only after every batch has been fetched;
    # returning from inside the loop would drop all batches after the first.
    if not frames:
        return []
    final_df = pd.concat(frames, ignore_index=True)
    if len(final_df) > 0:
        return final_df
    return []

def construct_string_request(string_api_url, output_format, my_genes,
                             ncbitaxid, method, limit, echo_query, caller_identity):
    '''Assemble the URL for a STRING API call.

    The path is ``<string_api_url>/<output_format>/<method>`` and the query
    string carries, in this order: ``identifiers`` (the entries of
    ``my_genes`` joined with ``%0d``), ``species``, ``limit`` (converted with
    ``str``), ``echo_query`` and ``caller_identity``. Values are inserted
    as given, without any URL escaping.

    Returns the URL as a string.
    '''
    endpoint = "/".join([string_api_url, output_format, method])
    query_fields = [
        "identifiers=" + "%0d".join(my_genes),
        "species=" + ncbitaxid,
        "limit=" + str(limit),
        "echo_query=" + echo_query,
        "caller_identity=" + caller_identity,
    ]
    return endpoint + "?" + "&".join(query_fields)

def get_functional_enrichment(list_of_string_ids, genome):
    '''Query the STRING ``enrichment`` API for a list of STRING ids.

    The ids are sent in batches of at most 100 per request, and the results of
    *all* batches are combined.

    Parameters
    ----------
    list_of_string_ids : iterable of str
        STRING identifiers to test for functional enrichment.
    genome : str
        Genome code passed to ``omaIdObj.genome_from_UniProtCode``; the
        resulting NCBI taxon id is sent to STRING as ``species``.

    Returns
    -------
    pandas.DataFrame or list
        The normalised JSON responses of every request, concatenated with a
        fresh index, or an empty list ``[]`` when no rows were obtained
        (the empty-list return is kept for backward compatibility).
    '''
    max_ids_per_request = 100
    method = "enrichment"
    ncbitaxid = str(omaIdObj.genome_from_UniProtCode(genome)['NCBITaxonId'])

    # Materialise the input so that len() and splitting behave the same for
    # lists, pandas Series, and other iterables.
    ids = list(list_of_string_ids)
    if len(ids) > max_ids_per_request:
        # Integer ceiling division: the number of batches needed.
        n_chunks = (len(ids) + max_ids_per_request - 1) // max_ids_per_request
        chunks = np.array_split(ids, n_chunks)
    elif ids:
        chunks = [ids]
    else:
        chunks = []  # nothing to query: do not send an empty request

    # NOTE(review): with more than 100 ids, enrichment is computed separately
    # per batch and the tables are stacked; that is presumably not equivalent
    # to one enrichment test on the whole list -- confirm this is intended.
    frames = []
    for chunk in chunks:
        request_url = string_api_url + "/" + output_format + "/" + method + "?"
        request_url += "identifiers=" + "%0d".join(chunk)
        # Send the NCBI taxon id as species, like the other STRING helpers
        # in this notebook, instead of the genome code.
        request_url += "&" + "species=" + ncbitaxid
        request_url += "&" + "caller_identity=" + caller_identity

        response = requests.get(request_url)
        time.sleep(1)  # pause between consecutive API calls
        frames.append(json_normalize(response.json()))

    # Decide on the return value only after every batch has been fetched;
    # returning from inside the loop would drop all batches after the first.
    if not frames:
        return []
    final_df = pd.concat(frames, ignore_index=True)
    if len(final_df) > 0:
        return final_df
    return []

#some functions for id conversion
def search(name, sourcelist):
    '''Return the first item of ``sourcelist`` whose ``'source'`` entry equals ``name``.

    Items are checked in order; ``None`` is returned when nothing matches.
    '''
    candidates = (entry for entry in sourcelist if entry['source'] == name)
    return next(candidates, None)

# Example: Getting stringids, interactors, and functional enrichment for a list of genes

In [6]:
# Example input: OMA ids that the next code cell converts to entry numbers.
genes_of_interest = ["ARATH00006", "ARATH00007", "ARATH00008"]

First, you need the STRING ids of your genes of interest. To get them, you have to fetch the cross-references (xrefs) from OMA. An entry number (entrynr) can have more than one xref, so you have to choose which one is best. I used the source id ('SourceAC'), but you can also use UniProt ids or any other xref source.


In [5]:
#Example using 'SourceAC'

#map omaids to entrynrs
entrynrs = [omaIdObj.omaid_to_entry_nr(x) for x in genes_of_interest] 

#get xrefs for entrynrs
xrefs = [xrefObj.map_entry_nr(x) for x in entrynrs]

#search xrefs for the SourceAC
# search() returns None when no xref has the requested source; in that case
# the ['xref'] lookup below raises a TypeError.
xrefs_source = [search('SourceAC',x)['xref'] for x in xrefs]

xrefs_source

['ATCG00120.1', 'ATCG00130.1', 'ATCG00140.1']

In [6]:
#Example using Uniprot ids
# Relies on `entrynrs` from the 'SourceAC' example cell; the xrefs are fetched again here.
xrefs = [xrefObj.map_entry_nr(x) for x in entrynrs]
# For each entry, keep the xref whose source is 'UniProtKB/SwissProt'.
xrefs_uniprot = [search('UniProtKB/SwissProt',x)['xref'] for x in xrefs]
xrefs_uniprot

['ATPA_ARATH', 'ATPF_ARATH', 'ATPH_ARATH']

Get the STRING ids using the API. You can use either the list of source ids or the list of UniProt ids. You must also give the genome that contains the genes, because STRING needs to know the species.

In [7]:
# Look up the STRING ids for the UniProt xrefs; "ARATH" is the genome code
# the function uses to obtain the NCBI taxon id.
stringids_df = get_string_ids_from_xrefs(xrefs_uniprot, "ARATH")
# Keep only the STRING identifier column for the follow-up queries.
stringids = stringids_df['stringId']
stringids_df

Number of query genes with stringid: 3


Unnamed: 0,annotation,ncbiTaxonId,preferredName,queryIndex,queryItem,stringId,taxonName
0,ATP synthase subunit alpha; Produces ATP from ADP in the presence of a proton gradient across the membrane. The alpha chain is a regulatory subunit,3702,ATPA,0,ATPA_ARATH,3702.ATCG00120.1,Arabidopsis thaliana
1,"ATPase F subunit; F(1)F(0) ATP synthase produces ATP from ADP in the presence of a proton or sodium gradient. F-type ATPases consist of two structural domains, F(1) containing the extramembraneous...",3702,ATPF,1,ATPF_ARATH,3702.ATCG00130.1,Arabidopsis thaliana
2,"ATPase III subunit; F(1)F(0) ATP synthase produces ATP from ADP in the presence of a proton or sodium gradient. F-type ATPases consist of two structural domains, F(1) containing the extramembraneo...",3702,ATPH,2,ATPH_ARATH,3702.ATCG00140.1,Arabidopsis thaliana


Get the interactors of the 3 genes we just got the STRING ids for. Note that you can control how many interactors are returned with the `limit` parameter.

In [8]:
# Fetch interaction partners for the STRING ids; limit=10 is forwarded as the
# `limit` query parameter of the request.
interactors_df = get_string_interactors(stringids, "ARATH", limit=10)
interactors_df

Unnamed: 0,ascore,dscore,escore,fscore,ncbiTaxonId,nscore,preferredName_A,preferredName_B,pscore,score,stringId_A,stringId_B,tscore
0,0.988,0.772,0.93,0,3702,0.454,ATPA,ATPC2,0.51,0.999,ATCG00120.1,AT1G15700.1,0.655
1,0.988,0.772,0.93,0,3702,0.454,ATPA,ATP3,0.51,0.999,ATCG00120.1,AT2G33040.1,0.574
2,0.988,0.772,0.97,0,3702,0.454,ATPA,ATPC1,0.506,0.999,ATCG00120.1,AT4G04640.1,0.727
3,0.988,0.772,0.938,0,3702,0.454,ATPA,ATPD,0.0,0.999,ATCG00120.1,AT4G09650.1,0.592
4,0.983,0.772,0.923,0,3702,0.454,ATPA,AT5G08680,0.183855,0.999,ATCG00120.1,AT5G08680.1,0.168861
5,0.983,0.772,0.923,0,3702,0.454,ATPA,AT5G08690,0.18334,0.999,ATCG00120.1,AT5G08690.1,0.168388
6,0.988,0.772,0.938,0,3702,0.454,ATPA,ATP5,0.0,0.999,ATCG00120.1,AT5G13450.1,0.687
7,0.99,0.772,0.936,0,3702,0.454,ATPA,ATPF,0.0,0.999,ATCG00120.1,ATCG00130.1,0.439
8,0.988,0.772,0.862,0,3702,0.454,ATPA,ATPH,0.0,0.999,ATCG00120.1,ATCG00140.1,0.477
9,0.992,0.772,0.967,0,3702,0.454,ATPA,PB,0.185802,0.999,ATCG00120.1,ATCG00480.1,0.243798


Now get the functional enrichment of the interactors, or really just the functional enrichment of any list of genes.

In [9]:
#functional enrichment of original 3 genes
# The result is not assigned; it is shown as the cell output.
get_functional_enrichment(stringids, "ARATH")

Unnamed: 0,bonferroni,category,description,fdr,inputGenes,ncbiTaxonId,number_of_genes,p_value,preferredNames,term
0,4.12e-06,Process,ATP synthesis coupled proton transport,2.06e-06,"[3702.ATCG00120.1, 3702.ATCG00130.1, 3702.ATCG00140.1]",3702,3,8.52e-10,"[ATPA, ATPF, ATPH]",GO:0015986
1,0.0243,Process,ATP hydrolysis coupled proton transport,0.000426,"[3702.ATCG00120.1, 3702.ATCG00140.1]",3702,2,5.03e-06,"[ATPA, ATPH]",GO:0015991
2,2.54e-06,KEGG,Photosynthesis,2.54e-06,"[3702.ATCG00120.1, 3702.ATCG00130.1, 3702.ATCG00140.1]",3702,3,1.97e-08,"[ATPA, ATPF, ATPH]",00195
3,0.036,KEGG,Metabolic pathways,0.012,"[3702.ATCG00120.1, 3702.ATCG00130.1, 3702.ATCG00140.1]",3702,3,0.000279,"[ATPA, ATPF, ATPH]",01100
4,1.91e-05,KEGG,Oxidative phosphorylation,9.54e-06,"[3702.ATCG00120.1, 3702.ATCG00130.1, 3702.ATCG00140.1]",3702,3,1.48e-07,"[ATPA, ATPF, ATPH]",00190
5,0.00108,Component,chloroplast thylakoid membrane,0.000153,"[3702.ATCG00120.1, 3702.ATCG00130.1, 3702.ATCG00140.1]",3702,3,1.42e-06,"[ATPA, ATPF, ATPH]",GO:0009535
6,1.0,Component,intracellular,0.711,"[3702.ATCG00120.1, 3702.ATCG00130.1, 3702.ATCG00140.1]",3702,3,0.0298,"[ATPA, ATPF, ATPH]",GO:0005622
7,1.0,Component,cytoplasm,0.314,"[3702.ATCG00120.1, 3702.ATCG00130.1, 3702.ATCG00140.1]",3702,3,0.0107,"[ATPA, ATPF, ATPH]",GO:0005737
8,0.00199,Component,chloroplast thylakoid,0.00018,"[3702.ATCG00120.1, 3702.ATCG00130.1, 3702.ATCG00140.1]",3702,3,2.6e-06,"[ATPA, ATPF, ATPH]",GO:0009534
9,1.0,Component,cell,0.931,"[3702.ATCG00120.1, 3702.ATCG00130.1, 3702.ATCG00140.1]",3702,3,0.0439,"[ATPA, ATPF, ATPH]",GO:0005623


In [10]:
#functional enrichment of the interactors of the 3 genes
# Take the partner column (stringId_B) of the interactors table.
# The same id can occur more than once in this column (see the output below).
interactors_stringids = interactors_df['stringId_B']
interactors_stringids

0     AT1G15700.1
1     AT2G33040.1
2     AT4G04640.1
3     AT4G09650.1
4     AT5G08680.1
5     AT5G08690.1
6     AT5G13450.1
7     ATCG00130.1
8     ATCG00140.1
9     ATCG00480.1
10    AT2G07698.1
11    AT2G25610.1
12    AT4G38920.1
13    AT5G08680.1
14    AT5G08690.1
15    ATCG00120.1
16    ATCG00140.1
17    ATCG00470.1
18    ATCG00480.1
19    ATMG01190.1
20    AT2G07698.1
21    AT2G33040.1
22    AT4G09650.1
23    AT5G08690.1
24    ATCG00120.1
25    ATCG00130.1
26    ATCG00150.1
27    ATCG00470.1
28    ATCG00480.1
29    ATMG01190.1
Name: stringId_B, dtype: object

In [11]:
# Functional enrichment for the interactor ids extracted from interactors_df['stringId_B'].
get_functional_enrichment(interactors_stringids, "ARATH")

Unnamed: 0,bonferroni,category,description,fdr,inputGenes,ncbiTaxonId,number_of_genes,p_value,preferredNames,term
0,3.280000e-02,Pfam,ATP synthase subunit C,8.190000e-03,"[AT4G38920.1, ATCG00140.1]",3702,2,1.300000e-05,"[VHA-C3, ATPH]",PF00137
1,5.470000e-03,Pfam,"ATP synthase alpha/beta family, beta-barrel domain",1.820000e-03,"[AT2G07698.1, ATMG01190.1]",3702,2,2.170000e-06,"[AT2G07698, ATP1]",PF02874
2,5.470000e-03,Pfam,"ATP synthase alpha/beta chain, C terminal domain",1.820000e-03,"[AT2G07698.1, ATMG01190.1]",3702,2,2.170000e-06,"[AT2G07698, ATP1]",PF00306
3,5.470000e-03,Pfam,"ATP synthase alpha/beta family, nucleotide-binding domain",1.820000e-03,"[AT2G07698.1, ATMG01190.1]",3702,2,2.170000e-06,"[AT2G07698, ATP1]",PF00006
4,4.010000e-03,Process,single-organism cellular process,4.460000e-05,"[AT2G25610.1, AT2G33040.1, AT4G38920.1, AT5G08680.1, AT5G08690.1, AT5G13450.1, ATCG00120.1, ATCG00130.1, ATCG00140.1, ATCG00150.1, ATCG00470.1, ATCG00480.1, ATMG01190.1]",3702,13,8.290000e-07,"[AT2G25610, ATP3, VHA-C3, AT5G08680, AT5G08690, ATP5, ATPA, ATPF, ATPH, ATPI, ATPE, PB, ATP1]",GO:0044763
5,1.000000e+00,Process,"photosynthesis, light reaction",5.900000e-02,"[AT4G04640.1, AT4G09650.1]",3702,2,1.200000e-03,"[ATPC1, ATPD]",GO:0019684
6,4.270000e-02,Process,cellular metabolic process,4.640000e-04,"[AT1G15700.1, AT2G33040.1, AT4G04640.1, AT5G08680.1, AT5G08690.1, AT5G13450.1, ATCG00120.1, ATCG00130.1, ATCG00140.1, ATCG00150.1, ATCG00470.1, ATCG00480.1, ATMG01190.1]",3702,13,8.840000e-06,"[ATPC2, ATP3, ATPC1, AT5G08680, AT5G08690, ATP5, ATPA, ATPF, ATPH, ATPI, ATPE, PB, ATP1]",GO:0044237
7,4.890000e-02,Process,photosynthetic electron transport in photosystem II,5.210000e-04,"[AT4G04640.1, AT4G09650.1]",3702,2,1.010000e-05,"[ATPC1, ATPD]",GO:0009772
8,1.000000e+00,Process,metabolic process,1.560000e-01,"[AT2G33040.1, AT5G08680.1, AT5G08690.1, AT5G13450.1, ATCG00120.1, ATCG00130.1, ATCG00140.1, ATCG00150.1, ATCG00470.1, ATCG00480.1, ATMG01190.1]",3702,11,3.230000e-03,"[ATP3, AT5G08680, AT5G08690, ATP5, ATPA, ATPF, ATPH, ATPI, ATPE, PB, ATP1]",GO:0008152
9,7.170000e-06,Process,single-organism metabolic process,8.150000e-08,"[AT1G15700.1, AT2G33040.1, AT4G04640.1, AT5G08680.1, AT5G08690.1, AT5G13450.1, ATCG00120.1, ATCG00130.1, ATCG00140.1, ATCG00150.1, ATCG00470.1, ATCG00480.1, ATMG01190.1]",3702,13,1.480000e-09,"[ATPC2, ATP3, ATPC1, AT5G08680, AT5G08690, ATP5, ATPA, ATPF, ATPH, ATPI, ATPE, PB, ATP1]",GO:0044710
