# Generate table of transcripts with associated pathways

Target data structure:

| gene_id | transcript_id | num_detected | num_sampled | source        | gene_symbol | pathway_id    |
|:-------:|:-------------:|:------------:|:-----------:|:-------------:|:-----------:|:-------------:|
| `<str>`   | `<str>`         | `<int>`        | `<int>`       | `[<str>,<str>,...]` | `<str>`       | `[<str>,<str>,...]` |



Workflow:

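1. Query all *C. reinhardtii* v5.5 transcripts from Phytozome v12.1 using intermine. Because `gene.pathways` is outer-joined, transcripts without an identified pathway are also returned, with an empty `pathway_id`.
2. Export the raw query results to `~/lipid_selection/data/intermediate_data_02/query.csv` and the per-transcript table to `query_result.csv` in the same directory.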

In [18]:
import sys

# Make the user-local site-packages directory and the project's scripts directory importable.
sys.path.extend([
    '/home/chenwe72/local/lib/python3.5/site-packages',
    '../../scripts/',
])

In [19]:
import ast
import csv
import pickle
import re
import time
from functools import partial
from multiprocessing import Pool #multiprocessing-2.6.2.1

import numpy as np
import pandas as pd
import psutil #psutil-5.7.0
from intermine.webservice import Service

import Search_algorithms as sag

## 1. Query all C. reinhardtii v5.5 transcripts from Phytozome v12.1 using intermine.

In [None]:
# Build the PhytoMine transcript query (needs the intermine client).
t0 = time.time()  # start of the query timer; the elapsed time is printed after the rows are written

service = Service("https://phytozome.jgi.doe.gov/phytomine/service")
query = service.new_query("Transcript")

# Fields requested for every returned row
view_columns = (
    "primaryIdentifier",
    "gene.primaryIdentifier",
    "gene.symbol",
    "gene.pathways.identifier",
    "organism.annotationVersion",
)
query.add_view(*view_columns)

# Restrict the query to Chlamydomonas reinhardtii
query.add_constraint("organism", "LOOKUP", "Chlamydomonas reinhardtii", code="A")

# Outer join on the pathway collection (presumably so transcripts without a pathway are kept — TODO confirm)
query.outerjoin("gene.pathways")

In [4]:
# Stream the query rows into query.csv (needs the csv module).
# newline='' lets the csv writer control line endings itself, as the csv documentation
# requires; without it some platforms write an extra blank line after every row.
with open('../../data/intermediate_data_02/query.csv', mode='w', newline='') as query_file:
    query_writer = csv.writer(query_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

    # Header row: one column name per queried field, in the same order as the values below
    query_writer.writerow(['transcript_id', 'gene_id', 'gene_symbol', "pathway_id", "annotation_version"])

    for row in query.rows():
        query_writer.writerow([
            row["primaryIdentifier"],
            row["gene.primaryIdentifier"],
            row["gene.symbol"],
            row["gene.pathways.identifier"],
            row["organism.annotationVersion"],
        ])

# t0 was set when the query was built, so this covers query construction, download and export
t1 = time.time()
print("Query took", t1-t0, "s")

Query took 8.769787788391113 s


## 2. Data wrangling

Reorganize the query result into the following dataframe structure:

| gene_id | transcript_id | gene_symbol | pathway_id    | annotation_version   |
|:-------:|:-------------:|:-----------:|:-------------:|:-------------:|
| `<str>`   | `<str>`         | `<str>`       | `[<str>,<str>,...]` |`<str>`       |


In [None]:
# Collapse the raw query table into one row per transcript.
# (Grouping with pandas is faster than the dictionary method.)
t0 = time.time()

# Import query result
gene_info = pd.read_csv("../../data/intermediate_data_02/query.csv")

# A transcript is expected to carry exactly one gene_id and one annotation_version, so grouping
# on all three keys must give as many groups as grouping on transcript_id alone.
grouped_by = ['transcript_id', 'gene_id', 'annotation_version']
grouped = gene_info.groupby(grouped_by)
if len(grouped) != len(gene_info.groupby(['transcript_id'])):
    # Fail loudly: carrying on would build the per-transcript table from inconsistent groups.
    raise ValueError("transcripts may have more than one gene_id's or annotation_versions")

# Columns that must hold at most one distinct value per transcript
single_valued = ['transcript_id', 'gene_id', 'annotation_version', 'gene_symbol']

# One dict per transcript; building whole rows keeps every column the same length
records = []
for name, group in grouped:
    # name is one unique (transcript_id, gene_id, annotation_version) combination,
    # group holds the rows of gene_info that belong to it
    record = {}
    for column in single_valued:
        values = group[column].dropna().unique()
        if len(values) > 1:
            # Skipping the value here would silently misalign the output columns
            raise ValueError("{} has more than one {}: {}".format(name, column, list(values)))
        record[column] = values[0] if len(values) == 1 else None

    # A transcript may map to several pathways: keep the sorted, de-duplicated ids as a plain list
    pathways = group['pathway_id'].dropna()
    record['pathway_id'] = sorted(set(pathways)) if len(pathways) > 0 else None
    records.append(record)

df = pd.DataFrame(records, columns=list(gene_info.columns))

t1 = time.time()
print("Time taken", t1-t0, "s")


## 3. Query result summary

In [6]:
# Write the per-transcript table to disk
df.to_csv("../../data/intermediate_data_02/query_result.csv", index=False, header=True)

# Size of the raw query table: total rows and distinct transcripts
n_results = gene_info.shape[0]
n_transcripts = len(np.unique(gene_info.transcript_id))
print("Query returned", n_results, "results and", n_transcripts, "unique transcripts.")

# Distinct transcripts that have at least one row with a pathway id
with_pathway = gene_info.dropna(subset=['pathway_id'])
print(len(np.unique(with_pathway.transcript_id)), "unique transcripts have matching pathways.")

Query returned 22282 results and 19526 unique transcripts.
1216 unique transcripts have matching pathways.


#### Check if all transcripts in int_data02 have matching transcripts in query result

In [22]:
def _parse_pathway_ids(cell):
    """Turn one raw `pathway_id` field of query_result.csv back into a list.

    The pathway lists built during data wrangling are stored in the CSV as text,
    so without this converter the column would come back as plain strings.

    Parameters
    ----------
    cell : str
        Raw field text; empty when the transcript has no pathway.

    Returns
    -------
    list or None
        The pathway ids, or None for an empty field.

    Raises
    ------
    ValueError, SyntaxError
        If the field is not a valid Python list literal.
    """
    if not cell:
        return None
    return list(ast.literal_eval(cell))


# Reload the per-transcript table, restoring pathway_id to lists
df = pd.read_csv(
    "../../data/intermediate_data_02/query_result.csv",
    converters={'pathway_id': _parse_pathway_ids},
)

In [15]:
# Load the sampled-gene table (needs pickle support). Its 'gene_id' column is renamed to
# 'transcript_id' so it shares a column name with the query table, and the index is
# rebuilt as 0..n-1.
# NOTE(review): unpickling executes code from the file — only load pickles you trust.
int_data02 = (
    pd.read_pickle("../../data/intermediate_data_02/sampled_genes.pk")
    .rename(columns={'gene_id': 'transcript_id'})
    .reset_index(drop=True)
)

In [36]:
# Preview the first ten transcript ids of the sampled-gene table
int_data02.transcript_id.iloc[:10]

0    Cre01.g000017.t1.1
1    Cre01.g000017.t1.1
2    Cre01.g000033.t1.1
3    Cre01.g000050.t1.1
4    Cre01.g000100.t1.1
5    Cre01.g000150.t1.2
6    Cre01.g000200.t1.1
7    Cre01.g000250.t1.2
8    Cre01.g000300.t1.1
9    Cre01.g000350.t1.1
Name: transcript_id, dtype: object

In [38]:
# Preview the first five rows of the per-transcript query table
df.iloc[:5]

Unnamed: 0,annotation_version,gene_id,gene_symbol,pathway_id,transcript_id
0,v5.5,Cre01.g000017,,,Cre01.g000017.t1.1
1,v5.5,Cre01.g000033,,,Cre01.g000033.t1.1
2,v5.5,Cre01.g000050,,,Cre01.g000050.t1.1
3,v5.5,Cre01.g000100,,,Cre01.g000100.t1.1
4,v5.5,Cre01.g000150,ZRT2,,Cre01.g000150.t1.2


#### 1. Single process

In [32]:
# Time the lookup of the first 100 sampled transcript ids in the main process.
cannot_find_matching_transcript = []
t0 = time.time()

# Sorted unique transcript ids from the query table; computed once instead of once per gene
transcripts = np.unique(df.transcript_id)

for gene in list(int_data02.transcript_id)[:100]:
    # JumpSearch is taken from the Search_algorithms module imported as `sag`
    # (TODO confirm it is exported under this name); -1 is treated as "not found".
    if sag.JumpSearch(transcripts, gene) == -1:
        cannot_find_matching_transcript.append(gene)

t1 = time.time()
print(t1-t0, "s")

1.374121904373169 s


#### 2. One process using multiprocessing

In [58]:
def cannot_find_match (input_string):
    """Report a transcript id that is missing from the query result.

    Looks `input_string` up in the module-level `transcripts` array, which must be
    assigned before this function is called.

    Parameters
    ----------
    input_string : str
        Transcript id to look for.

    Returns
    -------
    str or None
        `input_string` itself when the search reports -1 (treated as "not found"),
        otherwise None.
    """
    # Search for the argument itself rather than whatever a global `gene` happens to hold.
    # JumpSearch is taken from the Search_algorithms module imported as `sag`
    # (TODO confirm it is exported under this name).
    if sag.JumpSearch(transcripts, input_string) == -1:
        return input_string
    return None
    

In [59]:
t0 = time.time()
cannot_find_matching_transcript = []

# Sorted unique transcript ids. cannot_find_match reads this module-level name, so it is
# assigned before the pool is created (assumes worker processes are forked from this
# one — TODO confirm). No `global` statement is needed at module level, and Python 3.6+
# rejects one that follows the assignment.
transcripts = np.unique(df.transcript_id)

def main():
    """Run cannot_find_match over the first 100 sampled transcript ids in a one-worker pool.

    Returns
    -------
    list
        One entry per input id, as returned by cannot_find_match.
    """
    # The context manager shuts the pool down once the blocking map has returned
    with Pool(processes=1) as pool:
        return list(pool.map(cannot_find_match, int_data02.transcript_id[:100]))

if __name__ == '__main__':
    cannot_find_matching_transcript = main()

t1 = time.time()
print("100 genes took:", t1-t0, "seconds.")
# Linear extrapolation from the 100-gene sample to the whole table
print("Estimated calculation time:", len(int_data02.transcript_id)/100*(t1-t0)/60, "minutes")

100 genes took: 0.050256967544555664 seconds.
Estimated calculation time: 0.18151979060967763 minutes


#### Complete process using one core.

In [60]:
t0 = time.time()
cannot_find_matching_transcript = []

# Sorted unique transcript ids. cannot_find_match reads this module-level name, so it is
# assigned before the pool is created (assumes worker processes are forked from this
# one — TODO confirm). No `global` statement is needed at module level, and Python 3.6+
# rejects one that follows the assignment.
transcripts = np.unique(df.transcript_id)

def main():
    """Run cannot_find_match over every sampled transcript id in a one-worker pool.

    Returns
    -------
    list
        One entry per input id, as returned by cannot_find_match.
    """
    # The context manager shuts the pool down once the blocking map has returned
    with Pool(processes=1) as pool:
        return list(pool.map(cannot_find_match, int_data02.transcript_id))

if __name__ == '__main__':
    cannot_find_matching_transcript = main()

t1 = time.time()
print("Total time:", t1-t0, "seconds.")

Total time: 2.395581007003784 seconds.


In [71]:
# Keep only the ids that were reported as missing (None marks an id that was found)
cannot_find_matching_transcript = [
    transcript for transcript in cannot_find_matching_transcript if transcript is not None
]

if len(cannot_find_matching_transcript) == 0:
    print("All transcripts from ~/data/intermediate_data_02/sampled_genes.pk have matching transcripts in Phytozome v12.1 query.")
else:
    # Show how many ids are missing, followed by the ids themselves
    print(len(cannot_find_matching_transcript), cannot_find_matching_transcript)

All transcripts from ~/data/intermediate_data_02/sampled_genes.pk have matching transcripts in Phytozome v12.1 query.


## 4. Merge int_data02 and query_result.csv

In [72]:
# Attach the query annotations to every sampled-gene row.
# The join key and join type are spelled out so the result does not depend on which column
# names the two tables happen to share; an inner join keeps only transcripts present in both.
merged = pd.merge(int_data02, df, on='transcript_id', how='inner')
merged.head()

Unnamed: 0,num_detected,num_manipulated,num_sampled,source,transcript_id,annotation_version,gene_id,gene_symbol,pathway_id
0,0.0,0,1.0,Bajhaiya_2016,Cre01.g000017.t1.1,v5.5,Cre01.g000017,,
1,0.0,0,1.0,Kwak_2017,Cre01.g000017.t1.1,v5.5,Cre01.g000017,,
2,1.0,0,1.0,Bajhaiya_2016,Cre01.g000033.t1.1,v5.5,Cre01.g000033,,
3,0.0,0,1.0,Gargouri_2015,Cre01.g000050.t1.1,v5.5,Cre01.g000050,,
4,0.0,0,1.0,Bajhaiya_2016,Cre01.g000100.t1.1,v5.5,Cre01.g000100,,


#### Export merged dataframe as `~/lipid_selection/data/intermediate_data_02/observational_studies.pickle`

In [73]:
# Serialise the merged table with pickle.
# NOTE(review): the file name written here ('sampledd_genes.pk') differs from the name in
# the heading above this cell — confirm which one downstream steps expect.
merged_path = '../../data/intermediate_data_02/sampledd_genes.pk'
with open(merged_path, 'wb') as pickle_file:
    pickle.dump(merged, pickle_file)