# Generate table of transcripts with associated pathways

Target data structure:

| gene_id | transcript_id | num_detected | num_sampled | source        | gene_symbol | pathway_id    |
|:-------:|:-------------:|:------------:|:-----------:|:-------------:|:-----------:|:-------------:|
| `<str>`   | `<str>`         | `<int>`        | `<int>`       | `[<str>,<str>,...]` | `<str>`       | `[<str>,<str>,...]` |



Workflow:

1. Query all *C. reinhardtii* v5.5 transcripts from Phytozome v12.1 using intermine. Only transcripts with identified pathways are returned in the query results.
2. Export query results to `~/lipid_selection/data/intermediate_data_02/gene_info`

In [1]:
import sys
sys.path.append('/home/chenwe72/local/lib/python3.5/site-packages')
#sys.path

In [2]:
import pickle
import csv
from intermine.webservice import Service
import pandas as pd
import time
import numpy as np
import re

## 1. Query all C. reinhardtii v5.5 transcripts from Phytozome v12.1 using intermine.

In [13]:
#require intermine
t0= time.time()
service = Service("https://phytozome.jgi.doe.gov/phytomine/service")
query = service.new_query("Transcript")
query.add_view(
    "primaryIdentifier", "gene.primaryIdentifier", "gene.symbol",
    "gene.pathways.identifier", "organism.annotationVersion"
)
query.add_constraint("organism", "LOOKUP", "Chlamydomonas reinhardtii", code = "A")
query.outerjoin("gene.pathways")

<intermine.query.Query at 0x22d7694dd08>

In [14]:
#require csv

with open('../../data/intermediate_data_02/query.csv', mode='w') as gene_info:
    gene_info = csv.writer(gene_info, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    gene_info.writerow(['transcript_id', 'gene_id', 'gene_symbol', "pathway_id", "annotation_version"])
    #count = 0
    for row in query.rows():
        #count +=1
        gene_info.writerow([row["primaryIdentifier"], row["gene.primaryIdentifier"], row["gene.symbol"], \
        row["gene.pathways.identifier"], row["organism.annotationVersion"]])
        #if count >100 : break
        
t1= time.time()
print("Query took", t1-t0, "s")

Query took 8.237240552902222 s


## 2. Data wrangling

Reorganize query result to the follow dataframe structure:

| gene_id | transcript_id | gene_symbol | pathway_id    | annotation_version   |
|:-------:|:-------------:|:-----------:|:-------------:|:-------------:|
| `<str>`   | `<str>`         | `<str>`       | `[<str>,<str>,...]` |`<str>`       |


In [5]:
#This is faster than dictionary method
t0= time.time()

#Import query result
gene_info = pd.read_csv("../../data/intermediate_data_02/query.csv")

#Group by 'transcript_id', 'gene_id',and 'annotation_version', because each transcript has only one of these
grouped_by = ['transcript_id', 'gene_id', 'annotation_version']

#Check if any transcript has more than one gene_id's, annotation_versions, or gene_symbols
if len(gene_info.groupby(grouped_by)) == len(gene_info.groupby(['transcript_id'])):
    grouped = gene_info.groupby(grouped_by)
else: print("transcripts may have more than one gene_id's or annotation_versions")
    
#Initialize empty dictionary to convert to dataframe later
temp_dict= dict((key,[]) for key in gene_info.columns)


for name, group in grouped: 

    #each name is a unique combination of 'transcript_id', 'gene_id',and 'annotation_version'
    #group is the groupby dataframe of that combination
    
    for i in ['transcript_id', 'gene_id', 'annotation_version','gene_symbol']:
        
        ls = list(group.dropna(subset=[i])[i])
        if len(ls)==0:temp_dict[i].append(None)
        elif len(list(np.unique(ls)))==1: temp_dict[i].append(ls[0])
            
    i = 'pathway_id'  
    ls = list(group.dropna(subset=[i])[i])
    if len(ls) ==0:temp_dict[i].append(None)
    elif len(ls) >0: temp_dict[i].append(list(np.unique(ls)))
            
df = pd.DataFrame(temp_dict)
    
t1= time.time()
print("Time taken", t1-t0, "s")


Time taken 128.25716257095337 s


## 3. Query result summary

In [6]:
#export results
df.to_csv("../../data/intermediate_data_02/gene_info.csv", index = False, header = True)

print("Query returned", gene_info.shape[0], "results and", len(np.unique(gene_info.transcript_id)), "unique transcripts.")
#gene_info.pathway_id = gene_info.pathway_id.apply(lambda x: x.split(" ")[1])
#temp = gene_info.dropna(subset = ['pathway_id'])
print(len(np.unique(gene_info.dropna(subset=['pathway_id']).transcript_id)), "unique transcripts have matching pathways.")

Query returned 22282 results and 19526 unique transcripts.
1216 unique transcripts have matching pathways.


#### Check if all transcripts in int_data02 have matching transcripts in query result

In [7]:
#test cell
import math

def JumpSearch (lys, val):
    
    '''This function returns the position of the element in the list lys that contains the string pattern lys. If no match 
    
    Usage: lys = list of strings to search through; val = string pattern to search for
    
    Warning: This function only works when the beginning of the string matches val'''
    
    length = len(lys)
    jump = int(math.sqrt(length))
    left, right = 0, 0
    index_list = sorted([lys[left],val,lys[right]])
    p = re.compile(re.escape(val))
    while left < length and index_list.index(lys[left]) <= index_list.index(val):
        right = min(length - 1, left + jump)
        index_list = sorted([lys[left],val,lys[right]])
        if index_list.index(lys[left]) <= index_list.index(val) and index_list.index(lys[right]) >= index_list.index(val):
            break
        left += jump;
        
    if left >= length or index_list.index(lys[left]) > index_list.index(val):
        return -1
    right = min(length-1, right)
    i = left
    #index_list = sorted([lys[i],val])
    while i <= right:
        index_list = sorted([lys[i],val])
        #print(p.search(lys[i]), lys[i])
        if p.match(lys[i]):
            return i
        i += 1
      
    return -1

In [8]:
#require pickle
int_data02 = pd.read_pickle("../../data/intermediate_data_02/int_data02")
int_data02 = int_data02.rename(columns = {'gene_id':'transcript_id'})
int_data02.reset_index(drop=True) 

Unnamed: 0,num_detected,num_sampled,transcript_id,proportion,source
0,0,2,Cre01.g000017.t1.1,0.000,"[Bajhaiya_2016, Kwak_2017]"
1,1,1,Cre01.g000033.t1.1,0.125,[Bajhaiya_2016]
2,0,1,Cre01.g000050.t1.1,0.000,[Gargouri_2015]
3,0,1,Cre01.g000100.t1.1,0.000,[Bajhaiya_2016]
4,1,1,Cre01.g000150.t1.2,0.125,[Bajhaiya_2016]
...,...,...,...,...,...
17951,0,1,Cre50.g761447.t1.1,0.000,[Bajhaiya_2016]
17952,1,1,Cre50.g761497.t1.1,0.125,[Bajhaiya_2016]
17953,0,1,Cre51.g761547.t1.1,0.000,[Bajhaiya_2016]
17954,0,1,Cre51.g761597.t1.1,0.000,[Bajhaiya_2016]


In [9]:
cannot_find_matching_transcript = []
t0= time.time()
for gene in list(int_data02.transcript_id):
    
    q =  JumpSearch(np.unique(df.transcript_id), gene) 
    
    if q== -1: cannot_find_matching_transcript.append(gene)
        
    else: pass
    
t1 = time.time()
print(t1-t0, "s")

99.41212439537048 s


In [10]:
if len(cannot_find_matching_transcript)>0: 
    print(len(cannot_find_matching_transcript), cannot_find_matching_transcript)

else: print("All transcripts from ~/data/intermediate_data_02/int_data02 have matching transcripts in Phytozome v12.1 query.")

All transcripts from ~/data/intermediate_data_02/int_data02 have matching transcripts in Phytozome v12.1 query.


## 4. Merge int_data02 and gene_info.csv

In [11]:
merged = pd.merge(int_data02, df)
merged[:5]

Unnamed: 0,num_detected,num_sampled,transcript_id,proportion,source,gene_id,gene_symbol,pathway_id,annotation_version
0,0,2,Cre01.g000017.t1.1,0.0,"[Bajhaiya_2016, Kwak_2017]",Cre01.g000017,,,v5.5
1,1,1,Cre01.g000033.t1.1,0.125,[Bajhaiya_2016],Cre01.g000033,,,v5.5
2,0,1,Cre01.g000050.t1.1,0.0,[Gargouri_2015],Cre01.g000050,,,v5.5
3,0,1,Cre01.g000100.t1.1,0.0,[Bajhaiya_2016],Cre01.g000100,,,v5.5
4,1,1,Cre01.g000150.t1.2,0.125,[Bajhaiya_2016],Cre01.g000150,ZRT2,,v5.5


#### Export merged dataframe as `~/lipid_selection/data/intermediate_data_02/gene_info`

In [12]:
merged.to_csv("../../data/intermediate_data_02/merged.csv", index = False, header = True)