# Get v5.5 names for genes in selective studies

In [1]:
import sys
# Make the project's helper scripts and a user-local site-packages directory
# importable; both are appended to the end of the module search path, in this order.
for _extra_dir in ("../../scripts/",
                   '/home/chenwe72/local/lib/python3.5/site-packages'):
    sys.path.append(_extra_dir)

In [2]:
import pandas as pd
import numpy as np
import os
import re
import time
import gzip
import Search_algorithms as sag
from intermine.webservice import Service
import csv

### Find matching v5.5 transcript IDs based on transcript ID (`value`) and `genome_version`

In [3]:
# Load the key used to convert transcript IDs between annotation versions
# ("annotation version" and genome_version are used interchangeably in this notebook).
file_path='/scratch/research/projects/chlamydomonas/lipid_selection/data/gene_name_conversion/ChlamydomonasTranscriptNameConversionBetweenReleases.Mch12b.txt.gz'
with gzip.open(file_path, "rt", encoding="utf-8") as handle:
    # Whitespace-delimited table: the first line is skipped and "--" is read as missing.
    conversion_key = pd.read_csv(handle, sep=r"\s+", skiprows=1, na_values="--")
# Replace the parsed header with short labels (presumably one per annotation release).
conversion_key.columns = ['5.5', '3.1', 'Genbank', '4', '4.3', 'u5', 'u9', '5.3.1']

In [4]:
# Edit `value` and `genome_version` to find the v5.5 transcript IDs that match
# a transcript ID from another annotation version.
value = '484000'
genome_version = 'Genbank'
# Keep only rows that have an ID in both v5.5 and the chosen version.
version_pairs = conversion_key[['5.5', genome_version]].dropna()
# str.contains treats `value` as a pattern and matches it anywhere in the ID.
is_hit = version_pairs[genome_version].str.contains(value)
version_pairs[is_hit]

Unnamed: 0,5.5,Genbank


In [5]:
# Edit `value` to find the full v5.5 transcript ID(s) containing it.
value = '484000'
v55_ids = conversion_key[['5.5']].dropna()
# str.contains treats `value` as a pattern and matches it anywhere in the ID.
is_hit = v55_ids['5.5'].str.contains(value)
v55_ids[is_hit]

Unnamed: 0,5.5
12751,Cre12.g484000.t1.2


### Compile list of genes sampled in selective studies 

In [6]:
# Read the study summary sheet and keep the four columns that list gene symbols.
df1 = pd.read_excel("../../data/raw_data/Summary_of_primary_data.xlsx", 'Sheet1')
df = df1[['Genes_sampled','Genes_manipulated', 'Genes_with_effect',"Genes_no_effect"]]

# Gather every non-missing cell, column by column.
gene_cells = [cell for column in df.columns for cell in df[column].dropna()]

# Each cell holds symbols separated by "; "; split them and remove any remaining spaces.
gene_symbols = [
    entry.replace(" ","")
    for cell in gene_cells
    for entry in cell.split("; ")
]

# np.unique returns the distinct symbols in sorted order.
genes_to_lookup = list(np.unique(gene_symbols))

In [7]:
# Show the first ten symbols, then the total number of unique symbols, one per line.
print(genes_to_lookup[:10], len(genes_to_lookup), sep="\n")

['ACC', 'ACP', 'ACS2', 'ACT1', 'AMT1D', 'APL1', 'ARG7', 'ARG9', 'BCX1', 'CAH1']
77


### Query Chlamy genes using Phytozome12

In [8]:
# Query Chlamy genes from Phytozome12 (PhytoMine web service).
# For every symbol in genes_to_lookup, fetch the transcripts whose gene symbol
# contains it; symbols that return nothing are collected in no_matching_symbol.
service = Service("https://phytozome.jgi.doe.gov/phytomine/service")

query_results = {}
transcript_id = []       # primaryIdentifier of each returned transcript
symbol = []              # gene.symbol of each returned transcript (parallel to transcript_id)
no_matching_symbol = []  # looked-up symbols for which the query returned no rows

for gene_symbol in genes_to_lookup:
    query = service.new_query("Transcript")
    query.add_view("primaryIdentifier", "gene.symbol")
    query.add_constraint("organism", "LOOKUP", "Chlamydomonas reinhardtii", code = "A")
    # NOTE(review): CONTAINS presumably matches substrings, so one looked-up symbol
    # may return several transcripts/symbols — confirm this is intended.
    query.add_constraint("gene.symbol", "CONTAINS", gene_symbol, code = "B")
    if query.count() == 0:
        no_matching_symbol.append(gene_symbol)
        continue
    for row in query.rows():
        # Only two views are selected, so read them by view name; positional
        # indices 10 and 11 are out of range for a two-column result row.
        transcript_id.append(row["primaryIdentifier"])
        symbol.append(row["gene.symbol"])

In [9]:
# Assemble one table: matched transcripts first, then the unmatched symbols,
# which get the integer 0 as a placeholder transcript id.
placeholder_ids = [0] * len(no_matching_symbol)
query_results = {
    'transcript_id': transcript_id + placeholder_ids,
    'symbol': symbol + no_matching_symbol,
    'source': "Query",  # scalar: the same value is used for every row
}
df = pd.DataFrame(query_results)

In [10]:
# Report how many symbols had no match and how many rows the result table holds.
# Note: len(df) counts rows (one per returned transcript plus one per unmatched symbol).
unmatched_count = len(no_matching_symbol)
row_count = len(df)
print("Number of genes without matching symbol on Phytozome12: ", unmatched_count)
print("Total number of genes queried: ", row_count)

Number of genes without matching symbol on Phytozome12:  49
Total number of genes queried:  78


In [12]:
# Save the query results (without the row index) for the merge step below.
query_key_path = '05_Get_v5.5names_output/Query_sampled_genes_key.csv'
df.to_csv(query_key_path, index=False)

### Merge the new query results (Query_sampled_genes_key.csv) with the old key (20200716_sampled_genes_key.csv) into sampled_genes_key.csv

In [13]:
# Reload the freshly written query key and drop any row with a missing field.
new_key = pd.read_csv('05_Get_v5.5names_output/Query_sampled_genes_key.csv')
new_key = new_key.dropna()
# Load the previous version of the key as-is.
old_key = pd.read_csv('20200716_sampled_genes_key.csv')


In [14]:
# Turn the '0' placeholder transcript ids into missing values.
# np.nan is used instead of np.NaN because that alias was removed in NumPy 2.0.
new_key['transcript_id'] = new_key['transcript_id'].replace('0', np.nan)
print(new_key.head())

  source symbol       transcript_id
0  Query   ACP2  Cre13.g577100.t1.2
1  Query   ARG7  Cre01.g021251.t1.1
2  Query   BCX1  Cre12.g484000.t1.2
3  Query   CTR2  Cre10.g434350.t1.1
4  Query  DGAT1  Cre01.g045903.t1.1


In [15]:
# Relabel the old key's columns by position so they line up with new_key.
# NOTE(review): assumes the old file has exactly these three columns in this order — confirm.
old_key.columns = ['symbol', 'transcript_id', 'source']
# Turn the '0' placeholder transcript ids into missing values.
# np.nan is used instead of np.NaN because that alias was removed in NumPy 2.0.
old_key['transcript_id'] = old_key['transcript_id'].replace('0', np.nan)
print(old_key.head())

  symbol       transcript_id source
0   ACP2  Cre13.g577100.t1.2  Query
1   ARG7  Cre01.g021251.t1.1  Query
2   BCX1  Cre12.g484000.t1.2  Query
3   CTR2  Cre10.g434350.t1.1  Query
4  DGAT1  Cre01.g045903.t1.1  Query


In [16]:
# Outer-join the two keys on (symbol, source) so rows present in only one of them
# are kept; the clashing transcript_id columns come back as transcript_id_x (new)
# and transcript_id_y (old).
updated_key = new_key.merge(old_key, how='outer', on=['symbol', 'source'])
print(updated_key.head())

  source symbol     transcript_id_x     transcript_id_y
0  Query   ACP2  Cre13.g577100.t1.2  Cre13.g577100.t1.2
1  Query   ARG7  Cre01.g021251.t1.1  Cre01.g021251.t1.1
2  Query   BCX1  Cre12.g484000.t1.2  Cre12.g484000.t1.2
3  Query   CTR2  Cre10.g434350.t1.1  Cre10.g434350.t1.1
4  Query  DGAT1  Cre01.g045903.t1.1  Cre01.g045903.t1.1


In [17]:
def get_transcript_id(x,y):
    """Pick the transcript id to keep from a pair of candidates.

    Parameters
    ----------
    x : preferred candidate; returned whenever it is a string.
    y : fallback candidate; returned when ``x`` is not a string but ``y`` is.

    Returns
    -------
    str or float
        ``x`` if it is a string, otherwise ``y`` if it is a string,
        otherwise ``np.nan`` (neither candidate holds an id).
    """
    if isinstance(x, str):
        return x
    if isinstance(y, str):
        return y
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    return np.nan
# Resolve each row's transcript id, preferring the new key's value (transcript_id_x)
# over the old key's (transcript_id_y).
updated_key['transcript_id'] = [
    get_transcript_id(new_id, old_id)
    for new_id, old_id in zip(updated_key.transcript_id_x, updated_key.transcript_id_y)
]

In [18]:
# Drop the per-source id columns and sort by transcript id in descending order,
# so that keeping the first row per symbol keeps the highest-sorting id.
ranked_key = (
    updated_key
    .drop(['transcript_id_x','transcript_id_y'], axis = 1)
    .sort_values(by=['transcript_id'], ascending=False)
)
# One row per symbol: the first occurrence after sorting wins.
updated_key = ranked_key.drop_duplicates(subset='symbol', keep='first')
print(updated_key.head())
updated_key.to_csv('05_Get_v5.5names_output/sampled_genes_key.csv', index=False)

                                                source symbol  \
118  Sim, L., Beeren, S. R., Findinier, J., Dauvill...   STA8   
85                                         Phytozome12   APL1   
99                                         Phytozome12  PEPC1   
16                                               Query   NRR1   
22                                               Query   SQD1   

          transcript_id  
118  Cre17.g698850.t1.1  
85   Cre16.g683450.t1.1  
99        Cre16.g673852  
16   Cre16.g673250.t1.1  
22   Cre16.g656400.t1.2  
