# Extract candidate gene info from *Chlamy* gff

GFF file path: `/scratch/research/references/chlamydomonas/5.3_chlamy_w_organelles_mt_minus/annotation/Creinhardtii_v5.3_223_gene.gff3.gz`

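Target data structure: a list of `(chromosome, start, end)` tuples, one per GFF record matched to a candidate gene, saved as `gene_set.pickle`.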


In [90]:
import gzip
import pandas as pd
import re
import time
import pickle

In [2]:
# Per-transcript table; later cells read its 'transcript_id_v5.3.1',
# num_detected, num_sampled and pathway_id columns.
merged = pd.read_csv("../../data/intermediate_data_from_gff/merged_v5_3_1.csv")
# Gzip-compressed GFF3 annotation that is parsed further down.
gff_file_path='/scratch/research/references/chlamydomonas/5.3_chlamy_w_organelles_mt_minus/annotation/Creinhardtii_v5.3_223_gene.gff3.gz'


In [3]:
def BinarySearch(lys, val):
    """Return the index of an element of ``lys`` that starts with ``val``.

    Usage: lys = list of strings to search through, sorted in ascending
    order; val = string prefix to search for.

    Returns the position of a matching element, or -1 if no element of
    ``lys`` starts with ``val``. When several elements share the prefix,
    the one the bisection reaches first is returned, which is not
    necessarily the one with the lowest index.

    Warning: This function only works when the beginning of the string
    matches val; an occurrence of val further inside an element is not
    found.
    """
    first = 0
    last = len(lys) - 1

    while first <= last:
        # The midpoint formula is deliberately left unchanged so that the
        # element picked among several prefix matches stays the same.
        mid = round((first + last) / 2)
        candidate = lys[mid]

        if candidate.startswith(val):
            return mid

        # No prefix match here, so the two strings differ; bisect on their
        # lexicographic order.
        if val < candidate:
            last = mid - 1
        else:
            first = mid + 1

    return -1

In [4]:
def tryextract(i, pattern):
    """Return capture group 1 of the first match of ``pattern`` in ``i``.

    Returns None when ``pattern`` does not match anywhere in ``i``.
    """
    found = re.search(pattern, i)
    if found is None:
        return None
    return found.group(1)

In [62]:

# Load the annotation and keep only the mRNA records, with the transcript
# name and PAC ID pulled out of the attributes column.
with gzip.open(gff_file_path, "rt", encoding="utf-8") as z:
    # GFF3 columns are separated by tabs. Splitting on arbitrary whitespace
    # shifts or breaks the columns as soon as an attribute value contains a
    # space, so the tab is given explicitly.
    # skiprows=1 drops the first line of the file (presumably the
    # "##gff-version" header; confirm against the file).
    df = pd.read_csv(z, sep="\t", skiprows=1, header=None)

# Only the read needs the open handle; the rest works on the in-memory table.
df.columns = ['chromosome', 'source', 'feature', 'start', 'end', 'score', 'strand', 'phase', 'attributes']
df['Name'] = df.attributes.apply(lambda x: tryextract(x, r"Name=(.+);pacid"))
# Matches at the start of "ID=PAC:<digits>...", so sub-feature records such
# as "ID=PAC:<digits>.CDS.1" yield the PAC ID of their transcript.
df['ID'] = df.attributes.apply(lambda x: tryextract(x, r"ID=(PAC:[0-9]+)"))
df = df[df.feature == 'mRNA']
    

In [6]:
# mRNA records ordered by transcript name. reset_index renumbers the rows
# 0..n-1, so a position in search_list is also a row label in subset.
subset = df.loc[df["feature"] == "mRNA"].sort_values(by="Name").reset_index()
# Plain Python list of the sorted names, used as the search input.
search_list = list(subset["Name"])

In [7]:
# Look up the PAC ID of every v5.3.1 transcript ID in `merged`.
# pacid_list ends up with one entry per row of `merged`, in row order;
# transcripts without a match in the GFF get None.
t0 = time.time()
pacid_list = []

# Pull the ID column out once so each hit is a plain list lookup.
id_list = list(subset["ID"])

# Iterate the column itself rather than range(len(merged)) with .loc: this
# is positional, so it does not rely on `merged` having a 0..n-1 index, and
# it avoids a label lookup on every row.
for transcript_id in merged['transcript_id_v5.3.1']:
    index = BinarySearch(search_list, transcript_id)
    pacid = id_list[index] if index > -1 else None
    pacid_list.append(pacid)

t1 = time.time()
print("Time taken", t1-t0, "s")

Time taken 11.651297092437744 s


In [8]:
# Attach the looked-up PAC IDs as a new column; pacid_list holds one entry
# per row of `merged`, in row order.
merged['PAC_ID'] = pacid_list

In [47]:
#define candidate gene conditions
A = (merged.num_detected>=2)
B = (merged.num_sampled>=2)
C = merged.pathway_id.str.contains("TRIGLSYN-PWY")==True
D = merged.pathway_id.str.contains("PWY-4381")==True
F = (merged.num_detected>=1)

condition = (A & (C | D))

In [84]:
# Collect (chromosome, start, end) for every GFF row whose PAC ID belongs to
# a candidate transcript.
filtered_df = df.dropna()  # drop rows with a missing value in any column
gene_set = []
for transcript in merged.loc[condition, "PAC_ID"]:
    temp = filtered_df[filtered_df["ID"] == transcript]
    for i in temp.index:
        coordinates = (temp.chromosome[i], temp.start[i], temp.end[i])
        gene_set.append(coordinates)

In [91]:
# Write gene_set to disk as a pickle file.
with open('../../data/intermediate_data_from_gff/gene_set.pickle', 'wb') as f:
    pickle.dump(gene_set, f)

In [75]:
# NOTE(review): `a` is not assigned in any visible cell; presumably a
# leftover from interactive debugging. Confirm, then define or remove it.
a.chromosome

13446    chromosome_1
Name: chromosome, dtype: object

In [11]:
# Report how many entries of the PAC_ID column are None, i.e. transcripts
# for which the lookup found no match.
print(list(merged.PAC_ID).count(None),"transcripts out of", len(merged), "transcripts did not have a matching PAC ID")

176 transcripts out of 17956 transcripts did not have a matching PAC ID


In [16]:
# Preview the first ten rows of the parsed GFF table.
# NOTE(review): the saved output lists gene, UTR and CDS rows, so it was
# presumably produced before `df` was restricted to mRNA records; confirm.
df[:10]


Unnamed: 0,chromosome,source,feature,start,end,score,strand,phase,attributes,Name,ID
0,chromosome_1,phytozome8_0,gene,24026,30617,.,+,.,ID=Cre01.g000050;Name=Cre01.g000050,,
1,chromosome_1,phytozome8_0,mRNA,24026,30617,.,+,.,ID=PAC:26903339;Name=Cre01.g000050.t1.3;pacid=...,Cre01.g000050.t1.3,PAC:26903339
2,chromosome_1,phytozome8_0,five_prime_UTR,24026,24125,.,+,.,ID=PAC:26903339.five_prime_UTR.1;Parent=PAC:26...,,PAC:26903339
3,chromosome_1,phytozome8_0,CDS,24126,28105,.,+,0,ID=PAC:26903339.CDS.1;Parent=PAC:26903339;paci...,,PAC:26903339
4,chromosome_1,phytozome8_0,CDS,28291,28644,.,+,1,ID=PAC:26903339.CDS.2;Parent=PAC:26903339;paci...,,PAC:26903339
5,chromosome_1,phytozome8_0,CDS,28842,29091,.,+,1,ID=PAC:26903339.CDS.3;Parent=PAC:26903339;paci...,,PAC:26903339
6,chromosome_1,phytozome8_0,CDS,29347,29577,.,+,0,ID=PAC:26903339.CDS.4;Parent=PAC:26903339;paci...,,PAC:26903339
7,chromosome_1,phytozome8_0,three_prime_UTR,29578,30617,.,+,.,ID=PAC:26903339.three_prime_UTR.1;Parent=PAC:2...,,PAC:26903339
8,chromosome_1,phytozome8_0,gene,30776,41037,.,+,.,ID=Cre01.g000100;Name=Cre01.g000100,,
9,chromosome_1,phytozome8_0,mRNA,30776,41037,.,+,.,ID=PAC:26903974;Name=Cre01.g000100.t1.3;pacid=...,Cre01.g000100.t1.3,PAC:26903974


In [17]:
# Preview the first ten rows of `merged`, including the PAC_ID column.
merged[:10]

Unnamed: 0,transcript_id,num_detected,num_sampled,proportion,source,annotation_version,gene_id,gene_symbol,pathway_id,transcript_id_v5.3.1,PAC_ID
0,Cre01.g000017.t1.1,0,2,0.0,"['Bajhaiya_2016', 'Kwak_2017']",v5.5,Cre01.g000017,,,g2.t1,PAC:26903746
1,Cre01.g000033.t1.1,1,1,0.125,['Bajhaiya_2016'],v5.5,Cre01.g000033,,,g3.t1,PAC:26903463
2,Cre01.g000050.t1.1,0,1,0.0,['Gargouri_2015'],v5.5,Cre01.g000050,,,Cre01.g000050.t1.3,PAC:26903339
3,Cre01.g000100.t1.1,0,1,0.0,['Bajhaiya_2016'],v5.5,Cre01.g000100,,,Cre01.g000100.t1.3,PAC:26903974
4,Cre01.g000150.t1.2,1,1,0.125,['Bajhaiya_2016'],v5.5,Cre01.g000150,ZRT2,,Cre01.g000150.t1.2,PAC:26903809
5,Cre01.g000200.t1.1,0,1,0.0,['Bajhaiya_2016'],v5.5,Cre01.g000200,,,Cre01.g000200.t1.3,PAC:26903011
6,Cre01.g000250.t1.2,0,1,0.0,['Bajhaiya_2016'],v5.5,Cre01.g000250,,,Cre01.g000250.t1.2,PAC:26903028
7,Cre01.g000300.t1.1,1,1,0.125,['Bajhaiya_2016'],v5.5,Cre01.g000300,,"['Creinhardtii PWY-5667', 'Creinhardtii PWY-74...",Cre01.g000300.t1.3,PAC:26903629
8,Cre01.g000350.t1.1,1,2,0.125,"['Hemme_2014', 'Bajhaiya_2016']",v5.5,Cre01.g000350,,,Cre01.g000350.t1.3,PAC:26903061
9,Cre01.g000400.t1.2,0,1,0.0,['Bajhaiya_2016'],v5.5,Cre01.g000400,,,Cre01.g000400.t1.2,PAC:26903690


In [24]:
# NOTE(review): this cell fails with the AttributeError recorded below it.
# `gff_ext_dict` is not defined in any visible cell, and the error shows that
# iterating gff_ext_dict[key] yields str objects rather than records with
# .chromosome/.start/.end attributes. Presumably an abandoned attempt at
# building gene_set; confirm, then fix or delete.
count = 0
gene_set = []
for key in gff_ext_dict.keys():
    #print(key, value)
    for row in gff_ext_dict[key]:
        tp = (row.chromosome, row.start, row.end)
        gene_set.append(tp)
    # NOTE(review): count is never incremented, so this break can never fire.
    if count >10: break

AttributeError: 'str' object has no attribute 'chromosome'