# Extract candidate gene info from *Chlamy* gff

gff file path: `scratch/research/references/chlamydomonas/5.3_chlamy_w_organelles_mt_minus/annotation/Creinhardtii_v5.3_223_gene.gff3.gz `

Target data structure:


In [30]:
import gzip
import pandas as pd
import re
import time
import pickle

In [31]:
merged = pd.read_csv("../../data/intermediate_data_from_gff/merged_v5_3_1.csv")
gff_file_path='/scratch/research/references/chlamydomonas/5.3_chlamy_w_organelles_mt_minus/annotation/Creinhardtii_v5.3_223_gene.gff3.gz'

In [None]:
def BinarySearch(lys, val):
    
    '''requires re'''
    
    '''This function returns the position of the element in the list lys that contains the string pattern lys. If no match 
    Usage: lys = list of strings to search through; val = string pattern to search for
    Warning: This function only works when the beginning of the string matches val'''
        
    first = 0
    last = len(lys)-1
    index = -1
    
    p = re.compile(re.escape(val))
    
    while (first <= last) and (index == -1):
        
        mid = round((first+last)/2)
        index_list = sorted([lys[mid],val])
        
        if p.match(lys[mid]): index = mid
        else:
            
            if index_list.index(val)<index_list.index(lys[mid]): last = mid -1
            else: first = mid +1
                
    return index

In [4]:
def tryextract(i, pattern):
    
    '''This function takes in a string and returns the first matching group in the search pattern'''
    
    try: 
        m = re.search(pattern, i).group(1)
        return(m)
    
    except AttributeError:
        return(None)

In [32]:
#Import gff as working dataframe. Extract Name and PAC id from the column 'attribute' as separate columns
with gzip.open(gff_file_path, "rt", encoding="utf-8") as z:
    
    df = pd.read_csv(z,delimiter=r"\s+",skiprows=1,header=None)
    df.columns = ['chromosome', 'source', 'feature', 'start', 'end', 'score', 'strand', 'phase', 'attributes']
    df['Name'] = df.attributes.apply(lambda x: tryextract(x, r"Name=(.+);pacid"))
    df['ID'] = df.attributes.apply(lambda x: tryextract(x, r"ID=(PAC:[0-9]+)"))
    

In [45]:
#Create mRNA gff dataframe subset to extract PAC_id 
subset = df[df.feature =="mRNA"].sort_values(by="Name").reset_index()
search_list = list(subset.Name)

In [46]:
print(subset[:10])

   index    chromosome        source feature  start    end score strand phase  \
0      1  chromosome_1  phytozome8_0    mRNA  24026  30617     .      +     .   
1      9  chromosome_1  phytozome8_0    mRNA  30776  41037     .      +     .   
2     26  chromosome_1  phytozome8_0    mRNA  41896  46547     .      +     .   
3     35  chromosome_1  phytozome8_0    mRNA  46553  51525     .      -     .   
4     51  chromosome_1  phytozome8_0    mRNA  51590  54035     .      -     .   
5     59  chromosome_1  phytozome8_0    mRNA  54136  57919     .      +     .   
6     71  chromosome_1  phytozome8_0    mRNA  57977  60685     .      -     .   
7     81  chromosome_1  phytozome8_0    mRNA  60740  64955     .      -     .   
8     96  chromosome_1  phytozome8_0    mRNA  65056  70197     .      +     .   
9    109  chromosome_1  phytozome8_0    mRNA  70246  77235     .      +     .   

                                          attributes                Name  \
0  ID=PAC:26903339;Name=Cre01.g0

In [47]:
#Order list of matching PAC id's to 'transcript_id_v5.3.1'
t0 = time.time()
pacid_list = []

for row in range(0,len(merged)):
    transcript_id = merged.loc[row,'transcript_id_v5.3.1']
    pacid = None
    
    index = BinarySearch(search_list, transcript_id)
    
    if index > -1: 
        pacid = subset.loc[index, "ID"]
                    
    pacid_list.append(pacid)

t1 = time.time()
print("Time taken", t1-t0, "s")

Time taken 10.309276580810547 s


In [48]:
#Add column of PAC id's associated with each 'transcript_id_v5.3.1' in merged dataframe
merged['PAC_ID'] = pacid_list

In [9]:
#define candidate gene conditions
A = (merged.num_detected>=2)
B = (merged.num_sampled>=2)
C = merged.pathway_id.str.contains("TRIGLSYN-PWY")==True
D = merged.pathway_id.str.contains("PWY-4381")==True
F = (merged.num_detected>=1)

condition = (A & (C | D))

In [52]:
gene_set= []
features_condition = ['five_prime_UTR', 'CDS', 'three_prime_UTR']
#keep gff rows that contain CDS positions, remove gff rows that are missing PAC_ID's
filtered_df = df[df.feature.isin(features_condition)].dropna(subset = ['ID'])

#loop through sampled transcipts with PAC_ID's; transcript filtered by condition variable
for transcript in list(merged[condition].PAC_ID): 
    
    temp = filtered_df[filtered_df.ID==transcript]
    
    #addd CDS coordinates as tuple (chromosome, start, end) to gene_set
    for i in list(temp.index):
        gene_set.append((temp.chromosome[i], temp.start[i], temp.end[i]))

In [53]:
with open('../../data/intermediate_data_from_gff/gene_set.pickle', 'wb') as f:
    pickle.dump(gene_set, f)

In [12]:
print(list(merged['PAC_ID']).count(None),"transcripts out of", len(merged), "transcripts did not have a matching PAC_ID")

176 transcripts out of 17956 transcripts did not have a matching PAC_ID


In [13]:
merged[:5]

Unnamed: 0,transcript_id,num_detected,num_sampled,proportion,source,annotation_version,gene_id,gene_symbol,pathway_id,transcript_id_v5.3.1,PAC_ID
0,Cre01.g000017.t1.1,0,2,0.0,"['Bajhaiya_2016', 'Kwak_2017']",v5.5,Cre01.g000017,,,g2.t1,PAC:26903746
1,Cre01.g000033.t1.1,1,1,0.125,['Bajhaiya_2016'],v5.5,Cre01.g000033,,,g3.t1,PAC:26903463
2,Cre01.g000050.t1.1,0,1,0.0,['Gargouri_2015'],v5.5,Cre01.g000050,,,Cre01.g000050.t1.3,PAC:26903339
3,Cre01.g000100.t1.1,0,1,0.0,['Bajhaiya_2016'],v5.5,Cre01.g000100,,,Cre01.g000100.t1.3,PAC:26903974
4,Cre01.g000150.t1.2,1,1,0.125,['Bajhaiya_2016'],v5.5,Cre01.g000150,ZRT2,,Cre01.g000150.t1.2,PAC:26903809


In [51]:
df[:10]

Unnamed: 0,chromosome,source,feature,start,end,score,strand,phase,attributes,Name,ID
0,chromosome_1,phytozome8_0,gene,24026,30617,.,+,.,ID=Cre01.g000050;Name=Cre01.g000050,,
1,chromosome_1,phytozome8_0,mRNA,24026,30617,.,+,.,ID=PAC:26903339;Name=Cre01.g000050.t1.3;pacid=...,Cre01.g000050.t1.3,PAC:26903339
2,chromosome_1,phytozome8_0,five_prime_UTR,24026,24125,.,+,.,ID=PAC:26903339.five_prime_UTR.1;Parent=PAC:26...,,PAC:26903339
3,chromosome_1,phytozome8_0,CDS,24126,28105,.,+,0,ID=PAC:26903339.CDS.1;Parent=PAC:26903339;paci...,,PAC:26903339
4,chromosome_1,phytozome8_0,CDS,28291,28644,.,+,1,ID=PAC:26903339.CDS.2;Parent=PAC:26903339;paci...,,PAC:26903339
5,chromosome_1,phytozome8_0,CDS,28842,29091,.,+,1,ID=PAC:26903339.CDS.3;Parent=PAC:26903339;paci...,,PAC:26903339
6,chromosome_1,phytozome8_0,CDS,29347,29577,.,+,0,ID=PAC:26903339.CDS.4;Parent=PAC:26903339;paci...,,PAC:26903339
7,chromosome_1,phytozome8_0,three_prime_UTR,29578,30617,.,+,.,ID=PAC:26903339.three_prime_UTR.1;Parent=PAC:2...,,PAC:26903339
8,chromosome_1,phytozome8_0,gene,30776,41037,.,+,.,ID=Cre01.g000100;Name=Cre01.g000100,,
9,chromosome_1,phytozome8_0,mRNA,30776,41037,.,+,.,ID=PAC:26903974;Name=Cre01.g000100.t1.3;pacid=...,Cre01.g000100.t1.3,PAC:26903974


In [28]:
print(filtered_df.attributes)

3         ID=PAC:26903339.CDS.1;Parent=PAC:26903339;paci...
4         ID=PAC:26903339.CDS.2;Parent=PAC:26903339;paci...
5         ID=PAC:26903339.CDS.3;Parent=PAC:26903339;paci...
6         ID=PAC:26903339.CDS.4;Parent=PAC:26903339;paci...
11        ID=PAC:26903974.CDS.1;Parent=PAC:26903974;paci...
12        ID=PAC:26903974.CDS.2;Parent=PAC:26903974;paci...
13        ID=PAC:26903974.CDS.3;Parent=PAC:26903974;paci...
14        ID=PAC:26903974.CDS.4;Parent=PAC:26903974;paci...
15        ID=PAC:26903974.CDS.5;Parent=PAC:26903974;paci...
16        ID=PAC:26903974.CDS.6;Parent=PAC:26903974;paci...
17        ID=PAC:26903974.CDS.7;Parent=PAC:26903974;paci...
18        ID=PAC:26903974.CDS.8;Parent=PAC:26903974;paci...
19        ID=PAC:26903974.CDS.9;Parent=PAC:26903974;paci...
20        ID=PAC:26903974.CDS.10;Parent=PAC:26903974;pac...
21        ID=PAC:26903974.CDS.11;Parent=PAC:26903974;pac...
22        ID=PAC:26903974.CDS.12;Parent=PAC:26903974;pac...
23        ID=PAC:26903974.CDS.13;Parent=