# Create a Chlamy annotation v5.3 version of `merged` in data/intermediate_data_02/

In [1]:
import gzip
import pandas as pd
import re
import time

## 1. Import relevant files

#### Transcipt id conversion key
Conversion key file path : `/scratch/research/projects/chlamydomonas/lipid_selection/data/gene_name_conversion/ChlamydomonasTranscriptNameConversionBetweenReleases.Mch12b.txt.gz`

In [2]:
file_path='/scratch/research/projects/chlamydomonas/lipid_selection/data/gene_name_conversion/ChlamydomonasTranscriptNameConversionBetweenReleases.Mch12b.txt.gz'

genome_version = '5.3.1'
with gzip.open(file_path, "rt", encoding="utf-8") as z:
    conversion_key = pd.read_csv(z, delimiter = r"\s+",skiprows = 1, na_values = "--")
    conversion_key.columns = ['5.5', '3.1', 'Genbank', '4', '4.3', 'u5', 'u9', '5.3.1']
    conversion_key = conversion_key[['5.5', genome_version ]].dropna()

#sort conversion_key by v5.5 transcript id's
conversion_key = conversion_key.sort_values(by = ['5.5'])

In [3]:
print(conversion_key[:10])

                  5.5               5.3.1
0  Cre01.g000017.t1.1               g2.t1
1  Cre01.g000033.t1.1               g3.t1
2  Cre01.g000050.t1.1  Cre01.g000050.t1.3
3  Cre01.g000100.t1.1  Cre01.g000100.t1.3
4  Cre01.g000150.t1.2  Cre01.g000150.t1.2
5  Cre01.g000200.t1.1  Cre01.g000200.t1.3
6  Cre01.g000250.t1.2  Cre01.g000250.t1.2
7  Cre01.g000300.t1.1  Cre01.g000300.t1.3
8  Cre01.g000350.t1.1  Cre01.g000350.t1.3
9  Cre01.g000400.t1.2  Cre01.g000400.t1.2


#### `data/intermediate_data_02/merged.csv`

In [4]:
merged = pd.read_csv("../../data/intermediate_data_02/merged.csv")
merged = merged.sort_values(by = ['transcript_id'])

In [5]:
print(merged.columns)

Index(['transcript_id', 'num_detected', 'num_sampled', 'proportion', 'source',
       'annotation_version', 'gene_id', 'gene_symbol', 'pathway_id'],
      dtype='object')


## 2.1 Binary search method

In [6]:
def BinarySearch(lys, val):
    
    '''requires re'''
    
    '''This function returns the position of the element in the list lys that contains the string pattern lys. If no match 
    
    Usage: lys = list of strings to search through; val = string pattern to search for
    
    Warning: This function only works when the beginning of the string matches val'''
        
    first = 0
    last = len(lys)-1
    index = -1
    
    
    p = re.compile(re.escape(val))
    
    
    while (first <= last) and (index == -1):
        mid = round((first+last)/2)
        
        index_list = sorted([lys[mid],val])
        
        if p.match(lys[mid]):
            index = mid
        else:
            
            if index_list.index(val)<index_list.index(lys[mid]):
                last = mid -1
                
            else:
                first = mid +1
                
    return index

In [7]:
#Test cell

#When matching string exists
test_list = list(conversion_key['5.5'][:10])
value = 'Cre01.g000150.t1.2'
print(BinarySearch(test_list, value))

#When matching string does not exist
value = 'Does not exist'
print(BinarySearch(test_list, value))

#When matching string is in the middle of an existing string
value = "g000250"
print(BinarySearch(test_list, value))

#When matching string starts an existing string
value = "Cre01.g000150"
print(BinarySearch(test_list, value))

4
-1
-1
4


In [8]:
t0 = time.time()
transcript_id_v5_3_1 =[]

for transcript in list(merged.transcript_id):    
    
    
    index = BinarySearch(list(conversion_key['5.5']), transcript)
                         
    if index == -1: transcript_id_v5_3_1.append(None)
                         
    else: transcript_id_v5_3_1.append(conversion_key.loc[index, '5.3.1'])
        
t1 = time.time()
print(t1-t0,"s")

14.791561603546143 s


In [9]:
print("Number of v5.5 transcripts that does not have a matching v5.3.1 transcript id: ", transcript_id_v5_3_1.count(None))

Number of v5.5 transcripts that does not have a matching v5.3.1 transcript id:  0


## 2.2 JumpSearch method

Search took ~18s. This is slower than the binary search method.

## 3. Create v5.3.1 annotation version of merged.csv

In [10]:
merged['transcript_id_v5.3.1']=  transcript_id_v5_3_1
merged.to_csv('../../data/intermediate_data_from_gff/merged_v5_3_1.csv', index=False) 