# Create a Chlamy annotation v5.3.1 version of `sampled_genes.pk`
output: `../../data/intermediate_data_02/sampled_genes.csv`

In [1]:
import gzip
import pandas as pd
import re
import time
import sys
sys.path.append('../../scripts/')
import Search_algorithms as sag

## 1. Import relevant files

#### Transcipt id conversion key
Conversion key file path : `/scratch/research/projects/chlamydomonas/lipid_selection/data/gene_name_conversion/ChlamydomonasTranscriptNameConversionBetweenReleases.Mch12b.txt.gz`

In [2]:
file_path='/scratch/research/projects/chlamydomonas/lipid_selection/data/gene_name_conversion/ChlamydomonasTranscriptNameConversionBetweenReleases.Mch12b.txt.gz'

genome_version = '5.3.1'
with gzip.open(file_path, "rt", encoding="utf-8") as z:
    conversion_key = pd.read_csv(z, delimiter = r"\s+",skiprows = 1, na_values = "--")
    conversion_key.columns = ['5.5', '3.1', 'Genbank', '4', '4.3', 'u5', 'u9', '5.3.1']
    conversion_key = conversion_key[['5.5', genome_version ]].dropna()

#sort conversion_key by v5.5 transcript id's
conversion_key = conversion_key.sort_values(by = ['5.5'])

In [3]:
print(conversion_key[:10])

                  5.5               5.3.1
0  Cre01.g000017.t1.1               g2.t1
1  Cre01.g000033.t1.1               g3.t1
2  Cre01.g000050.t1.1  Cre01.g000050.t1.3
3  Cre01.g000100.t1.1  Cre01.g000100.t1.3
4  Cre01.g000150.t1.2  Cre01.g000150.t1.2
5  Cre01.g000200.t1.1  Cre01.g000200.t1.3
6  Cre01.g000250.t1.2  Cre01.g000250.t1.2
7  Cre01.g000300.t1.1  Cre01.g000300.t1.3
8  Cre01.g000350.t1.1  Cre01.g000350.t1.3
9  Cre01.g000400.t1.2  Cre01.g000400.t1.2


#### `data/intermediate_data_02/merged.csv`

In [4]:
sampled_genes = pd.read_pickle("../../data/intermediate_data_02/sampled_genes.pk")
sampled_genes = sampled_genes.sort_values(by = ['transcript_id'])

In [5]:
print(sampled_genes.columns)

Index(['num_detected', 'num_manipulated', 'num_sampled', 'source',
       'transcript_id', 'annotation_version', 'gene_id', 'gene_symbol',
       'pathway_id'],
      dtype='object')


## 2.1 Binary search method

In [6]:
#Test cell for BinarySearch

#When matching string exists
test_list = list(conversion_key['5.5'][:10])
value = 'Cre01.g000150.t1.2'
print(sag.BinarySearch(test_list, value))

#When matching string does not exist
value = 'Does not exist'
print(sag.BinarySearch(test_list, value))

#When matching string is in the middle of an existing string
value = "g000250"
print(sag.BinarySearch(test_list, value))

#When matching string starts an existing string
value = "Cre01.g000150"
print(sag.BinarySearch(test_list, value))

4
-1
-1
4


In [7]:
t0 = time.time()
transcript_id_v5_3_1 =[]

for transcript in list(sampled_genes.transcript_id):    
    
    
    index = sag.BinarySearch(list(conversion_key['5.5']), transcript)
                         
    if index == -1: transcript_id_v5_3_1.append(None)
                         
    else: transcript_id_v5_3_1.append(conversion_key.loc[index, '5.3.1'])
        
t1 = time.time()
print(t1-t0,"s")

17.55692958831787 s


In [8]:
print("Number of v5.5 transcripts that does not have a matching v5.3.1 transcript id: ", transcript_id_v5_3_1.count(None))

Number of v5.5 transcripts that does not have a matching v5.3.1 transcript id:  0


## 2.2 JumpSearch method

Search took ~21.4s. This is slower than the binary search method.

## 3. Create v5.3.1 annotation version of merged.csv

In [9]:
sampled_genes['transcript_id_v5.3.1']=  transcript_id_v5_3_1
sampled_genes.to_csv('../../data/intermediate_data_02/sampled_genes.csv', index=False) 