# Generate SFS_and_divergence.pk

- Use divergence data provided by Rob
- Merge the divergence data with the SFS table in `reduced_SFS.pk` to generate `SFS_and_divergence.pk`

In [1]:
import pandas as pd
import pickle

#### Notes from Rob
The table of divergence can be found in this file:
`/scratch/research/projects/chlamydomonas/mutability/analysis/calculate_divergence/Cinc/K4K0.out.txt`

- `transcript_ID` = C. reinhardtii transcript ID 
- `Cincerta_transcript_ID` = C. incerta transcript ID 
- `aln_len` = length of aligned sequences
- `Cr_length` = length of the unaligned C. reinhardtii transcript
- `Ci_length` = length of the unaligned C. incerta transcript
- `diff_percent` = proportion of sites that differed in the best BLAST hit between these sequences
- `diffs0` = Number of differences at 0-fold degenerate sites
- `sites0` = Number of 0-fold degenerate sites
- `diffs4` = Number of differences at 4-fold degenerate sites
- `sites4` = Number of 4-fold degenerate sites
- `k0` = diffs0/sites0
- `k4` = diffs4/sites4


Those are the columns in the table.

The file `/scratch/research/references/chlamydomonas/5.3_chlamy_w_organelles_mt_minus/annotation/concatenated_GFF/ness_id.translation_table.txt` maps the transcript IDs above to gene IDs. The first column of the translation table is `ness_id`, which corresponds to `transcript_ID` in the K4K0 table.

- Use ness_id  (first column) and CDS (6th column) to convert to 5.3 gene ids
- The diffs and sites columns are the data you need for DFE alpha

**IMPORTANT CAVEAT** - some of the alignments are bound to be unreliable - you should probably remove any alignment where `k4` > 0.6 or so

In [2]:
#Import divergence and translation tables
f = "/scratch/research/projects/chlamydomonas/mutability/analysis/calculate_divergence/Cinc/K4K0.out.txt"
divergence = pd.read_csv(f, sep = "\t")
d1=len(divergence)
print("Num rows in divergence:", len(divergence))

#Remove alignment where K4>0.6
divergence = divergence[divergence.k4 <= 0.6]
print("Num rows in divergence after filter:", len(divergence))
print("Num rows removed in divergence due to filter:", d1-len(divergence))

f = "/scratch/research/references/chlamydomonas/5.3_chlamy_w_organelles_mt_minus/annotation/concatenated_GFF/ness_id.translation_table.txt"
ness_id_table = pd.read_csv(f,sep = "\t")
ness_id_table.ness_id = ness_id_table.index
ness_id_table['transcript_ID']= ness_id_table.index
ness_id_table.index = range(len(ness_id_table))


divergence.transcript_ID=divergence.transcript_ID.apply(lambda x: str(x))
ness_id_table.transcript_ID=ness_id_table.transcript_ID.apply(lambda x: str(x))

Num rows in divergence: 12294
Num rows in divergence after filter: 12273
Num rows removed in divergence due to filter: 21


In [3]:
# Preview the first five rows of the divergence table.
divergence.head(5)

Unnamed: 0,transcript_ID,Cincerta_transcript_ID,aln_len,Cr_length,Ci_length,diff_percent,diffs0,sites0,diffs4,sites4,k0,k4
0,26888200,g6935.t1,4230,4230,4230,0.05,24,2777,113,699,0.008642,0.16166
1,26904954,g13204.t1,1041,1041,1032,0.09,28,655,38,217,0.042748,0.175115
2,26894252,g11333.t1,4617,4323,4521,0.15,231,2638,311,950,0.087566,0.327368
3,26891814,g15729.t1,5127,4308,4932,0.22,450,2626,325,930,0.171363,0.349462
4,26901315,g5327.t1,2289,2289,2289,0.05,3,1483,72,412,0.002023,0.174757


In [4]:
# Preview the first five rows of the translation table.
ness_id_table.iloc[:5]

Unnamed: 0,ness_id,five_prime_UTR,three_prime_UTR,exon,mRNA,CDS,tRNA,rRNA,misc_RNA,transcript_ID
0,26903746,PAC:26903746,PAC:26903746,PAC:26903746,g2,PAC:26903746,.,.,.,26903746
1,26903463,PAC:26903463,PAC:26903463,PAC:26903463,g3,PAC:26903463,.,.,.,26903463
2,26903339,PAC:26903339,PAC:26903339,PAC:26903339,Cre01.g000050,PAC:26903339,.,.,.,26903339
3,26903974,PAC:26903974,PAC:26903974,PAC:26903974,Cre01.g000100,PAC:26903974,.,.,.,26903974
4,26903809,PAC:26903809,PAC:26903809,PAC:26903809,Cre01.g000150,PAC:26903809,.,.,.,26903809


In [5]:
# Attach the `exon` and `mRNA` columns of the translation table to each
# divergence row, joining on the shared transcript_ID key (inner join).
temp = divergence.merge(
    ness_id_table[['exon', 'mRNA', 'transcript_ID']],
    on='transcript_ID',
    how='inner',
)

In [6]:
# Sanity check on the merge: compare the number of fully populated merged rows
# with the number of divergence rows. Report the failure case explicitly too,
# so that transcripts lost in the merge do not go unnoticed.
n_complete_rows = len(temp.dropna())
if n_complete_rows == len(divergence):
    print("All transcripts in divergence has matching PAC id in ness_id_table")
else:
    print("WARNING: merged table has", n_complete_rows, "complete rows but divergence has",
          len(divergence), "rows; check for unmatched or duplicated transcript IDs")
temp[:5]

All transcripts in divergence has matching PAC id in ness_id_table


Unnamed: 0,transcript_ID,Cincerta_transcript_ID,aln_len,Cr_length,Ci_length,diff_percent,diffs0,sites0,diffs4,sites4,k0,k4,exon,mRNA
0,26888200,g6935.t1,4230,4230,4230,0.05,24,2777,113,699,0.008642,0.16166,PAC:26888200,Cre08.g362650
1,26904954,g13204.t1,1041,1041,1032,0.09,28,655,38,217,0.042748,0.175115,PAC:26904954,Cre10.g442950
2,26894252,g11333.t1,4617,4323,4521,0.15,231,2638,311,950,0.087566,0.327368,PAC:26894252,Cre06.g311400
3,26891814,g15729.t1,5127,4308,4932,0.22,450,2626,325,930,0.171363,0.349462,PAC:26891814,Cre16.g674500
4,26901315,g5327.t1,2289,2289,2289,0.05,3,1483,72,412,0.002023,0.174757,PAC:26901315,Cre03.g180850


In [7]:
# Load the SFS table (keyed by PAC_id). The context manager closes the file
# handle, which the bare open() call would otherwise leave open.
# NOTE: pickle.load can execute arbitrary code; only load pickles you trust.
with open('../../data/intermediate_data_02/reduced_SFS.pk', 'rb') as sfs_file:
    reduced_SFS = pickle.load(sfs_file)
reduced_SFS[:3]

Unnamed: 0,PAC_id,neutral_SFS,selected_SFS
0,PAC:26887927,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,PAC:26887929,"[126, 9, 0, 1, 1, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0...","[399, 9, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0..."
2,PAC:26887930,"[273, 12, 11, 3, 5, 6, 6, 3, 8, 0, 0, 0, 0, 0,...","[921, 22, 15, 9, 3, 6, 1, 1, 10, 0, 0, 0, 0, 0..."


In [8]:
# Rename `exon` to `PAC_id` so it lines up with the key column of reduced_SFS,
# then join the divergence counts with the SFS columns on that key.
temp = temp.rename(columns={'exon': 'PAC_id'})
output = temp[
    ['Cincerta_transcript_ID', 'diffs0', 'sites0', 'diffs4', 'sites4', 'PAC_id']
].merge(reduced_SFS, on='PAC_id', how='inner')
output.head(3)

Unnamed: 0,Cincerta_transcript_ID,diffs0,sites0,diffs4,sites4,PAC_id,neutral_SFS,selected_SFS
0,g6935.t1,24,2777,113,699,PAC:26888200,"[609, 11, 17, 29, 6, 10, 6, 2, 8, 0, 0, 0, 0, ...","[2775, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
1,g13204.t1,28,655,38,217,PAC:26904954,"[155, 4, 0, 2, 0, 4, 3, 3, 1, 0, 0, 0, 0, 0, 0...","[519, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0..."
2,g11333.t1,231,2638,311,950,PAC:26894252,"[491, 4, 9, 4, 1, 0, 7, 2, 4, 1, 0, 0, 0, 0, 0...","[1435, 9, 2, 1, 0, 0, 4, 2, 3, 0, 0, 0, 0, 0, ..."


In [9]:
# Serialize the merged table (`output`) to disk as a pickle.
output_path = '../../data/workflow/SFS_and_divergence.pk'
with open(output_path, mode='wb') as pickle_out:
    pickle.dump(output, pickle_out)