# dN - dStop - dS on usher SARS-Cov-2 tree
This notebook is based loosely off Katie's original dN-dStop-dS notebook on a nextstrain ncov tree. Here I'm applying the same idea to the Usher SARS-CoV-2 tree. 

One key point that took me some time to get: There is no time in this calculation! It is just the number of synonymous or nonsynonymous or stop mutations over the expected number of such sites based on the genome sequence and a substitution matrix. That's why we divide by dS because these values aren't really interpretable without dS.

In [26]:
import os
from Bio import SeqIO
import pandas as pd
import numpy as np
from Bio import Phylo

In [5]:
os.chdir('/Users/cwagner2/Work/projects/covid/long-deletions/')

## 1. Generate expected number of sites
This is all Kathryn Kistler's OG code

In [6]:
location_by_gene = {}

#with open("reference_seq_S1.gb") as reference_handle:
with open("params/sars2_wS1_ref.gb") as reference_handle:
    for record in SeqIO.parse(reference_handle, "genbank"):
        wuhan_seq = record.seq
        for feature in record.features:
            if feature.type == 'CDS':
                gene = feature.qualifiers['gene'][0]
                location = feature.location
                location_by_gene[gene] = location



In [7]:
location_by_gene

{'ORF1a': SimpleLocation(ExactPosition(265), ExactPosition(13468), strand=1),
 'ORF1b': SimpleLocation(ExactPosition(13467), ExactPosition(21555), strand=1),
 'S': SimpleLocation(ExactPosition(21562), ExactPosition(25384), strand=1),
 'S1': SimpleLocation(ExactPosition(21598), ExactPosition(23617), strand=1),
 'ORF3a': SimpleLocation(ExactPosition(25392), ExactPosition(26220), strand=1),
 'E': SimpleLocation(ExactPosition(26244), ExactPosition(26472), strand=1),
 'M': SimpleLocation(ExactPosition(26522), ExactPosition(27191), strand=1),
 'ORF6': SimpleLocation(ExactPosition(27201), ExactPosition(27387), strand=1),
 'ORF7a': SimpleLocation(ExactPosition(27393), ExactPosition(27759), strand=1),
 'ORF7b': SimpleLocation(ExactPosition(27755), ExactPosition(27887), strand=1),
 'ORF8': SimpleLocation(ExactPosition(27893), ExactPosition(28259), strand=1),
 'N': SimpleLocation(ExactPosition(28273), ExactPosition(29533), strand=1),
 'ORF9b': SimpleLocation(ExactPosition(28283), ExactPosition(28

In [8]:
all_genes = list(location_by_gene.keys())

In [9]:
#extract the sequence of each gene, and find it's constituent codon sequences
codons_by_gene = {}

for g, l in location_by_gene.items():
    gene_sequence = l.extract(wuhan_seq)
    #split into codons
    gene_codons = [gene_sequence[i:i+3] for i in range(0, len(gene_sequence), 3)]
    codons_by_gene[g] = gene_codons
    
#because indexing is 0-based, but codons count from 1, need to use codons_by_gene[gene][codon-1]

In [10]:
#make mapper of nt position to codon position (within gene)
#codon_index_by_pos = {}
#for g, l in location_by_gene.items():
    #map to S, not S1. And map to N, not 9b
 #   if g not in ['S1', 'ORF9b']:
        #start at 1!
  #      positions_grouped_into_codons = [[i+1,i+2,i+3] for i in range(l.start, l.end, 3)]
   #     for i in range(len(positions_grouped_into_codons)):
    #        for j in range(len(positions_grouped_into_codons[i])):
     #           pos = positions_grouped_into_codons[i][j]
      #          codon_index_by_pos[pos] = {'gene':g, 'codon': i+1, 'codon_pos': j}
            
#for all positions not in a gene, specify 'noncoding'
#for x in range(len(wuhan_seq)+1):
 #   if x not in codon_index_by_pos.keys():
  #      codon_index_by_pos[x] = {'gene':'Noncoding'}

In [11]:
#make mapper of nt position to codon position within orf9b
#codon_index_by_pos_orf9b = {}
#for g, l in location_by_gene.items():
 #   if g == 'ORF9b':
  #      #start at 1!
   #     positions_grouped_into_codons = [[i+1,i+2,i+3] for i in range(l.start, l.end, 3)]
    #    for i in range(len(positions_grouped_into_codons)):
     #       for j in range(len(positions_grouped_into_codons[i])):
      #          pos = positions_grouped_into_codons[i][j]
       #         codon_index_by_pos_orf9b[pos] = {'gene':g, 'codon': i+1, 'codon_pos': j}

In [12]:
#substitution matrix from https://www.sciencedirect.com/science/article/pii/S1567134821004287
#format is {from_A:{to_T:x, to_C:x, to_G:x}, from_T:{to_A:x, ...}, ...}

sub_matrix = {'A': {'T': 0.0383, 'C': 0.0219, 'G': 0.0747},
          'T': {'A': 0.0356, 'C': 0.2085, 'G': 0.0234},
          'C': {'A': 0.0356, 'T': 0.3648, 'G': 0.0234},
          'G': {'A': 0.1138, 'T': 0.0383, 'C': 0.0219}}


            




In [13]:
total = 0
for k,v in sub_matrix.items():
    for i,j in v.items():
        total += j
print(total)

1.0002


In [14]:
## Substitution matrix from 4-fold degenerate mutation rate estimated by Bloom & Neher in https://pubmed.ncbi.nlm.nih.gov/36778462/
## Data pulled from here: https://jbloomlab.github.io/SARS2-mut-fitness/avg_counts.html
count_matrix_bloom_neher = {'A': {'T': 20.39, 'C': 8.97, 'G': 73.28},
          'T': {'A': 13.97, 'C': 66.53, 'G': 8.70},
          'C': {'A': 38.80, 'T': 569.75, 'G': 11.98},
          'G': {'A': 157.35, 'T': 440.43, 'C': 18.27}}

In [15]:
mut_sum = 0
for k,v in count_matrix_bloom_neher.items():
    for i,j in v.items():
        mut_sum+= j

In [16]:
sub_matrix_bloom_neher = {}
for k,v in count_matrix_bloom_neher.items():
    sub_matrix_bloom_neher[k] = {}
    for i,j in v.items():
        sub_matrix_bloom_neher[k][i] = j/mut_sum

In [17]:
sub_matrix_bloom_neher

{'A': {'T': 0.014274513098388428,
  'C': 0.006279665644558324,
  'G': 0.05130143795242296},
 'T': {'A': 0.009780036683888492,
  'C': 0.04657593704932723,
  'G': 0.006090645608434493},
 'C': {'A': 0.027162879265202108,
  'T': 0.39886727993167276,
  'G': 0.008386889010235085},
 'G': {'A': 0.11015667660772041,
  'T': 0.30833368337043726,
  'C': 0.012790355777712436}}

In [18]:
sub_matrix = sub_matrix_bloom_neher

In [19]:
all_nts = ['A', 'T', 'C', 'G']

In [20]:
#introduce every possible mutation * probability that that mutation occurs (given A->T, oR A->G, etc), 
#and find whether it is synonymous or not
#total the number of synonymous and nonsynonymous sites per gene

syn_denominators ={}
nonsyn_denominators ={}
stop_denominators = {}

for gene, codons in codons_by_gene.items():
    nonsyn_sites_per_gene = 0
    syn_sites_per_gene = 0
    stop_sites_per_gene = 0
    
    #walk through codons
    for codon in codons:
        original_aa = codon.translate()
        
        #for each position in the codon
        for i in range(len(codon)):
            #wuhan nt
            nt = codon[i]
            #introduce each other mutation
            for mut in [x for x in all_nts if x!=nt]:
                mut_codon = codon[:i]+mut+codon[i+1:]
                mut_aa = mut_codon.translate()
                #find whether nonsynonymous
                if mut_aa != original_aa:
                    #find whether this is a stop codon
                    if mut_aa =='*':
                        #get the probability of this mutation
                        #add to the total number of stop sites
                        stop_sites_per_gene+=sub_matrix[nt][mut]
                    else:
                        #or the total number of nonsyn sites
                        nonsyn_sites_per_gene+=sub_matrix[nt][mut]
                #or synonymous
                elif mut_aa == original_aa:
                    syn_sites_per_gene+=sub_matrix[nt][mut]
                    
    syn_denominators[gene] = syn_sites_per_gene
    nonsyn_denominators[gene] = nonsyn_sites_per_gene
    stop_denominators[gene] = stop_sites_per_gene

In [21]:
syn_denominators

{'ORF1a': 562.4967166519632,
 'ORF1b': 347.32931490738343,
 'S': 165.74262471822036,
 'S1': 86.02880105291102,
 'ORF3a': 42.71477576623101,
 'E': 14.23238263255907,
 'M': 39.82281821873106,
 'ORF6': 7.002667282731963,
 'ORF7a': 18.415270018622,
 'ORF7b': 7.693682530348208,
 'ORF8': 15.818141723022666,
 'N': 70.34230828467773,
 'ORF9b': 22.01921003626385}

In [22]:
nonsyn_denominators

{'ORF1a': 1965.455608294307,
 'ORF1b': 1195.589567494127,
 'S': 559.8160905055943,
 'S1': 297.8870570280485,
 'ORF3a': 121.52105123142958,
 'E': 31.33338233852791,
 'M': 100.93415102000743,
 'ORF6': 21.362869464163275,
 'ORF7a': 52.01715881883437,
 'ORF7b': 14.312933170916141,
 'ORF8': 50.842917349238554,
 'N': 206.35268338444166,
 'ORF9b': 47.172645300401584}

In [23]:
stop_denominators

{'ORF1a': 177.40973243163742,
 'ORF1b': 94.9203245544025,
 'S': 51.92237577183157,
 'S1': 23.88158244773949,
 'ORF3a': 10.741469595777136,
 'E': 1.3628064574844927,
 'M': 8.204379664244401,
 'ORF6': 3.150263927976365,
 'ORF7a': 5.3237493174276445,
 'ORF7b': 1.7254869016115708,
 'ORF8': 5.66270424665014,
 'N': 26.19557973145155,
 'ORF9b': 4.118081516640763}

## 2. Make mutations df

In [30]:
## In order to get dates for nodes, need to load tree

def load_dates(path):
    df = pd.read_csv(path,sep='\t',usecols=['strain','date'],compression='gzip')
    df = df[~df.date.isna()]
    df = df[~df.date.str.contains('?',regex=False)]
    df['date'] = pd.to_datetime(df['date'])
    df.set_index('strain',inplace=True)
    return df

def tabulate_names(tree):
    names = {}
    for idx, clade in enumerate(tree.find_clades()):
        if not clade.name:
            clade.name = str(idx)
        names[clade.name] = clade
    return names

def get_leaves(node, named):
    leaves = named[node].get_terminals()
    names = [leaf.name for leaf in leaves]
    return names

def get_time(node,named,date_df):
    leaves = get_leaves(node,named)
    dates = date_df.loc[leaves]['date']
    first = min(dates)
    time = max(dates) - first
    return time.days, first

def get_times(df,named,date_df):
    time_vect = np.vectorize(get_time,excluded=[1,2])
    df['days_circulated'],df['date_observed'] = time_vect(df['node_id'],named,date_df)
    return df


In [31]:
dates = load_dates('usher/public-2023-05-01.metadata.tsv.gz')

In [28]:
with open('usher/trimmed/trimmed_2023-05-01.all.nwk', 'r') as f:
    tree = Phylo.read(f,'newick')

In [32]:
named = tabulate_names(tree)

In [24]:
with open('usher/trimmed/usher_translations.tsv','r') as f:
    muts = pd.read_csv(f,sep='\t')

In [31]:
muts.iloc[0:1000,:].to_csv('usher/trimmed/test_translations.tsv',sep='\t',index=False)

In [33]:
## Split up nodes with multiple mutations into single mutations
muts['aa_mutations'] = muts.aa_mutations.str.split(';')
muts['nt_mutations'] = muts.nt_mutations.str.split(';')
muts['codon_changes'] = muts.codon_changes.str.split(';')
all_muts = muts.explode(['aa_mutations','nt_mutations','codon_changes'],ignore_index=True)

In [34]:
## Label with gene
all_muts[['gene','aa_mutation']] = all_muts['aa_mutations'].str.split(':',expand=True)
all_muts.drop(columns=['aa_mutations'],inplace=True)

In [35]:
gene_aa = {k:wuhan_seq[v.start:v.end].translate() for k,v in location_by_gene.items()}

In [36]:
gene_aa

{'ORF1a': Seq('MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHLKDGTCGLV...FLN'),
 'ORF1b': Seq('RVCGVSAARLTPCGTGTSTDVVYRAFDIYNDKVAGFAKFLKTNCCRFQEKDEDD...NN*'),
 'S': Seq('MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDL...YT*'),
 'S1': Seq('SQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFH...RAR'),
 'ORF3a': Seq('MDLFMRIFTIGTVTLKQGEIKDATPSDFVRATATIPIQASLPFGWLIVGVALLA...PL*'),
 'E': Seq('MYSFVSEETGTLIVNSVLLFLAFVVFLLVTLAILTALRLCAYCCNIVNVSLVKP...LV*'),
 'M': Seq('MADSNGTITVEELKKLLEQWNLVIGFLFLTWICLLQFAYANRNRFLYIIKLIFL...VQ*'),
 'ORF6': Seq('MFHLVDFQVTIAEILLIIMRTFKVSIWNLDYIINLIIKNLSKSLTENKYSQLDE...ID*'),
 'ORF7a': Seq('MKIILFLALITLATCELYHYQECVRGTTVLLKEPCSSGTYEGNSPFHPLADNKF...TE*'),
 'ORF7b': Seq('MIELSLIDFYLCFLAFLLFLVLIMLIIFWFSLELQDHNETCHA*'),
 'ORF8': Seq('MKFLVFLGIITTVAAFHQECSLQSCTQHQPYVVDDPCPIHFYSKWYIRVGARKS...FI*'),
 'N': Seq('MSDNGPQNQRNAPRITFGGPSDSTGSNQNGERSGARSKQRRPQGLPNNTASWFT...QA*'),
 'ORF9b': Seq('MDPKISEMHPALRLVDPQIQLAVTRMENAVGRDQNNVGPKVYPIILRLGSPLSL...VK*')}

In [37]:
def get_aa(row):
    return gene_aa[row['gene']][row['residue']-1]

In [38]:
## Classify mutation type
all_muts['residue'] = pd.to_numeric(all_muts.aa_mutation.str[1:-1])

# What type of mutation compared to Wuhan-Hu-1
#all_muts['new_aa'] = all_muts.aa_mutation.str[-1]
#all_muts['og_aa'] = all_muts['og_aa'] = all_muts.apply(get_aa,axis=1)
#all_muts['mut_type'] = np.where(all_muts.og_aa==all_muts.new_aa,'synonymous','missense')
#all_muts['mut_type'] = np.where(all_muts.aa_mutation.str[-1]=='*','nonsense',all_muts['mut_type'])


# What type of mutation compared to predecessor
all_muts['mut_type'] = np.where(all_muts.aa_mutation.str[-1]==all_muts.aa_mutation.str[0],'synonymous','missense')
all_muts['mut_type'] = np.where(all_muts.aa_mutation.str[-1]=='*','nonsense',all_muts['mut_type'])
all_muts['mut_type'] = np.where(all_muts.aa_mutation.str[0]=='*','undoStop',all_muts['mut_type'])

In [39]:
all_muts

Unnamed: 0,node_id,nt_mutations,codon_changes,leaves_sharing_mutations,gene,aa_mutation,residue,mut_type
0,CHN/YN-0306-466/2020|MT396241.1|2020-03-06,G15910T,GAT>TAT,1,ORF1b,D815Y,815,missense
1,DP0803|LC571037.1|2020-02-17,G4162T,GTG>GTT,1,ORF1a,V1299V,1299,synonymous
2,node_2,T13090C,GAT>GAC,2,ORF1a,D4275D,4275,synonymous
3,England/LEED-2A8B52/2020|OA971832.1|2020-04-04,C1191T,CCA>CTA,1,ORF1a,P309L,309,missense
4,England/LEED-2A8B52/2020|OA971832.1|2020-04-04,C11674T,TAC>TAT,1,ORF1a,Y3803Y,3803,synonymous
...,...,...,...,...,...,...,...,...
5508953,Netherlands/NoordBrabant_60/2020|LR878078.1|20...,T6163A,GAT>GAA,1,ORF1a,D1966E,1966,missense
5508954,Netherlands/NoordBrabant_60/2020|LR878078.1|20...,C12923A,CCT>ACT,1,ORF1a,P4220T,4220,missense
5508955,Netherlands/NoordBrabant_60/2020|LR878078.1|20...,C13430T,CCC>TCC,1,ORF1a,P4389S,4389,missense
5508956,Netherlands/NoordBrabant_60/2020|LR878078.1|20...,C19366T,CCA>TCA,1,ORF1b,P1967S,1967,missense


In [40]:
gene_aa['S'][12:685]==gene_aa['S1']

True

In [41]:
 ## Add S1
S1 = all_muts[(all_muts.gene=='S')&(all_muts.residue >= 13)&(all_muts.residue<=685)].reset_index(drop=True)
S1['gene'] = 'S1'
final_muts = pd.concat([all_muts,S1])

In [42]:
S1.head()

Unnamed: 0,node_id,nt_mutations,codon_changes,leaves_sharing_mutations,gene,aa_mutation,residue,mut_type
0,England/PHEC-1E01E/2020|2020-04-03,C22445T,CCT>TCT,1,S1,P295S,295,missense
1,England/LIVE-A9C05/2020|2020-03-24,C23603T,CCT>TCT,1,S1,P681S,681,missense
2,node_28,G23611T,CGG>CGT,2,S1,R683R,683,synonymous
3,England/BIRM-61D60/2020|2020-03-25,C21767T,CAT>TAT,1,S1,H69Y,69,missense
4,node_31,A23403G,GAT>GGT,2,S1,D614G,614,missense


In [None]:
node_times = get_times(final_muts,named,dates)

In [42]:
final_muts.columns

Index(['node_id', 'nt_mutations', 'codon_changes', 'leaves_sharing_mutations',
       'gene', 'aa_mutation', 'residue', 'mut_type'],
      dtype='object')

In [43]:
final_muts.memory_usage(deep=True)

Index                        47683696
node_id                     548289643
nt_mutations                379695500
codon_changes               381469568
leaves_sharing_mutations     47683696
gene                        362695422
aa_mutation                 370968363
residue                      47683696
mut_type                    392220316
dtype: int64

In [38]:
def make_mutations_df(path):
    '''
    Constructs dataframe with all mutations in Usher tree for each gene.
    '''
    with open(path,'r') as f:
        muts = pd.read_csv(f,sep='\t',usecols=['node_id','aa_mutations'])

    ## Split up nodes with multiple mutations into single mutations
    muts['aa_mutations'] = muts.aa_mutations.str.split(';')
    all_muts = muts.explode(['aa_mutations'],ignore_index=True)

    ## Label with gene
    all_muts[['gene','aa_mutation']] = all_muts['aa_mutations'].str.split(':',expand=True)
    all_muts.drop(columns=['aa_mutations'],inplace=True)

    ## Get residue
    all_muts['residue'] = pd.to_numeric(all_muts.aa_mutation.str[1:-1])

    ## Classify mutation type
    all_muts['mut_type'] = np.where(all_muts.aa_mutation.str[-1]==all_muts.aa_mutation.str[0],'synonymous','missense')
    all_muts['mut_type'] = np.where(all_muts.aa_mutation.str[-1]=='*','nonsense',all_muts['mut_type'])
    all_muts['mut_type'] = np.where(all_muts.aa_mutation.str[0]=='*','undoStop',all_muts['mut_type'])

    ## Add S1
    S1 = all_muts[(all_muts.gene=='S')&(all_muts.residue >= 13)&(all_muts.residue<=685)].reset_index(drop=True)
    S1['gene'] = 'S1'
    final_muts = pd.concat([all_muts,S1])
    final_muts[['gene','mut_type']] = final_muts[['gene','mut_type']].astype("category")

    return final_muts[['node_id','gene','mut_type']]

In [27]:
new_final_muts = make_mutations_df('usher/trimmed/usher_translations.tsv')

In [30]:
new_final_muts.memory_usage(deep=True)

Index        47683696
node_id     548289643
gene          5961803
mut_type      5960896
dtype: int64

In [63]:
test = new_final_muts.iloc[0:50000,:]

In [65]:
test = test.set_index('node_id')

In [75]:
for i, k in enumerate(samples.keys()):
    filt = test.loc[samples[k],:]
    results = filt.groupby(by=['gene','mut_type']).size().reset_index()
    results.rename(columns={0:'count'},inplace=True)
    results['iteration'] = k
    if i == 0:
        output = results
    else:
        output = pd.concat([output,results])

In [76]:
output

Unnamed: 0,gene,mut_type,count,iteration
0,E,missense,190,0
1,E,nonsense,0,0
2,E,synonymous,140,0
3,E,undoStop,0,0
4,M,missense,607,0
...,...,...,...,...
47,S,undoStop,0,999
48,S1,missense,0,999
49,S1,nonsense,0,999
50,S1,synonymous,0,999


In [37]:
from random import choices
from itertools import compress


In [34]:
%load_ext line_profiler

In [216]:
def flatten(l):
    return [item for sublist in l for item in sublist]

def draw_samples(df,n):
    '''
    Draws random samples with replacement of node ids for bootstrap.
    Returns indexes of random samples.
    '''
    ids = df['node_id'].unique()
    idxs = {ID:list(np.where(df['node_id']==ID)[0]) for ID in ids}
    length  = len(ids)
    sample_ids = {iterat:choices(ids,k=length) for iterat in range(n)}
    sample_idxs = {iterat:flatten([idxs[ID] for ID in sample_ids[iterat]]) for iterat in range(n)}
    return sample_idxs
          
def draw_samples_new(df,n,chunksize):
    '''
    Draws random samples with replacement of node ids for bootstrap.
    Returns indexes of random samples.
    '''
    ids = df['id'].unique()
    length = len(ids)
    sample_ids = {iterat:choices(range(length),k=chunksize) for iterat in range(n)}
    print('Finished drawing samples')
    return sample_ids

def draw_samples_new(df,n):
    '''
    Draws random samples with replacement of node ids for bootstrap.
    Returns indexes of random samples.
    '''
    ids = df['id'].unique()
    length = len(ids)
    sample_ids = {iterat:choices(range(length),k=length) for iterat in range(n)}
    return sample_ids


def draw_genes(samples,genes,df):
    '''
    Returns indexes of random samples for each gene.
    '''
    bootstrap_genes = {}
    for k in samples.keys():
        bootstrap_genes[k] = {}
        for gene in genes:
            filt = df.iloc[samples[k],1].values == gene
            subset = compress(samples[k],filt)
            bootstrap_genes[k][gene] = list(subset)
    return bootstrap_genes

def draw_counts(gene_samples,genes,df):
    '''
    Returns df of counts per gene by mutation type for each iteration.
    '''
    iterations = []
    the_genes = []
    mutTypes = []
    counts = []
    for k in gene_samples.keys():
        iterations.extend([k]*len(genes)*3)
        for gene in genes:
            the_genes.extend([gene]*3)
            for mut_type in ['synonymous','missense','nonsense']:
                filt = df.iloc[gene_samples[k][gene],2].values == mut_type
                subset = compress(gene_samples[k][gene],filt)
                count = sum(1 for x in subset)
                mutTypes.append(mut_type)
                counts.append(count)
    bootstrap = pd.DataFrame({'bootstrap':iterations,'gene':the_genes,'mutType':mutTypes,'count':counts})
    return bootstrap

def draw_counts_new(samples,df):
    '''
    Returns df of counts per gene by mutation type for each iteration.
    '''
    df = df.set_index('id')
    for i, k in enumerate(samples.keys()):
        print(f'starting iteration: {i}')
        filt = df.loc[samples[k],:]
        count = filt.groupby(by=['gene','mut_type']).size().reset_index()
        count.rename(columns={0:'count'},inplace=True)
        count['iteration'] = k
        if i == 0:
            bootstrap = count
        else:
            bootstrap = pd.concat([bootstrap,count])
    return bootstrap

def draw_counts_test(samples,df):
    '''
    Returns df of counts per gene by mutation type for each iteration.
    '''
    df = df.set_index('id')
    keys = list(samples.keys())
    for i, k in enumerate(keys[0:10]):
        filt = df.loc[samples[k],:]
        count = filt.groupby(by=['gene','mut_type']).size().reset_index()
        count.rename(columns={0:'count'},inplace=True)
        count['iteration'] = k
        if i == 0:
            bootstrap = count
        else:
            bootstrap = pd.concat([bootstrap,count])
    return bootstrap


def recode_nodeID(df):
    '''
    Recodes node id as an integer.
    '''
    ids = df['node_id'].unique()

    encoder = dict(zip(ids,range(len(ids))))

    df['id'] = df['node_id'].map(encoder)

    return df[['id', 'gene','mut_type']]
        
## Next figure out how to draw muts

def run_old(df,n,genes):
    sampled = draw_samples(df,n)
    gene_sampled = draw_genes(sampled,genes,df)
    counted = draw_counts(gene_sampled,genes,df)
    return counted

def run_new(df,n):
    sampled = draw_samples_new(df,n)
    counted = draw_counts_new(sampled,df)
    return counted

            


In [158]:
recoded = recode_nodeID(new_final_muts)

In [199]:
samples = draw_samples_new(recoded,5)

In [201]:
counts = draw_counts_new(samples,recoded)

KeyboardInterrupt: 

In [185]:
samples.head()

AttributeError: 'dict' object has no attribute 'head'

In [184]:
len(samples)

10000

In [169]:
print([type(k) for k in samples.keys() if k<5])

[<class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>]


In [170]:
print([type(v[0]) for k,v in samples.items() if k<5])

[<class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>]


In [159]:
recoded.head()

Unnamed: 0,id,gene,mut_type
0,0,ORF1b,missense
1,1,ORF1a,synonymous
2,2,ORF1a,synonymous
3,3,ORF1a,missense
4,3,ORF1a,synonymous


In [116]:
%load_ext memory_profiler

The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler


In [202]:
lprun -f run_old counted = run_old(new_final_muts.iloc[0:5000,:],1000,all_genes)

*** KeyboardInterrupt exception caught in code being profiled.

Timer unit: 1e-09 s

Total time: 3.69131 s
File: /var/folders/b5/2grxct1x69395r8j6vkk07bc0000gp/T/ipykernel_28694/594931291.py
Function: run_old at line 119

Line #      Hits         Time  Per Hit   % Time  Line Contents
   119                                           def run_old(df,n,genes):
   120         1 2922750000.0 2922750000.0     79.2      sampled = draw_samples(df,n)
   121         1  768557000.0 768557000.0     20.8      gene_sampled = draw_genes(sampled,genes,df)
   122                                               counted = draw_counts(gene_sampled,genes,df)
   123                                               return counted

In [217]:
lprun -f run_new counted = run_new(new_final_muts,1)

starting iteration: 0
*** KeyboardInterrupt exception caught in code being profiled.

Timer unit: 1e-09 s

Total time: 18.0458 s
File: /var/folders/b5/2grxct1x69395r8j6vkk07bc0000gp/T/ipykernel_28694/835523423.py
Function: run_new at line 127

Line #      Hits         Time  Per Hit   % Time  Line Contents
   127                                           def run_new(df,n):
   128         1 1253281000.0 1253281000.0      6.9      sampled = draw_samples_new(df,n)
   129         1 16792551000.0 16792551000.0     93.1      counted = draw_counts_new(sampled,df)
   130                                               return counted

In [213]:
%load_ext autoreload
%autoreload 2

In [218]:
from test import run_new

In [219]:
mprun -f run_new counted = run_new(new_final_muts,10)




Filename: /Users/cwagner2/Work/projects/covid/long-deletions/notebooks/test.py

Line #    Mem usage    Increment  Occurrences   Line Contents
    79                                         def draw_counts_test(samples,df):
    80                                             '''
    81                                             Returns df of counts per gene by mutation type for each iteration.
    82                                             '''
    83                                             df = df.set_index('id')
    84                                             keys = list(samples.keys())
    85                                             for i, k in enumerate(keys[0:10]):
    86                                                 filt = df.loc[samples[k],:]
    87                                                 count = filt.groupby(by=['gene','mut_type']).size().reset_index()
    88                                                 count.rename(columns={0:'count'},inplace=True)
   

In [35]:
counted

NameError: name 'counted' is not defined

In [207]:
counted.groupby(by=['gene','mut_type']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,iteration
gene,mut_type,Unnamed: 2_level_1,Unnamed: 3_level_1
E,missense,119731,10
E,nonsense,83,10
E,synonymous,72624,10
E,undoStop,0,10
M,missense,318786,10
M,nonsense,612,10
M,synonymous,403134,10
M,undoStop,11,10
N,missense,1218276,10
N,nonsense,1588,10


In [130]:
len(new_final_muts['node_id'].unique())

3422474

In [132]:
mprun -f run_new counted = run_new(new_final_muts.iloc[0:50000,:],10000)

*** KeyboardInterrupt exception caught in code being profiled.


Filename: /Users/cwagner2/Work/projects/covid/long-deletions/notebooks/test.py

Line #    Mem usage    Increment  Occurrences   Line Contents
    90    103.6 MiB    103.6 MiB           1   def run_new(df,n):
    91    872.9 MiB    769.4 MiB           1       sampled = draw_samples_new(df,n)
    92    785.2 MiB    -87.7 MiB           1       counted = draw_counts_new(sampled,df)
    93                                             return counted

In [144]:
mprun -f run_new counted = run_new(new_final_muts.iloc[0:50,:],100)




Filename: /Users/cwagner2/Work/projects/covid/long-deletions/notebooks/test.py

Line #    Mem usage    Increment  Occurrences   Line Contents
    90    103.3 MiB    103.3 MiB           1   def run_new(df,n):
    91    110.8 MiB      7.5 MiB           1       sampled = draw_samples_new(df,n)
    92    151.4 MiB     40.6 MiB           1       counted = draw_counts_new(sampled,df)
    93    151.4 MiB      0.0 MiB           1       return counted

In [145]:
mprun -f run_new counted = run_new(new_final_muts.iloc[0:50,:],1000)




Filename: /Users/cwagner2/Work/projects/covid/long-deletions/notebooks/test.py

Line #    Mem usage    Increment  Occurrences   Line Contents
    90    154.6 MiB    154.6 MiB           1   def run_new(df,n):
    91    156.4 MiB      1.9 MiB           1       sampled = draw_samples_new(df,n)
    92    159.8 MiB      3.3 MiB           1       counted = draw_counts_new(sampled,df)
    93    159.9 MiB      0.1 MiB           1       return counted

In [146]:
mprun -f run_new counted = run_new(new_final_muts.iloc[0:500,:],100)




Filename: /Users/cwagner2/Work/projects/covid/long-deletions/notebooks/test.py

Line #    Mem usage    Increment  Occurrences   Line Contents
    90    196.6 MiB    196.6 MiB           1   def run_new(df,n):
    91    198.3 MiB      1.7 MiB           1       sampled = draw_samples_new(df,n)
    92    203.0 MiB      4.8 MiB           1       counted = draw_counts_new(sampled,df)
    93    203.0 MiB      0.0 MiB           1       return counted

In [147]:
mprun -f run_new counted = run_new(new_final_muts.iloc[0:500,:],1000)




Filename: /Users/cwagner2/Work/projects/covid/long-deletions/notebooks/test.py

Line #    Mem usage    Increment  Occurrences   Line Contents
    90    203.3 MiB    203.3 MiB           1   def run_new(df,n):
    91    209.9 MiB      6.7 MiB           1       sampled = draw_samples_new(df,n)
    92    224.9 MiB     15.0 MiB           1       counted = draw_counts_new(sampled,df)
    93    224.9 MiB      0.0 MiB           1       return counted

In [149]:
mprun -f run_new counted = run_new(new_final_muts.iloc[0:100000,:],1000)




Filename: /Users/cwagner2/Work/projects/covid/long-deletions/notebooks/test.py

Line #    Mem usage    Increment  Occurrences   Line Contents
    90    103.0 MiB    103.0 MiB           1   def run_new(df,n):
    91    774.8 MiB    671.9 MiB           1       sampled = draw_samples_new(df,n)
    92   1746.4 MiB    971.6 MiB           1       counted = draw_counts_new(sampled,df)
    93   1746.5 MiB      0.1 MiB           1       return counted

In [108]:
lprun -f draw_samples_new samples = draw_samples_new(new_final_muts.iloc[0:1000,:],1000)

Timer unit: 1e-09 s

Total time: 0.14607 s
File: /var/folders/b5/2grxct1x69395r8j6vkk07bc0000gp/T/ipykernel_28694/2478551183.py
Function: draw_samples_new at line 16

Line #      Hits         Time  Per Hit   % Time  Line Contents
    16                                           def draw_samples_new(df,n):
    17                                               '''
    18                                               Draws random samples with replacement of node ids for bootstrap.
    19                                               Returns indexes of random samples.
    20                                               '''
    21         1     425000.0 425000.0      0.3      ids = df['node_id'].unique()
    22         1       1000.0   1000.0      0.0      length = len(ids)
    23         1  145643000.0 145643000.0     99.7      sample_ids = {iterat:choices(ids,k=length) for iterat in range(n)}
    24         1       1000.0   1000.0      0.0      return sample_ids

In [46]:
lprun -f draw_samples samples = draw_samples(new_final_muts.iloc[0:50000,:],10)

Timer unit: 1e-09 s

Total time: 55.3704 s
File: /var/folders/b5/2grxct1x69395r8j6vkk07bc0000gp/T/ipykernel_28694/2588227640.py
Function: draw_samples at line 30

Line #      Hits         Time  Per Hit   % Time  Line Contents
    30                                           def draw_samples(df,n):
    31                                               '''
    32                                               Draws random samples with replacement of node ids for bootstrap.
    33                                               Returns indexes of random samples.
    34                                               '''
    35         1   17741000.0 17741000.0      0.0      ids = df['node_id'].unique()
    36         1 55186231000.0 55186231000.0     99.7      idxs = {ID:np.flatnonzero(df['node_id']==ID) for ID in ids}
    37         1       4000.0   4000.0      0.0      length  = len(ids)
    38         1   58932000.0 58932000.0      0.1      sample_ids = {iterat:choices(ids,k=length) for iter

In [158]:
gene_samples = draw_genes(samples,all_genes,final_muts)

In [159]:
counts = draw_counts(gene_samples,all_genes,final_muts)

In [189]:
counts1 = draw_counts_test(samples,recoded)

In [192]:
counts2 = draw_counts_test(samples,recoded)

In [196]:
pd.concat([counts2,counts1]).groupby(by=['gene','mut_type','iteration']).sum().reset_index()

Unnamed: 0,gene,mut_type,iteration,count
0,E,missense,0,22
1,E,missense,1,10
2,E,missense,2,8
3,E,missense,3,8
4,E,missense,4,12
...,...,...,...,...
515,S1,undoStop,5,0
516,S1,undoStop,6,0
517,S1,undoStop,7,0
518,S1,undoStop,8,0


In [166]:
def get_ci_counts(df,genes,ci):
    '''
    Given df of bootstrap counts, returns dictionary of min & max ci counts.
    '''
    min_counts = {}
    max_counts = {}
    for mutType in df['mutType'].unique():
        min_counts[mutType] = {}
        max_counts[mutType] = {}
        for gene in genes:
            values = df[(df.gene==gene) & (df.mutType==mutType)]['count']
            minimum = 100-ci/2
            maximum = 100-minimum
            min_counts[mutType][gene] = np.percentile(values,minimum)
            max_counts[mutType][gene] = np.percentile(values,maximum)
    return min_counts, max_counts

In [167]:
minimum_counts,maximum_counts = get_ci_counts(counts,all_genes,95)

In [168]:
minimum_counts

{'synonymous': {'ORF1a': 181.0,
  'ORF1b': 83.0,
  'S': 40.0,
  'S1': 0.0,
  'ORF3a': 11.0,
  'E': 6.0,
  'M': 12.0,
  'ORF6': 5.0,
  'ORF7a': 4.0,
  'ORF7b': 2.0,
  'ORF8': 6.0,
  'N': 28.0,
  'ORF9b': 3.0},
 'missense': {'ORF1a': 241.0,
  'ORF1b': 136.0,
  'S': 80.0,
  'S1': 0.0,
  'ORF3a': 37.0,
  'E': 9.0,
  'M': 11.0,
  'ORF6': 5.0,
  'ORF7a': 5.0,
  'ORF7b': 2.0,
  'ORF8': 12.0,
  'N': 61.0,
  'ORF9b': 15.0},
 'nonsense': {'ORF1a': 0.0,
  'ORF1b': 1.0,
  'S': 1.0,
  'S1': 0.0,
  'ORF3a': 1.0,
  'E': 0.0,
  'M': 0.0,
  'ORF6': 1.0,
  'ORF7a': 1.0,
  'ORF7b': 0.0,
  'ORF8': 3.0,
  'N': 0.0,
  'ORF9b': 0.0}}

In [53]:
grab = samples[0]

final_muts.iloc[grab,]._is_view

False

In [144]:
denominators = {'synonymous':syn_denominators,'missense':nonsyn_denominators,'nonsense':stop_denominators}


In [151]:
def get_counts(df, genes):
    all_syn = {gene:len(df[(df.gene==gene)&(df.mut_type=='synonymous')]) for gene in genes}
    all_missense = {gene:len(df[(df.gene==gene)&(df.mut_type=='missense')]) for gene in genes}
    all_nonsense = {gene:len(df[(df.gene==gene)&(df.mut_type=='nonsense')]) for gene in genes}
    counts = {'synonymous':all_syn,'missense':all_missense,'nonsense':all_nonsense}
    return counts

def get_differences(counts,denominators,genes):
    diffs = {}
    for mutType in counts.keys():
        diffs[mutType] = {gene:counts[mutType][gene]/denominators[mutType][gene] for gene in genes}
    return diffs

In [149]:
counted = get_counts(final_muts.iloc[0:1000],all_genes)

In [None]:
counte

In [153]:
get_differences(counted,denominators,all_genes)

{'synonymous': {'ORF1a': 0.1864679388061965,
  'ORF1b': 0.14086567558329405,
  'S': 0.13997138285077795,
  'S1': 0.0,
  'ORF3a': 0.1697515146464685,
  'E': 0.3080999476230088,
  'M': 0.21937922990577594,
  'ORF6': 0.3968789439845057,
  'ORF7a': 0.14652175664934064,
  'ORF7b': 0.17931751752828728,
  'ORF8': 0.22262211750737437,
  'N': 0.2932222719071044,
  'ORF9b': 0.13157029142819554},
 'missense': {'ORF1a': 0.11834472945611134,
  'ORF1b': 0.1084064461203143,
  'S': 0.13273030093319402,
  'S1': 0.0,
  'ORF3a': 0.28080507574147284,
  'E': 0.24323666927920887,
  'M': 0.1039266315772553,
  'ORF6': 0.18016719515710647,
  'ORF7a': 0.08403431961613078,
  'ORF7b': 0.09505884142284086,
  'ORF8': 0.21049418772924017,
  'N': 0.31832774607256925,
  'ORF9b': 0.3200805749500666},
 'nonsense': {'ORF1a': 0.0,
  'ORF1b': 0.013531140566900574,
  'S': 0.025303771780221574,
  'S1': 0.0,
  'ORF3a': 0.1252960118279438,
  'E': 0.0,
  'M': 0.0,
  'ORF6': 0.4948780125699015,
  'ORF7a': 0.2916132042458882,
  '

In [38]:
denominators = {'synonymous':syn_denominators,'missense':nonsyn_denominators,'nonsense':stop_denominators}

In [125]:
counted = pd.read_csv('usher/trimmed/bootstrap/counts2.tsv',sep='\t')

In [126]:
counted.head()

Unnamed: 0,gene,mut_type,count,iteration
0,E,missense,24219,0
1,E,nonsense,23,0
2,E,synonymous,14594,0
3,E,undoStop,0,0
4,M,missense,64127,0


In [136]:
counted.columns

Index(['gene', 'mut_type', 'count', 'iteration'], dtype='object')

In [139]:
cols = list(counted.columns)
cols.remove('mut_type')
cols.remove('count')
print(cols)

['gene', 'iteration']


In [152]:
def get_counts(df, genes):
    '''
    Get observed counts
    '''
    geneList = []
    mutTypes = []
    counts = []
    for gene in genes:
        geneList.extend([gene]*4)
        for mut_type in df.mut_type.unique():
            mutTypes.append(mut_type)
            count = len(df[(df.gene==gene)&(df.mut_type==mut_type)])
            counts.append(count)
    result = pd.DataFrame({'gene':geneList,'mut_type':mutTypes,'count':counts})
    return result

In [153]:
get_counts(final_muts,all_genes)

Unnamed: 0,gene,mut_type,count
0,ORF1a,missense,1328866
1,ORF1a,synonymous,1006032
2,ORF1a,nonsense,880
3,ORF1a,undoStop,27
4,ORF1b,missense,592141
5,ORF1b,synonymous,530552
6,ORF1b,nonsense,418
7,ORF1b,undoStop,8
8,S,missense,478556
9,S,synonymous,273119


In [150]:
def get_differences(counts,denominators, genes):
    '''
    Given counts & denominators, get dS, dN, dStop & dN_dS and dStop_dS values.
    '''
    df = counts.copy()
    cols = list(df.columns)
    cols.remove('mut_type')
    cols.remove('count')
    df = df.pivot_table(index=cols, columns='mut_type',values='count').reset_index()
    for gene in genes:
        df.loc[(df.gene==gene),'dS'] = df.loc[(df.gene==gene),'synonymous']/denominators['synonymous'][gene]
        df.loc[(df.gene==gene),'dN'] = df.loc[(df.gene==gene),'missense']/denominators['missense'][gene]
        df.loc[(df.gene==gene),'dStop'] = df.loc[(df.gene==gene),'nonsense']/denominators['nonsense'][gene]
    df['dN_dS'] = df['dN']/df['dS']
    df['dStop_dS'] = df['dStop']/df['dS']
    return df

In [165]:
diffs = get_dS_dN_dStop(counted,denominators,all_genes)

In [166]:
ci = 95

In [170]:
def get_percentile(df,p,suffix):
    transformed = df.groupby('gene').quantile(p).add_suffix(suffix).reset_index()
    return transformed

def get_ci(df,ci):
    new_df = df.drop(columns=['iteration','missense','synonymous','nonsense','undoStop'])
    minimum = (100-ci)/200
    maximum = 1-minimum
    minimums = get_percentile(new_df,minimum,'_min')
    maximums = get_percentile(new_df,maximum,'_max')
    intervals = minimums.merge(maximums, on='gene')
    return intervals

In [171]:
get_ci(diffs,95)

mut_type,gene,dS_min,dN_min,dStop_min,dN_dS_min,dStop_dS_min,dS_max,dN_max,dStop_max,dN_dS_max,dStop_dS_max
0,E,734.856888,644.495419,10.387935,0.854313,0.013974,758.379035,655.740385,20.264258,0.886089,0.027071
1,M,1464.798774,597.70615,20.575281,0.405566,0.013957,1481.39527,607.871592,25.905618,0.414758,0.017658
2,N,1596.459655,1262.81504,14.278803,0.790042,0.008942,1605.462364,1273.650316,17.656436,0.797797,0.010997
3,ORF1a,1040.361348,651.760776,6.771186,0.625168,0.006492,1043.274029,654.137135,7.808704,0.628226,0.007503
4,ORF1b,898.948862,474.42989,5.343447,0.526976,0.005934,902.335621,476.023064,6.207749,0.529259,0.006897
5,ORF3a,1180.321633,1485.312946,199.217526,1.24888,0.167951,1190.487434,1496.738297,216.674393,1.267381,0.182076
6,ORF6,1569.642333,999.452292,793.648241,0.628174,0.502548,1606.496511,1015.817779,841.342109,0.639703,0.531657
7,ORF7a,1249.402008,1840.467567,2508.019363,1.438972,1.971554,1279.145009,1865.370717,2569.688265,1.484164,2.047144
8,ORF7b,741.433106,1050.915892,1365.871929,1.399486,1.826378,759.947639,1066.270271,1485.467854,1.434667,1.988191
9,ORF8,878.751646,1491.544273,2163.246638,1.683194,2.438555,890.440235,1502.235624,2204.217064,1.706534,2.484667


In [142]:
def get_ci_counts(df,genes,ci):
    '''
    Given df of bootstrap counts, returns dictionary of min & max ci counts.
    '''
  
    df.group_by(genes)
    for gene in genes:
        values = df[(df.gene==gene)]['count']
        print(len(values))
        print(np.percentile(values,minimum))
        print(np.percentile(values,maximum))
        print('--------')
        min_counts[mutType][gene] = np.percentile(values,minimum)
        max_counts[mutType][gene] = np.percentile(values,maximum)
    return min_counts, max_counts

mut_type,iteration,gene,missense,nonsense,synonymous,undoStop,dS,dN,dStop,dN_dS,dStop_dS
0,0,E,24219,23,14594,0,749.401773,654.549877,20.464454,0.873430,0.027308
1,0,M,64127,127,80613,2,1473.734822,605.863918,22.831461,0.411108,0.015492
2,0,N,243398,363,153455,19,1607.015133,1270.169455,18.197860,0.790390,0.011324
3,0,ORF1a,1331967,921,1005958,33,1042.105082,654.071677,7.484264,0.627645,0.007182
4,0,ORF1b,592824,413,531737,9,902.451708,476.044022,5.588361,0.527501,0.006192
...,...,...,...,...,...,...,...,...,...,...,...
125,9,ORF7b,22406,1642,8403,21,753.402550,1064.944200,1405.701567,1.413513,1.865804
126,9,ORF8,85031,9361,23889,4411,886.369961,1491.544273,2181.593605,1.682756,2.461268
127,9,ORF9b,63144,1333,20198,46,885.818915,1347.411188,485.238979,1.521091,0.547786
128,9,S,478570,335,273059,6,955.511146,804.060001,8.476764,0.841497,0.008871


In [48]:
counted

Unnamed: 0,gene,mut_type,count,iteration,dS,dN,dStop,dN_dS,dStop_dS
0,E,missense,24219,0,-1.000000,654.549877,-1.000000,-654.549877,1.000000
1,E,nonsense,23,0,-1.000000,-1.000000,20.464454,1.000000,-20.464454
2,E,synonymous,14594,0,749.401773,-1.000000,-1.000000,-0.001334,-0.001334
3,E,undoStop,0,0,-1.000000,-1.000000,-1.000000,1.000000,1.000000
4,M,missense,64127,0,-1.000000,605.863918,-1.000000,-605.863918,1.000000
...,...,...,...,...,...,...,...,...,...
515,S,undoStop,6,9,-1.000000,-1.000000,-1.000000,1.000000,1.000000
516,S1,missense,298211,9,-1.000000,942.659523,-1.000000,-942.659523,1.000000
517,S1,nonsense,220,9,-1.000000,-1.000000,11.676725,1.000000,-11.676725
518,S1,synonymous,153905,9,1001.222375,-1.000000,-1.000000,-0.000999,-0.000999


In [34]:
count

NameError: name 'count' is not defined

## 3. Calculate dS - dN - dStop

In [44]:
all_syn = {gene:len(final_muts[(final_muts.gene==gene)&(final_muts.mut_type=='synonymous')]) for gene in all_genes}
all_missense = {gene:len(final_muts[(final_muts.gene==gene)&(final_muts.mut_type=='missense')]) for gene in all_genes}
all_nonsense = {gene:len(final_muts[(final_muts.gene==gene)&(final_muts.mut_type=='nonsense')]) for gene in all_genes}

In [45]:
all_syn

{'ORF1a': 1006032,
 'ORF1b': 530552,
 'S': 273119,
 'S1': 153726,
 'ORF3a': 76980,
 'E': 14541,
 'M': 80598,
 'ORF6': 19985,
 'ORF7a': 34325,
 'ORF7b': 8369,
 'ORF8': 23949,
 'N': 152729,
 'ORF9b': 20238}

In [46]:
all_missense

{'ORF1a': 1328866,
 'ORF1b': 592141,
 'S': 478556,
 'S1': 297532,
 'ORF3a': 196513,
 'E': 23979,
 'M': 63792,
 'ORF6': 27946,
 'ORF7a': 110025,
 'ORF7b': 22295,
 'ORF8': 85267,
 'N': 243342,
 'ORF9b': 63362}

In [47]:
all_nonsense

{'ORF1a': 880,
 'ORF1b': 418,
 'S': 367,
 'S1': 239,
 'ORF3a': 1684,
 'E': 16,
 'M': 126,
 'ORF6': 1670,
 'ORF7a': 8688,
 'ORF7b': 1670,
 'ORF8': 9313,
 'N': 317,
 'ORF9b': 1358}

In [48]:
dS = {gene:all_syn[gene]/syn_denominators[gene] for gene in all_genes}
dN = {gene:all_missense[gene]/nonsyn_denominators[gene] for gene in all_genes}
dStop = {gene:all_nonsense[gene]/stop_denominators[gene] for gene in all_genes}

In [50]:
to_print = []
for gene in all_genes:
    to_print.append({'gene':gene, 'dS': dS[gene],'dN': dN[gene],  'dStop': dStop[gene], 'dN_dS':dN[gene]/dS[gene], 'dStop_dS':dStop[gene]/dS[gene]})
    
df_to_print = pd.DataFrame(to_print)
print(df_to_print)
df_to_print.to_csv('usher/trimmed/dn_ds_dstop_usher_4foldmutBN.tsv')

     gene           dS           dN        dStop     dN_dS  dStop_dS
0   ORF1a  1788.511773   676.110920     4.960269  0.378030  0.002773
1   ORF1b  1527.518632   495.271133     4.403693  0.324232  0.002883
2       S  1647.850096   854.845025     7.068244  0.518764  0.004289
3      S1  1786.913198   998.808082    10.007712  0.558957  0.005601
4   ORF3a  1802.186682  1617.110764   156.775568  0.897305  0.086992
5       E  1021.684167   765.286037    11.740479  0.749044  0.011491
6       M  2023.915022   632.016016    15.357651  0.312274  0.007588
7    ORF6  2853.912544  1308.157598   530.114314  0.458373  0.185750
8   ORF7a  1863.942259  2115.167427  1631.932588  1.134782  0.875527
9   ORF7b  1087.775583  1557.682114   967.842757  1.431988  0.889745
10   ORF8  1514.021079  1677.067416  1644.620590  1.107691  1.086260
11      N  2171.225308  1179.252899    12.101278  0.543128  0.005573
12  ORF9b   919.106542  1343.193700   329.765206  1.461412  0.358789


In [None]:
## I'm really struggling to understand why I'm getting such different values for S & S1 compared to Katie.

## 4. Calculate dN - dS - dStop, excluding terminal branches

In [173]:
noTerm_syn = {gene:len(final_muts[(final_muts.gene==gene)&(final_muts.mut_type=='synonymous')&(final_muts.node_id.str.contains('node'))]) for gene in all_genes}
noTerm_missense = {gene:len(final_muts[(final_muts.gene==gene)&(final_muts.mut_type=='missense')&(final_muts.node_id.str.contains('node'))]) for gene in all_genes}
noTerm_nonsense = {gene:len(final_muts[(final_muts.gene==gene)&(final_muts.mut_type=='nonsense')&(final_muts.node_id.str.contains('node'))]) for gene in all_genes}

In [174]:
dS_noTerm = {gene:noTerm_syn[gene]/syn_denominators[gene] for gene in all_genes}
dN_noTerm = {gene:noTerm_missense[gene]/nonsyn_denominators[gene] for gene in all_genes}
dStop_noTerm = {gene:noTerm_nonsense[gene]/stop_denominators[gene] for gene in all_genes}

In [175]:
to_print_noTerm = []
for gene in all_genes:
    to_print_noTerm.append({'gene':gene, 'dS': dS_noTerm[gene],'dN': dN_noTerm[gene],  'dStop': dStop_noTerm[gene], 'dN_dS':dN_noTerm[gene]/dS_noTerm[gene], 'dStop_dS':dStop_noTerm[gene]/dS_noTerm[gene]})
    
df_to_print_noTerm = pd.DataFrame(to_print_noTerm)
print(df_to_print_noTerm)
df_to_print_noTerm.to_csv('usher/trimmed/dn_ds_dstop_usher_noTerminalBranches.tsv')

     gene          dS          dN       dStop     dN_dS  dStop_dS
0   ORF1a  399.959226  236.201348    0.479448  0.590564  0.001199
1   ORF1b  348.023078  168.751095    0.257092  0.484885  0.000739
2       S  371.015146  267.927033    0.860328  0.722146  0.002319
3      S1  388.057022  307.709134    1.326901  0.792948  0.003419
4   ORF3a  464.995694  572.933426   55.004949  1.232126  0.118291
5       E  274.619753  205.048512    4.448794  0.746663  0.016200
6       M  566.985620  184.677624    3.415730  0.325718  0.006024
7    ORF6  605.081638  344.695878  247.439006  0.569668  0.408935
8   ORF7a  489.236145  652.392037  894.960924  1.333491  1.829303
9   ORF7b  270.052181  378.761954  556.459207  1.402551  2.060562
10   ORF8  339.684248  575.543733  891.887483  1.694349  2.625637
11      N  621.788300  437.345794    2.707120  0.703368  0.004354
12  ORF9b  314.102142  491.515731  105.201849  1.564828  0.334929


In [None]:
## Think about bootstrapping these values??

## ORF7a dN/dS is much lower when comparing to OG sequence --> suggests high rate of back mutation

## Actually katie did not compare to OG -- she compared to predecessor

## 5. Just by gene length

In [183]:
lengths = {k:(v.end - v.start)/3 for k,v in location_by_gene.items()}

In [184]:
syn_scaled = {gene:all_syn[gene]/lengths[gene] for gene in all_genes}
missense_scaled = {gene:all_missense[gene]/lengths[gene] for gene in all_genes}
stop_scaled = {gene:all_nonsense[gene]/lengths[gene] for gene in all_genes}

In [186]:
to_print_scaled = []
for gene in all_genes:
    to_print_scaled.append({'gene':gene, 'syn': syn_scaled[gene],'missense':missense_scaled[gene], 'stop': stop_scaled[gene]})
    
df_to_print_scaled = pd.DataFrame(to_print_scaled)
print(df_to_print_scaled)
#df_to_print_noTerm.to_csv('usher/trimmed/dn_ds_dstop_usher_noTerminalBranches.tsv')

     gene         syn    missense       stop
0   ORF1a  228.591684  301.946376   0.199955
1   ORF1b  196.792285  219.636869   0.155045
2       S  214.379121  375.632653   0.288069
3      S1  228.419019  442.098068   0.355126
4   ORF3a  278.913043  712.003623   6.101449
5       E  191.328947  315.513158   0.210526
6       M  361.426009  286.062780   0.565022
7    ORF6  322.338710  450.741935  26.935484
8   ORF7a  281.352459  901.844262  71.213115
9   ORF7b  190.204545  506.704545  37.954545
10   ORF8  196.303279  698.909836  76.336066
11      N  363.640476  579.385714   0.754762
12  ORF9b  206.510204  646.551020  13.857143


In [180]:
df.chunkSize

0      10000
1      10000
2      10000
3      10000
4      10000
       ...  
338    10000
339    10000
340    10000
341    10000
342     2474
Name: chunkSize, Length: 343, dtype: int64

In [179]:
df = pd.read_table('usher/trimmed/bootstrap_chunksizes.txt',header=None,names=['chunkSize'])