In [1]:
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
import sys 
sys.path.append('/home/anat/Research/ExAC/5.HMM_alter_align') 
from calc_exac_freq_func import codon_table

In [23]:
curr_dir = !pwd

input_path = curr_dir[0]+"/../5.HMM_alter_align/domains_states_dicts/pfam-v30/"

#Read binding scores
with open(curr_dir[0]+"/../binding_score/domains_binding_dict.pik", 'rb') as handle:
    binding_scores_dict = pickle.load(handle)
with open(curr_dir[0]+"/../binding_score/domains_binding_dict.pik", 'rb') as handle:
    binding_scores_all_dict = pickle.load(handle)   


#Read the list of domains
with open(curr_dir[0]+"/../5.domains_stats/filtered50_list.pik", 'rb') as handle:
    filtered_domains_list = pickle.load(handle)
filtered_domains_list.sort()

#Creating a list of the intersection of domains with binding scores and domains with states dicts
domains = []
for domain in filtered_domains_list:
    if (domain in binding_scores_dict.keys()):
        domains.append(domain)
len(domains)

125

In [5]:
def calculate_ns(codon):
    """Count the nonsynonymous and synonymous sites of a single codon.

    Each of the nine single-nucleotide substitutions of `codon` is translated
    through `codon_table` and compared with the amino acid of the original
    codon.

    codon: string of length 3 over the alphabet {A,T,G,C}.

    Returns a tuple (n, s) where n is the number of amino-acid-changing
    substitutions divided by 3 and s is the number of amino-acid-preserving
    substitutions divided by 3. Both are rounded through a '{:.5e}' format
    (6 significant digits), so n+s equals 3 up to that rounding.
    """
    bases = ["A", "T", "G", "C"]
    original_aa = codon_table[codon]

    same_aa = 0
    changed_aa = 0

    # Walk over the three codon positions, trying every other base at each one
    for pos in range(3):
        current_base = codon[pos:pos+1]
        for base in bases:
            if base == current_base:
                continue
            mutated = codon[:pos] + base + codon[pos+1:]
            if codon_table[mutated] == original_aa:
                same_aa += 1
            else:
                changed_aa += 1

    # Per-site values, truncated to the same precision for every codon
    n = float('{:.5e}'.format(changed_aa / float(3)))
    s = float('{:.5e}'.format(same_aa / float(3)))

    return (n,s)

In [128]:
def seq_ns(sequence):
    """Sum the per-codon site counts over a coding sequence.

    sequence: nucleotides forming complete codon triplets, read in frame
    starting at index 0.

    Returns a tuple (N, S): the sums of the "N" (nonsynonymous sites) and
    "S" (synonymous sites) entries of `codon_ns_table` over all triplets.
    No frequency weighting is applied here; every codon contributes its
    table value once.
    """
    total_nonsyn = 0
    total_syn = 0

    for start in range(0, len(sequence), 3):
        entry = codon_ns_table[sequence[start:start+3]]
        total_nonsyn += entry["N"]
        total_syn += entry["S"]

    return (total_nonsyn, total_syn)

In [125]:
# Build the lookup table: codon -> {"N": nonsynonymous sites, "S": synonymous sites}
codon_ns_table = {}
for codon in codon_table.keys():
    (n,s) = calculate_ns(codon)
    codon_ns_table[codon] = {"N": n, "S": s}

# Save the table next to the notebook so other scripts can load it
with open(curr_dir[0]+"/codon_ns_table.pik", 'wb') as handle:
    pickle.dump(codon_ns_table, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [129]:
# Compute a dN/dS ratio for every state of a domain's states dict.
# NOTE(review): as written only "zf-C2H2" is ever processed - the loop
# variable is overwritten on the first line of the body and the loop ends with
# an unconditional break. This looks like a debugging leftover; confirm before
# running over all domains (dn_ds_list is also re-created per domain).
for domain_name in domains:
    
    domain_name = "zf-C2H2"
    # `ls -t` lists newest first, so dirfiles[0] is the most recently modified file
    dirfiles = !ls -t $input_path$domain_name
    filename = dirfiles[0]
    with open(input_path+domain_name+"/"+filename, 'rb') as handle:
        states_dict = pickle.load(handle)
    
    # One dN/dS value per state, appended in states_dict.keys() order
    dn_ds_list = []
    
    for state in states_dict.keys():
        
        #Iterating the state dict to compute dn/ds
        ref_seq = ""
        #ref_af_list = []
        # Nd / Sd: summed values of bp_af_adj_dict for nonsynonymous / synonymous alternatives
        Nd = 0
        Sd = 0
        # Nd_n / Sd_n: how many nonsynonymous / synonymous alternatives were seen
        # (not used in the ratio below; only inspected from other cells)
        Nd_n = 0
        Sd_n = 0
        
        # Each entry d is read through the keys "bp_ref", "aa_ref" and "bp_af_adj_dict"
        for d in states_dict[state]:
            # Concatenate the reference codons into one in-frame sequence
            ref_codon = d["bp_ref"]
            ref_seq = ref_seq+ref_codon

            #ref_af = d["af_adj"]
            #ref_af_list.append(1-ref_af)
            
            #Calculating frequency-based N/S
            # presumably maps alternative codon -> adjusted allele frequency; verify against the producer
            bp_af_adj_dict = d["bp_af_adj_dict"]
            for alt_codon in bp_af_adj_dict.keys():
                alt_aa = codon_table[alt_codon]
                #syn
                if (alt_aa == d["aa_ref"]):
                    Sd += bp_af_adj_dict[alt_codon]
                    Sd_n += 1
                #Non-syn
                else:
                    Nd += bp_af_adj_dict[alt_codon]
                    Nd_n += 1
        
        #Reference expected nonsyn/syn site totals for the concatenated sequence
        (N,S) = seq_ns(ref_seq)
        
        #Proportion of nonsyn
        # NOTE(review): raises ZeroDivisionError if N (or S below) is 0
        PN = Nd/float(N)
        
        #Proportion of syn
        PS = Sd/float(S)
        
        #num of nonsyn substitutions per nonsyn site (Jukes-Cantor correction)
        # NOTE(review): the log argument is <= 0 once the proportion reaches 0.75,
        # giving nan/-inf from np.log; the same applies to dS below
        dN = -0.75 * (np.log(1-4*PN/float(3)))
        
        #num of syn substitutions per syn site (Jukes-Cantor correction)
        dS = -0.75 * (np.log(1-4*PS/float(3)))
        
        dN_dS = dN/dS
        
        dn_ds_list.append(dN_dS)
        
        print "finished state "+str(state)
    
    break

finished state 1
finished state 2
finished state 3
finished state 4
finished state 5
finished state 6
finished state 7
finished state 8
finished state 9
finished state 10
finished state 11
finished state 12
finished state 13
finished state 14
finished state 15
finished state 16
finished state 17
finished state 18
finished state 19
finished state 20
finished state 21
finished state 22
finished state 23


In [141]:
# Hand-made example to check the dN/dS arithmetic outside the domain loop
ref_seq = "ATGAAACCCGGGTTTTAA"
Nd = 0.4
Sd = 0.4

# Nonsynonymous (N) and synonymous (S) site totals of the reference sequence
N, S = seq_ns(ref_seq)

# Proportions of nonsynonymous and synonymous differences
PN = Nd / float(N)
PS = Sd / float(S)

# Jukes-Cantor corrected rates:
# dN = nonsyn substitutions per nonsyn site, dS = syn substitutions per syn site
dN = -0.75 * np.log(1 - (4 * PN) / float(3))
dS = -0.75 * np.log(1 - (4 * PS) / float(3))

dN_dS = dN / dS

In [154]:
# Show the nonsynonymous/synonymous site totals last returned by seq_ns
(N,S)

(14.66667, 3.333333)

In [139]:
# Show the synonymous proportion PS (Sd / S).
# NOTE(review): a value of 0.75 or more makes 1-4*PS/3 non-positive, so the log used for dS is undefined.
PS

1.500921856592186

In [140]:
# Show the current synonymous total Sd (numerator of PS)
Sd

5.003072355000001

In [76]:
# Show the current nonsynonymous total Nd (numerator of PN)
Nd

4.9597999999999995e-05

In [78]:
# Show Nd_n, the number of nonsynonymous alternative codons counted for the last processed state
Nd_n

4

In [124]:
# Show the per-state dN/dS ratios collected by the domain loop
dn_ds_list

[0.038714287873137632,
 0.22676700183044393,
 0.017861971094347742,
 0.18814651843985269,
 0.42056128336809684,
 0.014518099402607237,
 0.080160642894976805,
 0.062768098093980471,
 0.16916943428093492,
 0.03907715859500796,
 0.16071040618676827,
 0.032554478761844759,
 0.051138723865264482,
 0.071976944437929902,
 0.058088854810055761,
 0.45378505594304669,
 0.12939677879175382,
 0.44833222599564532,
 0.022100931871325436,
 0.11447379160250555,
 0.12613540430786338,
 0.38611899419581669,
 0.0038105636672948738]