In [1]:
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
import sys 
sys.path.append('/home/anat/Research/ExAC/8.states_analysis') 
from emd_func import compute_pymed_emd

from IPython.core.display import HTML
#Inject a CSS rule that sets the notebook's .container element to 100% width
HTML("<style>.container { width:100% !important; }</style>")

In [23]:
curr_dir = !pwd
#dicts_path = curr_dir[0]+"/../7.filters/filtered_dicts/pfam-v30/"
hmm_filename = curr_dir[0]+"/../2.parse_Pfam/v30/domains_hmm_prob_dict.pik"
pfam_aa_order = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
input_path = curr_dir[0]+"/../5.HMM_alter_align/domains_states_dicts/pfam-v30/"

#Read binding scores
with open(curr_dir[0]+"/../binding_score/domains_binding_dict.pik", 'rb') as handle:
    binding_scores_dict = pickle.load(handle)

#Read the list of domains
with open(curr_dir[0]+"/../5.domains_stats/filtered50_list.pik", 'rb') as handle:
    filtered_domains_list = pickle.load(handle)
filtered_domains_list.sort()

#Creating a list of the intersection of domains with binding scores and domains with states dicts
domains = []
for domain in filtered_domains_list:
    if (domain in binding_scores_dict.keys()):
        domains.append(domain)

In [8]:
#Load the pickled HMM probabilities dict from hmm_filename
with open(hmm_filename, 'rb') as hmm_file:
    hmm_prob_dict = pickle.load(hmm_file)

In [32]:
%%time
features_dict = defaultdict(list)
states_features_dict = defaultdict(list)

for domain_name in domains:
    
    dirfiles = !ls -t $input_path$domain_name
    filename = dirfiles[0]
    with open(input_path+domain_name+"/"+filename, 'rb') as handle:
        states_dict = pickle.load(handle)
    
    #Create af_adj flat dict
    states_af_adj_dict = defaultdict(list)
    for state in states_dict.keys():
        for d in states_dict[state]:
            states_af_adj_dict[state].append(d["af_adj"])
        
    #scale the af_dict
    states_MAF_adj_dict_scaled = defaultdict(list)
    for state in states_dict.keys():
        state_len = len(states_dict[state])
        for d in states_dict[state]:
            states_MAF_adj_dict_scaled[state].append(float(d["af_adj"]/state_len))
    
    #Create EMD dict
    af_adj_emd_dict = compute_pymed_emd(states_MAF_adj_dict_scaled, 1000)
    
    #Create a dict of conserved states
    con_states_dict = {}
    con_threshold = 0.9
    for state in hmm_prob_dict[domain_name].keys():
        prob_list = hmm_prob_dict[domain_name][state]
        for i in range(len(prob_list)):
            p = prob_list[i]
            if (p > con_threshold):
                major_allele = pfam_aa_order[i]
                con_states_dict[state] = major_allele
    
    #Adding states features
    for state in states_dict.keys():
        
        #Init counters & paramters
        maf_sum = 0
        sites_aa_alter_num = 0
        sites_snp_alter_num = 0
        sites_aa_num = len(states_dict[state])
        sites_snp_num = 3*sites_aa_num
        sites_poly_aa_num = 0 #The number of different aa in all the altered sites (most are 1)
        sites_poly_aa_several = 0
        
        #Rare-poly-counters
        maft_5 =  0.005
        maft_05 = 0.0005
        maft_005 = 0.00005
        rare_5_num = 0
        rare_05_num = 0
        rare_005_num = 0
        
        #Iterating the state dict to get properties
        for d in states_dict[state]:
            if (d["af_adj"] > 0):
                sites_aa_alter_num += 1
                sites_snp_alter_num += len(d["an_adj"])
                maf_sum += d["af_adj"]
                
                #Number of different polymorphisms at this site
                site_poly_num = len(d["alterations_af_adj_dict"].keys())
                sites_poly_aa_num += site_poly_num
                if (site_poly_num > 1):
                    sites_poly_aa_several += 1
                
                #Rare poly features
                if (d["af_adj"] <= maft_005):
                    rare_005_num += 1
                    rare_05_num += 1
                    rare_5_num += 1
                elif (d["af_adj"] <= maft_05):
                    rare_05_num += 1
                    rare_5_num += 1
                elif (d["af_adj"] <= maft_5):
                    rare_5_num += 1
        
        #Feature: EMD to all 0s distribution
        features_dict[domain_name+"_"+str(state)].append(af_adj_emd_dict[state])
        
        #Feature: average MAF overall aa sites
        if (sites_aa_num == 0):
            avg_maf_overall = 0
        else:
            avg_maf_overall = maf_sum/float(sites_aa_num)
        features_dict[domain_name+"_"+str(state)].append(avg_maf_overall)
        
        #Feature: average MAF of all the altered sites
        if (sites_aa_alter_num == 0):
            avg_maf_only_altered = 0
        else:
            avg_maf_only_altered = maf_sum/float(sites_aa_alter_num)
        features_dict[domain_name+"_"+str(state)].append(avg_maf_only_altered)
        
        #Feature: number of alterations - aa level (raw and normalized by total number of matched positions)
        if (sites_aa_num == 0):
            norm_aa_alter_num = 0
        else:
            norm_aa_alter_num = sites_aa_alter_num/float(sites_aa_num)
        features_dict[domain_name+"_"+str(state)].append(sites_aa_alter_num)
        features_dict[domain_name+"_"+str(state)].append(norm_aa_alter_num)
        
        #Feature: number of alterations - DNA level (raw and normalized by total number of matched positions)
        if (sites_snp_num == 0):
            norm_snp_alter_num = 0
        else:
            norm_snp_alter_num = sites_snp_alter_num/float(sites_snp_num)
        features_dict[domain_name+"_"+str(state)].append(sites_snp_alter_num)
        features_dict[domain_name+"_"+str(state)].append(norm_snp_alter_num)
        
        #Feature: average number of poymorphisms at one site
        if (sites_aa_alter_num == 0):
            avg_poly_aa = 0
        else:
            avg_poly_aa = sites_poly_aa_num/float(sites_aa_alter_num)
        features_dict[domain_name+"_"+str(state)].append(avg_poly_aa)

        #Feature: fraction of altered sites with more than 1 polymorphism
        if (sites_aa_alter_num == 0):
            frac_poly_several = 0
        else:
            frac_poly_several = sites_poly_aa_several/float(sites_aa_alter_num)
        features_dict[domain_name+"_"+str(state)].append(frac_poly_several)
        
        #Feature: fraction of rare SNPs (0.5%)
        if (sites_aa_alter_num == 0):
            frac_rare_5 = 0
        else:
            frac_rare_5 = rare_5_num/float(sites_aa_alter_num)
        features_dict[domain_name+"_"+str(state)].append(frac_rare_5)

        #Feature: fraction of rare SNPs (0.05%)
        if (sites_aa_alter_num == 0):
            frac_rare_05 = 0
        else:
            frac_rare_05 = rare_05_num/float(sites_aa_alter_num)
        features_dict[domain_name+"_"+str(state)].append(frac_rare_05)

        #Feature: fraction of rare SNPs (0.005%)
        if (sites_aa_alter_num == 0):
            frac_rare_005 = 0
        else:
            frac_rare_005 = rare_005_num/float(sites_aa_alter_num)
        features_dict[domain_name+"_"+str(state)].append(frac_rare_005)
        
        #Feature: state type (0=non-important, 1=structural, 2=functionally-important)
        state_type = "Neutral" #Initializing as "non-important"
        if (state in binding_scores_dict[domain_name]["states"]):
            state_type = "Functional"
        if (state in con_states_dict.keys()):
            state_type = "Structural"
        features_dict[domain_name+"_"+str(state)].append(state_type)
        
        #Feature: domain name
        features_dict[domain_name+"_"+str(state)].append(domain_name)
    
    print "Finished "+domain_name

Finished 7TM_GPCR_Srsx
Finished 7tm_1
Finished AAA
Finished AAA_5
Finished ABC_membrane
Finished ABC_tran
Finished Ank
Finished Ank_2
Finished Ank_3
Finished Ank_4
Finished Ank_5
Finished Annexin
Finished Arf
Finished Arm
Finished BTB
Finished BTB_2
Finished Bromodomain
Finished C1-set
Finished C1_1
Finished C2
Finished C2-set_2
Finished CBFD_NFYB_HMF
Finished CH
Finished CUB
Finished Cadherin
Finished Cadherin_2
Finished DEAD
Finished EF-hand_1
Finished EF-hand_5
Finished EF-hand_6
Finished EF-hand_7
Finished EF-hand_8
Finished EGF
Finished EGF_CA
Finished FXa_inhibition
Finished Filamin
Finished Forkhead
Finished HLH
Finished HMG_box
Finished HMG_box_2
Finished Helicase_C
Finished Hemopexin
Finished Histone
Finished Homeobox
Finished Homeobox_KN
Finished Hormone_recep
Finished I-set
Finished IQ
Finished Ig_2
Finished Ig_3
Finished Ion_trans
Finished Ion_trans_2
Finished KH_1
Finished Kelch_1
Finished Kelch_2
Finished Kelch_3
Finished Kelch_4
Finished Kelch_6
Finished LIM
Finished LRR

In [33]:
#Build the features table: one row per features_dict key, one column per feature (in append order)
feature_columns = [
    "EMD", "avg_maf_all", "avg_maf_altered",
    "alter_num_aa", "alter_num_aa_norm",
    "alter_num_dna", "alter_num_dna_norm",
    "avg_poly", "frac_poly_several",
    "rare_poly_0.5%", "rare_poly_0.05%", "rare_poly_0.005%",
    "state_type", "domain_name",
]
domains_features_df = pd.DataFrame.from_dict(features_dict, orient='index').sort_index()
domains_features_df.columns = feature_columns

#Write the table to a tab-separated file in the working directory
output_filename = curr_dir[0]+"/binding_50instances_states_features_df.csv"
domains_features_df.to_csv(output_filename, sep='\t')

In [34]:
#Display the features table built in the previous cell
domains_features_df

Unnamed: 0,EMD,avg_maf_all,avg_maf_altered,alter_num_aa,alter_num_aa_norm,alter_num_dna,alter_num_dna_norm,avg_poly,frac_poly_several,rare_poly_0.5%,rare_poly_0.05%,rare_poly_0.005%,state_type,domain_name
7TM_GPCR_Srsx_1,3.487256e-09,6.786182e-05,0.000313,28,0.217054,30,0.077519,1.035714,0.035714,0.964286,0.928571,0.821429,Neutral,7TM_GPCR_Srsx
7TM_GPCR_Srsx_10,1.661450e-10,1.675740e-05,0.000094,23,0.178295,28,0.072351,1.217391,0.217391,1.000000,0.956522,0.869565,Neutral,7TM_GPCR_Srsx
7TM_GPCR_Srsx_100,3.595270e-10,2.660871e-05,0.000084,41,0.317829,47,0.121447,1.146341,0.146341,1.000000,0.975610,0.707317,Neutral,7TM_GPCR_Srsx
7TM_GPCR_Srsx_101,5.012538e-11,1.344598e-05,0.000053,33,0.255814,43,0.111111,1.303030,0.272727,1.000000,1.000000,0.696970,Neutral,7TM_GPCR_Srsx
7TM_GPCR_Srsx_102,4.197513e-09,7.395236e-05,0.000238,40,0.310078,51,0.131783,1.250000,0.250000,0.975000,0.975000,0.700000,Neutral,7TM_GPCR_Srsx
7TM_GPCR_Srsx_103,3.280226e-08,2.024831e-04,0.000539,47,0.376000,63,0.168000,1.319149,0.276596,0.978723,0.957447,0.787234,Neutral,7TM_GPCR_Srsx
7TM_GPCR_Srsx_104,1.882823e-09,6.393542e-05,0.000262,31,0.244094,38,0.099738,1.225806,0.225806,1.000000,0.903226,0.677419,Neutral,7TM_GPCR_Srsx
7TM_GPCR_Srsx_105,2.386254e-11,1.127571e-05,0.000045,32,0.248062,38,0.098191,1.187500,0.187500,1.000000,1.000000,0.750000,Neutral,7TM_GPCR_Srsx
7TM_GPCR_Srsx_106,4.487727e-09,8.297504e-05,0.000297,36,0.279070,44,0.113695,1.222222,0.194444,0.972222,0.944444,0.694444,Neutral,7TM_GPCR_Srsx
7TM_GPCR_Srsx_107,3.627340e-10,2.333800e-05,0.000086,35,0.271318,38,0.098191,1.085714,0.085714,1.000000,0.971429,0.828571,Neutral,7TM_GPCR_Srsx
