In [4]:
import pandas as pd
import numpy as np
import cPickle as pickle
import datetime
import sys
curr_dir = !pwd
sys.path.append(curr_dir[0] + "/../5.HMM_alter_align") 
from calc_exac_freq_func import codon_table
from dnds_func import calculate_ns, seq_ns
from collections import defaultdict
from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

In [5]:
#Getting path
curr_dir = !pwd
intance_cutoff = "10"

#Reading the list of filtered domains
with open(curr_dir[0]+"/../5.domains_stats/filtered"+intance_cutoff+"_list.pik", 'rb') as handle:
    filtered_domains_list = pickle.load(handle)
filtered_domains_list.sort()

#Reading the table of all domains stats
filtered_domains_df = pd.read_csv(curr_dir[0]+"/../5.domains_stats/filtered"+intance_cutoff+"_domains_df.csv", sep='\t', index_col=0)

#Read the substitutions table (for the dN/dS calculation)
with open(curr_dir[0]+"/codon_ns_table.pik", 'rb') as handle:
    codon_ns_table = pickle.load(handle)

#Reading the BLOSUM62 dict
with open(curr_dir[0]+"/../BLOSUM62/BLOSUM62_dict.pik", 'rb') as handle:
    blosum62_dict = pickle.load(handle)

In [6]:
import math

# Calculates a normalized Shannon entropy where a is a vector of probabilities (from Miller et al, 2015)
def entropy(a):
    """Return the normalized Shannon entropy of a probability vector.

    The entropy -sum(p * ln p) is divided by ln(len(a)), its maximum for a
    vector of that length, so the result is 1 for a uniform vector and 0 when
    all the mass sits on one entry.

    Parameters:
        a: sized iterable of probabilities (list or 1-D numpy array). Zero
           entries are skipped, i.e. 0 * ln 0 is taken as 0.

    Returns:
        The normalized entropy. For fewer than two outcomes the normalizer
        ln(len(a)) is zero (or undefined for an empty vector); there is no
        uncertainty in that case, so 0.0 is returned instead of dividing by zero.
    """
    n_outcomes = len(a)
    if n_outcomes < 2:
        return 0.0

    weighted_log_sum = 0
    for val in a:
        if val == 0:
            continue
        weighted_log_sum += val * math.log(val)
    return(-weighted_log_sum / math.log(n_outcomes))

In [18]:
#%%time
#Calculate domains features
input_path = curr_dir[0]+"/../5.HMM_alter_align/domains_states_dicts/pfam-v30/"
features_dict = defaultdict(list)
missing_domains = []

for domain_name in filtered_domains_list:
    
    #Reading the domain states dictionary
    domain_dirfiles = !ls -t $input_path$domain_name
    #Find the most recent file
    recent_priority = -1
    recent_filename = ""
    for f in domain_dirfiles:
        tokens = f.split("_")
        date = tokens[len(tokens)-1].split(".")
        month = int(date[0])
        day = int(date[1])
        #Not all files have years, but those that do are the most recent
        if date[2] != "pik":
            year = int(date[2])
        else:
            year = 0
        priority = year*1000 + month*50 + day
        if priority > recent_priority:
            recent_priority = priority
            recent_filename = f
    with open(input_path+domain_name+"/"+recent_filename, 'rb') as handle:
        states_dict = pickle.load(handle)
    #Initializing feature counters
    maf_sum = 0
    sites_aa_num = 0
    sites_aa_alter_num = 0
    sites_snp_alter_num = 0
    sites_poly_aa_num = 0 #The number of different aa in all the altered sites (most are 1)
    sites_poly_aa_several = 0
    
    #Rare-poly-counters
    maft_5 =  0.005
    maft_05 = 0.0005
    maft_005 = 0.00005
    rare_5_num = 0
    rare_05_num = 0
    rare_005_num = 0
    
    #BLOSUM62_vals
    blosum62_list = []
    weigted_blosum62_list = []
    
    #SIFT counters
    sift_sum = 0
    sift_cnt = 0
    
    #PolyPhen counters
    polyphen_sum = 0
    polyphen_cnt = 0
    
    #dn/ds counters and variables
    ref_seq = ""
    #ref_af_list = []
    Nd = 0
    Sd = 0
    
    #Entropy
    mutations_by_pos = np.zeros(states_dict.keys()[len(states_dict.keys())-1])
    nonsyn_by_pos = np.zeros(states_dict.keys()[len(states_dict.keys())-1])
    mutations_by_gene = {}
    nonsyn_by_gene = {}
    
    #Conservation scores
    counter_phyloP = 0
    counter_phastCons = 0
    sum_phyloP = 0
    sum_phastCons = 0
    num_noncon_bp_phyloP = 0
    num_noncon_bp_phastCons = 0
    num_noncon_aa_phyloP = 0
    num_noncon_aa_phastCons = 0
    
    for state in states_dict:
        sites_aa_num += len(states_dict[state])
        for d in states_dict[state]:
            #Creating a position pseudo-ref sequence
            ref_codon = d["bp_ref"]
            ref_seq = ref_seq+ref_codon
            
            #Calculating frequency-based N/S
            #Calculate entropy
            bp_af_adj_dict = d["bp_af_adj_dict"]
            for alt_codon in bp_af_adj_dict.keys():
                alt_aa = codon_table[alt_codon]
                
                #Entropy: all SNPs
                mutations_by_pos[state-1] += bp_af_adj_dict[alt_codon]
                key = d["ens_gene"]
                if key in mutations_by_gene.keys():
                    mutations_by_gene[key] += bp_af_adj_dict[alt_codon]
                else:
                    mutations_by_gene[key] = bp_af_adj_dict[alt_codon]
                
                #syn
                if (alt_aa == d["aa_ref"]):
                    Sd += bp_af_adj_dict[alt_codon]
                #Non-syn
                else:
                    Nd += bp_af_adj_dict[alt_codon]
                    
                    #Entropy — nonsyn
                    nonsyn_by_pos[state-1] += bp_af_adj_dict[alt_codon]
                    if key in nonsyn_by_gene.keys():
                        nonsyn_by_gene[key] += bp_af_adj_dict[alt_codon]
                    else:
                        nonsyn_by_gene[key] = bp_af_adj_dict[alt_codon]
                        
            #Conservation
            for score in d["phyloP"]:
                counter_phyloP += 1
                sum_phyloP += score
                if score < 0:
                    num_noncon_bp_phyloP += 1
            if len(d["phyloP"]) > 0 and sum(d["phyloP"]) / len(d["phyloP"]) < 0:
                num_noncon_aa_phyloP += 1
            for score in d["phastCons"]:
                counter_phastCons += 1
                sum_phastCons += score
                if score < 0.5:
                    num_noncon_bp_phastCons += 1
            if len(d["phastCons"]) > 0 and sum(d["phastCons"]) / len(d["phastCons"]) < 0.5:
                num_noncon_aa_phastCons += 1
            
            if (d["af_adj"] > 0):
                sites_aa_alter_num += 1
                sites_snp_alter_num += len(d["an_adj"])
                maf_sum += d["af_adj"]
                
                #Number of different polymorphisms at this site
                site_poly_num = len(d["alterations_af_adj_dict"].keys())
                sites_poly_aa_num += site_poly_num
                if (site_poly_num > 1):
                    sites_poly_aa_several += 1
                
                #Rare poly features
                if (d["af_adj"] < maft_005):
                    rare_005_num += 1
                    rare_05_num += 1
                    rare_5_num += 1
                elif (d["af_adj"] < maft_05):
                    rare_05_num += 1
                    rare_5_num += 1
                elif (d["af_adj"] < maft_5):
                    rare_5_num += 1
                    
                #BLOSUM62 features
                ref = d["aa_ref"]
                for alt in d["alterations_af_adj_dict"].keys():
                    blosum_val = blosum62_dict[ref][alt]
                    af_adj = np.mean(d["alterations_af_adj_dict"][alt])
                    blosum62_list.append(blosum_val)
                    weigted_blosum62_list.append(blosum_val*af_adj)
                
                #SIFT
                sift_list = d["SIFT"]
                for s in sift_list:
                    if (s != ""):
                        sift_sum += float(s[s.find("(")+1:s.find(")")])
                        sift_cnt += 1
                        
                #PolyPhen
                polyphen_list = d["PolyPhen"]      
                for s in polyphen_list:
                    if (s != ""):
                        polyphen_sum += float(s[s.find("(")+1:s.find(")")])
                        polyphen_cnt += 1
        
    #Feature: domain length
    domain_len = len(states_dict.keys())
    features_dict[domain_name].append(domain_len)
    
    #Feature: average MAF overall aa sites
    avg_maf_overall = maf_sum/float(sites_aa_num)
    features_dict[domain_name].append(avg_maf_overall)
    
    #Feature: average MAF of all the altered sites
    avg_maf_only_altered = maf_sum/float(sites_aa_alter_num)
    features_dict[domain_name].append(avg_maf_only_altered)
    
    #Feature: number of alterations - aa level (raw and normalized by domain length)
    norm_aa_alter_num = sites_aa_alter_num/float(domain_len)
    features_dict[domain_name].append(sites_aa_alter_num)
    features_dict[domain_name].append(norm_aa_alter_num)
    
    #Feature: number of alterations - DNA level (raw and normalized by domain length)
    norm_snp_alter_num = sites_snp_alter_num/float(domain_len)
    features_dict[domain_name].append(sites_snp_alter_num)
    features_dict[domain_name].append(norm_snp_alter_num)
    
    #Feature: fraction of aa alterations (fraction of non-zero alterations)
    frac_alter_aa = sites_aa_alter_num/float(sites_aa_num)
    features_dict[domain_name].append(frac_alter_aa)
    
    #Feature: average number of poymorphisms at one site
    avg_poly_aa = sites_poly_aa_num/float(sites_aa_alter_num)
    features_dict[domain_name].append(avg_poly_aa)
    
    #Feature: fraction of altered sites with more than 1 polymorphism
    frac_poly_several = sites_poly_aa_several/float(sites_aa_alter_num)
    features_dict[domain_name].append(frac_poly_several)
    
    #Feature: fraction of rare SNPs (0.5%)
    frac_rare_5 = rare_5_num/float(sites_aa_alter_num)
    features_dict[domain_name].append(frac_rare_5)
    
    #Feature: fraction of rare SNPs (0.05%)
    frac_rare_05 = rare_05_num/float(sites_aa_alter_num)
    features_dict[domain_name].append(frac_rare_05)
    
    #Feature: fraction of rare SNPs (0.005%)
    frac_rare_005 = rare_005_num/float(sites_aa_alter_num)
    features_dict[domain_name].append(frac_rare_005)
    
    #Feature: BLOSUM62 average
    blosum62_avg = sum(blosum62_list)/float(len(blosum62_list))
    weigted_blosum62_avg = sum(weigted_blosum62_list)/float(len(weigted_blosum62_list))
    features_dict[domain_name].append(blosum62_avg)
    features_dict[domain_name].append(weigted_blosum62_avg)
    
    #Feature: pseudo-sequence dN/dS        
    (N,S) = seq_ns(ref_seq) #Refrence expected syn/nonsyn per site
    if (N == 0): 
        PN = 0
    else:
        PN = Nd/float(N) #Proportion of nonsyn
    if (S == 0):
        PS = 0
    else:
        PS = Sd/float(S) #Proportion of syn

    #num of nonsyn substitutions per syn site
    dN = -0.75 * (np.log(1-4*PN/float(3)))
    #num of syn substitutions per nonsyn site
    dS = -0.75 * (np.log(1-4*PS/float(3)))

    if (dN ==0 or dS == 0):
        dN_dS = 1 #There isn't enough information to calculate dN/dS
    else:
        dN_dS = dN/dS

    features_dict[domain_name].append(dN_dS)
    
    #Feature: SIFT average
    if (sift_cnt > 0):
        sift_avg = sift_sum/float(sift_cnt)
    else:
        sift_avg = -1
    features_dict[domain_name].append(sift_avg)
    
    #Feature: PolyPhen average
    if (polyphen_cnt > 0):
        polyphen_avg = polyphen_sum/float(polyphen_cnt)
    else:
        polyphen_avg = -1
    features_dict[domain_name].append(polyphen_avg)
    
    #Feature: Fraction of DNA sites altered
    frac_snp_alter = float(sites_snp_alter_num)/(3*sites_aa_num)
    features_dict[domain_name].append(frac_snp_alter)
    
    #Feature: Entropy
    p_mut = mutations_by_pos/sum(mutations_by_pos)
    mut_entropy = entropy(p_mut)
    features_dict[domain_name].append(mut_entropy)
    
    p_nonsyn = nonsyn_by_pos/sum(nonsyn_by_pos)
    nonsyn_entropy = entropy(p_nonsyn)
    features_dict[domain_name].append(nonsyn_entropy)
    
    mut_vals = np.fromiter(iter(mutations_by_gene.values()), dtype=float)
    p_mut_gene = mut_vals/sum(mut_vals)
    mut_gene_entropy = entropy(p_mut_gene)
    features_dict[domain_name].append(mut_gene_entropy)
    
    nonsyn_vals = np.fromiter(iter(nonsyn_by_gene.values()), dtype=float)
    p_nonsyn_gene = nonsyn_vals/sum(nonsyn_vals)
    nonsyn_gene_entropy = entropy(p_nonsyn_gene)
    features_dict[domain_name].append(nonsyn_gene_entropy)
    
    #Feature: Conservation
    avg_phyloP = sum_phyloP / counter_phyloP
    features_dict[domain_name].append(avg_phyloP)
    
    avg_phastCons = sum_phastCons / counter_phastCons
    features_dict[domain_name].append(avg_phastCons)
    
    frac_noncon_bp_phyloP = float(num_noncon_bp_phyloP) / counter_phyloP
    features_dict[domain_name].append(frac_noncon_bp_phyloP)
    
    frac_noncon_bp_phastCons = float(num_noncon_bp_phastCons) / counter_phastCons
    features_dict[domain_name].append(frac_noncon_bp_phastCons)
    
    frac_noncon_aa_phyloP = 3*float(num_noncon_aa_phyloP) / counter_phyloP
    features_dict[domain_name].append(frac_noncon_aa_phyloP)
    
    frac_noncon_aa_phastCons = 3*float(num_noncon_aa_phastCons) / counter_phastCons
    features_dict[domain_name].append(frac_noncon_aa_phastCons)
    
    print("Finished domain "+domain_name)

Finished domain 2OG-FeII_Oxy_3
Finished domain 7TM_GPCR_Srsx
Finished domain 7tm_1
Finished domain 7tm_2
Finished domain 7tm_3
Finished domain 7tm_4
Finished domain A2M
Finished domain A2M_N
Finished domain A2M_N_2
Finished domain A2M_comp
Finished domain A2M_recep
Finished domain AAA
Finished domain AAA_11
Finished domain AAA_12
Finished domain AAA_17
Finished domain AAA_18
Finished domain AAA_33
Finished domain AAA_5
Finished domain AAA_6
Finished domain AAA_7
Finished domain AAA_8
Finished domain AAA_9
Finished domain AA_permease
Finished domain AA_permease_2
Finished domain ABC2_membrane_3
Finished domain ABC_membrane
Finished domain ABC_tran
Finished domain ADAM_CR
Finished domain ADAM_spacer1
Finished domain ADH_N
Finished domain ADH_zinc_N
Finished domain ADK
Finished domain AMP-binding
Finished domain AMP-binding_C
Finished domain ANAPC3
Finished domain ANAPC4_WD40
Finished domain ANATO
Finished domain ANF_receptor
Finished domain APC_r
Finished domain APOBEC_C
Finished domain 

  # Remove the CWD from sys.path while we load stuff.


Finished domain CNH
Finished domain COLFI
Finished domain COesterase
Finished domain CRAL_TRIO
Finished domain CRAL_TRIO_2
Finished domain CS
Finished domain CSD
Finished domain CT47
Finished domain CUB
Finished domain CUT
Finished domain Cadherin
Finished domain Cadherin_2
Finished domain Cadherin_3
Finished domain Cadherin_C
Finished domain Cadherin_C_2
Finished domain Cadherin_tail
Finished domain Calpain_III
Finished domain Calponin
Finished domain Calx-beta
Finished domain Carb_anhydrase
Finished domain CarboxypepD_reg
Finished domain Cation_ATPase
Finished domain Cation_ATPase_C
Finished domain Cation_ATPase_N
Finished domain Choline_transpo
Finished domain Chromo
Finished domain Cir_N
Finished domain Clat_adaptor_s
Finished domain Clathrin
Finished domain Clathrin_propel
Finished domain Claudin_2
Finished domain Cnd1
Finished domain Cofilin_ADF
Finished domain Collagen
Finished domain Complex1_LYR
Finished domain Connexin
Finished domain Cpn60_TCP1
Finished domain Crystall
Finis

In [20]:
#features_dict

In [24]:
#Exporting to data-frames table
#Column names, in the same order in which the features were appended per domain
feature_columns = [
    "length", "avg_maf_all", "avg_maf_altered", "alter_num_aa", "alter_num_aa_norm",
    "alter_num_dna", "alter_num_dna_norm", "frac_alter_aa", "avg_poly", "frac_poly_several",
    "rare_poly_0.5%", "rare_poly_0.05%", "rare_poly_0.005%", "BLOSUM_avg", "weighted_BLOSUM_avg",
    "pseudo_dNdS", "SIFT", "PolyPhen", "frac_alter_dna", "entropy_pos_alter", "entropy_pos_nonsyn",
    "entropy_gene_alter", "entropy_gene_nonsyn", "avg_phyloP", "avg_phastCons", "frac_noncon_bp_phyloP",
    "frac_noncon_bp_phastCons", "frac_noncon_aa_phyloP", "frac_noncon_aa_phastCons",
]
domains_features_df = pd.DataFrame.from_dict(features_dict, orient='index')
domains_features_df.columns = feature_columns
domains_features_df = domains_features_df.sort_index()

#Adding the data from the df (aligned on the domain-name index)
for target_col, source_col in (("num_genes", "genes"), ("num_instances", "instances")):
    domains_features_df[target_col] = filtered_domains_df[source_col]

#Computing log2 of genes number and of instances number
for count_col in ("num_genes", "num_instances"):
    domains_features_df[count_col+"_log2"] = np.log2(domains_features_df[count_col])

#Save to file
output_path = curr_dir[0]+"/domains_features_df_filtered"+intance_cutoff+"_test.csv"
domains_features_df.to_csv(output_path, sep='\t')

In [23]:
domains_features_df

Unnamed: 0,length,avg_maf_all,avg_maf_altered,alter_num_aa,alter_num_aa_norm,alter_num_dna,alter_num_dna_norm,frac_alter_aa,avg_poly,frac_poly_several,...,avg_phyloP,avg_phastCons,frac_noncon_bp_phyloP,frac_noncon_bp_phastCons,frac_noncon_aa_phastCons,frac_noncon_aa_phyloP,num_genes,num_instances,num_genes_log2,num_instances_log2
2OG-FeII_Oxy_3,96,0.000013,0.000050,256,2.666667,346,3.604167,0.258325,1.156250,0.128906,...,4.854861,0.910980,0.100572,0.084763,0.004036,0.016145,11,11,3.459432,3.459432
7TM_GPCR_Srsx,257,0.000502,0.001846,8887,34.579767,12621,49.108949,0.272057,1.211207,0.186115,...,3.356244,0.717283,0.197555,0.280567,0.108461,0.218515,145,145,7.179909,7.179909
7tm_1,268,0.001191,0.004008,47227,176.220149,67859,253.205224,0.297169,1.232431,0.201558,...,1.591294,0.436266,0.367939,0.564156,0.316040,0.521850,768,768,9.584963,9.584963
7tm_2,245,0.000395,0.001481,3028,12.359184,4333,17.685714,0.266432,1.209379,0.183950,...,3.485768,0.775320,0.156856,0.221880,0.049010,0.150462,49,49,5.614710,5.614710
7tm_3,234,0.000090,0.000357,1249,5.337607,1801,7.696581,0.252374,1.216974,0.191353,...,4.038676,0.815387,0.128241,0.180777,0.036775,0.117600,29,29,4.857981,4.857981
7tm_4,280,0.001838,0.005813,33656,120.200000,48190,172.107143,0.316185,1.244860,0.210720,...,0.318924,0.207917,0.510284,0.794725,0.487665,0.772416,478,478,8.900867,8.900867
A2M,92,0.001609,0.008448,155,1.684783,216,2.347826,0.190418,1.212903,0.174194,...,2.329198,0.642471,0.235463,0.359541,0.143735,0.282555,16,16,4.000000,4.000000
A2M_N,96,0.000184,0.000752,205,2.135417,286,2.979167,0.244048,1.204878,0.175610,...,1.804648,0.672252,0.220238,0.318254,0.123810,0.276190,16,16,4.000000,4.000000
A2M_N_2,139,0.000605,0.002875,256,1.841727,352,2.532374,0.210526,1.148438,0.136719,...,1.631708,0.577871,0.255757,0.421601,0.162829,0.386513,16,16,4.000000,4.000000
A2M_comp,249,0.000572,0.002455,507,2.036145,720,2.891566,0.233103,1.218935,0.177515,...,2.090421,0.622667,0.243525,0.376398,0.143448,0.316322,16,16,4.000000,4.000000
