In [1]:
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
import sys 
sys.path.append('/home/anat/Research/ExAC/8.states_analysis') 
from dnds_func import calculate_ns, seq_ns
sys.path.append('/home/anat/Research/ExAC/5.HMM_alter_align') 
from calc_exac_freq_func import codon_table
from aa_chemical_properties import aa_charge, aa_charge_dict, aa_functional_group, aa_functional_group_dict, hindex_Kyte_Doolitle, aa_propensity, propensity_chou_fasman
import random
from enum import Enum

from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

### Constants

In [2]:
curr_dir = !pwd
domains_th = "10"
SIFT_THRESHOLD = 0.05

#Rare SNP thresholds
MAFT_5 =  0.005
MAFT_05 = 0.0005
MAFT_005 = 0.00005

class sift_codes(Enum):
    SIFT_DELETERIOUS = 0
    SIFT_TOLERATED = 1
    SIFT_TIE = 2
        
class polyphen_codes(Enum):
    POLYPHEN_BENIGN = 0
    POLYPHEN_POSSIBLY = 1
    POLYPHEN_PROBABLY = 2
    PLOYPHEN_EQUAL = 3
    POLYPHEN_UNKNOWN = 4
        
hmm_filename = curr_dir[0]+"/../2.parse_Pfam/v30/domains_hmm_prob_dict.pik"
pfam_aa_order = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
amino_acids_sym = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', "*"]

### Reading input files

In [3]:
input_path = curr_dir[0]+"/../5.HMM_alter_align/domains_states_dicts/pfam-v30/"

#Reading the BLOSUM62 dict
with open(curr_dir[0]+"/../BLOSUM62/BLOSUM62_dict.pik", 'rb') as handle:
    blosum62_dict = pickle.load(handle)

#Read binding scores
with open(curr_dir[0]+"/../binding_score/domains_binding_dict.pik", 'rb') as handle:
    binding_scores_dict = pickle.load(handle)
with open(curr_dir[0]+"/../binding_score/domains_all_binding_dict.pik", 'rb') as handle:
    binding_all_scores_dict = pickle.load(handle)

#Read the list of domains
with open(curr_dir[0]+"/../5.domains_stats/filtered"+domains_th+"_list.pik", 'rb') as handle:
    filtered_domains_list = pickle.load(handle)
filtered_domains_list.sort()

#Read the substitutions table (for the dN/dS calculation)
with open(curr_dir[0]+"/codon_ns_table.pik", 'rb') as handle:
    codon_ns_table = pickle.load(handle)

#Open the HMM dict - takes some time
with open(hmm_filename, 'rb') as handle:
    hmm_prob_dict = pickle.load(handle)    
    
#Creating a list of the intersection of domains with binding scores and domains with states dicts
domains = []
for domain in filtered_domains_list:
    if (domain in binding_scores_dict.keys()):
        domains.append(domain)
print "number of domains = "+str(len(domains))

number of domains = 409


In [4]:
def ExAC_MAF_features(states_features_dict, state_id, table_columns, first_pass, sites_aa_num, sites_aa_alter_num, maf_list):
    
    #Feature: avg MAF
    if (sites_aa_num == 0):
        avg_maf_overall = 0
    else:
        avg_maf_overall = np.sum(maf_list)/float(sites_aa_num)
    features_dict[state_id].append(avg_maf_overall)
    if (first_pass): table_columns.append("avg_maf_all")

    #Feature: avg MAF of all the altered sites
    if (sites_aa_alter_num == 0):
        avg_maf_only_altered = 0
    else:
        avg_maf_only_altered = np.sum(maf_list)/float(sites_aa_alter_num)
    features_dict[state_id].append(avg_maf_only_altered)
    if (first_pass): table_columns.append("avg_maf_altered")
    
    #Feature: MAF histogram,  #bins = [0.0, 0.02, 0.04, 0.06, 0.08, 0.1, 0.12, 0.14, 0.16, 0.18, 0.2, 0.3, 0.4, 0.5]
    bins = (np.linspace(0,0.2, 11)).tolist()
    bins.extend((np.linspace(0.3,0.5, 3)).tolist())
    maf_hist = np.histogram(maf_list, bins)[0]
    
    features_dict[state_id].extend(maf_hist)
    if (first_pass): 
        for i in range(len(bins)-1):
            hist_col_title = "maf_hist_"+str(bins[i])+"-"+str(bins[i+1])
            table_columns.append(hist_col_title)

In [5]:
def ExAC_count_features(states_features_dict, state_id, table_columns, first_pass, sites_aa_num, sites_aa_alter_num, sites_snp_num, sites_snp_alter_num):
    
    #Feature: number of alterations - aa level (raw and normalized by total number of matched positions)
    if (sites_aa_num == 0):
        norm_aa_alter_num = 0
    else:
        norm_aa_alter_num = sites_aa_alter_num/float(sites_aa_num)
    features_dict[state_id].append(sites_aa_alter_num)
    if (first_pass): table_columns.append("alter_num_aa")
    features_dict[state_id].append(norm_aa_alter_num)
    if (first_pass): table_columns.append("alter_num_aa_norm")

    #Feature: number of alterations - DNA level (raw and normalized by total number of matched positions)
    if (sites_snp_num == 0):
        norm_snp_alter_num = 0
    else:
        norm_snp_alter_num = sites_snp_alter_num/float(sites_snp_num)
    features_dict[state_id].append(sites_snp_alter_num)
    if (first_pass): table_columns.append("alter_num_snp")
    features_dict[state_id].append(norm_snp_alter_num)
    if (first_pass): table_columns.append("alter_num_snp_norm")

    #Feature: average number of poymorphisms at one site
    if (sites_aa_alter_num == 0):
        avg_poly_aa = 0
    else:
        avg_poly_aa = sites_poly_aa_num/float(sites_aa_alter_num)
    features_dict[state_id].append(avg_poly_aa)
    if (first_pass): table_columns.append("avg_aa_polymorphisms")

    #Feature: fraction of altered sites with more than 1 polymorphism
    if (sites_aa_alter_num == 0):
        frac_poly_several = 1
    else:
        frac_poly_several = sites_poly_aa_several/float(sites_aa_alter_num)
    features_dict[state_id].append(frac_poly_several)
    if (first_pass): table_columns.append("frac_poly_aa")

In [6]:
def ExAC_rareSNP_features(states_features_dict, state_id, table_columns, first_pass, sites_aa_alter_num, rare_5_num, rare_05_num, rare_005_num):
    
    #Feature: fraction of rare SNPs (0.5%, 0.05%, 0.005%)
    if (sites_aa_alter_num == 0):
        frac_rare_5 = 0
        frac_rare_05 = 0
        frac_rare_005 = 0
    else:
        frac_rare_5 = rare_5_num/float(sites_aa_alter_num)
        frac_rare_05 = rare_05_num/float(sites_aa_alter_num)
        frac_rare_005 = rare_005_num/float(sites_aa_alter_num)
        
    features_dict[state_id].append(frac_rare_5)
    if (first_pass): table_columns.append("rare_poly_0.5")
    features_dict[state_id].append(frac_rare_05)
    if (first_pass): table_columns.append("rare_poly_0.05")
    features_dict[state_id].append(frac_rare_005)
    if (first_pass): table_columns.append("rare_poly_0.005")

In [127]:
def conservation_features(states_features_dict, state_id, table_columns, first_pass, phastCons_dict, phyloP_dict):
    
    #Features: conservation scores avg for each codon position - phastCons
    features_dict[state_id].append(np.average(phastCons_dict[1]))
    if (first_pass): table_columns.append("phastCons1_avg")
    features_dict[state_id].append(np.average(phastCons_dict[2]))
    if (first_pass): table_columns.append("phastCons2_avg")
    features_dict[state_id].append(np.average(phastCons_dict[3]))
    if (first_pass): table_columns.append("phastCons3_avg")
    
    #Features: conservation scores avg for each codon position - phyloP
    features_dict[state_id].append(np.average(phyloP_dict[1]))
    if (first_pass): table_columns.append("phyloP1_avg")
    features_dict[state_id].append(np.average(phyloP_dict[2]))
    if (first_pass): table_columns.append("phyloP2_avg")
    features_dict[state_id].append(np.average(phyloP_dict[3]))
    if (first_pass): table_columns.append("phyloP3_avg")
        
    #Features: conservation scores histograms for each codon position - phastCons
    phastCons_bins = np.concatenate((np.linspace(0,0.75, 4), np.linspace(0.8,1.0, 5)), axis=0)
    phastCons1_hist = np.histogram(phastCons_dict[1], phastCons_bins)[0]
    phastCons2_hist = np.histogram(phastCons_dict[2], phastCons_bins)[0]
    phastCons3_hist = np.histogram(phastCons_dict[3], phastCons_bins)[0]
    
    features_dict[state_id].extend(phastCons1_hist)
    features_dict[state_id].extend(phastCons2_hist)
    features_dict[state_id].extend(phastCons3_hist)
    if (first_pass): 
        for i in range(len(phastCons_bins)-1):
            hist_col_title = "phastCons1_hist_"+str(phastCons_bins[i])+"-"+str(phastCons_bins[i+1])
            table_columns.append(hist_col_title)
        for i in range(len(phastCons_bins)-1):
            hist_col_title = "phastCons2_hist_"+str(phastCons_bins[i])+"-"+str(phastCons_bins[i+1])
            table_columns.append(hist_col_title)
        for i in range(len(phastCons_bins)-1):
            hist_col_title = "phastCons3_hist_"+str(phastCons_bins[i])+"-"+str(phastCons_bins[i+1])
            table_columns.append(hist_col_title)
    
    #Features: conservation scores histograms for each codon position - phyloP
    phyloP_bins = np.concatenate((np.array([-14, -1]), np.linspace(0, 3, 4), np.linspace(3.5, 6, 6)), axis=0)
    phyloP_hist1 = np.histogram(phyloP_dict[1], phyloP_bins)[0]
    phyloP_hist2 = np.histogram(phyloP_dict[2], phyloP_bins)[0]
    phyloP_hist3 = np.histogram(phyloP_dict[3], phyloP_bins)[0]
    
    features_dict[state_id].extend(phyloP_hist1)
    features_dict[state_id].extend(phyloP_hist2)
    features_dict[state_id].extend(phyloP_hist3)
    if (first_pass): 
        for i in range(len(phyloP_bins)-1):
            hist_col_title = "phyloP1_hist_"+str(phyloP_bins[i])+"-"+str(phyloP_bins[i+1])
            table_columns.append(hist_col_title)
        for i in range(len(phyloP_bins)-1):
            hist_col_title = "phyloP2_hist_"+str(phyloP_bins[i])+"-"+str(phyloP_bins[i+1])
            table_columns.append(hist_col_title)
        for i in range(len(phyloP_bins)-1):
            hist_col_title = "phyloP3_hist_"+str(phyloP_bins[i])+"-"+str(phyloP_bins[i+1])
            table_columns.append(hist_col_title)
            
    #Features: histogram of avg in each codon
    phastCons_codons_avg = []
    phyloP_codons_avg = []
    for i in range(len(phastCons_dict[1])):
        phastCons_score_avg = np.average([phyloP_dict[1][i], phyloP_dict[2][i], phyloP_dict[3][i]])
        phastCons_codons_avg.append(phastCons_score_avg)
        phyloP_score_avg = np.average([phyloP_dict[1][i], phyloP_dict[2][i], phyloP_dict[3][i]])
        phastCons_codons_avg.append(phyloP_score_avg)
        
    phastCons_codons_hist = np.histogram(phastCons_codons_avg, phastCons_bins)[0]
    phyloP_codons_hist = np.histogram(phyloP_codons_avg, phyloP_bins)[0]
    
    features_dict[state_id].extend(phastCons_codons_hist)
    features_dict[state_id].extend(phyloP_codons_hist)
    if (first_pass): 
        for i in range(len(phastCons_bins)-1):
            hist_col_title = "phastCons_codons_hist_"+str(phastCons_bins[i])+"-"+str(phastCons_bins[i+1])
            table_columns.append(hist_col_title)
        for i in range(len(phyloP_bins)-1):
            hist_col_title = "phyloP_codons_hist_"+str(phyloP_bins[i])+"-"+str(phyloP_bins[i+1])
            table_columns.append(hist_col_title)

In [7]:
def blosum62_features(states_features_dict, state_id, table_columns, first_pass, blosum62_list, weigted_blosum62_list):
    
    if (len(blosum62_list) == 0):
        blosum62_avg = 0
        weigted_blosum62_avg = 0
        blosum62_postivies = 0
        blosum62_negatives = 0
        blosum62_ratio = 1
    else:
        #Feature: BLOSUM62 average and frequency weighted-average
        blosum62_avg = sum(blosum62_list)/float(len(blosum62_list))
        weigted_blosum62_avg = sum(weigted_blosum62_list)/float(len(weigted_blosum62_list))

        #Feature: BLOSUM62 count of positives and negatives
        blosum62_postivies = sum(1 for x in blosum62_list if x > 0)
        blosum62_negatives = sum(1 for x in blosum62_list if x < 0)

        #Feature: BLOSUM62 positives/negatives ratio
        if (blosum62_postivies == 0 or blosum62_negatives == 0):
            blosum62_ratio = 0
        else:
            blosum62_ratio = blosum62_postivies/float(blosum62_negatives)

    features_dict[state_id].append(blosum62_avg)
    if (first_pass): table_columns.append("blosum_avg")
    features_dict[state_id].append(weigted_blosum62_avg)
    if (first_pass): table_columns.append("blosum_avg_weighted")
    features_dict[state_id].append(blosum62_postivies)
    if (first_pass): table_columns.append("blosum_positive_num")
    features_dict[state_id].append(blosum62_negatives)
    if (first_pass): table_columns.append("blosum_megative_num")
    features_dict[state_id].append(blosum62_ratio)
    if (first_pass): table_columns.append("blosum_ratio")

In [136]:
def SIFT_features(states_features_dict, state_id, table_columns, first_pass, sift_scores_list):
    
    if (len(sift_scores_list) > 0):
        #Feature: SIFT average
        sift_avg = np.sum(sift_scores_list)/len(sift_scores_list)

        #Feature: SIFT number of deleterious (score <=0.05)
        sift_deleterious_num = sum(1 for x in sift_scores_list if x <= SIFT_THRESHOLD)

        #Feature: SIFT number of tolerated (score > 0.05)
        sift_tolerated_num = sum(1 for x in sift_scores_list if x > SIFT_THRESHOLD)

        #Feature: deleterious/tolerated ratio
        if (sift_tolerated_num == 0 or sift_deleterious_num == 0):
            sift_ratio = 0
        else:
            sift_ratio = sift_deleterious_num/float(sift_tolerated_num)

        #Feature: SIFT "majority-decision" (deleterious/tolerated)
        if (sift_deleterious_num > sift_tolerated_num):
            sift_majority = sift_codes.SIFT_DELETERIOUS.value
        elif (sift_tolerated_num > sift_deleterious_num):
            sift_majority = sift_codes.SIFT_TOLERATED.value
        else:
            sift_majority = sift_codes.SIFT_TIE.value

    else:
        sift_avg = -1
        sift_deleterious_num = 0
        sift_tolerated_num = 0
        sift_ratio = 1
        sift_majority = sift_codes.SIFT_TIE.value

    features_dict[state_id].append(sift_avg)
    if (first_pass): table_columns.append("sift_avg")
    features_dict[state_id].append(sift_deleterious_num)
    if (first_pass): table_columns.append("sift_deleterious_num")
    features_dict[state_id].append(sift_tolerated_num)
    if (first_pass): table_columns.append("sift_tolerated_num")
    features_dict[state_id].append(sift_ratio)
    if (first_pass): table_columns.append("sift_ratio")
    features_dict[state_id].append(sift_majority)
    if (first_pass): table_columns.append("sift_majority")

In [134]:
def PolyPhen_features(states_features_dict, state_id, table_columns, first_pass, polyphen_scores_list):
    
    if (len(polyphen_scores_list) > 0):
        #Feature: PolyPhen average
        polyphen_avg = np.sum(polyphen_scores_list)/float(len(polyphen_scores_list))

        #Feature: polyPhen number of benign
        polyphen_benign_num = polyphen_pred_list.count("benign")

        #Feature: polyPhen number of possibly_damaging
        polyphen_possibly_num = polyphen_pred_list.count("possibly_damaging")

        #Feature: polyPhen number of probably_damaging
        polyphen_probably_num = polyphen_pred_list.count("probably_damaging")

        #Feature: polyPhen "majority-decision" (benign/possibly_damaging/probably_damaging/unknown)
        if ((polyphen_benign_num > polyphen_probably_num and polyphen_benign_num > polyphen_possibly_num) or 
            (polyphen_benign_num > polyphen_probably_num and polyphen_benign_num == polyphen_possibly_num)):
            polyphen_majority = polyphen_codes.POLYPHEN_BENIGN

        elif ((polyphen_probably_num > polyphen_benign_num and polyphen_probably_num > polyphen_possibly_num) or 
              (polyphen_probably_num > polyphen_benign_num and polyphen_probably_num == polyphen_possibly_num)):
            polyphen_majority = polyphen_codes.POLYPHEN_PROBABLY.value

        elif (polyphen_possibly_num > polyphen_benign_num and polyphen_possibly_num > polyphen_probably_num):
            polyphen_majority = polyphen_codes.POLYPHEN_POSSIBLY.value

        elif (polyphen_benign_num == polyphen_probably_num == polyphen_possibly_num):
            polyphen_majority = polyphen_codes.PLOYPHEN_EQUAL.value

        else:
            polyphen_majority = polyphen_codes.POLYPHEN_UNKNOWN.value

    else:
        polyphen_avg = -1
        polyphen_benign_num = 0
        polyphen_possibly_num = 0
        polyphen_probably_num = 0
        polyphen_majority = polyphen_codes.POLYPHEN_UNKNOWN.value

    features_dict[state_id].append(polyphen_avg)
    if (first_pass): table_columns.append("polyphen_avg")
    features_dict[state_id].append(polyphen_benign_num)
    if (first_pass): table_columns.append("polyphen_benign_num")
    features_dict[state_id].append(polyphen_possibly_num)
    if (first_pass): table_columns.append("polyphen_possibly_num")
    features_dict[state_id].append(polyphen_probably_num)
    if (first_pass): table_columns.append("polyphen_probably_num")
    features_dict[state_id].append(polyphen_majority)
    if (first_pass): table_columns.append("polyphen_majority")

In [10]:
def pseudo_dNdS_features(states_features_dict, state_id, table_columns, first_pass, ref_seq, Nd, Sd):
    
    (N,S) = seq_ns(ref_seq) #Refrence expected syn/nonsyn per site
    if (N == 0): 
        PN = 0
    else:
        PN = Nd/float(N) #Proportion of nonsyn
    if (S == 0):
        PS = 0
    else:
        PS = Sd/float(S) #Proportion of syn

    #num of nonsyn substitutions per syn site
    dN = -0.75 * (np.log(1-4*PN/float(3)))
    #num of syn substitutions per nonsyn site
    dS = -0.75 * (np.log(1-4*PS/float(3)))

    if (dN ==0 or dS == 0):
        dN_dS = 1 #There isn't enough information to calculate dN/dS
    else:
        dN_dS = dN/dS

    features_dict[state_id].append(dN_dS)
    if (first_pass): table_columns.append("pseudo_dNdS")

In [11]:
def pfam_emission_prob_features(states_features_dict, state_id, table_columns, first_pass, domain_name, state):
    
    #Feature: Max. emission probability
    state_max_emiss_prob = max(hmm_prob_dict[domain_name][state])
    features_dict[state_id].append(state_max_emiss_prob)
    if (first_pass): table_columns.append("pfam_prob_max")

    #Features: emission prob. for each amino acid
    for i in range(len(hmm_prob_dict[domain_name][state])):
        features_dict[state_id].append(hmm_prob_dict[domain_name][state][i])
        if (first_pass):
            prob_aa_title = "pfam_prob_"+str(pfam_aa_order[i])
            table_columns.append(prob_aa_title)

In [105]:
def pfam_conserved_state_feature(states_features_dict, state_id, table_columns, first_pass, state, con_states_dict):
    
    #Feature: is state is conserved according to Pfam?
    con_state = False
    if (state in con_states_dict.keys()):
        con_state = True
        
    features_dict[state_id].append(con_state)
    if (first_pass): table_columns.append("is_pfam_conserved")

In [13]:
def major_allele_aa_features(states_features_dict, state_id, table_columns, first_pass, aa_ref_hist):
    
    #===Features: major allele aa histogram===#
    for i in range(len(amino_acids_sym)):
        features_dict[state_id].append(aa_ref_hist[i])
        if (first_pass): table_columns.append("aa_ref_hist_"+str(amino_acids_sym[i]))

    #===Features: major allele aa prob. vector===#
    aa_ref_prob = np.asarray(aa_ref_hist)/float(np.sum(aa_ref_hist))
    for i in range(len(amino_acids_sym)):
        features_dict[state_id].append(aa_ref_prob[i])
        if (first_pass): table_columns.append("aa_ref_prob_"+str(amino_acids_sym[i]))

In [14]:
def major_allele_charge(states_features_dict, state_id, table_columns, first_pass, aa_ref_hist):
    
    #===Feature: major allele aa charge counts===#
    charge_positive_count = 0
    charge_negative_count = 0
    charge_neutral_count = 0
    for i in range(len(amino_acids_sym)):
        aa_count = aa_ref_hist[i]
        if (aa_count > 0):
            charge = aa_charge_dict[amino_acids_sym[i]]
            if (charge == 0):
                charge_neutral_count += aa_count
            elif (charge == 1):
                charge_positive_count += aa_count
            else:
                charge_negative_count += aa_count
    
    features_dict[state_id].append(charge_positive_count)
    if (first_pass): table_columns.append("aa_ref_charge_positive_count")
    features_dict[state_id].append(charge_negative_count)
    if (first_pass): table_columns.append("aa_ref_charge_negative_count")
    features_dict[state_id].append(charge_neutral_count)
    if (first_pass): table_columns.append("aa_ref_charge_neutral_count")
        
    #===Feature: major allele majority charge===#
    majority = aa_charge.NEUTRAL.value
    if (charge_positive_count > charge_neutral_count and charge_positive_count > charge_negative_count):
        charge_majority = aa_charge.POSITIVE.value
    elif (charge_negative_count > charge_neutral_count and charge_negative_count > charge_positive_count):
        charge_majority = aa_charge.NEGATIVE.value
        
    features_dict[state_id].append(charge_majority)
    if (first_pass): table_columns.append("aa_ref_charge_majority")

In [15]:
def major_allele_functional_group(states_features_dict, state_id, table_columns, first_pass, aa_ref_hist):
    
    #===Feature: major allele aa functional group counts===#
    func_counters = [0] * len(aa_functional_group)
    for i in range(len(amino_acids_sym)):
        aa_count = aa_ref_hist[i]
        if (aa_count > 0):
            func_group_num = aa_functional_group_dict[amino_acids_sym[i]].value #getting numeric functional group value
            func_counters[func_group_num] += aa_count
    
    features_dict[state_id].extend(func_counters)
    if (first_pass): 
        for group in aa_functional_group:
            func_str = "aa_ref_"+str(group)+"_count"
            table_columns.append(func_str)

In [16]:
def major_allele_hydrophobicity(states_features_dict, state_id, table_columns, first_pass, aa_ref_hist):
    
    #===Feature: major allele hydrophicity average, hydrophobic and polar counts===#
    h_avg = 0
    hydrophobic_cnt = 0
    polar_charge_cnt = 0
    for i in range(len(amino_acids_sym)):
        aa_count = aa_ref_hist[i]
        if (aa_count > 0):
            hindex = hindex_Kyte_Doolitle[amino_acids_sym[i]]
            h_avg += hindex * aa_count
            
            if (hindex > 0):
                hydrophobic_cnt += aa_count
            else:
                polar_charge_cnt += aa_count
                
    features_dict[state_id].append(h_avg)
    if (first_pass): table_columns.append("hindex_avg")
    features_dict[state_id].append(hydrophobic_cnt)
    if (first_pass): table_columns.append("hindex_pos_cnt")
    features_dict[state_id].append(polar_charge_cnt)
    if (first_pass): table_columns.append("hindex_neg_cnt")
            

In [33]:
def major_allele_propensity(states_features_dict, state_id, table_columns, first_pass, aa_ref_hist):
    
    prop_avg = [0, 0, 0]
    prop_majority_counts = [0, 0, 0]
    for i in range(len(amino_acids_sym)):
        aa_count = aa_ref_hist[i]
        if (aa_count > 0):
            curr_prop = propensity_chou_fasman[amino_acids_sym[i]]
            prop_avg = [sum(x) for x in zip(prop_avg, curr_prop)]
            
            if (curr_prop[aa_propensity.ALPHA_HELIX.value] == max(curr_prop)):
                prop_majority_counts[aa_propensity.ALPHA_HELIX.value] += 1
            if (curr_prop[aa_propensity.BETA_SHEET.value] == max(curr_prop)):
                prop_majority_counts[aa_propensity.BETA_SHEET.value] += 1
            if (curr_prop[aa_propensity.TURN.value] == max(curr_prop)):
                prop_majority_counts[aa_propensity.TURN.value] += 1
            
    #===Feature: major allele propensity avgs===#
    features_dict[state_id].extend(prop_avg)
    if (first_pass): table_columns.extend(["aa_ref_alpha_prop_avg", "aa_ref_beta_prop_avg", "aa_ref_turn_prop_avg"])
        
    #===Feature: major allele majority propensity===#
    max_idx = np.where(np.array(prop_majority_counts) == max(prop_majority_counts))[0]
    majority_vec = [0, 0, 0]
    for i in max_idx:
        majority_vec[i] = 1 #put 1 in the propensities that has max. count
    
    features_dict[state_id].extend(majority_vec)
    if (first_pass): table_columns.extend(["aa_ref_alpha_is_majority", "aa_ref_beta_is_majority", "aa_ref_turn_is_majority"])

In [18]:
def binding_scores_features(states_features_dict, state_id, table_columns, first_pass, domain_name):
    
    #Feature: Max. Binding-score
    if (state in binding_all_scores_dict[domain_name]["states"]):
        score_idx = np.where(np.array(binding_all_scores_dict[domain_name]["states"]) == state)[0]
        binding_score = max(np.array(binding_all_scores_dict[domain_name]["scores"])[score_idx])
    else:
        binding_score = 0
    features_dict[state_id].append(binding_score)
    if (first_pass): table_columns.append("binding_score")

In [None]:
#%%time
features_dict = defaultdict(list)
states_features_dict = defaultdict(list)
table_columns = []
first_pass = True

for domain_name in domains:
    
    dirfiles = !ls -t $input_path$domain_name
    filename = dirfiles[0]
    with open(input_path+domain_name+"/"+filename, 'rb') as handle:
        states_dict = pickle.load(handle)
    
    #Create af_adj flat dict
    states_af_adj_dict = defaultdict(list)
    for state in states_dict.keys():
        for d in states_dict[state]:
            states_af_adj_dict[state].append(d["af_adj"])
        
    #scale the af_dict
    states_MAF_adj_dict_scaled = defaultdict(list)
    for state in states_dict.keys():
        state_len = len(states_dict[state])
        for d in states_dict[state]:
            states_MAF_adj_dict_scaled[state].append(float(d["af_adj"]/state_len))
    
    #Create a dict of conserved states
    con_states_dict = {}
    con_threshold = 0.5
    for state in hmm_prob_dict[domain_name].keys():
        prob_list = hmm_prob_dict[domain_name][state]
        for i in range(len(prob_list)):
            p = prob_list[i]
            if (p > con_threshold):
                major_allele = pfam_aa_order[i]
                con_states_dict[state] = major_allele
    
    #Adding states features
    for state in states_dict.keys():
        
        state_id = domain_name+"_"+str(state)
        
        #Init counters & paramters
        maf_list = []
        sites_aa_alter_num = 0
        sites_snp_alter_num = 0
        sites_aa_num = len(states_dict[state])
        sites_snp_num = 3*sites_aa_num
        sites_poly_aa_num = 0 #The number of different aa in all the altered sites (most are 1)
        sites_poly_aa_several = 0
        
        #Rare-poly-counters
        rare_5_num = 0
        rare_05_num = 0
        rare_005_num = 0
        
        #Conservation params
        phastCons_dict = defaultdict(list)
        phyloP_dict = defaultdict(list)
        
        #BLOSUM62_vals
        blosum62_list = []
        weigted_blosum62_list = []
        
        #dn/ds counters and variables
        ref_seq = ""
        Nd = 0
        Sd = 0
        
        #SIFT params
        sift_scores_list = []
        
        #PolyPhen params
        polyphen_scores_list = []
        polyphen_pred_list = []
        
        #Major allele params
        aa_ref_hist = [0] *len(amino_acids_sym)
        aa_ref_hist_last_idx = 20
        
        #Iterating the state dict to get properties
        for d in states_dict[state]:
            
            #Creating a position pseudo-ref sequence
            ref_codon = d["bp_ref"]
            ref_seq = ref_seq+ref_codon
            
            #Calculating frequency-based N/S
            bp_af_adj_dict = d["bp_af_adj_dict"]
            for alt_codon in bp_af_adj_dict.keys():
                alt_aa = codon_table[alt_codon]
                #syn
                if (alt_aa == d["aa_ref"]):
                    Sd += bp_af_adj_dict[alt_codon]
                #Non-syn
                else:
                    Nd += bp_af_adj_dict[alt_codon]
            
            #Major allele parameters
            aa_ref = d["aa_ref"]
            try:
                aa_ref_pos = pfam_aa_order.index(aa_ref)
            except:
                aa_ref_pos = aa_ref_hist_last_idx #ref is a stop codon
            aa_ref_hist[aa_ref_pos] += 1
            
            #Conservation scores
            phastCons_curr_list = d["phastCons"]
            phastCons_dict[1].append(phastCons_curr_list[0])
            phastCons_dict[2].append(phastCons_curr_list[1])
            phastCons_dict[3].append(phastCons_curr_list[2])
            
            phyloP_curr_list = d["phyloP"]
            phyloP_dict[1].append(phyloP_curr_list[0])
            phyloP_dict[2].append(phyloP_curr_list[1])
            phyloP_dict[3].append(phyloP_curr_list[2])
            
            if (d["af_adj"] > 0):
                sites_aa_alter_num += 1
                sites_snp_alter_num += len(d["an_adj"])
                maf_list.append(d["af_adj"])
                
                #Number of different polymorphisms at this site
                site_poly_num = len(d["alterations_af_adj_dict"].keys())
                sites_poly_aa_num += site_poly_num
                if (site_poly_num > 1):
                    sites_poly_aa_several += 1
                
                #Rare poly features
                if (d["af_adj"] < MAFT_005):
                    rare_005_num += 1
                    rare_05_num += 1
                    rare_5_num += 1
                elif (d["af_adj"] < MAFT_05):
                    rare_05_num += 1
                    rare_5_num += 1
                elif (d["af_adj"] < MAFT_5):
                    rare_5_num += 1
                
                #BLOSUM62 features
                ref = d["aa_ref"]
                for alt in d["alterations_af_adj_dict"].keys():
                    blosum_val = blosum62_dict[ref][alt]
                    af_adj = np.mean(d["alterations_af_adj_dict"][alt])
                    blosum62_list.append(blosum_val)
                    weigted_blosum62_list.append(blosum_val*af_adj)
                    
                #SIFT
                sift_list = d["SIFT"]
                for s in sift_list:
                    if (s != ""):
                        sift_scores_list.append(float(s[s.find("(")+1:s.find(")")]))
                
                #PolyPhen
                polyphen_list = d["PolyPhen"]      
                for s in polyphen_list:
                    if (s != ""):
                        polyphen_scores_list.append(float(s[s.find("(")+1:s.find(")")]))
                        polyphen_pred_list.append(s[:s.find("(")])
        
        #===domain_regular_features===#
        features_dict[state_id].append(domain_name)
        if (first_pass): table_columns.append("domain_name")
        
        #===ExAC MAF Features===#
        ExAC_MAF_features(states_features_dict, state_id, table_columns, first_pass, sites_aa_num, sites_aa_alter_num, maf_list)
        
        ExAC_count_features(states_features_dict, state_id, table_columns, first_pass, sites_aa_num, sites_aa_alter_num, sites_snp_num, sites_snp_alter_num)
        
        ExAC_rareSNP_features(states_features_dict, state_id, table_columns, first_pass, sites_aa_alter_num, rare_5_num, rare_05_num, rare_005_num)
        
        #===Conservation scores features===#
        conservation_features(states_features_dict, state_id, table_columns, first_pass, phastCons_dict, phyloP_dict)
        
        #===BLOSUM62 Features===#
        blosum62_features(states_features_dict, state_id, table_columns, first_pass, blosum62_list, weigted_blosum62_list)

        #===pseudo-sequence dN/dS feature===#        
        pseudo_dNdS_features(states_features_dict, state_id, table_columns, first_pass, ref_seq, Nd, Sd)
        
        #===Pfam HMM-emission probabilities features===#
        pfam_emission_prob_features(states_features_dict, state_id, table_columns, first_pass, domain_name, state)
        
        pfam_conserved_state_feature(states_features_dict, state_id, table_columns, first_pass, state, con_states_dict)
        
        #===SIFT score features===#
        SIFT_features(states_features_dict, state_id, table_columns, first_pass, sift_scores_list)
        
        #===Polyphen score features===#
        PolyPhen_features(states_features_dict, state_id, table_columns, first_pass, polyphen_scores_list)
        
        #===Major allele aa chemical features===#
        major_allele_aa_features(states_features_dict, state_id, table_columns, first_pass, aa_ref_hist)
        
        major_allele_charge(states_features_dict, state_id, table_columns, first_pass, aa_ref_hist)
        
        major_allele_functional_group(states_features_dict, state_id, table_columns, first_pass, aa_ref_hist)
        
        major_allele_hydrophobicity(states_features_dict, state_id, table_columns, first_pass, aa_ref_hist)
        
        major_allele_propensity(states_features_dict, state_id, table_columns, first_pass, aa_ref_hist)
        
        #secondary structure prediction
        
        #Solven accessibility prediction
        
        #===Substitution features===#
        
        
        #===Binding-score features===#
        binding_scores_features(states_features_dict, state_id, table_columns, first_pass, domain_name)
        
        first_pass = False
        
        
    print "Finished "+domain_name

Finished 2OG-FeII_Oxy_3


In [132]:
#Exporting to data-frames table
domains_features_df = pd.DataFrame.from_dict(features_dict,orient='index')
domains_features_df.columns = table_columns
domains_features_df = domains_features_df.sort_index()
#Save to file
domains_features_df.to_csv(curr_dir[0]+"/binding_df/"+domains_th+"/positions_features.csv", sep='\t')
domains_features_df

Unnamed: 0,domain_name,avg_maf_all,avg_maf_altered,maf_hist_0.0-0.02,maf_hist_0.02-0.04,maf_hist_0.04-0.06,maf_hist_0.06-0.08,maf_hist_0.08-0.1,maf_hist_0.1-0.12,maf_hist_0.12-0.14,...,hindex_avg,hindex_pos_cnt,hindex_neg_cnt,aa_ref_alpha_prop_avg,aa_ref_beta_prop_avg,aa_ref_turn_prop_avg,aa_ref_alpha_is_majority,aa_ref_beta_is_majority,aa_ref_turn_is_majority,binding_score
2OG-FeII_Oxy_3_1,2OG-FeII_Oxy_3,5.284545e-06,0.000029,2,0,0,0,0,0,0,...,25.6,9,2,6.54,6.82,4.87,0,1,0,0.000000
2OG-FeII_Oxy_3_10,2OG-FeII_Oxy_3,3.018000e-06,0.000011,3,0,0,0,0,0,0,...,-5.6,2,9,6.13,5.02,6.26,1,1,1,0.000000
2OG-FeII_Oxy_3_11,2OG-FeII_Oxy_3,1.055370e-05,0.000026,4,0,0,0,0,0,0,...,-11.8,2,8,4.87,4.31,5.58,1,0,0,0.000000
2OG-FeII_Oxy_3_12,2OG-FeII_Oxy_3,4.429500e-06,0.000022,2,0,0,0,0,0,0,...,-13.0,1,9,3.88,4.57,3.63,1,1,0,0.000000
2OG-FeII_Oxy_3_13,2OG-FeII_Oxy_3,1.454900e-05,0.000040,4,0,0,0,0,0,0,...,-1.2,4,7,6.18,5.98,5.26,0,1,0,0.033175
2OG-FeII_Oxy_3_14,2OG-FeII_Oxy_3,1.721000e-05,0.000038,5,0,0,0,0,0,0,...,-3.5,5,6,5.81,6.80,5.29,0,1,0,0.000000
2OG-FeII_Oxy_3_15,2OG-FeII_Oxy_3,1.864264e-05,0.000051,4,0,0,0,0,0,0,...,-35.2,0,11,1.00,0.87,0.95,1,0,0,0.519703
2OG-FeII_Oxy_3_16,2OG-FeII_Oxy_3,5.254727e-06,0.000014,4,0,0,0,0,0,0,...,24.2,9,2,4.62,4.45,3.22,0,1,0,0.000000
2OG-FeII_Oxy_3_17,2OG-FeII_Oxy_3,7.488182e-07,0.000008,1,0,0,0,0,0,0,...,-38.5,0,11,2.52,0.91,2.20,1,0,1,0.519703
2OG-FeII_Oxy_3_18,2OG-FeII_Oxy_3,5.268091e-06,0.000012,5,0,0,0,0,0,0,...,-20.7,2,9,5.77,5.73,6.42,1,1,1,0.000000


In [133]:
table_columns

['domain_name',
 'avg_maf_all',
 'avg_maf_altered',
 'maf_hist_0.0-0.02',
 'maf_hist_0.02-0.04',
 'maf_hist_0.04-0.06',
 'maf_hist_0.06-0.08',
 'maf_hist_0.08-0.1',
 'maf_hist_0.1-0.12',
 'maf_hist_0.12-0.14',
 'maf_hist_0.14-0.16',
 'maf_hist_0.16-0.18',
 'maf_hist_0.18-0.2',
 'maf_hist_0.2-0.3',
 'maf_hist_0.3-0.4',
 'maf_hist_0.4-0.5',
 'alter_num_aa',
 'alter_num_aa_norm',
 'alter_num_snp',
 'alter_num_snp_norm',
 'avg_aa_polymorphisms',
 'frac_poly_aa',
 'rare_poly_0.5',
 'rare_poly_0.05',
 'rare_poly_0.005',
 'phastCons1_avg',
 'phastCons2_avg',
 'phastCons3_avg',
 'phyloP1_avg',
 'phyloP2_avg',
 'phyloP3_avg',
 'phastCons1_hist_0.0-0.25',
 'phastCons1_hist_0.25-0.5',
 'phastCons1_hist_0.5-0.75',
 'phastCons1_hist_0.75-0.8',
 'phastCons1_hist_0.8-0.85',
 'phastCons1_hist_0.85-0.9',
 'phastCons1_hist_0.9-0.95',
 'phastCons1_hist_0.95-1.0',
 'phastCons2_hist_0.0-0.25',
 'phastCons2_hist_0.25-0.5',
 'phastCons2_hist_0.5-0.75',
 'phastCons2_hist_0.75-0.8',
 'phastCons2_hist_0.8-0.85'

### Old lableling

#### Instances >= 100:
Structural = 713, 
Binding=909, 
together=1622, 
Neutral=5243, 

#### Instances >=50:
Structural=1142, 
Binding=1376, 
Neutral=9479, 

#### Instances >= 10:
Strctural=4531
Binding=4978
Neutral=44692

### New labeling
#### Instances >=50:
Structural=860,
Binding=1658, 
Neutral=9479, 


### Ceate a table for 3 classes PCA

In [10]:
#Choose random neutral states in the number of min(binding, structural)
rand_idx = np.random.choice(44692, 4531, replace=False)
Neutral_states = domains_features_df[domains_features_df["state_type"] == "Neutral"]
Neutral_states = Neutral_states.reset_index(drop=True)
rand_Neutral_states = Neutral_states.iloc[rand_idx]

In [11]:
#Choose random binding states in the number of min(binding, structural)
rand_idx = np.random.choice(4978, 4531, replace=False)
Binding_states = domains_features_df[domains_features_df["state_type"] == "Binding"]
Binding_states = Binding_states.reset_index(drop=True)
rand_Binding_states = Binding_states.iloc[rand_idx]

In [12]:
structural_states = domains_features_df[domains_features_df["state_type"] == "Structural"]

#Concate everything together
rand_domains_features_df = pd.concat([rand_Neutral_states, rand_Binding_states, structural_states], ignore_index=True)
#randomize the rows
rand_domains_features_df = rand_domains_features_df.reindex(np.random.permutation(rand_domains_features_df.index))
rand_domains_features_df = rand_domains_features_df.reset_index(drop=True)
rand_domains_features_df.to_csv(curr_dir[0]+"/rand_binding_10instances_states_features_df.csv", sep='\t')

rand_domains_features_df

### Create a table for 2 classes PCA

In [7]:
#Choose random neutral states in the number of min(binding, structural)
rand_idx = np.random.choice(5243, 1622, replace=False)
Neutral_states = domains_features_df[domains_features_df["state_type"] == "Neutral"]
Neutral_states = Neutral_states.reset_index(drop=True)
rand_Neutral_states = Neutral_states.iloc[rand_idx]

In [8]:
structural_binding_states = domains_features_df[domains_features_df["state_type"] == "Binding/Structural"]

#Concate everything together
rand_domains_features_df = pd.concat([rand_Neutral_states, structural_binding_states], ignore_index=True)
rand_domains_features_df = rand_domains_features_df.reindex(np.random.permutation(rand_domains_features_df.index))
rand_domains_features_df = rand_domains_features_df.reset_index(drop=True)
rand_domains_features_df.to_csv(curr_dir[0]+"/rand_binding_100instances_states_features_df_2classes_2.csv", sep='\t')

In [46]:
def get_train_test_inds(y ,train_proportion=0.7):
    '''Generates indices, making random stratified split into training set and testing sets
    with proportions train_proportion and (1-train_proportion) of initial sample.
    y is any iterable indicating classes of each observation in the sample.
    Initial proportions of classes inside training and 
    testing sets are preserved (stratified sampling).
    '''

    y=np.array(y)
    train_inds = np.zeros(len(y),dtype=bool)
    test_inds = np.zeros(len(y),dtype=bool)
    values = np.unique(y)
    for value in values:
        value_inds = np.nonzero(y==value)[0]
        np.random.shuffle(value_inds)
        n = int(train_proportion*len(value_inds))

        train_inds[value_inds[:n]]=True
        test_inds[value_inds[n:]]=True

    return train_inds,test_inds

In [50]:
train_inds,test_inds = get_train_test_inds(rand_domains_features_df["state_type"], train_proportion=0.8)

In [53]:
train_set = rand_domains_features_df[train_inds]
test_set = rand_domains_features_df[test_inds]
train_set.to_csv(curr_dir[0]+"/rand_binding_100_train_df.csv", sep='\t')
test_set.to_csv(curr_dir[0]+"/rand_binding_100_test_df.csv", sep='\t')