In [1]:
import pandas as pd
import numpy as np
import pickle
import datetime
from collections import defaultdict
from IPython.core.display import HTML
#Inject a CSS rule that forces elements with class "container" to 100% width
HTML("<style>.container { width:100% !important; }</style>")

In [6]:
#Getting path
curr_dir = !pwd

#Reading the list of filtered domains
with open(curr_dir[0]+"/../5.domains_stats/filtered100_list.pik", 'rb') as handle:
    filtered_domains_list = pickle.load(handle)
filtered_domains_list.sort()

#Reading the table of all domains stats
filtered_domains_df = pd.read_csv(curr_dir[0]+"/../5.domains_stats/filtered50_domains_df.csv", sep='\t', index_col=0)

In [7]:
#Sanity check: how many domains are in the filtered list
len(filtered_domains_list)

93

In [8]:
#Calculate domains features
input_path = curr_dir[0]+"/../5.HMM_alter_align/domains_states_dicts/pfam-v30/"
features_dict = defaultdict(list)

for domain_name in filtered_domains_list:
    
    #Reading the domain states dictionary
    domain_dirfiles = !ls -t $input_path$domain_name
    recent_filename = domain_dirfiles[0]
    with open(input_path+domain_name+"/"+recent_filename, 'rb') as handle:
        states_dict = pickle.load(handle)
    
    #Initializing feature counters
    maf_sum = 0
    sites_aa_num = 0
    sites_aa_alter_num = 0
    sites_snp_alter_num = 0
    sites_poly_aa_num = 0 #The number of different aa in all the altered sites (most are 1)
    sites_poly_aa_several = 0
    
    #Rare-poly-counters
    maft_5 =  0.005
    maft_05 = 0.0005
    maft_005 = 0.00005
    rare_5_num = 0
    rare_05_num = 0
    rare_005_num = 0
    
    for state in states_dict:
        sites_aa_num += len(states_dict[state])
        for d in states_dict[state]:
            if (d["af_adj"] > 0):
                sites_aa_alter_num += 1
                sites_snp_alter_num += len(d["filter"])
                maf_sum += d["af_adj"]
                
                #Number of different polymorphisms at this site
                site_poly_num = len(d["alterations_af_adj_dict"].keys())
                sites_poly_aa_num += site_poly_num
                if (site_poly_num > 1):
                    sites_poly_aa_several += 1
                
                #Rare poly features
                if (d["af_adj"] <= maft_005):
                    rare_005_num += 1
                    rare_05_num += 1
                    rare_5_num += 1
                elif (d["af_adj"] <= maft_05):
                    rare_05_num += 1
                    rare_5_num += 1
                elif (d["af_adj"] <= maft_5):
                    rare_5_num += 1
    
    #Feature: domain length
    domain_len = len(states_dict.keys())
    features_dict[domain_name].append(domain_len)
    
    #Feature: average MAF overall aa sites
    avg_maf_overall = maf_sum/float(sites_aa_num)
    features_dict[domain_name].append(avg_maf_overall)
    
    #Feature: average MAF of all the altered sites
    avg_maf_only_altered = maf_sum/float(sites_aa_alter_num)
    features_dict[domain_name].append(avg_maf_only_altered)
    
    #Feature: number of alterations - aa level (raw and normalized by domain length)
    norm_aa_alter_num = sites_aa_alter_num/float(domain_len)
    features_dict[domain_name].append(sites_aa_alter_num)
    features_dict[domain_name].append(norm_aa_alter_num)
    
    #Feature: number of alterations - DNA level (raw and normalized by domain length)
    norm_snp_alter_num = sites_snp_alter_num/float(domain_len)
    features_dict[domain_name].append(sites_snp_alter_num)
    features_dict[domain_name].append(norm_snp_alter_num)
    
    #Feature: fraction of aa alterations (fraction of non-zero alterations)
    frac_alter_aa = sites_aa_alter_num/float(sites_aa_num)
    features_dict[domain_name].append(frac_alter_aa)
    
    #Feature: average number of poymorphisms at one site
    avg_poly_aa = sites_poly_aa_num/float(sites_aa_alter_num)
    features_dict[domain_name].append(avg_poly_aa)
    
    #Feature: fraction of altered sites with more than 1 polymorphism
    frac_poly_several = sites_poly_aa_several/float(sites_aa_alter_num)
    features_dict[domain_name].append(frac_poly_several)
    
    #Feature: fraction of rare SNPs (0.5%)
    frac_rare_5 = rare_5_num/float(sites_aa_alter_num)
    features_dict[domain_name].append(frac_rare_5)
    
    #Feature: fraction of rare SNPs (0.05%)
    frac_rare_05 = rare_05_num/float(sites_aa_alter_num)
    features_dict[domain_name].append(frac_rare_05)
    
    #Feature: fraction of rare SNPs (0.005%)
    frac_rare_005 = rare_005_num/float(sites_aa_alter_num)
    features_dict[domain_name].append(frac_rare_005)
    
    print "Finished domain "+domain_name

Finished domain 7TM_GPCR_Srsx
Finished domain 7tm_1
Finished domain 7tm_4
Finished domain ABC_tran
Finished domain Ank
Finished domain Ank_2
Finished domain Ank_3
Finished domain Ank_4
Finished domain Ank_5
Finished domain Arf
Finished domain Arm
Finished domain BTB
Finished domain C1-set
Finished domain C2
Finished domain C2-set_2
Finished domain CH
Finished domain CUB
Finished domain Cadherin
Finished domain Collagen
Finished domain DUF1220
Finished domain EF-hand_1
Finished domain EF-hand_7
Finished domain EGF
Finished domain EGF_2
Finished domain EGF_CA
Finished domain FXa_inhibition
Finished domain HLH
Finished domain Helicase_C
Finished domain Homeobox
Finished domain I-set
Finished domain IQ
Finished domain Ig_2
Finished domain Ig_3
Finished domain Ion_trans
Finished domain KRAB
Finished domain Kelch_1
Finished domain Kelch_2
Finished domain Kelch_3
Finished domain Kelch_6
Finished domain Keratin_B2_2
Finished domain LIM
Finished domain LRR_4
Finished domain LRR_8
Finished domai

In [9]:
#Column names, listed in the same order the features were appended for each domain
feature_columns = ["length", "avg_maf_all", "avg_maf_altered", "alter_num_aa", "alter_num_aa_norm",
                   "alter_num_dna", "alter_num_dna_norm", "frac_alter", "avg_poly", "frac_poly_several",
                   "rare_poly_0.5%", "rare_poly_0.05%", "rare_poly_0.005%"]

#Build the table with one row per domain, name its columns and order the rows by index
domains_features_df = pd.DataFrame.from_dict(features_dict, orient='index')
domains_features_df.columns = feature_columns
domains_features_df = domains_features_df.sort_index()

#Bring in the "genes" and "instances" columns of the domains stats table (aligned on the index)
domains_features_df["num_genes"] = filtered_domains_df["genes"]
domains_features_df["num_instances"] = filtered_domains_df["instances"]

#log2 of the number of genes, computed on the whole column at once
domains_features_df["num_genes_log2"] = np.log2(domains_features_df["num_genes"])

#Write the table to a tab-separated file in the working directory
output_path = curr_dir[0]+"/domains_features_df_filtered100.csv"
domains_features_df.to_csv(output_path, sep='\t')