## Domain Level Features

### Requirements
1. Dictionary files for each domain
2. List of domains
3. Domain stats dataframe
4. BLOSUM62 dictionary
5. PAM40 dictionary
6. Pfam emission probabilities dictionary
7. Fixed threshold clustering csv
8. Percentile clustering csv
9. Flexibility dictionary
10. Uniprot annotations dictionary
11. Secondary structure domain dictionaries

### Instructions
Run cells in order.

### Output
A tab-separated file (saved with a `.csv` extension) with one row per domain, labelled by domain name, and one column per feature.

In [1]:
import pandas as pd
import numpy as np
#cPickle exists only in Python 2, so this notebook targets a Python 2 kernel
import cPickle as pickle
import datetime
import sys
#IPython shell capture: curr_dir[0] is the directory the notebook is run from
curr_dir = !pwd
#Make ../5.HMM_alter_align importable (presumably where calc_exac_freq_func lives - verify)
sys.path.append(curr_dir[0] + "/../5.HMM_alter_align")
from calc_exac_freq_func import codon_table
import features_func as ffunc
import utils
import aa_chemical_properties as aa
from collections import defaultdict
from IPython.core.display import HTML
#Widen the notebook container to the full browser width
HTML("<style>.container { width:100% !important; }</style>")

In [2]:
#Getting path
curr_dir = !pwd
instance_cutoff = "50"

#Reading the list of filtered domains
with open(curr_dir[0]+"/../5.domains_stats/filtered"+instance_cutoff+"_list.pik", 'rb') as handle:
    filtered_domains_list = pickle.load(handle)
filtered_domains_list.sort()

#Reading the table of all domains stats
filtered_domains_df = pd.read_csv(curr_dir[0]+"/../5.domains_stats/filtered"+instance_cutoff+"_domains_df.csv", sep='\t', index_col=0)

#Read the substitutions table (for the dN/dS calculation)
with open(curr_dir[0]+"/codon_ns_table.pik", 'rb') as handle:
    codon_ns_table = pickle.load(handle)

#Reading the BLOSUM62 dict
with open(curr_dir[0]+"/../BLOSUM62/BLOSUM62_dict.pik", 'rb') as handle:
    blosum62_dict = pickle.load(handle)

#Reading the PAM40 dict
with open(curr_dir[0]+"/../PAM40/PAM40_dict.pik", 'rb') as handle:
    pam40_dict = pickle.load(handle)
    
#Reading the HMM dict
with open(curr_dir[0]+"/../2.parse_Pfam/v30/domains_hmm_prob_dict.pik", 'rb') as handle:
    hmm_prob_dict = pickle.load(handle)
    
#Reading clustering files
clustering_thresh = pd.read_csv(curr_dir[0]+"/clustering/clustering_fixedThresh.csv", sep=',', index_col=0)
clustering_percentile = pd.read_csv(curr_dir[0]+"/clustering/clustering_percentile.csv", sep=',', index_col=0)

#Reading flexibility scores
with open(curr_dir[0]+"/../flexibility/flex_dict.pik", 'rb') as handle:
    flex_dict = pickle.load(handle)
    
#Reading uniprot annotations dictionary
with open(curr_dir[0]+"/uniprot_annotations/uniprot_annotations_dict.pik", 'rb') as handle:
    uniprot_annotations_dict = pickle.load(handle)

#### Calculate domain features

In [3]:
input_path = curr_dir[0]+"/../5.HMM_alter_align/domains_states_dicts/pfam-v30/"
features_dict = defaultdict(list)

for domain_name in filtered_domains_list:
    col_names = []
    
    #Reading the domain states dictionary
    domain_dirfiles = !ls -t $input_path$domain_name
    recent_filename = utils.find_recent(domain_dirfiles)
    with open(input_path+domain_name+"/"+recent_filename, 'rb') as handle:
        states_dict = pickle.load(handle)
        
    #Initializing feature counters
    maf_list = []
    maf_sum = 0
    sites_aa_num = 0
    sites_aa_alter_num = 0
    sites_snp_alter_num = 0
    sites_poly_aa_num = 0 #The number of different aa in all the altered sites (most are 1)
    sites_poly_aa_several = 0
    
    #Rare-poly-counters
    maft_5 =  0.005
    maft_05 = 0.0005
    maft_005 = 0.00005
    rare_5_num = 0
    rare_05_num = 0
    rare_005_num = 0
    
    #BLOSUM62_vals
    blosum62_list = []
    weighted_blosum62_list = []
    
    #PAM40 vals
    pam40_list = []
    weighted_pam40_list = []
    
    #SIFT counters
    sift_list = []
    
    #PolyPhen counters
    polyphen_list = []
    
    #dn/ds counters and variables
    ref_seq = ""
    #ref_af_list = []
    Nd = 0
    Sd = 0
    
    #Entropy
    nonsyn_by_pos = defaultdict(list)
    nonsyn_by_gene = defaultdict(list)
    
    #Conservation scores
    phyloP_cutoff = 1.31
    phastCons_con_cutoff = 0.95
    phastCons_noncon_cutoff = 0.05
    phyloP_list = []
    phastCons_list = []
    
    #Nonsyn mutations in conserved aa positions
    num_con_nonsyn = 0
    
    #Very common mutations
    common_cutoff = 0.1
    num_common = 0
    
    #AA properties
    aa_freq_counts = utils.zeroes_dict(['*','A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y'])
    charge_counts = utils.zeroes_dict(['NEUTRAL','POSITIVE','NEGATIVE'])
    func_group_counts = utils.zeroes_dict(['ALIPHATIC','AROMATIC','NEGATIVE','POSITIVE','POLAR','STOP'])
    volume_sum = 0
    hydro_list = []
    
    #pfam conserved sites
    num_conserved = 0
    sum_max = 0
    prob_entropy = []
    total_num = len(hmm_prob_dict[domain_name].keys())
    con_threshold = 0.8
    for state in hmm_prob_dict[domain_name].keys():
        prob_list = hmm_prob_dict[domain_name][state]
        prob_entropy.append(utils.entropy(prob_list))
        sum_max += max(prob_list)
        if (max(prob_list) > con_threshold):
            num_conserved += 1
    
    #Flexibility
    flex_list = []
    for gene in flex_dict[domain_name].keys():
        flex_list += flex_dict[domain_name][gene]
        
    #SPIDER2 Secondary Structure
    hsa2_cn = []
    hsa2_hsed = []
    hsa2_hseu = []
    hsb2_cn = []
    hsb2_hsed = []
    hsb2_hseu = []
    asa = []
    ss = []
    
    #Loading dict for this domain
    with open(curr_dir[0]+'/../SPIDER/dicts/'+domain_name+'_secondary_struct_dict.pik', 'rb') as handle:
        secondary_struct_dict = pickle.load(handle)
        
    #Extracting features
    for gene in secondary_struct_dict:
        for pos in secondary_struct_dict[gene]:
            ent = secondary_struct_dict[gene][pos]
            hsa2_cn.append(float(ent['hsa2_CN']))
            hsa2_hsed.append(float(ent['hsa2_HSEd']))
            hsa2_hseu.append(float(ent['hsa2_HSEu']))
            hsb2_cn.append(float(ent['hsb2_CN']))
            hsb2_hsed.append(float(ent['hsb2_HSEd']))
            hsb2_hseu.append(float(ent['hsb2_HSEu']))
            asa.append(float(ent['spd3_ASA']))
            ss.append(ent['spd3_SS'])
            
    #Uniprot annotations
    has_disulfide = 0
    has_disulfide_interchain = 0
    has_binding = 0
    has_calcium = 0
    has_metal = 0
    has_dna = 0
    has_nucleotide = 0
    has_crosslink = 0
    
    for gene in uniprot_annotations_dict[domain_name]:
        for pos in uniprot_annotations_dict[domain_name][gene]:
            ent = uniprot_annotations_dict[domain_name][gene][pos]
            if 'disulfide_t' in ent:
                has_disulfide = 1
                has_disulfide_interchain = 1
            if 'disulfide_f' in ent:
                has_disulfide = 1
            if 'binding' in ent:
                has_binding = 1
            if 'calcium' in ent:
                has_calcium = 1
            if 'metal' in ent:
                has_metal = 1
            if 'dna' in ent:
                has_dna = 1
            if 'nucleotide' in ent:
                has_nucleotide = 1
            if 'crosslink' in ent:
                has_crosslink = 1
            
    
    for state in states_dict:
        sites_aa_num += len(states_dict[state])
        for d in states_dict[state]:
            #Creating a position pseudo-ref sequence
            ref_codon = d["bp_ref"]
            ref_seq += ref_codon
            
            #Calculating frequency-based N/S
            bp_af_adj_dict = d["bp_af_adj_dict"]
            for alt_codon in bp_af_adj_dict.keys():
                alt_aa = codon_table[alt_codon]
                #syn
                if (alt_aa == d["aa_ref"]):
                    Sd += bp_af_adj_dict[alt_codon]
                #Non-syn
                else:
                    Nd += bp_af_adj_dict[alt_codon]
                    
            #Conservation
            for score in d["phyloP"]:
                phyloP_list.append(score)
            for score in d["phastCons"]:
                phastCons_list.append(score)
            
            #AA properties
            aa_freq_counts[d["aa_ref"]] += 1
            charge_counts[aa.aa_charge_dict[d["aa_ref"]].name] += 1
            func_group_counts[aa.aa_functional_group_dict[d["aa_ref"]].name] += 1   
            volume_sum += aa.volume[d["aa_ref"]]
            hydro_list.append(aa.hindex_Kyte_Doolitle[d["aa_ref"]])
            
            if (d["af_adj"] > 0):
                sites_aa_alter_num += 1
                sites_snp_alter_num += len(d["an_adj"])
                maf_list.append(d["af_adj"])
                maf_sum += d["af_adj"]
                
                #Number of different polymorphisms at this site
                site_poly_num = len(d["alterations_af_adj_dict"].keys())
                sites_poly_aa_num += site_poly_num
                if (site_poly_num > 1):
                    sites_poly_aa_several += 1
                
                #Rare poly features
                if (d["af_adj"] < maft_005):
                    rare_005_num += 1
                    rare_05_num += 1
                    rare_5_num += 1
                elif (d["af_adj"] < maft_05):
                    rare_05_num += 1
                    rare_5_num += 1
                elif (d["af_adj"] < maft_5):
                    rare_5_num += 1
                
                #Common SNP
                if(d["af_adj"] > common_cutoff):
                    num_common += 1
                    
                #BLOSUM62 features
                ref = d["aa_ref"]
                for alt in d["alterations_af_adj_dict"].keys():
                    blosum_val = blosum62_dict[ref][alt]
                    af_adj = np.mean(d["alterations_af_adj_dict"][alt])
                    blosum62_list.append(blosum_val)
                    weighted_blosum62_list.append(blosum_val*af_adj)
                
                #PAM40 features
                for alt in d["alterations_af_adj_dict"].keys():
                    pam_val = pam40_dict[ref][alt]
                    af_adj = np.mean(d["alterations_af_adj_dict"][alt])
                    pam40_list.append(pam_val)
                    weighted_pam40_list.append(pam_val*af_adj)
                
                #SIFT
                sift_vals = d["SIFT"]
                for s in sift_vals:
                    if (s != ""):
                        sift_list.append(float(s[s.find("(")+1:s.find(")")]))
                        
                #PolyPhen
                polyphen_vals = d["PolyPhen"]      
                for s in polyphen_vals:
                    if (s != ""):
                        polyphen_list.append(float(s[s.find("(")+1:s.find(")")]))
                
                #Entropy
                nonsyn_by_pos[state].append(d["af_adj"])
                nonsyn_by_gene[d['ens_gene']].append(d["af_adj"])

                #Nonsyn conserved mutations
                if len(d["phyloP"]) == 3 and np.mean(d["phyloP"]) > phyloP_cutoff:
                    num_con_nonsyn += 1
    
    #Feature: domain length
    domain_len = len(states_dict.keys())
    features_dict[domain_name].append(domain_len)
    col_names.append('length')
    
    #Feature: average MAF overall aa sites
    avg_maf_overall = maf_sum/float(sites_aa_num)
    features_dict[domain_name].append(avg_maf_overall)
    col_names.append('avg_maf')
    
    #Feature: average MAF of all the altered sites
    avg_maf_only_altered = maf_sum/float(sites_aa_alter_num)
    features_dict[domain_name].append(avg_maf_only_altered)
    col_names.append('avg_maf_nonsyn')
    
    #Feature: number of alterations - aa level (raw and normalized by domain length)
    norm_aa_alter_num = sites_aa_alter_num/float(domain_len)
    features_dict[domain_name].append(sites_aa_alter_num)
    col_names.append('sites_aa_alter_num')
    features_dict[domain_name].append(norm_aa_alter_num)
    col_names.append('norm_aa_alter_num')
    
    #Feature: number of alterations - DNA level (raw and normalized by domain length)
    norm_snp_alter_num = sites_snp_alter_num/float(domain_len)
    features_dict[domain_name].append(sites_snp_alter_num)
    col_names.append('sites_snp_alter_num')
    features_dict[domain_name].append(norm_snp_alter_num)
    col_names.append('norm_snp_alter_num')
    
    #Feature: fraction of aa alterations (fraction of non-zero alterations)
    frac_alter_aa = sites_aa_alter_num/float(sites_aa_num)
    features_dict[domain_name].append(frac_alter_aa)
    col_names.append('frac_alter_aa')
    
    #Feature: Fraction of DNA sites altered
    frac_snp_alter = float(sites_snp_alter_num) / (3*sites_aa_num)
    features_dict[domain_name].append(frac_snp_alter)
    col_names.append('frac_alter_dna')
    
    #Feature: average number of polymorphisms at one site
    avg_poly_aa = sites_poly_aa_num/float(sites_aa_alter_num)
    features_dict[domain_name].append(avg_poly_aa)
    col_names.append('avg_poly_aa')
    
    #Feature: fraction of altered sites with more than 1 polymorphism
    frac_poly_several = sites_poly_aa_several/float(sites_aa_alter_num)
    features_dict[domain_name].append(frac_poly_several)
    col_names.append('frac_poly_several')
    
    #Feature: fraction of rare SNPs (0.5%)
    frac_rare_5 = rare_5_num/float(sites_aa_alter_num)
    features_dict[domain_name].append(frac_rare_5)
    col_names.append('frac_rare_5')
    
    #Feature: fraction of rare SNPs (0.05%)
    frac_rare_05 = rare_05_num/float(sites_aa_alter_num)
    features_dict[domain_name].append(frac_rare_05)
    col_names.append('frac_rare_05')
    
    #Feature: fraction of rare SNPs (0.005%)
    frac_rare_005 = rare_005_num/float(sites_aa_alter_num)
    features_dict[domain_name].append(frac_rare_005)
    col_names.append('frac_rare_005')
    
    #Feature: BLOSUM62 average
    features_dict[domain_name].append(np.mean(blosum62_list))
    col_names.append('BLOSUM_avg')
    features_dict[domain_name].append(np.mean(weighted_blosum62_list))
    col_names.append('weighted_BLOSUM_avg')
    
    #Feature: BLOSUM62 std
    features_dict[domain_name].append(np.std(blosum62_list))
    col_names.append('BLOSUM_std')
    features_dict[domain_name].append(np.std(weighted_blosum62_list))
    col_names.append('weighted_BLOSUM_std')
    
    #Feature: PAM40 average
    features_dict[domain_name].append(np.mean(pam40_list))
    col_names.append('PAM40_avg')
    features_dict[domain_name].append(np.mean(weighted_pam40_list))
    col_names.append('weighted_PAM40_avg')
    
    #Feature: PAM40 std
    features_dict[domain_name].append(np.std(pam40_list))
    col_names.append('PAM40_std')
    features_dict[domain_name].append(np.std(weighted_pam40_list))
    col_names.append('weighted_PAM40_std')
    
    #Feature: pseudo-sequence dN/dS
    features_dict[domain_name].append(utils.calc_dNdS(ref_seq,Nd,Sd))
    col_names.append('pseudo_dNdS')
    
    #Feature: SIFT average
    features_dict[domain_name].append(utils.mean(sift_list))
    col_names.append('SIFT_avg')
    
    #Feature: SIFT std
    features_dict[domain_name].append(utils.std(sift_list))
    col_names.append('SIFT_std')
    
    #Feature: PolyPhen average
    features_dict[domain_name].append(utils.mean(polyphen_list))
    col_names.append('PolyPhen_avg')
    
    #Feature: PolyPhen std
    features_dict[domain_name].append(utils.std(polyphen_list))
    col_names.append('PolyPhen_std')
    
    #Feature: Entropy by position
    avg_nonsyn_pos = np.zeros(len(nonsyn_by_pos.keys()))
    index = 0
    for key in nonsyn_by_pos.keys():
        avg_nonsyn_pos[index] = np.median(nonsyn_by_pos[key])
        index += 1
    features_dict[domain_name].append(utils.entropy(avg_nonsyn_pos))
    col_names.append('entropy_nonsyn_pos')
    
    #Feature: Windowed entropy by position
    features_dict[domain_name].append(utils.density(avg_nonsyn_pos,20))
    col_names.append('entropy_nonsyn_window')
    
    #Feature: Entropy by gene
    avg_nonsyn_gene = np.zeros(len(nonsyn_by_gene.keys()))
    index = 0
    for key in nonsyn_by_gene.keys():
        avg_nonsyn_gene[index] = np.median(nonsyn_by_gene[key])
        index += 1
    features_dict[domain_name].append(utils.entropy(avg_nonsyn_gene))
    col_names.append('entropy_nonsyn_gene')
    
    #Feature: Average phyloP
    features_dict[domain_name].append(np.mean(phyloP_list))
    col_names.append('phyloP_avg')
    
    #Feature: Average phastCons
    features_dict[domain_name].append(np.mean(phastCons_list))
    col_names.append('phastCons_avg')
    
    #Feature: Standard deviation phyloP
    features_dict[domain_name].append(np.std(phyloP_list))
    col_names.append('phyloP_std')
    
    #Feature: Standard deviation phastCons
    features_dict[domain_name].append(np.std(phastCons_list))
    col_names.append('phastCons_std')
    
    #Feature: Ratio phyloP
    num_con_phyloP = sum(map(lambda x: x > phyloP_cutoff, phyloP_list))
    num_noncon_phyloP = sum(map(lambda x: x < -phyloP_cutoff, phyloP_list))
    ratio_phyloP = float(num_con_phyloP) / num_noncon_phyloP
    features_dict[domain_name].append(ratio_phyloP)
    col_names.append('phyloP_ratio')
    
    #Feature: Ratio phastCons
    num_con_phastCons = sum(map(lambda x: x > phastCons_con_cutoff, phastCons_list))
    num_noncon_phastCons = sum(map(lambda x: x < phastCons_noncon_cutoff, phastCons_list))
    ratio_phastCons = float(num_con_phastCons) / num_noncon_phastCons
    features_dict[domain_name].append(ratio_phastCons)
    col_names.append('phastCons_ratio')
    
    #Feature: pfam emission prob fraction
    frac_conserved = float(num_conserved) / total_num
    features_dict[domain_name].append(frac_conserved)
    col_names.append('pfam_frac_conserved')
    
    #Feature: pfam emission prob average max
    avg_max = sum_max / total_num
    features_dict[domain_name].append(avg_max)
    col_names.append('pfam_avg_max')

    #Feature: Average pfam emission prob entropy
    features_dict[domain_name].append(np.mean(prob_entropy))
    col_names.append('pfam_entropy')
    
    #Feature: Clustering with 0.005% cutoff
    features_dict[domain_name].append(clustering_thresh.loc[domain_name,"5e-05"])
    col_names.append('clustering_005')
    
    #Feature: Clustering with 90th percentile
    features_dict[domain_name].append(clustering_percentile.loc[domain_name,"90"])
    col_names.append('clustering_90')
    
    #Feature: Fraction of nonsyn altered positions that are conserved
    frac_con = float(num_con_nonsyn) / sites_aa_alter_num
    features_dict[domain_name].append(frac_con)
    col_names.append('frac_nonsyn_con')
    
    #Feature: Common SNPs
    frac_common = float(num_common) / sites_aa_alter_num
    features_dict[domain_name].append(frac_common)
    col_names.append('frac_common')
    
    #Feature: AA frequencies
    for key in aa_freq_counts:
        features_dict[domain_name].append(float(aa_freq_counts[key]) / sum(aa_freq_counts.values()))
        col_names.append('frac_'+key)
    
    #Feature: AA charge
    for key in charge_counts:
        features_dict[domain_name].append(float(charge_counts[key]) / sum(charge_counts.values()))
        col_names.append('charge_frac_'+key.lower())
        
    features_dict[domain_name].append(charge_counts['POSITIVE']-charge_counts['NEGATIVE'])
    col_names.append('net_charge')
    
    #Feature: AA functional group
    for key in func_group_counts:
        features_dict[domain_name].append(float(func_group_counts[key]) / sum(func_group_counts.values()))
        col_names.append('func_frac_'+key.lower())
    
    #Feature: Average aa volume
    features_dict[domain_name].append(volume_sum / sites_aa_num)
    col_names.append('aa_volume_avg')
    
    #Feature: Hydrophobicity
    features_dict[domain_name].append(np.mean(hydro_list))
    col_names.append('hydrophobicity_avg')
    
    features_dict[domain_name].append(np.std(hydro_list))
    col_names.append('hydrophobicity_std')
    
    #Feature: Flexibility
    features_dict[domain_name].append(np.mean(flex_list))
    col_names.append('flexibility_avg')
    
    features_dict[domain_name].append(np.std(flex_list))
    col_names.append('flexibility_std')
    
    #Feature: SPIDER2
    features_dict[domain_name].append(np.mean(hsa2_cn))
    col_names.append('hsa2_cn')
    
    features_dict[domain_name].append(np.mean(hsa2_hsed))
    col_names.append('hsa2_hsed')
    
    features_dict[domain_name].append(np.mean(hsa2_hseu))
    col_names.append('hsa2_hseu')
    
    features_dict[domain_name].append(np.mean(hsb2_cn))
    col_names.append('hsb2_cn')
    
    features_dict[domain_name].append(np.mean(hsb2_hsed))
    col_names.append('hsb2_hsed')
    
    features_dict[domain_name].append(np.mean(hsb2_hseu))
    col_names.append('hsb2_hseu')
    
    features_dict[domain_name].append(np.mean(asa))
    col_names.append('asa')
    
    if len(ss) == 0:
        for i in range(0,3):
            features_dict[domain_name].append(np.nan)
    else:
        frac_helix = sum(map(lambda x: x == 'H', ss)) / float(len(ss))
        features_dict[domain_name].append(frac_helix)

        frac_sheet = sum(map(lambda x: x == 'E', ss)) / float(len(ss))
        features_dict[domain_name].append(frac_sheet)

        frac_coil = sum(map(lambda x: x == 'C', ss)) / float(len(ss))
        features_dict[domain_name].append(frac_coil)
        
    col_names.append('frac_helix')
    col_names.append('frac_sheet')
    col_names.append('frac_coil')
    
    #Feature: Uniprot Annotations
    features_dict[domain_name].append(has_disulfide)
    col_names.append('disulfide')
    
    features_dict[domain_name].append(has_disulfide_interchain)
    col_names.append('disulfide_interchain')
    
    features_dict[domain_name].append(has_binding)
    col_names.append('binding')
    
    features_dict[domain_name].append(has_calcium)
    col_names.append('calcium')
    
    features_dict[domain_name].append(has_metal)
    col_names.append('metal')
    
    features_dict[domain_name].append(has_dna)
    col_names.append('dna')
    
    features_dict[domain_name].append(has_nucleotide)
    col_names.append('nucleotide')
    
    features_dict[domain_name].append(has_crosslink)
    col_names.append('crosslink')
    
    print("Finished domain "+domain_name)

Finished domain 7TM_GPCR_Srsx
Finished domain 7tm_1
Finished domain 7tm_4
Finished domain AAA
Finished domain AAA_5
Finished domain ABC_membrane
Finished domain ABC_tran
Finished domain Ank
Finished domain Ank_2
Finished domain Ank_3
Finished domain Ank_4
Finished domain Ank_5
Finished domain Annexin
Finished domain Arf
Finished domain Arm
Finished domain BACK
Finished domain BTB
Finished domain BTB_2
Finished domain Bromodomain
Finished domain C1-set
Finished domain C1_1
Finished domain C2
Finished domain C2-set_2
Finished domain CBFD_NFYB_HMF
Finished domain CH
Finished domain CUB
Finished domain Cadherin
Finished domain Cadherin_2
Finished domain Cadherin_3
Finished domain Cadherin_C_2
Finished domain Cadherin_tail
Finished domain Collagen
Finished domain DEAD
Finished domain DUF1220
Finished domain EF-hand_1
Finished domain EF-hand_5
Finished domain EF-hand_6
Finished domain EF-hand_7
Finished domain EF-hand_8
Finished domain EGF
Finished domain EGF_2
Finished domain EGF_3
Finished

#### Export features to a well-formatted data frame

In [4]:
#Build the features table: one row per domain (the dict keys), rows sorted by domain name
#col_names is the column list left over from the last iteration of the feature loop
domains_features_df = pd.DataFrame.from_dict(features_dict, orient='index')
domains_features_df.columns = col_names
domains_features_df = domains_features_df.sort_index()

#Adding the gene / instance counts from the domains stats table (aligned on the index)
domains_features_df["num_genes"] = filtered_domains_df["genes"]
domains_features_df["num_instances"] = filtered_domains_df["instances"]

#Computing log2 of the genes and instances numbers
domains_features_df["num_genes_log2"] = np.log2(domains_features_df["num_genes"])
domains_features_df["num_instances_log2"] = np.log2(domains_features_df["num_instances"])

#Save to file (tab-separated, despite the .csv extension)
features_out_path = curr_dir[0]+"/domains_features_df_filtered"+instance_cutoff+".csv"
domains_features_df.to_csv(features_out_path, sep='\t')

In [5]:
#Display the assembled features table (one row per domain, one column per feature)
domains_features_df

Unnamed: 0,length,avg_maf,avg_maf_nonsyn,sites_aa_alter_num,norm_aa_alter_num,sites_snp_alter_num,norm_snp_alter_num,frac_alter_aa,frac_alter_dna,avg_poly_aa,...,frac_binding,frac_calcium,frac_metal,frac_dna,frac_nucleotide,frac_crosslink,num_genes,num_instances,num_genes_log2,num_instances_log2
7TM_GPCR_Srsx,257,0.000502,0.001846,8887,34.579767,12621,49.108949,0.272057,0.128788,1.211207,...,1,0,1,0,0,0,145,145,7.179909,7.179909
7tm_1,268,0.001191,0.004008,47227,176.220149,67859,253.205224,0.297169,0.142331,1.232431,...,1,0,1,0,1,0,768,768,9.584963,9.584963
7tm_4,280,0.001838,0.005813,33656,120.200000,48190,172.107143,0.316185,0.150909,1.244860,...,0,0,0,0,0,0,478,478,8.900867,8.900867
AAA,132,0.000199,0.001009,1442,10.924242,2025,15.340909,0.197643,0.092516,1.199029,...,1,0,0,0,1,1,52,59,5.700440,5.882643
AAA_5,138,0.000295,0.001226,1880,13.623188,2670,19.347826,0.240348,0.113782,1.209043,...,1,0,0,0,1,0,40,61,5.321928,5.930737
ABC_membrane,274,0.000254,0.000904,2971,10.843066,4209,15.361314,0.281132,0.132759,1.216762,...,0,0,0,0,0,0,37,53,5.209453,5.727920
ABC_tran,137,0.000288,0.001044,2897,21.145985,4157,30.343066,0.275852,0.131943,1.231964,...,1,0,0,0,1,0,69,107,6.108524,6.741467
Ank,32,0.000256,0.001201,4292,134.125000,5972,186.625000,0.212992,0.098787,1.172181,...,1,0,0,0,0,0,224,691,7.807355,9.432542
Ank_2,84,0.000329,0.001564,7818,93.071429,10820,128.809524,0.210603,0.097157,1.173830,...,0,0,0,0,1,0,250,496,7.965784,8.954196
Ank_3,31,0.000176,0.000787,3109,100.290323,4347,140.225806,0.222995,0.103931,1.177227,...,1,0,0,0,1,0,207,504,7.693487,8.977280
