In [66]:
import pandas as pd
import numpy as np
import pickle
import sys
import datetime
from enum import Enum
sys.path.append('/home/anat/Research/ExAC/5.HMM_alter_align')
from af_format_calc import format_af, calculate_af_adj
from collections import defaultdict
from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

In [2]:
#Configuration cell: capture the working directory and load the list of domains to process.
#`!pwd` is an IPython shell capture; the result is indexed as curr_dir[0] wherever a path is built.
curr_dir = !pwd
#Cutoff tag embedded in the pickle file name below ("filtered" + "10" + "_list.pik").
#NOTE(review): the name looks like a typo of "instance_cutoff"; left unchanged because other cells may reference it.
intance_cutoff = "10"

#Reading the list of filtered domains
#NOTE(review): pickle.load can execute arbitrary code; only load files produced by this pipeline.
with open(curr_dir[0]+"/../5.domains_stats/filtered"+intance_cutoff+"_list.pik", 'rb') as handle:
    filtered_domains_list = pickle.load(handle)
#Sort in place so that every later loop visits the domains in the same order.
filtered_domains_list.sort()

In [61]:
def calc_domain_snps_stats(locus_idx, states_dict, pop_names=None):
    """Build the per-population BayeScan input lines for one domain.

    Every position dict found under every state of ``states_dict`` is examined.
    A position becomes a locus only when it has at least one entry in
    ``d["ac_adj"]`` and the sum of those entries is greater than 1; all other
    positions are only counted as "no alterations".

    Input:
        locus_idx: index of the last locus already written; each new locus
            first advances this index by one.
        states_dict: mapping state -> iterable of position dicts. Each position
            dict must hold "ac_adj" and, for every population code, the lists
            "ac_<pop>" and "an_<pop>".
        pop_names: optional iterable of population codes. When omitted, the
            notebook-level ``populations`` list is used, which keeps existing
            two-argument calls working.
    Return: a tuple (info_dict, locus_idx, alterations_cnt, no_alterations_cnt)
        where info_dict maps population code -> list of text lines to write,
        locus_idx is the index of the last locus produced, and the two counters
        are the numbers of positions that were / were not turned into a locus.
    """
    if pop_names is None:
        #Backward-compatible fallback to the notebook-level list; it must be
        #defined before the function is called this way.
        pop_names = populations

    info_dict = defaultdict(list)
    no_alterations_cnt = 0
    alterations_cnt = 0

    for state in states_dict:
        for d in states_dict[state]:

            snps_num = len(d["ac_adj"])
            total_allele_count = np.sum(d["ac_adj"])

            #Positions without SNPs, or with a total allele count of 0 or 1, are skipped
            if not ((snps_num > 0) and (total_allele_count > 1)):
                no_alterations_cnt += 1
                continue

            #Building a line for the BayeScan input
            locus_idx += 1
            alterations_cnt += 1

            for pop in pop_names:
                pop_ac = d["ac_"+pop]

                #The per-SNP "an_<pop>" values are averaged and rounded to a single integer
                pop_an = int(np.round(np.average(d["an_"+pop])))

                #The last count on the line is whatever remains after subtracting all SNP counts
                major_allele_num = pop_an - np.sum(pop_ac)
                snps_str = '  '.join([str(count) for count in pop_ac] + [str(major_allele_num)])

                #Line layout: locus index, averaged an value, number of alleles (SNPs + 1), counts
                info_dict[pop].append(str(locus_idx)+'  '+str(pop_an)+'  '+str(snps_num+1)+'   '+snps_str+"\n")

    return (info_dict, locus_idx, alterations_cnt, no_alterations_cnt)
        

In [63]:
def create_BayeScan_file(bayescan_input, info_dict, populations, alterations_cnt):
    """Write a complete BayeScan input file and close the handle.

    Input:
        bayescan_input: an open, writable file-like object. It is always closed
            before this function returns, even when a write fails.
        info_dict: mapping population code -> list of ready-made text lines
            (each line already ends with a newline).
        populations: ordered population codes; the position in this sequence
            (starting at 1) is the number written in the "[pop]=" header.
        alterations_cnt: number of loci, written in the "[loci]=" header.
    Return: None.
    """
    header = "[loci]={0}\n\n[populations]={1}\n\n".format(alterations_cnt, len(populations))

    #try/finally so the handle is released even if one of the writes raises
    try:
        bayescan_input.write(header)

        for pop_idx, pop in enumerate(populations, 1):
            bayescan_input.write("[pop]="+str(pop_idx)+'\n')

            for line in info_dict[pop]:
                bayescan_input.write(line)

            #A blank line terminates each population section
            bayescan_input.write('\n')
    finally:
        bayescan_input.close()

In [74]:
#%%time
class input_format(Enum):
    """Layout of the generated BayeScan input: one combined file or one file per domain."""
    #ALL: the lines of every domain are accumulated and written into a single input file
    ALL = 0
    #SEP: a separate input file is written for each domain
    SEP = 1

#Choose the input format: all domains in one file or each in a separate file
input_option = input_format.ALL

#Directory holding one sub-directory of pickled states dicts per domain
input_path = curr_dir[0]+"/../5.HMM_alter_align/domains_states_dicts/pfam-v30/"

#Population codes; calc_domain_snps_stats reads this notebook-level list and
#prefixes each code with "an_" / "ac_" to look up the per-population fields
populations = ["afr","amr","eas","fin","nfe","oth","sas"]
#domain name -> [alterations_cnt, no_alterations_cnt], filled in by the loop below
domain_features = defaultdict(list)

if (input_option == input_format.ALL):
    #Create the BayeScan input file
    #The handle stays open for the whole loop and is closed inside create_BayeScan_file at the end of the cell
    bayescan_input  = open("BayeScan/BayeScan_input/All_domains10_SNPs","w")
    #Accumulators over all domains; NOTE(review): these names exist only when the ALL option is selected
    all_info_dict = defaultdict(list)
    all_no_alterations_cnt = 0
    all_alterations_cnt = 0

#Index of the last locus written; continues across domains in ALL mode, reset per domain in SEP mode
locus_idx = 0

for domain_name in filtered_domains_list:
    
    #Reading the domain states dictionary
    #IPython shell capture; `ls -t` sorts by modification time, so the first entry is taken as the newest file
    domain_dirfiles = !ls -t $input_path$domain_name
    recent_filename = domain_dirfiles[0]
    
    #Opening the most recent file
    #NOTE(review): the pickle handle stays open for the rest of the loop body although it is only needed by pickle.load
    with open(input_path+domain_name+"/"+recent_filename, 'rb') as handle:
        states_dict = pickle.load(handle)
        
        #Build this domain's lines; locus_idx comes back advanced by the number of loci produced
        (domain_info_dict, locus_idx, alterations_cnt, no_alterations_cnt) = calc_domain_snps_stats(locus_idx, states_dict)
        
        if (input_option == input_format.SEP):
            
            #Create the BayeScan input file for each domain separately
            bayescan_input  = open("BayeScan/BayeScan_input/"+domain_name+"_SNPs","w")
            
            #Writes the file and closes the handle
            create_BayeScan_file(bayescan_input, domain_info_dict, populations, alterations_cnt)
            
            #Resetting the locus index so the next domain's file starts counting from 1 again
            locus_idx = 0
        else:
            #ALL mode: append this domain's lines and counters to the global accumulators
            for pop in populations:
                all_info_dict[pop].extend(domain_info_dict[pop])
            all_no_alterations_cnt += no_alterations_cnt
            all_alterations_cnt += alterations_cnt
        
        #Per-domain summary: first the altered count, then the non-altered count
        domain_features[domain_name].append(alterations_cnt)
        domain_features[domain_name].append(no_alterations_cnt)
        
        print "Finished domain "+domain_name

#Create the BayeScan input file for all the domains together
if (input_option == input_format.ALL):
    create_BayeScan_file(bayescan_input, all_info_dict, populations, all_alterations_cnt)

Finished domain 2OG-FeII_Oxy_3
Finished domain 7TM_GPCR_Srsx
Finished domain 7tm_1
Finished domain 7tm_2
Finished domain 7tm_3
Finished domain 7tm_4
Finished domain A2M
Finished domain A2M_N
Finished domain A2M_N_2
Finished domain A2M_comp
Finished domain A2M_recep
Finished domain AAA
Finished domain AAA_11
Finished domain AAA_12
Finished domain AAA_17
Finished domain AAA_18
Finished domain AAA_33
Finished domain AAA_5
Finished domain AAA_6
Finished domain AAA_7
Finished domain AAA_8
Finished domain AAA_9
Finished domain AA_permease
Finished domain AA_permease_2
Finished domain ABC2_membrane_3
Finished domain ABC_membrane
Finished domain ABC_tran
Finished domain ADAM_CR
Finished domain ADAM_spacer1
Finished domain ADH_N
Finished domain ADH_zinc_N
Finished domain ADK
Finished domain AMP-binding
Finished domain AMP-binding_C
Finished domain ANAPC3
Finished domain ANAPC4_WD40
Finished domain ANATO
Finished domain ANF_receptor
Finished domain APC_r
Finished domain APOBEC_C
Finished domain 

In [77]:
#Inspect the accumulated number of positions that were not turned into loci (only defined after an ALL-mode run)
all_no_alterations_cnt

3067797

In [6]:
#Export the per-domain counters as a tab-separated table, one row per domain
#NOTE(review): this path is <cwd>/BayeScan_input/, while the input files are written under BayeScan/BayeScan_input/ - confirm it is intended
features_csv_path = curr_dir[0]+"/BayeScan_input/domains10_features_rm1.csv"

#Rows are the domain names; the two list entries of each domain become the two columns
features_unsorted_df = pd.DataFrame.from_dict(domain_features,orient='index')
features_unsorted_df.columns = ["altered_positions","non-altered_positions"]

#Sort by domain name before writing
domains_features_df = features_unsorted_df.sort_index()
domains_features_df.to_csv(features_csv_path, sep='\t')

In [8]:
%%time
#Running BayeScan on the input file(s)
#Suffix that, appended to a domain name, gives the per-domain input file name ("<domain>_SNPs")
file_ext = "_SNPs"
for domain_name in filtered_domains_list:

    #Shell call: run the BayeScan binary on BayeScan/BayeScan_input/<domain>_SNPs and redirect its stdout to BayeScan/out
    #-od is given the output directory; -out_freq presumably requests the optional allele-frequency output - confirm against the BayeScan manual
    #NOTE(review): per-domain files are only produced when input_option is SEP; with ALL only "All_domains10_SNPs" is written
    !./BayeScan/BayeScan2.1/binaries/BayeScan2.1_linux64bits BayeScan/BayeScan_input/$domain_name$file_ext -od BayeScan/BayeScan_output -out_freq > BayeScan/out
    #NOTE(review): unconditional break - only the first domain in the list is run; looks like a debugging leftover
    break

CPU times: user 744 ms, sys: 100 ms, total: 844 ms
Wall time: 6min 17s


In [23]:
#Load the BayeScan "_fst.txt" results table for the current domain_name (first column is used as the index)
#sep=r"\s+" is the non-deprecated spelling of whitespace-delimited parsing and also works on old pandas versions
Bayes_out = pd.read_csv("BayeScan/BayeScan_output/"+domain_name+file_ext+"_fst.txt", sep=r"\s+", index_col=0, header=0)

In [26]:
#Spot check: the "alpha" value of the second row (iloc is 0-based) of the loaded BayeScan table
Bayes_out.iloc[1]["alpha"]

-0.0027872999999999999

In [60]:
#Map the alpha parameters back to the domains positions
for domain_name in filtered_domains_list:
    
    #Reading the domain states dictionary
    domain_dirfiles = !ls -t $input_path$domain_name
    recent_filename = domain_dirfiles[0]
    
    #Openning the most recent file
    with open(input_path+domain_name+"/"+recent_filename, 'rb') as handle:
        states_dict = pickle.load(handle)
        
    #Open the BayeScan output file
    Bayes_out = pd.read_csv("BayeScan/BayeScan_output/"+domain_name+file_ext+"_fst.txt", delim_whitespace=True, index_col=0, header=0)
    
    locus_idx = 0
    
    alpha_states_dict = defaultdict(list)
    for state in states_dict:
    
            for d in states_dict[state]:
                
                snps_num = len(d["ac_adj"])
                total_allele_count = np.sum(d["ac_adj"])
            
                #Checking if this position is one of the tested locus
                if (snps_num > 0) and (total_allele_count > 1):
                    alpha = Bayes_out.iloc[locus_idx]["alpha"]
                    d["alpha"] = alpha
                    locus_idx += 1
                    
                    #Update the dict with the item containing alpha
                    alpha_states_dict[state].append(d)
    
    
    #Saving the alpha dicts
    !mkdir -p BayeScan/alpha_dicts/$domain_name
    with open(curr_dir[0]+"/BayeScan/alpha_dicts/"+domain_name+"/"+domain_name+"_alpha_states_dict_"+datetime.date.today().strftime("%m.%d.%y")+".pik", 'wb') as handle:
        pickle.dump(alpha_states_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    break
                    

In [7]:
#Scratch cell: pick one position dict (entry -7 under key 1 of the last loaded states_dict) for manual inspection
d = states_dict[1][-7]

In [138]:
#Scratch cell: presumably the skipped-position count of the most recently processed domain - depends on kernel state
no_alterations_cnt

20069

In [113]:
#Scratch cell: display the position dict selected for inspection
d

{'PolyPhen': ['probably_damaging(0.999)', 'probably_damaging(0.999)', ''],
 'SIFT': ['deleterious(0.02)', 'deleterious(0.01)', ''],
 'aa_ref': 'C',
 'ac_adj': [1, 3, 4],
 'ac_afr': [0, 1, 2],
 'ac_amr': [0, 0, 0],
 'ac_eas': [0, 0, 0],
 'ac_fin': [0, 0, 0],
 'ac_het': [1, 3, 4],
 'ac_hom': [0, 0, 0],
 'ac_nfe': [1, 2, 1],
 'ac_oth': [0, 0, 0],
 'ac_sas': [0, 0, 1],
 'af': 3.296e-05,
 'af_adj': 3.479e-05,
 'alterations_af_adj_dict': defaultdict(list,
             {'W': [2.609e-05], 'Y': [8.701e-06]}),
 'an_adj': [114924, 114996, 114996],
 'an_afr': [10242, 10246, 10246],
 'an_amr': [11508, 11506, 11506],
 'an_eas': [8594, 8592, 8592],
 'an_fin': [6608, 6606, 6606],
 'an_nfe': [64606, 64612, 64612],
 'an_oth': [830, 832, 832],
 'an_sas': [12536, 12602, 12602],
 'bp_af_adj_dict': {'TAC': 8.701e-06, 'TGG': 2.609e-05, 'TGT': 3.478e-05},
 'bp_af_dict': {'TAC': 8.239e-06, 'TGG': 2.472e-05, 'TGT': 3.295e-05},
 'bp_ref': 'TGC',
 'chrom': '22',
 'chrom_pos': (37603680, 37603679, 37603678),
 'cli

In [112]:
#Scratch cell: show the first generated line for the "eas" population
#NOTE(review): info_dict is not defined at notebook level in the visible cells (it is local to calc_domain_snps_stats); this fails on a fresh kernel
print info_dict["eas"][0]

1	8593	4	0	0	0	8593



In [44]:
#Scratch cell: str() of the whole list yields its bracketed repr, not space-separated counts
str(d["ac_afr"])

'[0, 1, 2]'

In [60]:
#Scratch cell from an earlier iteration; NOTE(review): afr_info is not defined in any visible cell - fails on a fresh kernel
print afr_info[0]

1	114972.0	4	0	1	2	


In [49]:
#Scratch cell: string-append experiment; NOTE(review): `a` is not defined in any visible cell and re-running changes the result
a += "1"

In [50]:
#Scratch cell: display the string built by the append experiment
a

'1'

In [51]:
#Scratch cell: NOTE(review): `b` is not defined in any visible cell - leftover debugging output
b

''