## Extract coverage scores for each position

For each protein position, extract the ExAC mean coverage at each of the 3 genomic positions of its codon.
Add the mean of these values as another attribute (`coverage_mean`).

### Output:
Creates a new dictionary, or updates the existing one, with the mean coverage for each position.

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pickle
import gzip
from collections import defaultdict
import datetime
from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>");

In [2]:
curr_dir = !pwd
domains_th = "10"
update_same_file = True

if (update_same_file):
    input_path = curr_dir[0]+"/ext_features_dicts/pfam-v30/"
else:
    input_path = curr_dir[0]+"/../5.HMM_alter_align/domains_states_dicts/pfam-v30/"
output_path = curr_dir[0]+"/ext_features_dicts/pfam-v30/"

#Read the list of domains
with open(curr_dir[0]+"/../5.domains_stats/filtered"+domains_th+"_list.pik", 'rb') as handle:
    filtered_domains_list = pickle.load(handle)
filtered_domains_list.sort()

In [3]:
# Read every chromosome's tab-separated coverage table into a dict keyed by chromosome name
chromosome_names = list(map(str, range(1, 23))) + ["X", "Y"]

cov_dict = dict(
    (chrom, pd.read_csv(curr_dir[0]+"/coverage_raw/Panel.chr"+chrom+".coverage.txt", sep='\t'))
    for chrom in chromosome_names
)

In [5]:
%%time
for domain_name in filtered_domains_list:
    domain_name = "EF-hand_4"
    dirfiles = !ls -t $input_path$domain_name
    filename = dirfiles[0]
    with open(input_path+domain_name+"/"+filename, 'rb') as handle:
        states_dict = pickle.load(handle)
    
    for state in states_dict.keys():
        
        for d in states_dict[state]:
            
            curr_chrom = d["chrom"]
            curr_pos = d["chrom_pos"]
            curr_cov = cov_dict[curr_chrom]
            cov_mean_sum = 0
            cov_mean_num = 0
            for pos in curr_pos:
                curr_mean = curr_cov[curr_cov["pos"] == pos]["mean"]
                if (len(curr_mean) == 0):
                    continue
                cov_mean_sum += float(curr_mean)
                cov_mean_num += 1
            
            if (cov_mean_num == 0):
                cov_mean = 0
            else:
                cov_mean =  float('{:.3e}'.format(float(cov_mean_sum/cov_mean_num)))
            d["coverage_mean"] = cov_mean
            
    
    #Saving the updated dictionary
    !mkdir -p ext_features_dicts/pfam-v30/$domain_name
    
    if (update_same_file):
        with open(curr_dir[0]+"/ext_features_dicts/pfam-v30/"+domain_name+"/"+filename, 'wb') as handle:
            pickle.dump(states_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    else:
        with open(curr_dir[0]+"/ext_features_dicts/pfam-v30/"+domain_name+"/"+domain_name+"_hmm_states_dict_"+datetime.date.today().strftime("%m.%d.%y")+".pik", 'wb') as handle:
            pickle.dump(states_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
    print "Finished "+domain_name
    

Finished EF-hand_4
CPU times: user 18.4 s, sys: 360 ms, total: 18.8 s
Wall time: 19.1 s


In [5]:
# Spot-check: display the first entry stored under key 1 of the last loaded dictionary
states_dict[1][0]

{'100-way-BLOSUM_JSD': 0.782107110231,
 '100-way-BLOSUM_JSD0-way-BLOSUM_JSD': 0.782107110231,
 'PolyPhen': [],
 'SIFT': [],
 'SwissProt': [],
 'aa_ref': 'P',
 'ac_adj': [],
 'ac_afr': [],
 'ac_amr': [],
 'ac_eas': [],
 'ac_fin': [],
 'ac_het': [],
 'ac_hom': [],
 'ac_nfe': [],
 'ac_oth': [],
 'ac_sas': [],
 'af': 0,
 'af_adj': 0,
 'af_ref_orig': 'P',
 'alterations_af_adj_dict': defaultdict(list, {}),
 'an': [],
 'an_adj': [],
 'an_afr': [],
 'an_amr': [],
 'an_eas': [],
 'an_fin': [],
 'an_nfe': [],
 'an_oth': [],
 'an_sas': [],
 'bp_af_adj_dict': {},
 'bp_af_dict': {},
 'bp_list': [],
 'bp_ref': 'CCA',
 'chrom': '1',
 'chrom_pos': (145684648, 145684649, 145684650),
 'clin_sig': [],
 'coverage_mean': 53.66,
 'ens_gene': 'ENSG00000121848.9',
 'ens_prot': [],
 'phastCons': [1.0, 1.0, 0.669],
 'phyloP': [7.198, 7.198, -0.088],
 'prot_pos': 213}