## Extract coverage scores for each position

### Instructions:
Run the cells in order

### Output:
1. Creates a new dictionary for each domain with the mean coverage (`coverage_mean`) at each site
2. A text file with missing sites

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pickle
import gzip
from collections import defaultdict
import datetime
from IPython.core.display import HTML
# Widen the notebook container to the full browser width.
# NOTE(review): the trailing ";" suppresses this cell's output, so the HTML object may never be rendered — confirm the style is actually applied.
HTML("<style>.container { width:100% !important; }</style>");

In [2]:
curr_dir = !pwd
domain_name = "PUF"
input_path = curr_dir[0]+"/../5.HMM_alter_align/domains_states_dicts/pfam-v30/"+domain_name+"/"
output_path = curr_dir[0]+"/coverage_states_dicts/pfam-v30/"+domain_name+"/"
dirfiles = !ls -t $input_path
filename = dirfiles[0]

In [3]:
# Load the states dictionary selected in the configuration cell.
# NOTE: pickle.load can execute arbitrary code; only point this at files you trust.
states_dict_file = input_path + filename
with open(states_dict_file, 'rb') as pickled_states:
    states_dict = pickle.load(pickled_states)

In [4]:
# Inspect the first record stored under key 1 to check the structure of the loaded dictionary
states_dict[1][0]

{'PolyPhen': [],
 'SIFT': [],
 'SwissProt': [],
 'aa_ref': 'E',
 'ac_adj': [],
 'ac_afr': [],
 'ac_amr': [],
 'ac_eas': [],
 'ac_fin': [],
 'ac_het': [],
 'ac_hom': [],
 'ac_nfe': [],
 'ac_oth': [],
 'ac_sas': [],
 'af': 0,
 'af_adj': 0,
 'af_ref_orig': 'E',
 'alterations_af_adj_dict': defaultdict(list, {}),
 'an': [],
 'an_adj': [],
 'an_afr': [],
 'an_amr': [],
 'an_eas': [],
 'an_fin': [],
 'an_nfe': [],
 'an_oth': [],
 'an_sas': [],
 'bp_af_adj_dict': {},
 'bp_af_dict': {},
 'bp_list': [],
 'bp_ref': 'GAG',
 'chrom': '1',
 'chrom_pos': (31426610, 31426609, 31426608),
 'clin_sig': [],
 'ens_gene': 'ENSG00000134644.11',
 'ens_prot': [],
 'prot_pos': 848}

In [4]:
# Read one tab-separated coverage table per chromosome into cov_dict,
# keyed by chromosome name ("1".."22", "X", "Y").
chromosome_names = [str(autosome) for autosome in range(1, 23)] + ["X", "Y"]
coverage_dir = curr_dir[0] + "/coverage_raw/"

cov_dict = {}
for chrom in chromosome_names:
    coverage_file = coverage_dir + "Panel.chr" + chrom + ".coverage.txt"
    cov_dict[chrom] = pd.read_csv(coverage_file, sep='\t')

In [26]:
%%time

for state in states_dict.keys():
    for i in range(len(states_dict[state])):
        curr_dict = states_dict[state][i]
        curr_pos = curr_dict["chrom_pos"]
        curr_chrom = curr_dict["chrom"]
        curr_cov = cov_dict[curr_chrom]
        mean_sum = 0
        for pos in curr_pos:
            curr_mean = curr_cov[curr_cov["pos"] == pos]["mean"]
            if (len(curr_mean) == 0):
                continue
            mean_sum += float(curr_mean)
        cov_mean = float('{:.3e}'.format(float(mean_sum/3)))
        curr_dict["coverage_mean"] = cov_mean
    print "Finished state "+str(state)

Finished state 1
Finished state 2
Finished state 3
Finished state 4
Finished state 5
Finished state 6
Finished state 7
Finished state 8
Finished state 9
Finished state 10
Finished state 11
Finished state 12
Finished state 13
Finished state 14
Finished state 15
Finished state 16
Finished state 17
Finished state 18
Finished state 19
Finished state 20
Finished state 21
Finished state 22
Finished state 23
Finished state 24
Finished state 25
Finished state 26
Finished state 27
Finished state 28
Finished state 29
Finished state 30
Finished state 31
Finished state 32
Finished state 33
Finished state 34
Finished state 35
Finished state 36
Finished state 37
Finished state 38
Finished state 39
Finished state 40
Finished state 41
Finished state 42
Finished state 43
Finished state 44
Finished state 45
Finished state 46
Finished state 47
Finished state 48
Finished state 49
Finished state 50
Finished state 51
Finished state 52
Finished state 53
Finished state 54
Finished state 55
Finished state 56
F

In [27]:
# Save the annotated dictionary next to the other coverage outputs; the file
# name carries today's date as month.day.
date_tag = datetime.date.today().strftime("%m.%d")
cov_dict_file = output_path + domain_name + "_hmm_states_dict_" + date_tag + "_cov.pik"
with open(cov_dict_file, 'wb') as pickle_out:
    pickle.dump(states_dict, pickle_out, protocol=pickle.HIGHEST_PROTOCOL)