## Extract conservation scores for each position

### Requirements:
1. phyloP and phastCons scores arranged in "chunks" (per-chromosome fragment files).
2. Domains list
3. Domain dictionary files

### Instructions:
Run the cells in order

### Output:
1. Creates a new dictionary for each domain with phastCons and phyloP conservation scores at each site
2. A text file with missing sites

In [1]:
import pandas as pd
import numpy as np
import cPickle as pickle
import datetime
from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

In [2]:
#Getting path
curr_dir = !pwd
pfam_version = "30"
intance_cutoff = "10"

update_same_file = False

if (update_same_file):
    input_path = curr_dir[0]+"/ext_features_dicts/pfam-v"+pfam_version+"/"
else:
    input_path = curr_dir[0]+"/../5.HMM_alter_align/domains_states_dicts/pfam-v"+pfam_version+"/"


#Reading the list of filtered domains
with open(curr_dir[0]+"/../5.domains_stats/pfam-v"+pfam_version+"/filtered"+intance_cutoff+"_list.pik", 'rb') as handle:
    filtered_domains_list = pickle.load(handle)
filtered_domains_list.sort()

#### Helper functions - Find the file fragment corresponding to a given position on a chromosome

In [3]:
# Modified binary search to find correct fragment file
# Note: a position in a gap will still return the previous fragment - need to check if position is out of bounds later
#
# cons: "phyloP" or "phastCons"
# chrom: chromosome identifier (appended to "chr" to build the folder name)
# pos: position number
#
# Returns the index entry selected by rec_helper (the largest entry <= pos),
# or -1 when the index is empty or pos lies before its first entry.

# Fragment indexes already read from disk, keyed by (cons, chrom). The index
# file does not depend on pos, so it only needs to be unpickled once per
# chromosome and score type instead of once per queried position.
_frag_index_cache = {}

def bin_search(cons,chrom,pos):
    cache_key = (cons, str(chrom))
    index = _frag_index_cache.get(cache_key)
    if index is None:
        input_path = curr_dir[0]+"/../conservation_scores/"+cons+"_frags_txt/chr"+str(chrom)+"/"
        with open(input_path+"index.pik",'rb') as handle:
            index = pickle.load(handle)
        _frag_index_cache[cache_key] = index
    # An empty index has no first element to compare against: nothing can match
    if len(index) == 0 or pos < index[0]:
        return(-1)
    return(rec_helper(0,len(index)-1,index,pos))

def rec_helper(lo,hi,a,val):
    """Binary-search helper: find the largest element of `a` that is <= `val`.

    lo, hi: inclusive index bounds of the part of `a` still being searched
    a:      sequence of values; the search assumes it is sorted in ascending order
    val:    value to locate

    Returns the matching element itself (not its index), or -1 if the search
    range is exhausted without finding such an element.
    """
    last = len(a) - 1
    while lo <= hi:
        # Floor division keeps mid an integer on every Python version
        # (plain "/" yields a float under Python 3 / __future__ division,
        # which cannot be used as an index).
        mid = (hi + lo) // 2
        a_val = a[mid]
        # Hit: val is at or after this element and before the next one
        # (or this is the final element of the whole sequence).
        if val >= a_val and (mid == last or val < a[mid+1]):
            return(a_val)
        if val > a_val:
            lo = mid + 1
        else:
            hi = mid - 1
    return(-1)

### Add conservation scores to domain dictionaries

In [8]:
# Scratch lookup: position of the "IRS" domain in the sorted filtered_domains_list
filtered_domains_list.index("IRS")

301

In [5]:
# Scratch lookup: name of the domain stored at index 478 of filtered_domains_list
filtered_domains_list[478]

'Pkinase'

In [None]:
%%time
import linecache

# Get current date
today = datetime.date.today()

# Find missing scores and write to file
missing = open(curr_dir[0]+"/../conservation_scores/missing.txt",'w')

# Loop over domains
#for i in range(0,len(filtered_domains_list)):
#for domain_name in filtered_domains_list:
for i in range(294,302):
    domain_name = filtered_domains_list[i]
        
    # Reading the domain states dictionary
    domain_dirfiles = !ls -t $input_path$domain_name
    # Find the most recent file
    recent_priority = -1
    recent_filename = ""
    for f in domain_dirfiles:
        tokens = f.split("_")
        date = tokens[len(tokens)-1].split(".")
        month = int(date[0])
        day = int(date[1])
        # Not all files have years, but those that do are the most recent
        if date[2] != "pik":
            year = int(date[2])
        else:
            year = 0
        priority = year*1000 + month*50 + day
        if priority > recent_priority:
            recent_priority = priority
            recent_filename = f
    with open(input_path+domain_name+"/"+recent_filename, 'rb') as handle:
        states_dict = pickle.load(handle)

    # Loop over dictionary
    for state in states_dict:
        for d in states_dict[state]:
            for cons in ["phyloP","phastCons"]:
                # Find correct file
                frag_path = curr_dir[0]+"/../conservation_scores/"+cons+"_frags_txt/chr"+str(d['chrom'])+"/"
                # Get appropriate scores
                scores = []
                for pos in d['chrom_pos']:
                    start = bin_search(cons,str(d['chrom']),pos)
                    score = linecache.getline(frag_path+str(start)+".txt",pos-start+1).strip()
                    
                    if score != "":
                        scores.append(float(score))
                d[cons] = scores
                if len(scores) < 3:
                    missing.write(domain_name+" "+d['chrom']+" "+str(d['chrom_pos'])+"\n")
        linecache.clearcache()
    
    #Saving the updated dictionary
    !mkdir -p ext_features_dicts/pfam-v31/$domain_name
    
    with open(curr_dir[0]+"/ext_features_dicts/pfam-v"+pfam_version+"/"+domain_name+"/"+domain_name+"_hmm_states_dict_"+today.strftime('%m.%d.%y')+".pik", 'wb') as handle:
        pickle.dump(states_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("Finished domain "+domain_name)
    linecache.clearcache()
    
missing.close()

Finished domain IBR
Finished domain IGFBP
Finished domain IL6Ra-bind
Finished domain IL8
Finished domain INT_SG_DDX_CT_C
Finished domain IQ
Finished domain IRK
