In [1]:
import pandas as pd
import numpy as np
import cPickle as pickle
from collections import defaultdict
import os
from IPython.core.display import HTML
HTML("<style>.container { width:95% !important; }</style>")

In [2]:
#Getting path
curr_dir = !pwd
intance_cutoff = "10"

#Reading the list of filtered domains
with open(curr_dir[0]+"/../5.domains_stats/filtered"+intance_cutoff+"_list.pik", 'rb') as handle:
    filtered_domains_list = pickle.load(handle)
filtered_domains_list.sort()

In [3]:
# Modified binary search to find correct fragment file
# Note: a position in a gap will still return the previous fragment — need to check if position is out of bounds later
#
# cons: "phyloP" or "phastCons"
# chrom: chromosome number
# pos: position number
# Cache of fragment-start indices keyed by fragment directory, so that
# index.pik is unpickled once per (cons, chrom) rather than once per position.
# Re-run this cell to clear the cache if the fragment files are regenerated.
_frag_index_cache = {}

def bin_search(cons,chrom,pos):
    """Return the start position of the fragment file that should hold `pos`.

    cons:  "phyloP" or "phastCons" (selects the <cons>_frags_txt directory)
    chrom: chromosome identifier; it is passed through str() to build the path
    pos:   position to look up

    Returns the largest fragment start in the chromosome's index that is
    <= pos, or -1 when the index is empty or pos precedes the first fragment.
    A position that falls in a gap between fragments still maps to the
    preceding fragment, so the caller must check that the position really
    lies inside that fragment.
    """
    input_path = curr_dir[0]+"/../conservation_scores/"+cons+"_frags_txt/chr"+str(chrom)+"/"
    index = _frag_index_cache.get(input_path)
    if index is None:
        with open(input_path+"index.pik",'rb') as handle:
            index = pickle.load(handle)
        _frag_index_cache[input_path] = index
    # An empty index has no first element to compare against
    if not index or pos < index[0]:
        return(-1)
    return(rec_helper(0,len(index)-1,index,pos))

def rec_helper(lo,hi,a,val):
    """Binary-search a[lo..hi] for the last element that is <= val.

    lo, hi: inclusive index bounds of the search window
    a:      list of fragment start positions in ascending order
    val:    position to locate

    Returns the element a[i] such that a[i] <= val < a[i+1] (or the final
    element of `a` when val is at or beyond it). Returns -1 when the window
    is empty or no element in it satisfies that condition.

    The search is iterative; the name is kept for existing callers.
    """
    last = len(a)-1
    while lo <= hi:
        # Floor division keeps mid an integer under both Python 2 and Python 3.
        # Indices will never be large, so computing mid in this way is fine.
        mid = (hi+lo) // 2
        a_val = a[mid]
        if val >= a_val and (mid == last or val < a[mid+1]):
            return(a_val)
        if val > a_val:
            lo = mid+1
        else:
            hi = mid-1
    return(-1)

55060001

In [5]:
# Break each chromosome into more manageable fragments — takes ~1 hour to run
#
# For each score type and chromosome, the .wigFix file is split at every
# header line (a line starting with 'f'): the data lines that follow a header
# are written to <start>.txt, and the list of all start positions is pickled
# to index.pik in the same directory.
input_path = curr_dir[0] + "/../conservation_scores/"

# Chromosomes: 1-22 followed by X and Y
chroms = [str(i) for i in range(1,23)] + ['X','Y']

for cons in ["phyloP","phastCons"]:
    # Loop over chromosomes
    for chrom in chroms:
        frag_dir = input_path+cons+"_frags_txt/chr"+chrom
        # makedirs also creates the parent <cons>_frags_txt directory; the guard
        # lets the cell be re-run without failing on existing directories
        if not os.path.isdir(frag_dir):
            os.makedirs(frag_dir)
        index_list = []
        frag = None
        try:
            with open(input_path + cons+"/chr"+chrom+"."+cons+"100way.wigFix") as f:
                for line in f:
                    if line[0] == 'f':
                        # Close the previous fragment, if any, before starting a new one
                        if frag is not None:
                            frag.close()
                        # Third space-separated token looks like "start=<pos>"
                        start = line.split(" ")[2]
                        curr_pos = int(start[6:])
                        frag = open(frag_dir+"/"+str(curr_pos)+".txt", 'wb')
                        index_list.append(curr_pos)
                    else:
                        if frag is None:
                            raise ValueError("data line before first header in chr"+chrom+" "+cons+" file")
                        frag.write(line)
        finally:
            # The last fragment of the chromosome has no following header to close it
            if frag is not None:
                frag.close()
        with open(frag_dir+"/index.pik", 'wb') as handle:
            pickle.dump(index_list, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [14]:
# Add conservation scores to domain dictionaries — takes ~2 hours

import linecache

# Find missing scores and write to file
missing = open(curr_dir[0]+"/../conservation_scores/missing.txt",'w')
# Read conservation scores and add to domain dictionaries
input_path = curr_dir[0]+"/../5.HMM_alter_align/domains_states_dicts/pfam-v30/"

# Coverage variables
missing_count = 0
found_count = 0

# Loop over domains
for domain_name in filtered_domains_list:
    # Reading the domain states dictionary
    domain_dirfiles = !ls -t $input_path$domain_name
    # Find the most recent file
    recent_priority = -1
    recent_filename = ""
    for f in domain_dirfiles:
        tokens = f.split("_")
        date = tokens[len(tokens)-1].split(".")
        month = int(date[0])
        day = int(date[1])
        # Not all files have years, but those that do are the most recent
        if date[2] != "pik":
            year = int(date[2])
        else:
            year = 0
        priority = year*1000 + month*50 + day
        if priority > recent_priority:
            recent_priority = priority
            recent_filename = f
    with open(input_path+domain_name+"/"+recent_filename, 'rb') as handle:
        states_dict = pickle.load(handle)

    # Loop over dictionary
    for state in states_dict:
        for d in states_dict[state]:
            for cons in ["phyloP","phastCons"]:
                # Find correct file
                frag_path = curr_dir[0]+"/../conservation_scores/"+cons+"_frags_txt/chr"+str(d['chrom'])+"/"
                # Get appropriate scores
                scores = []
                old_pos = -1
                for pos in d['chrom_pos']:
                    start = bin_search(cons,str(d['chrom']),pos)
                    score = linecache.getline(frag_path+str(start)+".txt",pos-start+1).strip()
                    # Track number of positions not found
                    if score == "":
                        missing_count += 1
                    else:
                        scores.append(float(score))
                        found_count += 1
                    old_pos = pos
                d[cons] = scores
                if len(scores) < 3:
                    missing.write(domain_name+" "+d['chrom']+" "+str(d['chrom_pos'])+"\n")

    # Overwrite original file
    with open(input_path+domain_name+"/"+recent_filename, 'wb') as handle:
        pickle.dump(states_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("Finished domain "+domain_name)
    linecache.clearcache()
missing.close()

Finished domain FERM_M
Finished domain Myosin_head


In [None]:
# Report coverage: positions with a score first, then positions without one
for count in (found_count, missing_count):
    print(count)

In [12]:
# Verify correctness of score of a given position by reading through the large chromosome file
#
# Walks the .wigFix file line by line, tracking the position of each data
# line from the most recent header, and prints the score stored for ver_pos.
ver_pos = 68050786
chrom = "14"
cons = "phastCons"

input_path = curr_dir[0] + "/../conservation_scores/"
with open(input_path + cons+"/chr"+chrom+"."+cons+"100way.wigFix") as f:
    for line in f:
        if line[0] == 'f':
            # Header line: third space-separated token looks like "start=<pos>"
            start = line.split(" ")[2]
            curr_pos = int(start[6:])
        elif curr_pos == ver_pos:
            print(line.strip())
            break
        elif curr_pos > ver_pos:
            # Already past ver_pos, so it fell in a gap between fragments
            print("Not found :(")
            break
        else:
            curr_pos += 1
    else:
        # End of file reached without getting to ver_pos
        print("Not found :(")

1.000
