## Add SPIDER chemical scores to domain positions

For each protein position, update dictionaries with SPIDER2 characteristics.

Input: the SPIDER2 results, saved at: "ExAC/SPIDER/SPIDER2/protein_seq_results/domain_dicts/"

### Output:
Creates a new dictionary, or updates the existing dictionary, with the SPIDER2 features for each position.

In [None]:
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
import datetime

In [None]:
curr_dir = !pwd
domains_th = "10"
update_same_file = True

if (update_same_file):
    input_path = curr_dir[0]+"/ext_features_dicts/pfam-v30/"
else:
    input_path = curr_dir[0]+"/../5.HMM_alter_align/domains_states_dicts/pfam-v30/"

#Read the list of domains
with open(curr_dir[0]+"/../5.domains_stats/filtered"+domains_th+"_list.pik", 'rb') as handle:
    filtered_domains_list = pickle.load(handle)
filtered_domains_list.sort()

spider_path = curr_dir[0]+"/../SPIDER/SPIDER2/protein_seq_results/domain_dicts/"

In [None]:
for domain_name in filtered_domains_list:

    dirfiles = !ls -t $input_path$domain_name
    filename = dirfiles[0]
    with open(input_path+domain_name+"/"+filename, 'rb') as handle:
        states_dict = pickle.load(handle)
    
    with open(curr_dir[0]+"/../4.parse_Uniprot/domains_canonic_prot/pfam-v30/"+domain_name+"_canonic_prot.pik", 'rb') as handle:
        canonic_protein = pickle.load(handle)
        
    with open(spider_path+domain_name+"_secondary_struct_dict.pik",'rb') as handle:
        spider_domain_dict = pickle.load(handle)
        
    for state in states_dict.keys():
        
        for d in states_dict[state]:
            
            if (spider_domain_dict.has_key(d["ens_gene"]) == False):
                print "Gene is not in the Spider dictionary: "+d["ens_gene"]
                break
            if (spider_domain_dict[d["ens_gene"]].has_key(d["prot_pos"]) == False):
                print "protein position is not in the Spider[gene] dictionary: "+str(d["prot_pos"])
                break
            
            #Getting the relevant Spider entry
            spider_entry = spider_domain_dict[d["ens_gene"]][d["prot_pos"]]
            
            #Validation: Spider has the same AA
            if (spider_entry["spd3_AA"] != d["aa_ref_orig"]):
                print "Spider has a different AA: "+spider_entry["hsa2_AA"]+" dict AA: "+d["aa_ref_orig"]+" Gene: "+d["ens_gene"]+" prot_pos: "+str(d["prot_pos"])
                
            d["spider2-2nd_struct"] = spider_entry["spd3_SS"] #secondary structure prediction
            d["spider2-helix_prob"] = spider_entry["spd3_P(H)"] #alpha-Helix prob.
            d["spider2-sheet_prob"] = spider_entry["spd3_P(E)"] #beta-sheet prob.
            d["spider2-turn_prob"] = spider_entry["spd3_P(C)"] #turn prob.
            d["spider2-angle_Phi"] = spider_entry["spd3_Phi"] #backbone_torsion angle
            d["spider2-angle_Psi"] = spider_entry["spd3_Psi"] #backbone_torsion angle
            d["spider2-angle_tau"] = spider_entry["spd3_Tau(i-2=>i+1)"] #c-alpha angle (i-2=>i+1)
            d["spider2-angle_theta"] = spider_entry ["spd3_Theta(i-1=>i+1)"] #c-alpha angle (i-1=>i+1)
            d["spider2-ASA"] = spider_entry["spd3_ASA"] #Accessible Surface Area (solvent accessibility)
            d["spider2-hsa2_HSEu"] = spider_entry["hsa2_HSEu"] #half-sphere exposure Cα-Cα vectors (HSEα-up)
            d["spider2-hsa2_HSEd"] = spider_entry["hsa2_HSEd"] #half-sphere exposure Cα-Cα vectors (HSEα-down)
            d["spider2-hsb2_HSEu"] = spider_entry["hsb2_HSEu"] #half-sphere exposure Cα-Cβ vectors (HSEα-up)
            d["spider2-hsb2_HSEd"] = spider_entry["hsb2_HSEd"] #half-sphere exposure Cα-Cβ vectors (HSEβ-down)
            d["spider2-hsa2_CN"] = spider_entry["hsa2_CN"] #contanct number for Cα-Cα
            d["spider2-hsb2_CN"] = spider_entry["hsb2_CN"] #contact number for Cα-Cβ
            
        print "Finished state "+str(state)
    #Saving the updated dictionary
    !mkdir -p ext_features_dicts/pfam-v30/$domain_name
    
    with open(curr_dir[0]+"/ext_features_dicts/pfam-v30/"+domain_name+"/"+domain_name+"_hmm_states_dict_"+datetime.date.today().strftime("%m.%d.%y")+".pik", 'wb') as handle:
        pickle.dump(states_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
           
    print "Finished "+domain_name
    break