## Add SPIDER chemical scores to domains positions

For each protein position, update dictionaries with SPIDER2 characteristics.

input: SPIDER2 info was saved at: "ExAC/SPIDER/SPIDER2/protein_seq_results/domain_dicts/"

### Output:
Creates a new dictionary (or updates the existing one) with the SPIDER2 characteristics of each protein position.

In [1]:
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
import datetime

In [2]:
curr_dir = !pwd
pfam_version = "32"
domains_th = "10"
update_same_file = True

if (update_same_file):
    input_path = curr_dir[0]+"/ext_features_dicts/pfam-v"+pfam_version+"/"
else:
    input_path = curr_dir[0]+"/../5.HMM_alter_align/domains_states_dicts/pfam-v"+pfam_version+"/"

#Read the list of domains
if (pfam_version == "32"):
    with open(curr_dir[0]+"/../5.domains_stats/pfam-v"+pfam_version+"/regular_human_domains_list.pik", 'rb') as handle:
        filtered_domains_list = pickle.load(handle)
else:
    with open(curr_dir[0]+"/../5.domains_stats/pfam-v"+pfam_version+"/filtered"+domains_th+"_list.pik", 'rb') as handle:
        filtered_domains_list = pickle.load(handle)
filtered_domains_list = filtered_domains_list.sort()
print (len(filtered_domains_list))

#Reading sequence dicts
with open(curr_dir[0]+"/../3.parse_HMMER/canonic_prot_seq/pfam-v32/all_domains_genes_prot_seq.pik") as handle:
    seq_dict = pickle.load(handle)

spider_path = curr_dir[0]+"/../SPIDER/SPIDER2/protein_seq_results/domain_dicts/pfam-v"+pfam_version+"/"

6503


In [3]:
#%%time
tmp_dict = dict()
tmp_dict["domain"] = []
tmp_dict["gene"] = []
tmp_dict["canonic_prot"] = []
spider_problems_domains_list = ["Exo_endo_phos", "SKICH", "TIR", "V-set", "WAP", "zf-C2H2", "zf-CCCH"]

for domain_name in spider_problems_domains_list:
    
    dirfiles = !ls -t $input_path$domain_name
    filename = dirfiles[0]
    with open(input_path+domain_name+"/"+filename, 'rb') as handle:
        states_dict = pickle.load(handle)
    
    with open(curr_dir[0]+"/../4.parse_Uniprot/domains_canonic_prot/pfam-v"+pfam_version+"/"+domain_name+"_canonic_prot.pik", 'rb') as handle:
        canonic_protein = pickle.load(handle)
        
    with open(spider_path+domain_name+"_secondary_struct_dict.pik",'rb') as handle:
        spider_domain_dict = pickle.load(handle)
        
    for state in states_dict.keys():

        for d in states_dict[state]:
            ens_prot = canonic_protein[d["ens_gene"]]
            seq = seq_dict[d["ens_gene"]][ens_prot]
            skip = seq[0:d["prot_pos"]].count('X')+seq[0:d["prot_pos"]].count('*')+seq[0:d["prot_pos"]].count('.')+seq[0:d["prot_pos"]].count('-')
            if (skip > 0):
                print skip
            spider_pos = d["prot_pos"]-skip
            
            if (spider_domain_dict.has_key(d["ens_gene"]) == False):
                print "Gene is not in the Spider dictionary: "+d["ens_gene"]
                continue
            if (spider_domain_dict[d["ens_gene"]].has_key(spider_pos) == False):
                print "protein position is not in the Spider[gene] dictionary: "+str(d["prot_pos"])
                tmp_dict["domain"].append(domain_name)
                tmp_dict["gene"].append(d["ens_gene"])
                tmp_dict["canonic_prot"].append(ens_prot)
                continue
            
            #Getting the relevant Spider entry
            spider_entry = spider_domain_dict[d["ens_gene"]][spider_pos]
            
            #Validation: Spider has the same AA
            if (spider_entry["spd3_AA"] != d["aa_ref_orig"]):
                print "Spider has a different AA: "+spider_entry["hsa2_AA"]+" dict AA: "+d["aa_ref_orig"]+" Gene: "+d["ens_gene"]+" prot_pos: "+str(d["prot_pos"])
                
            d["spider2-2nd_struct"] = spider_entry["spd3_SS"] #secondary structure prediction
            d["spider2-helix_prob"] = spider_entry["spd3_P(H)"] #alpha-Helix prob.
            d["spider2-sheet_prob"] = spider_entry["spd3_P(E)"] #beta-sheet prob.
            d["spider2-turn_prob"] = spider_entry["spd3_P(C)"] #turn prob.
            d["spider2-angle_Phi"] = spider_entry["spd3_Phi"] #backbone_torsion angle
            d["spider2-angle_Psi"] = spider_entry["spd3_Psi"] #backbone_torsion angle
            d["spider2-angle_tau"] = spider_entry["spd3_Tau(i-2=>i+1)"] #c-alpha angle (i-2=>i+1)
            d["spider2-angle_theta"] = spider_entry ["spd3_Theta(i-1=>i+1)"] #c-alpha angle (i-1=>i+1)
            d["spider2-ASA"] = spider_entry["spd3_ASA"] #Accessible Surface Area (solvent accessibility)
            d["spider2-hsa2_HSEu"] = spider_entry["hsa2_HSEu"] #half-sphere exposure Cα-Cα vectors (HSEα-up)
            d["spider2-hsa2_HSEd"] = spider_entry["hsa2_HSEd"] #half-sphere exposure Cα-Cα vectors (HSEα-down)
            d["spider2-hsb2_HSEu"] = spider_entry["hsb2_HSEu"] #half-sphere exposure Cα-Cβ vectors (HSEβ-up)
            d["spider2-hsb2_HSEd"] = spider_entry["hsb2_HSEd"] #half-sphere exposure Cα-Cβ vectors (HSEβ-down)
            d["spider2-hsa2_CN"] = spider_entry["hsa2_CN"] #contanct number for Cα-Cα
            d["spider2-hsb2_CN"] = spider_entry["hsb2_CN"] #contact number for Cα-Cβ
            
        #print "Finished state "+str(state)
    #Saving the updated dictionary
    !mkdir -p ext_features_dicts/pfam-v32/$domain_name
    
    with open(curr_dir[0]+"/ext_features_dicts/pfam-v"+pfam_version+"/"+domain_name+"/"+domain_name+"_hmm_states_dict_"+datetime.date.today().strftime("%m.%d.%y")+".pik", 'wb') as handle:
        pickle.dump(states_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
           
    print "Finished "+domain_name

Finished Exo_endo_phos
Finished SKICH
Finished TIR
Finished V-set
Finished WAP
Finished zf-C2H2
Finished zf-CCCH


In [14]:
# One row per distinct (domain, gene, canonical protein) combination recorded
# in tmp_dict, re-indexed from 0 after the duplicates are dropped.
tmp_df = pd.DataFrame.from_dict(tmp_dict).drop_duplicates().reset_index(drop=True)

In [16]:
# Written to the current working directory; the content is tab-separated
# even though the file name ends in .csv.
missing_positions_file = "protein_positions_missing_in_spider_dict.csv"
tmp_df.to_csv(missing_positions_file, sep="\t")

In [43]:
# Debug probe: canonical protein id of one gene, taken from whichever
# canonic_protein dictionary is currently loaded.
ens_prot = canonic_protein["ENSG00000167612.8"]

In [44]:
# Display the protein id looked up for the probed gene
ens_prot

'ENSP00000301190.6'

In [42]:
# Debug probe: all protein sequences stored for this gene, keyed by protein id
seq_dict["ENSG00000167612.8"]

{'ENSP00000301190.6': 'MKVQPSVTCVASWGGIVHLEAFGDPVIVLRGAWAVPRVDCLIDTLRTPNASCMRKGTHLLVPCLEEEELALHRRRLDMSEALPCPGKETPTPGCRLGALYWACVHNDPTQLQAILDGGVSPEEATQVDSNGRTGLMVACYHGFQSVVALLSHCPFLDVNQQDKGGDTALMLAAQAGHVPLVSLLLNYYVGLDLERRDQRGLTALMKAAMRNRCADLTAVDPVRGKTALEWAVLTDSFDTVWRIRQLLRRPQVEQLSQHYKPEWPALSGLVAQAQAQAQVAPSLLERLQATLSLPFAPSPQEGGVLDHLVTATTSLASPFVTTACHTLCPDHPPSLGTRSKSVPELLGTAPPPPLVPQSPPGSPQRSPWVFVPYQSPQGILSKCLQWLQPRDSTSPRPQVPKILLSKASSSSHQCQPKPSPSGHQSLALPLWRYQELRIEKRKQEEEARMAQK*',
 'ENSP00000344690.4': 'MVACYHGFQSVVALLSHCPFLDVNQQDKGGDTALMLAAQAGHVPLVSLLLNYYVGLDLERRDQRGLTALMKAAMRNRCECVATLLMAGADLTAVDPVRGKTALEWAVLTDSFDTVWRIRQLLRRPQVEQLSQHYKPEWPALSGLVAQAQAQAQVAPSLLERLQATLSLPFAPSPQEGGVLDHLVTATTSLASPFVTTACHTLCPDHPPSLGTRSKSVPELLVPAEAQSFRTPKSGPSSLAIPGAQDREEETGGGGQNGTEVGEDGIGQAGNR*'}

In [45]:
# Debug probe: residue at 0-based index 98 of the ENSP00000344690.4 sequence.
# NOTE(review): the canonical protein displayed for this gene is ENSP00000301190.6 -
# confirm this is the isoform that was meant to be inspected.
seq_dict["ENSG00000167612.8"]["ENSP00000344690.4"][98]

'G'

In [33]:
# Debug probe: SPIDER2 entry stored under position key 96 for this gene in the
# currently loaded spider_domain_dict
spider_domain_dict["ENSG00000167612.8"][96]

{'hsa2_AA': 'P',
 'hsa2_CN': '24.8',
 'hsa2_HSEd': '16.8',
 'hsa2_HSEu': '7.7',
 'hsb2_AA': 'P',
 'hsb2_CN': '26.0',
 'hsb2_HSEd': '18.0',
 'hsb2_HSEu': '7.6',
 'spd3_AA': 'P',
 'spd3_ASA': '74.7',
 'spd3_P(C)': '0.924',
 'spd3_P(E)': '0.027',
 'spd3_P(H)': '0.049',
 'spd3_Phi': '-67.3',
 'spd3_Psi': '-19.9',
 'spd3_SS': 'C',
 'spd3_Tau(i-2=>i+1)': '-150.3',
 'spd3_Theta(i-1=>i+1)': '92.0'}

In [40]:
# Extended list of domain names; assigning it replaces the shorter
# spider_problems_domains_list used by the annotation loop.
spider_problems_domains_list = [
    "Ank_2", "Ank_4", "Ank_5", "Asp", "CD45", "Cys_knot",
    "DENN", "DUF1908", "DUF4187",
    "EF-hand_1", "EF-hand_5", "EF-hand_6", "EF-hand_7", "EF-hand_8", "EF-hand_9",
    "EFhand_Ca_insen", "ELM2", "Exo_endo_phos",
    "FYVE", "G-patch", "G-patch_2", "GRAM", "GSHPx", "IQ_SEC7_PH",
    "LRR_1", "LRR_12", "LRR_6", "LRR_8",
    "MFS_1", "Myb_DNA-binding", "Myotub-related",
    "PDZ", "PDZ_6", "PH", "PNMA", "PTP_N", "Pkinase", "Pkinase_Tyr",
    "RNase_T", "RUN", "Rdx",
    "SBF2", "SKICH", "Sec7", "SelP_C", "SelP_N", "SelR", "Sep15_SelM",
    "T4_deiodinase", "TAXi_N", "TIR", "Trefoil",
    "V-set", "WAP", "Y_phosphatase",
    "dDENN", "fn3", "uDENN",
    "zf-C2H2", "zf-C2H2_4", "zf-CCCH", "zf-H2C2_5",
]

In [41]:
# Number of domain names in the extended list
len(spider_problems_domains_list)

62