### Parse InteracDome results
First, run the entire pipeline described in: https://github.com/Singh-Lab/InteracDome

After completing the pipeline described in the README, also run: python interacdome_webserver.py and save the results in: InteracDome/processed_data/domains/binding_scores

Then run it a second time with: python interacdome_webserver.py --webserver --pfam_path path-to-dir/ExAC/run_hmmer/pfam_hmms/pfam_hmms-v31/ and save the results in: InteracDome/interacdome-webserver/

In [1]:
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict

#### Read scores file and domain-ligand pairs file

In [33]:
#Constants
curr_dir = !pwd
real_ligands = ["PEPTIDE_", "ION_", "METABOLITE_", "DNA_", "DNABASE_", "DNABACKBONE_", "RNA_", "RNABASE_", "RNABACKBONE_", "SM_"]

#The table that contain all the propensities and thresholds for everything
prop_input_path = "/home/anat/InteracDome/interacdome-webserver/"
prop_filename = "interacdome_fordownload.tsv"
interacdome_binding_scores = pd.read_csv(prop_input_path+prop_filename, sep='\t')

#For sanity checks and other qa, these two files contain the "conservatively modeled", positions with propensity above the precision 0.5 threshold.
pairs_input_path = "/home/anat/InteracDome/processed_data/domains/binding_scores/"
pairs_filename = "mindist_passing-domains_overall-precision-0.5_unique-instances-3_structures-3.txt"
domain_ligand_pairs = pd.read_csv(pairs_input_path+pairs_filename, sep='\t', skiprows=6)

conservativelly_modeled_filename = "mindist_passing-binding-propensities_overall-precision-0.5_unique-instances-3_structures-3_sitebased-precision-0.5.txt"
domains_conservativelly_modeled = pd.read_csv(pairs_input_path+conservativelly_modeled_filename, sep='\t', skiprows=8)

#### Filter to the real ligands we are working on

In [35]:
def _only_real_ligands(table):
    """Return the rows of *table* whose "ligand_type" value appears in real_ligands."""
    keep = table["ligand_type"].isin(real_ligands)
    return table[keep]


# Apply the same ligand-type filter to each of the three loaded tables.
domain_ligand_pairs_filtered = _only_real_ligands(domain_ligand_pairs)
interacdome_binding_scores_filtered = _only_real_ligands(interacdome_binding_scores)
domains_conservativelly_modeled_filtered = _only_real_ligands(domains_conservativelly_modeled)

#### Filter to 3+ nonidentical instances across 3 different structures (credible analysis)

In [42]:
# Keep only rows with at least 3 non-identical instances AND at least 3 structures.
# Both conditions are combined into a single boolean mask: chaining df[mask1][mask2]
# applies a mask built on the unfiltered frame to an already-filtered frame, which
# forces pandas to reindex the mask and emit a warning.
enough_instances = interacdome_binding_scores_filtered["num_nonidentical_instances"] >= 3
enough_structures = interacdome_binding_scores_filtered["num_structures"] >= 3
interacdome_binding_scores_filtered_struct = interacdome_binding_scores_filtered[enough_instances & enough_structures]

### Create per-ligand dictionary by domain -> ligand -> propensities and thresholds

In [97]:
def _threshold_or_nan(row, column):
    """Return row[column] converted to float, or NaN if it is missing or not numeric."""
    try:
        return float(row[column])
    except (KeyError, TypeError, ValueError):
        # Only conversion/lookup failures are mapped to NaN; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        return np.nan


# Nested mapping: domain name -> ligand name -> per-pair dict holding the
# "prop_th_<precision>" thresholds and a "states_props" dict of state -> propensity.
propensity_ligands_dict = defaultdict(dict)

for _, row in interacdome_binding_scores_filtered_struct.iterrows():

    # Ligand name: lower-cased with the last character dropped
    # (every entry of real_ligands ends with "_").
    ligand_type = row["ligand_type"]
    ligand_type_edited = ligand_type.lower()[:-1]

    # Domain name: everything after the first "_" in pfam_id
    # (the whole pfam_id when it contains no "_").
    pfam_id = row["pfam_id"]
    domain_name = pfam_id.split("_", 1)[-1]

    # Per domain-ligand record; defaultdict(dict) creates "states_props" on first access.
    domain_ligand_dict = defaultdict(dict)

    # Propensity thresholds at each precision level (NaN when unavailable)
    for precision in ("0.1", "0.25", "0.5", "0.75"):
        domain_ligand_dict["prop_th_" + precision] = _threshold_or_nan(
            row, "propensity_at_precision_" + precision
        )

    # Propensities: comma-separated values, stored under 1-based state numbers
    for state, propensity in enumerate(row["binding_propensities"].split(","), start=1):
        domain_ligand_dict["states_props"][state] = float(propensity)

    propensity_ligands_dict[domain_name][ligand_type_edited] = domain_ligand_dict

In [99]:
# Write the propensity dictionary as a pickle file under the directory captured in curr_dir
output_path = curr_dir[0] + "/domains_ligands_propensity_dict.pik"
with open(output_path, "wb") as out_file:
    pickle.dump(
        propensity_ligands_dict,
        out_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )