In [15]:
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict

In [16]:
curr_dir = !pwd
filename = "ALL-biolipsummary-mindist.txt"
shilpa_binding_scores = pd.read_csv(curr_dir[0]+"/"+filename, sep='\t')

### Filter to only binding positions, with at least 10 instances

In [3]:
#Filter according to score
score_filtered = shilpa_binding_scores[shilpa_binding_scores["WeightedScore"] > 0.1]
#Filter according to number of instances
score_instance_filtered = score_filtered[score_filtered["NumInstances"] >= 10]

### Filter to all positions . with at least 10 instances

In [17]:
#Filter according to number of instances only
score_instance_filtered = shilpa_binding_scores[shilpa_binding_scores["NumInstances"] >= 10]

In [18]:
%%time
#Filter according to ligands
ligand_filter_idx = []
#Compose a list of Trues for the indices of "real ligands"
for i in range(score_instance_filtered.shape[0]):
    ligand = score_instance_filtered.iloc[i]["LigandType"]
    try:
        if ((ligand.find("PEPTIDE_") >= 0)  or (ligand.find("ION_") >= 0) or (ligand.find("DNABASE_") >= 0) or (ligand.find("RNABASE_") >= 0) or (ligand.find("METABOLITE_") >= 0)
           or (ligand.find("DNA_") >= 0) or (ligand.find("RNA_") >= 0) or (ligand.find("DNABACKBONE_") >= 0) or (ligand.find("RNABACKBONE_") >= 0)):
            ligand_filter_idx.append(True)
        else:
            ligand_filter_idx.append(False)
    except:
        ligand_filter_idx.append(False)


score_instance_ligand_filtered = score_instance_filtered[ligand_filter_idx]
score_instance_ligand_filtered.to_csv(curr_dir[0]+"/domains_binding_ligands_table.csv", sep='\t')

CPU times: user 46.2 s, sys: 0 ns, total: 46.2 s
Wall time: 45.8 s


In [9]:
score_instance_ligand_filtered.to_csv(curr_dir[0]+"/domains_binding_ligands_table.csv", sep='\t')

### Create dictionaries according to binding-partner and one unified dictionary

In [24]:
%%time
domains_ids = (score_instance_ligand_filtered["#HmmID"].unique()).tolist()

peptide_dict = defaultdict(dict)
ion_dict = defaultdict(dict)
metabolite_dict = defaultdict(dict)
dna_dict = defaultdict(dict)
rna_dict = defaultdict(dict)
dnabase_dict = defaultdict(dict)
rnabase_dict = defaultdict(dict)
dnabackbone_dict = defaultdict(dict)
rnabackbone_dict = defaultdict(dict)


domains_binding_dict = defaultdict(dict)
for domain_id in domains_ids:
    domain_name = domain_id[domain_id.find("_")+1:]
    curr_binding_info = score_instance_ligand_filtered[score_instance_ligand_filtered["#HmmID"] == domain_id]
    domains_binding_dict[domain_name]["scores"] = []
    domains_binding_dict[domain_name]["states"] = []
    for index, row in curr_binding_info.iterrows():
        state = row["MatchState"]
        score = row["WeightedScore"]
        ligand = row["LigandType"]
        domains_binding_dict[domain_name]["scores"].append(score)
        domains_binding_dict[domain_name]["states"].append(state)
        
        #Adding to ligand dictionaries:
        if (ligand.find("ION_") >= 0):
            ion_dict[domain_name][state] = score
        elif (ligand.find("PEPTIDE_") >= 0):
            peptide_dict[domain_name][state] = score
        elif (ligand.find("METABOLITE_") >= 0):
            metabolite_dict[domain_name][state] = score
        elif (ligand.find("DNA_") >= 0):
            dna_dict[domain_name][state] = score
        elif (ligand.find("RNA_") >= 0):
            rna_dict[domain_name][state] = score
        elif (ligand.find("DNABASE_") >= 0):
            dnabase_dict[domain_name][state] = score
        elif (ligand.find("RNABASE_") >= 0):
            rnabase_dict[domain_name][state] = score
        elif (ligand.find("DNABACKBONE_") >= 0):
            dnabackbone_dict[domain_name][state] = score
        elif (ligand.find("RNABACKBONE_") >= 0):
            rnabackbone_dict[domain_name][state] = score
        

CPU times: user 14.9 s, sys: 0 ns, total: 14.9 s
Wall time: 14.9 s


In [25]:
with open(curr_dir[0]+"/binding_dicts/ion_binding_dict.pik", 'wb') as handle:
    pickle.dump(ion_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open(curr_dir[0]+"/binding_dicts/peptide_binding_dict.pik", 'wb') as handle:
    pickle.dump(peptide_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open(curr_dir[0]+"/binding_dicts/metabolite_binding_dict.pik", 'wb') as handle:
    pickle.dump(metabolite_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open(curr_dir[0]+"/binding_dicts/dna_binding_dict.pik", 'wb') as handle:
    pickle.dump(dna_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open(curr_dir[0]+"/binding_dicts/rna_binding_dict.pik", 'wb') as handle:
    pickle.dump(rna_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open(curr_dir[0]+"/binding_dicts/dnabase_binding_dict.pik", 'wb') as handle:
    pickle.dump(dnabase_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open(curr_dir[0]+"/binding_dicts/rnabase_binding_dict.pik", 'wb') as handle:
    pickle.dump(rnabase_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(curr_dir[0]+"/binding_dicts/dnabackbone_binding_dict.pik", 'wb') as handle:
    pickle.dump(dnabackbone_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open(curr_dir[0]+"/binding_dicts/rnabackbone_binding_dict.pik", 'wb') as handle:
    pickle.dump(rnabackbone_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [26]:
with open(curr_dir[0]+"/binding_dicts/domains_all_binding_dict.pik", 'wb') as handle:
    pickle.dump(domains_binding_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [14]:
for ligand in shilpa_binding_scores["LigandType"].unique():
    if str(ligand).find("_") > 0:
        print ligand

ALL_
DRUGLIKE_
ION_
METABOLITE_
PEPTIDE_
SM_
DNABACKBONE_
DNABASE_
DNA_
NUCACID_
RNABACKBONE_
RNABASE_
RNA_
