In [1]:
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict

In [2]:
curr_dir = !pwd
filename = "ALL-biolipsummary-mindist.txt"
shilpa_binding_scores = pd.read_csv(curr_dir[0]+"/"+filename, sep='\t')

### Filter to only binding positions, with at least 10 instances

In [3]:
# Step 1: keep rows whose WeightedScore is strictly greater than 0.1.
passes_score = shilpa_binding_scores["WeightedScore"].gt(0.1)
score_filtered = shilpa_binding_scores.loc[passes_score]
# Step 2: of those, keep rows with NumInstances of at least 10.
# NOTE: the following cell assigns `score_instance_filtered` again (without the
# score threshold), so on a top-to-bottom run that later assignment wins.
has_enough_instances = score_filtered["NumInstances"].ge(10)
score_instance_filtered = score_filtered.loc[has_enough_instances]

### Filter to all positions, with at least 10 instances

In [4]:
# No score threshold here: keep every row of the full table whose
# NumInstances is at least 10.
enough_instances_mask = shilpa_binding_scores["NumInstances"].ge(10)
score_instance_filtered = shilpa_binding_scores.loc[enough_instances_mask]

In [6]:
%%time
#Filter according to ligands
ligand_filter_idx = []
#Compose a list of Trues for the indices of "real ligands"
for i in range(score_instance_filtered.shape[0]):
    ligand = score_instance_filtered.iloc[i]["LigandType"]
    try:
        if ((ligand.find("PEPTIDE_") >= 0)  or (ligand.find("ION_") >= 0) or (ligand.find("DNABASE_") >= 0) or (ligand.find("RNABASE_") >= 0) or (ligand.find("METABOLITE_") >= 0)):
            ligand_filter_idx.append(True)
        else:
            ligand_filter_idx.append(False)
    except:
        ligand_filter_idx.append(False)


score_instance_ligand_filtered = score_instance_filtered[ligand_filter_idx]
score_instance_ligand_filtered.to_csv(curr_dir[0]+"/domains_binding_ligands_table.csv", sep='\t')

CPU times: user 47.7 s, sys: 468 ms, total: 48.2 s
Wall time: 47.6 s


In [9]:
# Write the ligand-filtered table as a tab-separated file in the working
# directory. The ligand-filter cell above writes to this same path as well.
ligands_table_path = curr_dir[0] + "/domains_binding_ligands_table.csv"
score_instance_ligand_filtered.to_csv(ligands_table_path, sep="\t")

In [10]:
# Early sketch of the per-ligand dictionaries. The cell under "Create
# dictionaries according to binding-partner ..." re-creates every one of these
# names as a defaultdict and actually fills them, so this cell only declares
# them.
peptide_dict = {}
ion_dict = {}
dna_dict = {}
rna_dict = {}
# Spelling fixed (was `metaboltie_dict`) so the name matches the one the later
# cells populate and pickle.
metabolite_dict = {}

# Distinct HMM identifiers present in the ligand-filtered table.
domains_ids = (score_instance_ligand_filtered["#HmmID"].unique()).tolist()
# The former nested loop over domains and rows only read MatchState,
# WeightedScore and LigandType into locals and never stored them anywhere, so
# it was dropped: it cost a full iterrows() pass and produced nothing.

### Create dictionaries according to binding-partner and one unified dictionary

In [34]:
%%time
domains_ids = (score_instance_ligand_filtered["#HmmID"].unique()).tolist()

peptide_dict = defaultdict(dict)
ion_dict = defaultdict(dict)
dna_dict = defaultdict(dict)
rna_dict = defaultdict(dict)
metabolite_dict = defaultdict(dict)

domains_binding_dict = defaultdict(dict)
for domain_id in domains_ids:
    domain_name = domain_id[domain_id.find("_")+1:]
    curr_binding_info = score_instance_ligand_filtered[score_instance_ligand_filtered["#HmmID"] == domain_id]
    domains_binding_dict[domain_name]["scores"] = []
    domains_binding_dict[domain_name]["states"] = []
    for index, row in curr_binding_info.iterrows():
        state = row["MatchState"]
        score = row["WeightedScore"]
        ligand = row["LigandType"]
        domains_binding_dict[domain_name]["scores"].append(score)
        domains_binding_dict[domain_name]["states"].append(state)
        
        #Adding to ligand dictionaries:
        if (ligand.find("ION_") >= 0):
            ion_dict[domain_name][state] = score
        elif (ligand.find("PEPTIDE_") >= 0):
            peptide_dict[domain_name][state] = score
        elif (ligand.find("DNABASE_") >= 0):
            dna_dict[domain_name][state] = score
        elif (ligand.find("RNABASE_") >= 0):
            rna_dict[domain_name][state] = score
        elif (ligand.find("METABOLITE_") >= 0):
            metabolite_dict[domain_name][state] = score

CPU times: user 10.8 s, sys: 24 ms, total: 10.8 s
Wall time: 10.8 s


In [35]:
# Serialise each ligand-specific dictionary to its own pickle file in the
# working directory, in the same order as before: ion, peptide, DNA, RNA,
# metabolite.
ligand_pickles = (
    ("/ion_binding_dict.pik", ion_dict),
    ("/peptide_binding_dict.pik", peptide_dict),
    ("/dna_binding_dict.pik", dna_dict),
    ("/rna_binding_dict.pik", rna_dict),
    ("/metabolite_binding_dict.pik", metabolite_dict),
)
for pickle_suffix, ligand_dict in ligand_pickles:
    with open(curr_dir[0] + pickle_suffix, "wb") as handle:
        pickle.dump(ligand_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [9]:
# Serialise the unified per-domain dictionary to a pickle file in the working
# directory, using the highest pickle protocol available.
all_domains_path = curr_dir[0] + "/domains_all_binding_dict.pik"
with open(all_domains_path, "wb") as handle:
    pickle.dump(domains_binding_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)