# BindingDB Dataset Processing

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from tqdm import tqdm 
import matplotlib.pyplot as plt
tqdm.pandas()
sns.set()

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


In [3]:
BINDINGDB_PATH = '../data/BindingDB_All.tsv'
map_cols = {
    "BindingDB Reactant_set_id": "BindingDB_ID",
    "BindingDB Ligand Name": "ligand_name",
    "Ligand SMILES": "SMILES", 
    "Target Name Assigned by Curator or DataSource": "target_name",
    "Target Source Organism According to Curator or DataSource": "target_organism",
    "IC50 (nM)": "IC50",
    "Ki (nM)": "Ki", 
    "Kd (nM)": "Kd", 
    "EC50 (nM)": "EC50", 
    "PubChem AID": "PubChem_AID",
    "PubChem CID": "PubChem_CID", # Compound ID
    "PubChem SID": "PubChem_SID", # Substance ID. What's the difference?
    "Ligand HET ID in PDB": "ligand_het_id_pdb",
    "ChEBI ID of Ligand": "ligand_ChEBI_ID",
    "ChEMBL ID of Ligand": "ligand_ChEMBL_ID",
    "DrugBank ID of Ligand": "ligand_DrugBank_ID",
    "KEGG ID of Ligand": "ligand_KEGG_ID",
    "ZINC ID of Ligand": "ligand_ZINC_ID",
    "PDB ID(s) for Ligand-Target Complex": "interaction_PDB_IDs",
    "Number of Protein Chains in Target (>1 implies a multichain complex)": "n_chains",
    "BindingDB Target Chain  Sequence": "target_sequence",
    "PDB ID(s) of Target Chain": "target_PDB_IDs",
    "UniProt (SwissProt) Primary ID of Target Chain": "UniProt_S_ID",
    "UniProt (SwissProt) Recommended Name of Target Chain": "UniProt_S_entry_name",
    "UniProt (SwissProt) Entry Name of Target Chain": "UniProt_S_rec_name",
    "UniProt (SwissProt) Alternative ID(s) of Target Chain": "UniProt_S_alt_IDs",
    "UniProt (TrEMBL) Submitted Name of Target Chain": "UniProt_T_submitted_name",
    "UniProt (TrEMBL) Entry Name of Target Chain": "UniProt_T_entry_name",
    "UniProt (TrEMBL) Primary ID of Target Chain": "UniProt_T_ID",
    "UniProt (TrEMBL) Alternative ID(s) of Target Chain": "UniProt_T_alt_IDs"
}

BindingDB dataset contains information for the interaction of one small molecule ligand with one protein target. Each row includes a SMILES string for the ligand, the identity of the target, the measured affinity, the source of the data, and links to related information in other databases. The size of dataset is huge for memory, so we process it in chunks until it is filtered and the size is reasonable.

In [4]:
BindingDB_interactions = pd.read_csv(BINDINGDB_PATH, 
                                     sep='\t', 
                                     usecols=map_cols.keys(), 
                                     chunksize=100000)

In [5]:
def process_as_chunks(chunk_list, fnc):
    return [fnc(chunk) for chunk in chunk_list]

In [6]:
def rename(df):
    return df.rename(columns=map_cols)

chunk_interactions = process_as_chunks(BindingDB_interactions, rename) 

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


In [7]:
def size(df):
    return df.shape[0]
chunk_sizes = process_as_chunks(chunk_interactions, size)
num_of_interactions = sum(chunk_sizes)

BindingDB dataset contains ~2M interactions. To train our models, we need sequences of protein-ligand pairs. First, we drop the interactions missing either ligand SMILES or protein sequence.   

## Dropping Missing Fields

In [8]:
def drop_missing_sequences(df):
    return df.dropna(axis=0, subset=['SMILES', 'target_sequence'], how='any')

chunk_interactions = process_as_chunks(chunk_interactions, drop_missing_sequences) 
chunk_sizes = process_as_chunks(chunk_interactions, size)
num_of_interactions_with_seq = sum(chunk_sizes)
print('Interactions having protein and ligand sequence', num_of_interactions_with_seq)

Interactions having protein and ligand sequence 2035703


Next, we drop the interactions in which the target proteins has multichains.

In [9]:
def drop_multichain_targets(df):
    return df.query('n_chains == 1')
chunk_interactions = process_as_chunks(chunk_interactions, drop_multichain_targets) 
chunk_sizes = process_as_chunks(chunk_interactions, size)
print('Interactions with single chain protein', sum(chunk_sizes))

Interactions with single chain protein 1940233


To construct a test set of unseen proteins, we will hold out some families. The family information is not included in BindingDB, however it could be obtained from Pfam via proteins' UniProt ID. Therefore, we drop the interactions where the protein do not have UniProt ID.

In [10]:
def drop_wout_interaction_uniprot_id(df):
    return df.dropna(axis=0, subset=['UniProt_S_ID'])
chunk_interactions_with_uniprot_id = process_as_chunks(chunk_interactions, drop_wout_interaction_uniprot_id) 
chunk_sizes = process_as_chunks(chunk_interactions_with_uniprot_id, size)
print('Interactions with protein identifier', sum(chunk_sizes))

Interactions with protein identifier 1779696


Here, we export the proteins that we filtered to retrieve the family information of these from Pfam.

In [12]:
def get_uniprot_id_with_seq(df):
    return df[['UniProt_S_ID', 'target_name', 'target_sequence']]
protein_uniprot_ids = process_as_chunks(chunk_interactions_with_uniprot_id, get_uniprot_id_with_seq) 
protein_seq = pd.concat(protein_uniprot_ids)
protein_seq.drop_duplicates(inplace=True)
protein_seq.to_csv('../data/BindingDB_onechain_protein_seq.csv', index=None)
print('Number of proteins with UniProt ID', protein_df.shape[0])

Number of proteins with UniProt ID 6421


We will use affinity data to decide whether protein-ligand pair is likely to interact or not and we will label interactions as positive and negative based on affinity score. So, we drop the interactions not having any affinity measurement. 

In [13]:
def drop_interactions_without_affinity(df):
    return df.dropna(axis=0, subset=['IC50', 'Ki', 'EC50', 'Kd'], how='all')
chunk_interactions_wout_aff = process_as_chunks(chunk_interactions_with_uniprot_id, drop_interactions_without_affinity) 
chunk_sizes = process_as_chunks(chunk_interactions_wout_aff, size)
print('Interactions with affinity', sum(chunk_sizes))

Interactions with affinity 1777906


## Canonicalization

SMILES representation is a way to represent a 2D molecular graph as a 1D string. A molecular graph could possibly have many SMILES strings. By canonicalization, we map different SMILES representations of a molecular graph to a unique SMILES and also check whether SMILES is valid.  

In [14]:
from rdkit.Chem import MolFromSmiles, MolToSmiles, SanitizeMol, Descriptors
def canonicalize_smiles(s):
    mol = MolFromSmiles(s)
    if mol:
        try:
            SanitizeMol(mol)
            return MolToSmiles(mol), Descriptors.ExactMolWt(mol)
        except:
            pass
    return None, None

def normalize_ligands(df):
    df['canonical_SMILES'], df['MolWt'] = zip(*df['SMILES'].progress_apply(canonicalize_smiles))     
    df.dropna(subset=['canonical_SMILES'], inplace=True)
    return df

chunk_normalized_interactions = process_as_chunks(chunk_interactions_wout_aff, normalize_ligands)
chunk_sizes = process_as_chunks(chunk_normalized_interactions, size)
print('Interactions with valid SMILES', sum(chunk_sizes))

100%|██████████████████████████████████████████████████████████████████████████| 77932/77932 [00:57<00:00, 1346.27it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
100%|██████████████████████████████████████████████████████████████████████████| 90533/90533 [01:04<00:00, 1399.93it/s]
100%|██████████████████████████████████████████████████████████████████████████| 93907/93907 [01:10<00:00, 1328.48it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the doc

100%|██████████████████████████████████████████████████████████████████████████| 80358/80358 [00:57<00:00, 1388.79it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
100%|██████████████████████████████████████████████████████████████████████████| 94826/94826 [01:11<00:00, 1324.39it/s]
100%|██████████████████████████████████████████████████████████████████████████| 91791/91791 [01:12<00:00, 1262.22it/s]
100%|██████████████████████████████████████████████████████████████████████████| 95677/95677 [01:09<00:00, 1367.94it/s]
100%|█████████████████████████

Interactions with valid SMILES 1776394


## Molecular Weight

In [15]:
def filter_heavy_molecules(df):
    return df[df['MolWt'] < 1000]
chunk_normalized_interactions_small = process_as_chunks(chunk_normalized_interactions, filter_heavy_molecules) 

## Labeling interactions

### Geometric Mean

A ligand is considered as **active** towards a target protein if affinity score of the interaction is under a certain threshold. This threshold changes depending on the drug discovery stage. [Gleeson et al., 2011](https://www.nature.com/articles/nrd3367) analyze publicly available drugs and molecules from ChEMBL dataset and suggest that pXC50 threshold might be ~7.5 during early research and development in drug discovery (The hit-like molecules have a pXC50 of 6 (1 microMolar potency) and the desired output of lead optimization program is a pXC50 of 9 (1 nanoMolar potency)). In this study, we aim to generate lead-like molecules. So, we set the threshold as a pXC50 of 7 which corresponds to 100 nanoMolar. Ki and Kd values are converted to IC50 by a factor 2 by following [the work](https://ascpt.onlinelibrary.wiley.com/doi/full/10.1002/cpt.1846) and based on [the evidence](https://dmd.aspetjournals.org/content/dmd/43/11/1744.full.pdf?with-ds=yes). Herein, we label the interactions with affinity score is less than 100 nm as **active**.

In case of more than one reported affinity score and assay, we calculate the geometric mean of these values and compare this with the threshold.

In [16]:
THRESHOLD = 100
affinity_metrics = ['IC50', 'Ki', 'EC50', 'Kd']

def convert_to_numeric(df):
    for metric in affinity_metrics:
        #df[metric + '_n'] = df[metric].apply(numeric_affinity)
        df[metric + '_n'] = pd.to_numeric(df[metric], errors='coerce') # exclude not exact affinity scores (>100, <10000)
    df['Ki_m'] = df['Ki_n'] * 2
    df['Kd_m'] = df['Kd_n'] * 2
    return df
chunk_normalized_interactions_num = process_as_chunks(chunk_normalized_interactions_small, convert_to_numeric) 
all_interactions = pd.concat(chunk_normalized_interactions_num)
all_interactions = convert_to_numeric(all_interactions)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [17]:
all_interactions[['UniProt_S_ID', 'canonical_SMILES', 'IC50', 'Kd', 'EC50', 'Ki', 'IC50_n', 'Kd_n', 'EC50_n', 'Ki_n', 'Kd_m', 'Ki_m']]

Unnamed: 0,UniProt_S_ID,canonical_SMILES,IC50,Kd,EC50,Ki,IC50_n,Kd_n,EC50_n,Ki_n,Kd_m,Ki_m
142,P08684,Cc1nc(CN2CCN(c3c(Cl)cnc4[nH]c(-c5cn(C)nc5C)nc3...,>50000,,,,,,,,,
144,P51570,O=C1CCCC2=C1C1(CCS(=O)(=O)C1)N=C(Nc1nc3ccccc3o...,6676.9,,,,6676.9,,,,,
198,P42574,CN(Cc1ccc(O)c(C(=O)O)c1)Cc1ccc(C(=O)N[C@@H](CC...,,,,90,,,,90.0,,180.0
199,P29466,O=C[C@H](CC(=O)O)NC(=O)c1ccc(CNS(=O)(=O)c2ccc(...,,,,160,,,,160.0,,320.0
200,P29466,O=C[C@H](CC(=O)O)NC(=O)c1ccc(CNS(=O)(=O)c2ccc(...,,,,3900,,,,3900.0,,7800.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2036270,P07943,O=C(O)Cn1c(=O)c(CCc2ccccc2)nc2ccccc21,143,,,,143.0,,,,,
2036271,P07943,CC(C)(C)NC(=O)/C=C/c1nc2ccccc2n(CC(=O)O)c1=O,1860,,,,1860.0,,,,,
2036272,P07943,O=C(O)Cn1c(=O)c(/C=C/c2ccc(F)cc2)nc2cc(Cl)ccc21,660,,,,660.0,,,,,
2036273,P07943,O=C(O)Cn1c(=O)c(/C=C/c2ccccc2)nc2ccc(Cl)cc21,3880,,,,3880.0,,,,,


In [18]:
mean_affinity_metrics = ['IC50_n', 'Ki_m', 'EC50_n', 'Kd_m']
mean_affinity_metrics_n = ['IC50_n', 'Ki_n', 'EC50_n', 'Kd_n']
from scipy.stats.mstats import gmean
def geometric_mean(row, metrics):
    return gmean([row[metric] for metric in metrics if not pd.isnull(row[metric])])
all_interactions.dropna(axis=0, subset=['IC50_n', 'Ki_n', 'EC50_n', 'Kd_n'], how='all', inplace=True)
all_interactions['XC50_mean'] = all_interactions.progress_apply(geometric_mean, metrics=mean_affinity_metrics, axis=1)
all_interactions['XC50_mean_n'] = all_interactions.progress_apply(geometric_mean, metrics=mean_affinity_metrics_n, axis=1)

  log_a = np.log(np.array(a, dtype=dtype))
100%|█████████████████████████████████████████████████████████████████████| 1414261/1414261 [01:08<00:00, 20786.26it/s]
100%|█████████████████████████████████████████████████████████████████████| 1414261/1414261 [01:12<00:00, 19565.47it/s]


In [19]:
all_interactions_gmean = all_interactions.groupby(['UniProt_S_ID', 'canonical_SMILES'])['XC50_mean'].apply(gmean).rename("XC50").reset_index()
all_interactions_gmean_n = all_interactions.groupby(['UniProt_S_ID', 'canonical_SMILES'])['XC50_mean_n'].apply(gmean).rename("XC50_n").reset_index()

In [20]:
all_interactions_gmean = all_interactions_gmean.reset_index()
all_interactions.shape[0], all_interactions_gmean.shape[0]

(1414261, 1093710)

In [21]:
all_data = pd.merge(all_interactions_gmean, all_interactions, on=['UniProt_S_ID', 'canonical_SMILES'], how='left')
all_data = pd.merge(all_interactions_gmean_n, all_data, on=['UniProt_S_ID', 'canonical_SMILES'], how='left')
all_data.head()

Unnamed: 0,UniProt_S_ID,canonical_SMILES,XC50_n,index,XC50,BindingDB_ID,SMILES,ligand_name,target_name,target_organism,...,UniProt_T_alt_IDs,MolWt,IC50_n,Ki_n,EC50_n,Kd_n,Ki_m,Kd_m,XC50_mean,XC50_mean_n
0,A0A087WW23,CCC(CC)(Cc1nc2c(F)cc(OCc3ccc(C)cn3)cc2n1Cc1ccc...,6.0,0,12.0,282503,CCC(CC)(Cc1nc2c(F)cc(OCc3ccc(C)cn3)cc2n1Cc1ccc...,"US8952177, 65::US9089569, 65::US9695149, 65",5-lipoxygenase-activating protein (FLAP),,...,,553.138,,6.0,,,12.0,,12.0,6.0
1,A0A087WW23,CCC(CC)(Cc1nc2c(F)cc(OCc3ccc(C)cn3)cc2n1Cc1ccc...,49.0,1,98.0,282509,CCC(CC)(Cc1nc2c(F)cc(OCc3ccc(C)cn3)cc2n1Cc1ccc...,"US8952177, 71::US9089569, 71::US9695149, 71",5-lipoxygenase-activating protein (FLAP),,...,,489.243,,49.0,,,98.0,,98.0,49.0
2,A0A087WW23,CCC(CC)(Cc1nc2c(F)cc(OCc3ccc(C)cn3)cc2n1Cc1ccc...,17.0,2,34.0,282507,CCC(CC)(Cc1nc2c(F)cc(OCc3ccc(C)cn3)cc2n1Cc1ccc...,"US8952177, 69::US9089569, 69::US9695149, 69",5-lipoxygenase-activating protein (FLAP),,...,,559.209,,17.0,,,34.0,,34.0,17.0
3,A0A087WW23,CCC(CC)(Cc1nc2ccc(OCc3ccc(C)cn3)cc2n1Cc1ccc(Br...,10.0,3,20.0,282446,CCC(CC)(Cc1nc2ccc(OCc3ccc(C)cn3)cc2n1Cc1ccc(Br...,"US8952177, 8::US9089569, 8::US9695149, 8",5-lipoxygenase-activating protein (FLAP),,...,,535.147,,10.0,,,20.0,,20.0,10.0
4,A0A087WW23,CCC(CC)(Cc1nc2ccc(OCc3ccc(C)cn3)cc2n1Cc1ccc(C#...,750.0,4,1500.0,282457,CCC(CC)(Cc1nc2ccc(OCc3ccc(C)cn3)cc2n1Cc1ccc(cc...,"US8952177, 19::US9089569, 19::US9695149, 19",5-lipoxygenase-activating protein (FLAP),,...,,482.232,,750.0,,,1500.0,,1500.0,750.0


In [22]:
all_data['gmean'] = np.where(all_data['XC50'] < 100, 1, 0)
all_data['gmean_n'] = np.where(all_data['XC50_n'] < 100, 1, 0)
all_data.shape[0]

1414261

In [23]:
all_data.to_csv('../data/BindingDB_filtered_dup_gmean.csv', index=False)

In [24]:
interaction_data = all_data.drop_duplicates(subset=['UniProt_S_ID', 'canonical_SMILES'], keep='first')

In [25]:
interaction_data

Unnamed: 0,UniProt_S_ID,canonical_SMILES,XC50_n,index,XC50,BindingDB_ID,SMILES,ligand_name,target_name,target_organism,...,IC50_n,Ki_n,EC50_n,Kd_n,Ki_m,Kd_m,XC50_mean,XC50_mean_n,gmean,gmean_n
0,A0A087WW23,CCC(CC)(Cc1nc2c(F)cc(OCc3ccc(C)cn3)cc2n1Cc1ccc...,6.000000,0,12.000000,282503,CCC(CC)(Cc1nc2c(F)cc(OCc3ccc(C)cn3)cc2n1Cc1ccc...,"US8952177, 65::US9089569, 65::US9695149, 65",5-lipoxygenase-activating protein (FLAP),,...,,6.0,,,12.0,,12.0,6.0,1,1
1,A0A087WW23,CCC(CC)(Cc1nc2c(F)cc(OCc3ccc(C)cn3)cc2n1Cc1ccc...,49.000000,1,98.000000,282509,CCC(CC)(Cc1nc2c(F)cc(OCc3ccc(C)cn3)cc2n1Cc1ccc...,"US8952177, 71::US9089569, 71::US9695149, 71",5-lipoxygenase-activating protein (FLAP),,...,,49.0,,,98.0,,98.0,49.0,1,1
2,A0A087WW23,CCC(CC)(Cc1nc2c(F)cc(OCc3ccc(C)cn3)cc2n1Cc1ccc...,17.000000,2,34.000000,282507,CCC(CC)(Cc1nc2c(F)cc(OCc3ccc(C)cn3)cc2n1Cc1ccc...,"US8952177, 69::US9089569, 69::US9695149, 69",5-lipoxygenase-activating protein (FLAP),,...,,17.0,,,34.0,,34.0,17.0,1,1
3,A0A087WW23,CCC(CC)(Cc1nc2ccc(OCc3ccc(C)cn3)cc2n1Cc1ccc(Br...,10.000000,3,20.000000,282446,CCC(CC)(Cc1nc2ccc(OCc3ccc(C)cn3)cc2n1Cc1ccc(Br...,"US8952177, 8::US9089569, 8::US9695149, 8",5-lipoxygenase-activating protein (FLAP),,...,,10.0,,,20.0,,20.0,10.0,1,1
4,A0A087WW23,CCC(CC)(Cc1nc2ccc(OCc3ccc(C)cn3)cc2n1Cc1ccc(C#...,750.000000,4,1500.000000,282457,CCC(CC)(Cc1nc2ccc(OCc3ccc(C)cn3)cc2n1Cc1ccc(cc...,"US8952177, 19::US9089569, 19::US9695149, 19",5-lipoxygenase-activating protein (FLAP),,...,,750.0,,,1500.0,,1500.0,750.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1414255,V9GZ37,O=C(Oc1ccc2ccccc2c1)c1ccc2c(c1)C1C=CCC1C(C(=O)...,43900.000000,1093705,43900.000000,78413,OC(=O)C1Nc2ccc(cc2C2C=CCC12)C(=O)Oc1ccc2ccccc2c1,"8-(2-naphthoxycarbonyl)-3a,4,5,9b-tetrahydro-3...",Heat Shock 70kDa Protein 1,Homo sapiens,...,43900.0,,,,,,43900.0,43900.0,0,0
1414256,V9GZ37,O=C1C=C2CC3(O)CCc4c(ccc(O)c4O)C3=C2CC1=O,34200.000000,1093706,34200.000000,78430,Oc1ccc2C3=C4CC(=O)C(=O)C=C4CC3(O)CCc2c1O,"3,4,6a,10-tetrahydroxy-6,7-dihydro-5H-benzo[g]...",Heat Shock 70kDa Protein 1,Homo sapiens,...,34200.0,,,,,,34200.0,34200.0,0,0
1414257,V9GZ37,O=c1nc(C(F)(F)F)cc(C=Cc2cc([N+](=O)[O-])ccc2Cl...,46400.000000,1093707,46400.000000,57068,[O-][N+](=O)c1ccc(Cl)c(C=Cc2cc(nc(=O)[nH]2)C(F...,4-[(E)-2-(2-chloranyl-5-nitro-phenyl)ethenyl]-...,Heat Shock 70kDa Protein 1,Homo sapiens,...,46400.0,,,,,,46400.0,46400.0,0,0
1414258,V9GZ37,O=c1oc2c(O)c(O)cc3c(=O)oc4c(O)c(O)cc1c4c23,760.000000,1093708,760.000000,57058,Oc1cc2c3c(oc(=O)c4cc(O)c(O)c(oc2=O)c34)c1O,"6,7,13,14-tetrahydroxy-2,9-dioxatetracyclo[6.6...",Heat Shock 70kDa Protein 1,Homo sapiens,...,760.0,,,,,,760.0,760.0,0,0


In [26]:
interaction_data[interaction_data['gmean'] == 1].shape[0], interaction_data[interaction_data['gmean'] == 0].shape[0]

(429930, 663780)

In [27]:
interaction_data.to_csv('../data/BindingDB_filtered_gmean.csv', index=False)

In [2]:
import pandas as pd
interaction_data = pd.read_csv('../data/BindingDB_filtered_gmean.csv', index_col=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
interaction_data.shape[0], interaction_data[interaction_data['gmean'] == 1].shape[0], interaction_data[interaction_data['gmean'] == 0].shape[0]

(1093710, 429930, 663780)

In [5]:
interaction_data['UniProt_S_ID'].nuniqueique()

5117

In [7]:
interaction_data[interaction_data['gmean'] == 1]['UniProt_S_ID'].nunique()

3126

In [8]:
interaction_data[interaction_data['gmean'] == 1]['canonical_SMILES'].nunique()

332273

In [9]:
interaction_data[interaction_data['gmean'] == 0]['UniProt_S_ID'].nunique()

4880

In [10]:
interaction_data[interaction_data['gmean'] == 0]['canonical_SMILES'].nunique()

454993