In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import numpy as np
import pandas as pd

from os import listdir
from urllib.request import urlopen
from progressbar import progressbar

# Seed NumPy's global RNG so any NumPy-based randomness in this notebook is repeatable.
np.random.seed(12345)

<IPython.core.display.Javascript object>

### Load data

In [3]:
# Load the ligand table; cells containing the literal text "NULL" are parsed as missing.
data = pd.read_csv("bindingdb_ligands.csv", na_values="NULL")

<IPython.core.display.Javascript object>

In [4]:
# Keep only the rows that actually carry a protein sequence.
has_sequence = data["protein_sequence"].notna().to_numpy()
data = data[has_sequence]

<IPython.core.display.Javascript object>

In [5]:
# Sanity check: number of rows still missing a protein sequence.
data["protein_sequence"].isna().sum()

0

<IPython.core.display.Javascript object>

### Process data

In [6]:
# Show the available column names before picking the ones used below.
data.columns

Index(['ligand_SMILES', 'ki', 'ic50', 'kd', 'ec50', 'protein_sequence',
       'BindingDB Reactant_set_id', 'BindingDB MonomerID',
       'BindingDB Ligand Name',
       'Target Name Assigned by Curator or DataSource', 'ZINC ID of Ligand'],
      dtype='object')

<IPython.core.display.Javascript object>

In [7]:
# Will hold one [ligand_SMILES, protein_sequence, interaction label] entry per row.
data_list = []

<IPython.core.display.Javascript object>

In [8]:
def get_interaction(i, frame=None, threshold=10):
    """Binarize the binding affinity of row ``i`` into an interaction label.

    The columns ``ki``, ``ic50``, ``kd`` and ``ec50`` are tried in that order;
    the first non-null, parseable measurement decides the label.

    Args:
        i: Positional (``iloc``) row index.
        frame: DataFrame holding the affinity columns. Defaults to the
            notebook-level ``data`` frame.
        threshold: Largest affinity value that is still labelled as interacting.

    Returns:
        ``"1"`` (interacting) or ``"0"`` (not interacting), always as a string
        so the label can be joined with the other text fields. ``"0"`` is
        returned when no column holds a usable value.
    """
    if frame is None:
        frame = data

    for affinity in ["ki", "ic50", "kd", "ec50"]:
        value = frame[affinity].iloc[i]
        if pd.isnull(value):
            continue

        # Values may be strings or floats depending on the column dtype.
        value_text = str(value).strip()

        # Censored measurements only carry a bound. Keep the notebook's
        # convention: "<x" counts as interacting, ">x" as not interacting.
        if value_text.startswith("<"):
            return "1"
        if value_text.startswith(">"):
            return "0"

        try:
            return str(int(float(value_text) <= threshold))
        except ValueError:
            # Unparseable entry: fall through to the next affinity column.
            continue

    # No usable measurement in any column; a NaN comparison is False, hence "0".
    return "0"

<IPython.core.display.Javascript object>

In [9]:
# Rebuild the list from scratch so that re-running this cell does not append
# a second copy of every example. The enumerate() position matches the
# positional (iloc) index that get_interaction expects.
data_list = [
    [smiles, sequence, get_interaction(i)]
    for i, (smiles, sequence) in enumerate(
        zip(data["ligand_SMILES"], data["protein_sequence"])
    )
]

<IPython.core.display.Javascript object>

In [10]:
# Number of [SMILES, sequence, label] examples collected.
len(data_list)

83577

<IPython.core.display.Javascript object>

### Filter out seen proteins

In [11]:
# Collect the second line of every file in the DUDE contact-map directory
# (presumably the protein sequence — confirm against the file format).
DUDE_fnames = listdir("DUDE/contact_map/")

DUDE_sequences = []
for fname in DUDE_fnames:
    with open(f"DUDE/contact_map/{fname}", "r") as f:
        contact_map_lines = f.readlines()
    second_line = contact_map_lines[1]
    DUDE_sequences.append(second_line.strip("\n"))

<IPython.core.display.Javascript object>

In [12]:
# Hash the DUDE sequences once so each membership test is O(1) instead of a
# scan over the whole list. Matching is by exact string equality only.
dude_sequence_set = set(DUDE_sequences)
data_list = [example for example in data_list if example[1] not in dude_sequence_set]

<IPython.core.display.Javascript object>

In [13]:
# Count after the DUDE filter. NOTE(review): the recorded output (83577) equals
# the pre-filter count, so the exact-match filter removed nothing in that run.
len(data_list)

83577

<IPython.core.display.Javascript object>

In [14]:
def get_fasta(target):
    """Download the FASTA text for one entry from rcsb.org.

    Args:
        target: Entry identifier; it is upper-cased before being put in the URL.

    Returns:
        The response body decoded as UTF-8, with leading and trailing newline
        characters removed.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request fails.
    """
    url = f"https://www.rcsb.org/fasta/entry/{target.upper()}/display"
    # Close the HTTP response deterministically rather than leaving it to GC;
    # this is called once per target, so leaked connections would add up.
    with urlopen(url) as response:
        return response.read().decode("utf-8").strip("\n")


# Fetch one FASTA payload per DUD-E target, in the order the CSV lists them.
dude_target_ids = pd.read_csv("DUDE/data_pre/dud-e_proteins.csv")["target_pdb"].tolist()
DUDE_fasta = list(map(get_fasta, dude_target_ids))

<IPython.core.display.Javascript object>

In [15]:
# Only the first N lines of the BindingDB FASTA download are kept.
# NOTE(review): this slices raw lines, not records — confirm the cut cannot
# split an entry in two.
BINDINGDB_FASTA_MAX_LINES = 6000

# The URL has no placeholders, so a plain string is enough. The context
# manager closes the HTTP response as soon as the body has been read.
with urlopen("https://www.bindingdb.org/bind/BindingDBTargetSequences.fasta") as response:
    bindingdb_fasta = response.read().decode("utf-8").strip("\n").split("\n")

bindingdb_fasta = bindingdb_fasta[:BINDINGDB_FASTA_MAX_LINES]

# The text was decoded as UTF-8, so write it back with the same encoding
# instead of the platform default.
with open("proteins_list.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(DUDE_fasta + bindingdb_fasta))

<IPython.core.display.Javascript object>

#### Write BindingDB examples

In [16]:
# One space-separated line per example: "<SMILES> <sequence> <label>".
# Each example supplies its own fields (a fixed index would repeat a single
# example on every line), and fields are coerced to str because a label may
# be an int.
text = [" ".join(str(field) for field in example) for example in data_list]

<IPython.core.display.Javascript object>

In [17]:
# writelines() adds no line separators, so join explicitly to put one example
# per line (same newline-joined layout as proteins_list.txt).
with open("bindingdb_test_set", "w") as f:
    f.write("\n".join(text))

<IPython.core.display.Javascript object>