### Load Packages

In [19]:
import os
import pandas as pd
import numpy as np
import re
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
from multiprocessing import Pool

### Load Data

In [None]:
# Read the tab-separated BindingDB export into a DataFrame.
# low_memory=False: parse each column in one pass so dtypes are inferred consistently.
df = pd.read_csv(
    'Filtered_BindingDB_All.tsv',
    sep='\t',
    low_memory=False,
)
print(df.shape)  # (rows, columns) of the raw table
df.head(2)

### Preprocess dataset

In [11]:
# normalise every column name to lowercase so later cells can refer to them uniformly
df.columns = list(map(str.lower, df.columns))

In [12]:
# count missing (NA) values per column; the resulting Series is displayed as the cell output
df.isna().sum()

reactant_set_id           0
drug_smiles              11
drug_name                 0
protein_name              0
ki                  2468617
ic50                1003527
kd                  2959917
ec50                2801754
kon                 3077906
koff                3077926
protein_sequence         99
dtype: int64

**IC50 has the fewest NA values of the affinity measurements, so we'll use it as the target variable**
- NOTE: IC50 measures how much of a drug is required to inhibit a target protein by 50%. Lower IC50 values represent stronger drug–protein interactions, while higher IC50 values indicate weaker interactions.

In [13]:
# 1) discard rows lacking the target (ic50) or either model input (SMILES, protein sequence)
# 2) discard the other affinity/kinetics columns, which are not used as targets
df = (
    df
    .dropna(subset=['ic50', 'drug_smiles', 'protein_sequence'])
    .drop(columns=['ki', 'kd', 'ec50', 'kon', 'koff'])
)

In [29]:
# Re-check the per-column NA counts after filtering, then preview the first two rows.
# NOTE(review): the saved output of this cell already lists `ic50_numeric`, a column that is
# only created in a later cell — the cells were executed out of order. Re-run the notebook
# top to bottom (Restart & Run All) before trusting the stored outputs.
print(df.isna().sum(), '\n')
df.head(2)

reactant_set_id     0
drug_smiles         0
drug_name           0
protein_name        0
ic50                0
protein_sequence    0
ic50_numeric        0
dtype: int64 



Unnamed: 0,reactant_set_id,drug_smiles,drug_name,protein_name,ic50,protein_sequence,ic50_numeric
0,143,Cc1nc(CN2CCN(CC2)c2c(Cl)cnc3[nH]c(nc23)-c2cn(C...,"US9447092, 3",Cytochrome P450 3A4,>50000,MALIPDLAMETWLLLAVSLVLLYLYGTHSHGLFKKLGIPGPTPLPF...,50000.0
1,145,O=C1CCCC2=C1C1(CCS(=O)(=O)C1)N=C(Nc1nc3ccccc3o...,"US9447087, 24::2-(benzo[d]oxazol-2-ylamino)-4'...",Galactokinase,6676.9,MAALRQPQVAELLAEARRAFREEFGAEPELAVSAPGRVNLIGEHTD...,6676.9


In [18]:
# remove leading '<' or '>' and any surrounding whitespace to get a numeric IC50 value
def strip_sign(x):
    """Remove a leading '<' / '>' qualifier and surrounding whitespace from an IC50 entry.

    Parameters
    ----------
    x : object
        A raw IC50 cell value. Strings such as ``'>50000'``, ``' >50000'`` or
        ``'< 0.5 '`` are cleaned; any non-string value is passed through untouched.

    Returns
    -------
    object
        For strings, the text with outer whitespace and any leading run of
        '<' / '>' characters removed (still a string); otherwise ``x`` itself.
    """
    if isinstance(x, str):
        # Strip whitespace FIRST: lstrip('<>') stops at the first character that is not
        # '<' or '>', so a padded value like ' >50000' would otherwise keep its sign.
        # The final strip() removes whitespace between the sign and the number.
        return x.strip().lstrip('<>').strip()
    return x

# Strip any '<' / '>' qualifier from the raw IC50 text, then parse the result as a number.
# errors='coerce' turns anything that still fails to parse into NaN instead of raising.
df['ic50_numeric'] = pd.to_numeric(df['ic50'].apply(strip_sign), errors='coerce')

In [20]:
# keep only rows whose protein sequence consists solely of the 20 standard amino acid letters
pattern = re.compile(r'^[ACDEFGHIKLMNPQRSTVWY]+$')
# Use fullmatch() rather than match(): with match(), '$' also matches just before a trailing
# newline, so a sequence such as 'MALIP\n' would slip through the filter.
standard_mask = df["protein_sequence"].apply(lambda s: pattern.fullmatch(s) is not None)
df = df[standard_mask].reset_index(drop=True)

In [21]:
# clean SMILES strings
def clean_smiles(smiles):
    """Return the SMILES string without any '|'-delimited trailing annotation.

    Everything from the first '|' onward is discarded (some entries carry extra
    annotations after a '|' character), and leading/trailing whitespace is removed
    from what remains.
    """
    core, _sep, _annotation = smiles.partition('|')  # core == smiles when no '|' is present
    return core.strip()

In [22]:
# replace each SMILES entry with its cleaned form (annotation and outer whitespace removed)
df['drug_smiles'] = df['drug_smiles'].map(clean_smiles)

In [24]:
# sanity check after cleaning: print (rows, columns), then preview the first two rows
print(df.shape)
df.head(2)

(2074074, 7)


Unnamed: 0,reactant_set_id,drug_smiles,drug_name,protein_name,ic50,protein_sequence,ic50_numeric
0,143,Cc1nc(CN2CCN(CC2)c2c(Cl)cnc3[nH]c(nc23)-c2cn(C...,"US9447092, 3",Cytochrome P450 3A4,>50000,MALIPDLAMETWLLLAVSLVLLYLYGTHSHGLFKKLGIPGPTPLPF...,50000.0
1,145,O=C1CCCC2=C1C1(CCS(=O)(=O)C1)N=C(Nc1nc3ccccc3o...,"US9447087, 24::2-(benzo[d]oxazol-2-ylamino)-4'...",Galactokinase,6676.9,MAALRQPQVAELLAEARRAFREEFGAEPELAVSAPGRVNLIGEHTD...,6676.9


In [26]:
# save filtered dataframe as a parquet file
# (written before the fingerprint column is added, so this file holds only the cleaned base columns)
df.to_parquet("base_cleaned_df.parquet")

### Get SMILES Fingerprints 
Morgan fingerprints encode molecular substructures by examining atom neighborhoods within a fixed radius and hashing them into a fixed-length binary vector.

In [35]:
# Morgan fingerprint settings: neighbourhood radius and length of the bit vector
MORGAN_RADIUS = 2
MORGAN_FP_SIZE = 2048

# shared generator used by smiles_to_morgan for every molecule
morgan_fp_gen = rdFingerprintGenerator.GetMorganGenerator(
    radius=MORGAN_RADIUS,
    fpSize=MORGAN_FP_SIZE,
)

In [36]:
# convert a SMILES string into a 2048-dimension Morgan fingerprint vector
def smiles_to_morgan(smiles):
    """Convert a SMILES string into a binary Morgan fingerprint vector.

    The molecule is parsed with RDKit and fingerprinted with the module-level
    ``morgan_fp_gen`` generator.

    Returns
    -------
    numpy.ndarray or float
        A 1-D integer array of 0/1 values, one entry per fingerprint bit, or
        ``np.nan`` when RDKit cannot parse the SMILES string.
    """
    molecule = Chem.MolFromSmiles(smiles)  # SMILES string -> RDKit molecule object
    if molecule is None:
        # RDKit returns None for SMILES it cannot parse; flag those rows with NaN
        return np.nan

    fingerprint = morgan_fp_gen.GetFingerprint(molecule)
    # start from an all-zero vector with one slot per fingerprint bit
    # NOTE(review): dtype=int is typically 8 bytes per element; a narrower dtype would cut
    # memory considerably across millions of rows — confirm downstream consumers first.
    bits = np.zeros(fingerprint.GetNumBits(), dtype=int)
    # switch on every position the fingerprint reports as set
    bits[list(fingerprint.GetOnBits())] = 1
    return bits

In [37]:
# converts all drug SMILES in the DataFrame to numeric Morgan fingerprints using parallel processing
def parallel_apply(smiles_list, n_cores=32):
    """Fingerprint every SMILES string in ``smiles_list`` using a pool of worker processes.

    Parameters
    ----------
    smiles_list : list
        SMILES strings to convert.
    n_cores : int, default 32
        Number of worker processes to start.

    Returns
    -------
    list
        One ``smiles_to_morgan`` result per input, in the same order as ``smiles_list``.
    """
    with Pool(n_cores) as workers:
        # map() blocks until every input is processed and preserves input order
        return workers.map(smiles_to_morgan, smiles_list)

# fingerprint every drug in parallel and store the vectors (NaN for unparsable SMILES) as a new column
smiles_strings = df['drug_smiles'].tolist()
df['numeric_smiles'] = parallel_apply(smiles_strings, n_cores=32)

[17:27:48] Explicit valence for atom # 25 N, 4, is greater than permitted
[17:27:48] Explicit valence for atom # 25 N, 4, is greater than permitted
[17:27:48] Explicit valence for atom # 25 N, 4, is greater than permitted
[17:27:48] Explicit valence for atom # 25 N, 4, is greater than permitted
[17:27:48] Explicit valence for atom # 25 N, 4, is greater than permitted
[17:27:48] Can't kekulize mol.  Unkekulized atoms: 21 22 23 24 25 26 27 28 29
[17:27:48] Can't kekulize mol.  Unkekulized atoms: 21 22 23 24 25 26 27 28 29
[17:27:48] Explicit valence for atom # 22 N, 4, is greater than permitted
[17:27:48] Can't kekulize mol.  Unkekulized atoms: 16 17 19 20 21
[17:27:48] Explicit valence for atom # 1 N, 4, is greater than permitted
[17:27:48] Can't kekulize mol.  Unkekulized atoms: 24 25 26 28 29
[17:27:48] Can't kekulize mol.  Unkekulized atoms: 5 6 8 9 10 11 12 13 14
[17:27:48] Can't kekulize mol.  Unkekulized atoms: 5 6 8 9 10 11 12 13 14
[17:27:48] Can't kekulize mol.  Unkekulized ato

In [41]:
# preview the first two rows to confirm the new numeric_smiles column is populated
df.head(2)

Unnamed: 0,reactant_set_id,drug_smiles,drug_name,protein_name,ic50,protein_sequence,ic50_numeric,numeric_smiles
0,143,Cc1nc(CN2CCN(CC2)c2c(Cl)cnc3[nH]c(nc23)-c2cn(C...,"US9447092, 3",Cytochrome P450 3A4,>50000,MALIPDLAMETWLLLAVSLVLLYLYGTHSHGLFKKLGIPGPTPLPF...,50000.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,145,O=C1CCCC2=C1C1(CCS(=O)(=O)C1)N=C(Nc1nc3ccccc3o...,"US9447087, 24::2-(benzo[d]oxazol-2-ylamino)-4'...",Galactokinase,6676.9,MAALRQPQVAELLAEARRAFREEFGAEPELAVSAPGRVNLIGEHTD...,6676.9,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
# Persist the fingerprint column as a compressed .npz archive under the key "arr_list".
# NOTE(review): the column holds one array per row (plus NaN for unparsable SMILES), so
# .values is presumably an object-dtype array; NumPy pickles such arrays, and reading the
# file back would then need np.load(..., allow_pickle=True) — confirm.
arr_list = df["numeric_smiles"].values
np.savez_compressed(
    "numeric_smiles_compressed.npz",
    arr_list=arr_list,
)