# Machine Learning Model for Binding Affinity using BindingDB


In [1]:
# Install dependencies
!pip install -q torch fair-esm transformers


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [5]:
from aiondata import BindingAffinity
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from rdkit import RDLogger
from tqdm.auto import tqdm
import polars as pl
import numpy as np
import torch
import esm
from transformers import AutoTokenizer, AutoModel

# Tunable settings for the data-preparation step
MAX_KI_NM = 10000  # rows are kept only when Ki (nM) is strictly below this value
N_SAMPLES = 1018  # number of leading rows kept; for test purposes only a subset is used
VALID_SEQUENCE_PATTERN = "^[ACDEFGHIKLMNPQRSTVWY]+$"  # sequences made solely of these letters

# Load BindingDB into a Polars DataFrame and clean it in a single chain, so that
# re-running this cell always starts again from the freshly loaded data.
df = (
    BindingAffinity()
    .to_df()
    # Drop rows with missing Ki (nM) values, SMILES, or Sequence
    .drop_nulls(subset=["Ki (nM)", "SMILES", "Sequence"])
    # Drop rows whose Ki (nM) value is MAX_KI_NM or greater
    .filter(pl.col("Ki (nM)") < MAX_KI_NM)
    # Drop rows whose Sequence contains any character outside the allowed letters
    .filter(pl.col("Sequence").str.contains(VALID_SEQUENCE_PATTERN))
    # Keep only the first N_SAMPLES rows
    .head(N_SAMPLES)
)

# Get the SMILES, Sequence, and Ki (nM) columns
ligands = df["SMILES"]
target_sequence = df["Sequence"]
affinity = df["Ki (nM)"]

# Suppress RDKit warnings and errors
RDLogger.DisableLog("rdApp.*")


#### Create Protein Embeddings using ESM

In [6]:
# Load the ESM-2 35M model together with its alphabet.
# The network is bound to `esm_model` rather than the generic name `model`: the
# modelling cell further down assigns `model` to the scikit-learn regressor, and with a
# shared name any later call to `create_protein_embedding` would hit the regressor
# instead of ESM.
esm_model, alphabet = esm.pretrained.esm2_t12_35M_UR50D()

# The batch converter tokenizes sequences and wraps them in the BOS/EOS tokens
esm_batch_converter = alphabet.get_batch_converter()

# Put the model in evaluation mode and move it to the GPU when one is available
esm_model = esm_model.eval()
if torch.cuda.is_available():
    esm_model = esm_model.cuda()


def create_protein_embedding(sequence: str):
    """Generate a fixed-size embedding for a single protein sequence.

    The sequence is tokenized with the ESM batch converter, passed through the
    ESM model without gradient tracking, and the layer-12 representations of the
    residue positions are averaged into one vector.

    Args:
        sequence: Protein sequence as a string of amino-acid letters.

    Returns:
        A 1-D torch tensor on the CPU: the mean layer-12 representation over the
        residue positions of ``sequence``.
    """
    # `alphabet.encode` returns the residue tokens only; the batch converter also
    # adds the leading BOS and trailing EOS tokens.
    _, _, tokens = esm_batch_converter([("protein", sequence)])

    # Run on whichever device the model currently lives on
    tokens = tokens.to(next(esm_model.parameters()).device)

    with torch.no_grad():
        results = esm_model(tokens, repr_layers=[12])  # representations from layer 12

    # Single-sequence batch, so there is no padding: position 0 is BOS and the last
    # position is EOS. Keep only the residue positions in between.
    residue_embeddings = results["representations"][12][0, 1:-1].cpu()

    # Reduce the embeddings to 1D by averaging across the sequence length
    return residue_embeddings.mean(dim=0)

def create_protein_embedding_generator(sequences: list[str]):
    """Yield one numpy embedding per protein sequence, showing a progress bar.

    This generator has a protein-specific name because the ligand section defines
    its own `create_embedding_generator`; when both shared that name, whichever
    cell ran last silently replaced the other.

    Args:
        sequences: Iterable of protein sequence strings.

    Yields:
        The result of `create_protein_embedding` for each sequence, converted to
        a numpy array.
    """
    for sequence in tqdm(sequences, desc="Generating embeddings", unit=" sequence"):
        yield create_protein_embedding(sequence).numpy()

# Generate embeddings for all protein sequences (one row per sequence)
X_proteins = np.array(list(create_protein_embedding_generator(target_sequence)))


Generating embeddings:   0%|          | 0/1018 [00:00<?, ? sequence/s]

#### Create Ligand Embeddings using ChemBERTa

In [7]:
# The ChemBERTa tokenizer and encoder are both loaded from the same checkpoint
_chemberta_checkpoint = "seyonec/ChemBERTa-zinc-base-v1"
chemberta_tokenizer = AutoTokenizer.from_pretrained(_chemberta_checkpoint)
chemberta_model = AutoModel.from_pretrained(_chemberta_checkpoint)

# Inference only: switch the encoder to evaluation mode
chemberta_model.eval()


def create_ligand_embedding(smiles: str):
    """Generate an embedding for a single SMILES string.

    The string is tokenized with the ChemBERTa tokenizer, passed through the
    ChemBERTa model without gradient tracking, and the last hidden state is
    averaged over all token positions.

    Args:
        smiles: SMILES representation of the ligand.

    Returns:
        A 1-D torch tensor: the mean of the last hidden state across the token
        positions of the (possibly truncated) input.
    """
    # Cap the input length so that an unusually long SMILES string cannot index past
    # the encoder's position-embedding table. RoBERTa-style encoders reserve two
    # position slots, hence the "- 2"; for other layouts this is merely conservative.
    max_tokens = chemberta_model.config.max_position_embeddings - 2
    inputs = chemberta_tokenizer(
        smiles,
        return_tensors="pt",
        truncation=True,
        max_length=max_tokens,
    )
    with torch.no_grad():
        outputs = chemberta_model(**inputs)
    # Take the mean of the last hidden state to get a single vector representation
    embedding = outputs.last_hidden_state.mean(dim=1).squeeze(0)
    return embedding

def create_embedding_generator(smiles: list[str]):
    """Lazily embed each SMILES string, reporting progress with tqdm.

    Yields the result of `create_ligand_embedding` for every entry of ``smiles``,
    converted to a numpy array, in input order.
    """
    progress = tqdm(smiles, desc="Generating embeddings", unit=" ligand")
    for smiles_string in progress:
        ligand_tensor = create_ligand_embedding(smiles_string)
        yield ligand_tensor.numpy()


# Collect the ligand embeddings into a single array, one row per SMILES string
X_ligands = np.array([*create_embedding_generator(ligands)])



Generating embeddings:   0%|          | 0/1018 [00:00<?, ? ligand/s]

#### Create the model and predict Ki (nM) values

In [8]:
# Feature matrix: ligand embedding columns first, protein embedding columns after
X = np.concatenate((X_ligands, X_proteins), axis=1)
# Regression target: the Ki (nM) column as a numpy array
y = affinity.to_numpy()

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=18,
    test_size=0.2,
)

# Fit a 100-tree random forest on the training rows (`fit` returns the estimator)
model = RandomForestRegressor(random_state=18, n_estimators=100).fit(X_train, y_train)

# Score the held-out rows with each regression metric
y_pred = model.predict(X_test)
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")


Mean Absolute Percentage Error: 16815.72708652109
