# Machine Learning Model for Binding Affinity using BindingDB


In [None]:
# Install dependencies
!pip install -q torch fair-esm transformers &> /dev/null

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from rdkit import RDLogger
from tqdm.auto import tqdm
import numpy as np
import torch
import esm
from transformers import AutoTokenizer, AutoModel
from aiondata import BindingAffinity

# Load BindingDB into a Polars DataFrame
df = BindingAffinity().to_df()

# Filter out rows with missing SMILES and Sequence
df = df.drop_nulls(subset=["SMILES", "Sequence"])

# For test purposes only use a subset of the data
df = df.head(180)

# Get the SMILES, Sequence, and Binds columns
ligands = df["SMILES"]
target_sequence = df["Sequence"]
affinity = df["Binds"]

# Suppress RDKit warnings and errors
RDLogger.DisableLog("rdApp.*")  


#### Create Protein Embeddings using ESM

In [2]:
# Load ESM-35m model
model, alphabet = esm.pretrained.esm2_t12_35M_UR50D()

# Prepare model and move it to evaluation mode
model = model.eval()
if torch.cuda.is_available():
    model = model.cuda()

def create_protein_embedding(sequence: str):
    """Generate embeddings for a single protein sequence."""
    # Convert sequence to tokens
    tokens = torch.tensor([alphabet.encode(sequence)])
    if torch.cuda.is_available():
        tokens = tokens.cuda()
    
    with torch.no_grad():
        results = model(tokens, repr_layers=[12])  # Extract embeddings from the last layer

    # Extract embeddings and move to cpu
    embeddings = results["representations"][12].squeeze(0).cpu()

    # Reduce the embeddings to 1D by averaging across the sequence length
    embeddings_1d = embeddings.mean(dim=0)
    
    return embeddings_1d

def create_embedding_generator(sequences: list[str]):
    """Generate embeddings for a list of protein sequences."""
    for sequence in tqdm(sequences, desc="Generating protein embeddings", unit=" proteins"):
        yield create_protein_embedding(sequence).numpy()

# Generate embeddings for all protein sequences
X_proteins = np.array(list(create_embedding_generator(target_sequence)))


Generating embeddings:   0%|          | 0/180 [00:00<?, ? sequence/s]

#### Create Ligand Embeddings using ChemBERTa

In [3]:
# Load ChemBERTa model and tokenizer
chemberta_tokenizer = AutoTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
chemberta_model = AutoModel.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
chemberta_model.eval()


def create_ligand_embedding(smiles: str):
    """Generate embeddings for a single SMILES string."""
    # Truncate SMILES string to 512 characters, required by ChemBERTa
    if len(smiles) > 512:
        smiles = smiles[:512]
    inputs = chemberta_tokenizer(smiles, return_tensors="pt")
    
    with torch.no_grad():
        outputs = chemberta_model(**inputs)
    # Take the mean of the last hidden state to get a single vector representation
    embedding = outputs.last_hidden_state.mean(dim=1).squeeze(0)
    return embedding

def create_embedding_generator(smiles: list[str]):
    """Generate embeddings for a list of SMILES strings."""
    for smile in tqdm(smiles, desc="Generating ligand embeddings", unit=" ligand"):
        yield create_ligand_embedding(smile).numpy()


# Generate embeddings for all ligands
X_ligands = np.array(list(create_embedding_generator(ligands)))

Generating ligand embeddings:   0%|          | 0/180 [00:00<?, ? ligand/s]

#### Create the model and predict binding values

In [4]:
X = np.concatenate([X_ligands, X_proteins], axis=1)
# Make affinity into a numpy y array
y = affinity.to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=18)

model = RandomForestClassifier(n_estimators=100, random_state=18)
model.fit(X_train, y_train)

# Evaluate the model ROC-AUC score
y_pred = model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred)

print(f"ROC-AUC score: {roc_auc:.4f}")

ROC-AUC score: 0.8831
