In [1]:
from tdc.single_pred import ADME
data = ADME(name='BBBP_Martins')
df = data.get_data()

print("SUCCESS! TDC is working in Jupyter.")
print("\nColumns:", df.columns.tolist())
print("\nFirst 3 rows:\n", df.head(3))
print("\nShape:", df.shape)
print("\nLabel distribution:\n", df['Y'].value_counts(normalize=True))

Found local copy...
Loading...
Done!


SUCCESS! TDC is working in Jupyter.

Columns: ['Drug_ID', 'Drug', 'Y']

First 3 rows:
                 Drug_ID                                              Drug  Y
0            Propanolol                  CC(C)NCC(O)COc1cccc2ccccc12.[Cl]  1
1  Terbutylchlorambucil            CC(C)(C)OC(=O)CCCc1ccc(N(CCCl)CCCl)cc1  1
2                 40730  CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1c23  1

Shape: (2030, 3)

Label distribution:
 Y
1    0.764039
0    0.235961
Name: proportion, dtype: float64


In [2]:
# Step 1: Environment Setup and Data Acquisition
!pip install transformers datasets rdkit tdc evaluate gradio shap torch scikit-learn

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from tdc.single_pred import ADME
from rdkit import Chem
from rdkit.Chem import Draw
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_recall_curve, auc
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import shap
import gradio as gr
from tdc import single_pred

# Load and prepare data

data = single_pred.ADME(name='BBBP_Martins')
df = data.get_data()

# Validate SMILES
def is_valid_smiles(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles)
        return mol is not None
    except:
        return False

df = df[df['Drug'].apply(is_valid_smiles)]  # Filter invalid
df = df.drop_duplicates(subset=['Drug'])    # Drop duplicates


# Use TDC's built-in scaffold split
split = data.get_split(method='scaffold')  # or 'scaffold_balanced'

train_df = split['train']
val_df   = split['valid']
test_df  = split['test']

print("Scaffold split sizes:", len(train_df), len(val_df), len(test_df))

# Proceed with your rename / Dataset creation
train_ds = Dataset.from_pandas(train_df.rename(columns={'Drug': 'SMILES', 'Y': 'labels'}))
val_ds   = Dataset.from_pandas(val_df.rename(columns={'Drug': 'SMILES', 'Y': 'labels'}))
test_ds  = Dataset.from_pandas(test_df.rename(columns={'Drug': 'SMILES', 'Y': 'labels'}))




W0226 23:45:14.162000 30923 site-packages/torch/distributed/elastic/multiprocessing/redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.
/Users/yousuf/.matplotlib is not a writable directory
Matplotlib created a temporary cache directory at /var/folders/4s/rj_sy56d06508gj9z8g3lwhw0000gn/T/matplotlib-hlvvhaux because there was an issue with the default path (/Users/yousuf/.matplotlib); it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.
Matplotlib is building the font cache; this may take a moment.
Found local copy...
Loading...
Done!
100%|████████████████████████████████████| 2030/2030 [00:00<00:00, 7002.92it/s]


Scaffold split sizes: 1421 203 406


In [3]:
print("=== Quick Sanity Check ===")
print("Train dataset size:", len(train_ds))
print("Val dataset size:", len(val_ds))
print("Test dataset size:", len(test_ds))
print("\nExample from train_ds:")
print(train_ds[0]) ##  {'SMILES': '...', 'labels': 0 or 1}

=== Quick Sanity Check ===
Train dataset size: 1421
Val dataset size: 203
Test dataset size: 406

Example from train_ds:
{'Drug_ID': 'Terbutylchlorambucil', 'SMILES': 'CC(C)(C)OC(=O)CCCc1ccc(N(CCCl)CCCl)cc1', 'labels': 1}


In [4]:
import shutil
import os

cache_dir = os.path.expanduser("~/.cache/huggingface/hub")
model_cache = os.path.join(cache_dir, "models--DeepChem--ChemBERTa-77M-MLM")
if os.path.exists(model_cache):
    shutil.rmtree(model_cache)
    print("Cleared corrupted model cache.")
else:
    print("No local cache found for this model — clean start.")

print("Cache cleared. Ready to retry model loading.")

Cleared corrupted model cache.
Cache cleared. Ready to retry model loading.


In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

local_path = './chemberta_model'  
tokenizer = AutoTokenizer.from_pretrained('DeepChem/ChemBERTa-77M-MLM')  # Tokenizer still from HF
model = AutoModelForSequenceClassification.from_pretrained(
    local_path,  # ← Point to local folder
    num_labels=2
)

print("Model loaded successfully from local files!")

tokenizer_config.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/631 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/420 [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./chemberta_model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded successfully from local files!


In [8]:
## Step 3 : Tokenization
def tokenize_function(examples):
    return tokenizer(examples['SMILES'], padding='max_length', truncation=True, max_length=512)

tokenized_train = train_ds.map(tokenize_function, batched=True)
tokenized_val = val_ds.map(tokenize_function, batched=True)
tokenized_test = test_ds.map(tokenize_function, batched=True)

# Setting format for PyTorch
tokenized_train.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
tokenized_val.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
tokenized_test.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

print("Tokenization complete!")
print("Example input_ids length:", len(tokenized_train[0]['input_ids']))

Map:   0%|          | 0/1421 [00:00<?, ? examples/s]

Map:   0%|          | 0/203 [00:00<?, ? examples/s]

Map:   0%|          | 0/406 [00:00<?, ? examples/s]

Tokenization complete!
Example input_ids length: 512


In [6]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_recall_curve, auc

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='binary')
    auc_score = roc_auc_score(labels, p.predictions[:, 1])
    precision, recall, _ = precision_recall_curve(labels, p.predictions[:, 1])
    pr_auc = auc(recall, precision)
    return {'accuracy': acc, 'f1': f1, 'roc_auc': auc_score, 'pr_auc': pr_auc}

In [9]:
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm  # For progress bar

# Move model to CPU explicitly
device = torch.device('cpu')
model.to(device)
model.train()

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

# DataLoader (batch size 8, shuffle training)
train_loader = DataLoader(tokenized_train, batch_size=8, shuffle=True)

print("Starting manual fine-tuning on CPU...")

Starting manual fine-tuning on CPU...


In [10]:
for epoch in range(3):
    total_loss = 0
    num_batches = 0
    
    # Progress bar for batches
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/3")
    
    for batch in progress_bar:
        # Move batch to device
        inputs = {
            'input_ids': batch['input_ids'].to(device),
            'attention_mask': batch['attention_mask'].to(device),
            'labels': batch['labels'].to(device)
        }
        
        # Forward pass
        outputs = model(**inputs)
        loss = outputs.loss
        
        # Backward + optimize
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
        total_loss += loss.item()
        num_batches += 1
        
        # Update progress bar
        progress_bar.set_postfix({'batch_loss': f'{loss.item():.4f}'})
    
    avg_loss = total_loss / num_batches
    print(f"Epoch {epoch+1}/3 completed. Average loss: {avg_loss:.4f}")


Epoch 1/3: 100%|██████████| 178/178 [02:28<00:00,  1.20it/s, batch_loss=0.4887]


Epoch 1/3 completed. Average loss: 0.6041


Epoch 2/3: 100%|██████████| 178/178 [02:24<00:00,  1.23it/s, batch_loss=0.1737]


Epoch 2/3 completed. Average loss: 0.4873


Epoch 3/3: 100%|██████████| 178/178 [02:28<00:00,  1.20it/s, batch_loss=0.1259]

Epoch 3/3 completed. Average loss: 0.3868





In [11]:
# Saving fine-tuned model
model.save_pretrained('./dd27_scaffold')
tokenizer.save_pretrained('./dd27_scaffold')
print("Training complete! Model saved to ./dd27_scaffold")

Training complete! Model saved to ./dd27_scaffold


In [None]:
### Training for 10 more epochs

In [13]:
model = AutoModelForSequenceClassification.from_pretrained('./dd27_scaffold')
model.to(device)
model.train()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(600, 384, padding_idx=1)
      (position_embeddings): Embedding(515, 384, padding_idx=1)
      (token_type_embeddings): Embedding(1, 384)
      (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.144, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-2): 3 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=384, out_features=384, bias=True)
              (key): Linear(in_features=384, out_features=384, bias=True)
              (value): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.109, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=384, out_features=384, bias=True)
         

In [14]:
for epoch in range(10):
    total_loss = 0
    num_batches = 0
    
    # Progress bar for batches
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/10")
    
    for batch in progress_bar:
        # Move batch to device
        inputs = {
            'input_ids': batch['input_ids'].to(device),
            'attention_mask': batch['attention_mask'].to(device),
            'labels': batch['labels'].to(device)
        }
        
        # Forward pass
        outputs = model(**inputs)
        loss = outputs.loss
        
        # Backward + optimize
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
        total_loss += loss.item()
        num_batches += 1
        
        # Update progress bar
        progress_bar.set_postfix({'batch_loss': f'{loss.item():.4f}'})
    
    avg_loss = total_loss / num_batches
    print(f"Epoch {epoch+1}/3 completed. Average loss: {avg_loss:.4f}")


Epoch 1/10: 100%|█████████| 178/178 [02:28<00:00,  1.20it/s, batch_loss=1.0247]


Epoch 1/3 completed. Average loss: 0.3428


Epoch 2/10: 100%|█████████| 178/178 [02:36<00:00,  1.14it/s, batch_loss=0.2202]


Epoch 2/3 completed. Average loss: 0.3441


Epoch 3/10: 100%|█████████| 178/178 [02:26<00:00,  1.21it/s, batch_loss=0.2997]


Epoch 3/3 completed. Average loss: 0.3427


Epoch 4/10: 100%|█████████| 178/178 [02:25<00:00,  1.22it/s, batch_loss=0.6921]


Epoch 4/3 completed. Average loss: 0.3466


Epoch 5/10: 100%|█████████| 178/178 [02:29<00:00,  1.19it/s, batch_loss=0.5721]


Epoch 5/3 completed. Average loss: 0.3474


Epoch 6/10: 100%|█████████| 178/178 [02:26<00:00,  1.21it/s, batch_loss=0.1620]


Epoch 6/3 completed. Average loss: 0.3415


Epoch 7/10: 100%|█████████| 178/178 [02:23<00:00,  1.24it/s, batch_loss=0.2987]


Epoch 7/3 completed. Average loss: 0.3417


Epoch 8/10: 100%|█████████| 178/178 [02:31<00:00,  1.18it/s, batch_loss=0.2888]


Epoch 8/3 completed. Average loss: 0.3451


Epoch 9/10: 100%|█████████| 178/178 [02:26<00:00,  1.22it/s, batch_loss=0.1025]


Epoch 9/3 completed. Average loss: 0.3481


Epoch 10/10: 100%|████████| 178/178 [02:39<00:00,  1.12it/s, batch_loss=0.7226]

Epoch 10/3 completed. Average loss: 0.3444





In [23]:
# Saving fine-tuned model
model.save_pretrained('./dd27_scaffold')
tokenizer.save_pretrained('./dd27_scaffold')
print("Training complete! Model saved to ./dd27_scaffold")

Training complete! Model saved to ./dd27_scaffold


In [None]:
### Testing

In [15]:
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_recall_curve, auc
from rdkit import Chem
from rdkit.Chem import Draw
import matplotlib.pyplot as plt

device = torch.device('cpu') # version mismatch with accelerate so training on CPU (avoiding MPS)
model.to(device)
model.eval()

# Creating test DataLoader
test_loader = DataLoader(tokenized_test, batch_size=16, shuffle=False)

all_preds = []
all_labels = []
all_probs = []

print("Running evaluation on test set...")

with torch.no_grad():
    for batch in tqdm(test_loader):
        inputs = {
            'input_ids': batch['input_ids'].to(device),
            'attention_mask': batch['attention_mask'].to(device),
        }
        labels = batch['labels'].to(device)
        
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=-1)[:, 1].cpu().numpy()  # Class 1 - permeable
        preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
        
        all_probs.extend(probs)
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())

# Computing metric
accuracy = accuracy_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds)
roc_auc = roc_auc_score(all_labels, all_probs)
precision, recall, _ = precision_recall_curve(all_labels, all_probs)
pr_auc = auc(recall, precision)

print("\n=== Test Set Evaluation ===")
print(f"Accuracy:     {accuracy:.4f}")
print(f"F1 Score:     {f1:.4f}")
print(f"ROC-AUC:      {roc_auc:.4f}")
print(f"PR-AUC:       {pr_auc:.4f}")

Running evaluation on test set...


100%|██████████████████████████████████████████| 26/26 [00:05<00:00,  4.96it/s]


=== Test Set Evaluation ===
Accuracy:     0.8448
F1 Score:     0.9038
ROC-AUC:      0.8321
PR-AUC:       0.9413



