In [1]:
import pandas as pd
import os
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler

In [2]:
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer, AdamW
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import wandb

# Start a Weights & Biases run under this project; the wandb.log(...) calls in
# the training and evaluation cells below record their metrics to it.
wandb.init(project="chemberta-smiles-binary-classification")

class ChemBERTaWithFeatures(nn.Module):
    """Binary classifier joining a pretrained encoder embedding with extra features.

    The encoder's hidden state at sequence position 0 is concatenated with the
    batch-normalised feature vector, passed through dropout, and projected by a
    single linear layer to one sigmoid-activated output per sample.
    """

    def __init__(self, chemberta_model_name, feature_dim):
        """Build the encoder and the classification head.

        Args:
            chemberta_model_name: Identifier handed to ``AutoModel.from_pretrained``.
            feature_dim: Width of the extra feature vector supplied per sample.
        """
        super().__init__()
        self.chemberta = AutoModel.from_pretrained(chemberta_model_name)
        self.dropout = nn.Dropout(p=0.1)
        self.feature_batch_norm = nn.BatchNorm1d(feature_dim)
        # The head sees the encoder embedding and the extra features side by side.
        combined_dim = self.chemberta.config.hidden_size + feature_dim
        self.classifier = nn.Linear(combined_dim, 1)

    def forward(self, input_ids, attention_mask, features):
        """Return sigmoid outputs of shape ``(batch, 1)``.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.
            features: Extra per-sample features; they are batch-normalised and
                concatenated to the encoder embedding along dimension 1.
        """
        encoder_out = self.chemberta(input_ids, attention_mask=attention_mask)
        # Position 0 of the last hidden state serves as the sequence summary.
        first_token = encoder_out.last_hidden_state[:, 0]
        scaled_features = self.feature_batch_norm(features)
        joined = self.dropout(torch.cat([first_token, scaled_features], dim=1))
        return torch.sigmoid(self.classifier(joined))

# Dataset class
class MoleculeDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with feature rows and labels."""

    def __init__(self, encodings, features, labels):
        """Keep references to the three index-aligned containers.

        Args:
            encodings: Mapping of field name to a per-sample indexable sequence.
            features: Indexable collection of per-sample feature rows.
            labels: Indexable collection of per-sample labels; its length
                defines the dataset size.
        """
        self.encodings = encodings
        self.features = features
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the label container."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Assemble sample ``idx`` as a dict of tensors.

        Every encoding field is copied under its own key; ``'features'`` and
        ``'labels'`` are added as float32 tensors.
        """
        sample = {}
        for name, column in self.encodings.items():
            sample[name] = torch.tensor(column[idx])
        sample['features'] = torch.tensor(self.features[idx], dtype=torch.float32)
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float32)
        return sample



Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Currently logged in as: [33mmparsa[0m ([33mmypersonalteam[0m). Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112743155616852, max=1.0…

In [3]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score
def compute_metrics(preds, labels):
    """Score predicted probabilities against the true labels.

    Args:
        preds: Predicted probabilities; rounded with ``preds.round()`` to get
            hard class predictions before scoring.
        labels: Ground-truth labels.

    Returns:
        ``(accuracy, precision)``. Precision is computed with
        ``zero_division=0`` so the case with no positive predictions does not
        raise or warn.
    """
    hard_preds = preds.round()
    return (
        accuracy_score(labels, hard_preds),
        precision_score(labels, hard_preds, zero_division=0),
    )

In [9]:

# Data loading, model setup and training.
#
# Reads the training/validation CSVs, standardises the numeric descriptor
# columns, tokenizes the SMILES strings and fine-tunes ChemBERTaWithFeatures,
# logging loss / accuracy / precision for both splits to W&B after every epoch.

# Descriptor columns fed to the model next to the SMILES tokens.
features_columns = ['Molecular Weight', 'LogP', 'Number of Atoms',
       'Number of Bonds', 'Number of Rings', 'Rotatable Bonds Count',
       'Hydrogen Bond Donors', 'Hydrogen Bond Acceptors',
       'Number of Stereocenters', 'Topological Polar Surface Area (TPSA)']

# Tunables, gathered in one place. The data directory can be overridden with
# the SMILES_DATA_DIR environment variable; the default is the original path.
DATA_DIR = os.environ.get('SMILES_DATA_DIR', '/home/parsa/smiles_classification')
MODEL_NAME = 'DeepChem/ChemBERTa-77M-MLM'
SEED = 42
BATCH_SIZE = 16
MAX_LENGTH = 512
LEARNING_RATE = 5e-5
num_epochs = 100

# Seed the row shuffles, weight initialisation, dropout and DataLoader shuffling
# so a Restart & Run All reproduces the same run.
torch.manual_seed(SEED)

train_data = pd.read_csv(os.path.join(DATA_DIR, 'training_w_features.csv')).sample(frac=1, random_state=SEED)
val_data = pd.read_csv(os.path.join(DATA_DIR, 'validation_w_features.csv')).sample(frac=1, random_state=SEED)

X_train, X_val = train_data[features_columns].values, val_data[features_columns].values
# NOTE(review): the label column is read as 'Results' from the training file
# but as 'RESULT' from the validation file — confirm against the CSV headers.
y_train, y_val = train_data['Results'].values, val_data['RESULT'].values

# Normalize features: fit on the training split only, then apply to both.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
train_encodings = tokenizer(list(train_data['SMILES']), truncation=True, padding=True, max_length=MAX_LENGTH)
val_encodings = tokenizer(list(val_data['SMILES']), truncation=True, padding=True, max_length=MAX_LENGTH)

train_dataset = MoleculeDataset(train_encodings, X_train_scaled, y_train)
val_dataset = MoleculeDataset(val_encodings, X_val_scaled, y_val)

# Set up DataLoader.
# BatchNorm1d cannot normalise a single-sample batch in training mode, so the
# trailing training batch is dropped only when it would hold exactly one sample.
drop_singleton_batch = len(train_dataset) > BATCH_SIZE and len(train_dataset) % BATCH_SIZE == 1
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=drop_singleton_batch)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Define the device: keep using the second GPU when there is one, otherwise
# fall back to the first GPU or the CPU instead of failing on "cuda:1".
if torch.cuda.device_count() > 1:
    device = torch.device("cuda:1")
elif torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Initialize model; the feature width follows the column list above.
model = ChemBERTaWithFeatures(MODEL_NAME, feature_dim=len(features_columns)).to(device)
loss_function = nn.BCELoss()
# NOTE(review): AdamW here is the one imported from transformers, which that
# library has deprecated in favour of torch.optim.AdamW — left unchanged because
# the two differ in default eps / weight decay; confirm before switching.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)


def _run_epoch(model, loader, loss_function, device, optimizer=None):
    """Run one pass over ``loader`` and return ``(mean_loss, accuracy, precision)``.

    With an ``optimizer`` the model is put in training mode and updated after
    every batch. Without one the model is put in eval mode and autograd is
    disabled. ``mean_loss`` is the average of the per-batch losses.
    """
    training = optimizer is not None
    model.train(training)
    loss_sum = 0.0
    epoch_preds, epoch_labels = [], []
    with torch.set_grad_enabled(training):
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            features = batch['features'].to(device)
            labels = batch['labels'].to(device)
            if training:
                optimizer.zero_grad()
            # Flatten (batch, 1) -> (batch,) so outputs line up with the labels.
            outputs = model(input_ids, attention_mask, features).squeeze(1)
            loss = loss_function(outputs, labels)
            if training:
                loss.backward()
                optimizer.step()
            loss_sum += loss.item()
            epoch_preds.append(outputs.detach().cpu())
            epoch_labels.append(labels.detach().cpu())
    accuracy, precision = compute_metrics(torch.cat(epoch_preds), torch.cat(epoch_labels))
    return loss_sum / len(loader), accuracy, precision


for epoch in range(num_epochs):
    mean_train_loss, train_accuracy, train_precision = _run_epoch(
        model, train_loader, loss_function, device, optimizer
    )
    wandb.log({"train_loss": mean_train_loss, "train_accuracy": train_accuracy, "train_precision": train_precision})

    # Validation
    mean_val_loss, val_accuracy, val_precision = _run_epoch(model, val_loader, loss_function, device)
    wandb.log({"val_loss": mean_val_loss, "val_accuracy": val_accuracy, "val_precision": val_precision})

    print(f"Epoch {epoch+1}: Train Loss: {mean_train_loss}, Train Accuracy: {train_accuracy}, Train Precision: {train_precision}")
    print(f"Epoch {epoch+1}: Val Loss: {mean_val_loss}, Val Accuracy: {val_accuracy}, Val Precision: {val_precision}")





Some weights of RobertaModel were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MLM and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




Epoch 1: Train Loss: 0.6817942215846136, Train Accuracy: 0.5689655172413793, Train Precision: 0.5786516853932584
Epoch 1: Val Loss: 0.6392378211021423, Val Accuracy: 0.6, Val Precision: 0.5925925925925926
Epoch 2: Train Loss: 0.6751704720350412, Train Accuracy: 0.6059113300492611, Train Precision: 0.6069651741293532
Epoch 2: Val Loss: 0.633136585354805, Val Accuracy: 0.62, Val Precision: 0.5882352941176471
Epoch 3: Train Loss: 0.6578912093089178, Train Accuracy: 0.6379310344827587, Train Precision: 0.6458333333333334
Epoch 3: Val Loss: 0.6219532191753387, Val Accuracy: 0.7, Val Precision: 0.6785714285714286
Epoch 4: Train Loss: 0.6474480330944061, Train Accuracy: 0.6896551724137931, Train Precision: 0.6896551724137931
Epoch 4: Val Loss: 0.6149758249521255, Val Accuracy: 0.7, Val Precision: 0.6666666666666666
Epoch 5: Train Loss: 0.6289966610761789, Train Accuracy: 0.7044334975369458, Train Precision: 0.7004830917874396
Epoch 5: Val Loss: 0.6069946885108948, Val Accuracy: 0.7, Val Preci

In [26]:
# One more pass over the validation loader with the model in eval mode and
# autograd disabled; the mean per-batch loss is logged to W&B and printed.
model.eval()
val_loss = 0
with torch.no_grad():
    for batch in val_loader:
        # Move the four tensors the model and loss need onto the model's device.
        input_ids, attention_mask, features, labels = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'features', 'labels')
        )
        outputs = model(input_ids, attention_mask, features)
        # The model emits (batch, 1); give the labels the same trailing axis.
        loss = loss_function(outputs, labels[:, None])
        val_loss += loss.item()

mean_val_loss = val_loss / len(val_loader)
wandb.log({"val_loss": mean_val_loss})

print(f"Validation Loss: {mean_val_loss}")


Validation Loss: 0.9494809210300446


In [11]:
# Descriptor columns of train_df, selected through the shared features_columns
# list (defined in the data-loading cell) so this frame cannot drift from the
# columns the scaler and model were trained on.
# NOTE(review): train_df is not defined in any cell shown in this notebook —
# confirm where it comes from, or whether train_data was intended.
features_df = train_df[features_columns]

In [12]:
# Preview the raw (unscaled) descriptor matrix as a tensor.
torch.tensor(features_df.to_numpy())


tensor([[ 2.3430e+02,  1.2444e+00,  1.7000e+01,  ...,  4.0000e+00,
          1.0000e+00,  5.1020e+01],
        [ 2.3634e+02,  2.1663e+00,  1.6000e+01,  ...,  3.0000e+00,
          0.0000e+00,  3.3200e+01],
        [ 2.7033e+02,  3.0282e+00,  2.0000e+01,  ...,  3.0000e+00,
          0.0000e+00,  4.6340e+01],
        ...,
        [ 2.2332e+02,  1.6419e+00,  1.6000e+01,  ...,  2.0000e+00,
          1.0000e+00,  4.9330e+01],
        [ 2.0732e+02,  2.5996e+00,  1.5000e+01,  ...,  1.0000e+00,
          2.0000e+00,  2.0310e+01],
        [ 1.8222e+02, -9.0600e-02,  1.3000e+01,  ...,  2.0000e+00,
          1.0000e+00,  4.9410e+01]], dtype=torch.float64)

In [15]:
# Score every molecule in train_df with the trained model.
# NOTE(review): train_df is not defined in any cell shown in this notebook —
# confirm its origin before relying on these predictions.
inputs = tokenizer(
    train_df.SMILES.tolist(),
    return_tensors="pt",
    padding=True,
    truncation=True,   # same truncation settings as the training encodings
    max_length=512,
)
features = torch.tensor(features_df.values)

# Normalize with the StandardScaler already fitted on the training features.
# Refitting a fresh scaler here would overwrite it and scale inference inputs
# with different statistics from those the model was trained on.
normalized_features = torch.tensor(scaler.transform(features_df.values), dtype=torch.float32)

# Forward pass: eval mode disables dropout and makes BatchNorm use its running
# statistics; no_grad avoids building an autograd graph over the whole set.
# Inputs are moved to whichever device the model's parameters live on.
model.eval()
model_device = next(model.parameters()).device
with torch.no_grad():
    probabilities = model(
        inputs['input_ids'].to(model_device),
        inputs['attention_mask'].to(model_device),
        normalized_features.to(model_device),
    ).cpu()
print(probabilities)

tensor([[0.4527],
        [0.5496],
        [0.4562],
        [0.4339],
        [0.5071],
        [0.4529],
        [0.5177],
        [0.5702],
        [0.4314],
        [0.4699],
        [0.4849],
        [0.4800],
        [0.5121],
        [0.4870],
        [0.5081],
        [0.4606],
        [0.4801],
        [0.5726],
        [0.5739],
        [0.4679],
        [0.5324],
        [0.4366],
        [0.5080],
        [0.5031],
        [0.5974],
        [0.5608],
        [0.5644],
        [0.4994],
        [0.5333],
        [0.5753],
        [0.5784],
        [0.5474],
        [0.5097],
        [0.5337],
        [0.4936],
        [0.4952],
        [0.5226],
        [0.5336],
        [0.5440],
        [0.5430],
        [0.4722],
        [0.5452],
        [0.5578],
        [0.4671],
        [0.5416],
        [0.5158],
        [0.5422],
        [0.5308],
        [0.4809],
        [0.4390],
        [0.5561],
        [0.4472],
        [0.4518],
        [0.4788],
        [0.5015],
        [0