In [1]:
# First, install deepchem
!pip install --pre deepchem

Collecting deepchem
  Downloading deepchem-2.7.2.dev20240322175253-py3-none-any.whl (1.0 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/1.0 MB[0m [31m4.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.0/1.0 MB[0m [31m15.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
Collecting rdkit (from deepchem)
  Downloading rdkit-2023.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.4/34.4 MB[0m [31m36.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit, deepchem
Successfully installed deepchem-2.7.2.dev20240322175253 rdkit-2023.9.5


In [2]:
import deepchem
deepchem.__version__

Instructions for updating:
experimental_relax_shapes is deprecated, use reduce_retracing instead


'2.7.2.dev'

In [3]:
from rdkit import Chem

We want to install NVIDIA's Apex tool, for the training pipeline used by `simple-transformers` and Weights and Biases. This package enables us to use 16-bit training, mixed precision, and distributed training without any changes to our code. Generally GPUs are good at doing 32-bit(single precision) math, not at 16-bit(half) nor 64-bit(double precision). Therefore traditionally deep learning model trainings are done in 32-bit. By switching to 16-bit, we’ll be using half the memory and theoretically less computation at the expense of the available number range and precision. However, pure 16-bit training creates a lot of problems for us (imprecise weight updates, gradient underflow and overflow). **Mixed precision training, with Apex, alleviates these problems**.

We will be installing `simple-transformers`, a library which builds ontop of HuggingFace's `transformers` package specifically for fine-tuning ChemBERTa.

In [4]:
!git clone https://github.com/NVIDIA/apex
!cd /content/apex
!pip install -v --no-cache-dir /content/apex
!pip install transformers
!pip install simpletransformers
!pip install wandb
!cd ..

Cloning into 'apex'...
remote: Enumerating objects: 11638, done.[K
remote: Counting objects: 100% (3731/3731), done.[K
remote: Compressing objects: 100% (579/579), done.[K
remote: Total 11638 (delta 3371), reused 3280 (delta 3149), pack-reused 7907[K
Receiving objects: 100% (11638/11638), 15.47 MiB | 30.17 MiB/s, done.
Resolving deltas: 100% (8173/8173), done.
Using pip 23.1.2 from /usr/local/lib/python3.10/dist-packages/pip (python 3.10)
Processing ./apex
  Running command pip subprocess to install build dependencies
  Using pip 23.1.2 from /usr/local/lib/python3.10/dist-packages/pip (python 3.10)
  Collecting setuptools
    Downloading setuptools-69.2.0-py3-none-any.whl (821 kB)
       ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 821.5/821.5 kB 7.2 MB/s eta 0:00:00
  Collecting wheel
    Downloading wheel-0.43.0-py3-none-any.whl (65 kB)
       ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 65.8/65.8 kB 6.4 MB/s eta 0:00:00
  Installing collected packages: wheel, setuptools
    Creating /tm

In [5]:
import sys
!test -d bertviz_repo && echo "FYI: bertviz_repo directory already exists, to pull latest version uncomment this line: !rm -r bertviz_repo"
# !rm -r bertviz_repo # Uncomment if you need a clean pull from repo
!test -d bertviz_repo || git clone https://github.com/jessevig/bertviz bertviz_repo
if not 'bertviz_repo' in sys.path:
  sys.path += ['bertviz_repo']
!pip install regex

Cloning into 'bertviz_repo'...
remote: Enumerating objects: 1633, done.[K
remote: Counting objects: 100% (329/329), done.[K
remote: Compressing objects: 100% (118/118), done.[K
remote: Total 1633 (delta 231), reused 225 (delta 211), pack-reused 1304[K
Receiving objects: 100% (1633/1633), 198.37 MiB | 33.45 MiB/s, done.
Resolving deltas: 100% (1073/1073), done.


We're going to clone an auxillary repository, bert-loves-chemistry, which will enable us to use the MolNet dataloader for ChemBERTa, which automatically generates scaffold splits on any MoleculeNet dataset!

In [6]:
!git clone https://github.com/seyonechithrananda/bert-loves-chemistry.git

Cloning into 'bert-loves-chemistry'...
remote: Enumerating objects: 1566, done.[K
remote: Counting objects: 100% (202/202), done.[K
remote: Compressing objects: 100% (111/111), done.[K
remote: Total 1566 (delta 96), reused 92 (delta 91), pack-reused 1364[K
Receiving objects: 100% (1566/1566), 55.35 MiB | 27.12 MiB/s, done.
Resolving deltas: 100% (1000/1000), done.


In [7]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Load models

In [8]:
from transformers import AutoModelForMaskedLM, AutoTokenizer, pipeline, RobertaModel, RobertaTokenizer
from bertviz import head_view

model = AutoModelForMaskedLM.from_pretrained("seyonec/PubChem10M_SMILES_BPE_450k")
tokenizer = AutoTokenizer.from_pretrained("seyonec/PubChem10M_SMILES_BPE_450k")

fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/515 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/336M [00:00<?, ?B/s]

Some weights of the model checkpoint at seyonec/PubChem10M_SMILES_BPE_450k were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/165k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/101k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [9]:
from transformers import RobertaModel, RobertaTokenizer
from bertviz import head_view

model_version = 'seyonec/PubChem10M_SMILES_BPE_450k'
model = RobertaModel.from_pretrained(model_version, output_attentions=True)
tokenizer = RobertaTokenizer.from_pretrained(model_version)

### Define the Paths

In [10]:
TRAIN_PATH = "/content/drive/Shareddrives/1:1_Aiden_Chavda/Dataset/train_smile.csv"

In [11]:
import pandas as pd

# load the data
train = pd.read_csv(TRAIN_PATH)

In [12]:
# seperate features and labels

X_train = train['canonical_smiles']
y_train = train['pIC50']

### Define the model

In [13]:
from torch.utils.data import DataLoader, TensorDataset
import torch
from torch.nn import functional as F
from transformers import AutoTokenizer, RobertaModel
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, r2_score
from tqdm.notebook import tqdm
import numpy as np

# Define the Regression model
class RobertaForRegression(torch.nn.Module):
    def __init__(self, model):
        super(RobertaForRegression, self).__init__()
        self.roberta = model
        # Assuming using 'roberta-base'
        # 768 is the output dimension of the roberta model
        # you can add more linear layers
        # need to define the correct input dimension = output dimension of the respective above layer
        # and required output dimension
        self.regressor = torch.nn.Linear(768, 64)
        self.regressor_layer_one = torch.nn.Linear(64, 32)
        self.regressor_layer_two = torch.nn.Linear(32,1)

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        pooler_output = outputs.pooler_output
        # make sure to defne the inputs and outputs for every layer defined in class constructor
        regression_output = self.regressor(pooler_output)
        regression_layer_one_output = self.regressor_layer_one(regression_output)
        regression_layer_two_output = self.regressor_layer_two(regression_layer_one_output)
        return regression_layer_two_output

# Setup tokenizer
tokenizer = AutoTokenizer.from_pretrained("seyonec/PubChem10M_SMILES_BPE_450k")

### Training

In [14]:
# Define loss function and learning rate and epochs
def train_chemberta_model(learning_rate, num_epochs):
  loss_fn = torch.nn.MSELoss()

  # KFold Cross-validation
  kf = KFold(n_splits=5, shuffle=True, random_state=42)
  mse_scores = []
  r2_score_set = []

  for train_index, valid_index in kf.split(X_train):
      X_train_fold, X_valid_fold = X_train.iloc[train_index], X_train.iloc[valid_index]
      y_train_fold, y_valid_fold = y_train.iloc[train_index], y_train.iloc[valid_index]

      # Create DataLoader for training fold
      input_encodings_train = tokenizer(list(X_train_fold), return_tensors="pt", padding=True, truncation=True, max_length=512)
      train_dataset = TensorDataset(input_encodings_train["input_ids"], input_encodings_train["attention_mask"], torch.tensor(list(y_train_fold)).unsqueeze(-1))
      train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)

      # Create DataLoader for validation fold
      input_encodings_valid = tokenizer(list(X_valid_fold), return_tensors="pt", padding=True, truncation=True, max_length=512)
      valid_dataset = TensorDataset(input_encodings_valid["input_ids"], input_encodings_valid["attention_mask"], torch.tensor(list(y_valid_fold)).unsqueeze(-1))
      valid_dataloader = DataLoader(valid_dataset, batch_size=64, shuffle=True)

      # Initialize model and optimizer
      model = RobertaForRegression(RobertaModel.from_pretrained("seyonec/PubChem10M_SMILES_BPE_450k")).to("cuda" if torch.cuda.is_available() else "cpu")
      optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

      # Training loop
      for epoch in tqdm(range(num_epochs)):
          model.train()
          for batch in train_dataloader:
              batch = tuple(t.to("cuda" if torch.cuda.is_available() else "cpu") for t in batch)
              input_ids_batch, attention_masks_batch, labels_batch = batch
              outputs = model(input_ids_batch, attention_mask=attention_masks_batch)
              loss = loss_fn(outputs, labels_batch)

              optimizer.zero_grad()
              loss.backward()
              optimizer.step()

      # Validation loop
      model.eval()
      all_preds = []
      all_labels = []
      with torch.no_grad():
          for batch in valid_dataloader:
              batch = tuple(t.to("cuda" if torch.cuda.is_available() else "cpu") for t in batch)
              input_ids_batch, attention_masks_batch, labels_batch = batch
              outputs = model(input_ids_batch, attention_mask=attention_masks_batch)
              all_preds.extend(outputs.cpu().numpy())
              all_labels.extend(labels_batch.cpu().numpy())

      mse_scores.append(mean_squared_error(all_labels, all_preds))
      r2_score_set.append(r2_score(all_labels, all_preds))

  print("MSE scores from 5-fold cross validation:", mse_scores)
  print("Root Mean Square Error: ", np.sqrt(np.mean(mse_scores)))
  print("R2 Score: {}".format(np.mean(r2_score_set)))
  return [mse_scores, np.mean(r2_score_set), np.sqrt(np.mean(mse_scores))]

In [15]:
learning_rates = [0.00001, 0.00005, 0.0001, 0.0005, 0.001, 0.005]
all_mse_scores = []
all_r2_scores = []
all_rmse_scores = []
#for i in learning_rates:
  #result = train_chemberta_model(i, 50)
  #all_mse_scores.append(result[0])
  #all_r2_scores.append(result[1])
  #all_rmse_scores.append(result[2])
  #print("finished " + str(i))
#print("|".join(list(map(str, all_mse_scores))))
#print("|".join(list(map(str, all_r2_scores))))
#print("|".join(list(map(str, all_rmse_scores))))

In [16]:
import os
learning_rate = 0.00005
num_epochs = 40
loss_fn = torch.nn.MSELoss()

# KFold Cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
mse_scores = []
r2_score_set = []

for train_index, valid_index in kf.split(X_train):
    X_train_fold, X_valid_fold = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_train_fold, y_valid_fold = y_train.iloc[train_index], y_train.iloc[valid_index]

    # Create DataLoader for training fold
    input_encodings_train = tokenizer(list(X_train_fold), return_tensors="pt", padding=True, truncation=True, max_length=512)
    train_dataset = TensorDataset(input_encodings_train["input_ids"], input_encodings_train["attention_mask"], torch.tensor(list(y_train_fold)).unsqueeze(-1))
    train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)

    # Create DataLoader for validation fold
    input_encodings_valid = tokenizer(list(X_valid_fold), return_tensors="pt", padding=True, truncation=True, max_length=512)
    valid_dataset = TensorDataset(input_encodings_valid["input_ids"], input_encodings_valid["attention_mask"], torch.tensor(list(y_valid_fold)).unsqueeze(-1))
    valid_dataloader = DataLoader(valid_dataset, batch_size=64, shuffle=True)

    # Initialize model and optimizer
    model = RobertaForRegression(RobertaModel.from_pretrained("seyonec/PubChem10M_SMILES_BPE_450k")).to("cuda" if torch.cuda.is_available() else "cpu")
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop
    for epoch in tqdm(range(num_epochs)):
        model.train()
        for batch in train_dataloader:
            batch = tuple(t.to("cuda" if torch.cuda.is_available() else "cpu") for t in batch)
            input_ids_batch, attention_masks_batch, labels_batch = batch
            outputs = model(input_ids_batch, attention_mask=attention_masks_batch)
            loss = loss_fn(outputs, labels_batch)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # Validation loop
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in valid_dataloader:
            batch = tuple(t.to("cuda" if torch.cuda.is_available() else "cpu") for t in batch)
            input_ids_batch, attention_masks_batch, labels_batch = batch
            outputs = model(input_ids_batch, attention_mask=attention_masks_batch)
            all_preds.extend(outputs.cpu().numpy())
            all_labels.extend(labels_batch.cpu().numpy())
# Save the entire model to a file
SAVE_MODEL_PATH = "/content/drive/Shareddrives/1:1_Aiden_Chavda/Results"
MODEL_NAME = "pubchem.pth"

# save the entire model with architecture
torch.save(model, os.path.join(SAVE_MODEL_PATH, MODEL_NAME))

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]