In [10]:
import torch
from torch.utils.data import Dataset
from torch.nn.functional import mse_loss

# !pip install transformers[torch]
from transformers import Trainer, TrainingArguments
from transformers import EarlyStoppingCallback, Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification, AutoConfig
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

import pandas as pd
# !pip install wandb
import wandb

# !pip install accelerate -U
import accelerate

from transformers import pipeline

from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import pearsonr
import numpy as np

# Read the pre-split Delaney CSV files into DataFrames (train first, then validation).
train_df, valid_df = (
    pd.read_csv(csv_path)
    for csv_path in ("train_delaney_1.csv", "valid_delaney_1.csv")
)

# Report how many rows each split holds.
print(f"There are {len(train_df)} molecules in train df.")
print(f"There are {len(valid_df)} molecules in val df.")

There are 676 molecules in train df.
There are 113 molecules in val df.


In [11]:
# Authenticate with Weights & Biases without embedding a secret in the notebook.
# The previous `!export WANDB_API_KEY=...` line had two problems:
#   * `!` runs the command in a separate shell, so the exported variable never
#     reached this kernel's environment;
#   * it stored a live API key in the notebook (that key should be revoked).
# `wandb.login()` uses WANDB_API_KEY from the environment or a previously cached
# login when available, and otherwise prompts for the key interactively.
wandb.login()

In [12]:
import logging

# NOTE(review): `device` is not referenced by any other cell visible in this notebook.
device = "cuda"
# Name of the regression target column read from the train/validation DataFrames.
label = "logSolubility"

# Initialize W&B and define the sweep configuration.
# Random search over learning rate (continuous range), weight decay and
# per-device train batch size (discrete choices).
# The metric name must match the key the runs actually log: the run histories and
# summaries in this notebook report it as "eval/loss" (not "eval_loss"), so that is
# the name the sweep has to reference to track the objective.
sweep_config = {
    'method': 'random',
    'metric': {'name': 'eval/loss', 'goal': 'minimize'},
    'parameters': {
        'learning_rate': {'min': 5e-6, 'max': 5e-5},
        'weight_decay': {'values': [0.0, 5e-4, 1e-3]},
        'per_device_train_batch_size': {'values': [16, 32, 64]}
    }
}

# sweep_config = {
#     'method': 'random',
#     'metric': {'name': 'eval_loss', 'goal': 'minimize'},
#     'parameters': {
#         'learning_rate': {'values': [5e-5]},
#         'weight_decay': {'values': [0.0]},
#         'per_device_train_batch_size': {'values': [8]}
#     }
# }

# Register the sweep definition with W&B under the given project. This creates a
# sweep (not a run); the returned id is what `wandb.agent` is given later to
# launch the individual runs.
sweep_id = wandb.sweep(sweep_config, project="chemberta-delaney-hparams")

# Configure the root logger at INFO, then raise the "transformers" logger to
# WARNING so that library's INFO messages are suppressed.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

Create sweep with ID: vqs4onpd
Sweep URL: https://wandb.ai/data-fusion/chemberta-delaney-hparams/sweeps/vqs4onpd


In [13]:
class Input(Dataset):
    """Torch dataset that tokenizes SMILES strings and attaches a regression label.

    Each item is the tokenizer output for one row of ``data`` with the leading
    batch dimension removed, plus a ``"labels"`` float tensor of shape ``(1,)``.

    Args:
        data: pandas DataFrame containing the SMILES and label columns.
        tokenizer: callable tokenizer; it is invoked with ``return_tensors="pt"``,
            ``padding='max_length'``, ``truncation=True`` and ``max_length``.
        max_length: length every tokenized sequence is padded/truncated to.
        labels_mean: mean used to standardize the label.
        labels_std: standard deviation used to standardize the label.
            Standardization is applied only when both ``labels_mean`` and
            ``labels_std`` are given; otherwise the raw label is returned.
        smiles_col: name of the column holding SMILES strings.
        label_col: name of the column holding the regression target.

    Raises:
        ValueError: if ``labels_std`` is zero (standardizing would divide by zero).
    """

    def __init__(self, data, tokenizer, max_length, labels_mean=None, labels_std=None,
                 smiles_col="smiles", label_col="logSolubility"):
        # Fail early instead of silently producing inf/nan labels during training.
        if labels_std is not None and labels_std == 0:
            raise ValueError("labels_std must be non-zero to standardize labels")

        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Use provided mean and std for normalization
        self.labels_mean = labels_mean
        self.labels_std = labels_std
        self.smiles_col = smiles_col
        self.label_col = label_col

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return the encoding with a ``"labels"`` entry."""
        # Look the row up once and read both columns from it.
        row = self.data.iloc[idx]

        inputs = self.tokenizer(
            row[self.smiles_col],
            return_tensors="pt",
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
        )
        # return_tensors="pt" yields tensors with a leading batch dimension of 1;
        # drop it so items can be stacked into batches later.
        for key in ("input_ids", "attention_mask", "token_type_ids"):
            if key in inputs:
                inputs[key] = inputs[key].squeeze(0)

        # Check if mean and std are provided before normalization
        label_value = float(row[self.label_col])
        if self.labels_mean is not None and self.labels_std is not None:
            label_value = (label_value - self.labels_mean) / self.labels_std

        inputs["labels"] = torch.tensor(label_value, dtype=torch.float).unsqueeze(0)

        return inputs

In [14]:
# Label statistics are taken from the training split only; they standardize the
# training targets below.
training_mean = train_df[label].mean()
training_std = train_df[label].std()

# Pretrained checkpoint identifier, with its tokenizer and configuration.
model_name = "DeepChem/ChemBERTa-77M-MTR"
tokenizer = AutoTokenizer.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name)

# Every tokenized SMILES string is padded/truncated to this length.
max_length = 512

# Training set: labels are standardized with the training mean/std.
train_dataset = Input(
    data=train_df,
    tokenizer=tokenizer,
    max_length=max_length,
    labels_mean=training_mean,
    labels_std=training_std,
)
print(f"this is the first training set label: {train_dataset[0]['labels']} with mean {train_dataset.labels_mean}.")

# Validation set: no mean/std is passed, so labels stay on their original scale.
eval_dataset = Input(data=valid_df, tokenizer=tokenizer, max_length=max_length)
print(f"this is the first valid set label: {eval_dataset[0]['labels']} with mean {eval_dataset.labels_mean}.")

this is the first training set label: tensor([0.7691]) with mean -2.990841715976331.
this is the first valid set label: tensor([-1.4880]) with mean None.




In [15]:
class RegressionTrainer(Trainer):
    """Trainer that uses RMSE as the loss for single-target regression.

    During training the model output is compared directly with the labels it is
    given. During evaluation the output is first mapped back to the original
    label scale (``prediction * labels_std + labels_mean``) and then compared
    with the labels.

    Args:
        labels_mean: mean used to de-standardize predictions at evaluation time.
            ``None`` means no shift is applied.
        labels_std: standard deviation used to de-standardize predictions at
            evaluation time. ``None`` means no rescaling is applied.
        *args, **kwargs: forwarded unchanged to ``Trainer.__init__``.

    Note:
        ``labels_mean`` and ``labels_std`` come before ``*args`` in the
        signature, so pass them (and the ``Trainer`` arguments) by keyword;
        positional arguments would bind to these two parameters first.
    """

    def __init__(self, labels_mean=None, labels_std=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.labels_mean = labels_mean
        self.labels_std = labels_std

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the RMSE loss for one batch.

        Args:
            model: the model being trained/evaluated; ``model.training`` selects
                the branch.
            inputs: batch mapping; its ``"labels"`` entry is removed and the
                rest is passed to the model as keyword arguments.
            return_outputs: when true, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: accepted and ignored, so the override stays
                callable by ``Trainer`` versions that pass this argument.

        Returns:
            The scalar RMSE tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        predictions = outputs[0]

        if not model.training:
            # During evaluation, reverse normalize predictions before calculating
            # RMSE with original labels. A missing statistic is treated as
            # "nothing to undo" rather than failing on `tensor * None`.
            if self.labels_std is not None:
                predictions = predictions * self.labels_std
            if self.labels_mean is not None:
                predictions = predictions + self.labels_mean

        # NOTE(review): this is an RMSE per batch; presumably the reported
        # evaluation loss is an aggregate of these per-batch values, which would
        # explain why "Validation Loss" differs from "Rmse" in the logged tables.
        loss = torch.sqrt(mse_loss(predictions, labels))

        return (loss, outputs) if return_outputs else loss

In [16]:
import torch
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from scipy.stats import pearsonr
from torch.nn.functional import mse_loss

def train(config=None):
    """Run one sweep trial: build the model, fine-tune it and evaluate it.

    Opens a W&B run, reads ``per_device_train_batch_size``, ``learning_rate``
    and ``weight_decay`` from ``wandb.config``, and trains a single-output
    sequence-classification model with ``RegressionTrainer``.

    Relies on module-level names defined in earlier cells: ``model_name``,
    ``train_dataset``, ``eval_dataset``, ``training_mean`` and ``training_std``.

    Args:
        config: optional configuration forwarded to ``wandb.init``.
    """

    def compute_metrics(eval_pred):
        """Compute R2, RMSE, Pearson r and MAE after undoing label standardization."""
        raw_preds, targets = eval_pred
        targets = targets.flatten()
        # Take the first output column and map it back to the original label scale.
        rescaled = raw_preds[:, 0] * training_std + training_mean

        mse = mean_squared_error(targets, rescaled)
        return {
            'eval_r2': r2_score(targets, rescaled),
            'eval_rmse': torch.sqrt(torch.tensor(mse)).item(),  # Python scalar
            'eval_pearson': pearsonr(targets, rescaled)[0],
            'eval_mae': mean_absolute_error(targets, rescaled)
        }

    with wandb.init(config=config):
        # Hyperparameters chosen for this trial.
        run_config = wandb.config

        # Single-output head, i.e. one regression target.
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)

        training_args = TrainingArguments(
            output_dir="./esol-tf-output",
            report_to='wandb',  # send logs to Weights & Biases
            num_train_epochs=100,
            do_eval=True,
            # Log, evaluate and save on a step schedule.
            logging_strategy='steps',
            evaluation_strategy='steps',
            save_strategy='steps',
            logging_steps=10,
            # Values drawn by the sweep.
            learning_rate=run_config.learning_rate,
            weight_decay=run_config.weight_decay,
            per_device_train_batch_size=run_config.per_device_train_batch_size,
            # Restore the checkpoint with the best eval_loss once training ends.
            load_best_model_at_end=True,
            metric_for_best_model='eval_loss'
        )

        trainer = RegressionTrainer(
            labels_mean=training_mean,
            labels_std=training_std,
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=compute_metrics
        )

        trainer.train()

# Launch a sweep agent that executes `train` for three trials of the registered sweep.
wandb.agent(sweep_id, function=train, count=3)

[34m[1mwandb[0m: Agent Starting Run: evhfb7xb with config:
[34m[1mwandb[0m: 	learning_rate: 1.753735240575967e-05
[34m[1mwandb[0m: 	per_device_train_batch_size: 64
[34m[1mwandb[0m: 	weight_decay: 0.001


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MTR and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,R2,Rmse,Pearson,Mae
10,1.0001,1.816167,0.004432,1.923761,0.601018,1.497107
20,0.979,1.787174,0.034875,1.89412,0.7614,1.469256
30,0.9764,1.756085,0.067157,1.862173,0.800132,1.438742
40,0.9339,1.721932,0.102391,1.826667,0.80674,1.405487
50,0.9656,1.679809,0.144777,1.783017,0.802119,1.364593
60,0.9334,1.629306,0.194687,1.730207,0.790197,1.315805
70,0.87,1.571227,0.251538,1.668018,0.77752,1.258188
80,0.8888,1.506431,0.314169,1.596704,0.763384,1.196858
90,0.8441,1.440223,0.377352,1.521378,0.749267,1.136276
100,0.8096,1.383147,0.432431,1.452529,0.742137,1.098221


VBox(children=(Label(value='13.095 MB of 13.123 MB uploaded\r'), FloatProgress(value=0.9978404637901241, max=1…

0,1
eval/loss,██▇▆▅▄▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/mae,██▇▅▅▄▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/pearson,▁▅▅▄▄▅▆▆▇▇▇▇▇▇██████████████████████████
eval/r2,▁▂▃▄▅▆▇▇▇▇▇█████████████████████████████
eval/rmse,██▇▆▅▄▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,▁▂▄▁▃▃▄▄▂▃▃▄▃▅▄▄▅▄▅▅▅▆▄▆▆▄▅▅▇▆▆▅██▅▅▇▆▆▇
eval/samples_per_second,█▇▅█▆▅▅▅▇▅▆▅▆▄▅▅▄▅▄▄▄▃▅▃▃▅▄▄▂▃▃▄▁▁▄▃▁▃▃▂
eval/steps_per_second,█▇▅█▆▅▅▅▇▅▆▅▆▄▅▅▄▅▄▄▄▃▅▃▃▅▄▄▂▃▃▄▁▁▄▃▁▃▃▂
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███

0,1
eval/loss,0.64203
eval/mae,0.52622
eval/pearson,0.94545
eval/r2,0.88171
eval/rmse,0.6631
eval/runtime,0.4124
eval/samples_per_second,274.005
eval/steps_per_second,36.372
total_flos,622852287283200.0
train/epoch,100.0


[34m[1mwandb[0m: Agent Starting Run: p8pscdpt with config:
[34m[1mwandb[0m: 	learning_rate: 5.844055219318087e-06
[34m[1mwandb[0m: 	per_device_train_batch_size: 16
[34m[1mwandb[0m: 	weight_decay: 0.0005


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MTR and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,R2,Rmse,Pearson,Mae
10,0.951,1.849113,-0.031113,1.957803,-0.035298,1.537959
20,0.98,1.842063,-0.023579,1.950637,0.075132,1.530846
30,0.9934,1.833941,-0.015048,1.942491,0.190982,1.522527
40,1.0035,1.826876,-0.007483,1.935239,0.293951,1.515675
50,0.9457,1.819833,-4.8e-05,1.928085,0.379456,1.508858
60,1.004,1.811969,0.008091,1.920223,0.457898,1.500914
70,0.9064,1.804373,0.01603,1.912523,0.516939,1.493359
80,0.9766,1.795989,0.024589,1.904187,0.563618,1.484655
90,1.0235,1.788552,0.032326,1.896621,0.6007,1.477343
100,0.9636,1.780867,0.040321,1.888769,0.632472,1.469976


VBox(children=(Label(value='13.095 MB of 13.095 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/loss,██▇▆▅▄▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/mae,██▇▅▅▄▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/pearson,▁▅▆▆▆▆▇▇▇▇▇▇████████████████████████████
eval/r2,▁▂▃▄▅▆▇▇▇▇▇▇▇███████████████████████████
eval/rmse,██▇▆▅▄▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,▁▂▂▂▃▂▂▂▂▂▂▄▃▃▄▄▃▄▅▄▃▄▃▄▄▄▄▆▅▅▅▆▆▆▆▇▆█▇█
eval/samples_per_second,█▇▇▇▆▇▇▆▆▆▆▅▆▆▅▅▅▄▄▅▆▄▅▅▄▅▅▃▄▄▃▃▃▃▃▂▃▁▂▁
eval/steps_per_second,█▇▇▇▆▇▇▆▆▆▆▅▆▆▅▅▅▄▄▅▆▄▅▅▄▅▅▃▄▄▃▃▃▃▃▂▃▁▂▁
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███

0,1
eval/loss,0.73113
eval/mae,0.60152
eval/pearson,0.92733
eval/r2,0.84673
eval/rmse,0.75483
eval/runtime,0.4746
eval/samples_per_second,238.112
eval/steps_per_second,31.608
total_flos,622852287283200.0
train/epoch,100.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: sys24zpd with config:
[34m[1mwandb[0m: 	learning_rate: 2.72291641531306e-05
[34m[1mwandb[0m: 	per_device_train_batch_size: 32
[34m[1mwandb[0m: 	weight_decay: 0


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MTR and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,R2,Rmse,Pearson,Mae
10,0.9663,1.804691,0.015293,1.913239,0.59456,1.485252
20,0.9914,1.7622,0.05988,1.869422,0.754934,1.44455
30,0.9318,1.71533,0.107831,1.821123,0.788553,1.398714
40,0.9201,1.658328,0.165015,1.761794,0.790101,1.345216
50,0.9213,1.592857,0.229917,1.691938,0.787453,1.284271
60,0.8373,1.507053,0.31082,1.600597,0.775713,1.197876
70,0.7823,1.421928,0.391295,1.504248,0.765585,1.122297
80,0.7176,1.341675,0.465689,1.40933,0.759861,1.06711
90,0.7574,1.273453,0.526835,1.326239,0.763985,1.021665
100,0.666,1.199764,0.585529,1.24126,0.785384,0.959946


VBox(children=(Label(value='13.095 MB of 13.095 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/loss,█▆▄▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/mae,█▆▄▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/pearson,▁▂▄▆▆▇▇▇████████████████████████████████
eval/r2,▁▃▆▇▇▇██████████████████████████████████
eval/rmse,█▆▄▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,▁▂▂▁▅▅▂▃▁▃▂▁▂▃▃▃▁▃▄▃▃▃▄▃▅▄▄▄▄▅▇▇▄▆▆▅▆▆█▇
eval/samples_per_second,█▇▇█▄▄▆▆█▆▆█▇▆▆▅█▆▅▆▆▆▅▅▃▄▅▄▅▄▁▂▅▃▃▃▃▃▁▂
eval/steps_per_second,█▇▇█▄▄▆▆█▆▆█▇▆▆▅█▆▅▆▆▆▅▅▃▄▅▄▅▄▁▂▅▃▃▃▃▃▁▂
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███

0,1
eval/loss,0.6264
eval/mae,0.50172
eval/pearson,0.94626
eval/r2,0.88075
eval/rmse,0.66581
eval/runtime,0.4265
eval/samples_per_second,264.975
eval/steps_per_second,35.174
total_flos,622852287283200.0
train/epoch,100.0
