In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
df = pd.read_excel('Raw Data.xlsx')

In [2]:
df.shape

(350, 6)

In [3]:
df.dropna(inplace=True)

In [4]:
df.shape

(349, 6)

In [5]:
df = df[['Heavy Chain Sequences','Light Chain Sequences','Payload SMILES','Linker SMILES']]

In [6]:
df.columns = ['heavy_chain','light_chain','payload_smiles','linker_smiles']

In [7]:
df_train,df_val = train_test_split(df,test_size=0.20, random_state=42)

In [8]:
df_train.shape

(279, 4)

df_val.shape

# Finetune Seq2seq model

In [9]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import load_dataset, Dataset
from transformers import EarlyStoppingCallback
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Running on {device}')
# Load a pre-trained Seq2Seq model like T5 or BART
model_name = 'google/flan-t5-base'#"t5-small"  # You can choose "facebook/bart-large" or any other Seq2Seq model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

model.to(device)

def preprocess_function(examples):
    inputs = []
    
    for heavy_chain, light_chain, payload_smiles in zip(examples['heavy_chain'], examples['light_chain'], examples['payload_smiles']):
        input_str = heavy_chain + " " + light_chain + " " + payload_smiles
        inputs.append(input_str)
    
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['linker_smiles'], max_length=512, truncation=True, padding="max_length")

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

train_dataset = Dataset.from_dict(df_train)
val_dataset = Dataset.from_dict(df_val)

tokenized_dataset_train = train_dataset.map(preprocess_function, batched=True)
tokenized_dataset_val = val_dataset.map(preprocess_function, batched=True)

from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=3, # reduce if getting out of memory error
    per_device_eval_batch_size=3, # reduce if getting out of memory error
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5, # epochs
    logging_steps=1,
    predict_with_generate=True,
    load_best_model_at_end=True, 
    metric_for_best_model="eval_loss", 
    greater_is_better=False  
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_val,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=10)] 
)


trainer.train()

2024-10-12 06:15:39.808769: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


Running on cuda




Map:   0%|          | 0/279 [00:00<?, ? examples/s]



Map:   0%|          | 0/70 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,0.3206,0.252251
2,0.1541,0.128242
3,0.0908,0.100043
4,0.1686,0.0911
5,0.1852,0.087313


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


TrainOutput(global_step=465, training_loss=1.454674028100506, metrics={'train_runtime': 185.2474, 'train_samples_per_second': 7.53, 'train_steps_per_second': 2.51, 'total_flos': 955236506664960.0, 'train_loss': 1.454674028100506, 'epoch': 5.0})

In [10]:
row_num = 12
heavy = df['heavy_chain'][row_num]
light = df['light_chain'][row_num]
payload = df['payload_smiles'][row_num]

In [12]:
#Generate predictions (Linker SMILES) using the fine-tuned model
test_inputs = f"{heavy} {light} {payload}"

# Tokenize the input
test_inputs_tokenized = tokenizer(test_inputs, return_tensors="pt", max_length=512, truncation=True)

input_tensor = test_inputs_tokenized.to(device)  # Move inputs to the correct device

# Generate the Linker SMILES
outputs = model.generate(test_inputs_tokenized['input_ids'], max_length=50)

# Decode the result
predicted_smiles = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f"Predicted Linker SMILES: {predicted_smiles}")

Predicted Linker SMILES: C1CC(=O)N(C1=O)NC(=O)CCCCCN2C(=O)C=CC2=O
