# Fine-tuning ChemBERTa on SMILES Data for Human Acetylcholinesterase

In [3]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [4]:
import pandas as pd
df_2class = pd.read_csv("/content/gdrive/My Drive/Colab Notebooks/Bioinformatics/Acetylcholinesterase/acetylcholinesterase_bioactivity_data_2class_pIC50.csv")

In [5]:
columnNames = ["canonical_smiles","bioactivity_class"]
smiles_2class = pd.DataFrame(data =df_2class, columns=columnNames)
smiles_2class

Unnamed: 0,canonical_smiles,bioactivity_class
0,CCOc1nn(-c2cccc(OCc3ccccc3)c2)c(=O)o1,active
1,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC1CC1,active
2,CN(C(=O)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F)c1ccccc1,inactive
3,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F,active
4,CSc1nc(-c2ccc(OC(F)(F)F)cc2)nn1C(=O)N(C)C,active
...,...,...
2723,c1ccc2oc(CN(CCCCCCCNc3c4c(nc5ccccc35)CCCCC4)Cc...,inactive
2724,O=C(NCCCCCCCNc1c2c(nc3ccccc13)CCC2)c1cc2ccccc2o1,active
2725,COc1cccc2cc(C(=O)NCCCCCCNc3c4c(nc5ccccc35)CCCC...,inactive
2726,O=C(NCCCCCCNc1c2c(nc3ccccc13)CCC2)c1cc2ccccc2o1,inactive


In [6]:
columnNames = ["canonical_smiles","bioactivity_class"]
smiles_2class = pd.DataFrame(data =df_2class, columns=columnNames)
smiles_2class

Unnamed: 0,canonical_smiles,bioactivity_class
0,CCOc1nn(-c2cccc(OCc3ccccc3)c2)c(=O)o1,active
1,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC1CC1,active
2,CN(C(=O)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F)c1ccccc1,inactive
3,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F,active
4,CSc1nc(-c2ccc(OC(F)(F)F)cc2)nn1C(=O)N(C)C,active
...,...,...
2723,c1ccc2oc(CN(CCCCCCCNc3c4c(nc5ccccc35)CCCCC4)Cc...,inactive
2724,O=C(NCCCCCCCNc1c2c(nc3ccccc13)CCC2)c1cc2ccccc2o1,active
2725,COc1cccc2cc(C(=O)NCCCCCCNc3c4c(nc5ccccc35)CCCC...,inactive
2726,O=C(NCCCCCCNc1c2c(nc3ccccc13)CCC2)c1cc2ccccc2o1,inactive


## Integer Encoding on Bioactivity Class

In [7]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
smiles_2class['bioactivity_code'] = le.fit_transform(smiles_2class['bioactivity_class'])
smiles_2class

Unnamed: 0,canonical_smiles,bioactivity_class,bioactivity_code
0,CCOc1nn(-c2cccc(OCc3ccccc3)c2)c(=O)o1,active,0
1,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC1CC1,active,0
2,CN(C(=O)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F)c1ccccc1,inactive,1
3,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F,active,0
4,CSc1nc(-c2ccc(OC(F)(F)F)cc2)nn1C(=O)N(C)C,active,0
...,...,...,...
2723,c1ccc2oc(CN(CCCCCCCNc3c4c(nc5ccccc35)CCCCC4)Cc...,inactive,1
2724,O=C(NCCCCCCCNc1c2c(nc3ccccc13)CCC2)c1cc2ccccc2o1,active,0
2725,COc1cccc2cc(C(=O)NCCCCCCNc3c4c(nc5ccccc35)CCCC...,inactive,1
2726,O=C(NCCCCCCNc1c2c(nc3ccccc13)CCC2)c1cc2ccccc2o1,inactive,1


## Split Dataset

In [8]:
from sklearn.model_selection import train_test_split

train_data_2class, test_data_2class, train_labels_2class, test_labels_2class = train_test_split(smiles_2class['canonical_smiles'], smiles_2class['bioactivity_code'], test_size=0.2, random_state=42)
train_data_2class, val_data_2class, train_labels_2class, val_labels_2class = train_test_split(train_data_2class, train_labels_2class, test_size=0.2, random_state=42)

In [9]:
train_2class = pd.concat([train_data_2class, train_labels_2class], axis=1)
test_2class = pd.concat([test_data_2class, test_labels_2class], axis=1)
val_2class = pd.concat([val_data_2class, val_labels_2class], axis=1)

In [10]:
print("Train Dataset: {}".format(train_2class.shape))
print("Eval Dataset: {}".format(val_2class.shape))
print("TEST Dataset: {}".format(test_2class.shape))

Train Dataset: (1745, 2)
Eval Dataset: (437, 2)
TEST Dataset: (546, 2)


## Import libraries

In [None]:
!pip install --pre deepchem
import deepchem
from rdkit import Chem

In [None]:
!pip install transformers
!pip install simpletransformers
!pip install wandb

In [13]:
from simpletransformers.classification import ClassificationModel
import logging
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

## Build Classifucation Model with ChemBERTa

In [14]:
# Check avaibility CUDA
import torch

print(torch.cuda.is_available())

True


In [15]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs

model = ClassificationModel('roberta', 'seyonec/PubChem10M_SMILES_BPE_396_250', args={'evaluate_each_epoch': True, 'evaluate_during_training_verbose': True, 'no_save': True, 'num_train_epochs': 10, 'auto_weights': True}) 

Downloading (…)lve/main/config.json:   0%|          | 0.00/515 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/336M [00:00<?, ?B/s]

Some weights of the model checkpoint at seyonec/PubChem10M_SMILES_BPE_396_250 were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at seyonec/PubChem10M_SMILES_BPE

Downloading (…)okenizer_config.json:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/165k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/101k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [16]:
print(model.tokenizer)

RobertaTokenizerFast(name_or_path='seyonec/PubChem10M_SMILES_BPE_396_250', vocab_size=7924, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'sep_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'cls_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True)}, clean_up_tokenization_spaces=True)


In [17]:
import wandb
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

## Train the Model

In [22]:
#Train the model
model.train_model(train_2class, eval_df=val_2class, output_dir='/content/PubChem_acetylcholinesterase_2class', args={'wandb_project': 'Drug Discovery - Acetylcholinesterase'})

  0%|          | 0/1745 [00:00<?, ?it/s]

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

VBox(children=(Label(value='0.166 MB of 0.166 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Training loss,▆▅█▆▆▅▄▃▆▆▅▄▄▃▃▅▆▄▂▂▃▃▂▂▆▂▂▅▁▁▂▁▂▁▂▁▂▁▁▂
global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇██
lr,▄▆███▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▁▁

0,1
Training loss,0.08493
global_step,2150.0
lr,0.0


Running Epoch 0 of 10:   0%|          | 0/219 [00:00<?, ?it/s]

Running Epoch 1 of 10:   0%|          | 0/219 [00:00<?, ?it/s]

Running Epoch 2 of 10:   0%|          | 0/219 [00:00<?, ?it/s]

Running Epoch 3 of 10:   0%|          | 0/219 [00:00<?, ?it/s]

Running Epoch 4 of 10:   0%|          | 0/219 [00:00<?, ?it/s]

Running Epoch 5 of 10:   0%|          | 0/219 [00:00<?, ?it/s]

Running Epoch 6 of 10:   0%|          | 0/219 [00:00<?, ?it/s]

Running Epoch 7 of 10:   0%|          | 0/219 [00:00<?, ?it/s]

Running Epoch 8 of 10:   0%|          | 0/219 [00:00<?, ?it/s]

Running Epoch 9 of 10:   0%|          | 0/219 [00:00<?, ?it/s]

(2190, 0.1394020786432371)

In [23]:
import sklearn
result, model_outputs, wrong_predictions = model.eval_model(test_2class, acc=sklearn.metrics.accuracy_score)



  0%|          | 0/546 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/69 [00:00<?, ?it/s]

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Training loss,▁▁▃▁▁▃█▃▁▂▄▃▁▁▁▁▁▁▄▁▁▁▃▁▁▁▂▃▃▁▁▁▁▁▁▁▁▁▁▁
global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇██
lr,▄▆███▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▁▁

0,1
Training loss,6e-05
global_step,2150.0
lr,0.0




In [24]:
result

{'mcc': 0.4029052249036781,
 'tp': 168,
 'tn': 216,
 'fp': 88,
 'fn': 74,
 'auroc': 0.7629947803392778,
 'auprc': 0.7053420328643043,
 'acc': 0.7032967032967034,
 'eval_loss': 2.26989429562852}