# QSAR/QSPR Models Fine-Tuning 1: Classification 

This notebook is an example of a classification task on the BBBP dataset.

In [184]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import time
import tqdm
import sqlite3

from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole

from SmilesPE.pretokenizer import atomwise_tokenizer
from SmilesPE.pretokenizer import kmer_tokenizer
from SmilesPE.spe2vec import Corpus

import pandas as pd

from multiprocessing import Pool

from fastai import *
from fastai.text import *
from utils import *
import torch
print(torch.__version__)

# Pin the default CUDA device only when a GPU is actually present, so the
# notebook can still run on CPU-only machines (the `device` selection in the
# next cell already falls back to "cpu"). Change the index to pick another GPU
# on a multi-GPU machine.
if torch.cuda.is_available():
    torch.cuda.set_device(0)

# Number of worker processes used by the SMILES augmentation pool.
Number_of_workers = 8

1.3.1


In [185]:
# Pick the compute device: the GPU when CUDA is available, otherwise the CPU.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if cuda_available else "cpu")
device, cuda_available

(device(type='cuda'), True)

In [186]:
# Working directory for this experiment: results/classification_new
# (created on first run, reused afterwards).
name = 'classification_new'
data_path = Path('results')
path = data_path / name
path.mkdir(parents=True, exist_ok=True)

In [187]:
# Keep cell outputs readable: raise the RDKit logger threshold to CRITICAL and
# ignore all Python warnings for the rest of the session.

from rdkit import RDLogger
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)

warnings.filterwarnings('ignore')

## Load Data

In [188]:
# Read the space-separated file (no header row; columns are named here), then
# shuffle the rows and renumber the index from 0.
quantmap_raw = pd.read_csv('data/ML_input_5338_out.txt', sep=" ", names=["Smiles", "Label"])
quantmap_data = quantmap_raw.sample(frac=1).reset_index(drop=True)
print('Dataset:', quantmap_data.shape)
quantmap_data.head(5)

Dataset: (4982, 2)


Unnamed: 0,Smiles,Label
0,CCOP(=S)(CC)SC1=CC=CC=C1,2
1,CC[N+]1=C(C(=C(C(=C1C)C(=O)OC(C)C)C2=CC=CC=C2C...,2
2,C1=CC(=C(C=C1C(CNCCCCCCNCC(C2=CC(=C(C=C2)O)O)O...,1
3,C1=CC=C2C(=C1)C=CC=C2N=C=S,2
4,CC1=NC=C(C(=C1O)CN)CO,0


In [189]:
quantmap_data.groupby('Label').count()

Unnamed: 0_level_0,Smiles
Label,Unnamed: 1_level_1
0,714
1,1887
2,1894
3,259
4,228


In [190]:
# Drop the rows labelled 0, 1 or 2 (per the counts above this leaves labels 3
# and 4), shuffle, and remap the remaining labels to a binary 0/1 target.
#
# A single boolean mask replaces the chained `df[m1][m2][m3]` indexing, which
# makes pandas re-align each mask against an already-filtered frame (and warn
# about it). The remapped column is assigned back explicitly rather than via an
# in-place `replace` on a column selection, which is a chained assignment.
balanced_data = quantmap_data[~quantmap_data['Label'].isin([0, 1, 2])]
balanced_data = balanced_data.sample(frac=1).reset_index(drop=True)
balanced_data['Label'] = balanced_data['Label'].replace({3: 0, 4: 1})
balanced_data.groupby('Label').count()

Unnamed: 0_level_0,Smiles
Label,Unnamed: 1_level_1
0,259
1,228


In [191]:
# Sequential split of the already-shuffled frame: the first 80% of the rows go
# to train, the remaining 20% are divided evenly between valid and test.
train_percentage = 0.8
valid_test_percentage = (1 - train_percentage)/2

data_to_use = balanced_data
n_total = len(data_to_use)
n_holdout = int(n_total * valid_test_percentage)

# Despite the "_ratio" names, these are cumulative row offsets (slice ends).
train_ratio = int(n_total * train_percentage)
valid_ratio = train_ratio + n_holdout
test_ratio = valid_ratio + n_holdout

# NOTE: every size is truncated with int(), so a few trailing rows may end up
# in none of the three subsets.
train = data_to_use.iloc[:train_ratio, :]
valid = data_to_use.iloc[train_ratio:valid_ratio, :]
test = data_to_use.iloc[valid_ratio:test_ratio, :]

# Per-class row counts of each subset, printed in train / valid / test order.
split_counts = [subset.groupby('Label').count() for subset in (train, valid, test)]
print('Positive Sample:\n', *split_counts)

Positive Sample:
        Smiles
Label        
0         196
1         193        Smiles
Label        
0          31
1          17        Smiles
Label        
0          32
1          16


## Data Augmentation

In [192]:
from functools import partial

def randomize_smiles(smiles, random_smiles=None, iteration=5):
    """Return a randomized (non-canonical) SMILES for `smiles`.

    The molecule is parsed, its atoms are renumbered in a random order drawn
    from NumPy's global RNG, and a SMILES string is written with
    ``canonical=False``. The attempt is repeated until the result is not
    already contained in `random_smiles`.

    Parameters
    ----------
    smiles : str
        Input SMILES string.
    random_smiles : collection of str, optional
        SMILES that must not be returned again. It is only read, never
        modified. ``None`` (the default) means "nothing to avoid".
    iteration : int
        Maximum number of attempts. Values below 1 still allow one attempt.

    Returns
    -------
    str or False
        A randomized SMILES not present in `random_smiles`, or ``False`` when
        the molecule cannot be processed or every attempt produced a duplicate.

    Notes
    -----
    Retries run in a loop rather than by recursion, so large budgets (this
    notebook uses 10000) cannot exhaust Python's recursion limit.
    """
    seen = random_smiles if random_smiles is not None else ()

    try:
        # Parse once; an unparsable SMILES yields None and fails on the
        # attribute access below, which is reported as False.
        mol = Chem.MolFromSmiles(smiles)
        n_atoms = mol.GetNumAtoms()
    except Exception:
        return False

    for _ in range(max(iteration, 1)):
        try:
            # Start every attempt from the identity order so that the random
            # draws map to permutations exactly as a fresh call would.
            atom_order = list(range(n_atoms))
            np.random.shuffle(atom_order)
            shuffled = Chem.RenumberAtoms(mol, atom_order)
            out_smiles = Chem.MolToSmiles(shuffled, canonical=False, isomericSmiles=True, kekuleSmiles=False)
        except Exception:
            return False

        if out_smiles not in seen:
            return out_smiles

    return False
    
def augment_smiles(count,iteration,smiles):
    """Collect up to `count` distinct randomized SMILES for one molecule.

    `iteration` is the retry budget handed to `randomize_smiles` for each new
    string. Generation stops early as soon as `randomize_smiles` reports a
    failure. A `smiles` of None yields an empty list.
    """
    generated = []
    if smiles is None:
        return generated

    for _ in range(count):
        candidate = randomize_smiles(smiles, generated, iteration=iteration)
        if not candidate:
            # No further unseen variant could be produced.
            break
        generated.append(candidate)

    return generated

def unpack_and_write_list(smiles,label,filename):
    """Write every string in a (possibly nested) list as a "<smiles>,<label>" line.

    `filename` is an already opened, writable file-like object (not a path).
    Nested lists are flattened depth-first, so the output order follows the
    order of the input.
    """
    line_tail = "," + str(label) + "\n"
    for item in smiles:
        if type(item) is not list:
            filename.write(item + line_tail)
            continue
        # Nested list: recurse with the same label and file handle.
        unpack_and_write_list(item, label, filename)
    
def smiles_augmentation(df, N_rounds=1,iteration=5,data_set_type="train"):
    """Augment the SMILES in `df` class by class and write them to a CSV file.

    For every distinct value of the 'Label' column, each molecule of that class
    is expanded with `augment_smiles` (run in a process pool of
    `Number_of_workers` workers). The randomized SMILES are written first,
    followed by the original ones, to
    ``data/classification/<data_set_type>_aug_canonical_smiles.csv`` under a
    ``Smiles,Label`` header.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Smiles' and 'Label' columns.
    N_rounds : int or list/tuple of int
        Randomized SMILES requested per molecule. A single number applies to
        every class; a sequence provides one value per class, matched to the
        labels in the order `df.groupby('Label')` yields them.
    iteration : int
        Retry budget forwarded to `augment_smiles`.
    data_set_type : str
        Prefix of the output file name (e.g. "train", "valid", "test").

    Returns
    -------
    str
        Path of the CSV file that was written.

    Raises
    ------
    AssertionError
        If `N_rounds` is a sequence whose length differs from the number of
        class labels.
    """
    out_dir = "data/classification/"
    # Creates missing parent directories too and tolerates an existing
    # directory; real failures (e.g. permissions) are no longer swallowed.
    os.makedirs(out_dir, exist_ok=True)
    filename = out_dir + str(data_set_type) + "_aug_canonical_smiles.csv"

    labels = [label for label, _ in df.groupby('Label')]

    if isinstance(N_rounds, (list, tuple)):
        assert len(N_rounds) == len(labels), "N_rounds needs exactly one entry per class label"
        augmentation_list = list(N_rounds)
    else:
        augmentation_list = [N_rounds] * len(labels)

    # The context managers guarantee that the output file and the worker pool
    # are released even if augmentation fails part-way through.
    with open(filename, "w") as aug_out:
        aug_out.write("Smiles,Label\n")

        for label, augmentation in zip(labels, augmentation_list):
            canonical_smiles = df[df['Label'] == label]['Smiles'].to_list()
            func = partial(augment_smiles, augmentation, iteration)

            with Pool(Number_of_workers) as p:
                # The results are fully materialised before the pool is torn down.
                augmented_smiles = list(tqdm.tqdm(p.imap(func, canonical_smiles), total=len(canonical_smiles)))

            print ("Saving data for label = " + str(label))

            unpack_and_write_list(augmented_smiles, label, filename=aug_out)
            unpack_and_write_list(canonical_smiles, label, filename=aug_out)

            print ("Saved data for label = " + str(label))

    return filename


In [193]:
# Per-class molecule counts of the training split (one row per label).
label_count_df = train.groupby('Label').count()
# Select the first count column explicitly by position. `row[0]` on a
# label-indexed Series relies on pandas' deprecated positional fallback.
label_count_list = label_count_df.iloc[:, 0].tolist()

# Rounds of augmentation per class: how many times each class fits into the
# largest class (rounded down), so that smaller classes get more rounds.
max_value = max(label_count_list)
augmentation_list = [int(max_value/entry) for entry in label_count_list]

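As shown above, the dataset is not perfectly balanced. For the training data, the number of randomized SMILES generated per molecule is set per class: the cell above computes `augmentation_list` as the size of the largest class divided by the size of each class (rounded down), and the cell below multiplies it by `number_of_augmentation`. With the class counts shown above, this yields one randomized SMILES per molecule for both classes (rather than fixed values such as 10 and 30 for the positive and negative classes). The numbers can be changed for different datasets.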

In [194]:
number_of_augmentation = 1

###
# Scale the per-class rounds into a separate list instead of overwriting
# `augmentation_list`, so that re-running this cell cannot compound the
# multiplier.
rounds_per_class = [entry*number_of_augmentation for entry in augmentation_list]
# Retry budget per randomized SMILES.
iteration = 10000
# Augmentation for training data
train_data = smiles_augmentation(train,N_rounds=rounds_per_class,iteration=iteration,data_set_type="train")

# Augmentation for validation data
val_data = smiles_augmentation(valid,N_rounds=rounds_per_class,iteration=iteration,data_set_type="valid")

# Augmentation for test data
test_data = smiles_augmentation(test,N_rounds=rounds_per_class,iteration=iteration,data_set_type="test")

100%|██████████| 196/196 [00:00<00:00, 5741.97it/s]

Saving data for label = 0
Saved data for label = 0



100%|██████████| 193/193 [00:00<00:00, 10728.40it/s]

Saving data for label = 1
Saved data for label = 1



100%|██████████| 31/31 [00:00<00:00, 4183.78it/s]

Saving data for label = 0
Saved data for label = 0



100%|██████████| 17/17 [00:00<00:00, 2835.79it/s]

Saving data for label = 1
Saved data for label = 1



100%|██████████| 32/32 [00:00<00:00, 5561.35it/s]

Saving data for label = 0
Saved data for label = 0



100%|██████████| 16/16 [00:00<00:00, 1555.03it/s]

Saving data for label = 1
Saved data for label = 1





In [195]:
def _read_shuffled_split(split_name):
    """Load one augmented split CSV and return its rows in random order."""
    csv_path = "data/classification/" + split_name + "_aug_canonical_smiles.csv"
    frame = pd.read_csv(csv_path, header=0)
    return frame.sample(frac=1).reset_index(drop=True)

train_aug = _read_shuffled_split("train")
valid_aug = _read_shuffled_split("valid")
test_aug = _read_shuffled_split("test")

In [196]:
train_aug.groupby('Label').count(),valid_aug.groupby('Label').count(),test_aug.groupby('Label').count()

(       Smiles
 Label        
 0         392
 1         386,
        Smiles
 Label        
 0          62
 1          34,
        Smiles
 Label        
 0          64
 1          32)

## Adapt the Encoder of the MSPM to the Target Dataset

In order to fine-tune the pre-trained MSPM on the QSAR datasets of interest, we need to prepare the data:

- Tokenize the SMILES of the QSAR dataset.
- Align the token IDs of the QSAR dataset to the token IDs of the pre-trained MSPM.

Often, the vocab size of the QSAR dataset is different from that of the pre-trained structure prediction model, which means the QSAR model will have a different input size from that of the pre-trained model. Here, we need to change the input size of the pre-trained model to the vocab size of the QSAR dataset.

In [197]:
class MolTokenizer(BaseTokenizer):
    """Atom-level SMILES tokenizer exposing the `BaseTokenizer` interface."""

    def __init__(self, lang = 'en'):
        # Only the language tag is stored; no other state is kept.
        self.lang = lang

    def tokenizer(self, smiles):
        """Prefix `smiles` with the "[BOS]" marker and split it atom-wise."""
        return atomwise_tokenizer("[BOS]" + smiles)

    def add_special_cases(self, toks):
        """Accept special-case tokens but do nothing with them."""
        return None

In [198]:
# Batch size passed to every databunch built in this notebook.
bs = 64
# Tokenizer built around MolTokenizer with empty pre- and post-processing rule
# lists and n_cpus=8.
tok = Tokenizer(partial(MolTokenizer), n_cpus=8, pre_rules=[], post_rules=[])

In [199]:
# Language-model databunch over the augmented train/validation frames. The
# text and label columns are selected by position (0 = Smiles, 1 = Label).
# Its vocabulary (`qsar_vocab.vocab`) is reused by the classification
# databunches further down.
qsar_vocab = TextLMDataBunch.from_df(path, train_aug, valid_aug, bs=bs, tokenizer=tok, 
                              chunksize=50000, text_cols=0,label_cols=1, max_vocab=60000, include_bos=False)

# Number of distinct tokens in the QSAR vocabulary.
print(f'Vocab Size: {len(qsar_vocab.vocab.itos)}')

Vocab Size: 48


In [200]:
# Location of the pre-trained model artefacts: the weights file (.pth) and the
# vocabulary file (.pkl), in that order.
pretrained_model_path = Path('results/pretraining_new/models')

pretrained_fnames = ['pretraining_new_wt', 'pretraining_new_vocab']
weights_name, vocab_name = pretrained_fnames
fnames = [
    pretrained_model_path/f'{weights_name}.pth',
    pretrained_model_path/f'{vocab_name}.pkl',
]

In [201]:
# Build a language-model learner on the QSAR vocabulary, load the pre-trained
# weights/vocab files listed in `fnames`, freeze it, and save its encoder under
# the name 'lm_encoder' so that the classifier can load it later.
lm_learner = language_model_learner(qsar_vocab, AWD_LSTM, drop_mult=1.0)
lm_learner = lm_learner.load_pretrained(*fnames)
lm_learner.freeze()
lm_learner.save_encoder(f'lm_encoder')

## Databunch for QSAR Modeling

You need to change `text_cols` and `label_cols` based on your dataset.

In [202]:
# Classification databunch over the augmented train/validation frames. It
# reuses the vocabulary of `qsar_vocab` so that token IDs match the encoder
# saved from the language model.
data_clas = TextClasDataBunch.from_df(path, train_aug, valid_aug, bs=bs, tokenizer=tok, 
                                          chunksize=50000, text_cols='Smiles',label_cols='Label', 
                                          vocab=qsar_vocab.vocab, max_vocab=60000, include_bos=False)

In [203]:
data_clas.show_batch()

text,target
[BOS] C [N+] 1 = C C = C ( C = C 1 ) C 2 = C 3 C = C C ( = C ( C 4 = C C = C ( N 4 ) C ( = C 5 C = C C ( = N 5 ) C ( = C 6 C = C C 2 = N 6 ) C 7 =,1
[BOS] C 1 = N C 2 = C ( N 1 C 3 C ( C ( C ( O 3 ) C O P ( = O ) ( O ) O P ( = O ) ( O ) O P ( = O ) ( O ) O P ( = O ) ( O ) O C C 4 C ( C ( C (,1
[BOS] N c 1 [nH] c 2 n ( C 3 O C ( C O P ( O ) ( = O ) O P ( O ) ( = O ) O P ( O ) ( O P ( = O ) ( O ) O C C 4 C ( O ) C ( O ) C ( n 5 c 6 c ( n c,1
[BOS] C C C 1 C = C ( C ( C ( C C ( C 2 C ( C C ( C ( O 2 ) ( C ( = O ) C ( = O ) N 3 C C C C C 3 C ( = O ) O C ( C ( C ( C C 1 = O ) O ) C ) C,0
[BOS] O C 1 C ( C ) = C C ( C C ) C ( = O ) C C ( O ) C ( C ) C ( C ( = C C 2 C C C ( O ) C ( O C ) C 2 ) C ) O C ( = O ) C 2 N ( C ( = O ) C (,0


In [204]:
class AUROC(Callback):
    """Callback that appends an 'AUROC' metric computed on the non-training batches of each epoch."""
    _order = -20 #Needs to run before the recorder

    def __init__(self, learn, **kwargs):
        self.learn = learn

    def on_train_begin(self, **kwargs):
        # Register the extra metric column with the learner's recorder.
        self.learn.recorder.add_metric_names(['AUROC'])

    def on_epoch_begin(self, **kwargs):
        # Start every epoch with empty accumulators.
        self.output, self.target = [], []

    def on_batch_end(self, last_target, last_output, train, **kwargs):
        # Only batches seen outside training mode contribute to the metric.
        if train:
            return
        self.output.append(last_output)
        self.target.append(last_target)

    def on_epoch_end(self, last_metrics, **kwargs):
        # Nothing was collected this epoch: report no extra metric.
        if len(self.output) == 0:
            return None
        probabilities = F.softmax(torch.cat(self.output), dim=1)
        labels = torch.cat(self.target)
        score = auroc_score(probabilities, labels)
        return add_metrics(last_metrics, [score])

## Fine-tuning

In [205]:
# Classifier learner on the classification databunch. It is created without
# fastai's own pretrained weights (pretrained=False), loads the encoder saved
# as 'lm_encoder', and is frozen before the first training stage.
cls_learner = text_classifier_learner(data_clas, AWD_LSTM, pretrained=False, drop_mult=0.2)
cls_learner.load_encoder(f'lm_encoder')
cls_learner.freeze()

In [206]:
cls_learner.model

SequentialRNN(
  (0): MultiBatchEncoder(
    (module): AWD_LSTM(
      (encoder): Embedding(48, 400, padding_idx=1)
      (encoder_dp): EmbeddingDropout(
        (emb): Embedding(48, 400, padding_idx=1)
      )
      (rnns): ModuleList(
        (0): WeightDropout(
          (module): LSTM(400, 1152, batch_first=True)
        )
        (1): WeightDropout(
          (module): LSTM(1152, 1152, batch_first=True)
        )
        (2): WeightDropout(
          (module): LSTM(1152, 400, batch_first=True)
        )
      )
      (input_dp): RNNDropout()
      (hidden_dps): ModuleList(
        (0): RNNDropout()
        (1): RNNDropout()
        (2): RNNDropout()
      )
    )
  )
  (1): PoolingLinearClassifier(
    (layers): Sequential(
      (0): BatchNorm1d(1200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (1): Dropout(p=0.08000000000000002, inplace=False)
      (2): Linear(in_features=1200, out_features=50, bias=True)
      (3): ReLU(inplace=True)
      (4): BatchN

In [207]:
# Stage 1: four epochs at a learning rate of 3e-3 with the learner still frozen
# (see the freeze() call above).
cls_learner.fit_one_cycle(4, 3e-3, moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,0.650228,0.681211,0.572917,00:02
1,0.482701,0.645533,0.760417,00:02
2,0.380646,0.628124,0.6875,00:02
3,0.325025,0.582642,0.75,00:02


In [208]:
# Stage 2: move the freeze boundary to -2 and train four epochs with learning
# rates spread from 5e-3/2.6**4 up to 5e-3.
cls_learner.freeze_to(-2)
cls_learner.fit_one_cycle(4, slice(5e-3/(2.6**4),5e-3), moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,0.238771,0.578381,0.708333,00:02
1,0.266326,0.382529,0.8125,00:02
2,0.217244,0.432686,0.78125,00:02
3,0.172607,0.415987,0.770833,00:02


In [209]:
# Stage 3: move the freeze boundary to -3 and train four epochs with a ten
# times smaller learning-rate range (5e-4/2.6**4 up to 5e-4).
cls_learner.freeze_to(-3)
cls_learner.fit_one_cycle(4, slice(5e-4/(2.6**4),5e-4), moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,0.087045,0.424461,0.791667,00:02
1,0.095922,0.421304,0.791667,00:02
2,0.086082,0.416386,0.791667,00:02
3,0.092283,0.435109,0.791667,00:02


In [210]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [211]:
# Final stage: unfreeze the learner and fine-tune for six epochs with the
# smallest learning-rate range (5e-5/2.6**4 up to 5e-5).
cls_learner.unfreeze()
cls_learner.fit_one_cycle(6, slice(5e-5/(2.6**4),5e-5), moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,0.068385,0.422301,0.791667,00:02
1,0.075419,0.435164,0.78125,00:02
2,0.074571,0.426764,0.822917,00:02
3,0.069021,0.429549,0.802083,00:02
4,0.068134,0.420946,0.802083,00:02
5,0.071122,0.427485,0.8125,00:02


In [212]:
split_type = "two_class"
split_id = "trial"

In [213]:
cls_learner.save(f'{split_type}_{split_id}_clas')

## Test on the Test Set

1. Test only on canonical SMILES

In [214]:
# Databunch whose validation slot holds the held-out `test` split (original,
# non-augmented SMILES); the training slot is filled with the non-augmented
# `train` split. The QSAR vocabulary is reused so token IDs match the model.
test_data_clas = TextClasDataBunch.from_df(path, train, test, bs=bs, tokenizer=tok, 
                              chunksize=50000, text_cols='Smiles',label_cols='Label', vocab=qsar_vocab.vocab, max_vocab=60000,
                                              include_bos=False)

# Fresh learner with the same architecture settings, restored from the
# classifier checkpoint saved after fine-tuning.
learner = text_classifier_learner(test_data_clas, AWD_LSTM, pretrained=False, drop_mult=0.2)
learner.load(f'{split_type}_{split_id}_clas', purge=False);

In [215]:
learner.model

SequentialRNN(
  (0): MultiBatchEncoder(
    (module): AWD_LSTM(
      (encoder): Embedding(48, 400, padding_idx=1)
      (encoder_dp): EmbeddingDropout(
        (emb): Embedding(48, 400, padding_idx=1)
      )
      (rnns): ModuleList(
        (0): WeightDropout(
          (module): LSTM(400, 1152, batch_first=True)
        )
        (1): WeightDropout(
          (module): LSTM(1152, 1152, batch_first=True)
        )
        (2): WeightDropout(
          (module): LSTM(1152, 400, batch_first=True)
        )
      )
      (input_dp): RNNDropout()
      (hidden_dps): ModuleList(
        (0): RNNDropout()
        (1): RNNDropout()
        (2): RNNDropout()
      )
    )
  )
  (1): PoolingLinearClassifier(
    (layers): Sequential(
      (0): BatchNorm1d(1200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (1): Dropout(p=0.08000000000000002, inplace=False)
      (2): Linear(in_features=1200, out_features=50, bias=True)
      (3): ReLU(inplace=True)
      (4): BatchN

In [216]:
test_data_clas

TextClasDataBunch;

Train: LabelList (389 items)
x: TextList
[BOS] C 1 C ( C ( O C 1 N 2 C = C C ( = N C 2 = O ) N ) C O P ( = O ) ( O ) O P ( = O ) ( O ) O ) O,[BOS] C C N ( C C C C 1 = C C = C C = C 1 ) C C C C 2 = C C = C C = C 2,[BOS] C C 1 2 C C C ( = O ) C = C 1 C C C 3 C 2 C C C 4 ( C 3 C C C 4 ( C ) O ) C,[BOS] C C N ( C C ) C C C 1 = C N C 2 = C 1 C = C C ( = C 2 ) F . Cl,[BOS] C N 1 C C C C ( C 1 ) C C 2 C 3 = C C = C C = C 3 S C 4 = C C = C C = C 2 4
y: CategoryList
1,0,1,0,0
Path: results/classification_new;

Valid: LabelList (48 items)
x: TextList
[BOS] C [N+] 1 ( C C C ( C 1 ) O C ( = O ) C ( C 2 C C C C 2 ) ( C 3 = C C = C C = C 3 ) O ) C,[BOS] C O C 1 = C C = C C = C 1 N 2 C C N C C 2,[BOS] C C 1 ( O C 2 C C 3 C 4 C C ( C 5 = C C ( = O ) C = C C 5 ( C 4 C ( C C 3 ( C 2 ( O 1 ) C ( = O ) C O ) C ) O ) C ) F ) C,[BOS] C N S ( = O ) ( = O ) C C C 1 = C C 2 = C ( C = C 1 ) N C = C 2 C 3 C C N ( C C 3 ) C,[BOS] xxpad . [OH-]
y: CategoryList
0,0,1,0,0
Path: results/classifica

In [217]:
test_get_scores(learner)

Testing 48 molecues
Accuracy: 0.833
False Positives: 0.146
False Negatives: 0.021
Recall: 0.938
Precision: 0.682
Sensitivity: 0.938
Specificity: 0.781
MCC: 0.680
ROCAUC: 0.939


2. Test on the averaged predictions for canonical and randomized SMILES.

In [218]:
def test_smiles_augmentation(df):
    dist_aug = {col_name: [] for col_name in df}
    
    for i in range(df.shape[0]):
        dist_aug['Smiles'].append(randomize_smiles(df.iloc[i]['Smiles']))
        dist_aug['Label'].append(df.iloc[i]['Label'])
                     
    return pd.DataFrame.from_dict(dist_aug)

In [219]:
lb = torch.tensor(test['Label'].values)

In [220]:
def _load_and_predict(valid_df):
    """Predict on `valid_df` with the saved classifier.

    Builds a databunch with `train` in the training slot and `valid_df` in the
    validation slot, restores the fine-tuned classifier checkpoint, and returns
    the databunch, the learner and the (predictions, labels) pair.
    """
    data = TextClasDataBunch.from_df(path, train, valid_df, bs=bs, tokenizer=tok,
                                     chunksize=50000, text_cols='Smiles', label_cols='Label',
                                     vocab=qsar_vocab.vocab, max_vocab=60000,
                                     include_bos=False)
    model = text_classifier_learner(data, AWD_LSTM, pretrained=False, drop_mult=0.2)
    model.load(f'{split_type}_{split_id}_clas', purge=False)
    return data, model, model.get_preds(ordered=True)


preds = []

# Randomized SMILES predictions: four rounds, each seeded differently.
for i in range(4):
    np.random.seed(12*i)
    test_aug = test_smiles_augmentation(test)
    test_data_clas, learner, (pred, lbl) = _load_and_predict(test_aug)
    preds.append(pred)

# Canonical SMILES predictions on the untouched test split.
test_data_clas, learner, (pred, lbl) = _load_and_predict(test)
preds.append(pred)



In [228]:
#sum(preds)/len(preds)

In [221]:
# Average the predictions over all runs (randomized + canonical), then score
# the second prediction column against the labels.
avg_preds = sum(preds)/len(preds)
ensemble_auc = roc_auc_score(lbl, avg_preds[:,1], multi_class="ovo")
print(f'Performance of Averaging Predictions of Canoicial and Randomized SMILES: {ensemble_auc:.3f}')

Performance of Averaging Predictions of Canoicial and Randomized SMILES: 0.908
