# QSAR/QSPR Models Fine-Tuning 1: Classification 

This notebook is an example of a classification task on BBBP dataset.

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import time
import tqdm
import sqlite3

from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole

from SmilesPE.pretokenizer import atomwise_tokenizer
from SmilesPE.pretokenizer import kmer_tokenizer
from SmilesPE.spe2vec import Corpus

import pandas as pd

from multiprocessing import Pool

from fastai import *
from fastai.text import *
from utils import *
import torch
print (torch.__version__)

import supp_utils as su

torch.cuda.set_device(0) #change to 0 if you only has one GPU 

1.3.1


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device,torch.cuda.is_available()

(device(type='cuda'), True)

In [3]:
data_path = Path('results')
name = 'classification_new'
path = data_path/name
path.mkdir(exist_ok=True, parents=True)

In [4]:
# To remove rdkit and other warning

from rdkit import RDLogger
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)

warnings.filterwarnings('ignore')

## Load Data

In [80]:
trial = True # setting False saves the output files else not saved

multi_files = False

if multi_files:
    input_file = ["first_5000.txt","ML_input_5338.txt"]
else:
    input_file = "data/ML_input_5338_out.txt" # Input data containing smiles and label

if not trial:
    log_file.write("Used files " + str(input_file) + "\n")
    
number_of_augmentation = 1 # Data augmentation multiplier
train_percentage = 0.7 # Fraction to use for training (validation and test would be half of remaining data)
Number_of_workers = 8 # Number of CPU threads to use

In [81]:
# Removing data with lower distribution
lower_label_count_cutoff = 1000
upper_label_count_cutoff = 5000
enable_label_cutoff = True

if not multi_files and enable_label_cutoff:
    open_file = open(input_file,"r").readlines()
    
    # Find the  number of labels (count of each label)
    label_count_init = {}
    for entry in open_file:
        label = int(entry.split()[1])
        if label in label_count_init:
            label_count_init[label] += 1
        else:
            label_count_init[label] = 1
    
    # Select the label above the cutoff  (count of each label above cutoff)
    label_count = {}
    for entry in label_count_init:
        if label_count_init[entry] > lower_label_count_cutoff and label_count_init[entry] < upper_label_count_cutoff:
            label_count[entry] = label_count_init[entry]
    
    # Select only the smiles with labels above the cutoff
    smiles_label = {}
    allocated_label = []
    allowed_labels = sorted(label_count.keys())
    for entry in open_file:
        smiles = entry.split()[0]
        label = int(entry.split()[1])
        if label in allowed_labels:
            if label not in allocated_label:
                allocated_label.append(label)
            smiles_label[smiles] = allocated_label.index(label)
            
    

In [82]:
print (label_count)
print ()
print (allowed_labels)

{2: 1894, 1: 1887}

[1, 2]


In [83]:
check_dict ={}
for entry in smiles_label:
    if smiles_label[entry] not in check_dict:
        check_dict[smiles_label[entry]] = 1
    else:
        check_dict[smiles_label[entry]] += 1
#check_dict

In [84]:
if multi_files:
    for i,input_filename in enumerate(input_file):
        if i != 0:
            quantmap_data2 = pd.read_csv(input_filename,sep=" ",names=["Smiles", "Label"]).sample(frac=1).reset_index(drop=True) #,header=None)
            quantmap_data = pd.concat([quantmap_data,quantmap_data2])
        else:
            quantmap_data = pd.read_csv(input_filename,sep=" ",names=["Smiles", "Label"]).sample(frac=1).reset_index(drop=True) #,header=None)
    del quantmap_data2
else:
    quantmap_data = pd.read_csv(input_file,sep=" ",names=["Smiles", "Label"]).sample(frac=1).reset_index(drop=True)

if enable_label_cutoff:
    quantmap_data = pd.DataFrame(smiles_label.items(),columns=["Smiles", "Label"]).sample(frac=1).reset_index(drop=True)
    
print('Dataset:', quantmap_data.shape)
quantmap_data.head(5)

Dataset: (3781, 2)


Unnamed: 0,Smiles,Label
0,CN(C)CCC1=CNC2=C1C=C(C=C2)CC3COC(=O)N3,0
1,C(C1C(C(C(C(O1)OC2C(OC(C(C2O)O)OC3C(OC(C(C3O)O...,1
2,C1CCNC(C1)C(C2=CC(=NC3=C2C=CC=C3C(F)(F)F)C(F)(...,0
3,CN1C=NC2=C1C(=O)N=C(N2)N,1
4,CC1=C(N=CN1)CSCCNC(=NC)NC#N,0


In [85]:
print (quantmap_data.groupby('Label').count())
if not trial:
    log_file.write("Class distribution before augmentation\n")
    log_file.write(str(quantmap_data.groupby('Label').count()) + "\n")

       Smiles
Label        
0        1894
1        1887


In [86]:
# Removing unbalanced data and shuffling
balance_data = False
if balance_data:
    balanced_data = quantmap_data[quantmap_data.Label != 0][quantmap_data.Label != 3][quantmap_data.Label != 4]
    balanced_data = balanced_data.sample(frac=1).reset_index(drop=True)
    balanced_data["Label"].replace({1: 0,2:1}, inplace=True)
    quantmap_data = balanced_data
    balanced_data.groupby('Label').count()


In [87]:
quantmap_data = su.sanity_check(quantmap_data,None,Number_of_workers)

                                        

In [88]:
valid_test_percentage = (1 - train_percentage)/2

data_to_use = quantmap_data
# Ratios
train_ratio = int (len(data_to_use) * train_percentage)
valid_ratio = train_ratio + int(len(data_to_use)*valid_test_percentage)
test_ratio = valid_ratio + int(len(data_to_use)*valid_test_percentage)

In [89]:
# Make index to split into train and val set
np.random.seed(3)
def make_index(len_data,train_ratio,valid_ratio,test_ratio):
    
    index = np.random.permutation(len_data)
    
    # Train index and val index
    return (index[:train_ratio],index[train_ratio:valid_ratio],index[valid_ratio:test_ratio])

train_index ,valid_index,test_index = make_index(len(data_to_use),train_ratio,valid_ratio,test_ratio)

In [90]:
train_data = quantmap_data[quantmap_data.index.isin(train_index)]
valid_data = quantmap_data[quantmap_data.index.isin(valid_index)]
test_data = quantmap_data[quantmap_data.index.isin(test_index)]

In [91]:
class_count_list = np.array([entry for entry in train_data.groupby('Label').count()["Smiles"]])
class_weight = np.min(class_count_list)/class_count_list
class_weight = torch.FloatTensor(class_weight).cuda()

if not trial:
    log_file.write("Class weight for loss (balancing weights)= " + str(class_weight) + "\n")
class_weight

tensor([1.0000, 0.9985], device='cuda:0')

In [92]:
train_augmentation_list = su.get_augmentation_list(train_data,number_of_augmentation)
valid_augmentation_list = su.get_augmentation_list(valid_data,number_of_augmentation)
test_augmentation_list = su.get_augmentation_list(test_data,number_of_augmentation)

In [93]:
###
iteration = 1000000
# Augmentation for training data
train_aug = su.smiles_augmentation(train_data,N_rounds=train_augmentation_list,iteration=iteration,data_set_type="train_data",Number_of_workers=Number_of_workers)
valid_aug = su.smiles_augmentation(valid_data,N_rounds=valid_augmentation_list,iteration=iteration,data_set_type="valid_data",Number_of_workers=Number_of_workers)
test_aug = su.smiles_augmentation(test_data,N_rounds=test_augmentation_list,iteration=iteration,data_set_type="test_data",Number_of_workers=Number_of_workers)

                                        

In [94]:
print (train_data.groupby('Label').count())
print (valid_data.groupby('Label').count())
print (test_data.groupby('Label').count())
if not trial:
    log_file.write("number of augmentation = " + str(number_of_augmentation) + "\n")
    log_file.write("Class distribution after augmentation\n")
    log_file.write("Train data\n")
    log_file.write(str(train_data.groupby('Label').count()) + "\n")
    log_file.write("Valid data\n")
    log_file.write(str(valid_data.groupby('Label').count()) + "\n")
    log_file.write("Test data\n")
    log_file.write(str(test_data.groupby('Label').count()) + "\n")
    log_file.write("Train/valid split ratio = " + str(train_percentage) + "\n")

       Smiles
Label        
0        1322
1        1324
       Smiles
Label        
0         292
1         275
       Smiles
Label        
0         280
1         287


## Adpot the Encoder of MSPM According to the Target Dataset.

In order to fine-tuning the pre-trained MSPM on the QSAR datasets of interest, we need to prepare the data:

- Tokenize the SMILES of the QSAR dataset.
- Align the token IDs of the QSAR dataset to the token IDs pre-trained MSPM. 

Often, the vocab size of the QSAR dataset is different from that of the pre-trained strcuture prediction model, which means the QSAR model will have a different input size from that of pre-trained model. Here, we need to change the input size of the pre-trained model to the vocab size of the QSAR dataset.

In [95]:
class MolTokenizer(BaseTokenizer):
    def __init__(self, lang = 'en'):
        self.lang = lang
        
    def tokenizer(self, smiles):        
        smiles = "[BOS]" + smiles
        tokens = atomwise_tokenizer(smiles)
        return tokens
    
    def add_special_cases(self, toks):
        pass

In [96]:
bs = 64
tok = Tokenizer(partial(MolTokenizer), n_cpus=8, pre_rules=[], post_rules=[])

In [97]:
qsar_vocab = TextLMDataBunch.from_df(path, train_aug, valid_aug, bs=bs, tokenizer=tok, 
                              chunksize=50000, text_cols=0,label_cols=1, max_vocab=60000, include_bos=False)

print(f'Vocab Size: {len(qsar_vocab.vocab.itos)}')

Vocab Size: 96


In [98]:
pretrained_model_path = Path('results/pretraining_new/models')

pretrained_fnames = ['pretraining_new_wt', 'pretraining_new_vocab']
fnames = [pretrained_model_path/f'{fn}.{ext}' for fn,ext in zip(pretrained_fnames, ['pth', 'pkl'])]

In [99]:
lm_learner = language_model_learner(qsar_vocab, AWD_LSTM, drop_mult=1.0)
lm_learner = lm_learner.load_pretrained(*fnames)
lm_learner.freeze()
lm_learner.save_encoder(f'lm_encoder')

## Databunch for QSAR Modeling

You need to change the `text_cols` and `label_col` based on your dataset.

In [100]:
data_clas = TextClasDataBunch.from_df(path, train_aug, valid_aug, bs=bs, tokenizer=tok, 
                                          chunksize=50000, text_cols='Smiles',label_cols='Label', 
                                          vocab=qsar_vocab.vocab, max_vocab=60000, include_bos=False)

In [101]:
data_clas.show_batch()

text,target
[BOS] C C 1 C ( C ( C C ( O 1 ) O C 2 C ( C ( C ( O C 2 O C 3 = C 4 C = C 5 C = C 3 O C 6 = C ( C = C ( C = C 6 ) C ( C ( C ( = O ) N C ( C ( =,0
[BOS] C C C ( C ) C ( C ( = O ) N C ( C C ( = O ) N ) C ( = O ) N C ( C C ( C ) C ) C ( = O ) N C ( C C C C N ) C ( = O ) N C ( C ) C ( = O ) N,1
[BOS] C ( C C C 1 C ( C ) ( C C ( N ) = O ) C 2 = C ( C ) C 3 = N C ( C ) ( C 4 C ( C C ( N ) = O ) C ( C ) ( C C C ( N C C ( O P ( = O ) ( O ),0
[BOS] C C 1 C ( C ( C C ( O 1 ) O C 2 C C ( C C 3 = C ( C 4 = C ( C ( = C 2 3 ) O ) C ( = O ) C 5 = C ( C 4 = O ) C = C C = C 5 O C ) O ) ( C (,1
[BOS] C 1 ( C C ( C ) C ) N ( C ) C ( = O ) C N ( C ) C ( = O ) C ( C C ) N C ( = O ) C ( C ( C ( C ) C C = C C ) O ) N ( C ) C ( = O ) C ( C (,0


In [102]:
class AUROC(Callback):
    _order = -20 #Needs to run before the recorder

    def __init__(self, learn, **kwargs): self.learn = learn
    def on_train_begin(self, **kwargs): self.learn.recorder.add_metric_names(['AUROC'])
    def on_epoch_begin(self, **kwargs): self.output, self.target = [], []
    
    def on_batch_end(self, last_target, last_output, train, **kwargs):
        if not train:
            self.output.append(last_output)
            self.target.append(last_target)
                
    def on_epoch_end(self, last_metrics, **kwargs):
        if len(self.output) > 0:
            output = torch.cat(self.output)
            target = torch.cat(self.target)
            preds = F.softmax(output, dim=1)
            metric = auroc_score(preds, target)
            return add_metrics(last_metrics, [metric])

## Fine-tuning

In [103]:
cls_learner = text_classifier_learner(data_clas, AWD_LSTM, pretrained=False, drop_mult=0.2)
cls_learner.load_encoder(f'lm_encoder')
cls_learner.freeze()

In [104]:
#cls_learner.model

In [105]:
cls_learner.fit_one_cycle(4, 3e-3, moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,0.72831,0.702305,0.527337,00:04
1,0.683058,0.689539,0.570547,00:04
2,0.65171,0.699294,0.563492,00:04
3,0.615875,0.712868,0.563492,00:04


In [106]:
cls_learner.freeze_to(-2)
cls_learner.fit_one_cycle(4, slice(5e-3/(2.6**4),5e-3), moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,0.635676,0.735681,0.528219,00:04
1,0.644104,0.738805,0.536155,00:04
2,0.579515,0.78308,0.548501,00:04
3,0.493157,0.82831,0.551146,00:04


In [107]:
cls_learner.freeze_to(-3)
cls_learner.fit_one_cycle(4, slice(5e-4/(2.6**4),5e-4), moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,0.458311,0.854441,0.549383,00:06
1,0.442702,0.886316,0.546737,00:06
2,0.417213,0.919518,0.547619,00:06
3,0.412886,0.936245,0.55291,00:06


In [108]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [109]:
cls_learner.unfreeze()
cls_learner.fit_one_cycle(6, slice(5e-5/(2.6**4),5e-5), moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,0.417834,0.930339,0.544974,00:07
1,0.394736,0.918777,0.541446,00:07
2,0.409746,0.947798,0.546737,00:07
3,0.405609,0.933561,0.552028,00:07
4,0.385743,0.937424,0.551146,00:07
5,0.391422,0.946191,0.545855,00:07


In [39]:
cls_learner.fit_one_cycle(20, slice(5e-5/(2.6**4),5e-5), moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,0.388172,0.762243,0.622575,00:04
1,0.382363,0.768161,0.620811,00:04
2,0.387755,0.766937,0.619048,00:04
3,0.373341,0.770508,0.620811,00:04
4,0.37505,0.774783,0.619048,00:04
5,0.375479,0.769883,0.606702,00:04
6,0.369886,0.776303,0.613757,00:04
7,0.374325,0.78726,0.61552,00:04
8,0.365175,0.778524,0.613757,00:04
9,0.368064,0.785986,0.606702,00:04


In [116]:
split_type = "two_class"
split_id = "trial"

In [117]:
cls_learner.save(f'{split_type}_{split_id}_clas')

## Test on the Test Set

1. Test only on Canoicial SMILES

In [118]:
test_data_clas = TextClasDataBunch.from_df(path, train, test, bs=bs, tokenizer=tok, 
                              chunksize=50000, text_cols='Smiles',label_cols='Label', vocab=qsar_vocab.vocab, max_vocab=60000,
                                              include_bos=False)

learner = text_classifier_learner(test_data_clas, AWD_LSTM, pretrained=False, drop_mult=0.2)
learner.load(f'{split_type}_{split_id}_clas', purge=False);

NameError: name 'test' is not defined

In [None]:
learner.model

In [None]:
test_data_clas

In [None]:
test_get_scores(learner)

2. Test on averaging prediction of canoicial and randomized SMILES.

In [218]:
def test_smiles_augmentation(df):
    dist_aug = {col_name: [] for col_name in df}
    
    for i in range(df.shape[0]):
        dist_aug['Smiles'].append(randomize_smiles(df.iloc[i]['Smiles']))
        dist_aug['Label'].append(df.iloc[i]['Label'])
                     
    return pd.DataFrame.from_dict(dist_aug)

In [219]:
lb = torch.tensor(test['Label'].values)

In [220]:
preds = []

# Randomized SMILES Predictions
for i in range(4):
    np.random.seed(12*i)    
    test_aug = test_smiles_augmentation(test)
    
    # model
    test_data_clas = TextClasDataBunch.from_df(path, train, test_aug, bs=bs, tokenizer=tok, 
                              chunksize=50000, text_cols='Smiles',label_cols='Label', vocab=qsar_vocab.vocab, max_vocab=60000,
                                              include_bos=False)
    learner = text_classifier_learner(test_data_clas, AWD_LSTM, pretrained=False, drop_mult=0.2)
    learner.load(f'{split_type}_{split_id}_clas', purge=False);
    
    
    #get predictions
    pred,lbl = learner.get_preds(ordered=True)
    
    preds.append(pred)

# Canonical SMILES Predictions

test_data_clas = TextClasDataBunch.from_df(path, train, test, bs=bs, tokenizer=tok, 
                              chunksize=50000, text_cols='Smiles',label_cols='Label', vocab=qsar_vocab.vocab, max_vocab=60000,
                                              include_bos=False)

learner = text_classifier_learner(test_data_clas, AWD_LSTM, pretrained=False, drop_mult=0.2)
learner.load(f'{split_type}_{split_id}_clas', purge=False);


pred,lbl = learner.get_preds(ordered=True)


preds.append(pred)



In [228]:
#sum(preds)/len(preds)

In [221]:
avg_preds = sum(preds)/len(preds)
print(f'Performance of Averaging Predictions of Canoicial and Randomized SMILES: {roc_auc_score(lbl, avg_preds[:,1], multi_class="ovo"):.3f}')

Performance of Averaging Predictions of Canoicial and Randomized SMILES: 0.908
