In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import os
import gc
import glob
import sys
import random
import string
import tqdm
import json
import time
import sqlite3
import warnings
import pandas as pd
import numpy as np

from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit import RDLogger

from SmilesPE.pretokenizer import atomwise_tokenizer
from SmilesPE.pretokenizer import kmer_tokenizer
from SmilesPE.spe2vec import Corpus

from multiprocessing import Pool

from fastai import *
from fastai.text import *
#from utils import *
import torch

# Make the project's helper scripts importable, then load them under the alias `su`.
sys.path.append('/scratch-shared/akshai/Publication/supp_scripts/')
import supp_utils as su

# Select the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
# (Optional: torch.cuda.set_device(0) pins the first GPU on a multi-GPU machine.)
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Echo the chosen device together with the CUDA availability flag as the cell output.
(device, torch.cuda.is_available())

Could not import custom script CNN


(device(type='cuda'), True)

In [2]:
# Raise RDKit's logger threshold so only CRITICAL messages are emitted.
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)

# Suppress Python warnings: DeprecationWarning explicitly, then every
# remaining category through the blanket "ignore" filter.
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore")

In [3]:
# Name of the JSON file holding the run configuration (opened relative to the working directory).
parameter_filename = 'parameters.json'

In [9]:
# Load the run configuration. The context manager guarantees the handle is
# closed even if the JSON cannot be parsed.
with open(parameter_filename) as parameter_file:
    parameters = json.load(parameter_file)

# User inputs
# e.g. "/scratch-shared/akshai/Publication/initial_models/ML_input_5338.txt"
input_file = parameters["input_file"] # input file

trial = parameters["trial"] # setting False saves the output files else not saved

# Removing data with lower distribution
lower_label_count_cutoff = int(parameters["label_cutoff"]["lower_label_count_cutoff"])
upper_label_count_cutoff = int(parameters["label_cutoff"]["upper_label_count_cutoff"])

k_fold_value = int(parameters["k_fold_value"]) # Number of folds

test_set_percentage = float(parameters["test_set_percentage"])

label_wise_augmentation = parameters["augmentation"]["label_wise_augmentation"]
number_of_augmentation = int(parameters["augmentation"]["number_of_augmentation"])
iteration = int(parameters["augmentation"]["iteration"])

tokenization = parameters["tokens"]["tokenization"] # options are SPE,atomwise,vocab_file
# Bind spe_token_path for every tokenization mode: the training cell reads the
# name unconditionally, so leaving it undefined raises NameError on a fresh
# kernel whenever tokenization is not "SPE".
if tokenization == "SPE":
    spe_token_path = parameters["tokens"]["spe_token_path"]
else:
    spe_token_path = None

#####################
# Network parameters#
#####################
load_model = parameters["pretrained_model"]["load_model"]
#if load_model is True set the path for pretrained_model_path
pretrained_model_path = parameters["pretrained_model"]["pretrained_model_path"]
pretraining_new_wt = parameters["pretrained_model"]["pretraining_new_wt"]
pretraining_new_vocab = parameters["pretrained_model"]["pretraining_new_vocab"]

epochs = int(parameters["network_parameters"]["epochs"])
batch_size = int(parameters["network_parameters"]["batch_size"])
learning_rate = float(parameters["network_parameters"]["learning_rate"])
enable_class_weight = parameters["network_parameters"]["enable_class_weight"]

Number_of_workers = int(parameters["Number_of_workers"])


##################
### Do not edit###
##################
# Create the output directory portably; exist_ok makes re-running the cell safe.
os.makedirs("run_files", exist_ok=True)

# Exactly one of the two tokenization flags is True.
train_SPE = tokenization == "SPE"
atomwise_tokenization = not train_SPE

# Outside trial mode, record the top-level parameters used for this run.
# The file is closed as soon as the block ends, so the contents are flushed to disk.
if not trial:
    with open("run_files/network_parameters.txt","w",1) as network_parameter_output:
        for parameter in parameters:
            network_parameter_output.write(str(parameter) + " = " + str(parameters[parameter]) + "\n")

In [10]:
# Read the input file through the project helper, applying the configured
# lower/upper label-count cutoffs with sanitization on and canonicalization off.
smiles_label, label_count = su.get_data_within_cutoff(
    input_file,
    lower_label_count_cutoff,
    upper_label_count_cutoff,
    sanitize=True,
    canonical=False,
)

                                                         

0 molecules removed after sanity check
1354/85860 data points obtained




In [11]:
# Split off the held-out test set. The helper's "valid" share is used as the
# test fraction here, and its third return value is discarded.
train_valid_df, test_df, _ = su.split_data_with_label(
    smiles_label,
    train_percentage=1-test_set_percentage,
    valid_percentage=test_set_percentage,
)

In [16]:
# Working directory handed to the fastai data bunches: results/classification_new
name = 'classification_new'
data_path = Path('results')
path = data_path / name

# Create the directory (and any missing parents); a no-op when it already exists.
path.mkdir(parents=True, exist_ok=True)

In [18]:
# Free memory before training. Order matters: run Python's garbage collector
# first so unreferenced objects are dropped ...
gc.collect()
# ... then ask PyTorch to release the cached GPU memory it is no longer using.
torch.cuda.empty_cache()

In [23]:
# K-fold training loop: for each fold, build the train/valid split, optionally
# augment the SMILES, build the fastai data bunches on top of the pretrained
# language model, fine-tune the classifier with gradual unfreezing and save it.

# The pretrained language-model files are the same for every fold, so resolve them once.
pretrained_model_path = Path(pretrained_model_path)
pretrained_fnames = [pretraining_new_wt, pretraining_new_vocab]
fnames = [pretrained_model_path/f'{fn}.{ext}' for fn,ext in zip(pretrained_fnames, ['pth', 'pkl'])]

# The tokenizer class depends only on the configuration, not on the fold.
# spe_token_path is only read in SPE mode, so atomwise runs no longer fail
# with NameError when that name was never bound.
# NOTE(review): assumes the atomwise tokenizer accepts token_path=None - confirm in supp_utils.
if tokenization == "SPE":
    MolTokenizer = su.molpmofit.MolTokenizer_spe_sos_eos
    token_path = spe_token_path
else:
    MolTokenizer = su.molpmofit.MolTokenizer_atomwise_sos_eos
    token_path = None

for fold in range(k_fold_value):

    # One log file per fold; it stays None in trial mode, where nothing is written.
    log_file = None
    try:
        if not trial:
            log_file = open("run_files/model_" + str(fold) + ".txt","w")

        piece_count = fold + 1
        # create train and valid dataframe
        train,valid,piece_count = su.CV.get_K_fold_cv_data(train_valid_df,k_fold_value,piece_count,shuffle_output=True)
        train_df = pd.DataFrame(train.items(),columns=["Smiles", "Label"]).sample(frac=1).reset_index(drop=True)
        valid_df = pd.DataFrame(valid.items(),columns=["Smiles", "Label"]).sample(frac=1).reset_index(drop=True)

        # calculate class_weight
        # NOTE(review): class_weight is computed and logged but never passed to the
        # learner's loss function below - confirm whether it should be applied.
        if enable_class_weight:
            class_weight = torch.FloatTensor(su.get_class_weight(train_df)).cuda()
            if not trial:
                log_file.write("Class weight for loss (balancing weights)= " + str(class_weight) + "\n")

        if not trial:
            log_file.write("Class distribution before augmentation\n")
            log_file.write("Train data\n")
            log_file.write(str(train_df.groupby('Label').count()) + "\n")
            log_file.write("Valid data\n")
            log_file.write(str(valid_df.groupby('Label').count()) + "\n")
            log_file.write("Test data\n")
            log_file.write(str(test_df.groupby('Label').count()) + "\n")

        # Data augmentation
        if number_of_augmentation > 0:
            if label_wise_augmentation:
                # Augmentation counts come from the helper instead of one global number.
                train_augmentation_list = su.get_augmentation_list(train_df,number_of_augmentation)
                number_of_augmentation_train = train_augmentation_list

                valid_augmentation_list = su.get_augmentation_list(valid_df,number_of_augmentation)
                number_of_augmentation_valid = valid_augmentation_list

                # The test set is identical across folds, so it is prepared only once.
                if fold == 0:
                    test_augmentation_list = su.get_augmentation_list(test_df,number_of_augmentation)
                    number_of_augmentation_test = test_augmentation_list

            else:
                number_of_augmentation_train = number_of_augmentation
                number_of_augmentation_valid = number_of_augmentation
                if fold == 0:
                    number_of_augmentation_test = number_of_augmentation

            train_data = su.smiles_augmentation(train_df,
                                                N_rounds=number_of_augmentation_train,
                                                iteration=iteration,
                                                data_set_type="train_data",
                                                Number_of_workers=Number_of_workers)

            valid_data = su.smiles_augmentation(valid_df,
                                                N_rounds=number_of_augmentation_valid,
                                                iteration=iteration,
                                                data_set_type="valid_data",
                                                Number_of_workers=Number_of_workers)
            if fold == 0:
                test_data = su.smiles_augmentation(test_df,
                                                N_rounds=number_of_augmentation_test,
                                                iteration=iteration,
                                                data_set_type="test_data",
                                                Number_of_workers=Number_of_workers)

            if not trial:
                log_file.write("number of augmentation = " + str(number_of_augmentation) + "\n")
                log_file.write("Class distribution after augmentation\n")
                log_file.write("Train data\n")
                log_file.write(str(train_data.groupby('Label').count()) + "\n")
                log_file.write("Valid data\n")
                log_file.write(str(valid_data.groupby('Label').count()) + "\n")
                log_file.write("Test data\n")
                # Report the augmented test set here (the validation set was logged twice before).
                log_file.write(str(test_data.groupby('Label').count()) + "\n")
        else:
            train_data = train_df
            valid_data = valid_df
            if fold == 0:
                test_data = test_df

        tok = Tokenizer(partial(MolTokenizer,token_path=token_path), n_cpus=Number_of_workers, pre_rules=[], post_rules=[])

        # Language-model data bunch, built to obtain the vocabulary for this fold.
        qsar_vocab = TextLMDataBunch.from_df(path, train_data, valid_data, bs=batch_size, tokenizer=tok,
                                      chunksize=50000, text_cols=0,label_cols=1, max_vocab=60000, include_bos=False)

        # Load the pretrained language model and export its encoder for the classifier.
        lm_learner = language_model_learner(qsar_vocab, AWD_LSTM, drop_mult=1.0)
        lm_learner = lm_learner.load_pretrained(*fnames)
        lm_learner.freeze()
        lm_learner.save_encoder('lm_encoder')

        data_clas = TextClasDataBunch.from_df(path, train_data, valid_data, bs=batch_size, tokenizer=tok,
                                                  chunksize=50000, text_cols='Smiles',label_cols='Label',
                                                  vocab=qsar_vocab.vocab, max_vocab=60000, include_bos=False)

        cls_learner = text_classifier_learner(data_clas, AWD_LSTM, pretrained=False, drop_mult=0.2)
        cls_learner.load_encoder('lm_encoder')

        # Gradual unfreezing: train the head first, then progressively more
        # layer groups with decreasing learning rates.
        cls_learner.freeze()
        cls_learner.fit_one_cycle(4, 3e-3, moms=(0.8,0.7))
        cls_learner.freeze_to(-2)
        cls_learner.fit_one_cycle(4, slice(5e-3/(2.6**4),5e-3), moms=(0.8,0.7))
        cls_learner.freeze_to(-3)
        cls_learner.fit_one_cycle(4, slice(5e-4/(2.6**4),5e-4), moms=(0.8,0.7))
        cls_learner.unfreeze()
        cls_learner.fit_one_cycle(6, slice(5e-5/(2.6**4),5e-5), moms=(0.8,0.7))

        split_type = ""
        split_id = "model_" + str(fold)
        cls_learner.save(f'{split_type}_{split_id}_clas')
        gc.collect()
        torch.cuda.empty_cache()
    finally:
        # Always release the per-fold log handle, even if training fails or is interrupted.
        if log_file is not None:
            log_file.close()

epoch,train_loss,valid_loss,accuracy,time
0,2.696974,2.592409,0.076923,00:02
1,2.654603,2.602691,0.062937,00:02
2,2.630439,2.602851,0.062937,00:02
3,2.610514,2.611013,0.076923,00:02


epoch,train_loss,valid_loss,accuracy,time
0,2.581578,2.819462,0.06993,00:02
1,2.611699,2.678876,0.055944,00:02
2,2.610717,2.64748,0.076923,00:02
3,2.586171,2.614305,0.104895,00:02


epoch,train_loss,valid_loss,accuracy,time
0,2.528219,2.676616,0.076923,00:03
1,2.539971,2.607183,0.104895,00:03
2,2.532332,2.608318,0.104895,00:03
3,2.534533,2.622225,0.104895,00:02


epoch,train_loss,valid_loss,accuracy,time
0,2.516232,2.638446,0.097902,00:03
1,2.530118,2.626099,0.097902,00:03
2,2.539635,2.593477,0.132867,00:03
3,2.536101,2.644271,0.06993,00:03


KeyboardInterrupt: 

NameError: name 'cls_learner' is not defined