# General-Domain MSPM Training

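In this notebook, we are going to train a universal molecular structure prediction model (MSPM) on **one million** compounds curated from [ChEMBL](https://www.ebi.ac.uk/chembl/). Note that the cells below, as executed, read their SMILES from a local QuantMap SQLite database and report roughly 16.8 million compounds rather than one million.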

We use [SMILES](https://en.wikipedia.org/wiki/Simplified_molecular-input_line-entry_system) as the molecular representation. SMILES is a textual representation of molecules. 

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import time
import tqdm
import sqlite3

from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole

from SmilesPE.pretokenizer import atomwise_tokenizer
from SmilesPE.pretokenizer import kmer_tokenizer
from SmilesPE.spe2vec import Corpus

import pandas as pd

from multiprocessing import Pool

from fastai import *
from fastai.text import *
from utils import *
import torch
print (torch.__version__)

# Pin CUDA work to GPU 0 (the only valid index on a single-GPU machine; change
# it to select another card). The call is skipped when CUDA is unavailable,
# because torch.cuda.set_device raises there and would abort the whole cell.
if torch.cuda.is_available():
    torch.cuda.set_device(0)

1.3.1


In [2]:
# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Display the chosen device next to the CUDA availability flag.
(device, torch.cuda.is_available())

(device(type='cuda'), True)

In [3]:
Number_of_workers = 8 # Number of worker processes passed to multiprocessing.Pool
train_percentage = 99 # Percentage of rows used for training; the remaining rows form the validation set

## Get quantmap smiles

In [4]:
# Open the QuantMap SQLite database and create the cursor used for the SMILES query.
# NOTE(review): the path is a machine-specific absolute path, and the connection
# is never closed in the visible cells.
conn_old = sqlite3.connect('/mnt/external-images-pvc/quantmap/qm_chem_fix.sqlite') #OLD DATABASE
co = conn_old.cursor()

#### Remove rdkit warnings (optional)

In [5]:
# Silence RDKit warnings: raise the RDKit logger threshold to CRITICAL.

from rdkit import RDLogger
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)

#### Get data, convert to canonical smiles and save it

In [6]:
def to_canonical_smiles(molecule):
    """Convert a SMILES string to its RDKit canonical form.

    Parameters
    ----------
    molecule : str
        Input SMILES string.

    Returns
    -------
    str or bool
        The canonical SMILES on success, or ``False`` when the input is not a
        string, cannot be parsed, or RDKit raises while converting it. Callers
        distinguish success from failure by checking that the result is a str.
    """
    # Database NULLs (None) and pandas NaN (float) can never be parsed.
    if not isinstance(molecule, str):
        return False
    try:
        mol = Chem.MolFromSmiles(molecule)
        # RDKit signals an unparsable SMILES by returning None, not by raising.
        if mol is None:
            return False
        return Chem.MolToSmiles(mol)
    except Exception:
        # Only ordinary errors are converted to the False sentinel;
        # KeyboardInterrupt / SystemExit still propagate so a pool worker
        # running this function can be interrupted.
        return False

In [None]:
co.execute("select distinct smiles from stitch_chem;")
query_out = co.fetchall()

# Every fetched row is a 1-tuple; keep only the SMILES value.
quantmap_smiles = [row[0] for row in query_out]

# Canonicalise in parallel. imap yields results in input order, so each output
# lines up with its input. Batching the tasks (chunksize) cuts the per-item
# inter-process overhead without changing the results. Leaving the `with`
# block terminates the pool, as the previous close()/terminate() pair did.
with Pool(Number_of_workers) as p:
    canonical_quantmap_smiles = list(
        tqdm.tqdm(
            p.imap(to_canonical_smiles, quantmap_smiles, chunksize=1000),
            total=len(quantmap_smiles),
        )
    )


In [9]:
# Write one "<smiles>,yes" row per successfully canonicalised SMILES.
# The `with` block closes (and flushes) the file even if a write fails.
with open("quantmap_canonical_smiles.csv","w") as out_csv:
    out_csv.write("Smiles,Canonical\n")
    loop = tqdm.tqdm(canonical_quantmap_smiles,total=len(canonical_quantmap_smiles),leave=False)
    for smile in loop:
        # Failed conversions are represented by False and are skipped.
        # Empty strings are skipped too: they would produce a blank Smiles
        # field, which pandas reads back as NaN rather than as text.
        if isinstance(smile, str) and smile:
            out_csv.write(smile + ",yes\n")

                                                               

In [7]:
# Read the previously saved canonical csv with shuffling
# (sample(frac=1) returns every row in random order; reset_index renumbers them).
# NOTE(review): the CSV was written above to the relative path
# "quantmap_canonical_smiles.csv", but is read here from an absolute path; confirm
# both refer to the same file. The shuffle is unseeded, so the row order, and
# therefore the train/validation split made from it, differs on every run.
csv_path = '/scratch-shared/akshai/workdir/molpmofit_method/quantmap_canonical_smiles.csv'
smiles_df = pd.read_csv(csv_path).sample(frac=1).reset_index(drop=True)

In [9]:
# Position of the first validation row: the leading train_percentage percent
# of the (already shuffled) frame is used for training.
ratio = int(len(smiles_df) * (train_percentage/100))

# Rows before the cut go to training, rows from the cut onward to validation.
train_df = smiles_df.iloc[:ratio]
valid_df = smiles_df.iloc[ratio:]

# Persist both parts so later cells can reload the exact same split.
train_df.to_pickle("train_canonical.pkl")
valid_df.to_pickle("valid_canonical.pkl")

print (f"Number of training smiles = {len(train_df)}")
print (f"Number of validation smiles = {len(valid_df)}")

Number of training smiles = 16611236
Number of validation smiles = 167791


In [10]:
# Load the pickle data
# (the train/validation split saved as train_canonical.pkl / valid_canonical.pkl).
# NOTE(review): pickle files should only be loaded from trusted locations.
train_df = pd.read_pickle("train_canonical.pkl")
valid_df = pd.read_pickle("valid_canonical.pkl")

### Smiles augmentation

In [11]:
from functools import partial

def randomize_smiles(smiles,random_smiles=None,iteration=5):
    """Generate a non-canonical SMILES for the same molecule by shuffling atom order.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule to randomise.
    random_smiles : container of str, optional
        Randomised SMILES already produced for this molecule; a candidate found
        in it is rejected and another attempt is made. ``None`` (the default)
        means nothing has been produced yet.
    iteration : int
        Maximum number of attempts. At least one attempt is always made.

    Returns
    -------
    str or bool
        A randomised SMILES not present in ``random_smiles``, or ``False`` when
        the input cannot be processed or every attempt produced a duplicate.
    """
    # None / NaN inputs can never be parsed.
    if not isinstance(smiles, str):
        return False
    seen = () if random_smiles is None else random_smiles

    try:
        # Parse once; the parsed molecule is reused for every attempt.
        m = Chem.MolFromSmiles(smiles)
        if m is None:
            return False
        n_atoms = m.GetNumAtoms()
    except Exception:
        return False

    for _ in range(max(iteration, 1)):
        try:
            ans = list(range(n_atoms))
            np.random.shuffle(ans)
            nm = Chem.RenumberAtoms(m,ans)
            # canonical=False keeps the shuffled atom order in the output string.
            out_smiles = Chem.MolToSmiles(nm, canonical=False, isomericSmiles=True, kekuleSmiles=False)
        except Exception:
            return False
        if out_smiles not in seen:
            return out_smiles

    # Every attempt reproduced an already-known string.
    return False
    
def augment_smiles(count,iteration,smiles):
    """Collect up to ``count`` distinct randomised SMILES for one molecule.

    Parameters
    ----------
    count : int
        Number of randomised SMILES requested.
    iteration : int
        Attempt budget forwarded to ``randomize_smiles`` for each one.
    smiles : str or None
        SMILES of the molecule; ``None`` yields an empty list.

    Returns
    -------
    list of str
        The randomised SMILES found. It is shorter than ``count`` when
        ``randomize_smiles`` reports failure, at which point collection stops.
    """
    random_smiles = []
    # The input does not change between rounds, so check it once up front.
    if smiles is None:
        return random_smiles

    for _ in range(count):
        out_smiles = randomize_smiles(smiles,random_smiles,iteration=iteration)
        if not out_smiles:
            # A failure will not be cured by retrying; keep what we have.
            break
        random_smiles.append(out_smiles)

    return random_smiles

def unpack_and_write_list(smiles,filename):
    """Write every string in an arbitrarily nested list as a ``<entry>,no`` row.

    ``smiles`` is walked depth-first in its original order. Despite its name,
    ``filename`` is an open writable file-like object, not a path.
    """
    # Stack of iterators: the top one is the (sub)list currently being walked.
    pending = [iter(smiles)]
    while pending:
        for entry in pending[-1]:
            if type(entry) is list:
                # Descend into the nested list; its parent resumes afterwards.
                pending.append(iter(entry))
                break
            filename.write(entry + ",no\n")
        else:
            # Current (sub)list exhausted: go back to its parent.
            pending.pop()
    
def smiles_augmentation(df, N_rounds=1,iteration=5,data_set_type="train"):
    """Augment the SMILES in ``df`` and write them, with the originals, to a CSV.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'Smiles'`` column holding the canonical SMILES.
    N_rounds : int
        Number of randomised SMILES requested per molecule.
    iteration : int
        Attempt budget per randomised SMILES (forwarded to ``augment_smiles``).
    data_set_type : str
        Prefix of the output file ``<data_set_type>_aug_canonical_smiles.csv``.

    Returns
    -------
    None
        The result is only written to disk. The ``Canonical`` column is "no"
        for randomised SMILES and "yes" for the original canonical ones.
    """
    canonical_smiles = df['Smiles'].to_list()

    func = partial(augment_smiles, N_rounds, iteration)
    # The context manager shuts the pool down even if a worker fails. imap keeps
    # input order; batching tasks (chunksize) only reduces inter-process overhead.
    with Pool(Number_of_workers) as p:
        augmented_smiles = list(tqdm.tqdm(p.imap(func, canonical_smiles, chunksize=1000), total=len(canonical_smiles)))

    print ("Saving data")

    filename = str(data_set_type) + "_aug_canonical_smiles.csv"

    # `with` guarantees the file is flushed and closed even if a write fails.
    with open(filename,"w") as aug_out:
        aug_out.write("Smiles,Canonical\n")

        # Randomised (non-canonical) SMILES are written first, labelled "no".
        unpack_and_write_list(augmented_smiles,filename=aug_out)

        # The originals are canonical, so they are labelled "yes", matching
        # the label used in the canonical-only CSV.
        for canonical in canonical_smiles:
            aug_out.write(canonical + ",yes\n")

    print ("Saved data")


In [12]:
import random, math

def read_shuffled_chunks(filepath: str, chunk_size: int,
                        file_length: int, has_header=True):
    """Yield a CSV file as DataFrames of randomly selected rows.

    The data-row indices are shuffled once with a fixed seed and split into
    consecutive groups of ``chunk_size``; each group is read with
    ``pandas.read_csv`` by skipping all other data rows. Every data row appears
    in exactly one chunk. Rows inside a chunk keep their file order.

    Parameters
    ----------
    filepath : str
        Path of the CSV file.
    chunk_size : int
        Maximum number of data rows per yielded frame (must be >= 1).
    file_length : int
        Total number of lines in the file, including the header line if present.
    has_header : bool
        Whether the first line is a header row.

    Yields
    ------
    pandas.DataFrame
        One frame per chunk; the last one may hold fewer than ``chunk_size`` rows.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1 (raised when iteration starts).
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    header = 0 if has_header else None
    first_data_idx = 1 if has_header else 0
    # Line numbers of the data rows; the header line (0) is never included,
    # so it is never skipped and pandas can always use it as the header.
    index_list = list(range(first_data_idx,file_length))

    # Shuffle with a private, fixed-seed generator so the chunking is
    # repeatable without resetting the global `random` state of the caller.
    random.Random(1).shuffle(index_list)

    # Built once: each chunk needs "all data rows except mine".
    all_rows = set(index_list)

    # Count chunks from the data rows, not the raw file length, so a header
    # line cannot create a trailing empty chunk.
    n_chunks = math.ceil(len(index_list)/chunk_size)
    for i in range(n_chunks):
        # The slice end is exclusive, so this selects exactly chunk_size rows.
        rows_to_keep = index_list[(i*chunk_size):((i+1)*chunk_size)]
        rows_to_skip = list(all_rows.difference(rows_to_keep))
        yield pd.read_csv(filepath,skiprows=rows_to_skip, header=header)

In [13]:
# Number of randomised SMILES requested per molecule (passed as N_rounds).
number_of_augmentation = 1

# Augmentation for training data
# Writes train_aug_canonical_smiles.csv; iteration=100 is the per-molecule attempt budget.
# NOTE(review): smiles_augmentation has no return statement, so train_data and
# val_data are bound to None here; the results exist only in the CSV files.
train_data = smiles_augmentation(train_df,N_rounds=number_of_augmentation,iteration=100,data_set_type="train")

# Augmentation for validation data
# Writes valid_aug_canonical_smiles.csv.
val_data = smiles_augmentation(valid_df,N_rounds=number_of_augmentation,iteration=100,data_set_type="valid")

100%|██████████| 16611236/16611236 [20:33<00:00, 13463.81it/s] 


Saving data
Saved data


100%|██████████| 167791/167791 [00:11<00:00, 13984.40it/s]


Saving data
Saved data


In [13]:
#test_expected_file_length = int((number_of_augmentation + 1 ) * train_df.shape[0])
#valid_expected_file_length = int((number_of_augmentation + 1 ) * valid_df.shape[0])
#train_chunk = read_shuffled_chunks("train_aug_canonical_smiles.csv", 100,test_expected_file_length, has_header=True)
#valid_chunk = read_shuffled_chunks("valid_aug_canonical_smiles.csv", 100,valid_expected_file_length, has_header=True)

In [14]:
# Load the augmented CSVs written by smiles_augmentation and shuffle their rows.
# NOTE(review): the shuffles are unseeded, so the row order differs between runs.
train_data = pd.read_csv("train_aug_canonical_smiles.csv", header=0).sample(frac=1).reset_index(drop=True)
valid_data = pd.read_csv("valid_aug_canonical_smiles.csv", header=0).sample(frac=1).reset_index(drop=True)

## Prepare data for training tokenizer

In [15]:
import codecs
from SmilesPE.learner import *
from SmilesPE.tokenizer import *

In [16]:
# Pool the training and validation SMILES into one list for tokenizer training.
all_smiles = train_data['Smiles'].to_list() + valid_data['Smiles'].to_list()

# Sanity check: no row was lost or duplicated while concatenating.
assert len(all_smiles) == len(train_data) + len(valid_data)

print('Number of SMILES:', len(all_smiles))

Number of SMILES: 33558045


In [17]:
# Output locations: results/<name>/ for this run and results/<name>/models/ for the vocabulary file.
# NOTE(review): Path is not imported explicitly in the visible cells; presumably
# it is provided by one of the star imports. Confirm.
result_path = Path('results')
name = 'pretraining_new'
path = result_path/name
path.mkdir(exist_ok=True, parents=True)

mdl_path = path/'models'
mdl_path.mkdir(exist_ok=True)
# File that receives the learned SPE vocabulary (a plain string, relative to the working directory).
token_path = 'results/tokens.txt'

In [None]:
%%time
output = codecs.open(token_path, 'w')
learn_SPE(all_smiles, output, 30000, min_frequency=2000, augmentation=0, verbose=False, total_symbols=True)

Counting SMILES...
3355010 unique Canonical SMILES
Gettting Pair Statistics


Number of unique characters & Reducing number of merge operations by: 490
Unique characters: {'[Os+4]', '[Zn+]', 'N', '[Cl+]', '[Cu+2]', '[Re-2]', '[Os+2]', '[sH+]', '[Zr-2]', '[Cs+]', '8', '[Ne]', '[OH-]', '[S-2]', '[I+3]', '[Al-]', '[pH]', '[Ca+2]', '=', '[Ru-]', '[I+2]', '[Lu+3]', '[Tm]', '[O-]', '[Ti]', '[SH-]', '[Al-2]', '[Fm]', '[Cr+4]', '[Tc+]', '[Pa]', '[Cr+2]', '%29', '[B-]', '[SiH]', '[Pd+2]', '[Si+4]', '[Os-2]', '[Mn+2]', '[n+]', '[Si+3]', 'F', '[SH3+]', '[Y+3]', '[Li+]', '[In+3]', '[Xe]', '[al]', '[P+]', '[Fe-2]', 'B', '[Ir]', '[Au-]', '[Fe+2]', '[cH+]', '[Dy+3]', '%20', '[Al]', '[SnH3]', '[Lr]', '[Hf+2]', '[In-]', '[H-]', '[Se]', '[Ti+4]', '[PbH2]', '[NH]', '[Yb]', '[W+]', '[Gd+2]', '[Ag+]', '[Bk]', '[Ta+5]', '[Y]', '[BiH]', '[Sb+3]', '[At]', '[TeH]', '[ZrH]', '[U+2]', '[Co]', '[Pt+2]', '[Tl+]', '%10', '[NH2+]', '[Ir+2]', '[Ti-]', '[Ce+3]', '[Co+3]', '[NiH]', '[Ga]', '[K+]', '[NH4+]', '[Kr]', '[Th]', '[Os]', '[La]', '[SbH]', '[RuH]', '[Be+2]', '[Mo+4]', '[Cl]', '[AsH2]', '

In [22]:
# Build an SPE tokenizer from the learned vocabulary file.
# NOTE(review): this file handle is never closed in the visible cells; whether it
# may be closed right after construction depends on when SPE_Tokenizer reads it. Confirm.
spe_vob= codecs.open(token_path)
spe = SPE_Tokenizer(spe_vob)

# Example molecule: print its SPE tokenization next to the atom-level tokenization.
smi = 'CC[N+](C)(C)Cc1IcccBrcc1[Ce+3][Fe+3]'
print (spe.tokenize(smi),atomwise_tokenizer(smi))

CC [N+](C)(C) Cc1 I cc c Br cc1 [Ce+3] [Fe+3] ['C', 'C', '[N+]', '(', 'C', ')', '(', 'C', ')', 'C', 'c', '1', 'I', 'c', 'c', 'c', 'Br', 'c', 'c', '1', '[Ce+3]', '[Fe+3]']


In [35]:
# Reload the canonical (non-augmented) split and shuffle it for language-model training.
# NOTE(review): this rebinds train_data / valid_data, which previously held the
# augmented frames; the shuffles are unseeded.
train_data = pd.read_pickle("train_canonical.pkl").sample(frac=1).reset_index(drop=True)
valid_data = pd.read_pickle("valid_canonical.pkl").sample(frac=1).reset_index(drop=True)

# Uncomment to keep only the first 0.05% of each frame for a quick trial run.
#train_data = train_data.iloc[:int(len(train_data) * 0.0005),:]
#valid_data = valid_data.iloc[:int(len(valid_data) * 0.0005),:]

In [36]:
len(train_data)

16611236

In [37]:
class MolTokenizer(BaseTokenizer):
    """Tokenizer that splits a SMILES string into atom-level tokens."""

    def __init__(self, lang = 'en'):
        # The language code is only stored; tokenization itself does not use it.
        self.lang = lang

    def tokenizer(self, smiles):
        """Return the atom-wise token list for ``smiles``."""
        return atomwise_tokenizer(smiles)

    def add_special_cases(self, toks):
        """Accept special-case tokens and ignore them (deliberate no-op)."""
        return None

In [38]:
# Wrap MolTokenizer in a fastai Tokenizer with no pre/post text rules.
# The worker count comes from the notebook-wide Number_of_workers setting
# instead of a second hard-coded 8, so the two cannot drift apart.
tok = Tokenizer(partial(MolTokenizer), n_cpus=Number_of_workers, pre_rules=[], post_rules=[])

In [39]:
%%time
bs = 512 # batch size

# Build the language-model databunch from the two frames using the atom-level tokenizer.
# text_cols=0 selects the first column of each frame as the text
# (presumably 'Smiles', the first column of the CSV header; confirm).
data = TextLMDataBunch.from_df(path, train_data, valid_data, bs=bs, tokenizer=tok, 
                              chunksize=50000, text_cols=0, max_vocab=60000, include_bos=False)

  return np.array(a, dtype=dtype, **kwargs)


CPU times: user 7min 35s, sys: 21min 37s, total: 29min 12s
Wall time: 33min 2s


In [40]:
data.show_batch()

idx,text
0,4 ) c ( F ) c c 3 c 1 = O ) O C C 2 C F C C c 1 c c c ( N C ( = O ) c 2 c c c ( N C ( = O ) c 3 c c c c c 3 ) c c 2 ) c c 1 C O c 1
1,( N C ( = O ) C ( c 2 c c c 3 c ( c 2 ) O C O 3 ) N ( C c 2 c c c ( C ) c c 2 ) C ( = O ) C c 2 c c c c c 2 ) c ( O C ) c 1 C c 1
2,1 O C C C c 1 n c ( S C C ( O ) C O C c 2 c c c ( O C ) c c 2 ) n c ( N ) c 1 - c 1 c c c ( Cl ) c c 1 C c 1 c c c ( - n 2 c ( C c 3
3,3 ) n c n 2 ) c 1 C C c 1 c c c c ( N C c 2 c c c c c 2 O C C # N ) c 1 C C ( C ) O c 1 c c c ( N ( C C ( = O ) N C C C c 2 c c c c (
4,1 n c ( N 2 C C O C C 2 ) s c 1 N = O Cl c 1 c c c ( N 2 C C C ( [NH2+] C 3 C O c 4 c c c c c 4 C 3 ) C C 2 ) c c 1 N C 1 = C ( c 2 c c c c ( C (


Save the databunch.

In [41]:
len(data.vocab.itos),len(data.train_ds)

(560, 16611236)

In [42]:
# Persist the databunch under the name '<name>_databunch' so the training section can reload it.
data.save(f'{name}_databunch')
# Display vocabulary size and number of training items as a sanity check.
len(data.vocab.itos),len(data.train_ds)

(560, 16611236)

## Train the Model

Load the databunch generated in the last section.

In [43]:
from fastai.basic_data import load_data
import fastai
print (fastai.__version__)

1.0.61


In [44]:
bs = 512 # batch size
# Reload the databunch saved as '<name>_databunch' under `path`.
# NOTE(review): bs repeats the value set where the databunch was built; keep the two in sync.
data_lm = load_data(path, f'{name}_databunch', bs=bs)

Define the [model](https://docs.fast.ai/text.learner.html).

In [45]:
# AWD_LSTM language model created with pretrained=False (no pretrained weights requested)
# and drop_mult=1. as the dropout multiplier.
learner = language_model_learner(data_lm, AWD_LSTM, drop_mult = 1.,pretrained=False)

Model Architecture.

In [46]:
learner.model

SequentialRNN(
  (0): AWD_LSTM(
    (encoder): Embedding(560, 400, padding_idx=1)
    (encoder_dp): EmbeddingDropout(
      (emb): Embedding(560, 400, padding_idx=1)
    )
    (rnns): ModuleList(
      (0): WeightDropout(
        (module): LSTM(400, 1152, batch_first=True)
      )
      (1): WeightDropout(
        (module): LSTM(1152, 1152, batch_first=True)
      )
      (2): WeightDropout(
        (module): LSTM(1152, 400, batch_first=True)
      )
    )
    (input_dp): RNNDropout()
    (hidden_dps): ModuleList(
      (0): RNNDropout()
      (1): RNNDropout()
      (2): RNNDropout()
    )
  )
  (1): LinearDecoder(
    (decoder): Linear(in_features=400, out_features=560, bias=True)
    (output_dp): RNNDropout()
  )
)

Finally, we are ready to train the model. I trained the model on a single **Quadro P4000** GPU.

In [47]:
# Base learning rate, rescaled by bs/48 (with bs = 512 this gives 0.032).
# NOTE(review): the reference batch size 48 is not explained anywhere in the notebook.
lr = 3e-3
lr *= bs/48  # Scale learning rate by batch size

# Train all layers for 10 one-cycle epochs.
# NOTE(review): the recorded output shows one completed epoch followed by a KeyboardInterrupt.
learner.unfreeze()
learner.fit_one_cycle(10, lr, moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,0.711222,0.663281,0.770622,3:34:28


KeyboardInterrupt: 

Save both the weights and vocabulary.

In [48]:
# Names for the two artefacts: model weights and vocabulary.
lm_fns = [f'{name}_wt', f'{name}_vocab']

# Save the weights without optimizer state (with_opt=False), and the vocabulary
# as a .pkl file inside mdl_path.
# NOTE(review): the recorded training run was interrupted after one epoch, so the
# weights saved here presumably come from that partial run. Confirm.
learner.save(lm_fns[0], with_opt=False)
learner.data.vocab.save(mdl_path/(lm_fns[1] + '.pkl'))