In [1]:
# General Imports
import os
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Draw, Descriptors
from matplotlib import pyplot as plt
import torch
import torch.nn as nn
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader,random_split, Dataset
from utils.fixes import global_seed
import warnings
# Fix the random seed once, up front, through the project helper.
# NOTE(review): global_seed comes from utils.fixes; which RNGs it seeds
# (random / numpy / torch) is not visible here — confirm before relying on it.
global_seed(42)
%matplotlib inline

Global seed set to 42


In [2]:
# Run on the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [3]:
# Read the SMILES file (a single tab-separated column named "smiles"),
# then keep a random subset of 100000 rows with a fresh 0..n-1 index.
smifile = "GDB17.50000000LLnoSR.smi"
data = (
    pd.read_csv(smifile, delimiter="\t", names=["smiles"])
    .sample(n=100000)
    .reset_index(drop=True)
)

FileNotFoundError: [Errno 2] No such file or directory: 'GDB17.50000000LLnoSR.smi'

In [None]:
# Peek at one entry (row label 5) to see what a raw SMILES string looks like.
data.smiles[5]

In [None]:
# Sequence length used for padding: the longest SMILES in the sample plus 5.
# NOTE(review): the tokenizer adds one start and one end token, so at least 2
# of the 5 extra positions are required; the remainder looks like slack — confirm.
MAX_SMILES_LEN = max(map(len, data.smiles)) + 5
print(MAX_SMILES_LEN)

In [None]:
# Symbols that occupy two characters in a SMILES string but count as one token.
multi_char = "Cl Br Si".split()
# Every single-character symbol, in the order that fixes their token ids.
single_char = list("#)(+-/13254768=@CBFIHONS[]\\clonpsr")

In [None]:
import re

In [None]:
# create tokenizer for smiles strings
class SMILESTokenizer:
    """Symbol-level tokenizer that maps SMILES strings to fixed-length id lists.

    Id layout: 0 = padding, 1 = start symbol, 2 = end symbol, 3.. = the
    single-character symbols followed by the multi-character symbols, in the
    order they are listed.

    Args:
        multi_char: symbols spanning several characters that must be kept
            together as one token (e.g. "Cl").
        start: symbol prepended to every sequence.
        end: symbol appended to every sequence.
        max_len: length of every encoded sequence, start/end tokens and
            padding included.
    """

    def __init__(self, multi_char=["Cl", "Br", "Si"], start='?', end='E', max_len=60):
        # Copy the argument: the default list is shared between calls, so storing
        # it directly would let one instance's edits leak into every other one.
        self.multi_char = list(multi_char)
        self.single_char = ['#', ')', '(', '+', '-', '/', '1', '3', '2', '5', '4', '7', '6', '8', '=', '@', 'C', 'B', 'F', 'I', 'H', 'O', 'N', 'S', '[', ']', '\\', 'c', 'l', 'o', 'n', 'p', 's', 'r']
        self.multi_pattern = self._generate_regex(self.multi_char)
        self.start = start
        self.end = end
        self.pad = '<pad>'
        self.max_len = max_len

        symbols = self.single_char + self.multi_char
        self.char2idx = {start: 1, end: 2, self.pad: 0}
        self.char2idx.update({char: idx + 3 for idx, char in enumerate(symbols)})
        self.idx2char = {idx: char for char, idx in self.char2idx.items()}

        # Kept exactly as before (symbols + start) for backward compatibility.
        # It does NOT contain the end or pad symbols, so len(self.vocab) is not
        # the embedding size; use `vocab_size` for that.
        self.vocab = symbols + [start]

    @property
    def vocab_size(self):
        """Number of distinct token ids (padding, start and end included)."""
        return len(self.char2idx)

    def tokenize(self, smiles):
        """Encode one SMILES string as a list of `max_len` token ids.

        The sequence is start + symbols + end, right-padded with 0.

        Returns:
            list[int] of length `max_len`, or None (after emitting a warning)
            when the sequence including start/end does not fit in `max_len`.

        Raises:
            KeyError: if the string contains a symbol outside the vocabulary.
        """
        # With no multi-character symbols there is nothing to split on.
        pieces = re.split(self.multi_pattern, smiles) if self.multi_char else [smiles]
        symbols = [self.start]
        for piece in pieces:
            if not piece:
                # re.split yields empty strings around adjacent/edge matches.
                continue
            if piece in self.multi_char:
                symbols.append(piece)
            else:
                symbols.extend(piece)
        symbols.append(self.end)

        # Measure in tokens, after start/end were added: this is what has to
        # fit, and it guarantees the result is never longer than max_len.
        if len(symbols) > self.max_len:
            warnings.warn(
                f"SMILES string needs {len(symbols)} tokens (start/end included) "
                f"but max_len is {self.max_len}. Skipping..."
            )
            return None

        unknown = sorted({s for s in symbols if s not in self.char2idx})
        if unknown:
            raise KeyError(f"Unsupported SMILES symbol(s) {unknown} in {smiles!r}")

        without_pad = [self.char2idx[s] for s in symbols]
        return without_pad + [0] * (self.max_len - len(without_pad))

    def detokenize(self, tokens, remove_start_end=True, remove_padding=True):
        """Turn a sequence of token ids (list or tensor) back into a string.

        Args:
            tokens: iterable of ids, or a torch.Tensor of ids.
            remove_start_end: drop the start and end symbols from the result.
            remove_padding: drop padding positions from the result.

        Raises:
            KeyError: if an id is not part of the vocabulary.
        """
        if isinstance(tokens, torch.Tensor):
            tokens = tokens.tolist()
        # Filter on ids rather than on the joined text so that only real
        # special tokens are removed.
        skipped = set()
        if remove_padding:
            skipped.add(self.char2idx[self.pad])
        if remove_start_end:
            skipped.add(self.char2idx[self.start])
            skipped.add(self.char2idx[self.end])
        return "".join(self.idx2char[x] for x in tokens if x not in skipped)

    def _generate_regex(self, multi_char):
        """Build the split pattern: one capture group alternating all symbols.

        Symbols are escaped so regex metacharacters are taken literally, and
        longer symbols come first so a shorter one cannot shadow them.
        """
        ordered = sorted(multi_char, key=len, reverse=True)
        return "(" + "|".join(re.escape(x) for x in ordered) + ")"


In [None]:
# Smoke test: encode a short molecule that contains a two-character atom ("Br").
tokenizer = SMILESTokenizer(max_len=50)
encoded = tokenizer.tokenize('CCCNBr')

In [None]:
# Display the padded id sequence produced by the cell above.
encoded

In [None]:
# Map the ids back to text; with the default flags start/end/padding are stripped.
tokenizer.detokenize(encoded)

In [None]:
# Round-trip check on a real sample: decoding the encoding should give the
# original string back (the cell displays True or False).
data.smiles[6000] == tokenizer.detokenize(tokenizer.tokenize(data.smiles[6000]))

In [None]:
class SMILESDataset(Dataset):
    """Map-style dataset yielding one encoded SMILES string per index.

    Args:
        data: object exposing a `smiles` column with a `to_list()` method
            (a pandas DataFrame in this notebook).
        tokenizer: tokenizer *factory* (typically the tokenizer class itself);
            it is called as `tokenizer(max_len=max_len)` to build the instance.
        max_len: encoded sequence length handed to the tokenizer.
    """

    def __init__(self, data, tokenizer, max_len=50):
        self.smiles = data.smiles.to_list()
        self.tokenizer = tokenizer(max_len=max_len)
        self.max_len = max_len

    def __len__(self):
        """Number of SMILES strings held by the dataset."""
        return len(self.smiles)

    def __getitem__(self, idx):
        """Return the token ids of entry `idx` as a 1-D LongTensor.

        Raises:
            ValueError: if the tokenizer rejects the string (returns None),
                e.g. because it does not fit in `max_len`.
        """
        smiles_raw = self.smiles[idx]
        encoded = self.tokenizer.tokenize(smiles_raw)
        if encoded is None:
            # Fail with context instead of letting torch.tensor(None) raise
            # an unrelated-looking dtype error deep inside a DataLoader worker.
            raise ValueError(
                f"SMILES at index {idx} could not be tokenized "
                f"with max_len={self.max_len}: {smiles_raw!r}"
            )
        return torch.tensor(encoded, dtype=torch.long)

In [None]:
# Build the dataset. The tokenizer *class* is passed; SMILESDataset instantiates
# it internally with max_len=MAX_SMILES_LEN.
dataset = SMILESDataset(data, SMILESTokenizer, max_len=MAX_SMILES_LEN)

In [None]:
# Rebind `tokenizer` to an instance configured with the dataset's max_len
# (this replaces the max_len=50 instance created for the smoke test).
tokenizer = SMILESTokenizer(max_len=MAX_SMILES_LEN)

In [None]:
# Inspect one encoded sample (a 1-D LongTensor of token ids).
dataset[7]

In [None]:
# Decoding the dataset item should reproduce the original SMILES string.
tokenizer.detokenize(dataset[7]) == data.smiles[7]

In [None]:
# Create the dataloader: one molecule per batch, shuffling enabled.
# NOTE(review): the models below document their inputs as [seq len, batch size];
# batches from a default DataLoader are normally [batch size, seq len] — confirm
# that a transpose happens before the forward pass.
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

In [4]:
import torch.nn as nn
import torch
# Demo of Tensor.view: the same six values laid out as (2, 3) and as (1, 1, 6).
example_2D_tensor = torch.tensor([[1, 2, 3], [4, 5, 6]])
example_3D_view = example_2D_tensor.view(1, 1, -1)
print(f'Shape of 2D tensor: {example_2D_tensor.shape}')
print(f'Shape after view: {example_3D_view.shape}')
print(example_3D_view)

Shape of 2D tensor: torch.Size([2, 3])
Shape after view: torch.Size([1, 1, 6])
tensor([[[1, 2, 3, 4, 5, 6]]])


In [5]:
import torch.functional as F
class Encoder(nn.Module):
    """Embedding + multi-layer LSTM that summarises a token sequence.

    Args:
        input_dim: size of the input vocabulary (number of embedding rows).
        emb_dim: embedding width.
        hid_dim: LSTM hidden size.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability, used on the embeddings and passed to
            the LSTM as its `dropout` argument.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        # Read by Seq2Seq to check encoder/decoder compatibility.
        self.hid_dim, self.n_layers = hid_dim, n_layers
        # Sub-modules are created in this order on purpose: parameter
        # initialisation draws from the RNG, so the order fixes seeded weights.
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
        self.rnn = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        """Encode `src` and return the LSTM's final (hidden, cell) state.

        src:    [src len, batch size]
        hidden: [n layers, batch size, hid dim]
        cell:   [n layers, batch size, hid dim]
        """
        token_vectors = self.embedding(src)         # [src len, batch size, emb dim]
        rnn_input = self.dropout(token_vectors)
        # The per-step outputs (top layer only) are not needed downstream.
        _, final_state = self.rnn(rnn_input)
        hidden, cell = final_state
        return hidden, cell

In [None]:
class Decoder(nn.Module):
    """Single-step LSTM decoder: one token in, one vocabulary score vector out.

    Args:
        output_dim: size of the output vocabulary (also the embedding rows).
        emb_dim: embedding width.
        hid_dim: LSTM hidden size.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability, used on the embeddings and passed to
            the LSTM as its `dropout` argument.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.hid_dim, self.n_layers = hid_dim, n_layers
        # Creation order is kept stable so seeded initialisation is reproducible.
        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=emb_dim)
        self.rnn = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.fc_out = nn.Linear(in_features=hid_dim, out_features=output_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input, hidden, cell):
        """Advance the decoder by one time step.

        input:  [batch size] token ids for the current step
        hidden: [n layers, batch size, hid dim]
        cell:   [n layers, batch size, hid dim]

        Returns (prediction, hidden, cell) where prediction is
        [batch size, output dim] and hidden/cell are the updated states.
        """
        # The LSTM wants a leading sequence axis; here it always has length 1.
        step_tokens = input.unsqueeze(0)                    # [1, batch size]
        step_vectors = self.dropout(self.embedding(step_tokens))  # [1, batch size, emb dim]
        state_in = (hidden, cell)
        output, state_out = self.rnn(step_vectors, state_in)     # output: [1, batch size, hid dim]
        hidden, cell = state_out
        # Drop the length-1 sequence axis before projecting to the vocabulary.
        top_layer = output.squeeze(0)                       # [batch size, hid dim]
        prediction = self.fc_out(top_layer)                 # [batch size, output dim]
        return prediction, hidden, cell

In [None]:
import random


class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with teacher forcing.

    Args:
        encoder: module returning (hidden, cell) for a source batch; must
            expose `hid_dim` and `n_layers`.
        decoder: module mapping (input, hidden, cell) to
            (prediction, hidden, cell); must expose `hid_dim`, `n_layers`
            and `output_dim`.
        device: device on which the output buffer is allocated.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        # The encoder state is fed straight into the decoder, so both must
        # agree on its shape.
        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        """Return per-step vocabulary scores for the target sequence.

        src: [src len, batch size]
        trg: [trg len, batch size]
        teacher_forcing_ratio: probability, drawn independently at every
            step, of feeding the ground-truth token instead of the model's
            own prediction as the next input (0.75 -> ground truth 75% of
            the time).

        Returns a tensor [trg len, batch size, output dim]; position 0 is
        never written and stays all zeros.
        """
        trg_len, batch_size = trg.shape[0], trg.shape[1]
        vocab_size = self.decoder.output_dim

        # Buffer for the decoder scores of every step.
        outputs = torch.zeros(trg_len, batch_size, vocab_size, device=self.device)

        # The encoder's final state initialises the decoder.
        hidden, cell = self.encoder(src)

        # Decoding starts from the first target row (the start tokens).
        step_input = trg[0, :]

        for t in range(1, trg_len):
            step_scores, hidden, cell = self.decoder(step_input, hidden, cell)
            outputs[t] = step_scores

            # Exactly one random draw per step decides the next input.
            if random.random() < teacher_forcing_ratio:
                step_input = trg[t]                 # ground-truth token
            else:
                step_input = step_scores.argmax(1)  # model's best guess

        return outputs