In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data_utils
import torch.functional as F
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
import time

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Model architecture

In [22]:
class Encoder(nn.Module):
    def __init__(self, fp_size):
        super(Encoder, self).__init__()
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(fp_size, 2048)
        self.fc2 = nn.Linear(2048, 1024)
        self.fc3 = nn.Linear(1024, 512)
        self.fc4 = nn.Linear(512, 256)
        self.fc5 = nn.Linear(256, 114)

    def forward(self, x):
        out = self.relu(self.fc1(x))
        out = self.relu(self.fc2(out))
        out = self.relu(self.fc3(out))
        out = self.relu(self.fc4(out))
        out = self.relu(self.fc5(out))
        return out

class Decoder(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, drop_prob, batch_size, batch_first=True):
        super(Decoder, self).__init__()
        self.batch_size = batch_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=batch_first)
        self.fc = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(drop_prob)

    def forward(self, x, hidden):
        out, hidden = self.gru(x, hidden)
        out = self.dropout(out)
        #out = out.contiguous().view(-1, self.hidden_size)
        out = self.fc(out)
        return out, hidden

    def init_hidden(self, batch_size):
        hidden = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(device)
        return hidden

class GRUAE(nn.Module):
    def __init__(self, fp_size, input_size, hidden_size, output_size, num_layers, batch_size, drop_prob=0.2):
        super(GRUAE, self).__init__()
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        self.encoder = Encoder(fp_size=fp_size)
        self.decoder = Decoder(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, 
                               output_size=output_size, drop_prob=drop_prob, batch_size=batch_size)
        self.softmax = self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        encoded = self.encoder(x)  # (batch_size, 128)
        encoded = encoded.unsqueeze(dim=2)  # (batch_size, 128, 1)
        decoded, h = self.decoder(encoded, self.decoder.init_hidden(self.batch_size))
        decoded = self.softmax(decoded)
        #print(f"Model out size = {decoded.size()}")
        #print(f"Model out = \n{decoded}")
        return decoded, h

# Data Prep

In [23]:
import selfies as sf
data = pd.read_csv('./GRU_data/selfies.csv', header=None, names=['selfies'])
alphabet = sf.get_alphabet_from_selfies(data.selfies)
alphabet.add("[nop]") # [nop] is a special padding symbol
alphabet.add("[start]")
alphabet.add("[end]")
alphabet = list(sorted(alphabet))
pad_to_len = max(sf.len_selfies(s) for s in data.selfies) + 5  # 5
symbol_to_idx = {s: i for i, s in enumerate(alphabet)}
idx2char = {i: s for i, s in enumerate(alphabet)}

In [24]:
len(alphabet)

42

In [5]:
import re
class SELFIESVectorizer:
    def __init__(self, alphabet, pad_to_len):
        self.alphabet = alphabet
        self.pad_to_len = pad_to_len
        self.char2idx = {s: i for i, s in enumerate(alphabet)}
        self.idx2char = {i: s for i, s in enumerate(alphabet)}
    def vectorize(self, selfie):
        ''' Vectorize a list of SMILES strings to a numpy array of shape (len(smiles), embed, len(charset))'''
        X = np.zeros((self.pad_to_len, len(self.alphabet)))
        splited = ['[start]'] + self.split_selfi(selfie) + ['[end]'] + ['[nop]'] * (self.pad_to_len - len(self.split_selfi(selfie)) - 2)
        for i, char in enumerate(splited):
            X[i, self.char2idx[char]] = 1
        return X
    def devectorize(self, ohe):
        ''' Devectorize a numpy array of shape (len(smiles), embed, len(charset)) to a list of SMILES strings'''
        selfie_str = ''
        for j in range(self.pad_to_len):
            char = self.idx2char[np.argmax(ohe[j])]
            if char == '[start]':
                continue
            elif char == '[end]':
                break
            else:
                selfie_str += char
        return selfie_str

    def split_selfi(self, selfie):
        pattern = r'(\[[^\[\]]*\])'
        return re.findall(pattern, selfie)

In [6]:
selfi_test = '[C][O][C][=C][C][=C][Branch2][Ring1][S][C][=C][N][=C][N][=C][Branch1][C][N][N][=C][Branch1][N][N][C][C][N][Branch1][C][C][C][C][Ring1][#Branch1][C][Ring1][=C][=N][Ring2][Ring1][C][C][=C][Ring2][Ring1][Branch2]'
vectorizer = SELFIESVectorizer(alphabet, pad_to_len)

In [7]:
vectorized = vectorizer.vectorize(selfi_test)
print(vectorized.shape)
print(vectorized)

(114, 42)
[[0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 1. 0.]]


In [8]:
devectorized = vectorizer.devectorize(vectorized)
print(devectorized)

[C][O][C][=C][C][=C][Branch2][Ring1][S][C][=C][N][=C][N][=C][Branch1][C][N][N][=C][Branch1][N][N][C][C][N][Branch1][C][C][C][C][Ring1][#Branch1][C][Ring1][=C][=N][Ring2][Ring1][C][C][=C][Ring2][Ring1][Branch2]


In [9]:
from torch.utils.data import Dataset, DataLoader

class GRUDataset(Dataset):
    def __init__(self, smiles_fp, selfies, vectorizer):
        self.smiles_fp = pd.read_csv(smiles_fp, sep=',', nrows=1000)
        self.selfies = pd.read_csv(selfies, nrows=1000)
        self.X = self.prepare_X(self.smiles_fp)
        self.X = np.array([self.reconstruct_fp(fp) for fp in self.X])
        self.y = self.prepare_y(self.selfies)
    def __len__(self):
        return len(self.smiles_fp)
    def __getitem__(self, idx):
        raw_selfie = self.y[idx][0]
        vectorized_selfie = vectorizer.vectorize(raw_selfie)
        return torch.from_numpy(self.X[idx]).float(), torch.from_numpy(vectorized_selfie).float()


    @staticmethod
    def prepare_X(smiles_fp):
        fps = smiles_fp.fps.apply(eval).apply(lambda x: np.array(x, dtype=int))
        return fps
    @staticmethod
    def prepare_y(selfies):
        return selfies.values
    @staticmethod
    def reconstruct_fp(fp, length=4860):
        fp_rec = np.zeros(length)
        fp_rec[fp] = 1
        return fp_rec

In [10]:
dataset = GRUDataset('GRU_data/chembl_klek.csv', 'GRU_data/selfies.csv', vectorizer)

In [11]:
dataset[0][0].shape, dataset[0][1].shape

(torch.Size([4860]), torch.Size([114, 42]))

In [12]:
dataset[0][0]

tensor([0., 0., 0.,  ..., 0., 0., 0.])

In [13]:
train_size = int(0.9 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

In [14]:
batch_size = 32
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size, drop_last=True)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size, drop_last=True)

In [15]:
next(iter(train_loader))[0].shape

torch.Size([32, 4860])

# Train

In [35]:
def train(train_loader, learn_rate, device, batch_size, EPOCHS=10):
    
    # Setting common hyperparameters
    input_size = 1
    hidden_size = 128
    num_layers = 1
    fp_size = 4860
    output_size = 42

    model = GRUAE(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers,
                 output_size=output_size, fp_size=fp_size, batch_size=batch_size).to(device)
    
    # Defining loss function and optimizer
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learn_rate)
    model.train()
    print("Training GRU model\n")
    epoch_times = []
    # Start training loop
    for epoch in range(1,EPOCHS+1):
        start_time = time.time()
        avg_loss = 0.
        counter = 0
        for x, label in train_loader:
            counter += 1
            model.zero_grad()
            #print(f"x size = {x.size()}")
            out, h = model(x.to(device).float())
            #print(f"label size = {label.size()}")
            #print(f"label = \n{label}")
            loss = criterion(out, label.to(device).float())
            loss.backward()
            optimizer.step()
            avg_loss += loss.item()
            if counter%200 == 0:
                print("Epoch {}......Step: {}/{}....... Average Loss for Epoch: {}".format(epoch, counter, len(train_loader), avg_loss/counter))
        current_time = time.time()
        print("Epoch {}/{} Done, Total Loss: {}".format(epoch, EPOCHS, avg_loss/len(train_loader)))
        print("Total Time Elapsed: {} seconds\n".format(str(current_time-start_time)))
        epoch_times.append(current_time-start_time)
    print("Total Training Time: {} seconds".format(str(sum(epoch_times))))
    return model

def evaluate(model, test_x, test_y, label_scalers):
    model.eval()
    outputs = []
    targets = []
    start_time = time.time()
    for i in test_x.keys():
        inp = torch.from_numpy(np.array(test_x[i]))
        labs = torch.from_numpy(np.array(test_y[i]))
        h = model.init_hidden(inp.shape[0])
        out, h = model(inp.to(device).float())
        outputs.append(label_scalers[i].inverse_transform(out.cpu().detach().numpy()).reshape(-1))
        targets.append(label_scalers[i].inverse_transform(labs.numpy()).reshape(-1))
    print("Evaluation Time: {}".format(str(time.time()-start_time)))
    sMAPE = 0
    for i in range(len(outputs)):
        sMAPE += np.mean(abs(outputs[i]-targets[i])/(targets[i]+outputs[i])/2)/len(outputs)
    print("sMAPE: {}%".format(sMAPE*100))
    return outputs, targets, sMAPE

In [37]:
lr = 0.001
model = train(train_loader, lr, batch_size=batch_size, device=device)

Training GRU model

Epoch 1/10 Done, Total Loss: 0.11740908319396633
Total Time Elapsed: 0.61138916015625 seconds

Epoch 2/10 Done, Total Loss: 0.11188674345612526
Total Time Elapsed: 0.3454430103302002 seconds

Epoch 3/10 Done, Total Loss: 0.11008964026612895
Total Time Elapsed: 0.34801769256591797 seconds

Epoch 4/10 Done, Total Loss: 0.1091553627380303
Total Time Elapsed: 0.3467230796813965 seconds

Epoch 5/10 Done, Total Loss: 0.10893878154456615
Total Time Elapsed: 0.3520541191101074 seconds

Epoch 6/10 Done, Total Loss: 0.10873947665095329
Total Time Elapsed: 0.3484659194946289 seconds

Epoch 7/10 Done, Total Loss: 0.10854174116892475
Total Time Elapsed: 0.34528088569641113 seconds

Epoch 8/10 Done, Total Loss: 0.10849022812076978
Total Time Elapsed: 0.3500702381134033 seconds

Epoch 9/10 Done, Total Loss: 0.10816339829138347
Total Time Elapsed: 0.352222204208374 seconds

Epoch 10/10 Done, Total Loss: 0.10782869585922786
Total Time Elapsed: 0.3511943817138672 seconds

Total Train

## Model output to selfies

In [38]:
example_fp = np.random.rand(4860)
example_fp = (example_fp > 0.5)
example_fp = torch.from_numpy(example_fp)
example_fp = torch.unsqueeze(example_fp, 0).to(device)
example_fp = example_fp.float()
example_fp

tensor([[0., 1., 1.,  ..., 1., 0., 1.]], device='cuda:0')

In [39]:
model.eval()

GRUAE(
  (encoder): Encoder(
    (relu): ReLU()
    (fc1): Linear(in_features=4860, out_features=2048, bias=True)
    (fc2): Linear(in_features=2048, out_features=1024, bias=True)
    (fc3): Linear(in_features=1024, out_features=512, bias=True)
    (fc4): Linear(in_features=512, out_features=256, bias=True)
    (fc5): Linear(in_features=256, out_features=114, bias=True)
  )
  (decoder): Decoder(
    (gru): GRU(1, 128, batch_first=True)
    (fc): Linear(in_features=128, out_features=42, bias=True)
    (relu): ReLU()
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (softmax): Softmax(dim=1)
)

In [40]:
raw_out = model(example_fp)

RuntimeError: Expected hidden size (1, 1, 128), got [1, 32, 128]

In [None]:
vectorized_selfie = torch.argmax(raw_out)
devectorized_selfie = vectorizer.devectorize(vectorized_selfie)
print(devectorized_selfie)