In [2]:
import os
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Draw, Descriptors
from matplotlib import pyplot as plt
import torch
import torch.nn as nn
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader, random_split, Dataset

In [3]:
# check GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [4]:
device = torch.device(device)
device

device(type='cuda')

In [5]:
smifile = "GDB17.50000000LLnoSR.smi"
data = pd.read_csv(smifile, delimiter = "\t", names = ["smiles"])
data = data[:50000]

In [6]:
data.head()

Unnamed: 0,smiles
0,BrC1=C2C3=C4C(CC3CCC2=O)C(=N)NC4=N1
1,BrC1=C2C3C4CCC(C4)C3C(=N)OC2=NC=C1
2,BrC1=C2C3C4CCC(O4)C3(OC2=NC=C1)C#C
3,BrC1=C2C3C4CNC(C4)(C#N)C3OC2=NC=C1
4,BrC1=C2C3=C4C(OC(=O)C4=CC2=O)=CC3=NO1


In [21]:
class Vectorizer:
    def __init__(self, method="char_vector"):
        self.method = method
    def fit(self, smiles):
        self.charset = set("".join(list(smiles))+"?E")
        self.vocab_size = len(self.charset)
        self.char_to_int = dict((c,i) for i,c in enumerate(self.charset))
        self.int_to_char = dict((i,c) for i,c in enumerate(self.charset))
        self.max_len = max([len(smile) for smile in smiles]) + 5
        self.padding_id = self.char_to_int["E"]
    def transform(self, smiles):
        if self.method == "char_vector":
            return self.char_vectorize(smiles)
        elif self.method == "one_hot":
            return self.one_hot_vectorize(smiles)
        else:
            raise ValueError("Vectorizer method not recognized")
    def one_hot_vectorize(self, smiles):
        raise NotImplementedError

    def reconstruct(self, X):
        if self.method == "char_vector":
            return self.devectorize_char(X)
        elif self.method == "one_hot":
            return self.devectorize_one_hot(X)
        else:
            raise ValueError("Vectorizer method not recognized")
    def char_vectorize(self, smiles):
        ''' Vectorize a list of SMILES strings to a numpy array with char_to_int encoding'''
        X = np.zeros((len(smiles), self.max_len))
        for i,smile in enumerate(smiles):
            for j,char in enumerate(smile):
                X[i,j] = self.char_to_int[char]
            X[i,j+1:] = self.char_to_int["?"]
            X[i,j+2:] = self.char_to_int["E"]
        return torch.LongTensor(X)
    def devectorize_char(self, X):
        ''' Devectorize a numpy array of shape (len(smiles), embed) to a list of SMILES strings'''
        if type(X) == torch.Tensor:
            X = X.numpy()
        if type(X) == list:
            X = np.array(X)
        smiles = []
        for i in range(X.shape[0]):
            smile = ""
            for j in range(X.shape[1]):
                smile += self.int_to_char[X[i,j]]
            smiles.append(smile)
        # Remove padding
        smiles = [smile.replace("?","").replace("E","") for smile in smiles]
        return smiles
    def devectorize_one_hot(self, X):
        raise NotImplementedError

In [25]:
class SMILESDataset(Dataset):
    def __init__(self, data):
        self.data = data
        self.X = data.smiles

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.X[idx]

In [26]:
tokenizer = Vectorizer(method="char_vector")
tokenizer.fit(data.smiles)

In [27]:
tokenized = tokenizer.transform(data.smiles[:2])
tokenized

tensor([[ 7, 24,  3, 21, 13,  3, 22,  3,  9, 13,  3, 14,  3,  0,  3,  3,  9,  3,
          3,  3, 22, 13, 19, 17,  3,  0, 13, 20, 17, 20,  3, 14, 13, 20, 21, 15,
         18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18],
        [ 7, 24,  3, 21, 13,  3, 22,  3,  9,  3, 14,  3,  3,  3,  0,  3, 14, 17,
          3,  9,  3,  0, 13, 20, 17, 19,  3, 22, 13, 20,  3, 13,  3, 21, 15, 18,
         18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18]])

In [28]:
train_data = SMILESDataset(data)

In [29]:
train_data[0]

'BrC1=C2C3=C4C(CC3CCC2=O)C(=N)NC4=N1'