# SMILES String Autoencoder

Goal: Use an autoencoder into a continuous vector to represent SMILES strings. We want the network to understand SMILES well-enough to provide valid outputs when the encoding is manipulated.

General Strategy 
1. Download a batch of SMILES from pubchem
2. Encode SMILES into a matrix representation that captures characters correctly (Cl, Br, etc.)
3. Train a simple autoencoder archiecture
4. Measure the rate of valid SMILES strings produced

## Gather Data

In [None]:
#!conda install -c mcs07 pubchempy

In [96]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import OneHotEncoder

import pubchempy as pcp

import torch
from torch import nn


In [97]:
data = pd.read_csv('../data/raw/csv/CID-malaria-1-stats.csv')
data.head()

Unnamed: 0,cid,cmpdname,cmpdsynonym,mw,mf,polararea,complexity,xlogp,heavycnt,hbonddonor,hbondacc,rotbonds,inchikey,iupacname,meshheadings,annothits,annothitcnt,aids,cidcdate,dois
0,51,2-Oxoglutaric acid,2-ketoglutaric acid|2-Oxopentanedioic acid|328...,146.098,C5H6O5,91.7,171.0,,10,2,5,4,KPGXRSRHYNQIFN-UHFFFAOYSA-N,2-oxopentanedioic acid,Ketoglutaric Acids,122718,13,"1,3,5,7,9,13,15,19,21,23,25,29,31,33,35,37,39,...",20040916,10.1001/archneurol.2009.134|10.1001/archneurol...
1,525,Malic acid,malic acid|DL-malic acid|6915-15-7|2-Hydroxybu...,134.087,C4H6O5,94.8,129.0,-1.3,9,3,5,3,BJEPYKJPYRNKOW-UHFFFAOYSA-N,2-hydroxybutanedioic acid,,65534,15,"155,157,161,165,167,175,360,361,368,373,374,41...",20040916,10.1001/archneur.58.6.944|10.1002/(sici)1097-0...
2,1023,Diphosphoric acid,Diphosphoric acid|Pyrophosphoric acid|2466-09-...,177.973,H4O7P2,124.0,147.0,,9,4,7,2,XPPKVPWEQAFLFU-UHFFFAOYSA-N,phosphono dihydrogen phosphate,,48990,12,"330,348,357,376,400,410,411,444,445,446,447,44...",20040916,10.1002/0471142700.nc1310s49|10.1002/asia.2012...
3,1052,Pyridoxamine,pyridoxamine|4-(AMINOMETHYL)-5-(HYDROXYMETHYL)...,168.196,C8H12N2O2,79.4,143.0,-1.0,12,3,4,2,NHZMQXZHNVQTQA-UHFFFAOYSA-N,4-(aminomethyl)-5-(hydroxymethyl)-2-methylpyri...,Pyridoxamine,69438,10,"348,357,376,410,411,422,444,445,446,447,448,45...",20040916,10.1002/14651858.cd004393|10.1002/14651858.cd0...
4,1329,"8-Cyclopentyl-1,3-dipropylxanthine","8-Cyclopentyl-1,3-dipropylxanthine|DPCPX|10214...",304.394,C16H24N4O2,69.3,436.0,4.0,22,1,3,5,FFBDFADSZUINTG-UHFFFAOYSA-N,"8-cyclopentyl-1,3-dipropyl-7H-purine-2,6-dione",,237338,11,"357,361,364,368,371,373,374,375,410,411,422,42...",20050325,10.1002/(sici)1097-4652(199612)169:3&lt;538::a...


In [98]:
cid_list = list(data.cid)
smiles = [x['CanonicalSMILES'] for 
          x in pcp.get_properties('CanonicalSMILES', cid_list)]

In [99]:
smiles[:30]

['C(CC(=O)O)C(=O)C(=O)O',
 'C(C(C(=O)O)O)C(=O)O',
 'OP(=O)(O)OP(=O)(O)O',
 'CC1=NC=C(C(=C1O)CN)CO',
 'CCCN1C2=C(C(=O)N(C1=O)CCC)NC(=N2)C3CCCC3',
 'CCCN1C2=C(C(=O)NC1=O)NC=N2',
 'CN1C(=O)CN=C(C2=C1C=CC(=C2)Cl)C3=CC=C(C=C3)Cl',
 'CCCCCC#CCC#CCC#CCC#CCCCC(=O)O',
 'C1=CC2=C(C=C1F)C=C(N2)C(=O)O',
 'CN(C)CCC1=CNC2=C1C=C(C=C2)OC',
 'C1=CC2=C(C(=C1)[N+](=O)[O-])NN=C2',
 'CC1=C(C(=O)C(=C(C1=O)C)CCCCC#CCCCC#CCO)C',
 'CC1=C(C2=C(N1C(=O)C3=CC=C(C=C3)Cl)C=CC(=C2)OC)CC(=O)OCC(=O)O',
 'CC(=O)NC1=NN=C(S1)S(=O)(=O)N',
 'CC(=O)NC(CC1=CC=CC=C1)C(=O)O',
 'CCCSC1=CC2=C(C=C1)N=C(N2)NC(=O)OC',
 'CN(C)C1=NC(=NC(=N1)N(C)C)N(C)C',
 'CN1C2=C(C(=O)N(C1=O)C)NC=N2',
 'C1CN(CCN1)C2=NC3=CC=CC=C3OC4=C2C=C(C=C4)Cl',
 'COC1=CC=C(C=C1)C(=O)N2CCCC2=O',
 'COC1=CC=C(C=C1)CCN2CCC(CC2)NC3=NC4=CC=CC=C4N3CC5=CC=C(C=C5)F',
 'CC(C)NCC(COC1=CC=C(C=C1)CC(=O)N)O',
 'C1=CC=C(C=C1)CC2NC3=C(C=C(C(=C3)C(F)(F)F)S(=O)(=O)N)S(=O)(=O)N2',
 'CCC1=C(C2=CC=CC=C2O1)C(=O)C3=CC(=C(C(=C3)Br)O)Br',
 'C1=CC=C(C=C1)CSCC2=NS(=O)(=O)C3=CC(=C(C=C3N2)Cl)

## Pre-processing

We don't want the model to have to figure out that 'Cl' and 'Br' are actually just one atom - so let's replace these with single characters. We'll also make all the compounds the same length - if they're shorter than 150 characters, we'll pad it with spaces.


In [100]:
sub_dict = {
    'Cl' : 'R',
    'Br' : 'M',
    'Ca' : 'A',
    'Be' : 'E',
    'Na' : 'X',
    'Li' : 'L'
}
smiles_sub = []

length = 120

for atom in sub_dict:
    for s in smiles:
        if len(s) > 120: continue
        smile = s.replace(atom, sub_dict[atom])
        while len(smile) < length:
            smile += ' '
        smiles_sub.append(smile)
smiles = smiles_sub

In [101]:
# What are all the possible characters?
bank = []

for s in smiles:
    for char in set(s):
        if char not in bank:
            bank.append(char)
sorted(bank)

[' ',
 '#',
 '(',
 ')',
 '+',
 '-',
 '.',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '=',
 'B',
 'C',
 'F',
 'H',
 'I',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'R',
 'S',
 'X',
 '[',
 ']',
 'a',
 'e',
 'i',
 'l',
 'r']

In [102]:
smiles[:10]

['C(CC(=O)O)C(=O)C(=O)O                                                                                                   ',
 'C(C(C(=O)O)O)C(=O)O                                                                                                     ',
 'OP(=O)(O)OP(=O)(O)O                                                                                                     ',
 'CC1=NC=C(C(=C1O)CN)CO                                                                                                   ',
 'CCCN1C2=C(C(=O)N(C1=O)CCC)NC(=N2)C3CCCC3                                                                                ',
 'CCCN1C2=C(C(=O)NC1=O)NC=N2                                                                                              ',
 'CN1C(=O)CN=C(C2=C1C=CC(=C2)R)C3=CC=C(C=C3)R                                                                             ',
 'CCCCCC#CCC#CCC#CCC#CCCCC(=O)O                                                                                           ',


In [103]:
def smiles_vectorizer(s, bank):
    vector = [[0 if symbol != char else 1 for symbol in bank] 
                  for char in s]
    return vector
        

In [104]:
smiles_tensor = []
for s in smiles:
    smiles_tensor.append(smiles_vectorizer(s, bank))
smiles_tensor = torch.tensor(smiles_tensor)
smiles_tensor

tensor([[[1, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [1, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 1,  ..., 0, 0, 0],
         [0, 0, 1,  ..., 0, 0, 0],
         [0, 0, 1,  ..., 0, 0, 0]],

        [[1, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [1, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 1,  ..., 0, 0, 0],
         [0, 0, 1,  ..., 0, 0, 0],
         [0, 0, 1,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 1,  ..., 0, 0, 0],
         [0, 0, 1,  ..., 0, 0, 0],
         [0, 0, 1,  ..., 0, 0, 0]],

        ...,

        [[1, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 1,  ..., 0, 0, 0],
         [0, 0, 1,  ..., 0, 0, 0],
         [0, 0, 1,  ..., 0, 0, 0]],

        [[1, 0, 0,  ..., 0, 0, 0],
         [1, 0, 0,  ..., 0, 0, 0],
         [0,

In [105]:
torch.save(smiles_tensor, 'smiles_tensor.pkl')

## Simple Autoencoder

In [None]:
class autoencoder(nn.Module):
    def __init__(self):
        super(autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(12
                      0, 128),
            nn.ReLU(True),
            nn.Linear(128, 64),
            nn.ReLU(True), nn.Linear(64, 12), nn.ReLU(True), nn.Linear(12, 3))
        self.decoder = nn.Sequential(
            nn.Linear(3, 12),
            nn.ReLU(True),
            nn.Linear(12, 64),
            nn.ReLU(True),
            nn.Linear(64, 128),
            nn.ReLU(True), nn.Linear(128, 150), nn.Tanh())

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x