# LOAD MODEL

In [1]:
import torch
from model import TrainGAN
from tokenizer import Tokenizer
from layers import Generator, Discriminator

# Load data: every line after the first (presumably the CSV header) is one
# whitespace-stripped SMILES string.
with open('filtered_smiles_dataset.csv', "r") as f:
    data = [line.strip() for line in f.readlines()[1:]]

# Training step whose checkpoint directory is restored further down.
step = 340000

def read_top_hyperparameters(file_path):
    """Read hyperparameter dictionaries from a results file.

    Every non-blank line is expected to look like
    ``<score text>, Parameters: <Python literal>``. Only the part after the
    separator is kept; entries are returned in file order.

    The parameter text is parsed with ``ast.literal_eval`` rather than
    ``eval`` so that the file can only ever yield plain literals (dicts,
    lists, numbers, strings, ...) and can never execute code.

    Args:
        file_path: Path of the text file to read.

    Returns:
        list: One parsed parameter object per non-blank line.

    Raises:
        ValueError: If a non-blank line lacks the ``', Parameters: '``
            separator, or its parameter text is not a plain literal.
        SyntaxError: If the parameter text is not valid Python syntax.
    """
    import ast  # local import keeps this cell runnable on its own

    separator = ', Parameters: '
    top_params = []
    with open(file_path, "r") as f:
        for line_number, line in enumerate(f, start=1):
            # Blank lines (e.g. a trailing newline at EOF) carry no entry.
            if not line.strip():
                continue
            _score, found, params = line.partition(separator)
            if not found:
                raise ValueError(
                    f"{file_path}:{line_number}: expected {separator!r} in line {line!r}"
                )
            top_params.append(ast.literal_eval(params.strip()))
    return top_params

# Parse the hyperparameter file and use its first entry as the configuration
# for the model restored below.
top_hyperparameters = read_top_hyperparameters("best_hyperparams.txt")
best_params_best = top_hyperparameters[0]

# Unpack the individual settings in one go.
hidden_dim_best, lr_best, dropout_best, batch_size_best = (
    best_params_best[key]
    for key in ('hidden_dim', 'lr', 'dropout', 'batch_size')
)

best_params_best

# Use the GPU when one is visible, otherwise fall back to CPU so the
# checkpoint can still be restored on a machine without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Every artefact of one training step lives in the same directory.
checkpoint_dir = f'models (final)/checkpoint_step_{step}'


def _load_checkpoint(file_name):
    """Load one file from the checkpoint directory, remapping tensors onto `device`."""
    return torch.load(f'{checkpoint_dir}/{file_name}', map_location=device)


# Rebuild the tokenizer from the dataset, then overwrite its vocabulary with the
# saved one (presumably so token ids match those used during training).
tokenizer_state = _load_checkpoint('tokenizer.pth')
print(tokenizer_state)
tokenizer = Tokenizer(data)
tokenizer.mapping = tokenizer_state['mapping']
tokenizer.inv_mapping = tokenizer_state['inv_mapping']
tokenizer.start_token = tokenizer_state['start_token']
tokenizer.end_token = tokenizer_state['end_token']
tokenizer.vocab_size = tokenizer_state['vocab_size']

# NOTE(review): the generator is built with vocab size and token ids shifted
# down by one relative to the tokenizer -- presumably because it never emits
# the <pad> token at index 0; confirm against the training code.
generator = Generator(
    latent_dim=hidden_dim_best,
    vocab_size=tokenizer.vocab_size - 1,
    start_token=tokenizer.start_token - 1,
    end_token=tokenizer.end_token - 1,
).to(device)
generator.load_state_dict(_load_checkpoint('generator.pth'))

discriminator = Discriminator(
    hidden_size=hidden_dim_best,
    vocab_size=tokenizer.vocab_size,
    start_token=tokenizer.start_token,
    bidirectional=True
).to(device)
discriminator.load_state_dict(_load_checkpoint('discriminator.pth'))

# Optimizers are recreated around the restored parameters and then given
# their saved state.
generator_optimizer = torch.optim.Adam(generator.parameters(), lr=lr_best)
generator_optimizer.load_state_dict(_load_checkpoint('generator_optimizer.pth'))

discriminator_optimizer = torch.optim.Adam(discriminator.parameters(), lr=lr_best)
discriminator_optimizer.load_state_dict(_load_checkpoint('discriminator_optimizer.pth'))

# Build a fresh TrainGAN wrapper and swap in the restored components.
gan_model_loaded = TrainGAN(data, hidden_dim=hidden_dim_best, lr=lr_best, device=device)
gan_model_loaded.tokenizer = tokenizer
gan_model_loaded.generator = generator
gan_model_loaded.discriminator = discriminator
gan_model_loaded.generator_optim = generator_optimizer
gan_model_loaded.discriminator_optim = discriminator_optimizer


{'mapping': {'<pad>': 0, '4': 1, '1': 2, 'c': 3, '[': 4, 'L': 5, 'S': 6, '5': 7, 'a': 8, 'l': 9, 'i': 10, '9': 11, '=': 12, 'r': 13, '3': 14, 'n': 15, 'O': 16, '(': 17, '\\': 18, 's': 19, 'o': 20, 'M': 21, ']': 22, 'B': 23, '2': 24, 'A': 25, '@': 26, '8': 27, '6': 28, '#': 29, 'g': 30, ')': 31, 'C': 32, 'I': 33, 'N': 34, 'H': 35, '0': 36, '-': 37, 'F': 38, '%': 39, '7': 40, '.': 41, '/': 42, 'P': 43, 'e': 44, 'K': 45, '+': 46, '[C@H]': 47, '[C@@H]': 48, '[nH]': 49, '[O-]': 50, '[C@]': 51, '[N+]': 52, '[C@@]': 53, '[S+]': 54, '<eos>': 55, '<sos>': 56}, 'inv_mapping': {0: '<pad>', 1: '4', 2: '1', 3: 'c', 4: '[', 5: 'L', 6: 'S', 7: '5', 8: 'a', 9: 'l', 10: 'i', 11: '9', 12: '=', 13: 'r', 14: '3', 15: 'n', 16: 'O', 17: '(', 18: '\\', 19: 's', 20: 'o', 21: 'M', 22: ']', 23: 'B', 24: '2', 25: 'A', 26: '@', 27: '8', 28: '6', 29: '#', 30: 'g', 31: ')', 32: 'C', 33: 'I', 34: 'N', 35: 'H', 36: '0', 37: '-', 38: 'F', 39: '%', 40: '7', 41: '.', 42: '/', 43: 'P', 44: 'e', 45: 'K', 46: '+', 47: '[C@

In [2]:
# Switch the restored model to evaluation mode before sampling from it.
gan_model_loaded.eval()

TrainGAN(
  (generator): Generator(
    (embedding_layer): Embedding(56, 256)
    (project): FeedForward(
      (_activations): ModuleList(
        (0): LeakyReLU(negative_slope=0.01)
        (1): ELU(alpha=0.1)
      )
      (_linear_layers): ModuleList(
        (0): Linear(in_features=256, out_features=512, bias=True)
        (1): Linear(in_features=512, out_features=512, bias=True)
      )
      (_dropout): ModuleList(
        (0-1): 2 x Dropout(p=0.3, inplace=False)
      )
    )
    (rnn): LSTMCell(256, 256)
    (output_layer): Sequential(
      (0): LeakyReLU(negative_slope=0.01)
      (1): Dropout(p=0.3, inplace=False)
      (2): Linear(in_features=256, out_features=512, bias=True)
      (3): LeakyReLU(negative_slope=0.01)
      (4): Dropout(p=0.3, inplace=False)
      (5): Linear(in_features=512, out_features=55, bias=True)
    )
  )
  (discriminator): Discriminator(
    (embedding): Embedding(57, 256, padding_idx=0)
    (rnn): LstmSeq2SeqEncoder(
      (_module): LSTM(256, 256

In [3]:
# Ask the restored model for 150000 samples; the result is iterated as SMILES
# strings by the validity check in the following cell.
smiles_list = gan_model_loaded.generate_n(150000)

In [4]:
from rdkit import Chem
import pandas as pd
import csv

# Accumulators for the validity pass below.
# NOTE: `data` is rebound here; earlier in the notebook the same name held the
# list of training SMILES read from the dataset file.
data = []  # rows of [smile, validity label]
valid = 0  # count of generated strings that passed check_validity
invalid = 0  # count of generated strings that failed check_validity

def check_validity(smile):
    """Report whether RDKit can parse ``smile`` into a molecule.

    Args:
        smile: Candidate SMILES string.

    Returns:
        bool: True when ``Chem.MolFromSmiles`` returns a molecule object,
        False when it returns None or raises an ordinary exception.
    """
    try:
        return Chem.MolFromSmiles(smile) is not None
    except Exception:
        # Best-effort: any parser error counts as "invalid". Catching Exception
        # (not a bare except) lets KeyboardInterrupt/SystemExit propagate so a
        # long validation loop can still be interrupted.
        return False

# Label each generated string and keep running totals. The totals cover every
# generated string, duplicates included.
for smile in smiles_list:
    validity = 'valid' if check_validity(smile) else 'invalid'
    if validity == 'valid':
        valid += 1
    else:
        invalid += 1
    data.append([smile, validity])

print("valid: ", valid)
print("invalid: ", invalid)

# Build the results table and drop repeated SMILES strings in a single chain.
df = (
    pd.DataFrame(data, columns=['canonical_smiles', 'validity'])
    .drop_duplicates(subset=['canonical_smiles'])
)


valid:  141556
invalid:  8444


In [5]:
# Display the de-duplicated table of generated SMILES and their validity labels.
df

Unnamed: 0,canonical_smiles,validity
0,O=C(NC1CCCCC1)c1nc2ccccc2s1,valid
1,COc1ccc(/C=N/NC(=O)c2ccc(NC(=O)CC(=O)O)cc2)cc1,valid
2,O=C(NC1CCCCC1)Nc1ncc[nH]1,valid
3,O=[N+]([O-])c1ccc(-c2ccccc2)cc1,valid
4,CCOC(=O)c1ccccc1NC(=O)Nc1nccs1,valid
...,...,...
149977,N#Cc1ccc([C@@H](O)C[C@@H](NC(=O)c2ccccc2)cc(Br...,invalid
149979,COc1ccc(CNCC(=O)NCC(=O)O)c2ccccc21,valid
149980,CCOc1ccc(/C=N/NC(=O)c2ccc(Cl)cc2NC(=O)CCCCC2)cc1,invalid
149986,O=C(NCCOc1cccc(F)c1)c1ccc[nH]1,valid


In [6]:
# Write the de-duplicated table (valid and invalid rows alike) to CSV,
# omitting the DataFrame index column.
df.to_csv('generated_molecules.csv', index=False)