# LOAD MODEL

In [1]:
import ast

import torch

from model import TrainGAN
from tokenizer import Tokenizer
from layers import Generator, Discriminator

# Read the dataset file: one stripped entry per line, header row excluded.
with open('NTD_filtered_smiles_dataset.csv', "r") as f:
    next(f, None)  # drop the header line; harmless if the file is empty
    data = [row.strip() for row in f]

# Checkpoint step number; it is interpolated into the checkpoint paths below.
step = 340000

# Define a function to read the top hyperparameters from the file
def read_top_hyperparameters(file_path):
    """Parse the parameter sets stored in a hyperparameter-results file.

    Every non-blank line is expected to have the form
    ``<score text>, Parameters: <Python literal>``. Only the part after the
    separator is kept; the score text is ignored.

    Args:
        file_path: Path of the text file to read.

    Returns:
        list: The parsed parameter objects, one per non-blank line, in file
        order.

    Raises:
        ValueError: If a line has no ``', Parameters: '`` separator, or the
            parameter text is not a plain Python literal.
        SyntaxError: If the parameter text is not valid Python syntax.
    """
    separator = ', Parameters: '
    top_params = []
    with open(file_path, "r") as f:
        for line_number, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            _score, found, params = line.partition(separator)
            if not found:
                raise ValueError(
                    f"{file_path}:{line_number}: missing {separator!r} separator"
                )
            # SECURITY: this used to call eval(), which executes arbitrary code
            # found in the file. literal_eval only accepts Python literals
            # (dicts, lists, numbers, strings, ...), which is all a parameter
            # dictionary needs.
            top_params.append(ast.literal_eval(params.strip()))
    return top_params

# Load every stored parameter set and keep the first one in the file
# (presumably the best-scoring entry -- confirm against how the file is written).
top_hyperparameters = read_top_hyperparameters("best_hyperparams.txt")
best_params_best = top_hyperparameters[0]

# Unpack the tuned values that the rest of the notebook refers to.
hidden_dim_best, lr_best, dropout_best, batch_size_best = (
    best_params_best[key]
    for key in ('hidden_dim', 'lr', 'dropout', 'batch_size')
)

# Use the GPU when one is available; otherwise fall back to the CPU so the
# checkpoint can still be restored on a machine without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Every file of one checkpoint is stored in the same directory.
checkpoint_dir = f'models(test)/checkpoint_step_{step}'


def _load_from_checkpoint(file_name):
    """Load ``file_name`` from ``checkpoint_dir``, mapping tensors onto ``device``."""
    # Without map_location, tensors saved from a CUDA device fail to load on a
    # CPU-only host.
    return torch.load(f'{checkpoint_dir}/{file_name}', map_location=device)


# Build a tokenizer from the dataset, then overwrite its vocabulary with the
# one saved in the checkpoint.
tokenizer_state = _load_from_checkpoint('tokenizer.pth')
tokenizer = Tokenizer(data)
for _attr in ('mapping', 'inv_mapping', 'start_token', 'end_token', 'vocab_size'):
    setattr(tokenizer, _attr, tokenizer_state[_attr])

# The generator is built with every tokenizer index shifted down by one.
# NOTE(review): presumably because index 0 is a padding token that only the
# discriminator embeds -- confirm against the model code.
generator = Generator(
    latent_dim=hidden_dim_best,
    vocab_size=tokenizer.vocab_size - 1,
    start_token=tokenizer.start_token - 1,
    end_token=tokenizer.end_token - 1,
).to(device)
generator.load_state_dict(_load_from_checkpoint('generator.pth'))

discriminator = Discriminator(
    hidden_size=hidden_dim_best,
    vocab_size=tokenizer.vocab_size,
    start_token=tokenizer.start_token,
    bidirectional=True
).to(device)
discriminator.load_state_dict(_load_from_checkpoint('discriminator.pth'))

# Restore both Adam optimizers together with their saved state.
generator_optimizer = torch.optim.Adam(generator.parameters(), lr=lr_best)
generator_optimizer.load_state_dict(_load_from_checkpoint('generator_optimizer.pth'))

discriminator_optimizer = torch.optim.Adam(discriminator.parameters(), lr=lr_best)
discriminator_optimizer.load_state_dict(_load_from_checkpoint('discriminator_optimizer.pth'))

# Create the training wrapper and replace its freshly built components with
# the restored ones.
gan_model_loaded = TrainGAN(data, hidden_dim=hidden_dim_best, lr=lr_best, device=device)
gan_model_loaded.tokenizer = tokenizer
gan_model_loaded.generator = generator
gan_model_loaded.discriminator = discriminator
gan_model_loaded.generator_optim = generator_optimizer
gan_model_loaded.discriminator_optim = discriminator_optimizer

# Dropout probability is not part of a state_dict, so apply the tuned value to
# each Dropout layer explicitly, then print the values back as a check.
_dropout_layers = (
    gan_model_loaded.generator.output_layer[1],
    gan_model_loaded.generator.output_layer[4],
    gan_model_loaded.discriminator.fc[1],
    gan_model_loaded.discriminator.fc[4],
    gan_model_loaded.generator.project._dropout[0],
    gan_model_loaded.generator.project._dropout[1],
)
for _layer in _dropout_layers:
    _layer.p = dropout_best

for _layer in _dropout_layers:
    print(_layer.p)

0.3
0.3
0.3
0.3
0.3
0.3


In [2]:
# Switch the model to evaluation mode before sampling. Assuming TrainGAN uses
# the standard torch.nn.Module.eval(), this turns the Dropout layers off.
# The call's return value is what the notebook echoes as the cell output.
gan_model_loaded.eval()

TrainGAN(
  (generator): Generator(
    (embedding_layer): Embedding(56, 256)
    (project): FeedForward(
      (_activations): ModuleList(
        (0): LeakyReLU(negative_slope=0.01)
        (1): ELU(alpha=0.1)
      )
      (_linear_layers): ModuleList(
        (0): Linear(in_features=256, out_features=512, bias=True)
        (1): Linear(in_features=512, out_features=512, bias=True)
      )
      (_dropout): ModuleList(
        (0-1): 2 x Dropout(p=0.3, inplace=False)
      )
    )
    (rnn): LSTMCell(256, 256)
    (output_layer): Sequential(
      (0): LeakyReLU(negative_slope=0.01)
      (1): Dropout(p=0.3, inplace=False)
      (2): Linear(in_features=256, out_features=512, bias=True)
      (3): LeakyReLU(negative_slope=0.01)
      (4): Dropout(p=0.3, inplace=False)
      (5): Linear(in_features=512, out_features=55, bias=True)
    )
  )
  (discriminator): Discriminator(
    (embedding): Embedding(57, 256, padding_idx=0)
    (rnn): LstmSeq2SeqEncoder(
      (_module): LSTM(256, 256

In [3]:
# Draw 50,000 samples from the restored model; they are validity-checked in
# the next cell. No random seed is set in the visible cells, so the sample
# presumably differs from run to run.
smiles_list = gan_model_loaded.generate_n(50000)

In [4]:
from rdkit import Chem
import pandas as pd
import csv

# Accumulators for the validity check: labelled rows plus running totals.
# NOTE: this rebinds `data`, which held the training strings loaded in the
# first cell.
data, valid, invalid = [], 0, 0

def check_validity(smile):
    """Return True if RDKit can parse ``smile`` into a molecule, else False.

    Args:
        smile: Candidate SMILES string.

    Returns:
        bool: True only when ``smile`` is a non-blank string and
        ``Chem.MolFromSmiles`` returns a molecule object for it. Blank
        strings, non-string input and parser errors all give False.
    """
    # An empty string parses to an atom-less molecule instead of None, so it
    # would otherwise be counted as a valid generated structure.
    if not isinstance(smile, str) or not smile.strip():
        return False
    try:
        return Chem.MolFromSmiles(smile) is not None
    except Exception:
        # Any parser failure means "invalid". KeyboardInterrupt and SystemExit
        # are deliberately not caught, so a long validation loop can still be
        # interrupted.
        return False

# Label each generated string in a single pass over the samples.
labelled_rows = [
    [candidate, 'valid' if check_validity(candidate) else 'invalid']
    for candidate in smiles_list
]
data.extend(labelled_rows)

# Update the running totals; they are reported before duplicates are removed,
# so they describe the full sample.
valid_in_batch = sum(1 for _, label in labelled_rows if label == 'valid')
valid += valid_in_batch
invalid += len(labelled_rows) - valid_in_batch

print("valid: ", valid)
print("invalid: ", invalid)

# NOTE: the strings are stored exactly as generated; nothing in this cell
# canonicalizes them, despite the column name.
df = pd.DataFrame(
    data, columns=['canonical_smiles', 'validity']
).drop_duplicates(subset=['canonical_smiles'])


valid:  47273
invalid:  2727


In [5]:
# Show the de-duplicated results table (pandas elides the middle rows).
df

Unnamed: 0,canonical_smiles,validity
0,COC(=O)c1ccc(OC(C)=O)cc1,valid
1,CC(C)(C)Oc1ccc(/C=N/NC(=O)c2ccccc2)cc1,valid
2,O=C(NC1CCCCC1)N1CCN(Cc2ccccc2)C1,valid
3,O=C(NC1CCCCC1)c1ccccc1OCc1ccccc1,valid
4,O=C(NCCc1csc(-c2ccccc2)n1)c1ccccc1,valid
...,...,...
49983,CCCCCCCCCCCC/C(=N\NC(=O)c1ccccc1)c1ccccc1Cl,valid
49989,O=C(O)C(=O)Oc1ccc(-c2ccccc2)cc1,valid
49995,COC(=O)c1ccc(-c2csc(-c3cccnc3)n2)cc1,valid
49997,CCCCOc1ccc([C@@H](COc2ccccc2)c(OC)cc1)c1ccccc1,invalid


In [6]:
# Save the de-duplicated, labelled strings. index=False leaves out the row
# index, so the file holds only the 'canonical_smiles' and 'validity' columns.
df.to_csv('generated_molecules1.csv', index=False)