# Load and Re-Train Model

In [1]:
! pip install torch
! pip install numpy
! pip install rdkit
! pip install allennlp-light
! pip install zipfile




[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: Could not find a version that satisfies the requirement zipfile (from versions: none)
ERROR: No matching distribution found for zipfile

[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import torch
from model import TrainGAN
from tokenizer import Tokenizer
from layers import Generator, Discriminator

import zipfile

# Read the CSV file from the zip file
zip_file_path = "filtered_smiles_dataset.zip"
data = []
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    with zip_ref.open('filtered_smiles_dataset.csv') as csv_file:
        for line in csv_file.readlines()[1:]:
            smile = line.decode('utf-8').strip()
            data.append(smile)


# Define a function to read the top hyperparameters from the file
def read_top_hyperparameters(file_path):
    top_params = []
    with open(file_path, "r") as f:
        for line in f:
            score, params = line.split(', Parameters: ')
            params = eval(params.strip())
            top_params.append(params)
    return top_params

In [3]:
# Read the top hyperparameters
top_hyperparameters = read_top_hyperparameters("best_hyperparams.txt")

best_params_best = top_hyperparameters[0]
hidden_dim_best = best_params_best['hidden_dim']
lr_best = best_params_best['lr']
dropout_best = best_params_best['dropout']
batch_size_best = best_params_best['batch_size'] 
best_params_best

{'hidden_dim': 256,
 'lr': 0.0002362588471099862,
 'dropout': 0.3,
 'batch_size': 64}

In [4]:
hidden_dim = hidden_dim_best
lr = lr_best
device = "cuda"
step = 10

# Load tokenizer state
tokenizer_state = torch.load(f'models/checkpoint_step_{step}/tokenizer.pth')
tokenizer = Tokenizer(data)
tokenizer.mapping = tokenizer_state['mapping']
tokenizer.inv_mapping = tokenizer_state['inv_mapping']
tokenizer.start_token = tokenizer_state['start_token']
tokenizer.end_token = tokenizer_state['end_token']
tokenizer.vocab_size = tokenizer_state['vocab_size']

# Load generator and discriminator models
generator = Generator(
    latent_dim=hidden_dim,
    vocab_size=tokenizer.vocab_size - 1,
    start_token=tokenizer.start_token - 1,
    end_token=tokenizer.end_token - 1,
).to(device)
generator.load_state_dict(torch.load(f'models/checkpoint_step_{step}/generator.pth'))

discriminator = Discriminator(
    hidden_size=hidden_dim,
    vocab_size=tokenizer.vocab_size,
    start_token=tokenizer.start_token,
    bidirectional=True
).to(device)
discriminator.load_state_dict(torch.load(f'models/checkpoint_step_{step}/discriminator.pth'))

# Load generator and discriminator optimizer states
generator_optimizer = torch.optim.Adam(generator.parameters(), lr=lr)
generator_optimizer.load_state_dict(torch.load(f'models/checkpoint_step_{step}/generator_optimizer.pth'))

discriminator_optimizer = torch.optim.Adam(discriminator.parameters(), lr=lr)
discriminator_optimizer.load_state_dict(torch.load(f'models/checkpoint_step_{step}/discriminator_optimizer.pth'))

# Create MolGen instance and assign loaded models and optimizer
gan_model_loaded = TrainGAN(data, hidden_dim=hidden_dim, lr=lr, device=device)
gan_model_loaded.tokenizer = tokenizer
gan_model_loaded.generator = generator
gan_model_loaded.discriminator = discriminator
gan_model_loaded.generator_optim = generator_optimizer
gan_model_loaded.discriminator_optim = discriminator_optimizer



In [5]:
# Update dropout rates in the model
gan_model_loaded.generator.output_layer[1].p = dropout_best
gan_model_loaded.generator.output_layer[4].p = dropout_best
gan_model_loaded.discriminator.fc[1].p = dropout_best
gan_model_loaded.discriminator.fc[4].p = dropout_best

gan_model_loaded.generator.project._dropout[0].p = dropout_best
gan_model_loaded.generator.project._dropout[1].p = dropout_best

print(gan_model_loaded.generator.output_layer[1].p)
print(gan_model_loaded.generator.output_layer[4].p)

print(gan_model_loaded.discriminator.fc[1].p)
print(gan_model_loaded.discriminator.fc[4].p)

print(gan_model_loaded.generator.project._dropout[0].p)
print(gan_model_loaded.generator.project._dropout[1].p) 

0.3
0.3
0.3
0.3
0.3
0.3


In [6]:
# Prepare DataLoader
retrain_loader = gan_model_loaded.create_dataloader(data, batch_size=batch_size_best, shuffle=True, num_workers=8)

tokens:  ['/', '\\', 'S', '(', 'F', '-', '0', '#', '8', '[', 'I', 'B', 'i', 'e', 'M', '3', '6', 'C', '@', 'P', 'r', '4', 'l', '5', 'K', 'o', 'c', ']', 'O', 'N', '7', '2', '%', ')', '9', 'a', '.', 'g', 'L', 'H', '1', 'n', 's', 'A', '+', '=', '[C@H]', '[C@@H]', '[nH]', '[O-]', '[C@]', '[N+]', '[C@@]', '[S+]', '<eos>', '<sos>']
vocab_size:  57
start:  56
end:  55


In [7]:
gan_model_loaded.train_n_steps(retrain_loader, max_step=206300, evaluate_every=100, save_every=20630) # 180k # 100 epochs (200k max_step)

O=C(CNc1cccc(Cl)c1)NO
disc_loss: 0.03145124, gen_loss: -8.91070271, valid 1 =  0.92
CCCCCCOc1ccc(CC(=O)NCc2ccc(Cl)cc2)cc1
disc_loss: 0.03861886, gen_loss: -0.13558279, valid 2 =  0.87
N#Cc1ccc(-c2nn(COc3cccnc3)c2)cc(CCC(=O)O)c1
disc_loss: 0.03981999, gen_loss: -0.14986336, valid 3 =  0.74
CC(=O)N[C@@H]1CC[C@H]1N
disc_loss: 0.05317968, gen_loss: -0.12859744, valid 4 =  0.88
O=C(NNC(=O)c1ccccc1O)c1ccccc1O
disc_loss: 0.04253417, gen_loss: -0.14341541, valid 5 =  0.87
O=C(NNC(=O)c1ccc(O)c(O)c1)c1ccc(O)cc1
disc_loss: 0.05175112, gen_loss: -0.03313867, valid 6 =  0.84
COc1ccc(-c2cn(CC(C)C3CCC3)cc2)cc1
disc_loss: 0.02218449, gen_loss: -0.21787880, valid 7 =  0.90
O=C(NNC(=O)c1ccccc1O)c1ccccc1
disc_loss: 0.04847129, gen_loss: -0.35647717, valid 8 =  0.69
CN1CCC(O)[C@H]1N
disc_loss: 0.04163642, gen_loss: -0.08700028, valid 9 =  0.94
CC(=O)O.OCCNc1nc2cccnc2[nH]1
disc_loss: 0.02066740, gen_loss: -0.13340834, valid 10 =  0.88
O=C(CCn1ccnc1)c1ccc(Cl)cc1
disc_loss: 0.12254788, gen_loss: 0.38150841, 

0.84

In [8]:
gan_model_loaded.eval()

TrainGAN(
  (generator): Generator(
    (embedding_layer): Embedding(56, 256)
    (project): FeedForward(
      (_activations): ModuleList(
        (0): LeakyReLU(negative_slope=0.01)
        (1): ELU(alpha=0.1)
      )
      (_linear_layers): ModuleList(
        (0): Linear(in_features=256, out_features=512, bias=True)
        (1): Linear(in_features=512, out_features=512, bias=True)
      )
      (_dropout): ModuleList(
        (0-1): 2 x Dropout(p=0.3, inplace=False)
      )
    )
    (rnn): LSTMCell(256, 256)
    (output_layer): Sequential(
      (0): LeakyReLU(negative_slope=0.01)
      (1): Dropout(p=0.3, inplace=False)
      (2): Linear(in_features=256, out_features=512, bias=True)
      (3): LeakyReLU(negative_slope=0.01)
      (4): Dropout(p=0.3, inplace=False)
      (5): Linear(in_features=512, out_features=55, bias=True)
    )
  )
  (discriminator): Discriminator(
    (embedding): Embedding(57, 256, padding_idx=0)
    (rnn): LstmSeq2SeqEncoder(
      (_module): LSTM(256, 256

In [9]:
smiles_list = gan_model_loaded.generate_n(50000)

In [10]:
smiles_list

['C/C(=C\\c1ccc(OC)cc1)[C@@H]1CCCC1',
 'CC(C)[C@@H](N)C(=O)N/N=C/c1ccc(O)cc1',
 'COc1ccc(/C=C/C(=O)c2ccc(F)cc2)cc1',
 'C/C(=N\\Nc1ccnc2cc(Cl)ccc12)c1ccccc1',
 'C/C(=N/NC(N)=S)c1ccc(C(F)(F)F)cc1',
 'Cc1ccc(Nc2ccc(C(=O)N/N=C/c3ccccc3)ccc2Cl)c(OC)c1',
 'CC(C)[C@@H](C)[C@@H]1[C@H](C[C@H](C#N)=O)[C@H]([N+](=O)[O-])N1',
 'CC(C)=CCc1coc(=O)c1[N+](=O)[O-]',
 'CC(C)[C@@H](N)C(=O)Nc1ccccc1SCc1ccccc1',
 'COc1ccc(/C=C/c2ccc(OC)cc2)cc1',
 'CC(C)=CCc1ccc(O)cc1',
 'O=C(/C=C/c1ccc(Cl)o1)c1ccccc1',
 'COc1ccc(/C=C/C(=O)c2ccc(OC)cc2)cc1',
 'COc1ccc(/C=C/c2ccc(Cl)cc2)cc1',
 'CC(C)[C@@H](C)[C@@H](N)[C@H](CC(=O)O)C(O)CO',
 'Cc1cccc(-c2csc3ccccc3NC(=O)CC3CCCC3C2)cc1',
 'C/C(=N\\NC(N)=S)c1ccc(C(F)(F)F)cc1',
 'CC(C)C[C@@H](C(=O)NO)C(=O)CC(=O)OCCc1ccccc1',
 'Nc1ccccc1O',
 'Nc1nc2ccccc2n1-c1ccc(O)cc1',
 'COc1ccc(/C=C/C(=O)c2ccc(F)cc2)c(OC)c1',
 'CC(C)[C@@H](Cc1nc2ccccc2s1)c1ccccc1',
 'CC(C)=CCc1ccc(O)cc1',
 'COc1ccc(C(=O)Nc2ccc(Cl)cc2C#N)cc1',
 'CC(C)[C@@H]1CCN(c2nc3ccccc3o2)CC1',
 'Nc1ccccc1Sc1ccccc1',
 'CC(C)=

In [11]:
# Specify the output file name
output_file = 'smiles_list.txt'

# Write the SMILES list to a .txt file
with open(output_file, 'w') as file:
    for smiles in smiles_list:
        file.write(smiles + '\n')

print(f"SMILES list saved to '{output_file}'.")

SMILES list saved to 'smiles_list.txt'.


In [12]:
from rdkit import Chem
import pandas as pd
import csv

data = []
valid = 0
invalid = 0

def check_validity(smile):
    try:
        mol = Chem.MolFromSmiles(smile)
        if mol is not None:
            return True
        else:
            return False
    except:
        return False

for smile in smiles_list:
    if check_validity(smile):
        validity = 'valid'
        valid += 1
    else:
        validity = 'invalid'
        invalid += 1
    data.append([smile, validity])

print("valid: ", valid)
print("invalid: ", invalid)



valid:  47233
invalid:  2767


In [13]:
df = pd.DataFrame(data, columns=['smile', 'validity'])

# Drop duplicate SMILES strings
df = df.drop_duplicates(subset=['smile'])
# Filter to keep only valid SMILES strings
valid_smiles = df[df['validity'] == 'valid']['smile']

# Specify the output file name
output_file = 'smiles_list.txt'

# Write the valid SMILES list to a .txt file
with open(output_file, 'w') as file:
    for smiles in valid_smiles:
        file.write(smiles + '\n')

print(f"SMILES list saved to '{output_file}'.")

SMILES list saved to 'smiles_list.txt'.


In [14]:
import torch

# Save generator and discriminator models
torch.save(gan_model_loaded.generator.state_dict(), f"generator.pth")
torch.save(gan_model_loaded.discriminator.state_dict(), f"discriminator.pth")

# Save tokenizer state
tokenizer_state = {
    'mapping': gan_model_loaded.tokenizer.mapping,
    'inv_mapping': gan_model_loaded.tokenizer.inv_mapping,
    'start_token': gan_model_loaded.tokenizer.start_token,
    'end_token': gan_model_loaded.tokenizer.end_token,
    'vocab_size': gan_model_loaded.tokenizer.vocab_size
}
torch.save(tokenizer_state, f"tokenizer.pth")

# Save generator optimizer state
torch.save(gan_model_loaded.generator_optim.state_dict(), f"generator_optimizer.pth")

# Save discriminator optimizer state
torch.save(gan_model_loaded.discriminator_optim.state_dict(), f"discriminator_optimizer.pth")

print("Model saved successfully.")

Model saved successfully.
