In [1]:
import optuna
from functools import partial
import torch
import numpy as np
import random

def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed handed to ``random``, ``numpy.random``,
            ``torch`` (CPU) and every CUDA device known to ``torch``.

    Side effects:
        Also switches cuDNN to deterministic mode and turns off its
        benchmark auto-tuner.
    """
    # Same seed for every library-level global generator.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # cuDNN flags: no auto-tuned kernel selection, deterministic kernels only.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    
def objective(trial, data, hidden_dim=512, batch_size=32, max_step=5000,
              evaluate_every=100, num_workers=8, seed=42):
    """Optuna objective: train a GAN with sampled hyperparameters and return its score.

    The search space is the learning rate (log-uniform in [1e-4, 1e-3]) and
    the dropout probability (one of 0.1 or 0.2). Everything else is fixed per
    call and can be overridden through the keyword arguments.

    ``set_seed`` and ``TrainGAN`` are looked up in the notebook namespace when
    the function is called, so both must be defined/imported before a study
    is started.

    Args:
        trial: Optuna trial used to sample ``lr`` and ``dropout``.
        data: Training data, forwarded unchanged to ``TrainGAN`` and to its
            ``create_dataloader`` method.
        hidden_dim: Hidden size passed to ``TrainGAN``.
        batch_size: Batch size passed to ``create_dataloader``.
        max_step: Number of training steps passed to ``train_n_steps``.
        evaluate_every: Evaluation interval passed to ``train_n_steps``.
        num_workers: Worker count passed to ``create_dataloader``.
        seed: Seed passed to ``set_seed`` before anything is sampled or built.

    Returns:
        Whatever ``gan_model.train_n_steps`` returns; this is the value the
        study optimizes.

    Raises:
        TypeError: If one of the layers addressed by the hard-coded indices
            does not expose a ``p`` attribute, i.e. the indices no longer
            point at dropout layers.
    """
    # Re-seed at the start of every trial so trials differ only by their
    # sampled hyperparameters.
    set_seed(seed)

    # Search space
    lr = trial.suggest_float('lr', 1e-4, 1e-3, log=True)
    dropout = trial.suggest_categorical('dropout', [0.1, 0.2])

    # Initialize model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    gan_model = TrainGAN(data, hidden_dim=hidden_dim, lr=lr, device=device)

    # Layers whose dropout probability is overridden. The indices are tied to
    # the model definition, which is not visible from this notebook.
    # NOTE(review): confirm these positions against the model module.
    generator = gan_model.generator
    discriminator = gan_model.discriminator
    dropout_layers = (
        generator.output_layer[1],
        generator.output_layer[4],
        discriminator.fc[1],
        discriminator.fc[4],
        generator.project._dropout[0],
        generator.project._dropout[1],
    )
    for layer in dropout_layers:
        # Assigning ``p`` on a non-dropout layer would silently create an
        # unused attribute and leave the sampled value unapplied.
        if not hasattr(layer, "p"):
            raise TypeError(
                f"Expected a dropout layer exposing `.p`, got {type(layer).__name__}; "
                "the hard-coded layer indices no longer match the model definition."
            )
        layer.p = dropout

    # Create dataloader
    loader = gan_model.create_dataloader(
        data, batch_size=batch_size, shuffle=True, num_workers=num_workers
    )

    # Train for a fixed number of steps and report the resulting score
    return gan_model.train_n_steps(
        loader, max_step=max_step, evaluate_every=evaluate_every
    )



In [None]:
from model import TrainGAN

# Load data: one SMILES string per line, first line is the header.
with open('cleaned_smiles_dataset.csv', "r", encoding="utf-8") as f:
    next(f, None)  # skip the header row (no-op on an empty file)
    # Stream the file instead of materialising it with readlines(), and drop
    # blank lines so no empty string ends up in the training data.
    data = [smile for smile in (line.strip() for line in f) if smile]

# Fail early rather than starting a long study on an empty dataset.
if not data:
    raise ValueError("No SMILES strings found in 'cleaned_smiles_dataset.csv'.")

# Create a study and optimize; the objective's return value is maximized.
study = optuna.create_study(direction="maximize")
study.optimize(partial(objective, data=data), n_trials=12)

[I 2024-07-15 05:52:21,176] A new study created in memory with name: no-name-8a8bc722-07d0-40bb-b757-12d8ee7d23cc


tokens:  ['N', '1', ')', 'P', 'S', '-', '6', 'n', '%', 'r', '7', '(', 'g', '=', '0', 'B', '+', '\\', 'l', 'a', '.', '[', 'i', ']', '5', '#', '8', 'C', 'O', '@', 'L', 'A', 'K', 'F', 'I', 'c', '/', 'o', '9', '2', 's', 'H', 'M', '3', '4', '[C@H]', '[C@@H]', '[nH]', '[O-]', '[C@]', '[N+]', '[C@@]', '[n+]', '<eos>', '<sos>']
vocab_size:  56
start:  55
end:  54
batch_size:  32
H[C@@]]l@FgoL47MMO[C@H]+[O-]+6OP[C@]s0ioIi2BS=P[C@H][C@@]C(3%Ola[C@H][nH]IF%+[C@@H]7N=[n+]%71.[/FiBH3+.F([n+]@5SO[C@@H]OH[nH]Hi@N%%@4[C@]IC#aS#96FN]+a
valid 1 = 0.00
)))c()c(cN(c5))+CN2=c)1))c(22c2C1)=cCOOC)A)(1
valid 2 = 0.01
C==C1(1n
valid 3 = 0.08
)3Nc=1cC[C1c13)OCnBAK3n(=3CON)[OC=C13[O-]==1
valid 4 = 0.02
CM
valid 5 = 0.05
C=3c=)=cC2n)(c)3n2c1=O(cCAO11OC2lOCC(cN1Oc%Cc)3c2
valid 6 = 0.03
C)2C9@C5Pc(cOCnoN
valid 7 = 0.06
CnOC)1C=1nC(#COC(=(C1(N[C@]C
valid 8 = 0.09
C1=NFO3O3c)Lc
valid 9 = 0.02
C1c)1(Scc(.O3
valid 10 = 0.03
Cc)c2(c2cO)cnc[C@@H]/cC)1C())cCO1cO2(OC2
valid 11 = 0.07
ccC[C@@]
valid 12 = 0.03
3cC)1c3
valid 

[I 2024-07-15 06:22:55,269] Trial 0 finished with value: 0.0 and parameters: {'lr': 5.526890496778637e-05, 'dropout': 0.3}. Best is trial 0 with value: 0.0.


tokens:  ['N', '1', ')', 'P', 'S', '-', '6', 'n', '%', 'r', '7', '(', 'g', '=', '0', 'B', '+', '\\', 'l', 'a', '.', '[', 'i', ']', '5', '#', '8', 'C', 'O', '@', 'L', 'A', 'K', 'F', 'I', 'c', '/', 'o', '9', '2', 's', 'H', 'M', '3', '4', '[C@H]', '[C@@H]', '[nH]', '[O-]', '[C@]', '[N+]', '[C@@]', '[n+]', '<eos>', '<sos>']
vocab_size:  56
start:  55
end:  54
batch_size:  32
H[C@@]]l@FgoL47MMO[C@H]+[O-]+6OP[C@]s0ioIi2BS=P[C@H][C@@]C(3%Ola[C@H][nH]IF%+[C@@H]7N=[n+]%71.[/FiBH3+.F([n+]@5SO[C@@H]OH[nH]Hi@N%%@4[C@]IC#aS#96FN]+a
valid 1 = 0.00
osO121.((CON111aOc2)1)1n1
valid 2 = 0.01
)(CccN3(22(cC=C1[C@@]
valid 3 = 0.10
ncnCC3rO2c[O-]n=(KccO3Cccc[C@@](c2C)Kc3r2cNCO6MCn3(c
valid 4 = 0.05
C3[
valid 5 = 0.06
Oc=CL(1=
valid 6 = 0.06
)Occn(3c2[)C)sK]]Oc.[LcOc[C@@]cF1
valid 7 = 0.05
COn2c2C23/F(CaC](C[n+]=orc
valid 8 = 0.05
CCcC3=)1C)C[C@H]O2oOC1CccnLB[nH]3COn[=(31(NCc)
valid 9 = 0.03
C1n[N[C@@]3)c(.o(C=F#24ONc
valid 10 = 0.07
CCn(%C1Cc)1=S[C@H]o
valid 11 = 0.03
OC11(CnFC\CO3c())CS(N2sO)cSC[C@H]1C1C)c

[I 2024-07-15 07:07:05,847] Trial 1 finished with value: 0.04 and parameters: {'lr': 5.069775969476095e-05, 'dropout': 0.3}. Best is trial 1 with value: 0.04.


tokens:  ['N', '1', ')', 'P', 'S', '-', '6', 'n', '%', 'r', '7', '(', 'g', '=', '0', 'B', '+', '\\', 'l', 'a', '.', '[', 'i', ']', '5', '#', '8', 'C', 'O', '@', 'L', 'A', 'K', 'F', 'I', 'c', '/', 'o', '9', '2', 's', 'H', 'M', '3', '4', '[C@H]', '[C@@H]', '[nH]', '[O-]', '[C@]', '[N+]', '[C@@]', '[n+]', '<eos>', '<sos>']
vocab_size:  56
start:  55
end:  54
batch_size:  32


In [None]:
# Rank trials based on their scores.
# Trials that failed, were pruned, or were interrupted have ``value is None``;
# sorting them together with floats raises a TypeError, so only trials that
# actually produced a score are ranked.
scored_trials = [t for t in study.trials if t.value is not None]
trials_ranked = sorted(scored_trials, key=lambda t: t.value, reverse=True)

# Keep at most the 30 best trials (all of them if fewer were scored).
top_trials = trials_ranked[:30]

# Save the top trials' scores and parameters, best first, one per line.
with open("top_hyperparameters2.txt", "w") as f:
    for trial in top_trials:
        params = trial.params
        score = trial.value
        f.write(f"Score: {score}, Parameters: {params}\n")

print("Top hyperparameters saved successfully.")
