In [1]:
import os

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import optuna
from optuna.trial import TrialState
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
# from torchvision import datasets
from torchvision import transforms


# Device that the model and the training batches are moved to.
DEVICE = torch.device("cpu")

# Working directory captured when the module is executed.
DIR = os.getcwd()

# Mini-batch size used by both data loaders.
BATCHSIZE = 128
# Width of the network's final linear layer (number of target classes).
CLASSES = 2
# Number of training epochs run inside each trial.
EPOCHS = 10

# Per-epoch caps on how many examples are consumed, expressed as whole batches.
N_TRAIN_EXAMPLES = 30 * BATCHSIZE
N_VALID_EXAMPLES = 10 * BATCHSIZE

def define_model(trial, in_features=1933):
    """Build a feed-forward classifier whose architecture is sampled by Optuna.

    The trial chooses the number of hidden layers (1-3) and, for each hidden
    layer, its width (4-128 units) and its dropout probability (0.2-0.5).
    Every hidden layer is ``Linear -> ReLU -> Dropout``. The network ends with
    a ``Linear`` layer of ``CLASSES`` outputs followed by ``LogSoftmax``, so
    its output is meant to be paired with ``F.nll_loss``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        in_features: Number of input features per sample. Defaults to 1933,
            the value that was previously hard-coded; pass the real feature
            count when the dataset changes.

    Returns:
        An ``nn.Sequential`` model emitting per-class log-probabilities.
    """
    n_layers = trial.suggest_int("n_layers", 1, 3)
    layers = []

    for i in range(n_layers):
        out_features = trial.suggest_int(f"n_units_l{i}", 4, 128)
        layers.append(nn.Linear(in_features, out_features))
        layers.append(nn.ReLU())
        p = trial.suggest_float(f"dropout_l{i}", 0.2, 0.5)
        layers.append(nn.Dropout(p))

        # The next layer consumes what this one produced.
        in_features = out_features

    layers.append(nn.Linear(in_features, CLASSES))
    # Log-probabilities over the class dimension (dim 1 of a 2-D batch).
    layers.append(nn.LogSoftmax(dim=1))

    return nn.Sequential(*layers)

# Read the feature table; the first CSV column is used as the index.
df = pd.read_csv("dataset2_and_atomic_features.csv", index_col=0)

# Report the table size. The head() result is discarded here; a notebook only
# renders it when it is the last expression of a cell.
print('Shape of df: ', df.shape)
df.head()

# The final column is treated as the label, every column before it as a feature.
y = df.iloc[:, -1].values.astype(np.float32)
X = df.iloc[:, :-1].values.astype(np.float32)

# Stratified 80/20 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standardize the features. The scaler is fitted on the training split only
# and then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

# Wrap the arrays as tensors; labels are converted to int64 class indices.
X_train_tensor, X_val_tensor = torch.tensor(X_train), torch.tensor(X_val)
y_train_tensor = torch.tensor(y_train).to(torch.long)
y_val_tensor = torch.tensor(y_val).to(torch.long)

# Training batches are reshuffled on every pass over the loader.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCHSIZE)

# Validation batches keep a fixed order.
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
valid_loader = DataLoader(val_dataset, shuffle=False, batch_size=BATCHSIZE)

def objective(trial):
    """Train one sampled configuration and return its validation accuracy.

    The trial samples the network architecture (via ``define_model``), the
    optimizer class (Adam, RMSprop or SGD) and a log-uniform learning rate.
    The model is trained for ``EPOCHS`` epochs on at most ``N_TRAIN_EXAMPLES``
    examples per epoch drawn from the module-level ``train_loader``. After
    each epoch it is scored on at most ``N_VALID_EXAMPLES`` examples from
    ``valid_loader``; that score is reported to Optuna so the pruner can stop
    unpromising trials early.

    Args:
        trial: Optuna trial that supplies hyperparameters and receives the
            intermediate accuracy values.

    Returns:
        Validation accuracy after the last epoch, in ``[0, 1]``.

    Raises:
        optuna.exceptions.TrialPruned: If the pruner decides to stop the trial.
    """
    model = define_model(trial).to(DEVICE)

    # Sample the optimizer class by name and its learning rate.
    optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"])
    lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
    optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr)

    # Keeps the return value defined even if EPOCHS is 0.
    accuracy = 0.0

    for epoch in range(EPOCHS):
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            # Cap the training data consumed per epoch for faster trials.
            if batch_idx * BATCHSIZE >= N_TRAIN_EXAMPLES:
                break

            data, target = data.to(DEVICE), target.to(DEVICE)

            optimizer.zero_grad()
            loss = F.nll_loss(model(data), target)
            loss.backward()
            optimizer.step()

        model.eval()
        correct = 0
        seen = 0
        with torch.no_grad():
            for batch_idx, (data, target) in enumerate(valid_loader):
                # Cap the validation data consumed per epoch.
                if batch_idx * BATCHSIZE >= N_VALID_EXAMPLES:
                    break

                # Validation batches must live on the same device as the model.
                data, target = data.to(DEVICE), target.to(DEVICE)

                # Predicted class is the index of the largest log-probability.
                pred = model(data).argmax(dim=1)
                correct += (pred == target).sum().item()
                seen += target.size(0)

        # Divide by the number of samples actually scored so the result stays
        # correct whatever the loader's batch size or dataset length.
        accuracy = correct / seen if seen else 0.0

        trial.report(accuracy, epoch)

        # Let the pruner stop the trial based on the intermediate value.
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return accuracy


if __name__ == "__main__":
    # Run the search: maximize validation accuracy, stopping after 100 trials
    # or 600 seconds, whichever comes first.
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, timeout=600, n_trials=100)

    # Split the finished trials by final state for the summary below.
    pruned_trials = study.get_trials(states=[TrialState.PRUNED], deepcopy=False)
    complete_trials = study.get_trials(states=[TrialState.COMPLETE], deepcopy=False)

    print("Study statistics: ")
    for label, count in (
        ("  Number of finished trials: ", len(study.trials)),
        ("  Number of pruned trials: ", len(pruned_trials)),
        ("  Number of complete trials: ", len(complete_trials)),
    ):
        print(label, count)

    # Report the best-scoring trial and the hyperparameters that produced it.
    print("Best trial:")
    trial = study.best_trial

    print("  Value: ", trial.value)

    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

Shape of df:  (19501, 1934)


[I 2025-04-05 12:24:54,602] A new study created in memory with name: no-name-c24365f7-5174-473d-abc0-ae16f4813cc4
[I 2025-04-05 12:24:56,748] Trial 0 finished with value: 0.80390625 and parameters: {'n_layers': 2, 'n_units_l0': 26, 'dropout_l0': 0.34984199764831103, 'n_units_l1': 90, 'dropout_l1': 0.4603260984935838, 'optimizer': 'RMSprop', 'lr': 0.0012925896712629964}. Best is trial 0 with value: 0.80390625.
[I 2025-04-05 12:24:59,336] Trial 1 finished with value: 0.59609375 and parameters: {'n_layers': 3, 'n_units_l0': 69, 'dropout_l0': 0.401403467302053, 'n_units_l1': 113, 'dropout_l1': 0.3027474353632066, 'n_units_l2': 101, 'dropout_l2': 0.2565112658040606, 'optimizer': 'Adam', 'lr': 4.4919104848397124e-05}. Best is trial 0 with value: 0.80390625.
[I 2025-04-05 12:25:01,067] Trial 2 finished with value: 0.790625 and parameters: {'n_layers': 1, 'n_units_l0': 43, 'dropout_l0': 0.2022025221362281, 'optimizer': 'RMSprop', 'lr': 0.004503170457003042}. Best is trial 0 with value: 0.80390625.

Study statistics: 
  Number of finished trials:  100
  Number of pruned trials:  74
  Number of complete trials:  26
Best trial:
  Value:  0.8234375
  Params: 
    n_layers: 2
    n_units_l0: 113
    dropout_l0: 0.3316500162114466
    n_units_l1: 104
    dropout_l1: 0.3691858686507848
    optimizer: RMSprop
    lr: 0.0013291533636181113
