In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append('../')
from src.models import CrossEntropyClassification
from src.data import train_val_test_split, get_descriptor_and_labels
from torch.utils.data import DataLoader, TensorDataset
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping, RichProgressBar
import torch

In [2]:
desc_type = "steinhardt"
use_mda = True
numb_train_samples = 8_000

In [3]:
train_structs, val_structs, test_structs = train_val_test_split(mda=use_mda,num_files=None)

In [4]:
len(train_structs), len(val_structs), len(test_structs)

(1285, 20, 1245)

In [5]:
train_x, train_y, label_mapping = get_descriptor_and_labels(train_structs, num_samples_per_type=numb_train_samples)
val_x, val_y, _ = get_descriptor_and_labels(val_structs, num_samples_per_type=2_500)
test_x, test_y, _ = get_descriptor_and_labels(test_structs, num_samples_per_type=2_500)

In [6]:
label_mapping

{'hda': 0, 'lda': 1, 'mda': 2}

In [7]:
from sklearn import preprocessing

# fit to training data
scaler = preprocessing.StandardScaler().fit(train_x)
scaled_train_x = torch.FloatTensor(scaler.transform(train_x))
scaled_val_x = torch.FloatTensor(scaler.transform(val_x))

In [8]:
train_dataset = TensorDataset(scaled_train_x,train_y)
val_dataset = TensorDataset(scaled_val_x,val_y)

train_loader = DataLoader(train_dataset, batch_size=250, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=10000, shuffle=False)

output_size = train_y.shape[1]

In [9]:
import optuna
from src.data import predict_test_set_classes
from sklearn.metrics import balanced_accuracy_score
from pytorch_lightning.loggers import TensorBoardLogger

def optimise_NN(trial: optuna.Trial):
    # Optuna optimisation function for the NN
    
    # 1. Suggest the hyperparameters
    n_layers = trial.suggest_int("n_layers", 1, 5)
    neurons_per_layer = trial.suggest_int("n_units_l0", 8, 256, log=True)
    hidden_units = [neurons_per_layer] * n_layers
    weight_decay = trial.suggest_float("weight_decay", 1e-8, 1e-1, log=True)
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True)
    
    input_size = 30

    # 2. Create the model
    torch.manual_seed(42)
    
    model = CrossEntropyClassification(
        input_size,
        *hidden_units,
        output_size,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
    )

    # 3. Train the model
    trainer = Trainer(
        accelerator="auto",
        max_epochs=200,
        callbacks=[
            RichProgressBar(),
            EarlyStopping(monitor="validation_loss", patience=10),
        ],
        logger=TensorBoardLogger("lightning_logs"),
    )
    trainer.fit(model, train_loader, val_loader)
    
    # 4. Load the best model
    model.load_state_dict(torch.load(trainer.checkpoint_callback.best_model_path)['state_dict'])
    
    # 5. Evaluate the model
    pred_classes, val_classes, _ = predict_test_set_classes(val_structs,model=model, scaler=scaler)
    
    return balanced_accuracy_score(val_classes, pred_classes)

In [10]:
study_name = "optimise_NN"  # Unique identifier of the study.
storage_name = f"sqlite:///{study_name}.db"
study = optuna.create_study(study_name=study_name, storage=storage_name, direction="maximize",load_if_exists=True)

[I 2023-12-04 11:45:58,148] Using an existing study with name 'optimise_NN' instead of creating a new one.


In [11]:
study.optimize(optimise_NN, n_trials=30)

In [12]:
df = study.trials_dataframe(attrs=("number", "value", "params", "state"))
df.sort_values(by="value", ascending=False, inplace=True)

In [22]:
df

Unnamed: 0,number,value,params_learning_rate,params_n_layers,params_n_units_l0,params_weight_decay,state
82,82,0.857914,0.000136,3,82,0.006364,COMPLETE
95,95,0.857698,0.000330,3,73,0.007273,COMPLETE
27,27,0.857695,0.000316,3,128,0.005039,COMPLETE
24,24,0.857629,0.000924,2,117,0.003476,COMPLETE
31,31,0.857506,0.000112,3,110,0.013906,COMPLETE
...,...,...,...,...,...,...,...
50,50,0.836967,0.001145,4,165,0.000130,COMPLETE
47,47,0.834233,0.000296,2,35,0.092648,COMPLETE
0,0,0.724030,0.000017,3,110,0.097190,COMPLETE
9,9,0.333333,0.001573,4,39,0.061781,COMPLETE


In [32]:
# get row 106 and use those params to train the model
# 106 is the worst model in the top 97% of models
n_layers = df.iloc[106]['params_n_layers']
neurons_per_layer = df.iloc[106]['params_n_units_l0']
hidden_units = [neurons_per_layer] * n_layers
weight_decay = df.iloc[106]['params_weight_decay']
lr = df.iloc[106]['params_learning_rate']

In [33]:
# optimised_NN_params = study.best_params
# n_layers, neurons_per_layer, weight_decay, lr = optimised_NN_params.values()

In [34]:
# define the neural network architecture
input_size = scaled_train_x.shape[1]
output_size = train_y.shape[1] # number of labels

hidden_layers = [neurons_per_layer] * n_layers

torch.manual_seed(42)
neural_net = CrossEntropyClassification(
    input_size,
    *hidden_layers,
    output_size,
    learning_rate=lr,
    weight_decay=weight_decay,
)

trainer = Trainer(
        accelerator="auto",
        max_epochs=200,
        callbacks=[
            RichProgressBar(),
            EarlyStopping(monitor="validation_loss", patience=10),
        ],
    )
trainer.fit(neural_net, train_loader, val_loader)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Output()

In [35]:
from src.data import predict_test_set_classes
from sklearn.metrics import balanced_accuracy_score

pred_classes, test_classes, confidences = predict_test_set_classes(test_structs,model=neural_net, scaler=scaler)

test_av_accuracy = balanced_accuracy_score(test_classes, pred_classes)

In [36]:
print(f"Test set accuracy: {test_av_accuracy:.3f}")

Test set accuracy: 0.823
