In [2]:
import os
import pandas as pd
import optuna
from optuna.trial import TrialState
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
import numpy as np


# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Mini-batch size used by the train and test DataLoaders.
BATCHSIZE = 16
# Number of outputs of the final linear layer (class labels 0, 1, 2).
CLASSES = 3
# Current working directory at import time.
# NOTE(review): not referenced in the visible code -- confirm before removing.
DIR = os.getcwd()
# Number of training epochs run per trial (unless the trial is pruned earlier).
EPOCHS = 400
# NOTE(review): the two sizes below are not referenced in the visible code;
# presumably left over from the Optuna example this script is based on.
N_TRAIN_EXAMPLES = BATCHSIZE * 20
N_VALID_EXAMPLES = BATCHSIZE * 10


def define_model(trial):
    """Build a feed-forward classifier from hyperparameters sampled by ``trial``.

    The trial picks the number of hidden layers (1-2), the width of every
    hidden layer (4-900 units) and the activation function.  The network ends
    in a plain linear layer with ``CLASSES`` outputs, so it emits raw scores;
    dropout and LogSoftmax are deliberately left out of this search.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        An ``nn.Sequential`` mapping 3 * 69 input features to ``CLASSES``
        output scores.
    """
    # Activation name -> module class; the key order is the order of the
    # choices offered to the trial.
    activation_classes = {
        "ReLU": nn.ReLU,
        "Tanh": nn.Tanh,
        "CELU": nn.CELU,
        "Sigmoid": nn.Sigmoid,
        "LeakyReLU": nn.LeakyReLU,
    }

    depth = trial.suggest_int("n_linear_layers", 1, 2)

    modules = []
    width_in = 3 * 69  # heart rate, speed and altitude data
    for idx in range(depth):
        width_out = trial.suggest_int(f"n_units_l{idx}", 4, 900)
        modules.append(nn.Linear(width_in, width_out))

        # The parameter name is the same on every pass: the search space has
        # a single "activation" entry rather than one per layer.
        chosen = trial.suggest_categorical("activation", list(activation_classes))
        modules.append(activation_classes[chosen]())

        # The next layer consumes what this one produced.
        width_in = width_out

    # Output layer: one raw score per class.
    modules.append(nn.Linear(width_in, CLASSES))

    return nn.Sequential(*modules)


def _read_split(path):
    """Read one Excel split and return ``(features, labels)`` as NumPy arrays.

    The row with index label 0 is discarded and the ``Column208`` column is
    separated from the remaining columns as the label vector.
    """
    frame = pd.read_excel(path, sheet_name='Sheet1')
    frame = frame.drop(index=0)  # remove first row
    labels = frame["Column208"]
    features = frame.drop("Column208", axis=1)
    return features.to_numpy(), labels.to_numpy()


def get_mnist():
    """Load the exercise train/test spreadsheets (not MNIST, despite the name).

    Reads ``./Exercise_Train_data.xlsx`` and ``./Exercise_Test_data.xlsx``
    from the current working directory on every call.

    Returns:
        A tuple ``(train_loader, test_loader, input_tensor, y_test)``:

        * ``train_loader`` / ``test_loader``: shuffling ``DataLoader`` objects
          with batch size ``BATCHSIZE``.  Features and labels are both stored
          as float32 tensors, so labels must be cast to an integer type
          before being used with a classification loss.
        * ``input_tensor``: all test features as one float32 tensor on
          ``DEVICE``.
        * ``y_test``: the test labels as a NumPy integer array.
    """
    # Training split: wrapped in a TensorDataset so the loader can batch and
    # shuffle it.
    train_x, train_y = _read_split('./Exercise_Train_data.xlsx')
    train_data = torch.utils.data.TensorDataset(
        torch.tensor(train_x).float(),  # time series vectors
        torch.tensor(train_y).float(),  # classification data 0,1,2
    )
    train_loader = torch.utils.data.DataLoader(
        dataset=train_data, batch_size=BATCHSIZE, shuffle=True
    )

    # Test split: exposed both as a loader and as one full-batch tensor plus
    # a label array for whole-set evaluation.
    test_x, test_y = _read_split('./Exercise_Test_data.xlsx')
    y_test = test_y.astype(int)

    # Convert the features once and reuse the tensor for both views.
    test_features = torch.tensor(test_x).float()
    input_tensor = test_features.to(DEVICE)

    test_data = torch.utils.data.TensorDataset(
        test_features, torch.tensor(y_test).float()
    )
    test_loader = torch.utils.data.DataLoader(
        dataset=test_data, batch_size=BATCHSIZE, shuffle=True
    )

    return train_loader, test_loader, input_tensor, y_test

def validate(model, input_tensor, y_test):
    """Score ``model`` on a labelled set and return ``(mse, accuracy)``.

    The model is evaluated in a single forward pass with gradient tracking
    disabled.  If ``model`` is an ``nn.Module`` in training mode it is put
    into eval mode for the forward pass and switched back afterwards.

    Args:
        model: Callable mapping ``input_tensor`` to a 2-D tensor holding one
            row of per-class scores per example.
        input_tensor: Inputs handed to ``model`` in one call.
        y_test: Integer class labels, one per output row (any integer dtype;
            NumPy array, list or similar).

    Returns:
        ``(mse, accuracy)`` where ``mse`` is the squared error between the
        model outputs (used exactly as returned by the model) and the one-hot
        labels, summed over classes and averaged over examples, and
        ``accuracy`` is the percentage of rows whose arg-max equals the label.

    Raises:
        ValueError: If the model returns no rows or there are fewer labels
            than output rows.
    """
    # Temporarily leave training mode so layers such as dropout would behave
    # deterministically; plain callables are used as they are.
    restore_train = isinstance(model, nn.Module) and model.training
    if restore_train:
        model.eval()
    try:
        # No autograd graph is needed for evaluation.
        with torch.no_grad():
            scores = model(input_tensor).detach().cpu()
    finally:
        if restore_train:
            model.train()

    total = scores.shape[0]
    if total == 0:
        raise ValueError("validate() needs at least one example")

    # int64 is what F.one_hot requires; the platform default int may be 32-bit.
    labels = np.asarray(y_test).astype(np.int64).reshape(-1)[:total]
    if labels.shape[0] != total:
        raise ValueError(
            "validate() got {} labels for {} predictions".format(labels.shape[0], total)
        )

    predicted = scores.numpy().argmax(axis=1)
    correct = int((predicted == labels).sum())

    # One-hot encode the labels for the MSE calculation; the class count is
    # taken from the model output width.
    one_hot = F.one_hot(torch.tensor(labels), num_classes=scores.shape[1]).to(scores.dtype)
    total_mse = F.mse_loss(scores, one_hot, reduction="sum").item()

    return total_mse / total, correct / total * 100

def objective(trial):
    """Optuna objective: train one sampled model and return its accuracy.

    Samples an architecture, an optimizer (Adam, RMSprop or SGD) and a
    log-uniform learning rate, trains for up to ``EPOCHS`` epochs with
    cross-entropy loss, and evaluates after every epoch so that Optuna can
    prune unpromising trials.

    Args:
        trial: Optuna trial providing the hyperparameter suggestions.

    Returns:
        Accuracy (percent) measured after the last epoch.  The matching MSE is
        stored in the trial's ``final_mse`` user attribute.

    Raises:
        optuna.exceptions.TrialPruned: When the pruner stops the trial early.
    """
    # Generate the model.
    model = define_model(trial).to(DEVICE)

    # Generate the optimizer.
    optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"])
    lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
    optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    # Load the exercise data set (the loader is called get_mnist for
    # historical reasons).  The batched test loader is not needed here.
    train_loader, _test_loader, input_tensor, y_test = get_mnist()

    # Shrink the learning rate by 10 % whenever the training loss has not
    # improved for 10 epochs.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.9, patience=10)

    for epoch in range(EPOCHS):
        train_loss = 0.0
        for xd, yd in train_loader:
            # Move one batch to the device; CrossEntropyLoss needs integer
            # class indices, while the loader stores labels as floats.
            xd = xd.to(DEVICE)
            yd = yd.to(DEVICE).long()

            optimizer.zero_grad()
            loss = criterion(model(xd), yd)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # Drive the scheduler with the mean loss of the whole epoch rather
        # than the loss of whichever mini-batch happened to come last.
        scheduler.step(train_loss / max(len(train_loader), 1))

        # NOTE(review): accuracy is measured on the test split returned by
        # get_mnist(), so the search selects hyperparameters on the same data
        # that is used for the reported result -- consider a separate
        # validation split.
        total_mse, accuracy = validate(model, input_tensor, y_test)
        trial.report(accuracy, epoch)
        trial.set_user_attr("final_mse", total_mse)

        # Handle pruning based on the intermediate value.
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return accuracy


if __name__ == "__main__":
    # Run several independent studies (each capped at 100 trials or 10
    # minutes), print the statistics of every study as it finishes, and print
    # a combined summary at the end.
    num_repeat = 4
    run_stats = []
    for i in range(num_repeat):
        # Repeats are numbered from 1 both here and in the final summary.
        current_run = {"repeat": i + 1}
        print("Repeat: ", current_run["repeat"])
        study = optuna.create_study(direction="maximize")
        study.optimize(objective, n_trials=100, timeout=600)

        pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])
        complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])

        print("Study statistics: ")
        print("  Number of finished trials: ", len(study.trials))
        print("  Number of pruned trials: ", len(pruned_trials))
        print("  Number of complete trials: ", len(complete_trials))
        current_run["num_trials"] = len(study.trials)
        current_run["num_pruned_trials"] = len(pruned_trials)
        current_run["num_complete_trials"] = len(complete_trials)

        print("Best trial:")
        trial = study.best_trial
        current_run["best_trial"] = trial
        run_stats.append(current_run)
        print("  Value: ", trial.value)

        print("  Params: ")
        for key, value in trial.params.items():
            print("    {}: {}".format(key, value))

        if "final_mse" in trial.user_attrs:
            print(f"  Final MSE: {trial.user_attrs['final_mse']}")

    # Combined summary of all repeats.
    print("-----------------")
    print()
    for params in run_stats:
        best = params["best_trial"]
        print(f"Repeat {params['repeat']}:")
        print("Study statistics: ")
        print("  Number of finished trials:", params['num_trials'])
        print("  Number of pruned trials:", params['num_pruned_trials'])
        print("  Number of complete trials:", params['num_complete_trials'])

        print("Best trial:")
        print("  Value (accuracy):", best.value)
        print("  Params: ")
        for key, value in best.params.items():
            print(f"    {key}: {value}")

        if "final_mse" in best.user_attrs:
            print(f"  Final MSE: {best.user_attrs['final_mse']}")

        print()

[I 2024-12-21 23:13:48,540] A new study created in memory with name: no-name-39654841-83de-478c-bc25-91e01f8ffc06


Repeat:  0


[I 2024-12-21 23:14:17,977] Trial 0 finished with value: 90.08620689655173 and parameters: {'n_linear_layers': 1, 'n_units_l0': 158, 'activation': 'Sigmoid', 'optimizer': 'Adam', 'lr': 0.004705244629527424}. Best is trial 0 with value: 90.08620689655173.
[I 2024-12-21 23:14:46,379] Trial 1 finished with value: 89.65517241379311 and parameters: {'n_linear_layers': 1, 'n_units_l0': 441, 'activation': 'ReLU', 'optimizer': 'Adam', 'lr': 3.1155868346230006e-05}. Best is trial 0 with value: 90.08620689655173.
[I 2024-12-21 23:15:15,041] Trial 2 finished with value: 89.22413793103449 and parameters: {'n_linear_layers': 2, 'n_units_l0': 195, 'activation': 'Tanh', 'n_units_l1': 228, 'optimizer': 'RMSprop', 'lr': 5.669118454828521e-05}. Best is trial 0 with value: 90.08620689655173.
[I 2024-12-21 23:15:40,067] Trial 3 finished with value: 87.93103448275862 and parameters: {'n_linear_layers': 1, 'n_units_l0': 110, 'activation': 'Sigmoid', 'optimizer': 'SGD', 'lr': 0.020215203725185715}. Best is t

Study statistics: 
  Number of finished trials:  43
  Number of pruned trials:  24
  Number of complete trials:  19
Best trial:
  Value:  90.94827586206897
  Params: 
    n_linear_layers: 2
    n_units_l0: 347
    activation: ReLU
    n_units_l1: 120
    optimizer: Adam
    lr: 0.0012373312350259724
  Final MSE: 2218.880998726549
Repeat:  1


[I 2024-12-21 23:24:23,503] Trial 0 finished with value: 88.79310344827587 and parameters: {'n_linear_layers': 1, 'n_units_l0': 101, 'activation': 'LeakyReLU', 'optimizer': 'RMSprop', 'lr': 6.332712646506002e-05}. Best is trial 0 with value: 88.79310344827587.
[I 2024-12-21 23:24:52,120] Trial 1 finished with value: 90.94827586206897 and parameters: {'n_linear_layers': 1, 'n_units_l0': 389, 'activation': 'ReLU', 'optimizer': 'Adam', 'lr': 7.223883211761309e-05}. Best is trial 1 with value: 90.94827586206897.
[I 2024-12-21 23:25:20,104] Trial 2 finished with value: 90.08620689655173 and parameters: {'n_linear_layers': 1, 'n_units_l0': 848, 'activation': 'Sigmoid', 'optimizer': 'RMSprop', 'lr': 0.0013664908343332585}. Best is trial 1 with value: 90.94827586206897.
[I 2024-12-21 23:25:49,374] Trial 3 finished with value: 88.36206896551724 and parameters: {'n_linear_layers': 2, 'n_units_l0': 159, 'activation': 'CELU', 'n_units_l1': 32, 'optimizer': 'RMSprop', 'lr': 0.00020539938662121341}.

Study statistics: 
  Number of finished trials:  83
  Number of pruned trials:  65
  Number of complete trials:  18
Best trial:
  Value:  90.94827586206897
  Params: 
    n_linear_layers: 1
    n_units_l0: 389
    activation: ReLU
    optimizer: Adam
    lr: 7.223883211761309e-05
  Final MSE: 258.60660647575196
Repeat:  2


[I 2024-12-21 23:34:28,134] Trial 0 finished with value: 90.51724137931035 and parameters: {'n_linear_layers': 1, 'n_units_l0': 256, 'activation': 'Sigmoid', 'optimizer': 'RMSprop', 'lr': 0.0006219377223549593}. Best is trial 0 with value: 90.51724137931035.
[I 2024-12-21 23:34:54,530] Trial 1 finished with value: 90.08620689655173 and parameters: {'n_linear_layers': 1, 'n_units_l0': 502, 'activation': 'ReLU', 'optimizer': 'RMSprop', 'lr': 0.00044071951767954935}. Best is trial 0 with value: 90.51724137931035.
[I 2024-12-21 23:35:19,743] Trial 2 finished with value: 85.34482758620689 and parameters: {'n_linear_layers': 2, 'n_units_l0': 773, 'activation': 'Tanh', 'n_units_l1': 527, 'optimizer': 'SGD', 'lr': 0.0006287641942722175}. Best is trial 0 with value: 90.51724137931035.
[I 2024-12-21 23:35:46,611] Trial 3 finished with value: 89.22413793103449 and parameters: {'n_linear_layers': 1, 'n_units_l0': 332, 'activation': 'LeakyReLU', 'optimizer': 'Adam', 'lr': 1.5014890030339346e-05}. B

Study statistics: 
  Number of finished trials:  100
  Number of pruned trials:  84
  Number of complete trials:  16
Best trial:
  Value:  91.37931034482759
  Params: 
    n_linear_layers: 2
    n_units_l0: 796
    activation: Sigmoid
    n_units_l1: 343
    optimizer: RMSprop
    lr: 0.0021272351775612026
  Final MSE: 1868.166895767738
Repeat:  3


[I 2024-12-21 23:43:30,554] Trial 0 finished with value: 89.22413793103449 and parameters: {'n_linear_layers': 1, 'n_units_l0': 173, 'activation': 'CELU', 'optimizer': 'Adam', 'lr': 7.388419886122607e-05}. Best is trial 0 with value: 89.22413793103449.
[I 2024-12-21 23:43:58,570] Trial 1 finished with value: 88.79310344827587 and parameters: {'n_linear_layers': 2, 'n_units_l0': 89, 'activation': 'Tanh', 'n_units_l1': 648, 'optimizer': 'Adam', 'lr': 7.112105738485635e-05}. Best is trial 0 with value: 89.22413793103449.
[I 2024-12-21 23:44:21,188] Trial 2 finished with value: 75.86206896551724 and parameters: {'n_linear_layers': 1, 'n_units_l0': 733, 'activation': 'Sigmoid', 'optimizer': 'SGD', 'lr': 0.00020039563003033915}. Best is trial 0 with value: 89.22413793103449.
[I 2024-12-21 23:44:48,387] Trial 3 finished with value: 90.51724137931035 and parameters: {'n_linear_layers': 1, 'n_units_l0': 686, 'activation': 'Sigmoid', 'optimizer': 'RMSprop', 'lr': 0.0013900692309662716}. Best is 

Study statistics: 
  Number of finished trials:  67
  Number of pruned trials:  47
  Number of complete trials:  20
Best trial:
  Value:  90.94827586206897
  Params: 
    n_linear_layers: 1
    n_units_l0: 507
    activation: Sigmoid
    optimizer: Adam
    lr: 0.001916978918938718
  Final MSE: 2116.407550302045
-----------------

Repeat 1:
Study statistics: 
  Number of finished trials: 43
  Number of pruned trials: 24
  Number of complete trials: 19
Best trial:
  Value (accuracy): 90.94827586206897
  Params: 
    n_linear_layers: 2
    n_units_l0: 347
    activation: ReLU
    n_units_l1: 120
    optimizer: Adam
    lr: 0.0012373312350259724
  Final MSE: 2218.880998726549

Repeat 2:
Study statistics: 
  Number of finished trials: 83
  Number of pruned trials: 65
  Number of complete trials: 18
Best trial:
  Value (accuracy): 90.94827586206897
  Params: 
    n_linear_layers: 1
    n_units_l0: 389
    activation: ReLU
    optimizer: Adam
    lr: 7.223883211761309e-05
  Final MSE: 258.60

Original model results, obtained by running train_test.py :

- Correct: 90.51724137931035 %
- MSE: 8006.621066488069

The model is trained on Exercise_Train_data and tested on Exercise_Test_data (the same files as in the Practical example code folder).

Here are the results of 4 different runs of trials. For 3 of the 4 runs, the timeout (10 min) stopped the run. The result of each run is better than that of the original model (90.5%). Accuracy and MSE are computed in the same way for the original model and for this test.

The results indicate the best accuracy for the most complex model, with 4 layers (86, 329, 377, 665 units). However, the MSE values of the more complex models are high, which indicates that the results of models with a simpler architecture are more reliable. A high MSE value may suggest overfitting. Based on these results, the best architecture would be the one from Run 2 or Run 4. Thus, the next test should be run with the number of layers limited to 1-2; this would also increase the number of finished trials, since deeper architectures require more computation. Another test could report MSE to the trial instead of accuracy, which would shift the focus to MSE (if we want to minimize it).

The dropout and LogSoftmax layers are commented out but could be used to optimize the model even further. This test only includes the architecture shown in the example code, and the optimization process only searches over the number of layers, layer sizes, activation function, optimizer and learning rate.

| Metric                      | Run 1                                       | Run 2                                      | Run 3                                      | Run 4                                      |
|-----------------------------|-----------------------------------------------|----------------------------------------------|----------------------------------------------|----------------------------------------------|
| **Number of Finished Trials** | 55                                            | 100                                          | 63                                           | 73                                           |
| **Number of Pruned Trials**   | 36                                            | 88                                           | 45                                           | 55                                           |
| **Number of Complete Trials** | 19                                            | 12                                           | 18                                           | 18                                           |
| **Best Accuracy (Value)**     | **92.24%**                                        | 91.38%                                       | 91.38%                                       | 91.81%                                       |
| **Model Structure**           | 4 layers (86, 329, 377, 665 units)            | 1 layer (432 units)                          | 4 layers (554, 615, 530, 124 units)          | 2 layers (683, 104 units)                   |
| **Activation Function**       | LeakyReLU                                     | Tanh                                         | LeakyReLU                                    | LeakyReLU                                    |
| **Optimizer**                 | Adam                                          | Adam                                         | RMSprop                                      | SGD                                          |
| **Learning Rate**             | 0.001604                                      | 0.00199                                      | 0.000376                                     | 0.02833                                      |
| **Final MSE**                 | 4165.37                                       | **361.75**                                       | 5985.22                                      | 406.46                                       |

A second test was made based on the previous results. Fewer layers are now used: 1-2 layers can be selected, and the maximum layer size was increased from 700 to 900.
The results are in the table below.


| Metric                      | Run 1                                    | Run 2                                    | Run 3                                    | Run 4                                    |
|-----------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|
| **Number of Finished Trials** | 43                                          | 83                                          | 100                                         | 67                                          |
| **Number of Pruned Trials**   | 24                                          | 65                                          | 84                                          | 47                                          |
| **Number of Complete Trials** | 19                                          | 18                                          | 16                                          | 20                                          |
| **Best Accuracy (Value)**     | 90.95%                                      | 90.95%                                      | **91.38%**                                  | 90.95%                                      |
| **Model Structure**           | 2 layers (347, 120 units)                  | 1 layer (389 units)                        | 2 layers (796, 343 units)                  | 1 layer (507 units)                        |
| **Activation Function**       | ReLU                                       | ReLU                                       | Sigmoid                                    | Sigmoid                                    |
| **Optimizer**                 | Adam                                       | Adam                                       | RMSprop                                    | Adam                                       |
| **Learning Rate**             | 0.001237                                   | 0.0000722                                  | 0.002127                                   | 0.001916                                   |
| **Final MSE**                 | 2218.88                                    | **258.61**                                 | 1868.17                                    | 2116.41                                    |



The results show that the most common best accuracy was 90.95 %, which was reached in 3 of the runs. Run 3 had the best accuracy, 91.38 %, using 2 layers and the Sigmoid activation function. However, the lowest MSE was for Run 2, which had 1 layer. The same trend continues: a more complex architecture leads to a higher MSE. One more thing to consider is that the selected activation functions are completely different from those in the first results.