# Mount drive and append path to PYTHONPATH


In [1]:
import os
import sys

from google.colab import drive

# Mount Google Drive under /content/drive.
drive.mount("/content/drive")
# Add the project folder on Drive to the import path
# (presumably where colab_functions, prepare_data, etc. live — verify).
sys.path.append("/content/drive/MyDrive/DeepLCMS/train_google_colab")

Mounted at /content/drive


# Import and install libraries

In [None]:
%%capture
# Install the third-party packages this notebook needs; the %%capture magic
# on the first line suppresses pip's output.
!pip install lightning
!pip install timm
!pip install torchinfo
!pip install scikit-posthocs
!pip install optuna

In [None]:
import gc
from typing import Optional, Tuple
from pathlib import Path

import colab_functions
import colab_utils
import pandas as pd
import prepare_data
import pytorch_lightning as pl
import timm
import torch
import torch.nn.functional as F
import torchinfo
import train_NN
from google.colab import drive
from lightning.pytorch.loggers import CSVLogger, TensorBoardLogger
from pytorch_lightning import LightningModule
from pytorch_lightning.callbacks import Callback, EarlyStopping
from pytorch_lightning.trainer.trainer import Trainer
from timm import create_model
from torchmetrics.classification import (
    BinaryAUROC,
    BinaryF1Score,
    BinaryPrecision,
    BinaryRecall,
)

import optuna
from torch import nn
from torch.optim import Adam, SGD, RMSprop
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingLR
from pytorch_lightning.callbacks import EarlyStopping
from torchmetrics.classification import BinaryF1Score, BinaryPrecision, BinaryRecall
import pickle


from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_parallel_coordinate
from optuna.visualization import plot_param_importances
from optuna.visualization import plot_contour
from optuna.integration import PyTorchLightningPruningCallback

In [None]:
# Restrict CUDA to the GPU with index 0 via the CUDA_VISIBLE_DEVICES
# environment variable.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Unzip data

In [None]:
%%script echo skipping
# The %%script magic above hands this cell to `echo`, so the unzip below is
# not executed; remove the magic line to actually extract the archives.
!unzip -q "*.zip"

# Check if GPU is used

In [None]:
# Pick the compute device through the project helper; the recorded cell
# output shows that it reports the GPU in use.
device = colab_functions.get_device()

Using GPU: Tesla T4


# Getting a tunable model

In [None]:
class Resnet_model_tune(pl.LightningModule):
    """Binary classifier on a pretrained ResNet-50d with a tunable head.

    The timm ``resnet50d.a3_in1k`` backbone is loaded with pretrained weights
    and frozen; only the replacement fully connected head is trained. The
    optimizer, learning-rate scheduler, learning rate and head dropout are
    read from ``hyperparameters``.

    Args:
        hyperparameters: Mapping with the keys ``"optimizer"`` (``"Adam"``,
            ``"SGD"`` or ``"RMSprop"``), ``"scheduler"``
            (``"ReduceLROnPlateau"`` or ``"CosineAnnealingLR"``), ``"lr"``
            and ``"dropout"``.
    """

    def __init__(self, hyperparameters):
        super().__init__()
        self.hyperparameters = hyperparameters
        self.model = create_model("resnet50d.a3_in1k", pretrained=True, num_classes=1)

        # Freeze every pretrained parameter. The head assigned below is
        # created afterwards, so its parameters remain trainable.
        for param in self.model.parameters():
            param.requires_grad = False

        self.model.fc = nn.Sequential(
            nn.Linear(in_features=2048, out_features=512, bias=True),
            nn.ReLU(),
            nn.Dropout(p=self.hyperparameters["dropout"]),
            nn.Linear(in_features=512, out_features=256, bias=True),
            nn.ReLU(),
            nn.Linear(in_features=256, out_features=1, bias=True),
        )

    def forward(self, x):
        """Return the raw (pre-sigmoid) output of the network for ``x``."""
        return self.model(x)

    def _shared_step(self, batch, stage, metrics_on_prog_bar):
        """Compute and log the loss and classification metrics for one batch.

        Args:
            batch: ``(x, y)`` pair of inputs and binary targets.
            stage: Prefix for the logged names (``"train"`` or ``"val"``).
            metrics_on_prog_bar: Whether accuracy, F1, precision and recall
                are shown on the progress bar (the loss always is).

        Returns:
            The binary cross-entropy loss of the batch.
        """
        x, y = batch

        # Drop only the trailing output dimension. A bare ``squeeze()`` would
        # also collapse a batch of size one to a 0-dim tensor, which no longer
        # matches the shape of ``y``.
        y_pred_logits = self(x).squeeze(-1)

        # The logits variant fuses the sigmoid into the loss, which stays
        # numerically stable for large-magnitude logits.
        loss = F.binary_cross_entropy_with_logits(y_pred_logits, y.float())
        self.log(
            f"{stage}_loss",
            loss,
            on_step=False,
            on_epoch=True,
            prog_bar=True,
            logger=True,
        )

        y_pred_class = torch.round(torch.sigmoid(y_pred_logits))

        # Metrics are computed per batch with fresh metric objects; with
        # ``on_epoch=True`` the logged value is an aggregate of batch values.
        metrics = {
            "acc": (y_pred_class == y).float().mean(),
            "f1": BinaryF1Score().to(y.device)(y_pred_class, y),
            "precision": BinaryPrecision().to(y.device)(y_pred_class, y),
            "recall": BinaryRecall().to(y.device)(y_pred_class, y),
        }
        for name, value in metrics.items():
            self.log(
                f"{stage}_{name}",
                value,
                on_step=False,
                on_epoch=True,
                prog_bar=metrics_on_prog_bar,
                logger=True,
            )

        return loss

    def training_step(self, batch, batch_idx):
        """Log the ``train_*`` metrics and return the loss to optimise."""
        return self._shared_step(batch, "train", metrics_on_prog_bar=False)

    def validation_step(self, batch, batch_idx):
        """Log the ``val_*`` metrics for one validation batch."""
        self._shared_step(batch, "val", metrics_on_prog_bar=True)

    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        """Return raw model outputs for a batch.

        ``batch`` is either a list/tuple whose first element is the input
        tensor, or the input tensor itself.
        """
        if isinstance(batch, (list, tuple)):
            # Assuming the first element in the sequence is the input tensor
            return self(batch[0])

        # If batch is already a tensor, proceed as usual
        print("Input Shape:", batch.shape)
        return self(batch)

    def configure_optimizers(self):
        """Build the optimizer and LR scheduler named in the hyperparameters.

        Returns:
            ``([optimizer], [scheduler])`` as expected by Lightning.

        Raises:
            ValueError: If the optimizer or scheduler name is not supported.
        """
        optimizer_classes = {"Adam": Adam, "SGD": SGD, "RMSprop": RMSprop}
        optimizer_name = self.hyperparameters["optimizer"]
        if optimizer_name not in optimizer_classes:
            raise ValueError(f"Unsupported optimizer: {optimizer_name}")
        optimizer = optimizer_classes[optimizer_name](
            self.parameters(), lr=self.hyperparameters["lr"], weight_decay=2e-5
        )

        scheduler_name = self.hyperparameters["scheduler"]
        if scheduler_name == "ReduceLROnPlateau":
            # ReduceLROnPlateau needs a monitored metric, so it is wrapped in
            # a Lightning scheduler config dict.
            scheduler = {
                "scheduler": ReduceLROnPlateau(
                    optimizer, mode="min", factor=0.1, patience=3
                ),
                "interval": "epoch",
                "monitor": "val_loss",
            }
        elif scheduler_name == "CosineAnnealingLR":
            scheduler = CosineAnnealingLR(optimizer, T_max=50, eta_min=0)
        else:
            # Fail early instead of handing ``[None]`` to the trainer.
            raise ValueError(f"Unsupported scheduler: {scheduler_name}")

        return [optimizer], [scheduler]


def objective(trial):
    """Optuna objective: train one model and return its validation loss.

    Samples the optimizer, scheduler, learning rate and dropout from
    ``trial``, trains a ``Resnet_model_tune`` on the module-level
    ``train_dataloader`` / ``val_dataloader`` and logs the metrics to
    ``logs/<trial number>`` as CSV.

    Args:
        trial: The ``optuna`` trial used to sample hyperparameters and to
            report intermediate values for pruning.

    Returns:
        The ``val_loss`` held in ``trainer.callback_metrics`` when training
        stops, i.e. the value of the last epoch run rather than the lowest
        value seen during the trial.
    """
    hyperparameters = {
        "optimizer": trial.suggest_categorical("optimizer", ["Adam", "SGD", "RMSprop"]),
        "scheduler": trial.suggest_categorical(
            "scheduler", ["ReduceLROnPlateau", "CosineAnnealingLR"]
        ),
        # ``suggest_loguniform`` is deprecated; ``suggest_float(log=True)``
        # samples from the same log-uniform range.
        "lr": trial.suggest_float("lr", 1e-5, 1e-1, log=True),
        # NOTE(review): the upper bound of 1 allows a dropout that zeroes
        # every activation of the head — confirm this range is intended.
        "dropout": trial.suggest_float("dropout", 0.01, 1),
    }

    model = Resnet_model_tune(hyperparameters)
    logger = CSVLogger("logs", name=str(trial.number))
    trainer = pl.Trainer(
        logger=logger,
        max_epochs=50,
        callbacks=[
            # Stop as soon as val_loss fails to improve for one epoch.
            EarlyStopping(monitor="val_loss", patience=1),
            # Let Optuna prune unpromising trials based on val_loss.
            PyTorchLightningPruningCallback(trial, monitor="val_loss"),
        ],
    )

    trainer.fit(model, train_dataloader, val_dataloader)

    return trainer.callback_metrics["val_loss"].item()


def print_callback(study, trial):
    """Print a finished trial's number, objective value and parameters.

    Takes the ``(study, trial)`` arguments passed to callbacks of
    ``study.optimize``; ``study`` itself is not used.
    """
    summary = (
        f"Trial {trial.number} finished with value: {trial.value} "
        f"and parameters: {trial.params}"
    )
    print(summary)

In [None]:
# Derive the train/val/test preprocessing transforms from the reference
# model defined in train_NN.
(
    preprocess_train,
    preprocess_val,
    preprocess_test,
) = prepare_data.get_timm_transforms(train_NN.Resnet_model())

# Build the dataloaders; `objective` reads train_dataloader and
# val_dataloader from module scope.
(
    train_dataloader,
    val_dataloader,
    test_dataloader,
) = prepare_data.get_dataloaders(
    preprocess_train=preprocess_train,
    preprocess_val=preprocess_val,
    preprocess_test=preprocess_test,
)

# Create the study if none exists yet. Re-running the cell keeps an existing
# study, so finished trials are not discarded. `objective` returns a
# validation loss, hence the study minimises.
if "study" not in globals():
    study = optuna.create_study(direction="minimize")

# A RuntimeError is reported instead of aborting the cell, so the trials
# finished so far stay available. This also covers the PyTorch bug
# https://github.com/pytorch/pytorch/issues/67978
try:
    study.optimize(objective, n_trials=100, callbacks=[print_callback])
except RuntimeError as e:
    print(e)

In [None]:
# Persist the best hyperparameters found by the study, then read them back
# as a round-trip check.
params_file = Path("optuna_params.pickle")
params_file.write_bytes(
    pickle.dumps(study.best_params, protocol=pickle.HIGHEST_PROTOCOL)
)
optuna_params = pickle.loads(params_file.read_bytes())

# Prints True when the reloaded parameters equal the study's best parameters.
print(study.best_params == optuna_params)

In [None]:
# Optuna parallel-coordinate plot of the study's trials.
plot_parallel_coordinate(study)

In [None]:
# Optuna contour plot of the study's hyperparameters.
plot_contour(study)

In [None]:
# Optuna hyperparameter-importance plot for the study.
plot_param_importances(study)

# Evaluate results

In [58]:
# Load the logged training metrics of every trial and make the experiment id
# an integer so it can be matched against Optuna's trial number.
results_df = colab_functions.get_experiment_results()
results_df = results_df.assign(experiment=results_df["experiment"].astype(int))
optuna_trials = pd.read_csv("optuna_trials.csv")

# Join the per-epoch metrics with Optuna's per-trial summary. The full
# training history is needed because Optuna based its decision on values
# taken from already overfitted models.
df = pd.merge(
    results_df, optuna_trials, left_on="experiment", right_on="number"
)

Reading CSV: /content/logs/3/version_0/metrics.csv
Reading CSV: /content/logs/46/version_0/metrics.csv
Reading CSV: /content/logs/28/version_0/metrics.csv
Reading CSV: /content/logs/30/version_0/metrics.csv
Reading CSV: /content/logs/31/version_0/metrics.csv
Reading CSV: /content/logs/34/version_0/metrics.csv
Reading CSV: /content/logs/38/version_0/metrics.csv
Reading CSV: /content/logs/9/version_0/metrics.csv
Reading CSV: /content/logs/54/version_0/metrics.csv
Reading CSV: /content/logs/41/version_0/metrics.csv
Reading CSV: /content/logs/18/version_0/metrics.csv
Reading CSV: /content/logs/49/version_0/metrics.csv
Reading CSV: /content/logs/52/version_0/metrics.csv
Reading CSV: /content/logs/24/version_0/metrics.csv
Reading CSV: /content/logs/43/version_0/metrics.csv
Reading CSV: /content/logs/10/version_0/metrics.csv
Reading CSV: /content/logs/47/version_0/metrics.csv
Reading CSV: /content/logs/7/version_0/metrics.csv
Reading CSV: /content/logs/33/version_0/metrics.csv
Reading CSV: /c

Optuna originally selected trial #27. Its lowest validation loss was 0.1353 (epoch 2), but the value Optuna recorded for it was the final-epoch loss of 0.1851. This is because Optuna considers the validation loss of the last epoch before the trial is terminated due to overfitting. Therefore, the final conclusion reached by Optuna is based on an already overfitted model. Based on the learning curves logged, we can determine the best conditions and the number of epochs we should train our model for.

These are the validation losses for trial #27:

In [74]:
# Validation-loss rows of trial 27, ordered by epoch.
trial_27_val_loss = df.query(
    "experiment == 27 and variable.str.contains('val_loss')"
)
trial_27_val_loss.sort_values(by="epoch")

Unnamed: 0,epoch,experiment,variable,value_x,number,value_y,datetime_start,datetime_complete,duration,params_dropout,params_lr,params_optimizer,params_scheduler,state
1685,0,27,val_loss,0.289606,27,0.185086,06:17.8,08:01.9,0 days 00:01:44.088180,0.18566,0.001766,Adam,CosineAnnealingLR,COMPLETE
1686,1,27,val_loss,0.188185,27,0.185086,06:17.8,08:01.9,0 days 00:01:44.088180,0.18566,0.001766,Adam,CosineAnnealingLR,COMPLETE
1687,2,27,val_loss,0.135349,27,0.185086,06:17.8,08:01.9,0 days 00:01:44.088180,0.18566,0.001766,Adam,CosineAnnealingLR,COMPLETE
1688,3,27,val_loss,0.185086,27,0.185086,06:17.8,08:01.9,0 days 00:01:44.088180,0.18566,0.001766,Adam,CosineAnnealingLR,COMPLETE


The absolute minimum validation loss reached during the optimization procedure was 0.1289 at epoch 3 for experiment 12. However, by the next epoch, the model was overfitted and recorded a final validation loss of 0.2212, which is higher than the validation loss of trial #27. Therefore, trial #27 was selected over experiment 12.

In [66]:
# For every logged variable pick the row holding its smallest value, then
# keep only the validation-loss row.
lowest_per_variable = df.loc[df.groupby("variable")["value_x"].idxmin(), :]
lowest_per_variable.query("variable.str.contains('val_loss') ")

Unnamed: 0,epoch,experiment,variable,value_x,number,value_y,datetime_start,datetime_complete,duration,params_dropout,params_lr,params_optimizer,params_scheduler,state
1138,3,12,val_loss,0.128919,12,0.221268,49:53.0,51:43.2,0 days 00:01:50.225362,0.257247,0.002731,Adam,CosineAnnealingLR,COMPLETE


These experiments represent the overall best trials and corresponding epochs per metric monitored. As you can see, trial #12 is still present with a validation accuracy of 0.98. Most of these trials used the `Adam` optimizer and `CosineAnnealingLR` scheduler. Based on these findings, we can opt for the conditions described for trial #12 and train the model for 3 epochs.

In [68]:
# For every logged variable pick the row holding its largest value, then
# drop the training metrics and the losses.
highest_per_variable = df.loc[df.groupby("variable")["value_x"].idxmax(), :]
highest_per_variable.query("~variable.str.contains('train|loss')")

Unnamed: 0,epoch,experiment,variable,value_x,number,value_y,datetime_start,datetime_complete,duration,params_dropout,params_lr,params_optimizer,params_scheduler,state
1147,2,12,val_acc,0.981481,12,0.221268,49:53.0,51:43.2,0 days 00:01:50.225362,0.257247,0.002731,Adam,CosineAnnealingLR,COMPLETE
1368,3,15,val_f1,0.990525,15,0.276522,53:58.4,55:47.5,0 days 00:01:49.111978,0.028516,0.000439,Adam,CosineAnnealingLR,COMPLETE
363,1,31,val_precision,1.0,31,0.23986,10:30.2,11:44.1,0 days 00:01:13.942110,0.310884,0.001321,Adam,ReduceLROnPlateau,COMPLETE
30,0,3,val_recall,1.0,3,0.33884,28:51.6,39:58.6,0 days 00:11:06.989295,0.534953,0.030747,SGD,CosineAnnealingLR,COMPLETE
