# Mount drive and append path to PYTHONPATH


In [None]:
import os
import sys

from google.colab import drive

# Mount Google Drive inside the Colab filesystem at /content/drive.
drive.mount("/content/drive")
# Add the project folder on Drive to the module search path.
# NOTE(review): presumably this is where colab_functions, colab_utils,
# prepare_data and train_NN (imported below) live — confirm.
sys.path.append("/content/drive/MyDrive/DeepLCMS/train_google_colab")

# Install and import libraries

In [None]:
%%capture
# Install the third-party packages this notebook relies on.
# The %%capture cell magic on the first line suppresses pip's console output.
!pip install lightning
!pip install timm
!pip install torchinfo
!pip install scikit-posthocs
!pip install optuna
!pip install torchcam

In [None]:
import colab_functions
import colab_utils
import pandas as pd
import prepare_data
import timm
import train_NN
from lightning.pytorch import loggers, callbacks, tuner, trainer
from pathlib import Path
from tqdm import tqdm
from IPython.display import clear_output
import seaborn as sns

In [None]:
# Expose only GPU index 0 to CUDA for this process.
# NOTE(review): the variable is only honoured if it is set before CUDA is
# initialised — confirm no earlier cell has already touched the GPU.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Unzip data

In [None]:
# Extract the experiment archive into the current working directory
# (-q keeps unzip from listing every extracted file).
# NOTE(review): presumably this provides the data directory used by the
# data module below — confirm the archive layout.
!unzip -q experiment.zip

# Check if GPU is used

In [None]:
# Ask the project helper which compute device is available.
# NOTE(review): `device` is not referenced again in the visible cells;
# presumably the call is made for its GPU-availability report — confirm.
device = colab_functions.get_device()

# Getting the candidate models based on Experiment #1


In [None]:
# Load the shortlist of candidate architectures carried over from Experiment #1.
candidates_df = pd.read_csv("exp_2_candidates.csv")

# Model identifiers, one per candidate, iterated over by the training loop.
pretrained_models = candidates_df["model"]

# Inspect a model and its dataloader


In [None]:
# Build a single candidate by hand so the architecture and the data pipeline
# can be inspected before the full training loop is launched.
PRETRAINED_MODEL = "convnextv2_nano.fcmae_ft_in22k_in1k_384"

model = train_NN.PretrainedModel(
    pretrained_model_name=PRETRAINED_MODEL, learning_rate=0.001
)
# The data module is given the model instance itself.
# NOTE(review): presumably so it can derive model-specific preprocessing —
# confirm in prepare_data.
datamodule = prepare_data.LCMSDataModule(
    model,
    data_dir=Path("/content/ST001618_Opium_study_LC_MS_500"),
)
model.show_architecture()

In [None]:
# Inspect the "train" split of the data module built above.
# NOTE(review): what exactly is displayed is defined in prepare_data — confirm.
datamodule.inspect_dataloader("train")

# Training loop

In [None]:
# Fine-tune every candidate architecture with identical, fixed settings.
# Each run logs its metrics through Lightning's CSVLogger under
# logs/<model_name>/; the per-run logs are gathered into one table afterwards.
LEARNING_RATE = 0.001
DATA_DIR = Path("/content/ST001618_Opium_study_LC_MS_500")

# clear_output() wipes the cell output after every successful run, which also
# erases failure messages printed by earlier iterations. Keep them here so they
# can be reported again once the loop has finished. Only the message text is
# stored: holding the exception object would keep its traceback (and whatever
# that references) alive.
failed_models = []

for model_name in tqdm(pretrained_models):
    # Release objects left over from a previous cell or iteration before the
    # next model is built, and make sure the names exist for `finally`.
    model = datamodule = trainer_ = None
    try:
        model = train_NN.PretrainedModel(
            pretrained_model_name=model_name, learning_rate=LEARNING_RATE
        )
        datamodule = prepare_data.LCMSDataModule(model, data_dir=DATA_DIR)

        logger = loggers.CSVLogger("logs", name=str(model_name))

        trainer_ = trainer.Trainer(
            max_epochs=50,
            log_every_n_steps=1,
            logger=logger,
            precision="16-mixed",
            callbacks=[
                # Stop a run once val_loss has not improved for 10 epochs.
                callbacks.EarlyStopping(monitor="val_loss", mode="min", patience=10)
            ],
        )

        # Optional learning-rate search, currently disabled: every model is
        # trained with the fixed LEARNING_RATE above.
        # tuner_ = tuner.Tuner(trainer_)
        # lr_finder = tuner_.lr_find(model,
        #                            datamodule = datamodule,
        #                            early_stop_threshold=None)
        # best_lr = lr_finder.suggestion()

        # Train the model with the fixed learning rate.
        trainer_.fit(model=model, datamodule=datamodule)

        clear_output(wait=True)

    except RuntimeError as e:
        # Best effort: a model that cannot be trained must not abort the sweep.
        failed_models.append((model_name, str(e)))
        print(f"{model_name} could not run because {e}")

    finally:
        # Drop the references on failure as well as on success, so a failed
        # run does not keep its model alive while the next one is created.
        del model, datamodule, trainer_

# Repeat the failures at the end, where clear_output() can no longer erase them.
for failed_name, reason in failed_models:
    print(f"{failed_name} could not run because {reason}")

# Collect the logged metrics of all runs and persist them next to the notebook.
results_df = colab_functions.get_experiment_results()
results_df.to_csv("pretrained_model_results.csv", index=False)

# Results

In [None]:
# The experiment was run in two parts, so the working directory may contain
# several "pretrained_model_results*" files; stack all of them.
# Sorting the paths makes the row order reproducible (glob order is arbitrary).
result_files = sorted(Path.cwd().glob("pretrained_model_results*"))

# Lookup table: model name -> parameter count.
# Selecting the column directly always yields a Series; the earlier
# DataFrame.squeeze() collapsed a one-row table to a scalar, which has no
# .to_dict().
param_count_by_model = candidates_df.set_index("model")["param_count"].to_dict()

results_df = (
    pd.concat([pd.read_csv(csv_file) for csv_file in result_files], axis="index")
    .reset_index(drop=True)
    # Experiments without a matching candidate row get NaN as param_count.
    .assign(param_count=lambda df: df.experiment.map(param_count_by_model))
)

results_df

In [None]:
# Find, for every model, the epoch at which it reached its lowest val_loss.

# One row per (epoch, experiment, param_count) with one column per metric.
metrics_wide = (
    results_df.pivot(
        index=["epoch", "experiment", "param_count"],
        columns="variable",
        values="value",
    )
    .reset_index()
    .sort_values(by=["experiment", "epoch"])
)

# Row label of the lowest-val_loss epoch of each experiment.
best_epoch_labels = metrics_wide.groupby("experiment")["val_loss"].idxmin()

# Rank the models: lowest val_loss first, ties broken by the higher val_f1.
best_models = metrics_wide.loc[best_epoch_labels].sort_values(
    ["val_loss", "val_f1"], ascending=[True, False]
)

best_models.head(10)

In [None]:
# Validation metrics that are reshaped to long format for plotting.
metric_columns = [
    "val_accuracy",
    "val_f1",
    "val_loss",
    "val_precision",
    "val_recall",
]

best_models_melted = (
    best_models.assign(
        # Architecture family = leading token of the model name with all digits
        # removed, e.g. "convnextv2_nano.fcmae_ft_in22k_in1k_384" -> "convnextv"
        # -> "convnext". The pattern must be applied as a regular expression:
        # pandas >= 2.0 treats the pattern as a literal string by default, so
        # without regex=True no digit is stripped and the mapping below never
        # matches.
        family=lambda df: df.experiment.str.split("_", expand=True)[0]
        .str.split(".", expand=True)[0]
        .str.replace(r"\d+", "", regex=True)
        .replace({"convnextv": "convnext", "densenetblurd": "densenet"})
    )
    # melt keeps only id_vars and value_vars, so no prior column selection is
    # needed.
    .melt(
        id_vars=["experiment", "family", "param_count"],
        value_vars=metric_columns,
    )
)
best_models_melted

In [None]:
# One panel per model family: metric value against parameter count, with one
# regression line per metric. Each panel gets its own x-range; the y-axis is
# shared across panels.
facet_options = {"sharex": False, "sharey": True}

with sns.plotting_context("talk", font_scale=0.8):
    grid = sns.lmplot(
        data=best_models_melted,
        x="param_count",
        y="value",
        hue="variable",
        col="family",
        height=3,
        facet_kws=facet_options,
    )
    # Disabled: main title for the entire FacetGrid.
    # grid.fig.suptitle(f"{metric}", fontweight="bold", size=16, y=1.05)
    grid.set_titles(
        row_template="{row_name}",
        col_template="{col_name}",
        fontweight="bold",
        size=16,
    )
    grid.savefig("summary.png")