# Mount drive and append path to PYTHONPATH


In [None]:
import os
import sys

from google.colab import drive

# Mount the user's Google Drive at /content/drive inside the Colab runtime.
drive.mount("/content/drive")
# Add the project folder on Drive to the module search path so that modules
# stored there can be imported (presumably the local helpers imported further
# down, e.g. colab_functions / prepare_data / train_NN -- verify).
sys.path.append("/content/drive/MyDrive/DeepLCMS/train_google_colab")

# Import and install libraries

In [None]:
%%capture
# %%capture swallows the (verbose) pip output of this cell.
# Install the third-party packages imported in the next cell.
!pip install lightning
!pip install timm
!pip install torchinfo
!pip install scikit-posthocs

In [None]:
import gc
from typing import Optional, Tuple
from pathlib import Path

import colab_functions
import colab_utils
import pandas as pd
import prepare_data
import pytorch_lightning as pl
import timm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchinfo
import train_NN
from google.colab import drive
from lightning.pytorch.loggers import CSVLogger
from pytorch_lightning import LightningModule
from pytorch_lightning.callbacks import Callback, EarlyStopping
from pytorch_lightning.trainer.trainer import Trainer
from timm import create_model
from torchmetrics.classification import (
    BinaryAUROC,
    BinaryF1Score,
    BinaryPrecision,
    BinaryRecall,
)
import seaborn as sns
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import scikit_posthocs as sp

In [None]:
# Set the CUDA_VISIBLE_DEVICES environment variable so that only the GPU with
# index 0 is exposed to CUDA in this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Unzip data

In [None]:
%%script echo skipping
# NOTE: the `%%script echo skipping` line above hands this cell to `echo`
# instead of the Python kernel, so nothing below is executed. Remove that
# first line to actually extract every zip archive in the working directory.
!unzip -q "*.zip"

# Check if GPU is used

In [None]:
# Ask the project helper which device to use (presumably a CUDA device when
# one is available, otherwise the CPU -- verify in colab_functions).
device = colab_functions.get_device()

# Testing experimental conditions


In [None]:
%%script echo skipping
# Recursively search the working directory for dataset folders, i.e. every
# directory whose name starts with "ST001618_Opium_study_LC_MS".
img_paths = list(
    filter(
        lambda candidate: candidate.is_dir(),
        Path("./").rglob("ST001618_Opium_study_LC_MS*"),
    )
)
img_paths

In [None]:
%%script echo skipping

# Train a fresh model five times on every dataset folder found in `img_paths`,
# logging each run with a CSVLogger named after the dataset folder.
# NOTE: this cell is currently disabled by the `%%script echo skipping` magic
# on its first line; remove that line to re-run the experiments.
for img_path in img_paths:
    # `replicate` rather than `round`, which would shadow the builtin round().
    for replicate in range(1, 6):
        try:
            print(f"Round {replicate}, working on: {img_path}")
            temp_model = train_NN.Resnet_model()

            # Pre-processing pipelines for the three splits, obtained from the
            # project helper for this model.
            (
                preprocess_train,
                preprocess_val,
                preprocess_test,
            ) = prepare_data.get_timm_transforms(temp_model)

            # Each dataset folder is expected to contain train/, val/ and
            # test/ sub-directories.
            (
                train_dataloader,
                val_dataloader,
                test_dataloader,
            ) = prepare_data.get_dataloaders(
                train_dir=img_path / "train",
                val_dir=img_path / "val",
                test_dir=img_path / "test",
                preprocess_train=preprocess_train,
                preprocess_val=preprocess_val,
                preprocess_test=preprocess_test,
            )

            logger = CSVLogger("logs", name=str(img_path))

            # At most 50 epochs; training stops early once `val_loss` stops
            # decreasing.
            trainer = Trainer(
                max_epochs=50,
                log_every_n_steps=1,
                logger=logger,
                callbacks=[EarlyStopping(monitor="val_loss", mode="min")],
            )

            trainer.fit(
                model=temp_model,
                train_dataloaders=train_dataloader,
                val_dataloaders=val_dataloader,
            )
        except (RuntimeError, ValueError) as e:
            # `except RuntimeError or ValueError` only ever caught
            # RuntimeError, because the `or` expression evaluates to its first
            # operand. Catch both, report the failure instead of hiding it,
            # and carry on with the next replicate.
            print(f"Round {replicate} on {img_path} failed: {e!r}")
        finally:
            # Clean up resources: drop the references so the garbage collector
            # can reclaim them before the next run. Rebinding to None (rather
            # than `del`) is safe even when a run failed before every name was
            # assigned.
            temp_model = None
            preprocess_train = preprocess_val = preprocess_test = None
            train_dataloader = val_dataloader = test_dataloader = None
            trainer = None
            gc.collect()

# Gather the metrics of all runs via the project helper, save them to disk so
# the analysis below can be repeated without re-training, and plot them.
results_df = colab_functions.get_experiment_results()
results_df.to_csv("experimental_conditions.csv", index=True)
colab_functions.plot_experiment_results(results_df)

# Load saved result from disk


In [None]:
# Load saved result from disk (the CSV written by the training cell), so the
# analysis below can run without re-training.
results_df = pd.read_csv("experimental_conditions.csv")

In [None]:
# Since we ran 5 experiments per dataset we have to assign a new column
# called replicate to designate which replicate the epochs belong to.
#
# A new replicate starts on every row whose epoch is 0, so the running count of
# `epoch == 0` rows is the replicate number. `assign` attaches the result by
# index label, so it stays correct even if `results_df` does not carry a
# default 0..n-1 index (a positional `pd.concat(..., axis=1)` would misalign).
results_df_w_experiment = results_df.assign(
    replicate=results_df.epoch.eq(0).cumsum()
)
results_df_w_experiment

In [None]:
# Extract the maximum value per replicate for every metric except the loss
# values (for losses the minimum is the relevant figure; handled separately).

max_values = (
    results_df_w_experiment.groupby(["variable", "replicate", "experiment"])
    .value.max()
    .to_frame()
    .reset_index(drop=False)
    # Both conditions must hold to exclude the losses. With `|` the filter was
    # always true (no value equals both names at once), so the maximum loss
    # values leaked into this table.
    .query("variable != 'val_loss' & variable != 'train_loss'")
    .drop(columns="replicate")
)

In [None]:
# Lowest value reached in each replicate, kept for the two loss metrics only.

min_values = (
    results_df_w_experiment.groupby(["variable", "replicate", "experiment"])["value"]
    .min()
    .reset_index()
    .loc[lambda frame: frame["variable"].isin(["val_loss", "train_loss"])]
    .drop(columns="replicate")
)

In [None]:
# Stack the per-replicate best values and swap the dataset folder names for
# human-readable labels.
experiment_replicates = pd.concat([max_values, min_values]).replace(
    to_replace={
        "ST001618_Opium_study_LC_MS_500": "500 images",
        "ST001618_Opium_study_LC_MS_1000": "1000 images",
        "ST001618_Opium_study_LC_MS_500_augmented": "500 images augmented",
        "ST001618_Opium_study_LC_MS_1000_augmented": "1000 images augmented",
    }
)
# Turn metric names such as "val_loss" into title-cased labels ("Val Loss").
experiment_replicates = experiment_replicates.assign(
    variable=lambda frame: frame["variable"].str.replace("_", " ").str.title()
)
experiment_replicates.to_csv("experiment_replicates.csv", index=False)

In [None]:
# One panel per metric (wrapped after five columns) with a horizontal bar per
# experiment; the replicates are aggregated by seaborn's barplot defaults.
with sns.plotting_context("talk", font_scale=0.8):
    grid = sns.FacetGrid(experiment_replicates, col="variable", col_wrap=5)
    grid.map_dataframe(sns.barplot, x="value", y="experiment", capsize=0.15)

    # Panel titles show just the metric name; axis labels are left blank.
    grid.set_titles(
        row_template="{row_name}",
        col_template="{col_name}",
        fontweight="bold",
        size=16,
    )
    grid.set_axis_labels("", "")

    # Write the bar value (two decimals) next to every bar.
    for axis in grid.axes.flatten():
        for bars in axis.containers:
            value_labels = [format(value, ".2f") for value in bars.datavalues]
            axis.bar_label(bars, labels=value_labels, fontsize=10, padding=17)

    plt.tight_layout()

    grid.savefig("experiment_result.png")

# Testing statistical significance with Dunn’s test

In [None]:
# Pairwise Dunn's test between experiments, run separately for every metric.
# The resulting p-value tables (Benjamini-Hochberg adjusted) are stored per
# metric name.
results_dict = {}

for metric in experiment_replicates["variable"].unique():
    metric_rows = experiment_replicates.loc[
        experiment_replicates["variable"] == metric
    ]
    print(metric)
    results_dict[metric] = sp.posthoc_dunn(
        metric_rows,
        val_col="value",
        group_col="experiment",
        p_adjust="fdr_bh",
    )

In [None]:
# Show only the rows of the stacked p-value tables that contain at least one
# significant comparison (p < 0.05). Rows with an identical row sum are then
# collapsed to one -- presumably to drop the mirrored entry of each symmetric
# pair (verify).
(
    pd.concat(results_dict)
    .loc[lambda pvals: (pvals < 0.05).any(axis=1)]
    .assign(sum_value=lambda pvals: pvals.sum(axis=1))
    .drop_duplicates(subset="sum_value")
    .drop(columns="sum_value")
)