# Mount drive and append path to PYTHONPATH


In [None]:
import os
import sys

from google.colab import drive

# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount("/content/drive")
# Put the project's module directory on the import search path
# (presumably where colab_functions, colab_utils and prepare_data live — confirm).
sys.path.append("/content/drive/MyDrive/DeepLCMS/gpu_modules")

# Import and install libraries

In [None]:
%%capture
!pip install lightning
!pip install timm
!pip install torchinfo
!pip install torchmetrics
!pip install torchcam
!pip install catboost

In [None]:
from pathlib import Path
from tqdm.auto import tqdm

import colab_functions
import colab_utils
import pandas as pd
import prepare_data
import numpy as np
import PIL
import matplotlib.pyplot as plt
from typing import Tuple

import catboost
import lightgbm as lgb
import xgboost
from IPython.display import clear_output, display
from sklearn import (
    compose,
    dummy,
    ensemble,
    impute,
    linear_model,
    metrics,
    model_selection,
    pipeline,
    preprocessing,
    svm,
    tree,
)

# Unzip data

In [None]:
!unzip -q "/content/drive/MyDrive/DeepLCMS/ST001618_Opium_study_LC_MS_500.zip"

# Final training with optimized settings

In [None]:
from pathlib import Path
import PIL
import numpy as np
import pandas as pd
from typing import Tuple


def convert_images_to_df(
    path: str,
    size: Tuple[int, int] = (25, 25),
    slice: str = "train",
    img_type: str = ".jpeg",
    last_col_name: str = "label",
) -> Tuple[pd.Series, pd.DataFrame]:
    """
    Load the images of one dataset split into a flat pixel DataFrame plus a label Series.

    Files are searched recursively under ``path`` and are expected to be laid out as
    ``<...>/<slice>/<label>/<image><img_type>``: the parent directory of an image is
    used as its label and the grandparent directory selects the split.

    Parameters:
    path (str): The directory that is searched recursively for images.
    size (Tuple[int, int]): The size every image is resized to before flattening. Default is (25, 25).
    slice (str): The name of the split directory (two levels above the image files) to load. Default is "train".
    img_type (str): The file suffix of the images. Default is ".jpeg".
    last_col_name (str): The name given to the returned label Series, so it becomes the column
        name when the labels are appended to the pixel DataFrame. Default is "label".

    Returns:
    Tuple[pd.Series, pd.DataFrame]: A tuple where the first element is a Series with the labels of the images,
    and the second element is a DataFrame where each row corresponds to an image and each column to one
    flattened pixel/channel value. Rows follow the sorted order of the image paths.

    Raises:
    ValueError: If ``path`` does not exist, if it contains no ``img_type`` files, if none of them
        belongs to ``slice``, or if the flattened images do not all have the same length.

    Example:
    >>> labels, df = convert_images_to_df(path="/path/to/images", size=(50, 50),
    slice="test", img_type=".png", last_col_name="class")
    """

    root = Path(path)
    if not root.exists():
        raise ValueError(f"The path {path} does not exist.")

    # Sort so the row order does not depend on filesystem enumeration order.
    img_paths = sorted(root.rglob(f"*{img_type}"))
    if not img_paths:
        raise ValueError(f"No {img_type} files found in {path}.")

    # `.parent.parent` (unlike `.parents[1]`) cannot raise IndexError on shallow paths.
    slice_paths = [p for p in img_paths if p.parent.parent.name == slice]
    if not slice_paths:
        raise ValueError(
            f"No {img_type} files found under a '{slice}' directory in {path}."
        )

    # Import the submodule explicitly: a bare `import PIL` does not guarantee
    # that `PIL.Image` has been loaded.
    from PIL import Image

    img_list = []
    labels = []
    for img_path in slice_paths:
        # The context manager closes the underlying file handle once the
        # pixel data has been copied into the numpy array.
        with Image.open(img_path) as img:
            np_img_flat = np.asarray(img.resize(size)).reshape(-1)

        img_list.append(np_img_flat)
        labels.append(img_path.parent.name)

    flattened_imgs = np.vstack(img_list)

    return pd.Series(labels, name=last_col_name), pd.DataFrame(flattened_imgs)

# Read in the train and test images and convert to dataframes

In [None]:
# Load both splits at 224x224; every call yields (labels, flattened-pixel DataFrame).
(y_train, X_train), (y_test, X_test) = (
    convert_images_to_df(
        path=r"/content/ST001618_Opium_study_LC_MS_500",
        size=(224, 224),
        slice=split,
    )
    for split in ("train", "test")
)

print(f"Shape of train set: {X_train.shape}, shape of test set: {X_test.shape}")

In [None]:
# Encode the class labels as integers: "User" -> 1, "Non-User" -> 0.
# Any label missing from the mapping becomes NaN (pandas `Series.map` behaviour).
y_test, y_train = (
    labels.map({"User": 1, "Non-User": 0}) for labels in (y_test, y_train)
)

In [None]:
# Shuffle the training rows. Features and targets are sampled separately with
# the same seed and `frac=1`; this relies on both objects having the same
# length so that the two draws produce the same permutation.
shuffled_X = X_train.sample(frac=1, random_state=42)
shuffled_y = y_train.sample(frac=1, random_state=42)
X_train, y_train = shuffled_X, shuffled_y
del shuffled_X, shuffled_y

# Assessing the performance of ML algorithms


In [None]:
# Candidate classifiers, each evaluated with its default hyper-parameters
# (only the logging verbosity of LightGBM / XGBoost is turned down).
MLA = [
    linear_model.LogisticRegression(),
    tree.DecisionTreeClassifier(),
    ensemble.RandomForestClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.AdaBoostClassifier(),
    lgb.LGBMClassifier(verbose=-1),
    xgboost.XGBClassifier(verbosity=0),
    dummy.DummyClassifier(),
    # ensemble.GradientBoostingClassifier(),
    # catboost.CatBoostClassifier(silent=True),
]

# Stratified 10-fold cross-validation (one repeat) is used instead of a single
# train/validation split.
cv_split = model_selection.RepeatedStratifiedKFold(
    n_splits=10, n_repeats=1, random_state=0
)

# Scorers passed to cross_validate; results come back under "test_<name>".
scoring = {
    "accuracy": "accuracy",
    "f1": "f1",
    "precision": "precision",
    "recall": "recall",
}

# Result-table column -> key of the cross_validate output it is averaged from.
metric_sources = {
    "Fit Time": "fit_time",
    "Val Mean Acc": "test_accuracy",
    "Val Mean F1": "test_f1",
    "Val Mean Precision": "test_precision",
    "Val Mean Recall": "test_recall",
}

# One row per algorithm: identification columns followed by the fold averages.
MLA_columns = ["MLA Name", "MLA Parameters", *metric_sources]
MLA_compare = pd.DataFrame(columns=MLA_columns)

# Cross-validate every candidate and refresh the leaderboard after each one.
for row_index, alg in enumerate(tqdm(MLA)):
    MLA_compare.loc[row_index, "MLA Name"] = alg.__class__.__name__
    MLA_compare.loc[row_index, "MLA Parameters"] = str(alg.get_params())

    # The scaler lives inside the pipeline, so it is re-fitted on the training
    # part of every fold.
    model_pipeline = pipeline.Pipeline(
        steps=[
            ("pre_processing", preprocessing.StandardScaler()),
            ("model", alg),
        ]
    )

    # Only held-out scores are tabulated, so training-set scoring is left at
    # its default (off) instead of being computed and discarded.
    cv_results = model_selection.cross_validate(
        model_pipeline,
        X_train,
        y_train,
        cv=cv_split,
        scoring=scoring,
    )

    for column, result_key in metric_sources.items():
        MLA_compare.loc[row_index, column] = cv_results[result_key].mean()

    # Replace the previous output with the table ranked by mean validation F1.
    clear_output(wait=True)
    display(MLA_compare.sort_values(by=["Val Mean F1"], ascending=False))

**AdaBoostClassifier** showed the best performance:


*   Val Acc : 0.934
*   Val F1 : 0.954
*   Val Precision : 0.973
*   Val Recall : 0.936


In [None]:
# Final model: standardise the pixel features, then fit LightGBM on the full
# training set.
# NOTE(review): the notebook text above names AdaBoostClassifier as the best
# performer, yet this cell fits LGBMClassifier — confirm which model is intended.
model_pipeline = pipeline.Pipeline(
    [
        ("pre_processing", preprocessing.StandardScaler()),
        ("model", lgb.LGBMClassifier(verbose=-1)),
    ]
)

# fit() returns the fitted pipeline itself, so predict() can be chained onto it.
predictions = model_pipeline.fit(X_train, y_train).predict(X_test)

In [None]:
# Per-class precision / recall / F1 of the final model on the test set.
print(metrics.classification_report(y_true=y_test, y_pred=predictions))

In [None]:
# Plot the test-set confusion matrix and save the current figure at 300 dpi.
disp = metrics.ConfusionMatrixDisplay(
    confusion_matrix=metrics.confusion_matrix(y_true=y_test, y_pred=predictions)
)
disp.plot()
plt.savefig("ConfusionMatrixDisplay.png", dpi=300)

In [None]:
# Persist the comparison table (in evaluation order, without the row index).
MLA_compare.to_csv(path_or_buf="MLA_compare.csv", index=False)