In [1]:
import sys
sys.path.append('../')

import torch
from sklearn.model_selection import StratifiedKFold
from data_prep.dataset import Dataset
from data_prep.dataset_loader import LoadData
from loguru import logger
import pandas as pd
import numpy as np
from utils.utils import excute_cmd, execute_cmd_realtime
from collections import Counter

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_curve, auc

from tqdm import tqdm
from scipy.stats import mode

import os
from glob import glob

from networks.NetworkController import getNetwork
from experiments.ClassifierController import getExperiment

# Custom log format
fmt = "{message}"
config = {
    "handlers": [
        {"sink": sys.stderr, "format": fmt},
    ],
}
logger.configure(**config)

# To allow auto reload to this notebook after modifying any external file imported
%load_ext autoreload
%autoreload 2

In [2]:
dataset_path_ch1_train = '../datasets/challenge1/train'
dataset_path_ch1_valid = '../datasets/challenge1/val'

train_dataset_df, train_images, train_labels, n_classes = LoadData(dataset_path_ch1_train, class_labels = {'nevus': 0, 'others': 1})
valid_dataset_df, valid_images, valid_labels, n_classes = LoadData(dataset_path_ch1_valid, class_labels = {'nevus': 0, 'others': 1})

Loading the data from ../datasets/challenge1/train
Loading the data from ../datasets/challenge1/val


In [3]:
val_dataset = Dataset(
        images_path=valid_images, labels=valid_labels, transform=True, split="val", 
        input_size=(224, 224)
        )
val_dataset

<data_prep.dataset.Dataset at 0x7f1792b1e530>

In [4]:
val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=32, shuffle=True)
val_loader

<torch.utils.data.dataloader.DataLoader at 0x7f1792b1e3e0>

In [5]:
print(len(train_images), len(train_labels))
print(len(valid_images), len(valid_labels))

15195 15195
3796 3796


In [6]:
train_dataset_df['Label'].value_counts()

Label
0    7725
1    7470
Name: count, dtype: int64

In [7]:
# https://chat.openai.com/share/a41b7798-5a48-4f8f-8603-4babcb68bb37

In [8]:
# Concatenate the training and validation datasets
all_images = train_images + valid_images
all_labels = train_labels + valid_labels

In [9]:
np.array(all_images).shape

(18991,)

In [10]:
np.array(all_labels).shape

(18991,)

In [11]:
# Create a DataFrame with all the data
all_data_df = pd.concat([train_dataset_df, valid_dataset_df], axis=0)

In [12]:
all_data_df

Unnamed: 0,Image_Path,Label
0,/mnt/c/Users/abdal/Documents/Master/EMJMD MAIA...,0
1,/mnt/c/Users/abdal/Documents/Master/EMJMD MAIA...,0
2,/mnt/c/Users/abdal/Documents/Master/EMJMD MAIA...,0
3,/mnt/c/Users/abdal/Documents/Master/EMJMD MAIA...,0
4,/mnt/c/Users/abdal/Documents/Master/EMJMD MAIA...,0
...,...,...
3791,/mnt/c/Users/abdal/Documents/Master/EMJMD MAIA...,1
3792,/mnt/c/Users/abdal/Documents/Master/EMJMD MAIA...,1
3793,/mnt/c/Users/abdal/Documents/Master/EMJMD MAIA...,1
3794,/mnt/c/Users/abdal/Documents/Master/EMJMD MAIA...,1


In [11]:
num_folds = 3

# Create lists to store the split indices
train_indices_list = []
valid_indices_list = []

# Assuming train_images and train_labels are numpy arrays or lists
skf = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=42)

val_rows_list = []

for fold, (train_index, val_index) in enumerate(skf.split(all_images, all_labels)):
    print(f"\n[CV {fold + 1}/{num_folds}]: Train index length {len(train_index)}, Validation index length {len(val_index)}...")
    print(train_index.dtype, train_index)
    print(len(val_index))
    images_path = [all_images[i] for i in train_index]
    labels_path = [all_images[i] for i in val_index]

# for fold in range(num_folds):
#     # Get the indices for the current split
#     train_indices = train_indices_list[fold]
#     valid_indices = valid_indices_list[fold]

#     # Use the indices to create the training and validation datasets
#     train_dataset = all_data_df.iloc[fold]
#     valid_dataset = all_data_df.iloc[fold]

#     # Separate the images and labels for training and validation
#     train_images = all_images[fold]
#     train_labels = all_labels[fold]

#     valid_images = all_images[fold]
#     valid_labels = all_labels[fold]


    
#     # Select rows based on the validation index
#     train_rows = train_dataset_df.iloc[train_index]
#     val_rows = train_dataset_df.iloc[val_index]

#     # Append val_rows to the list
#     val_rows_list.append(val_rows)

#     # Perform value counts on train_dataset_df for the selected rows
#     value_counts = val_rows['Label'].value_counts()
#     print(value_counts)

# # Check if all val_rows are the same
# are_val_rows_same = all(val_rows.equals(val_rows_list[0]) for val_rows in val_rows_list[1:])

# if are_val_rows_same:
#     print("\nValidation rows are the same in every fold.")
# else:
#     print("\nValidation rows differ across folds.")



[CV 1/3]: Train index length 12660, Validation index length 6331...
int64 [    0     1     3 ... 18986 18988 18990]
6331

[CV 2/3]: Train index length 12661, Validation index length 6330...
int64 [    0     1     2 ... 18987 18988 18989]
6330

[CV 3/3]: Train index length 12661, Validation index length 6330...
int64 [    2     4     5 ... 18987 18989 18990]
6330


In [12]:
num_folds = 7

# Create lists to store the split indices
train_indices_list = []
valid_indices_list = []

# Assuming train_images and train_labels are numpy arrays or lists
skf = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=42)

val_rows_list = []

for fold, (train_index, val_index) in enumerate(skf.split(all_images, all_labels)):
    print(f"\n[CV {fold + 1}/{num_folds}]: Train index length {len(train_index)}, Validation index length {len(val_index)}...")
    print(len(train_index))
    print(len(val_index))
    images_path = [all_images[i] for i in train_index]
    labels_path = [all_images[i] for i in val_index]



[CV 1/7]: Train index length 16278, Validation index length 2713...
16278
2713

[CV 2/7]: Train index length 16278, Validation index length 2713...
16278
2713

[CV 3/7]: Train index length 16278, Validation index length 2713...
16278
2713

[CV 4/7]: Train index length 16278, Validation index length 2713...
16278
2713

[CV 5/7]: Train index length 16278, Validation index length 2713...
16278
2713

[CV 6/7]: Train index length 16278, Validation index length 2713...
16278
2713

[CV 7/7]: Train index length 16278, Validation index length 2713...
16278
2713


In [13]:
val_index

array([    0,     1,    10, ..., 18957, 18962, 18974])

In [32]:
# command = 'python ../train_cv.py --train_path "../datasets/challenge1/train" \
#                         --valid_path "../datasets/challenge1/val" \
#                         --experiment_name "ClassifierExperimentCV" \
#                         --network_name "VGG16_BN_Attention" \
#                         --max_epochs "1" \
#                         --num_folds "2" \
#                         --batch_size "32" \
#                         --verbose "2"'

# # run the function to excute the command
# _ = execute_cmd_realtime(command, log=True)

Excuting training pipeline with 1 epochs and 2 folds.
Loading the data from ../datasets/challenge1/train
Loading the data from ../datasets/challenge1/val
Creating a classifier experiment using VGG16_BN_Attention network.
Using VGG16_BN_Attention with configurations: num_classes='2', normalize_attn='False'
[CV 1/2]: Epoch: 1, train batch 10, loss: 0.638522207736969, 3.4% complete
[CV 1/2]: Epoch: 1, train batch 20, loss: 0.4631451964378357, 6.7% complete
[CV 1/2]: Epoch: 1, train batch 30, loss: 0.49124935269355774, 10.1% complete
[CV 1/2]: Epoch: 1, train batch 40, loss: 0.6418288946151733, 13.5% complete
[CV 1/2]: Epoch: 1, train batch 50, loss: 0.5985834002494812, 16.8% complete
[CV 1/2]: Epoch: 1, train batch 60, loss: 0.5354160070419312, 20.2% complete
[CV 1/2]: Epoch: 1, train batch 70, loss: 0.3409406244754791, 23.6% complete
[CV 1/2]: Epoch: 1, train batch 80, loss: 0.40570464730262756, 26.9% complete
[CV 1/2]: Epoch: 1, train batch 90, loss: 0.5311029553413391, 30.3% complete
[

Inference is below..

In [13]:
network = getNetwork('VGG16_BN_Attention')
network

networks.VGG16.VGG16_BN_Attention

In [14]:
# model = network(num_classes=2)

In [15]:
output_path = 'outputs/ClassifierExperimentCV_224_epo20_bs32_lr0.0001_s42/2023-12-03_2157_VGG16_BN_Attention'

os.listdir(output_path)

['fold_1', 'fold_2', 'fold_3', 'fold_4', 'fold_5']

In [16]:
models_paths = sorted(glob(os.path.join(output_path, "***", "*.pth"), recursive=True))
models_paths

['outputs/ClassifierExperimentCV_224_epo20_bs32_lr0.0001_s42/2023-12-03_2157_VGG16_BN_Attention/fold_1/ClassifierExperimentCV_224_VGG16_BN_Attention_epo20_bs32_lr0.0001_seed42_fold1.pth',
 'outputs/ClassifierExperimentCV_224_epo20_bs32_lr0.0001_s42/2023-12-03_2157_VGG16_BN_Attention/fold_2/ClassifierExperimentCV_224_VGG16_BN_Attention_epo20_bs32_lr0.0001_seed42_fold2.pth',
 'outputs/ClassifierExperimentCV_224_epo20_bs32_lr0.0001_s42/2023-12-03_2157_VGG16_BN_Attention/fold_3/ClassifierExperimentCV_224_VGG16_BN_Attention_epo20_bs32_lr0.0001_seed42_fold3.pth',
 'outputs/ClassifierExperimentCV_224_epo20_bs32_lr0.0001_s42/2023-12-03_2157_VGG16_BN_Attention/fold_4/ClassifierExperimentCV_224_VGG16_BN_Attention_epo20_bs32_lr0.0001_seed42_fold4.pth',
 'outputs/ClassifierExperimentCV_224_epo20_bs32_lr0.0001_s42/2023-12-03_2157_VGG16_BN_Attention/fold_5/ClassifierExperimentCV_224_VGG16_BN_Attention_epo20_bs32_lr0.0001_seed42_fold5.pth']

In [17]:
# Assuming your 'network' class has a 'load_state_dict' method
def load_model(model, path):
    checkpoint = torch.load(path)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()

In [18]:
# Load all models
models = [network(num_classes=2) for _ in range(len(models_paths))]
for i, path in enumerate(models_paths):
    load_model(models[i], path)

Using VGG16_BN_Attention with configurations: num_classes='2', normalize_attn='False'
Using VGG16_BN_Attention with configurations: num_classes='2', normalize_attn='False'
Using VGG16_BN_Attention with configurations: num_classes='2', normalize_attn='False'
Using VGG16_BN_Attention with configurations: num_classes='2', normalize_attn='False'
Using VGG16_BN_Attention with configurations: num_classes='2', normalize_attn='False'


In [19]:
# Predictions for each model
all_predictions = [[] for _ in range(len(models))]
all_predictions

[[], [], [], [], []]

In [20]:
validation_labels = []

In [21]:
with torch.no_grad():
    for images, labels in tqdm(val_loader):
        for i, model in enumerate(models):
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            all_predictions[i].extend(predicted.cpu().numpy())
        
        validation_labels.extend(labels.cpu().numpy())

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 119/119 [34:07<00:00, 17.20s/it]


In [22]:
validation_labels = np.array(validation_labels)

In [23]:
validation_labels.shape

(3796,)

In [24]:
all_predictions = np.array(all_predictions)

In [25]:
all_predictions.shape

(5, 3796)

In [26]:
ensemble_predictions, _ = mode(all_predictions, axis=0)

In [27]:
ensemble_predictions.shape

(3796,)

In [28]:
# Calculate metrics using scikit-learn
accuracy = accuracy_score(validation_labels, ensemble_predictions)
kappa = cohen_kappa_score(validation_labels, ensemble_predictions)
auc = roc_auc_score(validation_labels, ensemble_predictions)

print(f"Ensemble Accuracy: {accuracy}")
print(f"Ensemble Cohen's Kappa: {kappa}")
print(f"Ensemble AUC: {auc}")

Ensemble Accuracy: 0.9230769230769231
Ensemble Cohen's Kappa: 0.8459548756987352
Ensemble AUC: 0.9225588430892604
