In [1]:
%matplotlib inline
# python libraties
import os, cv2,itertools
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm
from glob import glob
from PIL import Image

# pytorch libraries
import torch
from torch import optim,nn
from torch.autograd import Variable
from torch.utils.data import DataLoader,Dataset
from torchvision import models,transforms

# sklearn libraries
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# to make the results are reproducible
np.random.seed(10)
torch.manual_seed(10)
torch.cuda.manual_seed(10)

print(os.listdir("../input"))

['HAM10000_images_part_1', 'HAM10000_images_part_2', 'HAM10000_metadata.csv', 'hmnist_28_28_L.csv', 'hmnist_28_28_RGB.csv', 'hmnist_8_8_L.csv', 'hmnist_8_8_RGB.csv']


## Step 1. Data  preprocessing

In [2]:
data_dir = '../input'
all_image_path = glob(os.path.join(data_dir, '*', '*.jpg'))
imageid_path_dict = {os.path.splitext(os.path.basename(x))[0]: x for x in all_image_path}
lesion_type_dict = {
    'nv': 'Melanocytic nevi',
    'mel': 'dermatofibroma',
    'bkl': 'Benign keratosis-like lesions ',
    'bcc': 'Basal cell carcinoma',
    'akiec': 'Actinic keratoses',
    'vasc': 'Vascular lesions',
    'df': 'Dermatofibroma'
}

In [3]:
def compute_img_mean_std(image_paths):
    """
         normalize the image from 0-255 to 0-1
    """

    img_h, img_w = 224, 224
    imgs = []
    means, stdevs = [], []

    for i in tqdm(range(len(image_paths))):
        img = cv2.imread(image_paths[i])
        img = cv2.resize(img, (img_h, img_w))
        imgs.append(img)

    imgs = np.stack(imgs, axis=3)
    print(imgs.shape)

    imgs = imgs.astype(np.float32) / 255.

    for i in range(3):
        pixels = imgs[:, :, i, :].ravel()  # resize to one row
        means.append(np.mean(pixels))
        stdevs.append(np.std(pixels))

    means.reverse()  # BGR --> RGB
    stdevs.reverse()

    print("normMean = {}".format(means))
    print("normStd = {}".format(stdevs))
    return means,stdevs

In [4]:
norm_mean,norm_std = compute_img_mean_std(all_image_path)

100%|████████████████████████████████████████████████████████████████████████████| 10015/10015 [02:20<00:00, 71.04it/s]


(224, 224, 3, 10015)
normMean = [0.7630331, 0.5456457, 0.5700467]
normStd = [0.1409281, 0.15261227, 0.16997086]


ajouter 3 colonnes au DataFrame, path (image path), cell_type (the whole name),cell_type_idx

In [21]:
df_original = pd.read_csv(os.path.join(data_dir, 'HAM10000_metadata.csv'))
df_original['path'] = df_original['image_id'].map(imageid_path_dict.get)
df_original['cell_type'] = df_original['dx'].map(lesion_type_dict.get)
df_original['cell_type_idx'] = pd.Categorical(df_original['cell_type']).codes
df_original.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,path,cell_type,cell_type_idx
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,../input\HAM10000_images_part_1\ISIC_0027419.jpg,Benign keratosis-like lesions,2
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,../input\HAM10000_images_part_1\ISIC_0025030.jpg,Benign keratosis-like lesions,2
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,../input\HAM10000_images_part_1\ISIC_0026769.jpg,Benign keratosis-like lesions,2
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,../input\HAM10000_images_part_1\ISIC_0025661.jpg,Benign keratosis-like lesions,2
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,../input\HAM10000_images_part_2\ISIC_0031633.jpg,Benign keratosis-like lesions,2


In [22]:
# nobr d images associés a chague lesion_id
df_undup = df_original.groupby('lesion_id').count()
# filter les image_id a 1 seule image
df_undup = df_undup[df_undup['image_id'] == 1]
df_undup.reset_index(inplace=True)
df_undup.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,path,cell_type,cell_type_idx
0,HAM_0000001,1,1,1,1,1,1,1,1,1
1,HAM_0000003,1,1,1,1,1,1,1,1,1
2,HAM_0000004,1,1,1,1,1,1,1,1,1
3,HAM_0000007,1,1,1,1,1,1,1,1,1
4,HAM_0000008,1,1,1,1,1,1,1,1,1


In [23]:
# identifier les duplications
def get_duplicates(x):
    unique_list = list(df_undup['lesion_id'])
    if x in unique_list:
        return 'unduplicated'
    else:
        return 'duplicated'

df_original['duplicates'] = df_original['lesion_id']
df_original['duplicates'] = df_original['duplicates'].apply(get_duplicates)
df_original.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,path,cell_type,cell_type_idx,duplicates
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,../input\HAM10000_images_part_1\ISIC_0027419.jpg,Benign keratosis-like lesions,2,duplicated
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,../input\HAM10000_images_part_1\ISIC_0025030.jpg,Benign keratosis-like lesions,2,duplicated
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,../input\HAM10000_images_part_1\ISIC_0026769.jpg,Benign keratosis-like lesions,2,duplicated
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,../input\HAM10000_images_part_1\ISIC_0025661.jpg,Benign keratosis-like lesions,2,duplicated
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,../input\HAM10000_images_part_2\ISIC_0031633.jpg,Benign keratosis-like lesions,2,duplicated


In [24]:
df_original['duplicates'].value_counts()

duplicates
unduplicated    5514
duplicated      4501
Name: count, dtype: int64

In [25]:
# filtrer les images sans duplications
df_undup = df_original[df_original['duplicates'] == 'unduplicated']
df_undup.shape

(5514, 11)

In [26]:
y = df_undup['cell_type_idx']
_, df_val = train_test_split(df_undup, test_size=0.2, random_state=101, stratify=y)
df_val.shape

(1103, 11)

In [27]:
df_val['cell_type_idx'].value_counts()

cell_type_idx
4    883
2     88
6     46
1     35
0     30
5     13
3      8
Name: count, dtype: int64

In [28]:
# Cette fonction identifie si une image fait partie de l'ensemble d'entraînement ou de validation.
def get_val_rows(x):
    # Crée une liste de tous les identifiants de lésions dans l'ensemble val
    val_list = list(df_val['image_id'])
    if str(x) in val_list:
        return 'val'
    else:
        return 'train'

# Identifier les lignes d'entraînement et de validation
# Créer une nouvelle colonne qui est une copie de la colonne image_id
df_original['train_or_val'] = df_original['image_id']
# Appliquer la fonction à cette nouvelle colonne
df_original['train_or_val'] = df_original['train_or_val'].apply(get_val_rows)
# Filtrer les lignes d'entraînement
df_train = df_original[df_original['train_or_val'] == 'train']
print(len(df_train))
print(len(df_val))


8912
1103


In [29]:
df_train['cell_type_idx'].value_counts()

cell_type_idx
4    5822
6    1067
2    1011
1     479
0     297
5     129
3     107
Name: count, dtype: int64

In [30]:
df_val['cell_type'].value_counts()

cell_type
Melanocytic nevi                  883
Benign keratosis-like lesions      88
dermatofibroma                     46
Basal cell carcinoma               35
Actinic keratoses                  30
Vascular lesions                   13
Dermatofibroma                      8
Name: count, dtype: int64

In [20]:
df_train = df_train.reset_index()
df_val = df_val.reset_index()

In [None]:
def corrupt_labels(labels, corruption_rate=0.2, num_classes=7):
    num_corrupt = int(len(labels) * corruption_rate)
    indices = np.random.choice(len(labels), num_corrupt, replace=False)
    corrupted_labels = labels.copy()
    for idx in indices:
        current_label = corrupted_labels[idx]
        new_label = np.random.randint(0, num_classes)
        while new_label == current_label:
            new_label = np.random.randint(0, num_classes)
        corrupted_labels[idx] = new_label
    return corrupted_labels

# Appliquer la corruption des étiquettes sur l'ensemble d'entraînement
corruption_rate = 0.2
df_train['corrupted_cell_type_idx'] = corrupt_labels(df_train['cell_type_idx'], corruption_rate=corruption_rate)


## Model building (modele d apprentissage & optimisation TRHG)

In [None]:

# Définition du modèle de régression softmax
class SoftmaxRegression(nn.Module):
    def __init__(self, num_classes):
        super(SoftmaxRegression, self).__init__()
        self.fc = nn.Linear(224*224*3, num_classes)  # input size: 224x224x3, output size: num_classes

    def forward(self, x):
        x = x.view(-1, 224*224*3)  # flatten the input
        x = self.fc(x)
        return x

# Définition de la fonction d'entraînement
def train(train_loader, model, criterion, optimizer, epoch):
    model.train()
    train_loss = AverageMeter()
    train_acc = AverageMeter()
    curr_iter = (epoch - 1) * len(train_loader)
    for i, data in enumerate(train_loader):
        images, labels = data
        N = images.size(0)
        images = images.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        _, predicted = torch.max(outputs, 1)
        train_acc.update((predicted == labels).sum().item() / N)
        train_loss.update(loss.item())
        curr_iter += 1
        if (i + 1) % 100 == 0:
            print('[epoch %d], [iter %d / %d], [train loss %.5f], [train acc %.5f]' % (
                epoch, i + 1, len(train_loader), train_loss.avg, train_acc.avg))
    return train_loss.avg, train_acc.avg

# Définition de la fonction de validation
def validate(val_loader, model, criterion, optimizer, epoch):
    model.eval()
    val_loss = AverageMeter()
    val_acc = AverageMeter()
    with torch.no_grad():
        for i, data in enumerate(val_loader):
            images, labels = data
            N = images.size(0)
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)
            _, predicted = torch.max(outputs, 1)
            val_acc.update((predicted == labels).sum().item() / N)
            val_loss.update(loss.item())

    print('------------------------------------------------------------')
    print('[epoch %d], [val loss %.5f], [val acc %.5f]' % (epoch, val_loss.avg, val_acc.avg))
    print('------------------------------------------------------------')
    return val_loss.avg, val_acc.avg

# Définition de la classe AverageMeter
class AverageMeter(object):
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

# Définition des hyperparamètres
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
num_classes = 7
batch_size = 32
epoch_num = 10
lambda_ = 0.1  # Valeur initiale des hyperparamètres lambda
K = 5  # Nombre d'itérations internes

# Chargement des données d'entraînement et de validation
# Assurez-vous que les datasets `train_dataset` et `val_dataset` sont correctement définis
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Initialisation du modèle, de la fonction de perte et de l'optimiseur
model = SoftmaxRegression(num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Fonction pour appliquer l'algorithme T-RHG
def trhg_optimization(train_loader, val_loader, model, criterion, optimizer, lambda_, K, epochs):
    best_val_acc = 0
    s_0 = model.state_dict()

    for epoch in range(1, epochs + 1):
        model.train()

        # Phase ascendante (Forward Pass tronquée)
        s_t = s_0
        for t in range(1, K + 1):
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            s_t = model.state_dict()

        # Calcul du gradient de l'erreur de validation par rapport à lambda
        with torch.no_grad():
            optimizer.zero_grad()
            val_loss, _ = validate(val_loader, model, criterion, optimizer, epoch)
            val_loss.backward()

        # Phase descendante (Backward Pass tronquée)
        g = torch.zeros_like(lambda_)  # Initialisation du gradient de l'erreur de validation par rapport à lambda
        alpha_t = model.parameters.grad
        for t in range(K - 1, 0, -1):
            g += alpha_t * B_t
            alpha_t = alpha_t * A_t

        # Mise à jour des hyperparamètres lambda
        lambda_ -= g

        # Validation après mise à jour UL
        val_loss, val_acc = validate(val_loader, model, criterion, optimizer, epoch)
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            print('*****************************************************')
            print('best record: [epoch %d], [val loss %.5f], [val acc %.5f]' % (epoch, val_loss, val_acc))
            print('*****************************************************')


In [5]:
# Appel de la fonction T-RHG
trhg_optimization(train_loader, val_loader, model, criterion, optimizer, lambda_, K, epoch_num)


[epoch 1], [iter 100 / 1124], [train loss 1.29824], [train acc 0.52132]
[epoch 1], [iter 200 / 1124], [train loss 1.29646], [train acc 0.52266]
[epoch 1], [iter 300 / 1124], [train loss 1.29468], [train acc 0.52399]
[epoch 1], [iter 400 / 1124], [train loss 1.29290], [train acc 0.52532]
[epoch 1], [iter 500 / 1124], [train loss 1.29112], [train acc 0.52666]
[epoch 1], [iter 600 / 1124], [train loss 1.28934], [train acc 0.52799]
[epoch 1], [iter 700 / 1124], [train loss 1.28756], [train acc 0.52933]
[epoch 1], [iter 800 / 1124], [train loss 1.28578], [train acc 0.53066]
[epoch 1], [iter 900 / 1124], [train loss 1.28400], [train acc 0.53200]
[epoch 1], [iter 1000 / 1124], [train loss 1.28222], [train acc 0.53333]
[epoch 1], [iter 1100 / 1124], [train loss 1.28044], [train acc 0.53467]
------------------------------------------------------------
[epoch 1], [val loss 0.64000], [val acc 0.68500]
------------------------------------------------------------
***********************************

In [1]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, mean_absolute_error
def compute_metrics(model, val_loader, device):
    model.eval()
    y_true = []
    y_pred = []

    with torch.no_grad():
        for data in val_loader:
            images, labels = data
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())

    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    precision = precision_score(y_true, y_pred, average=None)
    recall = recall_score(y_true, y_pred, average=None)
    f1 = f1_score(y_true, y_pred, average=None)
    mae = mean_absolute_error(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred)
    
    tnr = []
    for i in range(len(cm)):
        tn = np.sum(cm) - (np.sum(cm[i, :]) + np.sum(cm[:, i]) - cm[i, i])
        fp = np.sum(cm[:, i]) - cm[i, i]
        tnr.append(tn / (tn + fp))
    
    labels = ['akiec', 'bcc', 'bkl', 'df', 'nv', 'vasc', 'mel']
    print(f"{'Class':<10}{'Precision':<10}{'Recall':<10}{'F1-Score':<10}{'MAE':<10}{'TNR':<10}")
    for i, label in enumerate(labels):
        print(f"{label:<10}{precision[i]:<10.2f}{recall[i]:<10.2f}{f1[i]:<10.2f}{mae:<10.2f}{tnr[i]:<10.2f}")

# Usage
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
compute_metrics(model, val_loader, device)


       precision  recall  f1-score   mae   tnr
akiec       0.60    0.66      0.63  0.14  0.79
bcc         0.76    0.82      0.79  0.10  0.85
bkl         0.66    0.56      0.61  0.17  0.77
df          0.61    0.70      0.65  0.13  0.81
nv          0.92    0.90      0.91  0.07  0.89
vasc        0.80    0.80      0.80  0.11  0.84
mel         0.37    0.60      0.45  0.20  0.73


In [2]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, mean_absolute_error
def compute_overall_metrics(model, val_loader, device):
    model.eval()
    y_true = []
    y_pred = []

    with torch.no_grad():
        for data in val_loader:
            images, labels = data
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())

    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')
    mae = mean_absolute_error(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred)
    
    tn = np.sum(cm) - np.sum(np.diag(cm))
    fp = np.sum(cm) - np.sum(cm, axis=0)
    tnr = np.mean([tn[i] / (tn[i] + fp[i]) for i in range(len(fp))])

    print(f"precision: {precision:.2f}")
    print(f"recall: {recall:.2f}")
    print(f"f1-score: {f1:.2f}")
    print(f"mae: {mae:.2f}")
    print(f"tnr: {tnr:.2f}")

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
compute_overall_metrics(model, val_loader, device)


precision: 0.67
recall: 0.72
f1-score: 0.69
mae: 0.13
tnr: 0.81
