# Prétraitement et augmentation des données

### Division en ensembles Train/Validation/Test

In [1]:
import joblib
import numpy as np
from sklearn.preprocessing import LabelEncoder
from PIL import Image 
images, labels = joblib.load("../data/processed_data.pkl")

target_size = (224, 224)

x_resized = []
for img in images:
    img_pil = Image.fromarray(img)
    img_resized = img_pil.resize(target_size)
    x_resized.append(np.array(img_resized))

x = np.array(x_resized)
y = np.array(labels)
le = LabelEncoder()
y_enc = le.fit_transform(y)  

### Encodage des étiquettes catégorielles avec LabelEncoder

In [2]:
import os
import random

train_split = 0.7
val_split = 0.15
test_split = 0.15

for subset in ["train", "val", "test"]:
    os.makedirs(os.path.join("../data/data_splite", subset), exist_ok=True)

classes = np.unique(y)
for cls in classes:
    for subset in ["train", "val", "test"]:
        os.makedirs(os.path.join("../data/data_splite", subset, cls), exist_ok=True)

data = list(range(len(x)))
random.shuffle(data)

n_total = len(data)
n_train = int(0.7 * n_total)
n_val = int(0.15 * n_total)
n_test = n_total - n_train - n_val

splits = {
    "train": data[:n_train],
    "val": data[n_train:n_train + n_val],
    "test": data[n_train + n_val:]
}

image_count = {"train": 0, "val": 0, "test": 0}

for subset_name, subset_indices in splits.items():
    for idx_in_subset, idx in enumerate(subset_indices):
        img_array = x[idx]
        label = y[idx]
        img = Image.fromarray(img_array)
        save_path = os.path.join("../data/data_splite", subset_name, label)
        img.save(os.path.join(save_path, f"{idx_in_subset}.png"))
        image_count[subset_name] += 1
        
for subset_name, count in image_count.items():
    print(f"{subset_name}: {count} images")

train: 2269 images
val: 486 images
test: 487 images


### Nombre d’images par classe dans le dossier d’entraînement

In [3]:
train_dir = "../data/data_splite/train"

for cls in os.listdir(train_dir):
    path = os.path.join(train_dir, cls)
    if os.path.isdir(path):
        print(f"{cls} : {len(os.listdir(path))}")

Benign : 914
early Pre-B : 1511
Pre-B : 1478
Pro-B : 1301


### Augmentation et équilibrage des données d’entraînement par transformations d’images

In [4]:
import torch
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

train_transforms = transforms.Compose([
    transforms.RandomHorizontalFlip(),             
    transforms.RandomApply([                    
        transforms.GaussianBlur(kernel_size=(3,3), sigma=(0.1, 2.0)),
        transforms.ColorJitter(brightness=0.2)    
    ], p=0.5),
    transforms.ToTensor(),
    transforms.Normalize([0.5,0.5,0.5], [0.5,0.5,0.5])                          
])


val_test_transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize([0.5,0.5,0.5], [0.5,0.5,0.5])
])

train_dataset = datasets.ImageFolder("../data/data_splite/train", transform=train_transforms)
val_dataset   = datasets.ImageFolder("../data/data_splite/val", transform=val_test_transforms)
test_dataset  = datasets.ImageFolder("../data/data_splite/test", transform=val_test_transforms)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader  = DataLoader(test_dataset, batch_size=32, shuffle=False)

print("Nombre d'images d'entraînement :", len(train_dataset))
print("Nombre d'images de validation :", len(val_dataset))
print("Nombre d'images de test :", len(test_dataset))


Nombre d'images d'entraînement : 5204
Nombre d'images de validation : 1105
Nombre d'images de test : 1108


### Sauvegarde des datasets PyTorch pour réutilisation

In [5]:
import os
os.makedirs("../data/Data_Loaders", exist_ok=True)


torch.save(train_dataset, "../data/Data_Loaders/train_dataset.pt")
torch.save(val_dataset, "../data/Data_Loaders/val_dataset.pt")
torch.save(test_dataset, "../data/Data_Loaders/test_dataset.pt")