# Preprocessing Images - Feature Extraction ResNet50

Extraction de features images avec transfer learning (ResNet50 pré-entraîné sur ImageNet).

In [None]:
import os
import json
import numpy as np
import pandas as pd
from sklearn.utils import resample

from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Model
from tensorflow.keras.layers import GlobalAveragePooling2D

In [None]:
# Configuration
PROJECT_ROOT = ".."  # Adjust to your local setup

# Path components shared by the CSV folder and the image folders.
_RAW_PARTS = (PROJECT_ROOT, "data", "raw")
_IMAGE_PARTS = _RAW_PARTS + ("images", "images")

# Folder containing the CSV files read by this notebook.
CSV_PATH = os.path.join(*_RAW_PARTS)
# Folders containing the training and test images.
IMG_TRAIN_PATH = os.path.join(*_IMAGE_PARTS, "image_train")
IMG_TEST_PATH = os.path.join(*_IMAGE_PARTS, "image_test")
# Folder where the extracted features and metadata are written.
OUTPUT_PATH = os.path.join(PROJECT_ROOT, "data", "processed")

# Spatial size images are resized to; also the model's input height/width.
IMG_SIZE = (224, 224)
# Number of images per batch produced by the generators.
BATCH_SIZE = 32
# Number of rows drawn for each class during oversampling.
TARGET_PER_CLASS = 15000

## 1. Chargement des données

In [None]:
# Load the training descriptors and their labels, then join them on the
# shared CSV index (first column of both files).
df_x = pd.read_csv(os.path.join(CSV_PATH, "X_train_update.csv"), index_col=0)
df_y = pd.read_csv(os.path.join(CSV_PATH, "Y_train_CVw08PX.csv"), index_col=0)
df = pd.merge(df_x, df_y, left_index=True, right_index=True)

# Build the image filename column-wise: vectorized string concatenation gives
# the same "image_<imageid>_product_<productid>.jpg" strings as a row-by-row
# apply, without one Python call per row.
df['filename'] = (
    "image_" + df['imageid'].astype(str)
    + "_product_" + df['productid'].astype(str)
    + ".jpg"
)
# String version of the class code, passed as y_col to the image generator.
df['class_str'] = df['prdtypecode'].astype(str)

print(f"Dataset: {len(df)} images, {df['prdtypecode'].nunique()} classes")
# Last expression of the cell: displays the per-class row counts.
df['prdtypecode'].value_counts()

## 2. Équilibrage des classes (Oversampling)

In [None]:
# Oversample every class to exactly TARGET_PER_CLASS rows (sampling with
# replacement, same seed for every class). The per-class frames are collected
# first and concatenated once: growing a DataFrame with pd.concat inside the
# loop re-copies all accumulated rows on each iteration, and starting from an
# empty DataFrame relies on deprecated empty-frame concat behaviour.
resampled_parts = [
    resample(
        df[df['prdtypecode'] == code],
        replace=True,
        n_samples=TARGET_PER_CLASS,
        random_state=42,
    )
    for code in df['prdtypecode'].unique()
]
df_balanced = pd.concat(resampled_parts)

# Shuffle the rows reproducibly and discard the original (now duplicated) index.
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
print(f"Après équilibrage: {len(df_balanced)} images")

## 3. Modèle ResNet50 (Feature Extractor)

In [None]:
# ImageNet-pretrained ResNet50 without its classification head; the input is
# an IMG_SIZE image with 3 channels.
base_model = ResNet50(
    include_top=False,
    weights='imagenet',
    input_shape=(*IMG_SIZE, 3),
)

# Global average pooling on top of the backbone output; the pooled tensor is
# the feature vector this notebook extracts.
pooling_layer = GlobalAveragePooling2D()
x = pooling_layer(base_model.output)
model = Model(inputs=base_model.input, outputs=x)

print(f"Output shape: {model.output_shape}")

## 4. Data Augmentation + Extraction (Train)

In [None]:
# Random augmentation settings applied to the training images.
augmentation_options = {
    "rotation_range": 30,
    "width_shift_range": 0.2,
    "height_shift_range": 0.2,
    "shear_range": 0.1,
    "zoom_range": 0.2,
    "horizontal_flip": True,
    "fill_mode": 'nearest',
}
# preprocess_input is the ResNet50 preprocessing imported alongside the model.
datagen = ImageDataGenerator(
    preprocessing_function=preprocess_input,
    **augmentation_options,
)

# Iterator over the balanced dataframe. shuffle=False keeps the batches in
# dataframe row order, which the extraction step relies on when it takes the
# labels from df_balanced; class_mode=None means no labels are requested from
# the generator itself.
flow_options = {
    "directory": IMG_TRAIN_PATH,
    "x_col": "filename",
    "y_col": "class_str",
    "target_size": IMG_SIZE,
    "batch_size": BATCH_SIZE,
    "class_mode": None,
    "shuffle": False,
}
train_generator = datagen.flow_from_dataframe(dataframe=df_balanced, **flow_options)

In [None]:
# Feature extraction (can take several hours).
train_features = model.predict(train_generator, verbose=1)

# Take the labels from the files the generator actually kept, not from every
# row of df_balanced: flow_from_dataframe drops rows whose image file is
# missing or invalid (it only emits a warning), which would otherwise shift
# the labels relative to the extracted features. When nothing was dropped this
# is identical to df_balanced['prdtypecode'].values.
# NOTE(review): assumes a given filename always maps to a single prdtypecode.
label_by_filename = (
    df_balanced.drop_duplicates('filename').set_index('filename')['prdtypecode']
)
train_labels = label_by_filename.loc[train_generator.filenames].values

print(f"Features shape: {train_features.shape}")

## 5. Extraction Test (sans augmentation)

In [None]:
# Load the test descriptors; the CSV index is kept because it is saved later
# as the test ids.
df_test = pd.read_csv(os.path.join(CSV_PATH, "X_test_update.csv"), index_col=0)
# Vectorized filename construction (same strings as a row-by-row apply,
# without one Python call per row).
df_test['filename'] = (
    "image_" + df_test['imageid'].astype(str)
    + "_product_" + df_test['productid'].astype(str)
    + ".jpg"
)

# No augmentation for the test set: only the ResNet50 input preprocessing.
test_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

test_generator = test_datagen.flow_from_dataframe(
    dataframe=df_test,
    directory=IMG_TEST_PATH,
    x_col="filename",
    y_col=None,
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode=None,
    shuffle=False
)

# flow_from_dataframe drops rows whose image file is missing or invalid (it
# only emits a warning). Keep df_test restricted to the files that were
# actually loaded so that df_test.index stays aligned row-for-row with
# test_features; this is a no-op when every file was found.
df_test = df_test[df_test['filename'].isin(test_generator.filenames)]

test_features = model.predict(test_generator, verbose=1)
print(f"Test features shape: {test_features.shape}")

## 6. Sauvegarde

In [None]:
# Create the output folder if needed, then persist features, labels and ids.
os.makedirs(OUTPUT_PATH, exist_ok=True)

np.save(os.path.join(OUTPUT_PATH, 'train_features_resnet50.npy'), train_features)
np.save(os.path.join(OUTPUT_PATH, 'train_labels.npy'), train_labels)
np.save(os.path.join(OUTPUT_PATH, 'test_features_resnet50.npy'), test_features)
np.save(os.path.join(OUTPUT_PATH, 'test_ids.npy'), df_test.index.values)

# Metadata
# Map each class code (as a string) to a contiguous index following the sorted
# order of the codes.
class_mapping = {str(c): i for i, c in enumerate(sorted(df['prdtypecode'].unique()))}
# Derive the counts from the data instead of hard-coding them, so the metadata
# cannot disagree with the saved arrays if the dataset or the backbone changes.
metadata = {
    'num_classes': len(class_mapping),
    'feature_dim': int(train_features.shape[1]),
    'class_mapping': class_mapping,
}
with open(os.path.join(OUTPUT_PATH, 'metadata.json'), 'w', encoding='utf-8') as f:
    json.dump(metadata, f)

print("Done.")