In [1]:
# ======================================
# 1️⃣ Imports
# ======================================
import os, shutil, re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.io
import kagglehub

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.applications.efficientnet import preprocess_input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Dropout, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.metrics import TopKCategoricalAccuracy

from PIL import Image

In [2]:

# ======================================
# 2️⃣ Download Dataset
# ======================================
print("Downloading dataset...")
DATASET_PATH = kagglehub.dataset_download("eduardo4jesus/stanford-cars-dataset")
print("Dataset path:", DATASET_PATH)

# Filled in by the directory scan below; anything still None afterwards
# means the expected file/folder was not found in the download.
IMAGES_PATH = ANNOS_PATH = META_PATH = None

# Scan every directory under the download root. Each check is independent,
# and a later match overwrites an earlier one (so for nested folders with
# the same name, the last one visited by os.walk is kept).
for folder, _subfolders, filenames in os.walk(DATASET_PATH):
    if os.path.basename(folder) == "cars_train":
        IMAGES_PATH = folder
    if "cars_meta.mat" in filenames:
        META_PATH = os.path.join(folder, "cars_meta.mat")
    if "cars_train_annos.mat" in filenames:
        ANNOS_PATH = os.path.join(folder, "cars_train_annos.mat")

print("Images path:", IMAGES_PATH)
print("Annotations path:", ANNOS_PATH)
print("Meta path:", META_PATH)

# Fail fast when any of the three required locations is missing.
if any(located is None for located in (IMAGES_PATH, ANNOS_PATH, META_PATH)):
    raise FileNotFoundError("Could not locate cars_train / cars_train_annos.mat / cars_meta.mat داخل DATASET_PATH")


Downloading dataset...
Using Colab cache for faster access to the 'stanford-cars-dataset' dataset.
Dataset path: /kaggle/input/stanford-cars-dataset
Images path: /kaggle/input/stanford-cars-dataset/cars_train/cars_train
Annotations path: /kaggle/input/stanford-cars-dataset/car_devkit/devkit/cars_train_annos.mat
Meta path: /kaggle/input/stanford-cars-dataset/car_devkit/devkit/cars_meta.mat


In [3]:

# ======================================
# 2.5️⃣ Load Class Names (cars_meta.mat)
# ======================================
# The class names live under the "class_names" key of the meta file;
# index 0 selects the row holding one record per class.
meta = scipy.io.loadmat(META_PATH)
class_names = meta["class_names"][0]

# Forward lookup: 1-based class id -> car name (first element of each record).
id_to_name = {
    class_id: record[0]
    for class_id, record in enumerate(class_names, start=1)
}

# Reverse lookup: car name -> original class id.
name_to_id = {car_name: class_id for class_id, car_name in id_to_name.items()}

print("Example class 1 name:", id_to_name[1])



Example class 1 name: AM General Hummer SUV 2000


In [4]:
# ======================================
# 3️⃣ Load Annotations
# ======================================
annos = scipy.io.loadmat(ANNOS_PATH)
annotations = annos["annotations"][0]

# Group image file names by class id.
# Within each annotation record, position 4 is read as the class id and
# position 5 as the image file name.
class_dict = {}
for entry in annotations:
    class_id = int(entry[4][0][0])
    img_name = entry[5][0]
    if class_id not in class_dict:
        class_dict[class_id] = []
    class_dict[class_id].append(img_name)

print("Total classes in train annos:", len(class_dict))


Total classes in train annos: 196


In [5]:
# ======================================
# 4️⃣ Select Top 40 Classes + Create folders by REAL CAR NAME
# ======================================
def safe_folder_name(name: str) -> str:
    """Turn a class name into a string usable as a directory name.

    Leading and trailing whitespace is removed first; then each backslash,
    slash, asterisk, question mark, colon, double quote, angle bracket and
    pipe character is replaced by an underscore (Windows-safe).

    Args:
        name: Raw class name.

    Returns:
        The sanitized name.
    """
    return re.sub(r'[\\/*?:"<>|]', "_", name.strip())

# Rank classes by number of training images (largest first) and keep the
# ids of the 40 best-represented ones.
sorted_classes = sorted(
    class_dict.items(),
    key=lambda pair: len(pair[1]),
    reverse=True,
)
selected_classes = [pair[0] for pair in sorted_classes[:40]]

OUTPUT_PATH = "/kaggle/working/cars_40_classes"
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Copy every image of each selected class into a folder named after the car;
# count source files that are listed in the annotations but absent on disk.
missing = 0
for cls_id in selected_classes:
    # Fall back to a placeholder when the id has no entry in the name table.
    car_name = id_to_name.get(cls_id, f"unknown_{cls_id}")
    class_folder = os.path.join(OUTPUT_PATH, safe_folder_name(car_name))
    os.makedirs(class_folder, exist_ok=True)

    for img in class_dict[cls_id]:
        src = os.path.join(IMAGES_PATH, img)
        if not os.path.exists(src):
            missing += 1
            continue
        shutil.copy(src, os.path.join(class_folder, img))

print("✅ Created dataset with 40 car-name classes at:", OUTPUT_PATH)
print("⚠️ Missing images:", missing)



✅ Created dataset with 40 car-name classes at: /kaggle/working/cars_40_classes
⚠️ Missing images: 0


In [6]:
# ======================================
# 5️⃣ Build DataFrame
# ======================================
DATA_PATH = OUTPUT_PATH
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')

# One record per image: file path + label (the name of its class folder).
# Both directory listings are sorted because os.listdir returns entries in
# arbitrary order, and the seeded train/val/test split depends on row order:
# an unsorted listing would let the split differ between runs or machines.
data = []
for cls in sorted(os.listdir(DATA_PATH)):
    cls_path = os.path.join(DATA_PATH, cls)
    if not os.path.isdir(cls_path):
        # Skip stray files sitting next to the class folders.
        continue
    for img in sorted(os.listdir(cls_path)):
        if img.lower().endswith(IMAGE_EXTENSIONS):
            data.append({"filename": os.path.join(cls_path, img), "label": cls})

# Pin the columns so the frame keeps its schema even when no image was found
# (otherwise df["label"] below would raise KeyError on an empty frame).
df = pd.DataFrame(data, columns=["filename", "label"])
print("Total images:", len(df))
print("Total classes:", df["label"].nunique())
print("Sample labels:", df["label"].unique()[:5])


Total images: 1847
Total classes: 40
Sample labels: ['AM General Hummer SUV 2000' 'Acura Integra Type R 2001'
 'Aston Martin V8 Vantage Convertible 2012' 'Audi S4 Sedan 2007'
 'Audi S6 Sedan 2011']


In [7]:
# ======================================
# 6️⃣ Train / Validation / Test Split (70% / 10% / 20%)
# ======================================

# Single seed used for both splits and for the generators' shuffling and
# random augmentation, so a fresh run reproduces the same batches.
SEED = 42

# 1) Split out TEST (20%)
df_temp, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["label"],
    random_state=SEED
)

# 2) Split remaining 80% into TRAIN (70%) and VAL (10%)
#    10% of total = 0.1, but from df_temp (which is 0.8 of total):
#    val_ratio_inside_temp = 0.1 / 0.8 = 0.125
train_df, val_df = train_test_split(
    df_temp,
    test_size=0.125,
    stratify=df_temp["label"],
    random_state=SEED
)

IMG_SIZE = 299
BATCH_SIZE = 32

# One sorted label list shared by all three generators. Without it each
# generator derives its label -> index mapping from its own split, so the
# mappings only agree if every split happens to contain every class.
CLASS_LABELS = sorted(df["label"].unique())

# Train augmentation only
train_datagen = ImageDataGenerator(
    preprocessing_function=preprocess_input,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    zoom_range=0.25,
    shear_range=0.2,
    horizontal_flip=True,
    brightness_range=[0.7, 1.3]
)

# Val/Test no augmentation
val_datagen  = ImageDataGenerator(preprocessing_function=preprocess_input)
test_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)


def _make_generator(datagen, frame, shuffle):
    """Build a batch iterator over `frame` with the notebook's shared settings.

    Args:
        datagen: ImageDataGenerator that supplies preprocessing/augmentation.
        frame: DataFrame with "filename" and "label" columns.
        shuffle: Whether the iterator shuffles samples between epochs.

    Returns:
        The iterator returned by `datagen.flow_from_dataframe`, yielding
        one-hot ("categorical") labels indexed according to CLASS_LABELS.
    """
    return datagen.flow_from_dataframe(
        frame,
        x_col="filename",
        y_col="label",
        classes=CLASS_LABELS,
        target_size=(IMG_SIZE, IMG_SIZE),
        batch_size=BATCH_SIZE,
        class_mode="categorical",
        shuffle=shuffle,
        seed=SEED,
    )


# Generators: only the training data is shuffled; validation and test keep
# a fixed order so predictions line up with the DataFrame rows.
train_gen = _make_generator(train_datagen, train_df, shuffle=True)
val_gen = _make_generator(val_datagen, val_df, shuffle=False)
test_gen = _make_generator(test_datagen, test_df, shuffle=False)

NUM_CLASSES = len(train_gen.class_indices)
print("NUM_CLASSES:", NUM_CLASSES)

total_samples = len(df)
print(f"Train samples: {len(train_df)} ({len(train_df)/total_samples*100:.1f}%)")
print(f"Validation samples: {len(val_df)} ({len(val_df)/total_samples*100:.1f}%)")
print(f"Test samples: {len(test_df)} ({len(test_df)/total_samples*100:.1f}%)")


Found 1292 validated image filenames belonging to 40 classes.
Found 185 validated image filenames belonging to 40 classes.
Found 370 validated image filenames belonging to 40 classes.
NUM_CLASSES: 40
Train samples: 1292 (70.0%)
Validation samples: 185 (10.0%)
Test samples: 370 (20.0%)
