In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split
from torchvision import datasets, transforms, models
import os
import shutil
import json
import zipfile
import numpy as np
from tqdm.notebook import tqdm
from google.colab import drive
from PIL import Image, UnidentifiedImageError # Import UnidentifiedImageError

# --- CONFIGURATION ---
PROJECT_NAME = "BMW_Universal_Classifier"
INPUT_SIZE = 380  # EfficientNet-B4 Native Resolution
BATCH_SIZE = 64   # L4 Optimized
NUM_EPOCHS = 20

# 1. MOUNT DRIVE (Root Access)
drive.mount('/content/drive')
DRIVE_ROOT = "/content/drive/MyDrive" # Files are here

# Check what we are working with
ZIP_NAME = "ddg_custom_bmw_dataset.zip"
FOLDER_NAME = "ddg_custom_bmw_dataset"

ZIP_PATH = os.path.join(DRIVE_ROOT, ZIP_NAME)
FOLDER_PATH = os.path.join(DRIVE_ROOT, FOLDER_NAME)

# Local Fast SSD Paths
LOCAL_WORK_DIR = "/content/bmw_work_dir"
LOCAL_DATA_DIR = os.path.join(LOCAL_WORK_DIR, FOLDER_NAME)

MODEL_SAVE_PATH = os.path.join(DRIVE_ROOT, "bmw_final_model.pth")
JSON_SAVE_PATH = os.path.join(DRIVE_ROOT, "bmw_class_map.json")

# 2. DEVICE SETUP
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Training on: {device}")

# 3. SMART DATA TRANSFER
def setup_data():
    if os.path.exists(LOCAL_DATA_DIR):
        print("Data already on local disk. Skipping copy.")
        return

    os.makedirs(LOCAL_WORK_DIR, exist_ok=True)

    # Strategy A: Priority - Use Zip (Fastest)
    if os.path.exists(ZIP_PATH):
        print(f"Found ZIP at {ZIP_PATH}. Copying and unzipping...")
        # Copy zip to local first (faster extraction)
        local_zip = os.path.join(LOCAL_WORK_DIR, "temp.zip")
        shutil.copy2(ZIP_PATH, local_zip)

        with zipfile.ZipFile(local_zip, 'r') as zip_ref:
            zip_ref.extractall(LOCAL_WORK_DIR)

        print("Unzip complete.")

    # Strategy B: Fallback - Copy Folder (Slower but works if zip is missing)
    elif os.path.exists(FOLDER_PATH):
        print(f"No ZIP found, but found folder at {FOLDER_PATH}.")
        print("Copying folder structure... (This might take a few minutes)")
        shutil.copytree(FOLDER_PATH, LOCAL_DATA_DIR)
        print("Copy complete.")

    else:
        raise FileNotFoundError(f"CRITICAL: Could not find '{ZIP_NAME}' OR '{FOLDER_NAME}' in {DRIVE_ROOT}")

# Custom function to filter out corrupted images
def is_valid_image(path):
    try:
        with Image.open(path) as img:
            img.verify()
        return True
    except (IOError, SyntaxError, UnidentifiedImageError) as e:
        # print(f"Skipping corrupted or unreadable image: {path} - {e}") # Uncomment for debugging
        return False


# 4. CLASS WEIGHTING (The "Anti-Bias" Math)
def get_class_weights(dataset):
    targets = dataset.targets
    # Count how many images are in each class
    class_counts = np.bincount(targets)

    # Calculate weights: Rare classes get HIGHER weights
    # Formula: Total_Images / (Num_Classes * Image_Count_Per_Class)
    total = len(dataset)
    n_classes = len(dataset.classes)
    weights = total / (n_classes * class_counts)

    # Normalize weights so they aren't too extreme
    weights = torch.FloatTensor(weights).to(device)

    # DEBUG: Print the weights so you can see the logic working
    print("\n--- Class Weighting Debug ---")
    for i, count in enumerate(class_counts):
        label = dataset.classes[i]
        # Only print a few examples
        if i < 3 or i > n_classes - 3 or "non" in label:
            print(f"Class '{label}': {count} images -> Weight: {weights[i]:.4f}")
    print("-----------------------------\n")

    return weights

def train_model():
    setup_data()

    # 5. TRANSFORMS
    train_transform = transforms.Compose([
        transforms.Resize((400, 400)),
        transforms.RandomCrop(INPUT_SIZE),
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(brightness=0.2, contrast=0.2),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    val_transform = transforms.Compose([
        transforms.Resize((INPUT_SIZE, INPUT_SIZE)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    # 6. LOAD & PREPARE
    # Pass the custom is_valid_file function to ImageFolder
    full_dataset = datasets.ImageFolder(LOCAL_DATA_DIR, is_valid_file=is_valid_image)

    # Get Weights based on the IMBALANCE you described
    class_weights = get_class_weights(full_dataset)

    # Split
    train_size = int(0.8 * len(full_dataset))
    val_size = len(full_dataset) - train_size
    train_subset, val_subset = random_split(full_dataset, [train_size, val_size])

    # Transform Wrappers
    class TransformedSubset(torch.utils.data.Dataset):
        def __init__(self, subset, transform=None):
            self.subset = subset
            self.transform = transform
        def __len__(self):
            return len(self.subset)
        def __getitem__(self, idx):
            x, y = self.subset[idx]
            return self.transform(x), y

    train_loader = DataLoader(TransformedSubset(train_subset, train_transform),
                              batch_size=BATCH_SIZE, shuffle=True, num_workers=4, pin_memory=True)
    val_loader = DataLoader(TransformedSubset(val_subset, val_transform),
                            batch_size=BATCH_SIZE, shuffle=False, num_workers=4, pin_memory=True)

    # Save mapping for later use
    idx_to_class = {i: c for i, c in enumerate(full_dataset.classes)}
    with open(JSON_SAVE_PATH, 'w') as f:
        json.dump(idx_to_class, f, indent=4)

    # 7. MODEL (EfficientNet-B4)
    print("Loading EfficientNet-B4...")
    model = models.efficientnet_b4(weights=models.EfficientNet_B4_Weights.DEFAULT)

    # Freeze backbone initially
    for param in model.parameters():
        param.requires_grad = False

    num_ftrs = model.classifier[1].in_features
    model.classifier[1] = nn.Linear(num_ftrs, len(full_dataset.classes))
    model = model.to(device)

    # 8. TRAINING
    # Apply the WEIGHTS here
    criterion = nn.CrossEntropyLoss(weight=class_weights)
    optimizer = optim.AdamW(model.parameters(), lr=1e-3)
    scaler = torch.amp.GradScaler('cuda')

    # Phase 1: Train Head
    print("Phase 1: Warming up head...")
    model.train()
    for inputs, labels in tqdm(train_loader, desc="Warmup"):
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        with torch.amp.autocast('cuda'):
            outputs = model(inputs)
            loss = criterion(outputs, labels)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

    # Phase 2: Train All
    print("Phase 2: Fine-tuning full model...")
    for param in model.parameters():
        param.requires_grad = True

    optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.1, patience=3)

    best_acc = 0.0
    for epoch in range(NUM_EPOCHS):
        model.train()
        running_loss = 0.0
        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS}")

        for inputs, labels in pbar:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            with torch.amp.autocast('cuda'):
                outputs = model(inputs)
                loss = criterion(outputs, labels)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            running_loss += loss.item()
            pbar.set_postfix({'loss': f"{loss.item():.4f}"})

        # Validation
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                with torch.amp.autocast('cuda'):
                    outputs = model(inputs)
                _, preds = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (preds == labels).sum().item()

        epoch_acc = correct / total
        scheduler.step(epoch_acc)
        print(f"Epoch {epoch+1} | Acc: {epoch_acc*100:.2f}%")

        if epoch_acc > best_acc:
            best_acc = epoch_acc
            torch.save(model.state_dict(), MODEL_SAVE_PATH)
            print("--> Saved Best Model")

if __name__ == "__main__":
    train_model()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Training on: cuda
Data already on local disk. Skipping copy.

--- Class Weighting Debug ---
Class 'BMW_02_Series_(E10)': 46 images -> Weight: 1.5335
Class 'BMW_02_Series_Touring_(E6)': 72 images -> Weight: 0.9798
Class 'BMW_2002_Turbo_(E20)': 76 images -> Weight: 0.9282
Class 'non_bmw_cars': 500 images -> Weight: 0.1411
Class 'non_cars': 1049 images -> Weight: 0.0672
-----------------------------

Loading EfficientNet-B4...
Phase 1: Warming up head...


Warmup:   0%|          | 0/134 [00:00<?, ?it/s]



Phase 2: Fine-tuning full model...


Epoch 1/20:   0%|          | 0/134 [00:00<?, ?it/s]



Epoch 1 | Acc: 38.43%
--> Saved Best Model


Epoch 2/20:   0%|          | 0/134 [00:00<?, ?it/s]



Epoch 2 | Acc: 47.63%
--> Saved Best Model


Epoch 3/20:   0%|          | 0/134 [00:00<?, ?it/s]



Epoch 3 | Acc: 54.11%
--> Saved Best Model


Epoch 4/20:   0%|          | 0/134 [00:00<?, ?it/s]



Epoch 4 | Acc: 60.39%
--> Saved Best Model


Epoch 5/20:   0%|          | 0/134 [00:00<?, ?it/s]



Epoch 5 | Acc: 63.82%
--> Saved Best Model


Epoch 6/20:   0%|          | 0/134 [00:00<?, ?it/s]



Epoch 6 | Acc: 67.34%
--> Saved Best Model


Epoch 7/20:   0%|          | 0/134 [00:00<?, ?it/s]



Epoch 7 | Acc: 69.64%
--> Saved Best Model


Epoch 8/20:   0%|          | 0/134 [00:00<?, ?it/s]



Epoch 8 | Acc: 70.15%
--> Saved Best Model


Epoch 9/20:   0%|          | 0/134 [00:00<?, ?it/s]



Epoch 9 | Acc: 72.55%
--> Saved Best Model


Epoch 10/20:   0%|          | 0/134 [00:00<?, ?it/s]



Epoch 10 | Acc: 73.72%
--> Saved Best Model


Epoch 11/20:   0%|          | 0/134 [00:00<?, ?it/s]



Epoch 11 | Acc: 75.32%
--> Saved Best Model


Epoch 12/20:   0%|          | 0/134 [00:00<?, ?it/s]



Epoch 12 | Acc: 75.69%
--> Saved Best Model


Epoch 13/20:   0%|          | 0/134 [00:00<?, ?it/s]



Epoch 13 | Acc: 76.58%
--> Saved Best Model


Epoch 14/20:   0%|          | 0/134 [00:00<?, ?it/s]



Epoch 14 | Acc: 76.96%
--> Saved Best Model


Epoch 15/20:   0%|          | 0/134 [00:00<?, ?it/s]



Epoch 15 | Acc: 76.30%


Epoch 16/20:   0%|          | 0/134 [00:00<?, ?it/s]



Epoch 16 | Acc: 77.62%
--> Saved Best Model


Epoch 17/20:   0%|          | 0/134 [00:00<?, ?it/s]



Epoch 17 | Acc: 78.46%
--> Saved Best Model


Epoch 18/20:   0%|          | 0/134 [00:00<?, ?it/s]



Epoch 18 | Acc: 77.71%


Epoch 19/20:   0%|          | 0/134 [00:00<?, ?it/s]



Epoch 19 | Acc: 78.98%
--> Saved Best Model


Epoch 20/20:   0%|          | 0/134 [00:00<?, ?it/s]



Epoch 20 | Acc: 78.41%


In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, WeightedRandomSampler
from torchvision import datasets, transforms, models
import os
import shutil
import zipfile
import numpy as np
from tqdm.notebook import tqdm
from google.colab import drive
from PIL import Image # <-- Import PIL here for cleaning

# --- CONFIGURATION ---
DRIVE_ZIP_PATH = "/content/drive/MyDrive/ddg_custom_bmw_dataset.zip"
LOCAL_EXTRACT_PATH = "/content/bmw_retrain_data"
OLD_MODEL_PATH = "/content/drive/MyDrive/bmw_final_model.pth"
NEW_MODEL_PATH = "/content/drive/MyDrive/bmw_bugatti_fixed.pth"

BATCH_SIZE = 32
NUM_EPOCHS = 10
HEAD_LR = 1e-3

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def clean_corrupted_images(root_dir):
    """Scans and removes corrupted or unreadable image files."""
    print(f"\nüî¨ Scanning '{root_dir}' for corrupted images...")
    bad_files_count = 0

    for dirpath, dirnames, filenames in os.walk(root_dir):
        # Skip the __MACOSX directory if it wasn't cleaned on extraction
        if '__MACOSX' in dirpath:
             continue

        for filename in filenames:
            if not filename.lower().endswith(('.jpg', '.jpeg', '.png')):
                # Optionally remove non-image files (like hidden files, .DS_Store)
                continue

            file_path = os.path.join(dirpath, filename)

            try:
                # Try to open image and verify integrity
                with Image.open(file_path) as img:
                    img.verify()
            except (IOError, SyntaxError, IndexError, Exception):
                # If error, delete the file
                print(f"    üóëÔ∏è Deleting corrupt file: {filename}")
                os.remove(file_path)
                bad_files_count += 1

    print(f"Sanitization done, removed {bad_files_count} bad files.")

def setup_data_from_zip():
    print("--- STEP 1: Handling Zip File ---")

    # 1. Clean old local data
    if os.path.exists(LOCAL_EXTRACT_PATH):
        shutil.rmtree(LOCAL_EXTRACT_PATH)
    os.makedirs(LOCAL_EXTRACT_PATH, exist_ok=True)

    # 2. Check if Zip exists
    if not os.path.exists(DRIVE_ZIP_PATH):
        raise FileNotFoundError(f"CRITICAL: Could not find zip at {DRIVE_ZIP_PATH}")

    # 3. Copy and Unzip
    print(f"Unzipping {DRIVE_ZIP_PATH} to local disk...")
    with zipfile.ZipFile(DRIVE_ZIP_PATH, 'r') as zip_ref:
        zip_ref.extractall(LOCAL_EXTRACT_PATH)

    print("Unzip complete.")

    # 4. Find the actual image root (IGNORING __MACOSX)
    print("Searching for valid dataset root...")
    for root, dirs, files in os.walk(LOCAL_EXTRACT_PATH):
        if "__MACOSX" in root:
            continue

        if "non_bmw_cars" in dirs and "non_cars" in dirs:
            print(f"Found REAL dataset root at: {root}")
            # --- MANDATORY CLEANING STEP HERE ---
            clean_corrupted_images(root)
            # ------------------------------------
            return root

    raise FileNotFoundError("Could not find class folders (non_bmw_cars) inside the zip!")

def train_bugatti_fix():
    # 1. PREPARE DATA
    data_root = setup_data_from_zip() # Sanitization happens inside this call

    transform = transforms.Compose([
        transforms.Resize((400, 400)),
        transforms.RandomCrop(380),
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(brightness=0.3, contrast=0.3),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    # ImageFolder will now load a clean dataset
    dataset = datasets.ImageFolder(data_root, transform=transform)

    # 2. CHECK FOR BUGATTIS (Sanity Check)
    try:
        non_bmw_idx = dataset.class_to_idx['non_bmw_cars']
        targets = np.array(dataset.targets)
        count = np.sum(targets == non_bmw_idx)
        print(f"Verified: Found {count} images in 'non_bmw_cars'.")
        if count == 0:
            raise ValueError("Found 'non_bmw_cars' folder but it is empty!")
    except KeyError:
        raise ValueError("CRITICAL: 'non_bmw_cars' folder is missing! Check your zip file structure.")

    # 3. WEIGHTED SAMPLER
    print("Calculating weights...")
    class_counts = np.bincount(targets)
    class_weights = np.zeros_like(class_counts, dtype=np.float32)
    for i, c in enumerate(class_counts):
        if c > 0:
            class_weights[i] = 1.0 / c

    # BOOST the Non-BMW weight
    class_weights[non_bmw_idx] *= 3.0

    # Sampler needs weights converted to a PyTorch tensor
    sample_weights = class_weights[targets]
    sampler = WeightedRandomSampler(torch.from_numpy(sample_weights), len(sample_weights), replacement=True)

    # num_workers=2 is recommended for fast loading
    train_loader = DataLoader(dataset, batch_size=BATCH_SIZE, sampler=sampler, num_workers=2)

    # 4. LOAD MODEL
    print("\n--- STEP 2: Loading Model & Freezing Backbone ---")
    model = models.efficientnet_b4(weights=None)
    model.classifier[1] = nn.Linear(model.classifier[1].in_features, len(dataset.classes))

    # Load previous best weights
    model.load_state_dict(torch.load(OLD_MODEL_PATH, map_location=device))

    # FREEZE BACKBONE
    for param in model.features.parameters():
        param.requires_grad = False

    model.to(device)
    model.train()

    # 5. TRAINING LOOP
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.classifier.parameters(), lr=HEAD_LR)
    scaler = torch.amp.GradScaler('cuda')

    print("\n--- STEP 3: Retraining Head (Bugatti Correction) ---")
    for epoch in range(NUM_EPOCHS):
        running_loss = 0.0
        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS}")

        for inputs, labels in pbar:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()

            with torch.amp.autocast('cuda'):
                outputs = model(inputs)
                loss = criterion(outputs, labels)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            running_loss += loss.item()
            pbar.set_postfix({'loss': f"{loss.item():.4f}"})

    # 6. SAVE
    torch.save(model.state_dict(), NEW_MODEL_PATH)
    print(f"\n‚úÖ Retraining Complete! Model saved to {NEW_MODEL_PATH}")

if __name__ == "__main__":
    if not os.path.exists("/content/drive"):
        drive.mount('/content/drive')
    train_bugatti_fix()

--- STEP 1: Handling Zip File ---
Unzipping /content/drive/MyDrive/ddg_custom_bmw_dataset.zip to local disk...
Unzip complete.
Searching for valid dataset root...
Found REAL dataset root at: /content/bmw_retrain_data/ddg_custom_bmw_dataset

üî¨ Scanning '/content/bmw_retrain_data/ddg_custom_bmw_dataset' for corrupted images...
    üóëÔ∏è Deleting corrupt file: beach_sunset_029.jpg
Sanitization done, removed 1 bad files.
Verified: Found 581 images in 'non_bmw_cars'.
Calculating weights...

--- STEP 2: Loading Model & Freezing Backbone ---

--- STEP 3: Retraining Head (Bugatti Correction) ---


Epoch 1/10:   0%|          | 0/336 [00:00<?, ?it/s]

Epoch 2/10:   0%|          | 0/336 [00:00<?, ?it/s]

Epoch 3/10:   0%|          | 0/336 [00:00<?, ?it/s]

Epoch 4/10:   0%|          | 0/336 [00:00<?, ?it/s]

Epoch 5/10:   0%|          | 0/336 [00:00<?, ?it/s]

Epoch 6/10:   0%|          | 0/336 [00:00<?, ?it/s]

Epoch 7/10:   0%|          | 0/336 [00:00<?, ?it/s]

Epoch 8/10:   0%|          | 0/336 [00:00<?, ?it/s]

Epoch 9/10:   0%|          | 0/336 [00:00<?, ?it/s]

Epoch 10/10:   0%|          | 0/336 [00:00<?, ?it/s]


‚úÖ Retraining Complete! Model saved to /content/drive/MyDrive/bmw_bugatti_fixed.pth
