In [None]:
import os
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import warnings
import shutil

In [None]:
# NOTE(review): this silences *every* warning for the rest of the session;
# narrow the filter to specific categories/modules if possible.
warnings.filterwarnings('ignore')
print("Mounting Google Drive...")
try:
    # Importing google.colab doubles as the "are we on Colab?" check.
    from google.colab import drive
    drive.mount('/content/drive')
    BASE_PATH = '/content/drive/MyDrive/Vehicles_Dataset'
    # Scratch directory where label files are written before being zipped
    # into BASE_PATH.
    LOCAL_WORK_DIR = '/content'
except ImportError:
    print("Not running in Colab. Using local paths.")
    BASE_PATH = './data'
    # '/content' is a Colab path; when running locally keep the scratch
    # output inside the current working directory instead.
    LOCAL_WORK_DIR = '.'

# Inputs (read from BASE_PATH).
MASTER_CSV_PATH = os.path.join(BASE_PATH, 'master_df.csv')
IMAGE_PATH = os.path.join(BASE_PATH, 'Images')

# Outputs: labels are generated under the scratch directory, then the zipped
# result is stored next to the dataset. On Colab YOLO_LABEL_PATH is still
# '/content/labels_yolo'.
YOLO_LABEL_PATH = os.path.join(LOCAL_WORK_DIR, 'labels_yolo')
YOLO_ZIP_PATH = os.path.join(BASE_PATH, 'labels_yolo.zip')

# Image size in pixels used to normalize box coordinates.
# NOTE(review): assumes every image is 1280x720 — confirm against the dataset.
IMG_WIDTH = 1280
IMG_HEIGHT = 720

print(f"Base Path (Drive): {BASE_PATH}")
print(f"Master CSV (Drive): {MASTER_CSV_PATH}")
print(f"Image Path (Drive): {IMAGE_PATH}")
print(f"Local Label Path (Colab): {YOLO_LABEL_PATH}")
print(f"Final Zip Path (Drive): {YOLO_ZIP_PATH}")

In [None]:
# Target detection classes. The position of a name in this list is the
# integer class id written into the YOLO label files.
CATEGORY_NAMES = [
    'car',
    'person',
    'traffic sign',
    'bus',
    'truck',
    'train',
    'rider',
    'motor',
    'bike',
    'traffic light',
]

# name -> class id lookup (0-based, in list order).
CATEGORY_MAP = dict(zip(CATEGORY_NAMES, range(len(CATEGORY_NAMES))))

print(f"Categories mapped: {CATEGORY_MAP}")

# Remove any label tree left over from an earlier run, then recreate it empty.
if os.path.exists(YOLO_LABEL_PATH):
    shutil.rmtree(YOLO_LABEL_PATH)

# One output folder per dataset split.
for split in ('train', 'val', 'test'):
    split_path = os.path.join(YOLO_LABEL_PATH, split)
    os.makedirs(split_path, exist_ok=True)

print(f"Local YOLO label directories created in: {YOLO_LABEL_PATH}")

# Load the annotation table; a missing file is reported with a hint and the
# exception is re-raised so the notebook stops here.
print(f"Loading master data from {MASTER_CSV_PATH}...")
try:
    master_df = pd.read_csv(MASTER_CSV_PATH)
except FileNotFoundError:
    print(f"ERROR: {MASTER_CSV_PATH} not found.")
    print("Please run the 'Milestone_1_EDA.ipynb' notebook first.")
    raise
else:
    print("Setup complete. master_df is loaded.")
    print(f"Total annotations to process: {len(master_df):,}")
    master_df.info()

In [None]:
# --- 5. YOLO Conversion Functions ---
def convert_to_yolo(row, img_w=IMG_WIDTH, img_h=IMG_HEIGHT):
    """
    Converts a master_df row (x1, y1, x2, y2) to YOLO format
    (class_id, x_center_norm, y_center_norm, width_norm, height_norm).

    Args:
        row: Mapping-like row (pandas Series or dict) providing 'category'
            and the pixel-space corner coordinates 'x1', 'y1', 'x2', 'y2'.
        img_w: Image width in pixels used for clipping and normalization.
        img_h: Image height in pixels used for clipping and normalization.

    Returns:
        The label line "class_id x_center y_center width height", with the
        four box values normalized to [0, 1] and printed with six decimals,
        or None when the row must be skipped:
          * the category is 'background' or not present in CATEGORY_MAP,
          * a coordinate is NaN / infinite,
          * no area of the box remains inside the image.
    """
    # 1. Get Class ID
    category = row['category']
    class_id = CATEGORY_MAP.get(category)
    if class_id is None or category == 'background':
        return None  # Skip 'background' or other non-target categories

    # 2. Read the corners. Non-finite values would otherwise end up in the
    #    label file as the text "nan"/"inf", so such rows are dropped.
    x1, y1, x2, y2 = (float(row[key]) for key in ('x1', 'y1', 'x2', 'y2'))
    if not np.isfinite([x1, y1, x2, y2]).all():
        return None

    # 3. Order each corner pair and clip it to the image BEFORE deriving the
    #    centre and size. Clamping centre and size independently afterwards
    #    does not truncate a box that overhangs the image edge: it yields a
    #    box whose extent still reaches outside [0, 1].
    left, right = np.clip(sorted((x1, x2)), 0, img_w)
    top, bottom = np.clip(sorted((y1, y2)), 0, img_h)

    box_w = right - left
    box_h = bottom - top
    if box_w <= 0 or box_h <= 0:
        return None  # Nothing of the box is visible inside the image

    # 4. Normalized centre and size (identical to the plain formulas for any
    #    box that already lies inside the image).
    x_center_norm = ((left + right) / 2) / img_w
    y_center_norm = ((top + bottom) / 2) / img_h
    width_norm = box_w / img_w
    height_norm = box_h / img_h

    return f"{class_id} {x_center_norm:.6f} {y_center_norm:.6f} {width_norm:.6f} {height_norm:.6f}"

def create_yolo_labels(df, base_save_dir):
    """
    Iterates through the DataFrame, converts labels, and saves them to
    the appropriate train/val/test directories.

    Args:
        df: Annotation table with at least the columns 'split',
            'image_name' and whatever convert_to_yolo() reads from a row.
        base_save_dir: Directory under which one sub-folder per distinct
            'split' value is written (created if it does not exist yet).

    Returns:
        None. For every (split, image_name) group one
        '<image stem>.txt' file is written containing one label line per
        convertible annotation; the file is left empty when no annotation
        of the image converts (a negative sample).
    """
    print("Starting YOLO label conversion (writing to local disk)...")

    for split, split_df in df.groupby('split'):
        print(f"\nProcessing '{split}' split...")
        save_path_split = os.path.join(base_save_dir, split)
        # Do not depend on another cell having created the folder: this also
        # covers a different base_save_dir or an unexpected split value.
        os.makedirs(save_path_split, exist_ok=True)

        image_groups = split_df.groupby('image_name')
        for image_name, rows in tqdm(image_groups, desc=f"Creating {split} labels"):

            # Get the base name for the .txt file
            # (e.g., 'b001a7ce-36f3fff2.jpg' -> 'b001a7ce-36f3fff2')
            txt_filename = os.path.splitext(image_name)[0] + '.txt'
            txt_filepath = os.path.join(save_path_split, txt_filename)

            # Plain dict records avoid building a Series per row, which is
            # what makes iterrows() slow on large tables.
            converted = (convert_to_yolo(row) for row in rows.to_dict('records'))
            yolo_labels = [label for label in converted if label]

            # Joining an empty list writes nothing, which produces the
            # empty file used for images without target objects.
            with open(txt_filepath, 'w') as f:
                f.write('\n'.join(yolo_labels))

In [None]:
# --- 6. Run Preprocessing ---
create_yolo_labels(master_df, YOLO_LABEL_PATH)
print("\n--- ZIPPING AND COPYING TO DRIVE ---")
print("Zipping labels... (This may take a minute or two)")

# Pure-Python replacement for the former `!zip` / `!cp` shell lines:
#   * follows YOLO_LABEL_PATH instead of a second hard-coded copy of the path,
#   * raises on failure, so the "Done!" message below is only printed when the
#     archive really exists at its destination,
#   * always writes a fresh archive (no entries left over from earlier runs),
#   * is not affected by spaces or shell metacharacters in the paths.
# Entries are stored relative to the filesystem root (for the default Colab
# path: 'content/labels_yolo/<split>/...'), the same layout `zip -r` produced
# for the absolute path, so existing unzip steps should keep working.
label_dir = os.path.abspath(YOLO_LABEL_PATH)
archive_root = os.path.splitdrive(label_dir)[0] + os.sep
local_zip_path = shutil.make_archive(
    base_name=label_dir,  # '.zip' is appended by make_archive
    format='zip',
    root_dir=archive_root,
    base_dir=os.path.relpath(label_dir, archive_root),
)

print("Copying zip file to Google Drive...")
shutil.copy(local_zip_path, YOLO_ZIP_PATH)

print(f"Done! Your labels are now saved in: {YOLO_ZIP_PATH}")

print("\n--- OFFLINE PREPROCESSING COMPLETE ---")
print("You now have:")
print(f"1. Original Images in: {IMAGE_PATH}")
print(f"2. Zipped YOLO Labels in: {YOLO_ZIP_PATH}")
print("\nThis is your 'Preprocessed Data' deliverable, ready for Milestone 2.")
print("To use these, you will unzip them in your training notebook.")

In [None]:
def count_files(folder):
    """Return how many files exist anywhere beneath `folder` (recursive).

    A folder that does not exist yields 0, because os.walk() produces no
    entries for it.
    """
    total = 0
    for _dirpath, _dirnames, filenames in os.walk(folder):
        total += len(filenames)
    return total

# Sanity check: report how many files ended up in each split's label folder.
# count_files() walks each folder recursively and counts every file it finds,
# so the last line is the number of files of any type under IMAGE_PATH,
# not strictly the number of images.
print("\n Folder Summary:")
print(f"Train label files: {count_files(os.path.join(YOLO_LABEL_PATH, 'train'))}")
print(f"Validation label files: {count_files(os.path.join(YOLO_LABEL_PATH, 'val'))}")
print(f"Test label files: {count_files(os.path.join(YOLO_LABEL_PATH, 'test'))}")
print(f"Total images folder: {count_files(IMAGE_PATH)}")

# Preprocessing Data Summary

---

## 1. Preprocessing Objectives

The **primary objective** of this phase was to transform the cleaned dataset `master_df.csv` into the structure and annotation format required for **training a YOLO (You Only Look Once)** model.

The **“Preprocessed Data”** deliverable consists of the original images paired with **new, machine-readable label files**.  
This notebook focused exclusively on **offline label conversion**.

> **Note on Augmentation**  
> Image resizing and data augmentation (e.g., random cropping, flipping, color shifts) were **not performed offline**.  
> These operations are handled *online* (in memory during training) by the model’s data loader.  
> This is a standard practice that:
> - Saves disk space  
> - Provides greater augmentation variability

---

## 2.  Core Preprocessing Task: Label Conversion

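The **BDD100K** annotations stored in `master_df.csv` describe each bounding box by its absolute pixel corners (`x1, y1, x2, y2`) — the corner-based convention used by **Pascal VOC** — and must be converted to **YOLO format**.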

###  Format Comparison

| **Attribute** | **Pascal VOC (Source)** | **YOLO (Target)** |
|----------------|--------------------------|--------------------|
| **Coordinates** | `x_min, y_min, x_max, y_max` | `x_center_norm, y_center_norm` |
| **Dimensions** | (Implicit) | `width_norm, height_norm` |
| **Units** | Pixels (Absolute) | Ratios (Normalized 0.0 → 1.0) |
| **Class ID** | String name (e.g., `"car"`) | Integer ID (e.g., `0`) |

---

###  Conversion Process

A `convert_to_yolo()` function was implemented to process each annotation as follows:

1. **Map Class ID**  
   The category name (e.g., `"car"`) was mapped to its integer ID (e.g., `0`) using a predefined `CATEGORY_MAP`.

2. **Calculate Center and Dimensions**  
   Convert absolute coordinates to center-based format:

   $$
   x_{center} = \frac{x_1 + x_2}{2}
   $$

   $$
   y_{center} = \frac{y_1 + y_2}{2}
   $$

   $$
   width = x_2 - x_1
   $$

   $$
   height = y_2 - y_1
   $$

3. **Normalize Values**  
   Normalize by image dimensions (1280×720):

   $$
   x_{center\_norm} = \frac{x_{center}}{1280}
   $$

   $$
   y_{center\_norm} = \frac{y_{center}}{720}
   $$

   $$
   width\_norm = \frac{width}{1280}
   $$

   $$
   height\_norm = \frac{height}{720}
   $$

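4. **Generate YOLO Label String**  
   The normalized values were clamped so that each lies within `[0, 1]` (a guard against bad labels), and each object’s label was then written as one line in YOLO format, with six decimal places per value:

   ```
   class_id x_center_norm y_center_norm width_norm height_norm
   ```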


---

## 3. Final File Generation

The `create_yolo_labels()` function finalized the deliverable by iterating through **1.8 million annotations**:

- **Grouping** → Annotations were grouped by split (`train`, `val`, `test`) and by `image_name`.  
- **File Creation** → For each image (e.g., `b001a7ce-36f3fff2.jpg`), a corresponding text file (`b001a7ce-36f3fff2.txt`) was created inside the appropriate subdirectory under `labels_yolo/`.  
- **Negative Samples** → For images whose annotations contain only the `"background"` category (or other categories outside `CATEGORY_MAP`), an **empty `.txt` file** was created.  
  These negative samples are critical — they teach the model which images contain *no objects of interest*.

---

## 4. Deliverable Summary: *“Preprocessed Data”*

This notebook successfully generated the **Preprocessed Data deliverable** for **Milestone 1**.  
The resulting directory structure is now ready for **model training in Milestone 2**.

---

### Dataset Structure

| **Directory**   | **Subfolder** | **Description** |
|-----------------|----------------|-----------------|
| **Images**      | train          | Original images |
|                 | val            | Original images |
|                 | test           | Original images |
| **labels_yolo** | train          | ~70,000 .txt label files |
|                 | val            | ~10,000 .txt label files |
|                 | test           | ~20,000 .txt label files |

---

 **Status:** Preprocessing completed successfully.  
The dataset is now fully prepared for YOLO model training.
