# 01 — YOLOv8 Training
## Heavy Vehicle Detection During Peak-Hour Ban Periods

**Module:** MAICEN 1125 · M4 · U3 · FMP Group Assignment

This notebook covers:
- Environment setup & GPU check
- Dataset download from GitHub (no API key required)
- YOLOv8n model training (50 epochs full / 5 epochs verify)
- Training metrics + curves
- Saving best weights

> ▶ **Runtime → Change runtime type → T4 GPU** before running.

After training, move to **`02_Inference.ipynb`** for validation + new-image predictions.

---

## Cell 1 — Install Dependencies & Check GPU

In [None]:
!pip install ultralytics --quiet

import torch
import ultralytics
from ultralytics import YOLO

print(f"Ultralytics : {ultralytics.__version__}")
print(f"PyTorch     : {torch.__version__}")
print(f"CUDA        : {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU         : {torch.cuda.get_device_name(0)}")

# Save pip freeze snippet for reproducibility
!pip freeze | grep -E 'ultralytics|torch' > /content/pip_freeze_snippet.txt
print("\npip freeze snippet → /content/pip_freeze_snippet.txt")

## Cell 2 — Configuration

In [None]:
# ─── CONFIGURATION ────────────────────────────────────────────────────────────
# Every value a reader might tune lives in this cell; later cells only read them.
from pathlib import Path

VERIFY_ONLY   = False          # True = 5-epoch quick check; False = full 50-epoch run
MODEL_VARIANT = 'yolov8n.pt'  # yolov8n (nano) | yolov8s (small)
EPOCHS        = 5 if VERIFY_ONLY else 50
BATCH         = 16
IMGSZ         = 640
PROJECT_NAME  = 'heavy_vehicle_detection'
# Derive the run name from the chosen weights file so that switching
# MODEL_VARIANT (e.g. to 'yolov8s.pt') does not write into a folder that is
# still labelled 'yolov8n'. With the default variant this is 'yolov8n_50ep'.
RUN_NAME      = f'{Path(MODEL_VARIANT).stem}_{EPOCHS}ep'

# GitHub repository
GITHUB_REPO   = "https://github.com/archsalem101/Automatic-Detection-of-Heavy-Vehicles-and-Trailers-During-Peak-Hour-Ban-Periods.git"
REPO_NAME     = "Automatic-Detection-of-Heavy-Vehicles-and-Trailers-During-Peak-Hour-Ban-Periods"
# The dataset folder name inside the repo contains a space ("images dataset").
DATASET_DIR   = f"/content/{REPO_NAME}/images dataset"

# The epoch count in the banner is taken from EPOCHS so it cannot drift from
# the value actually used for training.
print(f"Mode      : {f'VERIFY ({EPOCHS} ep)' if VERIFY_ONLY else f'FULL TRAINING ({EPOCHS} ep)'}")
print(f"Model     : {MODEL_VARIANT}")
print(f"Epochs    : {EPOCHS} | Batch: {BATCH} | Img: {IMGSZ}px")

## Cell 3 — Clone Dataset from GitHub

No API key required. The dataset lives at `images dataset/` inside the repo.

In [None]:
import os
import yaml

# Clone the repository (skip if already cloned)
if not os.path.exists(f"/content/{REPO_NAME}"):
    print("Cloning repository...")
    !git clone --depth 1 "{GITHUB_REPO}" "/content/{REPO_NAME}"
else:
    print("Repository already cloned — pulling latest...")
    !git -C "/content/{REPO_NAME}" pull

print(f"\nDataset folder: {DATASET_DIR}")
print("Contents:", os.listdir(DATASET_DIR))

# ── Locate or create data.yaml ──────────────────────────────────────────────
DATA_YAML = os.path.join(DATASET_DIR, "data.yaml")

if not os.path.exists(DATA_YAML):
    print("data.yaml not found — generating from folder structure...")
    yaml_content = {
        "train": os.path.join(DATASET_DIR, "train", "images"),
        "val":   os.path.join(DATASET_DIR, "valid", "images"),
        "test":  os.path.join(DATASET_DIR, "test",  "images"),
        "nc":    3,
        "names": ["bus", "car", "truck"]
    }
    with open(DATA_YAML, "w") as f:
        yaml.dump(yaml_content, f, default_flow_style=False)
    print(f"data.yaml created at: {DATA_YAML}")
else:
    print(f"data.yaml found at: {DATA_YAML}")

# Confirm classes
with open(DATA_YAML) as f:
    cfg = yaml.safe_load(f)
print(f"Classes ({cfg['nc']}): {cfg['names']}")

## Cell 4 — Dataset Sanity Check (Image Count & Class Distribution)

In [None]:
import glob
import random
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from collections import Counter

# Only files with these extensions are counted/shown as images, so stray files
# in the image folders (e.g. .gitkeep, .DS_Store) are not counted and cannot
# crash the image reader.
IMAGE_EXTS  = ('.jpg', '.jpeg', '.png', '.bmp', '.tif', '.tiff', '.webp')
SAMPLE_SEED = 42   # fixed seed → the same sample grid on every run


def list_images(folder):
    """Return the sorted names of the image files directly inside `folder`."""
    return sorted(
        fn for fn in os.listdir(folder)
        if fn.lower().endswith(IMAGE_EXTS)
    )


# Count images per split
for split in ['train', 'valid', 'test']:
    img_dir = os.path.join(DATASET_DIR, split, 'images')
    if os.path.isdir(img_dir):
        n = len(list_images(img_dir))
        print(f"{split:5s}: {n} images")
    else:
        print(f"{split:5s}: folder not found at {img_dir}")

# Count class instances in training labels.
# The first whitespace-separated token on each label line is read as the class id.
label_dir = os.path.join(DATASET_DIR, 'train', 'labels')
class_counts = Counter()
if os.path.isdir(label_dir):
    for lf in sorted(glob.glob(os.path.join(label_dir, '*.txt'))):
        with open(lf) as f:
            for line in f:
                parts = line.strip().split()
                if not parts:
                    continue
                cls_id = int(parts[0])
                try:
                    cls_name = cfg['names'][cls_id]
                except (IndexError, KeyError):
                    # A sanity check should surface bad labels, not crash on them.
                    cls_name = f'unknown({cls_id})'
                class_counts[cls_name] += 1

print("\nTraining label distribution:")
for cls, cnt in sorted(class_counts.items()):
    print(f"  {cls:8s}: {cnt} instances")

# Bar chart — bars are drawn in the same sorted order as the table above so
# the figure does not depend on the order label files were read in.
if class_counts:
    bar_labels = sorted(class_counts)
    plt.figure(figsize=(6, 4))
    plt.bar(bar_labels, [class_counts[c] for c in bar_labels],
            color=['#2196F3', '#4CAF50', '#FF5722'])
    plt.title('Training Instance Count per Class')
    plt.ylabel('Instances')
    plt.tight_layout()
    plt.savefig('/content/class_distribution.png', dpi=150)
    plt.show()

# Sample grid of training images
train_img_dir = os.path.join(DATASET_DIR, 'train', 'images')
if os.path.isdir(train_img_dir):
    all_imgs = list_images(train_img_dir)
    # A dedicated, seeded generator keeps the sample reproducible without
    # touching the global `random` state.
    samples  = random.Random(SAMPLE_SEED).sample(all_imgs, min(6, len(all_imgs)))
    if samples:
        fig, axes = plt.subplots(2, 3, figsize=(15, 8))
        for i, ax in enumerate(axes.flatten()):
            if i < len(samples):
                fn = samples[i]
                ax.imshow(mpimg.imread(os.path.join(train_img_dir, fn)))
                ax.set_title(fn[:22], fontsize=8)
            # Also hide the frame of any unused panel when fewer than 6 images exist.
            ax.axis('off')
        plt.suptitle('Sample Training Images', fontsize=12)
        plt.tight_layout()
        plt.savefig('/content/sample_training_images.png', dpi=150)
        plt.show()
    else:
        print(f"No image files found in {train_img_dir}")

## Cell 5 — Train YOLOv8

| Mode | Expected runtime (T4 GPU) |
|------|---------------------------|
| Full (50 ep) | ~25–35 min |
| Verify (5 ep) | ~3–5 min |

In [None]:
# Load the pretrained checkpoint named in the configuration cell.
model = YOLO(MODEL_VARIANT)

# Train on the first CUDA device when one is visible, otherwise on the CPU.
train_device = 'cpu'
if torch.cuda.is_available():
    train_device = 0

# All run artefacts are written below <project_root>/<RUN_NAME>/.
project_root = f'/content/{PROJECT_NAME}'

train_args = {
    'data':     DATA_YAML,
    'epochs':   EPOCHS,
    'batch':    BATCH,
    'imgsz':    IMGSZ,
    'project':  project_root,
    'name':     RUN_NAME,
    'patience': 10,      # early-stopping patience, in epochs
    'save':     True,
    'plots':    True,
    'device':   train_device,
    'exist_ok': True,    # reuse the run folder instead of creating a suffixed one
    'verbose':  True,
}
results = model.train(**train_args)

# Location of the best checkpoint inside the run folder; read by later cells.
WEIGHTS = f'{project_root}/{RUN_NAME}/weights/best.pt'
print(f"\n✅ Training complete. Best weights → {WEIGHTS}")

## Cell 6 — Validation Metrics Table

In [None]:
import pandas as pd

# Re-load the best checkpoint from training and evaluate it on the
# validation split described by DATA_YAML.
best_model = YOLO(WEIGHTS)
metrics    = best_model.val(
    data   = DATA_YAML,
    imgsz  = IMGSZ,
    device = 0 if torch.cuda.is_available() else 'cpu'
)

print("\n" + "="*52)
print("  VALIDATION METRICS")
print("="*52)
print(f"  mAP@0.5        : {metrics.box.map50:.4f}")
print(f"  mAP@0.5:0.95   : {metrics.box.map:.4f}")
print(f"  Precision (all): {metrics.box.mp:.4f}")
print(f"  Recall (all)   : {metrics.box.mr:.4f}")
print("="*52)

# The per-class arrays (p, r, ap50, ap) contain one entry per class that was
# actually evaluated, in the order given by `ap_class_index`. Look names up
# through that index instead of assuming every class in data.yaml is present;
# otherwise a class missing from the validation set causes a length mismatch
# (or silently mislabelled rows).
evaluated_ids = [int(i) for i in metrics.box.ap_class_index]

df = pd.DataFrame({
    'Class'     : [cfg['names'][i] for i in evaluated_ids],
    'Precision' : metrics.box.p,
    'Recall'    : metrics.box.r,
    'mAP@0.5'   : metrics.box.ap50,
    'mAP@0.5:95': metrics.box.ap
})
print("\nPer-class:")
print(df.to_string(index=False))

## Cell 7 — Training Curves (results.png, Confusion Matrix, PR Curve, F1 Curve)

In [None]:
# Folder that the training run wrote its plots into.
run_dir = f'/content/{PROJECT_NAME}/{RUN_NAME}/'

# (candidate file names, figure title). The first existing candidate is shown.
# NOTE(review): newer Ultralytics releases appear to save the detection curves
# with a 'Box' prefix (e.g. BoxPR_curve.png) — confirm against the installed
# version. The extra candidates are harmless if the prefix is not used.
plot_specs = [
    (('results.png',),                      'Training Curves (loss / P / R / mAP)'),
    (('confusion_matrix.png',),             'Confusion Matrix'),
    (('PR_curve.png', 'BoxPR_curve.png'),   'Precision-Recall Curve'),
    (('F1_curve.png', 'BoxF1_curve.png'),   'F1 Curve'),
]

for candidates, title in plot_specs:
    path = next(
        (p for p in (os.path.join(run_dir, c) for c in candidates)
         if os.path.exists(p)),
        None,
    )
    if path is None:
        # Say so explicitly: a silently missing figure is easy to overlook.
        print(f"Not found (skipped): {' / '.join(candidates)} in {run_dir}")
        continue

    img = mpimg.imread(path)
    # results.png is a wide multi-panel figure; the others are roughly square.
    w   = 14 if candidates[0] == 'results.png' else 8
    plt.figure(figsize=(w, 6))
    plt.imshow(img)
    plt.axis('off')
    plt.title(title, fontsize=12)
    plt.tight_layout()
    plt.show()
    print(f"Shown: {os.path.basename(path)}")

## Cell 8 — Save Weights Path for Inference Notebook

Copy the printed path and paste it into `02_Inference.ipynb` → Cell 2 as `WEIGHTS_PATH`.

In [None]:
import shutil

print("="*60)
print("  WEIGHTS PATH (copy into 02_Inference.ipynb)")
print("="*60)
print(f"  {WEIGHTS}")
print("="*60)

# Recomputed here so this cell does not depend on the plotting cell having run.
run_dir = f'/content/{PROJECT_NAME}/{RUN_NAME}/'

# Zip weights + curves for download / GitHub Release
os.makedirs('/content/training_outputs/curves', exist_ok=True)
shutil.copy(WEIGHTS, '/content/training_outputs/')

# For each plot, copy the first file name that exists.
# NOTE(review): newer Ultralytics releases appear to prefix the detection
# curves with 'Box' (e.g. BoxPR_curve.png) — confirm against the installed
# version. The extra candidates are harmless if the prefix is not used.
curve_candidates = [
    ('results.png',),
    ('confusion_matrix.png',),
    ('PR_curve.png', 'BoxPR_curve.png'),
    ('F1_curve.png', 'BoxF1_curve.png'),
]
for candidates in curve_candidates:
    for fname in candidates:
        src = os.path.join(run_dir, fname)
        if os.path.exists(src):
            shutil.copy(src, '/content/training_outputs/curves/')
            break
    else:
        # No candidate existed — report it so the archive is not silently incomplete.
        print(f"Not found (not archived): {' / '.join(candidates)}")

# The archive is written next to (not inside) the folder being zipped.
shutil.make_archive('/content/training_outputs', 'zip', '/content/training_outputs')
print("\nOutputs zipped → /content/training_outputs.zip")

# Trigger a browser download when running in Colab; elsewhere just say so.
try:
    from google.colab import files
    files.download('/content/training_outputs.zip')
except ImportError:
    print("(Not in Colab — download manually)")

---
## Reproducibility Log

Fill in after a successful run and copy to the README:

| Field | Value |
|-------|-------|
| Date/time (UTC) | _(fill in)_ |
| GPU | _(e.g. T4 16 GB)_ |
| Ultralytics version | _(from pip_freeze_snippet.txt)_ |
| Runtime (50 ep) | _(e.g. 28 min)_ |
| mAP@0.5 achieved | _(fill in)_ |

---
*MAICEN 1125 M4 U3 FMP — February 2026*