In [None]:
import os
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import shutil #copy files cuh
import yaml
import matplotlib.pyplot as plt
import pandas as pd
from ultralytics import YOLO
from datetime import datetime

In [None]:
# --- Dataset conversion settings (raw CSV annotations -> YOLO layout) ---
# NOTE: these are absolute, machine-specific Windows paths; adjust them before
# running the conversion on another machine.

# Root folder that receives images/, labels/ and the dataset YAML.
OUTPUT_DIR = r"E:\Documents\Codes\Python\vehicle detection pipeline\data"
# Raw inputs live under <OUTPUT_DIR>\raw.
CSV_PATH = OUTPUT_DIR + r"\raw\train.csv"
IMG_DIR = OUTPUT_DIR + r"\raw\train"
YAML_PATH = os.path.join(OUTPUT_DIR, "vehicles.yaml")

# Position in this list is the class id written to the label files.
CLASSES = ["Bus", "Truck"]
# Fraction of the images held out for validation.
VAL_SPLIT = 0.2



## Converting Dataset To YOLO Format
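This code block converts the raw CSV-annotated dataset into the YOLO layout (train/val image folders plus one label `.txt` file per image) and writes the dataset YAML file that points to it.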

In [None]:
def convert_to_yolo_bbox(xmin, xmax, ymin, ymax):
    """Turn corner coordinates into a (centre, size) box.

    Mind the argument order: both x bounds come first, then both y bounds.
    No scaling is applied, so the outputs are in the same units as the inputs.

    Returns:
        tuple: ``(x_center, y_center, width, height)``.
    """
    box_w = xmax - xmin
    box_h = ymax - ymin
    return (xmin + xmax) / 2, (ymin + ymax) / 2, box_w, box_h
def create_yolo_files():
    """Convert the CSV annotations into a YOLO dataset on disk.

    Reads ``CSV_PATH``, splits the unique ``ImageID`` values into train/val
    sets (``VAL_SPLIT`` fraction for validation, fixed seed 42) and, for each
    image, writes ``labels/<split>/<ImageID>.txt`` and copies
    ``<ImageID>.jpg`` from ``IMG_DIR`` into ``images/<split>/`` under
    ``OUTPUT_DIR``. A missing image is reported with a warning, not raised.

    Box values are written exactly as returned by ``convert_to_yolo_bbox``;
    no scaling by image size happens here, so this presumably expects
    XMin/XMax/YMin/YMax to already be normalized to [0, 1] (TODO confirm
    against the CSV).

    Raises:
        ValueError: if the CSV contains a ``LabelName`` that is not listed in
            ``CLASSES``. Raised before any file is written, so a bad CSV no
            longer leaves a half-converted dataset behind.
    """
    df = pd.read_csv(CSV_PATH)
    # "Unnamed: 0" is typically a leftover saved index column; tolerate CSVs
    # that were written without it instead of failing with a KeyError.
    df = df.drop(columns=["Unnamed: 0"], errors="ignore")

    # Validate labels up front rather than failing part-way through writing.
    unknown_labels = sorted(str(label) for label in set(df["LabelName"].unique()) - set(CLASSES))
    if unknown_labels:
        raise ValueError(f"LabelName values not in CLASSES: {unknown_labels}")

    for split in ("train", "val"):
        os.makedirs(os.path.join(OUTPUT_DIR, "images", split), exist_ok=True)
        os.makedirs(os.path.join(OUTPUT_DIR, "labels", split), exist_ok=True)

    # Split by image id so all boxes of one image land in the same split.
    image_ids = df["ImageID"].unique()
    train_ids, val_ids = train_test_split(image_ids, test_size=VAL_SPLIT, random_state=42)

    for split, ids in [("train", train_ids), ("val", val_ids)]:
        subset = df[df["ImageID"].isin(ids)]
        for img_id, group in tqdm(subset.groupby("ImageID"), desc=f"Processing {split}"):
            # One label file per image, one line per bounding box.
            label_path = os.path.join(OUTPUT_DIR, "labels", split, f"{img_id}.txt")
            boxes = zip(group["LabelName"], group["XMin"], group["XMax"], group["YMin"], group["YMax"])
            with open(label_path, "w") as f:
                for label, xmin, xmax, ymin, ymax in boxes:
                    cls = CLASSES.index(label)
                    x_center, y_center, width, height = convert_to_yolo_bbox(xmin, xmax, ymin, ymax)
                    f.write(f"{cls} {x_center:.6f} {y_center:.6f} {width:.6f} {height:.6f}\n")

            # Copy the matching image next to its labels; warn if it is absent.
            src = os.path.join(IMG_DIR, f"{img_id}.jpg")
            dst = os.path.join(OUTPUT_DIR, "images", split, f"{img_id}.jpg")
            if os.path.exists(src):
                shutil.copy(src, dst)
            else:
                print(f"[WARNING] Image not found: {src}")
    print(f"Finished Conversion, dataset created at {OUTPUT_DIR}")
def create_yaml():
    """Write the dataset description file to ``YAML_PATH``.

    The file holds, in this key order, the train and val image folders under
    ``OUTPUT_DIR``, the number of classes and the class names from ``CLASSES``.
    """
    images_root = os.path.join(OUTPUT_DIR, 'images')
    data_yaml = dict(
        train=os.path.join(images_root, 'train'),
        val=os.path.join(images_root, 'val'),
        nc=len(CLASSES),
        names=CLASSES,
    )
    # sort_keys=False keeps the keys in the order given above.
    with open(YAML_PATH, 'w') as f:
        yaml.dump(data_yaml, f, sort_keys=False)
    print(f"YOLO YAML file created at: {YAML_PATH}")

#create_yolo_files()
#create_yaml()


## Training and Logging Code
This code block trains the YOLO model and plots the per-epoch metrics that the training run logs to `results.csv`.

In [None]:


# Use the non-interactive 'Agg' Matplotlib backend so plotting works in
# environments without a GUI.
# NOTE(review): with this backend figures are presumably only written to disk
# (see plot_training_metrics) and not shown inline in the notebook -- confirm
# that this is intended.
plt.switch_backend('Agg')

def plot_training_metrics(save_dir):
    """Plot the per-epoch metrics of a finished run and save them as a PNG.

    Reads ``<save_dir>/train_results/results.csv``, draws a 2x2 grid
    (training box/class loss, precision, recall, mAP) and writes
    ``training_metrics_<YYYYmmdd_HHMMSS>.png`` into ``save_dir``.

    Plotting is best-effort: when the CSV is absent, or lacks one of the
    columns used below, a message is printed and nothing is drawn, so a
    plotting problem never aborts the caller.

    Args:
        save_dir: Directory that contains the ``train_results`` run folder.
            The PNG is written directly into this directory.

    Returns:
        None
    """
    results_csv_path = os.path.join(save_dir, "train_results", "results.csv")

    if not os.path.exists(results_csv_path):
        print(f"Results CSV not found at {results_csv_path}. Skipping plot.")
        return

    df = pd.read_csv(results_csv_path)
    # Header names may carry padding spaces; strip them so the lookups match.
    df.columns = df.columns.str.strip()

    # One entry per panel, in row-major order:
    # (title, y-axis label, [(csv column, legend label, colour), ...]).
    # The metrics/* columns are computed on the validation set, hence the
    # "Validation" titles. DFL loss is not plotted; add it here if needed.
    panels = [
        ("Training Loss", "Loss", [
            ("train/box_loss", "Box Loss", "blue"),
            ("train/cls_loss", "Class Loss", "red"),
        ]),
        ("Validation Precision", "Precision", [
            ("metrics/precision(B)", "Precision", "green"),
        ]),
        ("Validation Recall", "Recall", [
            ("metrics/recall(B)", "Recall", "orange"),
        ]),
        ("Mean Average Precision (mAP)", "mAP", [
            ("metrics/mAP50(B)", "mAP@0.5", "red"),
            ("metrics/mAP50-95(B)", "mAP@0.5:0.95", "purple"),
        ]),
    ]

    # Check every column before drawing so a different CSV schema results in
    # a skipped plot instead of a KeyError.
    needed = ["epoch"] + [column for _, _, curves in panels for column, _, _ in curves]
    missing = [column for column in needed if column not in df.columns]
    if missing:
        print(f"Results CSV at {results_csv_path} is missing columns {missing}. Skipping plot.")
        return

    epochs = df["epoch"]
    fig, axes = plt.subplots(2, 2, figsize=(12, 8))
    try:
        for ax, (title, ylabel, curves) in zip(axes.flat, panels):
            for column, label, color in curves:
                ax.plot(epochs, df[column], label=label, color=color)
            ax.set_xlabel("Epochs")
            ax.set_ylabel(ylabel)
            ax.set_title(title)
            ax.legend()

        fig.tight_layout()
        # Timestamped name so repeated runs do not overwrite earlier plots.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        plot_path = os.path.join(save_dir, f"training_metrics_{timestamp}.png")
        fig.savefig(plot_path)
    finally:
        # Always release the figure, even if drawing or saving failed.
        plt.close(fig)

    print(f"Training metrics plot saved to {plot_path}")

def train_yolo_model(data_path, model_path, save_dir, epochs=50, batch_size=16, img_size=640):
    """Train a YOLO model, plot its metrics and return the best-weights path.

    Args:
        data_path: Path to the dataset YAML passed to ``train`` as ``data``.
        model_path: Weights or model definition handed to ``YOLO(...)``.
        save_dir: Project directory; the run is stored in
            ``<save_dir>/train_results``.
        epochs: Number of training epochs.
        batch_size: Passed to ``train`` as ``batch``.
        img_size: Passed to ``train`` as ``imgsz``.

    Returns:
        str: ``<save_dir>/train_results/weights/best.pt``. The path is built
        from the project/name given to ``train`` rather than read back from
        the trainer, so a warning is printed when the file is not there.
    """
    os.makedirs(save_dir, exist_ok=True)
    model = YOLO(model_path)
    model.train(
        data=data_path,
        epochs=epochs,
        batch=batch_size,
        imgsz=img_size,
        project=save_dir,
        name="train_results",
        # Reuse <save_dir>/train_results on every run instead of creating an
        # incremented run folder; earlier results there may be overwritten.
        exist_ok=True,
    )

    # The per-epoch numbers are read from the run's results.csv.
    plot_training_metrics(save_dir)

    best_model_path = os.path.join(save_dir, "train_results", "weights", "best.pt")
    if not os.path.exists(best_model_path):
        print(f"[WARNING] Best weights not found at {best_model_path}")

    print(f"Training Complete. Results are saved in {save_dir}")

    return best_model_path

## Evaluation and Inference
This code block evaluates the trained model and runs inference (predictions) on new images.

In [None]:
def evaluate_model(model_path, data_yaml, split="val"):
    """Validate a trained model on a dataset split and print a summary.

    Args:
        model_path: Weights file handed to ``YOLO(...)``.
        data_yaml: Dataset YAML to evaluate against.
        split: Name of the dataset split to evaluate (default ``"val"``).

    Returns:
        The object returned by ``model.val``; its ``box.map50``, ``box.map``,
        ``box.mp`` and ``box.mr`` values are the ones printed here.
    """
    model = YOLO(model_path)
    # Forward both arguments: without them the call ignores the caller's
    # dataset and split and falls back to whatever the model defaults to.
    results = model.val(data=data_yaml, split=split)

    # Print the summary
    print("Evaluation Results")
    print(f"mAP50: {results.box.map50:.4f}")
    print(f"mAP50-95: {results.box.map:.4f}")
    print(f"Precision: {results.box.mp:.4f}")
    print(f"Recall: {results.box.mr:.4f}")

    return results

def run_inference(model_path, source, save_dirs=None):
    """Run prediction on ``source`` and save the annotated outputs.

    Args:
        model_path: Weights file handed to ``YOLO(...)``.
        source: Image, video or directory passed to ``predict``.
        save_dirs: Optional project directory for the saved outputs. It is
            passed to ``predict`` as ``project``, so the files are expected
            in a run subfolder of it (NOTE(review): confirm the exact
            subfolder name against the installed Ultralytics version). When
            omitted, the library's default output location is used.

    Returns:
        The results returned by ``model.predict``.
    """
    model = YOLO(model_path)

    predict_kwargs = {"source": source, "save": True, "show": False}
    if save_dirs is not None:
        # Route the outputs to the requested directory; previously the
        # argument was only echoed in the message below and never used.
        predict_kwargs["project"] = save_dirs
    results = model.predict(**predict_kwargs)

    # Report the directory attached to the results when there is one, so the
    # message states where the files really are.
    reported_dir = getattr(results[0], "save_dir", None) if results else None
    location = reported_dir or save_dirs or "the default output directory"

    print("Inference Complete")
    print(f"Results saved to: {location}")

    return results


## Main
Run this code to begin the pipeline from data preprocessing to training and evaluation.

In [None]:
# Step 1: Convert CSV -> YOLO format
# Already done; uncomment both calls to regenerate the dataset from the raw CSV.
#create_yolo_files()
#create_yaml()

# Step 2: Train YOLO model
# The returned path points at the best weights of this run.
best_model_path = train_yolo_model(
    data_path="data/vehicles.yaml",
    model_path="yolov8s.pt",
    save_dir="runs/train",
    epochs=2,
    batch_size=16,
    img_size=224
)

# Steps 3 and 4 are disabled. They are kept as real comments because a bare
# triple-quoted string at the end of a cell is an expression and gets echoed
# as the cell's output.

# Step 3: Evaluate model (uncomment to run)
# evaluate_model(
#     model_path="runs/train/train_results/weights/best.pt",
#     data_yaml="data/vehicles.yaml"
# )

# Step 4: Run inference on a sample image (uncomment to run)
# run_inference(
#     model_path="runs/train/train_results/weights/best.pt",
#     source="data/images/val/0000599864fd15b3.jpg",
#     save_dirs="runs/predict"
# )