# Introduction
Download the dataset from
https://www.kaggle.com/datasets/734b7bcb7ef13a045cbdd007a3c19874c2586ed0b02b4afc86126e89d00af8d2
Store it as a folder called dataset

# Dataset Label Formatting
Clear all unused or deprecated files and folders: everything inside `dataset` except the `images` and `labels` folders is deleted.

In [None]:
%reset -f
import os
import shutil
import random
from tqdm import tqdm  # For the progress bar

# Root folder that holds the downloaded dataset
dataset_folder = "dataset"

# Only these entries survive the clean-up
folders_to_keep = ["images", "labels"]

# Walk over every entry directly inside the dataset folder and delete
# everything that is not explicitly listed above.
# WARNING: this is destructive and cannot be undone.
for item in os.listdir(dataset_folder):
    if item in folders_to_keep:
        continue  # entry is on the keep list, leave it untouched

    item_path = os.path.join(dataset_folder, item)
    if not os.path.isdir(item_path):
        os.remove(item_path)  # plain file
        print(f"Removed file: {item_path}")
    else:
        shutil.rmtree(item_path)  # directory together with all its contents
        print(f"Removed folder: {item_path}")



Reformat dataset to include only vehicles.
The original dataset contains the labels of:

    ["aeroplane", "bicyclebike", "bird", "boat", "bottle", "bus",
    "car", "cat", "chair", "cow", "diningtable", "dog", "horse",
    "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor"]

A new filtered label directory is created that keeps only the vehicle classes:

    ["car", "bus", "motorbike", "bicyclebike"]

These new labels are stored in dataset/labels_filtered

In [None]:
# Directory containing YOLO label .txt files
label_dir = "dataset/labels"  # Replace with your label directory path

# Class names of the original dataset. The position of a name in this list is
# used as its original class ID (car=6, bus=5, motorbike=13, bicyclebike=1).
CLASS_NAMES = [
    "aeroplane", "bicyclebike", "bird", "boat", "bottle", "bus",
    "car", "cat", "chair", "cow", "diningtable", "dog", "horse",
    "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor",
]

# Vehicle classes to keep; the position in this list is the NEW class ID
VEHICLE_CLASSES = ["car", "bus", "motorbike", "bicyclebike"]

# Maps original class ID -> new class ID for the vehicle classes only
VEHICLE_CLASS_MAP = {
    CLASS_NAMES.index(name): new_id
    for new_id, name in enumerate(VEHICLE_CLASSES)
}

# Directory to save filtered and remapped labels
output_dir = "dataset/labels_filtered"
os.makedirs(output_dir, exist_ok=True)

def filter_and_remap_labels(label_file, src_dir=None, dst_dir=None, class_map=None):
    """
    Filter one YOLO label file down to the wanted classes and remap their IDs.

    Every line of the input file is split on whitespace; the first token is
    read as the integer class ID. Lines whose class ID is a key of the class
    map are written to the output file with the mapped ID, all remaining
    tokens are copied unchanged. All other lines are dropped. Blank or
    whitespace-only lines are skipped instead of raising IndexError.

    The output file is always created, so it is empty (0 bytes) when no line
    of the input matches.

    Args:
        label_file: File name (not a path) of the label file to process.
        src_dir: Directory to read from. Defaults to the module-level
            ``label_dir``.
        dst_dir: Directory to write to. Defaults to the module-level
            ``output_dir``.
        class_map: Dict mapping original class ID -> new class ID. Defaults
            to the module-level ``VEHICLE_CLASS_MAP``.

    Raises:
        ValueError: If the first token of a non-blank line is not an integer.
    """
    # Fall back to the notebook-level configuration when not given explicitly
    src_dir = label_dir if src_dir is None else src_dir
    dst_dir = output_dir if dst_dir is None else dst_dir
    class_map = VEHICLE_CLASS_MAP if class_map is None else class_map

    input_path = os.path.join(src_dir, label_file)
    output_path = os.path.join(dst_dir, label_file)

    with open(input_path, "r") as infile, open(output_path, "w") as outfile:
        for line in infile:
            parts = line.split()
            if not parts:
                continue  # blank line: nothing to parse

            class_id = int(parts[0])  # Extract class ID
            if class_id not in class_map:
                continue  # class is filtered out

            # Remap class ID and write the updated line
            new_class_id = class_map[class_id]
            outfile.write(f"{new_class_id} " + " ".join(parts[1:]) + "\n")

# Collect every .txt label file found in the label directory
label_files = [name for name in os.listdir(label_dir) if name.endswith(".txt")]

# Filter and remap each file; tqdm wraps the list, so the bar advances by one
# per processed file
for file_name in tqdm(label_files, desc="Processing Labels", unit="file"):
    filter_and_remap_labels(file_name)

print(f"Filtered and remapped labels saved in: {output_dir}")


Remove empty labels

In [None]:
# Folder holding the filtered label files
labels_folder = "dataset/labels_filtered"

# Every .txt label file currently in that folder
label_files = [f for f in os.listdir(labels_folder) if f.endswith('.txt')]

# Full paths of the label files that are exactly 0 bytes long, i.e. files for
# which no line survived the filtering step
empty_label_paths = [
    label_path
    for label_path in (os.path.join(labels_folder, name) for name in label_files)
    if os.path.getsize(label_path) == 0
]

# Delete them
for label_path in empty_label_paths:
    os.remove(label_path)

# Output the result
removed_count = len(empty_label_paths)
print(f"Total empty labels removed: {removed_count}")



Counting images and filtered_labels

In [None]:
# The relative paths below are resolved against this working directory
print(os.getcwd())

# Folders to compare
images_folder = "dataset/images"
labels_folder = "dataset/labels_filtered"

# File names in each folder, restricted to the expected extension
image_files = [name for name in os.listdir(images_folder) if name.endswith('.jpg')]
label_files = [name for name in os.listdir(labels_folder) if name.endswith('.txt')]

# Totals
num_images, num_labels = len(image_files), len(label_files)

# An image and a label belong together when their names match once the
# extension is stripped
image_basenames = {os.path.splitext(name)[0] for name in image_files}
label_basenames = {os.path.splitext(name)[0] for name in label_files}

matched_files = image_basenames & label_basenames     # present in both folders
unmatched_images = image_basenames - label_basenames  # image without label
unmatched_labels = label_basenames - image_basenames  # label without image

print(f"Total images: {num_images}")
print(f"Total labels: {num_labels}")
print(f"Matched files: {len(matched_files)}")
print(f"Unmatched images: {len(unmatched_images)}")
print(f"Unmatched labels: {len(unmatched_labels)}")

# List the unmatched base names, one per line (set order, i.e. unsorted)
if unmatched_images:
    print("Unmatched images (no corresponding label):")
    print("\n".join(f"  {img}" for img in unmatched_images))

if unmatched_labels:
    print("Unmatched labels (no corresponding image):")
    print("\n".join(f"  {lbl}" for lbl in unmatched_labels))


Create the folder images_filtered with a reduced number of unlabelled images.
The ratio of labelled to unlabelled images has been set to 50/50.

In [None]:
# Paths
images_folder = "dataset/images"
labels_folder = "dataset/labels_filtered"
output_folder = "dataset/images_filtered"

# Ratio of labeled and unlabeled images, in percent
r_label = 50
r_unlabel = 100 - r_label

# Fixed seed for picking the unlabeled images. The output folder is never
# cleared here, so an unseeded sample would add a different set of unlabeled
# images on every re-run and push the ratio away from r_label/r_unlabel.
# With a fixed seed a re-run selects (and overwrites) the same files.
sample_seed = 42

# Ensure output directory exists
os.makedirs(output_folder, exist_ok=True)

# Get all image files and corresponding label files.
# The image list is sorted because os.listdir() returns names in arbitrary
# order, which would make the seeded sample below depend on the file system.
image_files = sorted(f for f in os.listdir(images_folder) if f.endswith('.jpg'))
label_files = [f for f in os.listdir(labels_folder) if f.endswith('.txt')]

# Get the base filenames (without extensions) for labels
label_basenames = {os.path.splitext(label)[0] for label in label_files}

# Separate labeled and unlabeled images
labeled_images = [img for img in image_files if os.path.splitext(img)[0] in label_basenames]
unlabeled_images = [img for img in image_files if os.path.splitext(img)[0] not in label_basenames]

# Number of unlabeled images needed to reach the requested ratio, capped at
# the number of unlabeled images that actually exist
num_labeled = len(labeled_images)
num_unlabeled_to_select = min(int(num_labeled * r_unlabel / r_label), len(unlabeled_images))

# Select the required number of unlabeled images with a private, seeded RNG
# (the global random state is left untouched)
selected_unlabeled_images = random.Random(sample_seed).sample(
    unlabeled_images, num_unlabeled_to_select
)

# Combine labeled and selected unlabeled images
images_to_copy = labeled_images + selected_unlabeled_images

# Copy labeled and selected unlabeled images to the output folder with a progress bar
for img in tqdm(images_to_copy, desc="Copying Images", unit="file"):
    shutil.copy(os.path.join(images_folder, img), os.path.join(output_folder, img))

# Output results
print(f"Total labeled images: {num_labeled}")
print(f"Total unlabeled images selected: {len(selected_unlabeled_images)}")
print(f"Total images in 'images_filtered': {len(os.listdir(output_folder))}")



# Create Train, Validation and Test image sets
From the filtered images and labels ("dataset/images_filtered", "dataset/labels_filtered"),
create the train, validation and test sets.


- Training is stored in ("dataset/train/images", "dataset/train/labels")
- Validation is stored in ("dataset/valid/images", "dataset/valid/labels")
- Test is stored in ("dataset/test/images", "dataset/test/labels")


In [None]:
import os
import shutil
import random
from tqdm import tqdm  # For progress bar

# Split data into 60% train, 20% validation, 20% test
train_perc = .6
valid_perc = .2

# Fixed seed for the shuffle. The listing below is sorted, so with a fixed
# seed every run produces the same split. Without it, re-running this cell
# assigns images to different splits while the copies of the previous run are
# still in the train/valid/test folders, so the same image can end up in more
# than one split.
split_seed = 42

# Define folder paths
images_folder = "dataset/images_filtered"  # Folder containing filtered images
labels_folder = "dataset/labels_filtered"  # Folder containing filtered labels

train_images_folder = "dataset/train/images"
train_labels_folder = "dataset/train/labels"

valid_images_folder = "dataset/valid/images"
valid_labels_folder = "dataset/valid/labels"

test_images_folder = "dataset/test/images"
test_labels_folder = "dataset/test/labels"

# Create output directories
for folder in [train_images_folder, train_labels_folder,
               valid_images_folder, valid_labels_folder,
               test_images_folder, test_labels_folder]:
    os.makedirs(folder, exist_ok=True)

# Get a list of all images (sorted: os.listdir() order is arbitrary)
image_files = sorted(os.listdir(images_folder))

# Pair every image with its label file name, or with None when no label file
# of the same base name exists in labels_folder
data = []
for image_file in image_files:
    label_file = os.path.splitext(image_file)[0] + ".txt"
    has_label = os.path.exists(os.path.join(labels_folder, label_file))
    data.append((image_file, label_file if has_label else None))

# Shuffle the data with a private, seeded RNG (global random state untouched)
random.Random(split_seed).shuffle(data)

# Split data into ratios: [0, train_split) -> train,
# [train_split, valid_split) -> validation, the rest -> test
train_split = int(train_perc * len(data))
valid_split = int((train_perc + valid_perc) * len(data))

train_data = data[:train_split]
valid_data = data[train_split:valid_split]
test_data = data[valid_split:]

# Function to copy images and labels with a progress bar
def copy_files(data, dest_images_folder, dest_labels_folder, phase_name):
    """
    Copy the images (and labels, where present) of one split, showing progress.

    Source files are read from the module-level ``images_folder`` and
    ``labels_folder``.

    Args:
        data: Sequence of ``(image_file, label_file)`` pairs; ``label_file``
            is falsy (e.g. ``None``) for images without a label file.
        dest_images_folder: Folder the image files are copied into.
        dest_labels_folder: Folder the label files are copied into.
        phase_name: Text appended to "Copying " in the progress bar caption.
    """
    progress = tqdm(total=len(data), desc=f"Copying {phase_name}")
    with progress:
        for image_file, label_file in data:
            # The image is always copied
            image_src = os.path.join(images_folder, image_file)
            image_dst = os.path.join(dest_images_folder, image_file)
            shutil.copy(image_src, image_dst)

            # The label only when the pair carries one
            if label_file:
                label_src = os.path.join(labels_folder, label_file)
                label_dst = os.path.join(dest_labels_folder, label_file)
                shutil.copy(label_src, label_dst)

            # One step per (image, label) pair
            progress.update(1)

# Copy each split into its own images/labels folders: train, then validation,
# then test
for split_data, split_images_folder, split_labels_folder, phase in (
    (train_data, train_images_folder, train_labels_folder, "Training Data"),
    (valid_data, valid_images_folder, valid_labels_folder, "Validation Data"),
    (test_data, test_images_folder, test_labels_folder, "Testing Data"),
):
    copy_files(split_data, split_images_folder, split_labels_folder, phase)

# Summary: number of images per split
print("Dataset split complete!")
print(f"Training data: {len(train_data)} images")
print(f"Validation data: {len(valid_data)} images")
print(f"Testing data: {len(test_data)} images")



# Load YOLO Model and Begin Training!!

In [None]:
# !pip install ultralytics 
from ultralytics import YOLO
import os

# Number of epochs trained by each model.train() call
epochs_per_run = 5

path = os.getcwd()

# Weights loaded by the first run; later runs continue from the weights
# written by the previous run (see the end of the loop body)
model_path = os.path.join(path, "runs/detect/successful_train_25", "weights/best.pt")

# Data configuration file; it does not change between runs
yaml_path = os.path.join(path, "train.yaml")


def _print_saved_banner(epochs_done):
    """Print the five-line star banner reporting the epoch count so far."""
    print("*********************")
    print("*********************")
    print(f'***Saved epoch {epochs_done}***')
    print("*********************")
    print("*********************")


# Running total of epochs trained in this session
i = 0
_print_saved_banner(i)

# NOTE: there is no exit condition. The loop keeps training in chunks of
# epochs_per_run epochs until it is interrupted or an exception is raised.
while True:

    # Load the weights stored at model_path
    model = YOLO(model_path)

    # Train the model
    print(yaml_path)
    out = model.train(
        data=yaml_path,           # Path to the data configuration file
        epochs=epochs_per_run,    # Number of training epochs
        imgsz=640,                # Image size
        batch=16,                 # Batch size
        device="cpu"              # Device to train on (here: CPU)
    )

    # The next run starts from the last weights written by this run
    # (assumes out.save_dir is the output folder of this run - TODO confirm)
    model_path = os.path.join(path, out.save_dir, "weights/last.pt")
    i += epochs_per_run
    _print_saved_banner(i)

*********************
*********************
***Saved epoch 0***
*********************
*********************
/home/alex/ForeHelm/YOLO_training0/train.yaml
Ultralytics 8.3.34 🚀 Python-3.12.3 torch-2.5.1+cu124 


ValueError: Invalid CUDA 'device=0' requested. Use 'device=cpu' or pass valid CUDA device(s) if available, i.e. 'device=0' or 'device=0,1,2,3' for Multi-GPU.

torch.cuda.is_available(): False
torch.cuda.device_count(): 0
os.environ['CUDA_VISIBLE_DEVICES']: -1
See https://pytorch.org/get-started/locally/ for up-to-date torch install instructions if no CUDA devices are seen by torch.


In [None]:
# Display `out`, the value returned by the last completed model.train() call
# of the training cell
out