In [8]:
import os
import shutil
from ultralytics import YOLO
import cv2
import numpy as np
import supervision as sv

In [3]:
train_dir = "OtherTrain/MOT20-02"
validation_dir = "MOT20Dataset/validation/MOT20-01"
test_dir = "MOT20Dataset/test/MOT20-07"
output_dir = "MOT20Dataset/updated"

In [4]:
# Function to split a MOT20 training set into a train and validation set
def TrainSplit(train_directory, output_directory):
    # Paths to dataset
    image_dir = os.path.join(train_directory, "images")
    label_dir = os.path.join(train_directory, "labels")
    
    # Create train/val directories
    train_image_dir = os.path.join(output_directory, "train/images")
    train_label_dir = os.path.join(output_directory, "train/labels")
    val_image_dir = os.path.join(output_directory, "val/images")
    val_label_dir = os.path.join(output_directory, "val/labels")
    
    os.makedirs(train_image_dir, exist_ok=True)
    os.makedirs(train_label_dir, exist_ok=True)
    os.makedirs(val_image_dir, exist_ok=True)
    os.makedirs(val_label_dir, exist_ok=True)
    
    # Get sorted list of files
    images = sorted(os.listdir(image_dir))
    labels = sorted(os.listdir(label_dir))
    
    # Ensure images and labels align
    assert len(images) == len(labels), "Mismatch between images and labels!"
    
    # Split dataset
    split_ratio = 0.8
    train_count = int(len(images) * split_ratio)
    
    for i, (image, label) in enumerate(zip(images, labels)):
        if i < train_count:
            shutil.move(os.path.join(image_dir, image), train_image_dir)
            shutil.move(os.path.join(label_dir, label), train_label_dir)
        else:
            shutil.move(os.path.join(image_dir, image), val_image_dir)
            shutil.move(os.path.join(label_dir, label), val_label_dir)
    
    print("Dataset splitting complete!")

In [5]:
# Function to convert MOT20 dataset format to YOLO format
def ConvertMotToYOLO(input_dir, is_test_data=False):
    input_file = f'{input_dir}/gt/gt.txt'
    output_dir = f'{input_dir}/labels'  # Directory to store YOLO annotations
    image_width = 1920     
    image_height = 1080
    
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)
    
    # Process gt.txt
    with open(input_file, "r") as f:
        for line in f:
            # Split line and filter for required values
            split_gt_line = line.strip().split(",")
            
            # Ensure we have at least the first 7 values
            if len(split_gt_line) < 6:
                continue  # Skip invalid lines
            
            # Parse required values
            frame = int(split_gt_line[0])
            obj_id = 0 # set every tracked pedestrian to class 0
            bb_left = float(split_gt_line[2])
            bb_top = float(split_gt_line[3])
            bb_width = float(split_gt_line[4])
            bb_height = float(split_gt_line[5])
            # Check the provided confidence score if this is a training set
            if is_test_data == False:
                conf = float(split_gt_line[6])
                
                # Skip entries with conf == 0
                if conf == 0:
                    continue
    
            # Normalize bounding box coordinates
            x_center = (bb_left + bb_width / 2) / image_width
            y_center = (bb_top + bb_height / 2) / image_height
            width = bb_width / image_width
            height = bb_height / image_height
    
            # YOLO format: <class_id> <x_center> <y_center> <width> <height>
            yolo_line = f"{int(obj_id)} {x_center:.6f} {y_center:.6f} {width:.6f} {height:.6f}"
    
            # Write to frame-specific file
            output_file = os.path.join(output_dir, f"{int(frame):06d}.txt")
            with open(output_file, "a") as out_f:
                out_f.write(yolo_line + "\n")

In [20]:
ConvertMotToYOLO(train_dir)

In [21]:
TrainSplit(train_dir, output_dir)

Dataset splitting complete!


In [2]:
model = YOLO("yolov8n.pt")  # load yolov8 nano pretrained model

In [3]:
model.train(
    data='data.yaml',  # Path to your dataset YAML file
    epochs=5,
    batch=16,
    imgsz=1280,
    device='cpu',
    freeze=10,
    optimizer='AdamW',
    lr0=0.001,
    weight_decay=0.001,
    augment=True,
)

New https://pypi.org/project/ultralytics/8.3.51 available 😃 Update with 'pip install -U ultralytics'
Ultralytics 8.3.48 🚀 Python-3.10.12 torch-2.5.1+cu124 CPU (13th Gen Intel Core(TM) i9-13900KF)
[34m[1mengine/trainer: [0mtask=detect, mode=train, model=yolov8n.pt, data=data.yaml, epochs=5, time=None, patience=100, batch=16, imgsz=1280, save=True, save_period=-1, cache=False, device=cpu, workers=8, project=None, name=train, exist_ok=False, pretrained=True, optimizer=AdamW, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=10, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=True, agnostic_nms=False, classes=None, retina_masks=False, embed=None, show=False, save_f

[W1217 16:15:42.384740088 NNPACK.cpp:61] Could not initialize NNPACK! Reason: Unsupported hardware.


Freezing layer 'model.2.m.0.cv2.bn.weight'
Freezing layer 'model.2.m.0.cv2.bn.bias'
Freezing layer 'model.3.conv.weight'
Freezing layer 'model.3.bn.weight'
Freezing layer 'model.3.bn.bias'
Freezing layer 'model.4.cv1.conv.weight'
Freezing layer 'model.4.cv1.bn.weight'
Freezing layer 'model.4.cv1.bn.bias'
Freezing layer 'model.4.cv2.conv.weight'
Freezing layer 'model.4.cv2.bn.weight'
Freezing layer 'model.4.cv2.bn.bias'
Freezing layer 'model.4.m.0.cv1.conv.weight'
Freezing layer 'model.4.m.0.cv1.bn.weight'
Freezing layer 'model.4.m.0.cv1.bn.bias'
Freezing layer 'model.4.m.0.cv2.conv.weight'
Freezing layer 'model.4.m.0.cv2.bn.weight'
Freezing layer 'model.4.m.0.cv2.bn.bias'
Freezing layer 'model.4.m.1.cv1.conv.weight'
Freezing layer 'model.4.m.1.cv1.bn.weight'
Freezing layer 'model.4.m.1.cv1.bn.bias'
Freezing layer 'model.4.m.1.cv2.conv.weight'
Freezing layer 'model.4.m.1.cv2.bn.weight'
Freezing layer 'model.4.m.1.cv2.bn.bias'
Freezing layer 'model.5.conv.weight'
Freezing layer 'model.5.

[34m[1mtrain: [0mScanning /home/abey/Desktop/Repos/OtherProjects/ObjectTrackingMOT20/MOT20[0m
[34m[1mval: [0mScanning /home/abey/Desktop/Repos/OtherProjects/ObjectTrackingMOT20/MOT20Da[0m


Plotting labels to runs/detect/train/labels.jpg... 
[34m[1moptimizer:[0m AdamW(lr=0.001, momentum=0.937) with parameter groups 57 weight(decay=0.0), 64 weight(decay=0.001), 63 bias(decay=0.0)
Image sizes 1280 train, 1280 val
Using 0 dataloader workers
Logging results to [1mruns/detect/train[0m
Starting training for 5 epochs...

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        1/5         0G      1.434      1.145       1.27        186       1280: 1
                 Class     Images  Instances      Box(P          R      mAP50  m

                   all        557      26220      0.885      0.781      0.881      0.541






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        2/5         0G      1.259     0.7951      1.189         71       1280: 1
                 Class     Images  Instances      Box(P          R      mAP50  m

                   all        557      26220      0.908      0.813      0.907      0.584

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size



        3/5         0G      1.218     0.7351      1.164        105       1280: 1
                 Class     Images  Instances      Box(P          R      mAP50  m

                   all        557      26220      0.921      0.826      0.908      0.602






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        4/5         0G      1.172     0.6899      1.143        269       1280: 1
                 Class     Images  Instances      Box(P          R      mAP50  m

                   all        557      26220      0.929      0.839      0.918      0.612






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        5/5         0G      1.139       0.66      1.121         39       1280: 1
                 Class     Images  Instances      Box(P          R      mAP50  m

                   all        557      26220       0.93      0.845      0.922      0.618






5 epochs completed in 1.849 hours.
Optimizer stripped from runs/detect/train/weights/last.pt, 6.3MB
Optimizer stripped from runs/detect/train/weights/best.pt, 6.3MB

Validating runs/detect/train/weights/best.pt...
Ultralytics 8.3.48 🚀 Python-3.10.12 torch-2.5.1+cu124 CPU (13th Gen Intel Core(TM) i9-13900KF)
Model summary (fused): 168 layers, 3,005,843 parameters, 0 gradients, 8.1 GFLOPs


                 Class     Images  Instances      Box(P          R      mAP50  m


                   all        557      26220       0.92      0.834      0.917      0.615
Speed: 1.4ms preprocess, 226.0ms inference, 0.0ms loss, 1.8ms postprocess per image
Results saved to [1mruns/detect/train[0m


In [3]:
model.load("runs/detect/train/weights/best.pt")

Transferred 319/355 items from pretrained weights


YOLO(
  (model): DetectionModel(
    (model): Sequential(
      (0): Conv(
        (conv): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(16, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (1): Conv(
        (conv): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (2): C2f(
        (cv1): Conv(
          (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (act): SiLU(inplace=True)
        )
        (cv2): Conv(
          (conv): Conv2d(48, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_s

In [13]:
# Evaluate the model on the test dataset
results = model.val(
    data='data.yaml',         
    imgsz=1280,               
    split='test',             # Use the 'test' split
    save_json=True,           
    save_txt=True,            
    max_det=100,
    conf=0.3, # Min confidence threshold for detections
    iou=0.6) # IoU threshold for non-max suppression

Ultralytics 8.3.48 🚀 Python-3.10.12 torch-2.5.1+cu124 CPU (13th Gen Intel Core(TM) i9-13900KF)


[34m[1mval: [0mScanning /home/abey/Desktop/Repos/OtherProjects/ObjectTrackingMOT20/MOT20Da[0m
                 Class     Images  Instances      Box(P          R      mAP50  m


                   all        585      20330      0.749      0.457      0.615      0.393
                person        585      20330      0.749      0.457      0.615      0.393
Speed: 2.0ms preprocess, 165.8ms inference, 0.0ms loss, 1.3ms postprocess per image
Saving runs/detect/val5/predictions.json...
Results saved to [1mruns/detect/val5[0m


In [5]:
# Convert the test images to a video for tracking
def ConvertTestImagesToVideo(test_image_folder, output_video):

    # Get sorted list of image files
    image_files = sorted(os.listdir(image_folder), key=lambda x: int(x.split('.')[0]))
    
    # Read the first image to get the dimensions for the video
    first_image_path = os.path.join(image_folder, image_files[0])
    frame = cv2.imread(first_image_path)
    height, width, _ = frame.shape
    
    # Define the codec and create VideoWriter object
    fourcc = cv2.VideoWriter_fourcc(*'XVID')  # You can change this codec
    video_writer = cv2.VideoWriter(output_video_path, fourcc, 30.0, (width, height))  # 30.0 is the frame rate
    
    # Loop through the images and write each to the video file
    for image_file in image_files:
        image_path = os.path.join(image_folder, image_file)
        frame = cv2.imread(image_path)
        video_writer.write(frame)
    
    # Release the VideoWriter
    video_writer.release()

In [6]:
# Path to the directory containing images
image_folder = "MOT20Dataset/test/MOT20-07/images"
output_video_path = "output_video.avi"
ConvertTestImagesToVideo(image_folder, output_video_path)
print(f"Video saved to {output_video_path}")

Video saved to output_video.avi


In [12]:
# Initialize the ByteTrack tracker and annotators
tracker = sv.ByteTrack()
box_annotator = sv.BoxAnnotator()
label_annotator = sv.LabelAnnotator()

In [20]:
# Video processing
video_path = "output_video.avi"

def callback(frame: np.ndarray, _: int) -> np.ndarray:
    results = model.predict(frame)[0]
    # Extract detections from results
    boxes = results.boxes.xyxy.cpu().numpy()  # Bounding box coordinates
    scores = results.boxes.conf.cpu().numpy()  # Confidence scores
    class_ids = results.boxes.cls.cpu().numpy().astype(int)  # Class IDs
    
    # Create Supervision detections
    detections = sv.Detections(
        xyxy=boxes,
        confidence=scores,
        class_id=class_ids
    )
    detections = tracker.update_with_detections(detections)

    # Create labels with class names and tracker IDs
    labels = [
        f"#{tracker_id} {results.names[class_id]}"
        for tracker_id, class_id in zip(detections.tracker_id, detections.class_id)
    ]

    annotated_frame = box_annotator.annotate(
        frame.copy(), detections=detections)
    return label_annotator.annotate(
        annotated_frame, detections=detections, labels=labels)

sv.process_video(
    source_path=video_path,
    target_path="result.avi",
    callback=callback
)


0: 384x640 1 person, 2 boats, 28 umbrellas, 106.7ms
Speed: 12.3ms preprocess, 106.7ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 2 boats, 24 umbrellas, 87.7ms
Speed: 2.1ms preprocess, 87.7ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 2 boats, 24 umbrellas, 87.4ms
Speed: 11.1ms preprocess, 87.4ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 1 boat, 25 umbrellas, 99.7ms
Speed: 1.3ms preprocess, 99.7ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 25 umbrellas, 79.4ms
Speed: 1.8ms preprocess, 79.4ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 26 umbrellas, 87.6ms
Speed: 1.9ms preprocess, 87.6ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 23 umbrellas, 71.0ms
Speed: 1.3ms preprocess, 71.0ms inference, 0.4ms postprocess per image at sh