In [1]:
import os
import shutil
from ultralytics import YOLO
import cv2
import numpy as np
import supervision as sv
import csv
import motmetrics as mm
import pandas as pd



In [13]:
train_dir = "MOT20Dataset/train"
validation_dir = "MOT20Dataset/validation"
test_dir = "MOT20Dataset/test/MOT20-01"
output_dir = "MOT20Dataset/updated"

In [4]:
# Function to split a MOT20 training set into a train and validation set
def TrainSplit(train_directory, output_directory):
    # Paths to dataset
    image_dir = os.path.join(train_directory, "images")
    label_dir = os.path.join(train_directory, "labels")
    
    # Create train/val directories
    train_image_dir = os.path.join(output_directory, "train/images")
    train_label_dir = os.path.join(output_directory, "train/labels")
    val_image_dir = os.path.join(output_directory, "val/images")
    val_label_dir = os.path.join(output_directory, "val/labels")
    
    os.makedirs(train_image_dir, exist_ok=True)
    os.makedirs(train_label_dir, exist_ok=True)
    os.makedirs(val_image_dir, exist_ok=True)
    os.makedirs(val_label_dir, exist_ok=True)
    
    # Get sorted list of files
    images = sorted(os.listdir(image_dir))
    labels = sorted(os.listdir(label_dir))
    
    # Ensure images and labels align
    assert len(images) == len(labels), "Mismatch between images and labels!"
    
    # Split dataset
    split_ratio = 0.8
    train_count = int(len(images) * split_ratio)
    
    for i, (image, label) in enumerate(zip(images, labels)):
        if i < train_count:
            shutil.move(os.path.join(image_dir, image), train_image_dir)
            shutil.move(os.path.join(label_dir, label), train_label_dir)
        else:
            shutil.move(os.path.join(image_dir, image), val_image_dir)
            shutil.move(os.path.join(label_dir, label), val_label_dir)
    
    print("Dataset splitting complete!")

In [11]:
# Function to convert MOT20 dataset format to YOLO format
def ConvertMotToYOLO(input_dir, is_test_data=False):
    input_file = f'{input_dir}/gt/gt.txt'
    output_dir = f'{input_dir}/labels'  # Directory to store YOLO annotations
    image_width = 1920     
    image_height = 1080
    
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)
    
    # Process gt.txt
    with open(input_file, "r") as f:
        for line in f:
            # Split line and filter for required values
            split_gt_line = line.strip().split(",")
            
            # Ensure we have at least the first 7 values
            if len(split_gt_line) < 6:
                continue  # Skip invalid lines
            
            # Parse required values
            frame = int(split_gt_line[0])
            obj_id = 0 # set every tracked pedestrian to class 0
            bb_left = float(split_gt_line[2])
            bb_top = float(split_gt_line[3])
            bb_width = float(split_gt_line[4])
            bb_height = float(split_gt_line[5])
            # Check the provided confidence score if this is a training set
            if is_test_data == False:
                conf = float(split_gt_line[6])
                
                # Skip entries with conf == 0
                if conf == 0:
                    continue
    
            # Normalize bounding box coordinates
            x_center = (bb_left + bb_width / 2) / image_width
            y_center = (bb_top + bb_height / 2) / image_height
            width = bb_width / image_width
            height = bb_height / image_height
    
            # YOLO format: <class_id> <x_center> <y_center> <width> <height>
            yolo_line = f"{int(obj_id)} {x_center:.6f} {y_center:.6f} {width:.6f} {height:.6f}"
    
            # Write to frame-specific file
            output_file = os.path.join(output_dir, f"{int(frame):06d}.txt")
            with open(output_file, "a") as out_f:
                out_f.write(yolo_line + "\n")

In [20]:
ConvertMotToYOLO(train_dir)

In [21]:
TrainSplit(train_dir, output_dir)

Dataset splitting complete!


In [2]:
model = YOLO("yolov8n.pt")  # load yolov8 nano pretrained model

In [4]:
model.train(
    data='data.yaml',  # Path to your dataset YAML file
    epochs=10,
    batch=16,
    imgsz=1280,
    device='cpu',
    freeze=10,
    optimizer='AdamW',
    lr0=0.001,
    weight_decay=0.001,
    augment=True,
    conf = 0.3,
    nms=True
)

New https://pypi.org/project/ultralytics/8.3.53 available ðŸ˜ƒ Update with 'pip install -U ultralytics'
Ultralytics 8.3.48 ðŸš€ Python-3.10.12 torch-2.5.1+cu124 CPU (13th Gen Intel Core(TM) i9-13900KF)
[34m[1mengine/trainer: [0mtask=detect, mode=train, model=yolov8n.pt, data=data.yaml, epochs=10, time=None, patience=100, batch=16, imgsz=1280, save=True, save_period=-1, cache=False, device=cpu, workers=8, project=None, name=train2, exist_ok=False, pretrained=True, optimizer=AdamW, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=10, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=0.3, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=True, agnostic_nms=False, classes=None, retina_masks=False, embed=None, show=False,

[W1223 13:30:58.850350203 NNPACK.cpp:61] Could not initialize NNPACK! Reason: Unsupported hardware.


Freezing layer 'model.2.m.0.cv2.conv.weight'
Freezing layer 'model.2.m.0.cv2.bn.weight'
Freezing layer 'model.2.m.0.cv2.bn.bias'
Freezing layer 'model.3.conv.weight'
Freezing layer 'model.3.bn.weight'
Freezing layer 'model.3.bn.bias'
Freezing layer 'model.4.cv1.conv.weight'
Freezing layer 'model.4.cv1.bn.weight'
Freezing layer 'model.4.cv1.bn.bias'
Freezing layer 'model.4.cv2.conv.weight'
Freezing layer 'model.4.cv2.bn.weight'
Freezing layer 'model.4.cv2.bn.bias'
Freezing layer 'model.4.m.0.cv1.conv.weight'
Freezing layer 'model.4.m.0.cv1.bn.weight'
Freezing layer 'model.4.m.0.cv1.bn.bias'
Freezing layer 'model.4.m.0.cv2.conv.weight'
Freezing layer 'model.4.m.0.cv2.bn.weight'
Freezing layer 'model.4.m.0.cv2.bn.bias'
Freezing layer 'model.4.m.1.cv1.conv.weight'
Freezing layer 'model.4.m.1.cv1.bn.weight'
Freezing layer 'model.4.m.1.cv1.bn.bias'
Freezing layer 'model.4.m.1.cv2.conv.weight'
Freezing layer 'model.4.m.1.cv2.bn.weight'
Freezing layer 'model.4.m.1.cv2.bn.bias'
Freezing layer '

[34m[1mtrain: [0mScanning /home/abey/Desktop/Repos/OtherProjects/ObjectTrackingMOT20/MOT20[0m
[34m[1mval: [0mScanning /home/abey/Desktop/Repos/OtherProjects/ObjectTrackingMOT20/MOT20Da[0m


Plotting labels to runs/detect/train2/labels.jpg... 
[34m[1moptimizer:[0m AdamW(lr=0.001, momentum=0.937) with parameter groups 57 weight(decay=0.0), 64 weight(decay=0.001), 63 bias(decay=0.0)
Image sizes 1280 train, 1280 val
Using 0 dataloader workers
Logging results to [1mruns/detect/train2[0m
Starting training for 10 epochs...
Closing dataloader mosaic

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       1/10         0G      1.385      1.113       1.27         42       1280: 1
                 Class     Images  Instances      Box(P          R      mAP50  m

                   all        557      26220      0.921      0.767      0.859      0.602






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       2/10         0G      1.218     0.7666      1.188         91       1280: 1
                 Class     Images  Instances      Box(P          R      mAP50  m

                   all        557      26220      0.923      0.837      0.916      0.625






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       3/10         0G      1.171     0.6929       1.16         87       1280: 1
                 Class     Images  Instances      Box(P          R      mAP50  m

                   all        557      26220      0.934      0.844       0.92      0.644






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       4/10         0G      1.127     0.6507      1.134         44       1280: 1
                 Class     Images  Instances      Box(P          R      mAP50  m

                   all        557      26220      0.929      0.846      0.921       0.65






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       5/10         0G      1.092     0.6191      1.117         92       1280: 1
                 Class     Images  Instances      Box(P          R      mAP50  m

                   all        557      26220      0.924       0.85      0.921      0.649

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size



       6/10         0G      1.079     0.6016      1.108         45       1280: 1
                 Class     Images  Instances      Box(P          R      mAP50  m

                   all        557      26220       0.93      0.856      0.925       0.65






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       7/10         0G      1.064     0.5882      1.097         53       1280: 1
                 Class     Images  Instances      Box(P          R      mAP50  m

                   all        557      26220      0.918      0.854       0.92       0.65






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       8/10         0G       1.05     0.5772      1.092         84       1280: 1
                 Class     Images  Instances      Box(P          R      mAP50  m

                   all        557      26220      0.929      0.857      0.925      0.657






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       9/10         0G      1.043     0.5706      1.084         43       1280: 1
                 Class     Images  Instances      Box(P          R      mAP50  m

                   all        557      26220      0.922       0.86      0.924      0.652






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      10/10         0G      1.028     0.5608      1.078         48       1280: 1
                 Class     Images  Instances      Box(P          R      mAP50  m

                   all        557      26220      0.926      0.862      0.925      0.661

10 epochs completed in 3.287 hours.





Optimizer stripped from runs/detect/train2/weights/last.pt, 6.3MB
Optimizer stripped from runs/detect/train2/weights/best.pt, 6.3MB

Validating runs/detect/train2/weights/best.pt...
Ultralytics 8.3.48 ðŸš€ Python-3.10.12 torch-2.5.1+cu124 CPU (13th Gen Intel Core(TM) i9-13900KF)
Model summary (fused): 168 layers, 3,005,843 parameters, 0 gradients, 8.1 GFLOPs


                 Class     Images  Instances      Box(P          R      mAP50  m


                   all        557      26220      0.915      0.848       0.92       0.65
Speed: 1.4ms preprocess, 216.3ms inference, 0.0ms loss, 0.4ms postprocess per image
Results saved to [1mruns/detect/train2[0m


ultralytics.utils.metrics.DetMetrics object with attributes:

ap_class_index: array([0])
box: ultralytics.utils.metrics.Metric object
confusion_matrix: <ultralytics.utils.metrics.ConfusionMatrix object at 0x709b69e79d80>
curves: ['Precision-Recall(B)', 'F1-Confidence(B)', 'Precision-Confidence(B)', 'Recall-Confidence(B)']
curves_results: [[array([          0,    0.001001,    0.002002,    0.003003,    0.004004,    0.005005,    0.006006,    0.007007,    0.008008,    0.009009,     0.01001,    0.011011,    0.012012,    0.013013,    0.014014,    0.015015,    0.016016,    0.017017,    0.018018,    0.019019,     0.02002,    0.021021,    0.022022,    0.023023,
          0.024024,    0.025025,    0.026026,    0.027027,    0.028028,    0.029029,     0.03003,    0.031031,    0.032032,    0.033033,    0.034034,    0.035035,    0.036036,    0.037037,    0.038038,    0.039039,     0.04004,    0.041041,    0.042042,    0.043043,    0.044044,    0.045045,    0.046046,    0.047047,
          0.048048, 

In [3]:
model.load("runs/detect/train2/weights/best.pt")

Transferred 319/355 items from pretrained weights


YOLO(
  (model): DetectionModel(
    (model): Sequential(
      (0): Conv(
        (conv): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(16, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (1): Conv(
        (conv): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (2): C2f(
        (cv1): Conv(
          (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (act): SiLU(inplace=True)
        )
        (cv2): Conv(
          (conv): Conv2d(48, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_s

In [15]:
ConvertMotToYOLO(test_dir)

In [17]:
# Evaluate the model on the test dataset
results = model.val(
    data='data.yaml',         
    imgsz=1280,               
    split='test',             # Use the 'test' split
    save_json=True,           
    save_txt=True,            
    max_det=200,
    conf=0.25, # Min confidence threshold for detections
    iou=0.5) # IoU threshold for non-max suppression

Ultralytics 8.3.48 ðŸš€ Python-3.10.12 torch-2.5.1+cu124 CPU (13th Gen Intel Core(TM) i9-13900KF)


[34m[1mval: [0mScanning /home/abey/Desktop/Repos/OtherProjects/ObjectTrackingMOT20/MOT20Da[0m
                 Class     Images  Instances      Box(P          R      mAP50  m


                   all        429      19870      0.959      0.281      0.623       0.49
                person        429      19870      0.959      0.281      0.623       0.49
Speed: 2.2ms preprocess, 133.8ms inference, 0.0ms loss, 1.2ms postprocess per image
Saving runs/detect/val3/predictions.json...
Results saved to [1mruns/detect/val3[0m


In [10]:
# Convert the test images to a video for tracking
def ConvertTestImagesToVideo(test_image_folder, output_video):

    # Get sorted list of image files
    image_files = sorted(os.listdir(image_folder), key=lambda x: int(x.split('.')[0]))
    
    # Read the first image to get the dimensions for the video
    first_image_path = os.path.join(image_folder, image_files[0])
    frame = cv2.imread(first_image_path)
    height, width, _ = frame.shape
    
    # Define the codec and create VideoWriter object
    fourcc = cv2.VideoWriter_fourcc(*'XVID')  # You can change this codec
    video_writer = cv2.VideoWriter(output_video_path, fourcc, 30.0, (width, height))  # 30.0 is the frame rate
    
    # Loop through the images and write each to the video file
    for image_file in image_files:
        image_path = os.path.join(image_folder, image_file)
        frame = cv2.imread(image_path)
        video_writer.write(frame)
    
    # Release the VideoWriter
    video_writer.release()

In [11]:
# Path to the directory containing images
image_folder = "MOT20Dataset/test/MOT20-01/images"
output_video_path = "Results/MOT20-01_video.avi"
ConvertTestImagesToVideo(image_folder, output_video_path)
print(f"Video saved to {output_video_path}")

Video saved to Results/MOT20-01_video.avi


In [19]:
# Initialize the ByteTrack tracker and annotators
tracker = sv.ByteTrack(track_activation_threshold=0.2)
box_annotator = sv.BoxAnnotator()
label_annotator = sv.LabelAnnotator()

In [20]:
# File to save tracking results
output_file = "Results/MOT20-01.txt"

# Open the results file for logging
with open(output_file, "w", newline="") as f:
    writer = csv.writer(f)

    def callback(frame: np.ndarray, frame_id: int) -> np.ndarray:
        results = model.predict(frame)[0]
        # Extract detections from results
        boxes = results.boxes.xyxy.cpu().numpy()  # Bounding box coordinates
        scores = results.boxes.conf.cpu().numpy()  # Confidence scores
        class_ids = results.boxes.cls.cpu().numpy().astype(int)  # Class IDs

        # Create Supervision detections
        detections = sv.Detections(
            xyxy=boxes,
            confidence=scores,
            class_id=class_ids
        )
        detections = tracker.update_with_detections(detections)

        # Log tracking results in MOTChallenge format
        for box, score, class_id, tracker_id in zip(
            detections.xyxy, detections.confidence, detections.class_id, detections.tracker_id
        ):
            x1, y1, x2, y2 = box
            w, h = x2 - x1, y2 - y1  # Convert to width and height
            writer.writerow([frame_id + 1, tracker_id, x1, y1, w, h, score, class_id, -1])

        # Create labels with class names and tracker IDs
        labels = [
            f"#{tracker_id} {results.names[class_id]}"
            for tracker_id, class_id in zip(detections.tracker_id, detections.class_id)
        ]

        annotated_frame = box_annotator.annotate(
            frame.copy(), detections=detections)
        return label_annotator.annotate(
            annotated_frame, detections=detections, labels=labels)

    sv.process_video(
        source_path="Results/MOT20-01_video.avi",
        target_path="Results/MOT20-01_detections.avi",
        callback=callback
    )


0: 384x640 7 persons, 4 umbrellas, 7 tvs, 108.5ms
Speed: 6.9ms preprocess, 108.5ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 3 umbrellas, 7 tvs, 59.7ms
Speed: 1.7ms preprocess, 59.7ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 2 umbrellas, 5 tvs, 79.7ms
Speed: 1.7ms preprocess, 79.7ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 persons, 1 umbrella, 6 tvs, 216.2ms
Speed: 24.2ms preprocess, 216.2ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 8 persons, 2 umbrellas, 7 tvs, 95.1ms
Speed: 1.4ms preprocess, 95.1ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 2 umbrellas, 6 tvs, 126.3ms
Speed: 16.3ms preprocess, 126.3ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 3 umbrellas, 6 tvs, 74.5ms
Speed: 1.5ms preprocess, 74.5ms inference, 0.4ms postprocess 

In [9]:
# Compute IoU (intersection-over-union) matrix between ground truth and predictions
def compute_iou_matrix(gt_boxes, pred_boxes):
    iou_matrix = np.zeros((len(gt_boxes), len(pred_boxes)))
    for i, gt in enumerate(gt_boxes):
        for j, pred in enumerate(pred_boxes):
            # Calculate intersection
            x1 = max(gt[0], pred[0])
            y1 = max(gt[1], pred[1])
            x2 = min(gt[2], pred[2])
            y2 = min(gt[3], pred[3])
            intersection = max(0, x2 - x1) * max(0, y2 - y1)

            # Calculate union
            gt_area = (gt[2] - gt[0]) * (gt[3] - gt[1])
            pred_area = (pred[2] - pred[0]) * (pred[3] - pred[1])
            union = gt_area + pred_area - intersection

            # Compute IoU
            iou_matrix[i, j] = intersection / union if union > 0 else 0
    return iou_matrix

In [21]:
# Paths to the ground truth and tracking results
gt_path = "MOT20Dataset/test/MOT20-01/gt/gt.txt"  
results_path = "Results/MOT20-01.txt" 

gt_columns = ["frame", "id", "x", "y", "w", "h", "_1", "_2", "_3", "_4"]
det_columns = ["frame", "_", "x", "y", "w", "h", "confidence", "_1", "_2", "_3"]

# Adjust column names to match the file structure
gt_data = pd.read_csv(gt_path, header=None)
det_data = pd.read_csv(results_path, header=None)

# Assign appropriate column names based on actual data shape
gt_data.columns = gt_columns[:gt_data.shape[1]]
det_data.columns = det_columns[:det_data.shape[1]]

# Initialize MOTAccumulator
acc = mm.MOTAccumulator(auto_id=True)

# Process frames
frames = sorted(gt_data["frame"].unique())
for frame_id in frames:
    # Ground truth for this frame
    gt_frame = gt_data[gt_data["frame"] == frame_id]
    gt_ids = gt_frame["id"].values
    gt_boxes = np.array(
        [
            [row["x"], row["y"], row["x"] + row["w"], row["y"] + row["h"]]
            for _, row in gt_frame.iterrows()
        ]
    )

    # Detections for this frame
    det_frame = det_data[det_data["frame"] == frame_id]
    pred_ids = np.arange(len(det_frame))  # Assign unique IDs to detections
    pred_boxes = np.array(
        [
            [row["x"], row["y"], row["x"] + row["w"], row["y"] + row["h"]]
            for _, row in det_frame.iterrows()
        ]
    )

    # Compute IoU matrix
    if len(gt_boxes) > 0 and len(pred_boxes) > 0:
        iou_matrix = compute_iou_matrix(gt_boxes, pred_boxes)
    else:
        iou_matrix = np.empty((len(gt_boxes), len(pred_boxes)))

    # Update accumulator
    acc.update(gt_ids, pred_ids, iou_matrix)

# Compute metrics
mh = mm.metrics.create()
metrics = ["mota", "idf1", "precision", "recall"]
summary = mh.compute(acc, metrics=metrics, name="MOT20-01")

# Print results
print(mm.io.render_summary(summary, formatters=mh.formatters, namemap=mm.io.motchallenge_metric_names))

          MOTA  IDF1   Prcn  Rcll
MOT20-01 20.8% 34.6% 100.0% 20.9%
