In [1]:
import torch
import cv2
import numpy as np
from ultralytics import YOLO
import os

In [2]:
# Path to your video frames and label files
frames_dir = 'D:\Datasets\DAta_intern\V8T3\images'
labels_dir = 'D:\Datasets\DAta_intern\V8T3\labels'
model_path = "D:\VS Code Folders\Squash-Ball-Tracking\Squash-Ball-Detection\\results_after_75_epochs\weights\\best.pt"

In [16]:
# Output video settings
output_video_path = 'output_video.mp4'
output_fps = 30 

In [4]:
model = YOLO(model_path)

In [13]:
# Function to draw ground truth and predictions on each frame
def draw_detections(image, ground_truths):
    # Draw ground truth (green dot)
    for gt in ground_truths:
        x, y = int(gt[1]), int(gt[2])
        cv2.drawMarker(image, (x, y), color=[0, 255, 0], thickness=1, 
        markerType= cv2.MARKER_CROSS, line_type=cv2.LINE_AA,
        markerSize=10)
        overlay = image.copy()
        cv2.circle(overlay, (x, y), 10, (0, 0, 255), -1)  # Filled red circle
        cv2.addWeighted(overlay, 0.3, image, 0.7, 0, image)  # Blend the overlay with less intensity
    
    return image

In [6]:
# Function to read annotations from label file
def read_annotations(label_file):
    with open(label_file, 'r') as file:
        annotations = []
        for line in file.readlines():
            parts = line.strip().split()
            class_id = int(parts[0])
            center_x = float(parts[1])
            center_y = float(parts[2])
            width = float(parts[3])
            height = float(parts[4])
            annotations.append((class_id, center_x, center_y, width, height))
        return annotations

In [7]:
# Function to convert normalized annotations to pixel values
def convert_annotations_to_pixels(annotations, image_width, image_height):
    pixel_annotations = []
    for annotation in annotations:
        class_id, center_x, center_y, width, height = annotation
        center_x = int(center_x * image_width)
        center_y = int(center_y * image_height)
        pixel_annotations.append((class_id, center_x, center_y, width, height))
    return pixel_annotations

In [8]:
# Function to check if prediction is inside the red circle
def is_inside_circle(pred_x, pred_y, gt_x, gt_y, radius=10):
    distance_squared = np.sqrt((pred_x - gt_x)**2 + (pred_y - gt_y)**2)
    return distance_squared <= radius

In [9]:
frame_paths = []
for frame in os.listdir(frames_dir):
    if frame.endswith('.jpg'):
        frame_paths.append(os.path.join(frames_dir, frame))

In [17]:
# Initialize video writer
frame = cv2.imread(frame_paths[0])
height, width, _ = frame.shape
fourcc = cv2.VideoWriter_fourcc(*'XVID')
out = cv2.VideoWriter(output_video_path, fourcc, output_fps, (width, height))
print(height, width)

640 640


In [38]:
# Process each frame
for frame_path in frame_paths:
    # Read frame
    frame = cv2.imread(frame_path)
    
    # Read corresponding label file
    frame_name = os.path.splitext(os.path.basename(frame_path))[0]
    label_file = os.path.join(labels_dir, frame_name) + '.txt'
    
    ground_truths = read_annotations(label_file)
    ground_truths = convert_annotations_to_pixels(ground_truths, width, height)
    
    # Make predictions
    predictions = model.predict(frame)
    
    # Extracting bounding box center coordinates for predictions
    prediction_centers = []
    for pred in predictions[0].boxes:
        if pred.conf > 0.1:  # You can set a threshold for confidence
            bbox = pred.xyxy[0]  # xyxy format
            x_center = (bbox[0] + bbox[2]) / 2
            y_center = (bbox[1] + bbox[3]) / 2
            prediction_centers.append([x_center.item(), y_center.item()])

    # Draw ground truth and predictions on the frame
    annotated_frame = draw_detections(frame, ground_truths)
    cv2.putText(annotated_frame, f'Ground Truth: {ground_truths}' , (10, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 2)
    cv2.putText(annotated_frame, f'Predictions: {prediction_centers}' , (10, 40), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 2)
    # Check true positives and false positives
    for pred in prediction_centers:
        pred_x, pred_y = (pred[0]), (pred[1])
        for gt in ground_truths:
            gt_x, gt_y = (gt[1]), (gt[2])
            
            print("g_x: ", gt_x)
            print('gt_y: ', gt_y)
            print('pred_x: ', pred_x)
            print('pred_y: ', pred_y)


            if is_inside_circle(pred_x, pred_y, gt_x, gt_y, radius=10):
                # True positive (prediction inside the red circle)
                cv2.putText(annotated_frame, 'True Positive', (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 2)
            else:
                # False positive (prediction outside the red circle)
                cv2.putText(annotated_frame, 'False Positive', (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 2)
    
    # Write frame to the output video
    out.write(annotated_frame)
    
    # Display the frame (optional)
    cv2.imshow('Frame', annotated_frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release resources
out.release()
cv2.destroyAllWindows()


0: 640x640 (no detections), 865.6ms
Speed: 11.8ms preprocess, 865.6ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 358.5ms
Speed: 8.2ms preprocess, 358.5ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 323.3ms
Speed: 7.5ms preprocess, 323.3ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 285.0ms
Speed: 7.0ms preprocess, 285.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 291.6ms
Speed: 7.0ms preprocess, 291.6ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 297.1ms
Speed: 7.6ms preprocess, 297.1ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 355.1ms
Speed: 8.1ms preprocess, 355.1ms inference, 2.2ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 516.6ms
Speed: 7.5ms prep

In [20]:
# Process each frame
for frame_path in frame_paths:
    # Read frame
    frame = cv2.imread(frame_path)
    
    # Read corresponding label file
    frame_name = os.path.splitext(os.path.basename(frame_path))[0]
    label_file = os.path.join(labels_dir, frame_name) + '.txt'
    
    ground_truths = read_annotations(label_file)
    ground_truths = convert_annotations_to_pixels(ground_truths, width, height)
    
    # Make predictions
    predictions = model.predict(frame)
    
    # Extracting bounding box center coordinates for predictions
    prediction_centers = []
    for pred in predictions[0].boxes:
        if pred.conf > 0.1:  # You can set a threshold for confidence
            bbox = pred.xyxy[0]  # xyxy format
            x_center = (bbox[0] + bbox[2]) / 2
            y_center = (bbox[1] + bbox[3]) / 2
            prediction_centers.append([x_center.item(), y_center.item()])

    # Draw ground truth and predictions on the frame
    annotated_frame = draw_detections(frame, ground_truths)
    cv2.putText(annotated_frame, f'Ground Truth: {ground_truths}', (10, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 2)
    cv2.putText(annotated_frame, f'Predictions: {prediction_centers}', (10, 40), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 2)

    # Track which ground truths have been matched
    matched_ground_truths = set()
    
    # Check true positives and false positives
    for pred in prediction_centers:
        pred_x, pred_y = pred[0], pred[1]
        matched = False
        for i, gt in enumerate(ground_truths):
            gt_x, gt_y = gt[1], gt[2]
            
            print("g_x: ", gt_x)
            print('gt_y: ', gt_y)
            print('pred_x: ', pred_x)
            print('pred_y: ', pred_y)

            if is_inside_circle(pred_x, pred_y, gt_x, gt_y, radius=10):
                # True positive (prediction inside the red circle)
                cv2.putText(annotated_frame, 'True Positive', (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 2)
                matched_ground_truths.add(i)
                matched = True
                break
        if not matched:
            # False positive (prediction outside the red circle)
            cv2.putText(annotated_frame, 'False Positive', (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 2)

    # Check for false negatives (ground truths with no matching predictions)
    for i, gt in enumerate(ground_truths):
        if i not in matched_ground_truths:
            gt_x, gt_y = gt[1], gt[2]
            cv2.putText(annotated_frame, 'False Negative', (10,60), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2)

    # Write frame to the output video
    out.write(annotated_frame)
    
    # Display the frame (optional)
    cv2.imshow('Frame', annotated_frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release resources
out.release()
cv2.destroyAllWindows()



0: 640x640 (no detections), 342.7ms
Speed: 7.8ms preprocess, 342.7ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 496.8ms
Speed: 20.0ms preprocess, 496.8ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 359.2ms
Speed: 12.0ms preprocess, 359.2ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 372.1ms
Speed: 89.0ms preprocess, 372.1ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 275.6ms
Speed: 6.0ms preprocess, 275.6ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 254.9ms
Speed: 6.0ms preprocess, 254.9ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 303.3ms
Speed: 8.1ms preprocess, 303.3ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 306.3ms
Speed: 8.0ms pr

In [34]:
# Process each frame
all_frames_metrics = []

for frame_path in frame_paths:
    # Read frame
    frame = cv2.imread(frame_path)
    
    # Read corresponding label file
    frame_name = os.path.splitext(os.path.basename(frame_path))[0]
    label_file = os.path.join(labels_dir, frame_name) + '.txt'
    
    ground_truths = read_annotations(label_file)
    ground_truths = convert_annotations_to_pixels(ground_truths, width, height)
    
    # Make predictions
    predictions = model.predict(frame)
    
    # Extracting bounding box center coordinates for predictions
    prediction_centers = []
    prediction_confidences = []
    for pred in predictions[0].boxes:
        if pred.conf > 0.1:  # You can set a threshold for confidence
            bbox = pred.xyxy[0]  # xyxy format
            x_center = (bbox[0] + bbox[2]) / 2
            y_center = (bbox[1] + bbox[3]) / 2
            prediction_centers.append([x_center, y_center])
            prediction_confidences.append(pred.conf)
    
    # Draw ground truth and predictions on the frame
    annotated_frame = draw_detections(frame, ground_truths, prediction_centers)
    
    # Calculate true positives, false positives, and false negatives
    TP, FP, FN = 0, 0, 0
    TP_confidence, FP_confidence = 0.0, 0.0
    
    matched_predictions = set()
    for gt in ground_truths:
        gt_x, gt_y = int(gt[1]), int(gt[2])
        matched = False
        for idx, (pred_x, pred_y) in enumerate(prediction_centers):
            if is_inside_circle(pred_x, pred_y, gt_x, gt_y, radius=10):
                if idx not in matched_predictions:
                    TP += 1
                    TP_confidence += prediction_confidences[idx]
                    matched_predictions.add(idx)
                    matched = True
                    break
        if not matched:
            FN += 1

    for idx in range(len(prediction_centers)):
        if idx not in matched_predictions:
            FP += 1
            FP_confidence += prediction_confidences[idx]
    
    f1_score = (2 * TP_confidence) / (2 * TP_confidence + FP_confidence + FN) if (2 * TP_confidence + FP_confidence + FN) > 0 else 0

    # Annotate the frame with the metrics
    cv2.putText(annotated_frame, f'TP: {TP}', (10, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
    cv2.putText(annotated_frame, f'FP: {FP}', (10, 40), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2)
    cv2.putText(annotated_frame, f'FN: {FN}', (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2)
    # cv2.putText(annotated_frame, f'F1-score: {f1_score:.2f}', (10, 80), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 2)
    
    # Store the metrics for this frame
    all_frames_metrics.append((TP, FP, FN, f1_score))
    
    # Write frame to the output video
    out.write(annotated_frame)
    
    # Display the frame (optional)
    cv2.imshow('Frame', annotated_frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release resources
out.release()
cv2.destroyAllWindows()

# Calculate and display overall metrics (optional)
# overall_TP_confidence = sum([TP_confidence for _, _, _, TP_confidence in all_frames_metrics])
# overall_FP_confidence = sum([FP_confidence for _, _, _, FP_confidence in all_frames_metrics])
# overall_FN = sum([FN for _, _, FN, _ in all_frames_metrics])
# overall_f1_score = (2 * overall_TP_confidence) / (2 * overall_TP_confidence + overall_FP_confidence + overall_FN) if (2 * overall_TP_confidence + overall_FP_confidence + overall_FN) > 0 else 0

# print(f'Overall TP confidence: {overall_TP_confidence}')
# print(f'Overall FP confidence: {overall_FP_confidence}')
# print(f'Overall FN: {overall_FN}')
# print(f'Overall F1-score: {overall_f1_score:.2f}')




0: 640x640 (no detections), 377.6ms
Speed: 9.0ms preprocess, 377.6ms inference, 1.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 1345.2ms
Speed: 9.7ms preprocess, 1345.2ms inference, 1.8ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 293.3ms
Speed: 8.8ms preprocess, 293.3ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 325.9ms
Speed: 12.0ms preprocess, 325.9ms inference, 1.6ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 276.0ms
Speed: 8.0ms preprocess, 276.0ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 284.7ms
Speed: 7.7ms preprocess, 284.7ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 401.6ms
Speed: 9.0ms preprocess, 401.6ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 280.2ms
Speed: 5.8ms pre

TypeError: unsupported format string passed to Tensor.__format__

In [24]:
# visualizing the detections only to get an idea how is the model actually performing!
# Process each frame
for frame_name in os.listdir(frames_dir):
    frame_path = os.path.join(frames_dir, frame_name)
    frame = cv2.imread(frame_path)

    # Make detections
    results = model(frame)

    # Draw bounding boxes and labels on the frame
    for result in results:
        for box in result.boxes:
            # Extract coordinates
            x1, y1, x2, y2 = map(int, box.xyxy[0])
            confidence = box.conf[0]
            label = f"{box.cls[0]} {confidence:.2f}"

            # Draw rectangle and label on the frame
            cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 0, 0), 2)
            cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2)

    # Display the resulting frame
    cv2.imshow('Frame', frame)

    # Press Q on keyboard to exit
    if cv2.waitKey(25) & 0xFF == ord('q'):
        break

# Release OpenCV windows and finish
cv2.destroyAllWindows()



0: 640x640 (no detections), 300.9ms
Speed: 12.2ms preprocess, 300.9ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 526.2ms
Speed: 92.3ms preprocess, 526.2ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 373.2ms
Speed: 9.4ms preprocess, 373.2ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 289.4ms
Speed: 8.0ms preprocess, 289.4ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 272.8ms
Speed: 6.0ms preprocess, 272.8ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 309.0ms
Speed: 8.0ms preprocess, 309.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 288.3ms
Speed: 8.3ms preprocess, 288.3ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 284.8ms
Speed: 9.0ms pre

In [None]:
results = model.predict('D:\Datasets\DAta_intern\V8T3\images\\frame_100283_jpg.rf.47d7eac528b605db604a8f6202517ba8.jpg')


image 1/1 D:\Datasets\DAta_intern\V8T3\images\frame_100283_jpg.rf.47d7eac528b605db604a8f6202517ba8.jpg: 640x640 1 ball, 340.9ms
Speed: 8.1ms preprocess, 340.9ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 640)


In [None]:
results

[ultralytics.engine.results.Results object with attributes:
 
 boxes: ultralytics.engine.results.Boxes object
 keypoints: None
 masks: None
 names: {0: 'ball'}
 obb: None
 orig_img: array([[[  2,   2,   2],
         [  0,   0,   0],
         [  1,   1,   1],
         ...,
         [  0,   0,   4],
         [  0,   0,   4],
         [  0,   1,   5]],
 
        [[  0,   0,   0],
         [  0,   0,   0],
         [  0,   0,   0],
         ...,
         [  0,   0,   3],
         [  0,   0,   4],
         [  0,   0,   3]],
 
        [[166, 166, 166],
         [161, 161, 161],
         [159, 159, 159],
         ...,
         [149, 149, 155],
         [150, 149, 158],
         [150, 150, 156]],
 
        ...,
 
        [[116, 116, 116],
         [110, 112, 112],
         [105, 110, 109],
         ...,
         [133, 148, 157],
         [129, 144, 153],
         [121, 136, 145]],
 
        [[  7,   5,   4],
         [  5,   3,   2],
         [  1,   2,   0],
         ...,
         [  0,   0, 

In [None]:
results[0]

ultralytics.engine.results.Results object with attributes:

boxes: ultralytics.engine.results.Boxes object
keypoints: None
masks: None
names: {0: 'ball'}
obb: None
orig_img: array([[[  2,   2,   2],
        [  0,   0,   0],
        [  1,   1,   1],
        ...,
        [  2,   1,   5],
        [  2,   1,   5],
        [  3,   2,   6]],

       [[  0,   0,   0],
        [  0,   0,   0],
        [  0,   0,   0],
        ...,
        [  0,   0,   3],
        [  0,   0,   4],
        [  0,   0,   3]],

       [[166, 166, 166],
        [161, 161, 161],
        [159, 159, 159],
        ...,
        [152, 150, 156],
        [152, 149, 158],
        [153, 151, 157]],

       ...,

       [[109, 117, 117],
        [105, 113, 113],
        [101, 109, 109],
        ...,
        [141, 153, 165],
        [137, 149, 161],
        [129, 141, 153]],

       [[  0,   5,   4],
        [  0,   4,   3],
        [  0,   3,   2],
        ...,
        [  0,   0,   8],
        [  0,   0,   6],
        [  0,  

In [None]:
results[0].boxes

ultralytics.engine.results.Boxes object with attributes:

cls: tensor([0.])
conf: tensor([0.4162])
data: tensor([[399.0138, 117.2492, 405.4528, 127.3184,   0.4162,   0.0000]])
id: None
is_track: False
orig_shape: (640, 640)
shape: torch.Size([1, 6])
xywh: tensor([[402.2333, 122.2838,   6.4390,  10.0692]])
xywhn: tensor([[0.6285, 0.1911, 0.0101, 0.0157]])
xyxy: tensor([[399.0138, 117.2492, 405.4528, 127.3184]])
xyxyn: tensor([[0.6235, 0.1832, 0.6335, 0.1989]])

In [None]:
print(results[0].boxes[0].xyxy[0])

tensor([399.0138, 117.2492, 405.4528, 127.3184])
