## Practice : People detector and analyzer

1. Read frames from WiiPlay.mp4 with frame numbers between 41000 and 44000.
2. Use YOLOv8 to detect people, mark each one with a red rectangle, and count how many people appear in each frame. (hint: check label == 'person')
3. Try to find out which frame contains the largest number of people. (print the number of people in the upper-left corner)
4. (optional) Try to find out which frame contains the largest person. (print the size of its bounding box in the upper-left corner)
5. (optional) Try to find out which frame contains the smallest person. (print the size of its bounding box in the upper-left corner)
6. Show the three output frames you found.
7. Verify the correctness of your output, then adjust the desired confidence threshold for improvement. 
8. Upload your Jupyter code file (*.ipynb)

In [2]:
import cv2
import numpy as np
from ultralytics import YOLO

In [3]:
# Load the YOLOv8 weights once; the detection loop reuses this `model`
# object for every frame.
weights_file = 'yolov8n.pt'
model = YOLO(weights_file)

In [4]:
# Scan a frame range of the video, detect people with YOLO, draw them, and
# remember which frame has the most people, the largest person box and the
# smallest person box. Press 'q' in the preview window to stop early.
cap = cv2.VideoCapture('WiiPlay.mp4')

most_people_frame = None
most_people_count = 0

largest_person_frame = None
largest_person_size = 0

smallest_person_frame = None
smallest_person_size = float('inf')

# Define the range of frames to process (both ends inclusive)
start_frame = 41000
end_frame = 44000

# Minimum detection confidence passed to the model. Tune this value when
# verifying the output: higher means fewer but more certain detections.
conf_threshold = 0.95

# Set the frame position to start_frame, so the next read() returns it
cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)

# Track the absolute frame index in the video (not an offset from the seek
# position) so the range check and the reported frame numbers are correct.
current_frame_number = start_frame

try:
    for current_frame_number in range(start_frame, end_frame + 1):
        ret, img = cap.read()
        if not ret:
            # End of video (or the file could not be opened/decoded).
            break

        img = cv2.flip(img, 1)
        results = model.predict(img, conf=conf_threshold)

        person_count = 0
        frame_largest_size = 0
        frame_smallest_size = float('inf')

        for result in results:
            for box in result.boxes:
                # Look the class name up on the result that owns this box.
                label = result.names[int(box.cls)]
                if label != 'person':
                    continue
                person_count += 1

                # Use plain Python ints: fixed-width numpy integers would
                # overflow in width * height for large boxes and can wrap
                # around on subtraction near the image border.
                left, top, right, bottom = (int(v) for v in box.xyxy[0].tolist())
                width = right - left
                height = bottom - top
                size = width * height

                # Update largest and smallest person size in the current frame
                frame_largest_size = max(frame_largest_size, size)
                frame_smallest_size = min(frame_smallest_size, size)

                # Draw rectangle and label on the image (red in BGR)
                cv2.rectangle(img, (left, top), (right, bottom), (0, 0, 255), 2)
                confidence = float(box.conf.cpu())
                cv2.putText(img, f'{label} {confidence:.2f}', (left + 5, bottom - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 1, cv2.LINE_AA)

        # Update most people count frame
        if person_count > most_people_count:
            most_people_count = person_count
            most_people_frame = current_frame_number

        # Update largest person frame
        if frame_largest_size > largest_person_size:
            largest_person_size = frame_largest_size
            largest_person_frame = current_frame_number

        # Update smallest person frame (stays untouched when the frame has
        # no person, because frame_smallest_size is still infinity)
        if frame_smallest_size < smallest_person_size:
            smallest_person_size = frame_smallest_size
            smallest_person_frame = current_frame_number

        cv2.imshow('Wii Play', img)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the capture and close the preview window, even if
    # inference or drawing raised part-way through the scan.
    cap.release()
    cv2.destroyAllWindows()

print(f'Most people in frame: {most_people_frame} with {most_people_count} persons')
print(f'Largest person in frame: {largest_person_frame} with size {largest_person_size}')
print(f'Smallest person in frame: {smallest_person_frame} with size {smallest_person_size}')



0: 384x640 (no detections), 124.9ms
Speed: 4.2ms preprocess, 124.9ms inference, 932.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 24.7ms
Speed: 2.4ms preprocess, 24.7ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 26.0ms
Speed: 4.6ms preprocess, 26.0ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 27.5ms
Speed: 4.6ms preprocess, 27.5ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 24.3ms
Speed: 3.3ms preprocess, 24.3ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 24.6ms
Speed: 2.8ms preprocess, 24.6ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 24.0ms
Speed: 3.2ms preprocess, 24.0ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 38.2ms
Speed: 15.3ms preprocess, 38.