## Practice : People detector and analyzer

1. Input images from wiiplay.mp4 with frame number between 41000 and 44000.
2. Use YOLOv8 to detect people, mark each one with a red rectangle, and count how many persons are in each frame. (hint: check label == 'person')
3. Try to find out which frame contains the largest number of persons. (print the number of persons on the upper-left corner)
4. (optional) Try to find out which frame contains the largest person. (print the size of its bounding box on the upper-left corner)
5. (optional) Try to find out which frame contains the smallest person. (print the size of its bounding box on the upper-left corner)
6. Show the three output frames you found.
7. Verify the correctness of your output, then adjust the desired confidence threshold for improvement. 
8. Upload your Jupyter code file (*.ipynb)

In [1]:
import cv2
import numpy as np
from ultralytics import YOLO

In [2]:
# Create the YOLO model from the 'yolov8n.pt' file; the frame loop below
# calls model.predict() on every frame it reads from the video.
model = YOLO('yolov8n.pt')

In [7]:
# Scan a range of video frames with YOLO, draw a red box around every detected
# person, and remember which frame holds the most persons, the largest person
# box and the smallest person box.

# Load the video
cap = cv2.VideoCapture('WiiPlay.mp4')
if not cap.isOpened():
    # Fail loudly instead of silently reporting "None" results at the end.
    raise IOError("Could not open video file 'WiiPlay.mp4'")

# Best-so-far records across all processed frames:
# the frame number with the most people and the number of people in it,
most_people_frame = None
most_people_count = 0

# the frame number with the largest person box and that box's area (pixels),
largest_person_frame = None
largest_person_size = 0

# the frame number with the smallest person box and that box's area (pixels).
smallest_person_frame = None
smallest_person_size = float('inf')

# Inclusive range of frame numbers to analyse
start_frame = 41000
end_frame = 44000

# Number of the frame that the next cap.read() returns
current_frame_number = start_frame

cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)

try:
    while current_frame_number <= end_frame:
        # Read the frame; stop at the end of the video or on a read error
        ret, img = cap.read()
        if not ret:
            break

        # Flip the image horizontally (mirror view); detection and all
        # drawing below operate on the mirrored frame.
        img = cv2.flip(img, 1)

        # Run the YOLOv8 model on the frame with the chosen confidence threshold
        results = model.predict(img, conf=0.1119)

        # Per-frame statistics
        person_count = 0
        frame_largest_size = 0
        frame_smallest_size = float('inf')

        # (left, top, width, height, size) for every person in this frame
        bounding_boxes = []

        # Loop through the results and draw the bounding boxes on the image
        for result in results:
            for box in result.boxes:
                label = result.names[int(box.cls)]

                # Only persons are counted and drawn
                if label != 'person':
                    continue

                person_count += 1

                # Convert the coordinates to plain Python ints.  A 16-bit
                # unsigned type would wrap around in width * height for any
                # box larger than 65535 pixels and in "top - 10" for boxes
                # near the top edge.
                left, top, right, bottom = np.array(
                    box.xyxy.cpu(), dtype=np.int64
                ).squeeze().tolist()

                # Width, height and area of the bounding box
                width = right - left
                height = bottom - top
                size = width * height
                bounding_boxes.append((left, top, width, height, size))

                # Update the largest and smallest box of this frame
                if size > frame_largest_size:
                    frame_largest_size = size
                if size < frame_smallest_size:
                    frame_smallest_size = size

                # Draw the bounding box in red (OpenCV colours are BGR)
                cv2.rectangle(img, (left, top), (right, bottom), (0, 0, 255), 2)

                # Put the label just above the box, clamped so that it stays
                # inside the image for boxes touching the top edge
                label_y = max(top - 10, 25)
                cv2.putText(img, label, (left, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (36, 255, 12), 2)

        # Update most people count frame
        if person_count > most_people_count:
            most_people_count = person_count
            most_people_frame = current_frame_number

        # Update largest person frame
        if frame_largest_size > largest_person_size:
            largest_person_size = frame_largest_size
            largest_person_frame = current_frame_number

        # Update smallest person frame (never triggers for frames without
        # persons because frame_smallest_size is still infinity)
        if frame_smallest_size < smallest_person_size:
            smallest_person_size = frame_smallest_size
            smallest_person_frame = current_frame_number

        # Print the number of persons on the upper-left corner
        cv2.putText(img, f'Persons: {person_count}', (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

        # Print the size of each bounding box below the person count
        for i, (_, _, _, _, size) in enumerate(bounding_boxes):
            cv2.putText(img, f'Size {i+1}: {size}', (10, 60 + i * 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)

        text = f'Current_Frame: {current_frame_number}'

        # Measure the text so that it can be right-aligned
        text_size = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 1, 2)[0]

        # 10 pixels from the right edge
        text_x = img.shape[1] - text_size[0] - 10

        # 30 pixels from the top edge
        text_y = 30

        # Print the frame number on the upper-right corner
        cv2.putText(img, text, (text_x, text_y), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

        # Show the image
        cv2.imshow('Result', img)

        # Break the loop if 'q' is pressed
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

        # Advance only after the frame has been fully processed, so that the
        # recorded and displayed numbers refer to the frame actually analysed
        current_frame_number += 1
finally:
    # Release the video capture and close the windows, even on errors
    cap.release()
    cv2.destroyAllWindows()

# Print the frame number with the most people and the number of people in that
# frame, the frame number with the largest person and its size, and the frame
# number with the smallest person and its size.
print(f'Most people in frame: {most_people_frame} with {most_people_count} persons')
print(f'Largest person in frame: {largest_person_frame} with size {largest_person_size}')
print(f'Smallest person in frame: {smallest_person_frame} with size {smallest_person_size}')



0: 384x640 29 persons, 27.4ms
Speed: 11.9ms preprocess, 27.4ms inference, 2.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 17 persons, 1 fire hydrant, 27.1ms
Speed: 3.1ms preprocess, 27.1ms inference, 18.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 17 persons, 28.2ms
Speed: 3.8ms preprocess, 28.2ms inference, 3.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 19 persons, 24.6ms
Speed: 3.7ms preprocess, 24.6ms inference, 2.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 17 persons, 2 teddy bears, 26.2ms
Speed: 3.0ms preprocess, 26.2ms inference, 2.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 13 persons, 4 teddy bears, 24.7ms
Speed: 2.7ms preprocess, 24.7ms inference, 2.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 10 persons, 3 teddy bears, 24.3ms
Speed: 2.9ms preprocess, 24.3ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 17 persons, 3 teddy bears, 24