# Project 2 -- All 5 Categories

In [2]:
from ultralytics import YOLO
from roboflow import Roboflow
import os
import matplotlib.pyplot as plt
from PIL import Image
from shuffle import train_test_split
import pydoc
import cv2

In [3]:

def path_to(*p):
    """
    Build a path rooted at the current working directory.

    The given components are appended, in order, to the directory
    the process is currently running in.

    Parameters:
        *p (str): Zero or more path components (folder or file names).

    Returns:
        (string): The current working directory joined with the given components.
    """
    base_dir = os.getcwd()
    components = (base_dir,) + p
    return os.path.join(*components)

## Download dataset from Roboflow
For this to work you must update with your own API key. This is only necessary if you want to train it yourself.

In [4]:
def download_dataset(v: int, api_key: str = None):
    """
    Download one version of the "project2-dataset" Roboflow project in YOLOv11 format.
    The dataset will be saved under project2-dataset-{v}.

    Parameters:
        v (int): The version number of the dataset that you want to download.
        api_key (str, optional): Roboflow API key. When omitted, the key is read from the
            ROBOFLOW_API_KEY environment variable so that no secret is stored in the notebook.

    Returns:
        None: This function does not return anything.

    Raises:
        ValueError: If no API key is passed and ROBOFLOW_API_KEY is not set.
    """
    # Credentials must never be committed with the notebook; take them from the caller
    # or from the environment instead.
    if api_key is None:
        api_key = os.environ.get("ROBOFLOW_API_KEY")
    if not api_key:
        raise ValueError(
            "No Roboflow API key: pass api_key=... or set the ROBOFLOW_API_KEY environment variable."
        )

    rf = Roboflow(api_key=api_key)
    project = rf.workspace("pikavision").project("project2-dataset")
    project.version(v).download("yolov11")


download_dataset(3)

loading Roboflow workspace...
loading Roboflow project...


Downloading Dataset Version Zip in project2-dataset-3 to yolov11:: 100%|██████████| 464327/464327 [00:49<00:00, 9325.94it/s] 





Extracting Dataset Version Zip to project2-dataset-3 in yolov11:: 100%|██████████| 19466/19466 [00:01<00:00, 11528.37it/s]


### Split the data
This splits the data into train, valid, and test datasets at random. 

85% train, 10% validation, 5% test

In [7]:
# Split data into train, validation and test datasets (5% test, 10% validation)
train_test_split(path_to('project2-dataset-3', 'train'), percentage_test=5, percentage_valid=10)

Total images in src directory: 8319
Abyssinian_149_jpg.rf.7654b20d662a40fe280fa6cc45e2c1af
image-91-_jpg.rf.7fbec487ac98ce210fe7c546591c86e5
image-425-_jpg.rf.17131544f5de196f41187d529e4d1d12
image-114-_jpg.rf.855707d6aecfee5f72cef7a6c46dc8cb
pug_168_jpg.rf.3c728ee2666cc33c6254e5169d89562c
wheaten_terrier_184_jpg.rf.88e4ff19afc4339f597c48f285b34c4d
1-517-_jpeg_jpg.rf.a334b4617635447747435bcb6271f9f6
image-801-_jpg.rf.47d73f70672d111c7a74bd9ebb314f98
pic_650_jpg.rf.f3a59982bca63e3753bbf8b2d8f3a0bd
yorkshire_terrier_165_jpg.rf.2ae5ab9819c4d4f7a2b38c0e0cf86c9a
Birman_17_jpg.rf.2906624d776b50f67609a58b5ffbe25b
image-139-_jpg.rf.5d78d25a57243e3091b45e91fc8888c5
pikachu_00126_jpg.rf.0108e8922715033004f69150c7cc2590
Bombay_186_jpg.rf.f7588545f7e9d3a2f9f0e2dec1c9338b
tello_30_jpeg_jpg.rf.2af1636cc4bc24d0f7b7c728a5f40e0b
american_pit_bull_terrier_128_jpg.rf.34ad5723aa191a34f27515b1f2b4d758
image-423-_jpg.rf.555042d645c06693ba62d3fb6c323e93
wheaten_terrier_109_jpg.rf.7a1aa356af5742eaa6b39b0e69e2

## Test Model Function

Takes the model and a sample image, and saves a new photo with the prediction(s) and bounding boxes drawn on it.

In [4]:
def test_model(model: YOLO, sample_image: str):
    """
    Run the model on one image from the test_images folder and show the annotated result.

    The annotated image is saved in the test_images folder as
    'the_detected_<sample_image>' and then displayed using matplotlib.

    Parameters:
        model (YOLO): YOLO model that you want to make a prediction with.
        sample_image (str): File name of the sample image inside test_images. Can be .png or .jpg

    Returns:
        None: This function does not return any value.
    """
    path_to_img = path_to('test_images', sample_image)
    # The output name does not depend on the individual result, so build it once.
    path_to_predicted_img = path_to('test_images', 'the_detected_' + sample_image)

    results = model.predict(
        source=path_to_img,
        conf=0.1,
    )

    if not results:
        # Without a result nothing was saved, so there is no image to open below.
        print("Error: No prediction results for " + sample_image)
        return

    for r in results:
        r.save(path_to_predicted_img)

    # The context manager closes the image file once it has been drawn.
    with Image.open(path_to_predicted_img) as image:
        plt.imshow(image)
        plt.axis('off')
        plt.show()
        

In [5]:
def test_model_video(model: YOLO, video_name: str):
    """
    Run the YOLO model on every frame of a video from the test_vids folder.

    Each frame is annotated with the model's predictions and written to a new video,
    saved in the test_vids folder as 'the_detected_<video_name>'.

    Parameters:
        model (YOLO): YOLO model that you want to make predictions with.
        video_name (str): File name of the video inside test_vids. Can be .mp4, .avi, etc.

    Returns:
        None: This function does not return any value.
    """
    # Open the video using OpenCV
    path_to_video = path_to('test_vids', video_name)
    cap = cv2.VideoCapture(path_to_video)

    if not cap.isOpened():
        # Same reporting style as live_detection: report and stop instead of
        # silently producing an empty output video.
        print("Error: Could not open video " + path_to_video)
        cap.release()
        return

    out = None
    try:
        # Get the video frame width, height, and frame rate so the output matches the input
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = cap.get(cv2.CAP_PROP_FPS)

        # Prepare the writer for the processed video
        output_video_path = path_to('test_vids', 'the_detected_' + video_name)
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # No more frames (or a read error): stop processing
                break

            # Apply YOLO prediction on the current frame
            results = model.predict(
                source=frame,  # Feed the frame directly
                conf=0.1
            )

            # Write the frame with the predictions drawn on it
            out.write(results[0].plot())
    finally:
        # Release resources even if prediction fails part-way through the video
        cap.release()
        if out is not None:
            out.release()

## Train the model 
Takes a model and new data, and uses "Transfer Learning" to add or improve categories based on the new data.

In [None]:
def train_model(model: YOLO, yaml_file: str, device=None) -> None:
    """
    Takes the model and a .yaml file and trains the model on the dataset specified in the .yaml file.
    The .yaml file is looked up inside the project2-dataset-3 folder.
    Saves the results of the training to runs/detect

    Parameters:
        model (YOLO): The pretrained model that you want to train on top of.
        yaml_file (str): The .yaml file that specifies the data that you want to train on.
        device (optional): Device passed to Ultralytics, e.g. 0, 'cpu' or 'mps'.
            The default (None) lets Ultralytics choose the device itself.

    Returns:
        None: This function does not return anything.
    """
    # 'gpu' is not one of the device values documented by Ultralytics, so the device
    # is no longer hard-coded; None defers the choice to Ultralytics.
    # NOTE(review): save_dir is passed through unchanged from the original code; confirm
    # that Ultralytics honours it, since later cells load weights from runs/detect.
    model.train(
        data=path_to('project2-dataset-3', yaml_file),
        epochs=100,
        imgsz=640,
        save_dir=path_to(),
        device=device,
        batch=16,
    )
    

### Load pretrained model
This loads the YOLOv11 model as a starting point

In [9]:
# Create the starting-point model from the 'yolo11n.pt' weights file
model = YOLO('yolo11n.pt')

In [None]:
# Run the pretrained model over every sample image, one after another
pretrained_samples = (
    'the_cat.png',
    'the_dog.png',
    'the_drone.jpg',
    'the_person.png',
    'the_pikachu.jpeg',
)
for sample_name in pretrained_samples:
    test_model(model, sample_name)

In [8]:
# Train the loaded model on the dataset described by data.yaml
train_model(model, 'data.yaml')

In [None]:
# Load the best weights of the '5_categories_no_aug' run and try them on the sample images
best_weights_path = path_to('runs', 'detect', '5_categories_no_aug', 'weights', 'best.pt')
best_model = YOLO(best_weights_path)
best_model_samples = (
    'the_cat.png',
    'the_dog.png',
    'the_drone.jpg',
    'the_person.png',
    'the_pikachu.jpeg',
    'dog2.jpg',
    'person2.jpg',
)
for sample_name in best_model_samples:
    test_model(best_model, sample_name)

In [6]:
# Load the custom model from the best.pt weights of the '5_categories_no_aug' run
new_model = YOLO(path_to('runs', 'detect', '5_categories_no_aug', 'weights', 'best.pt'))

In [None]:
# Test the custom model on every sample image, one after another
custom_samples = (
    'the_cat.png',
    'the_dog.png',
    'the_drone.jpg',
    'the_person.png',
    'the_pikachu.jpeg',
    'dog2.jpg',
    'person2.jpg',
)
for sample_name in custom_samples:
    test_model(new_model, sample_name)

In [11]:
# Run the custom model over each sample video, one after another
for clip_name in ('pika.mp4', 'drone.mp4', 'dog_cat.mp4'):
    test_model_video(new_model, clip_name)


0: 384x640 3 persons, 48.5ms
Speed: 1.6ms preprocess, 48.5ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 43.9ms
Speed: 1.6ms preprocess, 43.9ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 drone, 3 persons, 42.0ms
Speed: 1.0ms preprocess, 42.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 42.0ms
Speed: 1.0ms preprocess, 42.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 drone, 2 persons, 40.0ms
Speed: 1.0ms preprocess, 40.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 drone, 3 persons, 42.0ms
Speed: 1.0ms preprocess, 42.0ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 drone, 4 persons, 41.0ms
Speed: 2.0ms preprocess, 41.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 41.0ms
Speed: 1.0ms preprocess, 41.0ms inference, 1.

In [14]:
# Validate the custom model on the dataset described by project2-dataset-5/data.yaml
# NOTE(review): train_model reads project2-dataset-3 while this reads project2-dataset-5 -- confirm the version difference is intended
results = new_model.val(data=path_to('project2-dataset-5', 'data.yaml'))

Ultralytics 8.3.34 🚀 Python-3.8.10 torch-2.4.1+cu121 CUDA:0 (NVIDIA GeForce RTX 3060, 12288MiB)


[34m[1mval: [0mScanning /home/hunt3041/PikaVision/project2-dataset-5/valid/labels.cache... 1946 images, 241 backgrounds, 0 corrupt: 100%|██████████| 1946/1946 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 122/122 [00:12<00:00,  9.64it/s]


                   all       1946       2115      0.969      0.925      0.968      0.817
                   cat        254        257      0.991      0.992      0.994      0.929
                   dog        495        495      0.988      0.986      0.994      0.862
                 drone        469        499      0.976      0.971      0.991      0.707
                person        357        728      0.932      0.758      0.895      0.742
               pikachu        130        136      0.958      0.919      0.967      0.847
Speed: 0.2ms preprocess, 3.1ms inference, 0.0ms loss, 0.8ms postprocess per image
Results saved to [1mruns/detect/val[0m


Function Used for Live Detection

In [7]:
def live_detection(model: YOLO):
    """
    Capture live video feed from the laptop camera and use YOLO model for object detection.

    Frames are read from camera 0, run through the model, and shown in a window
    with the detections drawn on them until 'q' is pressed or a frame cannot be read.

    Parameters:
        model (YOLO): The YOLO model that you want to use for live detection.

    Returns:
        None: Displays the live feed with detected objects.
    """
    # Start video capture (0 is the default camera on most systems)
    cap = cv2.VideoCapture(0)

    if not cap.isOpened():
        print("Error: Camera not found.")
        cap.release()
        return

    print("Press 'q' to quit.")

    try:
        while True:
            # Capture frame-by-frame
            ret, frame = cap.read()

            if not ret:
                print("Error: Failed to capture image.")
                break

            # Feed the OpenCV frame to the model directly, as test_model_video does;
            # no conversion to a PIL image is needed.
            results = model(frame)

            # Draw the results on the frame
            annotated_frame = results[0].plot()  # Annotated image with detected objects

            # Display the annotated frame
            cv2.imshow("YOLO Live Detection", annotated_frame)

            # Press 'q' to quit
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Release the camera and close the window even if the loop is interrupted
        # (e.g. the notebook kernel is stopped), so the camera is not left locked.
        cap.release()
        cv2.destroyAllWindows()

Test the Live Detection

In [9]:
# Run live detection with the custom model (press 'q' to quit)
live_detection(new_model)

Press 'q' to quit.

0: 480x640 1 person, 44.0ms
Speed: 2.0ms preprocess, 44.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 48.0ms
Speed: 1.0ms preprocess, 48.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 46.0ms
Speed: 0.0ms preprocess, 46.0ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 45.0ms
Speed: 1.0ms preprocess, 45.0ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 44.0ms
Speed: 1.0ms preprocess, 44.0ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 43.0ms
Speed: 1.0ms preprocess, 43.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 44.0ms
Speed: 2.0ms preprocess, 44.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 45.0ms
Speed: 0.0ms preprocess, 45.0ms inference, 1.0ms postprocess per image