# Assignment 2 Object Detection [Video Generation]
Contains code for loading the trained model, reading frames from a video file, and applying the object detection model to detect the locations of the drinks in each frame along with their corresponding classes.

**NOTES:** Because Kaggle notebooks cannot access a live camera video feed, reading frames from a video file (.mp4) is used as the workaround for the demo submission. The generated demo file has a 640x480 resolution at 30 frames per second.

## 1. Setup and import dependencies
Update the torchvision and torch libraries, since some newly added features are not available in the preinstalled versions. Install pycocotools for the vision reference code blocks.

In [None]:
!pip install torchvision -U
!pip install torch -U
!pip install pycocotools

In [1]:
import torch
import numpy as np
import label_utils
from torch.utils.data import DataLoader
from torchvision import transforms
from PIL import Image
import cv2

In [2]:
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
def create_model(num_classes, pretrained=True):
    """Build a Faster R-CNN (ResNet-50 FPN) detector with a custom box head.

    Args:
        num_classes: Number of outputs of the replacement box predictor.
            NOTE(review): torchvision detection heads normally count the
            background as one of the classes -- confirm that the class list
            this is derived from includes it.
        pretrained: Forwarded unchanged to torchvision's
            ``fasterrcnn_resnet50_fpn``. Defaults to ``True`` (the previous
            hard-coded behaviour). Pass ``False`` when a full checkpoint is
            loaded with ``load_state_dict`` right afterwards, so the
            pre-trained weights do not have to be fetched first.

    Returns:
        The detection model whose ``roi_heads.box_predictor`` has been
        replaced by a new ``FastRCNNPredictor`` sized for ``num_classes``.
    """
    # Base Faster R-CNN model (optionally with pre-trained weights).
    model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=pretrained)

    # The new head must accept the same feature size as the one it replaces.
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
    return model

In [3]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Checkpoint holding the trained detector weights.
od_trained_model = "../input/ee298z-hw2-od-weights/adulay-fasterrcnn_resnet50_fpn-1651304089.3776634.pth"
num_classes = len(label_utils.params["classes"])

# Build the network, move it to the selected device, then restore the
# trained weights (mapped onto that same device).
model = create_model(num_classes=num_classes)
model = model.to(device)
model.load_state_dict(torch.load(od_trained_model, map_location=device))

# Switch to inference mode (not training mode).
model.eval()

In [4]:
import sys
import cv2
import matplotlib.pyplot as plt

def preprocess_frame(image):
    """Convert an OpenCV BGR frame into a batched float tensor for the model.

    The tensor is placed on the module-level ``device`` (the one the model
    was moved to) instead of unconditionally on CUDA, so the function also
    works on CPU-only hosts.

    Args:
        image: Frame as a NumPy array in BGR channel order with the colour
            channels on the last axis (it is passed to ``cv2.cvtColor`` and
            then transposed with ``(2, 0, 1)``).

    Returns:
        A ``torch.float`` tensor with the colour channels moved to the
        front, values divided by 255 and a leading batch dimension of 1.
    """
    # BGR to RGB, as float32 so the in-place scaling below is possible.
    rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32)
    # Scale the pixel values by 1/255.
    rgb /= 255.0
    # Bring the colour channels to the front; make the result contiguous so
    # it can be wrapped by torch without an extra dtype round trip.
    channels_first = np.ascontiguousarray(np.transpose(rgb, (2, 0, 1)))
    # Wrap as a tensor and move it to the same device as the model.
    tensor = torch.from_numpy(channels_first).to(device)
    # Add the batch dimension.
    return tensor.unsqueeze(0)

def detect_drinks(filename, detection_threshold=0.8, read_from_file=True, image=None, to_plot=True):
    """Detect drinks in an image and draw the labelled boxes onto a copy.

    Args:
        filename: Path of the image to read; only used when
            ``read_from_file`` is true.
        detection_threshold: Minimum score a detection needs in order to be
            drawn. The default of 0.8 was chosen because 0.75 picked up
            background objects as drinks; a higher value (e.g. 0.9) reduces
            those cases further, a lower one captures more.
        read_from_file: When true, load the image from ``filename``;
            otherwise use the array passed as ``image``.
        image: BGR frame to process when ``read_from_file`` is false.
        to_plot: When true, show the annotated image with matplotlib.

    Returns:
        A copy of the input image (BGR) with a red rectangle and a green
        class-name label drawn for every detection that passed the
        threshold. The input array itself is not drawn on.

    Raises:
        FileNotFoundError: ``read_from_file`` is true and OpenCV could not
            read ``filename``.
        ValueError: ``read_from_file`` is false and no ``image`` was given.
    """
    if read_from_file:
        image = cv2.imread(filename)
        # cv2.imread signals failure by returning None instead of raising.
        if image is None:
            raise FileNotFoundError(f"Could not read image file: {filename!r}")
    elif image is None:
        raise ValueError("`image` must be provided when read_from_file is False")

    orig_image = image.copy()
    batch = preprocess_frame(image)

    # Inference only: do not record an autograd graph for every frame.
    with torch.no_grad():
        outputs = model(batch)

    # Bring the detections of the (single) image to the CPU as NumPy arrays.
    detections = outputs[0]
    boxes = detections['boxes'].cpu().numpy()
    scores = detections['scores'].cpu().numpy()
    labels = detections['labels'].cpu().numpy()

    # Apply the same score mask to boxes and labels so that each box is
    # always paired with its own label, whatever order the model returns.
    keep = scores >= detection_threshold
    kept_boxes = boxes[keep].astype(np.int32)
    kept_labels = labels[keep]

    class_names = label_utils.params["classes"]

    # Draw the bounding boxes and write the class name on top of each one.
    for box, label in zip(kept_boxes, kept_labels):
        x1, y1, x2, y2 = (int(v) for v in box)
        cv2.rectangle(orig_image, (x1, y1), (x2, y2), (0, 0, 255), 2)
        cv2.putText(orig_image, class_names[label],
                    (x1, y1 - 5),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0),
                    2, lineType=cv2.LINE_AA)

    # Show the frame whenever asked to, including when nothing was detected.
    if to_plot:
        plt.imshow(cv2.cvtColor(orig_image, cv2.COLOR_BGR2RGB))
        plt.show()

    return orig_image

In [5]:
# Recorded clip that stands in for a live camera feed.
input_video_path = '../input/ee298z-hw2-od-vids/WIN_20220430_16_01_02_Pro.mp4'
vid = cv2.VideoCapture(input_video_path)

In [6]:
import time

# Output format of the demo video.
frame_size = (640, 480)
frames_per_second = 30

# The timestamp in the file name keeps earlier demo files from being
# overwritten when the cell is run again.
out_vid = cv2.VideoWriter(f'demo-{time.time()}.mp4',
                          cv2.VideoWriter_fourcc('m', 'p', '4', 'v'),
                          frames_per_second,
                          frame_size,
                          isColor=True)

try:
    while vid.isOpened():
        ret, frame = vid.read()
        if not ret:
            # No more frames (or the read failed): stop processing.
            break

        detected_frame = detect_drinks(None,
                                       detection_threshold=0.9,
                                       read_from_file=False,
                                       image=frame,
                                       to_plot=False)
        # Resize each annotated frame to the size the writer was opened with.
        detected_frame = cv2.resize(detected_frame, frame_size,
                                    interpolation=cv2.INTER_CUBIC)
        out_vid.write(detected_frame)
finally:
    # Always release both handles, even if detection fails part-way, so the
    # frames written so far are flushed and the input file is closed.
    vid.release()
    out_vid.release()

Optional: Run the following code block to generate a download link for the demo video, for cases when the output panel in Kaggle is unresponsive.

In [7]:
import os
from IPython.display import FileLink
import glob

os.chdir(r'/kaggle/working')

# The demo file name contains the time at which it was generated, so look up
# the most recently modified demo file instead of editing the name by hand
# after every run. The original hard-coded name is kept as a fallback for
# when no matching file is found.
demo_candidates = sorted(glob.glob("./demo-*.mp4"), key=os.path.getmtime)
demo_path = demo_candidates[-1] if demo_candidates else r"./demo-1651323358.4610178.mp4"

# Must stay the last expression of the cell so the link is displayed.
FileLink(demo_path)