## Import Libraries

In [1]:
import os
import matplotlib.pyplot as plt
import numpy as np
import cv2
import time

from ultralytics import YOLO
import torch
from segment_anything import sam_model_registry, SamPredictor, SamAutomaticMaskGenerator


## Path and model declaration

In [8]:
# Input clip, looked up under ./src relative to the current working directory.
video_path = os.path.join(os.getcwd(), 'src', 'classroom.mp4')

# Folder that holds the SAM weights, followed by the checkpoint file name
# for each available ViT backbone.
sam_checkpoints = "checkpoints"
vit_b = "sam_vit_b_01ec64.pth"
vit_l = "sam_vit_l_0b3195.pth"
vit_h = "sam_vit_h_4b8939.pth"

# Fall back to the CPU unless CUDA is usable.
device = torch.device("cpu")
if torch.cuda.is_available():
    device = torch.device("cuda")

In [10]:
## yolo model
# Build the detector first and move it afterwards: this avoids relying on the
# return value of `.to()`.
# NOTE(review): older ultralytics releases appear to move the weights in place
# and return None from `.to()` — confirm against the installed version.
yolo_model = YOLO('yolov8n.pt')
yolo_model.to(device)

## sam model
# `model_type` selects the SAM architecture from the registry and must match the
# checkpoint file that is loaded (here: the ViT-L backbone with the `vit_l` weights).
model_type = "vit_l"
sam = sam_model_registry[model_type](checkpoint=os.path.join(sam_checkpoints, vit_l))
sam = sam.to(device)
# SamPredictor wraps the model for prompt-based (box / point) mask prediction.
predictor = SamPredictor(sam)

In [1]:
def process_frame(frame):
    """Detect objects of class 0 with YOLO and segment each detection with SAM.

    Parameters
    ----------
    frame : numpy.ndarray
        Image as produced by ``cv2.VideoCapture.read`` (OpenCV decodes frames
        in BGR channel order).

    Returns
    -------
    torch.Tensor
        The masks returned by ``predictor.predict_torch`` for the detected
        boxes (one mask per box, since ``multimask_output=False``). When
        nothing is detected, an empty boolean tensor of shape
        ``(0, 1, H, W)`` on the predictor's device is returned instead.
    """
    # classes=[0] restricts detection to class id 0
    # (presumably "person" for the stock COCO-trained yolov8n weights — verify).
    results = yolo_model(frame, conf=0.25, classes=[0])

    height, width = frame.shape[:2]

    # A single input image yields a single result entry; guard against an
    # empty result list or a result without boxes instead of relying on a
    # loop variable that may never be bound.
    bbox = None
    if len(results) > 0 and results[0].boxes is not None:
        bbox = results[0].boxes.xyxy
        print('bbox shape: ', bbox.shape)

    # No detections: skip the (expensive) SAM image encoder and do not feed an
    # empty prompt batch to the mask decoder.
    if bbox is None or bbox.shape[0] == 0:
        return torch.zeros((0, 1, height, width), dtype=torch.bool, device=predictor.device)

    # The frame comes from OpenCV and is therefore BGR; set_image assumes RGB
    # unless told otherwise, so declare the format and let SAM reorder channels.
    predictor.set_image(frame, image_format="BGR")

    # Boxes must live on the SAM device and be rescaled to SAM's input resolution.
    input_boxes = bbox.to(predictor.device)
    transformed_boxes = predictor.transform.apply_boxes_torch(input_boxes, (height, width))

    masks, _, _ = predictor.predict_torch(
        point_coords=None,
        point_labels=None,
        boxes=transformed_boxes,
        multimask_output=False,
    )

    return masks

In [2]:
def optimized_mask2img(mask):
    """Turn a stack of masks into a stack of colour images.

    ``mask`` is indexed as (item, row, col). The result is a uint8 array of
    shape (items, rows, cols, 3) in which every channel holds the mask
    multiplied by the matching component of palette entry 1.
    """
    palette = {
        0: (0, 0, 0),
        1: (255, 0, 0),
        2: (0, 255, 0),
        3: (0, 0, 255),
        4: (0, 255, 255),
    }
    # Only palette entry 1 is used for colouring.
    colour = palette[1]

    n_items, n_rows, n_cols = mask.shape[0], mask.shape[1], mask.shape[2]
    image = np.zeros((n_items, n_rows, n_cols, 3), dtype=np.uint8)

    # Fill one colour channel at a time.
    for channel, intensity in enumerate(colour):
        image[..., channel] = mask * intensity

    return image

def optimized_show_mask(masks):
    """Merge per-object masks into a single colour overlay image.

    Parameters
    ----------
    masks : array-like or torch.Tensor
        Masks with a singleton axis 1 (that axis is squeezed away). Torch
        tensors on any device are accepted.

    Returns
    -------
    numpy.ndarray
        The per-mask colour images summed over the first axis, with values
        capped at 255 so overlapping masks do not wrap around when the caller
        casts the result to uint8.
    """
    # A tensor that lives on the GPU cannot be written into a NumPy array, so
    # bring tensors to host memory before any NumPy processing.
    if hasattr(masks, "detach"):
        masks = masks.detach().cpu().numpy()

    masks = np.squeeze(masks, axis = 1)
    separate_rgb_masks = optimized_mask2img(masks)
    combined_mask = np.sum(separate_rgb_masks, axis = 0)

    # Pixels covered by several masks add up beyond the 8-bit range; saturate.
    return np.minimum(combined_mask, 255)


In [3]:


# Run detection + segmentation on every frame of the video and preview the
# result (mask overlay plus mask contours) in an OpenCV window. Press 'q' to stop.
cap = cv2.VideoCapture(video_path)

if not cap.isOpened():
    print("Error in loading the video")

i = 0
try:
    while cap.isOpened():
        ret, frame = cap.read()
        # read() reports ret=False when no frame could be grabbed
        # (e.g. end of the video); stop instead of processing an invalid frame.
        if not ret:
            break

        start = time.time()
        masks = process_frame(frame)
        end = time.time()

        print("1. Time taken for frame {} is {}".format(i, end-start))

        start = time.time()
        colour_mask = optimized_show_mask(masks)
        end = time.time()

        print("2. Time taken for frame {} is {}".format(i, end-start))

        # display frame and colour mask in the same window
        start = time.time()
        # Stretch the frame so its brightest value maps to 255; an all-black
        # frame (max == 0) is left untouched to avoid dividing by zero.
        peak = np.max(frame)
        if peak > 0:
            frame = ((frame/peak)*255).astype(np.uint8)
        # Blend 30% mask colour with 70% frame.
        colour_mask = cv2.addWeighted(colour_mask.astype(np.uint8), 0.3, frame, 0.7, 0)

        #-----------for contours -------
        masks = np.squeeze(masks.detach().cpu().numpy(), axis = 1).astype(np.uint8)
        print('masks shape: ', masks.shape, masks.shape[0], np.unique(masks))
        for dim in range(masks.shape[0]):
            print('in shape: ', masks[dim, :, :].shape)
            contours, hierarchy = cv2.findContours(image = masks[dim, :, :], mode = cv2.RETR_TREE, method = cv2.CHAIN_APPROX_NONE)
            # Draw on the image that is actually shown, otherwise the outlines
            # are never visible.
            cv2.drawContours(image = colour_mask, contours=contours, contourIdx=-1, color=(0, 255, 0), thickness=1, lineType=cv2.LINE_AA)

        cv2.imshow('frame', colour_mask)
        end = time.time()
        print("3. Time taken for frame {} is {}".format(i, end-start))

        if cv2.waitKey(25) & 0xFF == ord('q'):
            break

        i = i + 1
finally:
    # Always free the capture handle and close the preview window, even when
    # a frame fails to process.
    cap.release()
    cv2.destroyAllWindows()

NameError: name 'cv2' is not defined