## Import Libarries

In [1]:
import os
import matplotlib.pyplot as plt
import numpy as np
import cv2
import time

from ultralytics import YOLO
import torch
from segment_anything import sam_model_registry, SamPredictor, SamAutomaticMaskGenerator


## path and model declaration

In [8]:
video_path  = os.path.join(os.getcwd(), 'src','classroom.mp4')

## checkpoints for sam
sam_checkpoints = "checkpoints"
vit_h = "sam_vit_h_4b8939.pth"
vit_b = "sam_vit_b_01ec64.pth"
vit_l = "sam_vit_l_0b3195.pth"

## check for device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [10]:
## yolo model
yolo_model = YOLO('yolov8n.pt').to(device)

## sam model
model_type = "vit_l"
sam = sam_model_registry[model_type](checkpoint=os.path.join(sam_checkpoints, vit_l))
sam = sam.to(device)
predictor = SamPredictor(sam)

In [15]:
def process_frame(frame):
    
    results = yolo_model(frame, conf=0.25, classes=[0])
    
    ## Process results
    for result in results:
        boxes = result.boxes
        
    bbox = boxes.xyxy
    #confidences = boxes.conf
    #classes = boxes.cls 
    #predictor = SamPredictor(sam)
    predictor.set_image(frame)
    
    input_boxes = bbox.to(predictor.device)
    transformed_boxes = predictor.transform.apply_boxes_torch(input_boxes, frame.shape[:2])
    
    masks, _, _ = predictor.predict_torch(
    point_coords=None,
    point_labels=None,
    boxes=transformed_boxes,
    multimask_output=False,
    )
    
    return masks

In [44]:

def mask2img(mask):
    palette = {
        0: (0, 0, 0),
        1: (255, 0, 0),
        2: (0, 255, 0),
        3: (0, 0, 255),
        4: (0, 255, 255),
    }
    
    palette_tensor = torch.tensor([palette[x] for x in mask.flatten()], dtype=torch.uint8)
    image = palette_tensor.reshape(mask.shape[0], mask.shape[1], 3)
    return image

def show_mask(masks, random_color=False):
    mask_images = [mask2img(torch.squeeze(mask).cpu().numpy()) for mask in masks]
    combined_mask = torch.sum(torch.stack(mask_images, dim=0), dim=0)
    return combined_mask.detach().cpu().numpy()


In [46]:


cap = cv2.VideoCapture(video_path)

if cap.isOpened() == False:
    print("Error in loading the video")
    
i = 0
while(cap.isOpened()):
    ret, frame = cap.read()
    
    start = time.time()
    masks = process_frame(frame)
    end = time.time()
    
    print("1. Time taken for frame {} is {}".format(i, end-start))
    
    start = time.time()
    colour_mask = show_mask(masks)
    end = time.time()
    
    print("2. Time taken for frame {} is {}".format(i, end-start))
    # frame = frame + colour_mask*0.3
    
    #dispaly frame and colour mask in same window
    start = time.time()
    frame = ((frame/np.max(frame))*255).astype(np.uint8)
    colour_mask = cv2.addWeighted(colour_mask.astype(np.uint8), 0.3, frame, 0.7, 0, colour_mask.astype(np.uint8))
    #cv2.imshow('frame', frame)
    cv2.imshow('frame', colour_mask)
    end = time.time()
    print("3. Time taken for frame {} is {}".format(i, end-start))
    
    if cv2.waitKey(25) & 0xFF == ord('q'):
        break

    
    i = i + 1

    

cap.release()
cv2.destroyAllWindows()


0: 384x640 8 persons, 23.0ms
Speed: 3.0ms preprocess, 23.0ms inference, 7.0ms postprocess per image at shape (1, 3, 384, 640)


1. Time taken for frame 0 is 0.9768147468566895



0: 384x640 7 persons, 23.0ms
Speed: 2.0ms preprocess, 23.0ms inference, 6.0ms postprocess per image at shape (1, 3, 384, 640)


2. Time taken for frame 0 is 3.8341541290283203
3. Time taken for frame 0 is 0.02499842643737793
1. Time taken for frame 1 is 0.9142334461212158



0: 384x640 7 persons, 8.0ms
Speed: 2.0ms preprocess, 8.0ms inference, 8.0ms postprocess per image at shape (1, 3, 384, 640)


2. Time taken for frame 1 is 3.3505029678344727
3. Time taken for frame 1 is 0.010968923568725586
1. Time taken for frame 2 is 0.863013505935669



0: 384x640 8 persons, 33.0ms
Speed: 2.0ms preprocess, 33.0ms inference, 10.0ms postprocess per image at shape (1, 3, 384, 640)


2. Time taken for frame 2 is 3.3431832790374756
3. Time taken for frame 2 is 0.010030031204223633
1. Time taken for frame 3 is 0.9471521377563477



0: 384x640 7 persons, 19.0ms
Speed: 2.0ms preprocess, 19.0ms inference, 4.0ms postprocess per image at shape (1, 3, 384, 640)


2. Time taken for frame 3 is 3.842090368270874
3. Time taken for frame 3 is 0.011001348495483398
1. Time taken for frame 4 is 0.8942739963531494



0: 384x640 7 persons, 19.0ms
Speed: 2.0ms preprocess, 19.0ms inference, 10.0ms postprocess per image at shape (1, 3, 384, 640)


2. Time taken for frame 4 is 3.3581533432006836
3. Time taken for frame 4 is 0.009998559951782227
1. Time taken for frame 5 is 0.9173538684844971



0: 384x640 7 persons, 19.0ms
Speed: 2.0ms preprocess, 19.0ms inference, 10.0ms postprocess per image at shape (1, 3, 384, 640)


2. Time taken for frame 5 is 3.4241602420806885
3. Time taken for frame 5 is 0.010000228881835938
1. Time taken for frame 6 is 0.8531053066253662



0: 384x640 7 persons, 12.0ms
Speed: 2.0ms preprocess, 12.0ms inference, 10.0ms postprocess per image at shape (1, 3, 384, 640)


2. Time taken for frame 6 is 3.3597705364227295
3. Time taken for frame 6 is 0.00999903678894043
1. Time taken for frame 7 is 0.9042179584503174



0: 384x640 7 persons, 19.0ms
Speed: 2.0ms preprocess, 19.0ms inference, 4.0ms postprocess per image at shape (1, 3, 384, 640)


2. Time taken for frame 7 is 3.3455116748809814
3. Time taken for frame 7 is 0.011001825332641602
1. Time taken for frame 8 is 0.8813011646270752



0: 384x640 8 persons, 19.0ms
Speed: 2.0ms preprocess, 19.0ms inference, 9.0ms postprocess per image at shape (1, 3, 384, 640)


2. Time taken for frame 8 is 3.3629238605499268
3. Time taken for frame 8 is 0.01100301742553711
1. Time taken for frame 9 is 0.9050312042236328



0: 384x640 8 persons, 29.0ms
Speed: 3.0ms preprocess, 29.0ms inference, 12.0ms postprocess per image at shape (1, 3, 384, 640)


2. Time taken for frame 9 is 3.814703941345215
3. Time taken for frame 9 is 0.008994340896606445
1. Time taken for frame 10 is 0.8716320991516113



0: 384x640 7 persons, 25.0ms
Speed: 2.0ms preprocess, 25.0ms inference, 5.0ms postprocess per image at shape (1, 3, 384, 640)


2. Time taken for frame 10 is 3.8407680988311768
3. Time taken for frame 10 is 0.009029626846313477
1. Time taken for frame 11 is 0.9148571491241455



0: 384x640 8 persons, 20.0ms
Speed: 1.0ms preprocess, 20.0ms inference, 7.0ms postprocess per image at shape (1, 3, 384, 640)


2. Time taken for frame 11 is 3.471442699432373
3. Time taken for frame 11 is 0.01099848747253418
1. Time taken for frame 12 is 0.8535547256469727
2. Time taken for frame 12 is 3.909257173538208
3. Time taken for frame 12 is 0.00899958610534668
