In [7]:
import torch
import matplotlib.pyplot as plt
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchvision.utils import draw_bounding_boxes
from torchvision.utils import draw_segmentation_masks
import torchvision.transforms.functional as F
from PIL import Image, ImageDraw, ImageColor
import numpy as np
plt.rcParams["savefig.bbox"] = 'tight'


def show(imgs):
    if not isinstance(imgs, list):
        imgs = [imgs]
    fig, axs = plt.subplots(ncols=len(imgs), squeeze=False)
    for i, img in enumerate(imgs):
        img = img.detach()
        img = F.to_pil_image(img)
        axs[0, i].imshow(np.asarray(img))
        axs[0, i].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])

def draw_masks(image, masks):
    # Create a color map for the masks
    color_map = {}
    for i in range(masks.shape[0]):
        color_map[i] = ImageColor.getrgb(f'hsl({i/float(masks.shape[0])*360},100%,50%)')

    # Create a transparent image to overlay the masks on
    mask_overlay = Image.new('RGBA', image.size, (0, 0, 0, 0))

    # Loop over the masks and draw them on the mask overlay
    for i in range(masks.shape[0]):
        mask = masks[i, :, :]
        color = color_map[i]
        mask_overlay_draw = ImageDraw.Draw(mask_overlay)
        mask_overlay_draw.bitmap((0, 0), transforms.ToPILImage()(mask), fill=color)

    # Blend the mask overlay with the original image using alpha blending
    image_alpha = image.copy().convert('RGBA')
    image_alpha.putalpha(128)
    image_blend = Image.alpha_composite(image_alpha, mask_overlay)

    return image_blend.convert('RGB')       

In [None]:
 

# Define the COCO dataset and data loader
coco_dataset = torchvision.datasets.CocoDetection(root='etc/val2017/', annFile='etc/annotations/captions_val2017.json',
                                                  transform=transforms.ToTensor())
coco_loader = DataLoader(coco_dataset, batch_size=1, shuffle=False)

# Load the pre-trained Mask R-CNN model
model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)

# Put the model in evaluation mode
model.eval()

In [None]:
# Loop over the images in the dataset and process them with the model
for i, (image, target) in enumerate(coco_loader):
    # Convert the image to a PIL Image object
    image = transforms.ToPILImage()(image[0])
    
    # Run the image through the model to obtain the predictions
    with torch.no_grad():
        prediction = model([transforms.ToTensor()(image)])
        
        
        # Convert the tensor to a numpy array
        image_np = torch.Tensor.cpu(transforms.ToTensor()(image)).numpy()
        # Convert the numpy array to uint8
        image_np_uint8 = (image_np * 255).astype('uint8')
        result = draw_segmentation_masks(torch.from_numpy(image_np_uint8),
                                         masks = prediction[0]['masks'].squeeze(1) > 0.5,
                                         alpha=0.9)
        show(result)
        #masks = prediction[0]['masks'] > 0.5  # threshold the masks
        #image_with_masks = draw_masks(image, masks)

        # Plot the image with the masks
        #image_with_masks.show()
    if i == 3:
        break
    # Print the predicted class labels and bounding boxes for the image
    #print(prediction[0]['labels'])
    #print(prediction[0]['boxes'])

In [None]:
image = Image.open('etc/room.jpg')
# Downscale the image
new_size = (int(image.size[0]/3), int(image.size[1]/3))
image = image.resize(new_size)
#image = result
# Run the image through the model to obtain the predictions
with torch.no_grad():
    prediction = model([transforms.ToTensor()(image)])
    indices = torch.nonzero(prediction[0]['scores'] > 0.90, as_tuple=False).squeeze(1)

    # Convert the tensor to a numpy array
    image_np = torch.Tensor.cpu(transforms.ToTensor()(image)).numpy()
    # Convert the numpy array to uint8
    image_np_uint8 = (image_np * 255).astype('uint8')
    result = draw_segmentation_masks(torch.from_numpy(image_np_uint8),
                                     masks = prediction[0]['masks'][indices].squeeze(1) > 0.5,
                                     alpha=0.9)
    
    #result = draw_bounding_boxes(torch.from_numpy(image_np_uint8),
    #                             prediction[0]['boxes'][indices])
    show(result)

In [25]:
cap = cv2.VideoCapture("etc/vidroom2.mp4")

# Define the codec and output format for the processed video
fourcc = cv2.VideoWriter_fourcc(*"XVID")
fps = 25.0
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
out = cv2.VideoWriter("output.avi", fourcc, fps, (width, height))
def tracking(prediction):
    sort_list = np.array([])
    for i in range(len(prediction[0]['boxes'])):
        sort_list = np.append(sort_list,
                              np.append(prediction[0]['boxes'][i].detach().cpu().numpy(),
                                        prediction[0]['scores'][i].detach().cpu().numpy()))
        
# Create a VLC HTTP or RTP stream for the output video
# Example for RTP:
#vlc_url = "rtp://127.0.0.1:1234/out_stream"
# Example for HTTP:
# vlc_url = "http://127.0.0.1:8080/"
mot_tracker = Sort()
# Loop over the frames in the stream, process them, and write them to the output video
while cap.isOpened():
    # Read the next frame from the stream
    ret, frame = cap.read()
    frame = cv2.flip(frame, 0)
    image = Image.fromarray(frame)
    new_size = (int(image.size[0]/2), int(image.size[1]/2))
    image = image.resize(new_size)
    prediction = model([transforms.ToTensor()(image)])
    prediction[0]['sort_list'] = tracking(prediction)

    tracked_objects = mot_tracker.update(prediction[0]['sort_list'] )
    unique_labels = detections[:, -1].cpu().unique()
    
    indices = torch.nonzero(prediction[0]['scores'] > 0.80, as_tuple=False).squeeze(1)

    # Convert the tensor to a numpy array
    image_np = torch.Tensor.cpu(transforms.ToTensor()(image)).numpy()
    # Convert the numpy array to uint8
    image_np_uint8 = (image_np * 255).astype('uint8')
    result = draw_segmentation_masks(torch.from_numpy(image_np_uint8),
                                     masks = prediction[0]['masks'][indices].squeeze(1) > 0.5,
                                     alpha=0.9)
    
    #result = draw_bounding_boxes(torch.from_numpy(image_np_uint8),
    #                             prediction[0]['boxes'][indices])
    show(result)
    break

TypeError: object of type 'NoneType' has no len()

In [22]:
prediction

[{'boxes': tensor([[1.3702e+02, 1.6410e+02, 2.6870e+02, 3.2760e+02],
          [1.8343e+02, 5.7907e+02, 3.9327e+02, 7.3983e+02],
          [3.0108e+02, 2.4881e+02, 3.5896e+02, 3.2978e+02],
          [9.8902e+01, 4.6005e+02, 1.3199e+02, 5.0768e+02],
          [4.6913e+01, 4.4941e+02, 7.9249e+01, 4.9693e+02],
          [2.2895e+02, 3.6977e+02, 2.6802e+02, 4.5983e+02],
          [1.8956e+02, 2.6882e+02, 2.4881e+02, 3.2601e+02],
          [2.6052e+02, 2.5728e+02, 3.1644e+02, 3.2704e+02],
          [1.4334e+02, 4.3628e+02, 1.8993e+02, 4.9396e+02],
          [1.3913e+02, 3.7177e+02, 1.9005e+02, 4.9426e+02],
          [9.0604e+01, 4.5518e+02, 1.2667e+02, 4.8898e+02],
          [3.7486e+02, 6.1336e+02, 5.3845e+02, 8.4781e+02],
          [4.1288e+02, 2.9017e+02, 4.7749e+02, 3.5335e+02],
          [3.7624e+02, 1.4689e+02, 5.3005e+02, 3.4583e+02],
          [4.9643e+00, 4.6168e+02, 1.9992e+02, 6.7923e+02],
          [1.1329e+02, 4.5594e+02, 1.4185e+02, 4.9080e+02],
          [1.4484e+02, 4.4137e+

In [4]:
import cv2
import numpy as np
import torch
import matplotlib.pyplot as plt
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchvision.utils import draw_bounding_boxes
from torchvision.utils import draw_segmentation_masks
import torchvision.transforms.functional as F
from PIL import Image, ImageDraw, ImageColor
import numpy as np
import cv2
from sort import Sort
plt.rcParams["savefig.bbox"] = 'tight'

In [5]:
def load_maskrcnn():
    # Define the COCO dataset and data loader
    coco_dataset = torchvision.datasets.CocoDetection(root='etc/val2017/', annFile='etc/annotations/captions_val2017.json',
                                                      transform=transforms.ToTensor())
    coco_loader = DataLoader(coco_dataset, batch_size=1, shuffle=False)

    # Load the pre-trained Mask R-CNN model
    model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)

    # Put the model in evaluation mode
    model.eval()
    return model

def feedforward(model, image):
    # Convert the frame to a NumPy array
    #image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # change color space if necessary
    image = Image.fromarray(image)
    new_size = (int(image.size[0]/2), int(image.size[1]/2))
    image = image.resize(new_size)
    prediction = model([transforms.ToTensor()(image)])
    indices = torch.nonzero(prediction[0]['scores'] > 0.90, as_tuple=False).squeeze(1)
    
    
    
    # Convert the tensor to a numpy array
    image_np = torch.Tensor.cpu(transforms.ToTensor()(image)).numpy()
    # Convert the numpy array to uint8
    image_np_uint8 = (image_np * 255).astype('uint8')
    result = draw_segmentation_masks(torch.from_numpy(image_np_uint8),
                                     masks = prediction[0]['masks'][indices].squeeze(1) > 0.5,
                                     alpha=0.9)

    result = result.detach().cpu().numpy()
    result = np.rollaxis(result, 0, 3)
    #result = F.to_pil_image(result)

    return result
    
    
def PIL_to_cv2(image: np.array):    
    # Convert the PIL image to a NumPy array in BGR format
    output_np = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    return output_np


# Connect to a live stream
stream_url = "rtmp://192.168.25.1:8082/live"
model = load_maskrcnn()

# Create an OpenCV video capture object to decode the frames from the stream
#cap = cv2.VideoCapture(stream_url)
cap = cv2.VideoCapture("etc/vidroom2.mp4")

# Define the codec and output format for the processed video
fourcc = cv2.VideoWriter_fourcc(*"XVID")
fps = 25.0
#width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
width = 540
#height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
height = 960
out = cv2.VideoWriter("output.avi", fourcc, fps, (width, height))

# Create a VLC HTTP or RTP stream for the output video
# Example for RTP:
#vlc_url = "rtp://127.0.0.1:1234/out_stream"
# Example for HTTP:
# vlc_url = "http://127.0.0.1:8080/"

# Loop over the frames in the stream, process them, and write them to the output video
counter = 0
while cap.isOpened():
    # Read the next frame from the stream
    ret, frame = cap.read()
    frame = cv2.flip(frame, 0)
    if not ret:
        break
    # Process the frame with PyTorch
    result = feedforward(model, frame)
    #result_frame = PIL_to_cv2(result)
    # Write the processed frame to the output video
    out.write(result)
    counter += 1
    if counter == 5:
        out.release()
        cap.release()
        break


loading annotations into memory...
Done (t=0.03s)
creating index...
index created!


