In [None]:
#Required library import
import cv2
import torch
from ssd_updated import build_ssd #Updated SSD file with layers and detection file
import imageio
from torch.autograd import Variable
from data import BaseTransform, VOC_CLASSES as labelmap

In [None]:
# Detect objects in a single frame with the pre-trained SSD network and draw them on the frame.
def detect(frame, net, transform, threshold=0.6):
    """Run ``net`` on ``frame`` and draw every detection scoring at least ``threshold``.

    Args:
        frame: Image array whose first two dimensions are (height, width).
            Boxes and labels are drawn onto it in place.
        net: Callable that takes the batched input tensor and returns detections
            indexed as [batch, class, occurrence, (score, x0, y0, x1, y1)].
        transform: Callable whose first returned element is the preprocessed
            frame as a NumPy array (assumed height x width x channels - TODO confirm).
        threshold: Minimum score a detection needs in order to be drawn.
            Defaults to 0.6, the value previously hard-coded.

    Returns:
        The same ``frame`` object, annotated with rectangles and class labels.
    """
    height, width = frame.shape[:2]  # frame is [h, w, c]; the colour axis is not needed here
    frame_t = transform(frame)[0]  # only the first element of the transform result is used
    # permute(2, 0, 1) moves the last axis to the front (e.g. H x W x C -> C x H x W).
    # It reorders axes; it does not reorder the colour channels.
    x = torch.from_numpy(frame_t).permute(2, 0, 1)
    x = Variable(x.unsqueeze(0))  # add the leading batch dimension
    y = net(x)
    detections = y.data  # raw tensor, detached from the autograd graph
    # Box coordinates are multiplied by the frame size to obtain pixel positions.
    scale = torch.Tensor([width, height, width, height])
    num_classes = detections.size(1)
    max_per_class = detections.size(2)
    # Start at 1: labels are looked up with labelmap[i - 1], so index 0 has no label of
    # its own (it would wrap around to the last one). Presumably index 0 is the
    # background class - verify against the network's detection layer.
    for i in range(1, num_classes):
        j = 0
        # Stops at the first score below the threshold (assumes scores are sorted in
        # descending order per class - TODO confirm). The bound on j prevents an
        # IndexError when every slot for this class passes the threshold.
        while j < max_per_class and detections[0, i, j, 0] >= threshold:
            pt = (detections[0, i, j, 1:] * scale).numpy()  # scaled (x0, y0, x1, y1)
            top_left = (int(pt[0]), int(pt[1]))
            bottom_right = (int(pt[2]), int(pt[3]))
            cv2.rectangle(frame, top_left, bottom_right, (255, 0, 0), 2)
            cv2.putText(frame, labelmap[i - 1], top_left, cv2.FONT_HERSHEY_SIMPLEX, 2, (0, 255, 0), 2, cv2.LINE_AA)
            j += 1
    return frame

In [None]:
# Build the SSD network in 'test' mode and populate it with the pre-trained weights.
net = build_ssd('test')
# The map_location callback returns each storage unchanged, so tensors are not moved
# to the device they were saved from.
# NOTE(review): torch.load unpickles the checkpoint - only load weight files from a trusted source.
pretrained_weights = torch.load(
    'ssd300_mAP_77.43_v2.pth',
    map_location=lambda storage, loc: storage,
)
net.load_state_dict(pretrained_weights)

In [None]:
# Preprocessing object that turns a raw frame into the input expected by the network.
# It is configured with the network's input size and three per-channel values.
# NOTE(review): the values are divided by 256, which suggests BaseTransform is expected to
# see pixel values in [0, 1] - confirm against data.BaseTransform.
channel_means = tuple(value / 256.0 for value in (104, 117, 123))
transform = BaseTransform(net.size, channel_means)

In [None]:
# Open the input video and prepare an output video with the same frame rate.
reader = imageio.get_reader('test.mp4')
video_meta = reader.get_meta_data()
fps = video_meta['fps']  # frames per second of the source video
writer = imageio.get_writer('output.mp4', fps=fps)

In [None]:
# Run detection on every frame of the input video and append the annotated frame to the output.
net.eval()  # put the network in evaluation mode once, instead of on every frame
try:
    for i, frame in enumerate(reader):
        frame = detect(frame, net, transform)
        writer.append_data(frame)  # add the annotated frame to the output video
        print(i)  # index of the frame that was just processed
finally:
    # Always release the video handles, even if a frame fails, so the output file
    # is finalized and the input file is not left open.
    writer.close()
    reader.close()