In [None]:
# Widen the notebook container to the full browser width.
# `display`/`HTML` are imported from IPython.display: importing them from
# IPython.core.display is deprecated and fails on newer IPython releases.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
from IPython.display import Image
# Show the project banner. The path is relative to the directory the notebook
# starts in (this cell runs before the later `%cd darknet`).
display(Image(filename="images/intro-image.png"))

## Highlights

- Monitor workstations to ensure that no workstation remains idle, and ensure that all workstations are utilized at 100% capacity.
- Implement employee monitoring to ensure the equitable tracking of hours worked by employees.
- Utilizing YOLO (You Only Look Once) for the purpose of both counting and detecting individuals in a room.
- The process of fine-tuning YOLO on a custom dataset to enhance the accuracy of the results.
- Utilizing Intersection over Union (IoU) as a threshold to determine whether a station is taken or vacant.
- Applying image processing techniques and using Pytesseract to perform Optical Character Recognition (OCR) in order to extract the timestamp from the CCTV camera feed.

## Introduction

The solution implemented in this project uses deep learning techniques to detect employees and to monitor how long each employee is seated at their station. An additional feature counts the number of people inside the office, which makes it possible to track violations of social distancing.

## Import Necessary Libraries

In [None]:
# python standard libraries
import os, time, glob, re, shutil, pickle
from datetime import datetime, timedelta
from base64 import b64decode, b64encode

# google colab/notebook libraries
from IPython.display import display, Javascript, Image
from IPython.display import Video
from js2py import eval_js

# external libraries
import cv2, PIL, io, html, pytesseract
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from skimage.morphology import erosion
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

## Define Color Constants For Bounding Boxes

In [None]:
# Colours are (B, G, R) tuples: they are drawn with cv2 onto frames that are
# only converted with cv2.COLOR_BGR2RGB at display time.
person_color = (220, 155, 58)  # boxes and labels of detected persons
vacant_color = (0, 0, 200)  # stations with no overlapping person; also outlines the timestamp area
taken_color = (0, 200, 0)  # stations that overlap a detected person
station_color = (0, 100, 0)  # station outlines in the set-up figure

## Define Workstation & DateTime Area Coordinates

In [None]:
# Workstation rectangles in frame pixel coordinates: (x1, y1) is the top-left
# corner and (x2, y2) the bottom-right corner, which is the layout expected by
# get_iou and by the cv2.rectangle calls further down.
# NOTE(review): the values were measured by hand for one camera view — they
# presumably need re-measuring if the camera is moved; confirm.
coordinates = {
    'station_1' : {'x1':1123, 'x2':1200, 
                   'y1':112, 'y2':176},
    'station_2' : {'x1':1267, 'x2':1330, 
                   'y1':121, 'y2':195},               
    'station_3' : {'x1':1378, 'x2':1450, 
                   'y1':138, 'y2':207},
    'station_4' : {'x1':1483, 'x2':1553, 
                   'y1':166, 'y2':253},
    'station_5' : {'x1':1608, 'x2':1680, 
                   'y1':204, 'y2':280},
    'station_6' : {'x1':798, 'x2':872, 
                   'y1':148, 'y2':237},
    'station_7' : {'x1':968, 'x2':1060, 
                   'y1':165, 'y2':269},
    'station_8' : {'x1':1099, 'x2':1209, 
                   'y1':199, 'y2':336},
    'station_9' : {'x1':1289, 'x2':1391, 
                   'y1':235, 'y2':366},
    'station_10' : {'x1':374, 'x2':457, 
                   'y1':251, 'y2':355},
    'station_11' : {'x1':471, 'x2':562, 
                   'y1':278, 'y2':481},
    'station_12' : {'x1':575, 'x2':717, 
                   'y1':337, 'y2':557},
    'station_13' : {'x1':727, 'x2':931, 
                   'y1':411, 'y2':684},
    'station_14' : {'x1':964, 'x2':1194, 
                   'y1':530, 'y2':785}
}

# Timestamp overlay region as [(x_left, y_top), (x_right, y_bottom)]; it is
# used both to draw the marker rectangle and to crop the frame before OCR.
coordinates_ocr= [(55, 43), (799, 91)]

In [None]:
from IPython.display import HTML
from IPython.display import Image

def hide_cell():
    """ Build the HTML snippet that toggles the visibility of code cells

    The snippet installs a ``code_toggle`` JavaScript function, centres PNG
    outputs and adds a button that shows/hides the raw code.

    Returns
    ==========
    hide_cell   :   IPython.display.HTML
                    rich object; it is rendered when it is the last
                    expression of a cell or when passed to ``display``

    NOTE(review): the script relies on jQuery (``$``) and on ``div.input``
    elements — presumably only available in the classic Notebook UI; confirm
    before relying on it in JupyterLab.
    """
    # The object has to be returned: building it and dropping it (as the
    # previous version did) renders nothing at all.
    return HTML('''<script>
    code_show=true; 
    function code_toggle() {
     if (code_show){
     $('div.input').hide();
     } else {
     $('div.input').show();
     }
     code_show = !code_show
    } 
    $( document ).ready(code_toggle);
    </script>
    <style>
    .output_png {
        display: table-cell;
        text-align: center;
        horizontal-align: middle;
        vertical-align: middle;
        margin:auto;
    }

    tbody, thead {
        margin-left:100px;
    }

    </style>
    <form action="javascript:code_toggle()"><input type="submit"
    value="Click here to toggle on/off the raw code."></form>''')
# hide_cell()

## Get Current Working Directory

In [None]:
print("Current Working Directory: \t",os.getcwd())

## Change Makefile To Have GPU, OPENCV and LIBSO Enabled

In [None]:
# NOTE: `%cd` changes the kernel's working directory for every later cell
# (hence the `../` paths below). The cell is therefore not idempotent:
# running it a second time tries to enter darknet/darknet.
%cd darknet
# Flip the build switches in darknet's Makefile in place (0 -> 1).
!sed -i 's/OPENCV=0/OPENCV=1/' Makefile
!sed -i 's/GPU=0/GPU=1/' Makefile
!sed -i 's/CUDNN=0/CUDNN=1/' Makefile
!sed -i 's/CUDNN_HALF=0/CUDNN_HALF=1/' Makefile
!sed -i 's/LIBSO=0/LIBSO=1/' Makefile

# make darknet (builds darknet so that you can then use the darknet.py file 
# and have its dependencies)
!make

## Exploratory Data Analysis 

Since the footage comes from a single camera, every frame has the same size and number of channels. For exploratory data analysis, let us check the dimensions of one frame of the CCTV feed.

In [None]:
# Read the first frame of the sample footage and report its basic properties.
vidcap = cv2.VideoCapture('../cctv_footage/1.mp4')
try:
    success, frame = vidcap.read()

    # Check the read before touching `frame`: on failure `frame` is None and
    # `frame.shape` would raise an unhelpful AttributeError.
    if not success:
        raise IOError("Could not read a frame from '../cctv_footage/1.mp4'")

    # Get the width and height of the frame
    frame_height, frame_width, _ = frame.shape

    # Get Frame Per Second Information
    fps = vidcap.get(cv2.CAP_PROP_FPS)

    # Get Total Frame From a video footage
    total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
finally:
    # Always give the file handle back, even when the read failed.
    vidcap.release()

# Print the width and height
print("Frame Width:", frame_width)
print("Frame Height:", frame_height)
print("FPS:", fps)
print("Total Frame: ",total_frames)

# cv2 delivers BGR; matplotlib expects RGB.
plt.figure(figsize=(15, 10))
plt.imshow(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
plt.axis('off')  # Optional: Turn off the axis if you don't want it
plt.show()

## Methodology

In [None]:
display(Image(filename="../images/methodology.png"))

<b>a.) Dataset</b>
<p style="text-align:justify">The dataset employed for this project consists of personal video surveillance footage from our business, totaling 15 minutes in length, recorded in November 2023.</p>

<b>b.) Extract images</b>
<p style="text-align:justify">Extracted 170 sample images to be used for training and validation set.</p>

<b>c.) Fine tune YOLOv4 model</b>
<p style="text-align:justify">Conducted transfer learning by initially utilizing pretrained weights trained on the COCO dataset and then fine-tuned the model using our custom dataset.</p>

<b>d.) Perform non-max suppression</b>
<p style="text-align:justify">Applied non-maximum suppression to eliminate multiple bounding boxes around a single object, retaining the one with the highest confidence score when there is an overlap.</p>

<b>e.) Set-up work station</b>
<p style="text-align:justify">Established a workstation setup to establish and implement a rule based on the Intersection over Union (IoU) of the detected persons and the workstations, enabling the identification of which stations are occupied and which are vacant.</p>

<b>f.) Compute for the time of the employee in work station</b>
<p style="text-align:justify">Employed the timestamp that the DVR overlays in the upper left of the image, and applied image processing techniques and Optical Character Recognition (OCR) to extract the text and convert it into a Python datetime object.</p>

## Results and Discussion

<h2>1. Pre trained YOLO on COCO dataset</h2>

<p style="text-align:justify">YOLO has a model pre-trained on the COCO dataset which can classify <a href="https://gist.github.com/AruniRC/7b3dadd004da04c80198557db5da4bda">80 objects</a>. <a href="https://cocodataset.org/#home">COCO</a> is a large-scale object detection, segmentation, and captioning dataset of over 330k images. One of the objects that a YOLO model trained on this dataset can classify is a person, which is what we need for this study. Let us check whether it works well on our dataset.</p>

In [None]:
# use if you want to use default settings
# get the scaled yolov4 weights file that is pre-trained to detect 80 classes (objects) from shared google drive
# The inner wget fetches the download-confirmation token (cookies are kept in
# /tmp/cookies.txt), the outer wget downloads the file to yolov4-csp.weights,
# and the cookie file is deleted afterwards.
# NOTE(review): the inner wget passes --no-check-certificate, i.e. TLS
# verification is disabled for that request — consider dropping the flag.
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1V3vsIaxAlGWvK4Aar9bAiK5U0QFttKwq' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1V3vsIaxAlGWvK4Aar9bAiK5U0QFttKwq" -O yolov4-csp.weights && rm -rf /tmp/cookies.txt

In [None]:
# import darknet functions to perform object detections
# NOTE(review): the star import is presumably what provides load_network,
# network_width/network_height, make_image, copy_image_from_bytes,
# detect_image, free_image and bbox2points used in this notebook — confirm
# against darknet.py before replacing it with explicit imports.
from darknet import *
# load in our YOLOv4 architecture network
# (network, class_names, width and height are globals that later cells and
# darknet_helper read)
network, class_names, class_colors = load_network("cfg/yolov4-csp.cfg", "cfg/coco.data", "yolov4-csp.weights")
width = network_width(network)
height = network_height(network)

# darknet helper function to run detection on image
def darknet_helper(img, width, height):
    """ darknet helper function to get detections, width and height ratio

    Uses the module-level ``network`` and ``class_names`` that were set by
    the most recent ``load_network`` call.

    Parameters
    ==========
    img           :    np.array
                       3-channel BGR image (it is converted with
                       cv2.COLOR_BGR2RGB before detection)
    width         :    int
                       width the image is resized to for the network
    height        :    int
                       height the image is resized to for the network

    Returns
    =========
    darknet_helper  : tuple
                      (detections, width_ratio, height_ratio); the ratios
                      are original size / resized size, so callers can scale
                      boxes back to the original frame
    """
    # Preprocess first: if the frame is invalid this fails before any
    # darknet buffer has been allocated.
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img_resized = cv2.resize(img_rgb, (width, height),
                              interpolation=cv2.INTER_LINEAR)

    # get image ratios to convert bounding boxes to proper size
    img_height, img_width, _ = img.shape
    width_ratio = img_width/width
    height_ratio = img_height/height

    # run model on darknet style image to get detections
    darknet_image = make_image(width, height, 3)
    try:
        copy_image_from_bytes(darknet_image, img_resized.tobytes())
        detections = detect_image(network, class_names, darknet_image)
    finally:
        # The buffer is allocated by darknet, so release it even when
        # detection raises.
        free_image(darknet_image)
    return detections, width_ratio, height_ratio

## Evaluate In Validation Set

In [None]:
# Run the COCO-pretrained detector on data/0.jpg through the darknet CLI.
# NOTE(review): the cell after next displays predictions.jpg, which is
# presumably the annotated image this command writes — confirm.
!./darknet detector test cfg/coco.data cfg/yolov4-csp.cfg yolov4-csp.weights data/0.jpg

In [None]:
# !./darknet detector demo cfg/coco.data cfg/yolov4-csp.cfg yolov4-csp.weights ../cctv_footage/Desire_Clip_02.mp4 -dont_show -ext_output

In [None]:
display(Image(filename="predictions.jpg"))

<p style="text-align:justify">It turns out that for some of the frames the model classifies the persons in the image perfectly. However, there are some frames in which the model misclassifies a person. Additionally, there are sometimes multiple bounding boxes on a single object, and some objects that are not needed in this project are being classified, such as chairs, monitors and laptops. Although the model was trained on thousands of images, it was not trained on this type of environment and probably not on all angles of a person.</p>

## 2. Train YOLO on custom dataset

<p style="text-align:justify">To remedy the issues found in using pretrained model on COCO dataset, we can perform training on custom dataset. The detailed explanation on how to train your YOLO on custom dataset can be found in their <a href="https://github.com/AlexeyAB/darknet#how-to-train-to-detect-your-custom-objects">documentation</a>.</p>

<h3>a. Extract custom dataset </h3>

<p style="text-align:justify">Training YOLO on a custom dataset requires images, so we need to sample our video into individual frames. <a href="https://labelstud.io/blog/Quickly-Create-Datasets-for-Training-YOLO-Object-Detection.html">50 to 100</a> images are usually enough to train a single object class, but for this project I sampled the data into 200 images to obtain more reliable results.</p>


In [None]:
def create_image_frame(video_path='../cctv_footage/1.mp4',
                       output_directory='frames1',
                       start_index=171,
                       target_frames=196):
    """ Sample evenly spaced frames from a video and save them as JPEG files

    Parameters
    ==========
    video_path        :    str
                           video file to sample
    output_directory  :    str
                           folder the images are written to (created if
                           it does not exist)
    start_index       :    int
                           number used for the first file name; following
                           images are numbered consecutively
    target_frames     :    int
                           approximate number of images to extract; one
                           frame is kept every ``frame_count // target_frames``
                           frames

    Returns
    ==========
    None
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Error: Could not open video file.")
        cap.release()
        # Return instead of calling exit(): exit() would terminate the
        # interpreter that hosts the notebook.
        return

    try:
        length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # Keep the step at 1 or more: for clips shorter than target_frames
        # the integer quotient is 0 and `counter % 0` would raise.
        step = max(1, length // target_frames)

        os.makedirs(output_directory, exist_ok=True)

        image_count = start_index
        counter = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break  # Break out of the loop when there are no more frames

            if counter % step == 0:
                fname = os.path.join(output_directory, f'{image_count}.jpg')
                cv2.imwrite(fname, frame)
                image_count += 1

            counter += 1
            # No cv2.waitKey / destroyAllWindows here: this function never
            # opens a window, so there is no key press to wait for.
    finally:
        cap.release()
# create_image_frame()

<h3>b. Manually label person in custom dataset </h3>

<p style="text-align:justify">Now that we have our 170 images, we need to manually label the images in the format expected by the YOLO algorithm, which is <code>{object-class} {x_center} {y_center} {width} {height}</code>. I used <a href="https://github.com/tzutalin/labelImg">LabelImg</a> to manually label my data with bounding boxes. It produces a text file of bounding boxes with the same filename as each image file, plus a text file that contains the class names. We then split the data into a training set and a 20 percent validation set.</p>

In [None]:
def data_labelling(percentage_test=20):
    """ Split the labelled images into train and test list files

    Every ``round(100 / percentage_test)``-th ``.jpg`` found in
    ``./data/obj`` (in sorted order) is written to ``./data/test.txt``;
    all other images are written to ``./data/train.txt``. Both files are
    created or truncated.

    Parameters
    ==========
    percentage_test   :    float
                           percentage of images to be used for the test set,
                           in the range (0, 100]

    Raises
    ==========
    ValueError        :    if percentage_test is outside (0, 100]
    """
    if not 0 < percentage_test <= 100:
        raise ValueError("percentage_test must be in the range (0, 100]")

    current_dir = './data/obj'
    index_test = round(100 / percentage_test)

    # Sort the paths: glob order is filesystem dependent, which would make
    # the split differ from run to run.
    image_paths = sorted(glob.glob(os.path.join(current_dir, "*.jpg")))

    # `with` guarantees both list files are flushed and closed.
    with open('./data/train.txt', 'w') as file_train, \
            open('./data/test.txt', 'w') as file_test:
        counter = 1
        for path_and_filename in image_paths:
            title, _ext = os.path.splitext(os.path.basename(path_and_filename))
            entry = "data/obj" + "/" + title + '.jpg' + "\n"

            if counter == index_test:
                counter = 1
                file_test.write(entry)
            else:
                file_train.write(entry)
                counter = counter + 1
            
            
# data_labelling()

<h3>c. Train the model</h3>

<p style="text-align:justify">The model training can be performed by using the <code>detector train</code> command which expects at least three parameters: data, configurations, and initial weights. Initially I used <code>yolov4.conv.137</code> as my pre trained weights which is trained in COCO dataset. Then, I retrained my weights to further improve the results. Overall it took me about six hours of training to get a good result.</p>

In [None]:
# Train the custom detector with the darknet CLI.
# The commented-out command starts from yolov4.conv.137 (the weights
# pre-trained on COCO); the active command instead continues from
# backup/yolov4-obj_last.weights and does not pass -map.
def train():
#     !./darknet detector train data/obj.data cfg/yolov4-obj.cfg yolov4.conv.137 -dont_show -map
    !./darknet detector train data/obj.data cfg/yolov4-obj.cfg backup/yolov4-obj_last.weights -dont_show
# train()

In [None]:
# continue training model by transfer learning from weights trained on custom dataset
# Same command as the active line in train(), plus the -map flag.
def train_on_custom_set():
    !./darknet detector train data/obj.data cfg/yolov4-obj.cfg backup/yolov4-obj_last.weights -dont_show -map
# train_on_custom_set()

## Check performance

In [None]:
os.getcwd()

In [None]:
# define helper function imShow
def imShow(path):
    """ Display an image file with matplotlib, upscaled three times

    Parameters
    ==========
    path      :     str
                    path of the image file to show

    Raises
    ==========
    FileNotFoundError   :   if the image cannot be read from ``path``
    """
    image = cv2.imread(path)
    # cv2.imread signals failure by returning None rather than raising.
    if image is None:
        raise FileNotFoundError(f"Could not read image file: {path}")

    img_height, img_width = image.shape[:2]
    resized_image = cv2.resize(image, (3*img_width, 3*img_height),
                               interpolation = cv2.INTER_CUBIC)

    fig = plt.gcf()
    fig.set_size_inches(18, 10)
    plt.axis("off")
    # cv2 loads BGR; matplotlib expects RGB.
    plt.imshow(cv2.cvtColor(resized_image, cv2.COLOR_BGR2RGB))
    plt.show()
imShow('chart.png')

## Check mAP (mean average precision)

You can check the mAP for each of the weights files saved every 1000 iterations, e.g. yolov4-obj_4000.weights, yolov4-obj_5000.weights, yolov4-obj_6000.weights, and so on. This way you can find out which weights file gives the best result. The higher the mAP, the better.

In [None]:
!./darknet detector map data/obj.data cfg/yolov4-obj.cfg backup/yolov4-obj_5000.weights -points 0

## Test your custom Object Detector

Create a copy of the yolov4-obj.cfg config file (yolov4-obj-test.cfg) and set it to test mode:

Change line batch to batch=1

Change line subdivisions to subdivisions=1

## Run detector on an image

Run your custom detector on an image with this command. (The thresh flag sets the minimum confidence score required for a detection to be reported.)

In [None]:
# Run the custom detector (test-mode cfg, final weights) on data/360.jpg.
# NOTE(review): the next cell displays predictions.jpg, presumably the file
# this command writes — and the same name the COCO run used earlier; confirm.
!./darknet detector test data/obj.data cfg/yolov4-obj-test.cfg backup/yolov4-obj_final.weights data/360.jpg -thresh 0.3

In [None]:
display(Image(filename="predictions.jpg"))

## Run detector on a video

Run your custom detector on a video with this command. (The thresh flag sets the minimum confidence score required for a detection to be reported.)

In [None]:
!./darknet detector demo data/obj.data cfg/yolov4-obj-test.cfg backup/yolov4-obj_best.weights -dont_show ../cctv_footage/Desire_Clip_02.mp4 -thresh 0.5 -i 0 -out_filename ../cctv_footage/results1.mp4

<h3>d. Perform Non-Max Suppression</h3>

<p style="text-align:justify">The non-max suppression implemented in <a href="https://github.com/AlexeyAB/darknet/blob/master/darknet.py">darknet</a> prioritizes the bottom-right bounding box and removes the boxes that overlap it. I have updated the darknet code to keep the bounding box with the maximum confidence among the overlapping boxes instead. This solution could be slower than the darknet implementation, but it yielded better accuracy, which is more important in this project. I chose a 65% threshold for the non-max suppression as it provided the best result.</p>

In [None]:
def non_max_suppression_fast1(detections, overlap_thresh):
    """ modified non max suppression from darknet to get the overlap
        with max confidence

    Boxes are visited from the largest bottom edge (y2) upwards. For the
    visited box, every remaining box whose overlap ratio exceeds
    ``overlap_thresh`` forms a group with it; only the most confident
    member of the group is kept and the whole group is removed from
    further consideration.

    Parameters
    ==========
    detections       :     list of tuple
                           (class_name, confidence, (x_center, y_center,
                           width, height)); confidence may be a number or a
                           numeric string
    overlap_thresh   :     float
                           threshold on intersection area divided by the
                           area of the *other* box (not a symmetric IoU)

    Returns
    ==========
    non_max_suppression_fast   :   list of tuple
                                   detections without high overlap, in the
                                   order they were picked; empty list for
                                   empty input
    """
    # Nothing to suppress. Without this guard the column slicing below
    # fails with an IndexError on an empty array.
    if len(detections) == 0:
        return []

    boxes = []
    confs = []
    for _class_name, conf, (x, y, w, h) in detections:
        # centre/size -> corner coordinates
        boxes.append([x - w / 2, y - h / 2, x + w / 2, y + h / 2])
        confs.append(float(conf))

    boxes_array = np.array(boxes, dtype=float)
    confs = np.array(confs)

    # initialize the list of picked indexes
    pick = []
    # grab the coordinates of the bounding boxes
    x1 = boxes_array[:, 0]
    y1 = boxes_array[:, 1]
    x2 = boxes_array[:, 2]
    y2 = boxes_array[:, 3]
    # compute the area of the bounding boxes and sort the bounding
    # boxes by the bottom-right y-coordinate of the bounding box
    area = (x2 - x1 + 1) * (y2 - y1 + 1)
    idxs = np.argsort(y2)

    # keep looping while some indexes still remain in the indexes list
    while len(idxs) > 0:
        # the last index is the current reference box
        last = len(idxs) - 1
        i = idxs[last]
        others = idxs[:last]

        # intersection rectangle between the reference box and each
        # remaining box
        xx1 = np.maximum(x1[i], x1[others])
        yy1 = np.maximum(y1[i], y1[others])
        xx2 = np.minimum(x2[i], x2[others])
        yy2 = np.minimum(y2[i], y2[others])
        # compute the width and height of the intersection
        w = np.maximum(0, xx2 - xx1 + 1)
        h = np.maximum(0, yy2 - yy1 + 1)
        # compute the ratio of overlap (relative to the other box's area)
        overlap = (w * h) / area[others]

        # choose the highest confidence among overlaps; overlap_args are
        # positions inside `idxs`, so they are mapped back through it
        overlap_args = np.where(overlap > overlap_thresh)[0]
        overlap_indices = idxs[overlap_args].tolist() + [i]
        confidence_list = confs[idxs[overlap_args]].tolist() + [confs[i]]
        highest_confidence = np.argmax(confidence_list)
        pick.append(overlap_indices[highest_confidence])

        # delete the reference box and every box that overlaps it
        idxs = np.delete(idxs, np.concatenate(([last], overlap_args)))

    return [detections[i] for i in pick]

<h3>e. Inference custom YOLO </h3>

<p style="text-align:justify">Now that we have trained our custom YOLO model and created a custom non-max suppression, we can now try if the model would work on a test set. The average precision increased from 69% trained on coco dataset to 98% in our custom dataset. You can evaluate using mean average precision of the object. It can be calculated by adjusting the threshold of confidence and get the average precision score on 50% IoU score. It can be calculated by using <code>detector map</code> command of YOLOv4 repository.</p>

In [None]:
# import darknet functions to perform object detections
from darknet import *
# load in our YOLOv4 architecture network
# NOTE: this rebinds the globals network, class_names, class_colors, width
# and height that the earlier COCO cell set; darknet_helper reads network
# and class_names as globals, so it uses the custom model from here on.
(network, 
 class_names, 
 class_colors) = load_network("cfg/yolov4-obj.cfg", 
                              "data/obj.data", 
                              "backup/yolov4-obj_final.weights")
width = network_width(network)
height = network_height(network)

# darknet helper function to run detection on image
def darknet_helper(img, width, height):
    """ darknet helper function to get detections, width and height ratio

    Uses the module-level ``network`` and ``class_names`` that were set by
    the most recent ``load_network`` call.

    Parameters
    ==========
    img           :    np.array
                       3-channel BGR image (it is converted with
                       cv2.COLOR_BGR2RGB before detection)
    width         :    int
                       width the image is resized to for the network
    height        :    int
                       height the image is resized to for the network
    
    Returns
    =========
    darknet_helper  : tuple
                      (detections, width_ratio, height_ratio); the ratios
                      are original size / resized size, so callers can scale
                      boxes back to the original frame
    """
    # Preprocess first: if the frame is invalid this fails before any
    # darknet buffer has been allocated.
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img_resized = cv2.resize(img_rgb, (width, height),
                              interpolation=cv2.INTER_LINEAR)

    # get image ratios to convert bounding boxes to proper size
    img_height, img_width, _ = img.shape
    width_ratio = img_width/width
    height_ratio = img_height/height

    # run model on darknet style image to get detections
    darknet_image = make_image(width, height, 3)
    try:
        copy_image_from_bytes(darknet_image, img_resized.tobytes())
        detections = detect_image(network, class_names, darknet_image)
    finally:
        # The buffer is allocated by darknet, so release it even when
        # detection raises.
        free_image(darknet_image)
    return detections, width_ratio, height_ratio

<h3>f. Run Custom Yolo On sample Image  </h3>

In [None]:
vidcap = cv2.VideoCapture('../cctv_footage/1.mp4')

# Advance to the 15th frame of the clip; stop early if the stream ends.
for i in range(15):
    success,frame = vidcap.read()
    if not success:
        break
vidcap.release()

# Check the read before running the detector: on failure `frame` is None.
if not success:
    raise IOError("Could not read 15 frames from '../cctv_footage/1.mp4'")

# get the predicted detections of the trained custom yolo
detections, width_ratio, height_ratio = darknet_helper(frame, width, height)

# apply non max suppression to eliminate multiple predictions
# on same person (skipped when nothing was detected)
detections = non_max_suppression_fast1(detections, 0.65) if detections else []

for label, confidence, bbox in detections:
    # boxes are in network-input coordinates; scale back to the frame
    left, top, right, bottom = bbox2points(bbox)
    left, top, right, bottom = (int(left * width_ratio), int(top * height_ratio), 
    int(right * width_ratio), int(bottom * height_ratio))
    cv2.rectangle(frame, (left, top), (right, bottom), person_color, 2)
    cv2.putText(frame, "{} [{:.2f}]".format(label, float(confidence)),
                    (left, top - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5,
                    person_color, 2)

plt.figure(figsize=(15, 10))
plt.imshow(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
plt.axis('off')  # Optional: Turn off the axis if you don't want it
plt.show()

<h3>g. Evaluate In validation set</h3>

In [None]:
# !./darknet detector map data/obj.data cfg/yolov4-obj.cfg backup/yolov4-obj_best.weights -points 0

<h2>3. Set up work station area</h2>

<p style="text-align:justify">We now define the coordinates of the 14 work stations. I used Paint to manually read off the coordinates of each station and plotted them in the figure below.</p>

In [None]:
# run custom yolo on a sample image 
# %cd ..
vidcap = cv2.VideoCapture('../cctv_footage/1.mp4')
success,frame = vidcap.read()
vidcap.release()

# Check the read before drawing: on failure `frame` is None.
if not success:
    raise IOError("Could not read a frame from '../cctv_footage/1.mp4'")

# Outline every configured station and put its name on a filled label.
for stations, coordinate in coordinates.items():
    cv2.rectangle(frame, (coordinate['x1'], coordinate['y1']), 
                    (coordinate['x2'], coordinate['y2']), station_color, 2)
    
    text_size, baseline = cv2.getTextSize(f" {stations}", cv2.FONT_HERSHEY_SIMPLEX, 0.7, 2)
                                   
    # filled background just above the station's top-left corner
    cv2.rectangle(frame,
              (coordinate['x1'], coordinate['y1'] - text_size[1] - 5),
              (coordinate['x1'] + text_size[0], coordinate['y1'] - 5),
              (128, 0, 128),
              cv2.FILLED)
    
    cv2.putText(frame, f" {stations}",
                (coordinate['x1'], coordinate['y1'] - 7), 
                cv2.FONT_HERSHEY_SIMPLEX, 0.6,
                (0,0,0), 2)

plt.figure(figsize=(15, 10))
plt.imshow(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
plt.axis('off')  # Optional: Turn off the axis if you don't want it
plt.show()

<h2>4. Integrate Custom YOLO on work stations</h2>

<p style="text-align:justify">To integrate the work stations with our custom network, we need a rule that determines whether a work station is taken or vacant. A station is considered taken when its Intersection over Union (IoU) with a detected person is at least 0.3.</p>

In [None]:
def get_iou(bb1, bb2):
    """
    Calculate the Intersection over Union (IoU) of two bounding boxes.

    Parameters
    ----------
    bb1 : dict
        Keys: {'x1', 'x2', 'y1', 'y2'}
        The (x1, y1) position is at the top left corner,
        the (x2, y2) position is at the bottom right corner
    bb2 : dict
        Keys: {'x1', 'x2', 'y1', 'y2'}
        The (x1, y1) position is at the top left corner,
        the (x2, y2) position is at the bottom right corner

    Returns
    -------
    float
        in [0, 1]; 0.0 when the boxes do not overlap or when their union
        has no area (e.g. both boxes are degenerate)
    """
    # determine the coordinates of the intersection rectangle
    x_left = max(bb1['x1'], bb2['x1'])
    y_top = max(bb1['y1'], bb2['y1'])
    x_right = min(bb1['x2'], bb2['x2'])
    y_bottom = min(bb1['y2'], bb2['y2'])

    if x_right < x_left or y_bottom < y_top:
        return 0.0

    # The intersection of two axis-aligned bounding boxes is always an
    # axis-aligned bounding box
    intersection_area = (x_right - x_left) * (y_bottom - y_top)

    # compute the area of both AABBs
    bb1_area = (bb1['x2'] - bb1['x1']) * (bb1['y2'] - bb1['y1'])
    bb2_area = (bb2['x2'] - bb2['x1']) * (bb2['y2'] - bb2['y1'])

    # union = sum of both areas minus the part counted twice
    union_area = bb1_area + bb2_area - intersection_area
    # Two zero-area boxes that touch give a zero union; report "no overlap"
    # instead of dividing by zero.
    if union_area <= 0:
        return 0.0

    return intersection_area / float(union_area)

### Run Custom Yolo On  Sample Image 

In [None]:
vidcap = cv2.VideoCapture('../cctv_footage/1.mp4')
success,frame = vidcap.read()
vidcap.release()

# Check the read before running the detector: on failure `frame` is None.
if not success:
    raise IOError("Could not read a frame from '../cctv_footage/1.mp4'")

# get the predicted detections of the trained custom yolo
detections, width_ratio, height_ratio = darknet_helper(frame, width, height)

# apply non max suppression to eliminate multiple predictions
# on same person (skipped when nothing was detected)
detections = non_max_suppression_fast1(detections, 0.65) if detections else []

# person boxes in frame coordinates, in the dict layout get_iou expects
detections_bb = []
for label, confidence, bbox in detections:
    # boxes are in network-input coordinates; scale back to the frame
    left, top, right, bottom = bbox2points(bbox)
    left, top, right, bottom = (int(left * width_ratio), 
                                int(top * height_ratio), 
                                int(right * width_ratio), 
                                int(bottom * height_ratio))
    
    cv2.rectangle(frame, (left, top), (right, bottom), person_color, 2)
    
    text_size, _ = cv2.getTextSize("{} [{:.2f}]".format(label, float(confidence)), cv2.FONT_HERSHEY_SIMPLEX, 0.7, 2)
                                   
    cv2.rectangle(frame,
              (left, top - text_size[1] - 5),
              (left + text_size[0], top - 5),
              (0, 0, 0),  # Black color for the background
              cv2.FILLED)
    
    cv2.putText(frame, "{} [{:.2f}]".format(label, float(confidence)),
                    (left, top - 7), cv2.FONT_HERSHEY_SIMPLEX, 0.6,
                    person_color, 2)

    detections_bb.append({
        'x1' : left,
        'y1' : top,
        'x2' : right,
        'y2' : bottom
    })

# A station is taken as soon as one person box reaches the IoU threshold.
thresh = 0.3
for stations, coordinate in coordinates.items():
    taken = False
    for detection in detections_bb:
        iou = get_iou(coordinate, detection)
        if iou >= thresh:
            taken = True
            break
    color = taken_color if taken else vacant_color
        
    cv2.rectangle(frame, (coordinate['x1'], coordinate['y1']), 
                    (coordinate['x2'], coordinate['y2']), color, 2)
    
    text_size, _ = cv2.getTextSize(f"{stations}", cv2.FONT_HERSHEY_SIMPLEX, 0.7, 2)
        
    cv2.rectangle(frame,
          (coordinate['x1'], coordinate['y1'] - text_size[1] - 5),
          (coordinate['x1'] + text_size[0], coordinate['y1'] - 5),
          (0, 0, 0),  # Black color for the background
          cv2.FILLED)
    
    cv2.putText(frame, f"{stations}",
                (coordinate['x1'], coordinate['y1'] - 7), 
                cv2.FONT_HERSHEY_SIMPLEX, 0.6,
                color, 2)

# Downscale only after all drawing, since the boxes use original-frame pixels.
frame = cv2.resize(frame, (1080, 720), 
                    interpolation=cv2.INTER_AREA)

plt.figure(figsize=(15, 10))
plt.imshow(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
plt.axis('off')  # Optional: Turn off the axis if you don't want it
plt.show()

<h2>5. Extract datetime information</h2>

<p style="text-align:justify">There are two ways to measure how long an employee stays at a work station: we can read the timestamp that the DVR overlays on the image, visible in the upper left of the frame, or we can derive the time from the camera's frames per second. In this project I used both methods, but the former is more appropriate because, in my experience, CCTV cameras are replaced more often than the DVR. Different cameras can record at different frame rates, so relying on the frame rate could lead to incorrect time measurements. Extracting the timestamp produced by the DVR therefore keeps the model usable for longer. Here is a sample of the original image:</p>

In [None]:
# Show the first frame of the clip whose DVR timestamp is read below.
vidcap = cv2.VideoCapture('../cctv_footage/Desire_Clip_02.mp4')
success, frame = vidcap.read()
# Only one frame is needed, so give the file handle back straight away.
vidcap.release()
if success:
    plt.figure(figsize=(15, 10))
    plt.imshow(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    plt.axis('off')
    plt.show()

## Mark Time/Date Area

In [None]:
vidcap = cv2.VideoCapture('../cctv_footage/Desire_Clip_02.mp4')
success,frame = vidcap.read()
vidcap.release()

# Check the read before drawing: on failure `frame` is None.
if not success:
    raise IOError("Could not read a frame from '../cctv_footage/Desire_Clip_02.mp4'")

# Outline the region that is cropped for OCR:
# coordinates_ocr = [(x_left, y_top), (x_right, y_bottom)]
cv2.rectangle(frame, (coordinates_ocr[0][0], coordinates_ocr[0][1]), (coordinates_ocr[1][0], coordinates_ocr[1][1]), vacant_color, 2)
plt.figure(figsize=(15, 10))
plt.imshow(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
plt.axis('off')
plt.show()

### a. Datetime information using OCR

<p style="text-align:justify">First, I performed image preprocessing so that the OCR model can read the text in the image more accurately. The steps are: crop the image to the area that contains only the date and time, convert the image from BGR to grayscale, adjust the contrast and brightness to make the background brighter before thresholding, apply adaptive thresholding to remove the background, and lastly apply morphological operations (repeated erosion) to make the text bolder.</p>

In [None]:
def multi_ero(im, num):
    """ Apply erosion to an image a given number of times

    Parameters
    ==========
    im        :     np.array
                    image to erode
    num       :     int
                    how many erosion passes to run; the image is returned
                    untouched when this is zero or negative

    Returns
    ==========
    multi_ero :     np.array
                    result of the last erosion pass
    """
    eroded = im
    passes = range(num)
    for _pass in passes:
        # each pass works on the output of the previous one
        eroded = erosion(eroded)
    return eroded

def date_time_ocr():
    """ Sample frames from the reference recording and OCR their timestamp

    Every 50th frame out of (at most) 8000 frames is converted to
    grayscale, brightened, adaptively thresholded, cropped to
    ``coordinates_ocr``, eroded twice and passed to pytesseract. The OCR
    text is then classified as valid or invalid.

    Returns
    ==========
    date_time_ocr  :   tuple
                       ``(imgs, datetime_clean, valid, invalid)`` where
                       ``imgs`` are the sampled BGR frames,
                       ``datetime_clean`` the preprocessed crops given to
                       the OCR engine, and ``valid`` / ``invalid`` the OCR
                       strings that did / did not pass the format check
    """
    contrast = 1.5
    brightness = 60

    # date immediately followed by time: spaces are stripped from the OCR
    # text before matching. Months 10-12 are accepted via ``1[0-2]``.
    datetime_pattern = re.compile(
        r'\d{4}-(?:0\d|1[0-2])-(?:[0-2]\d|3[01])'
        r'[0-5]\d:[0-5]\d:[0-5]\d')

    imgs = []
    # get images for testing
    vidcap = cv2.VideoCapture('../XVR_ch5_main_20220214100004_20220214110005.mp4')
    try:
        # the first frame is read and discarded, so sampling starts at the
        # second frame of the recording
        vidcap.read()
        for i in tqdm(range(8000)):
            success, frame = vidcap.read()
            if not success or frame is None:
                # nothing more can be read from the stream
                break
            if i % 50 == 0:
                imgs.append(frame)
    finally:
        vidcap.release()

    invalid = []
    valid = []
    datetime_clean = []

    for img in tqdm(imgs):
        gray = cv2.cvtColor(img.copy(), cv2.COLOR_BGR2GRAY)

        # linear contrast/brightness adjustment, clipped to the 8-bit range
        gray = np.clip(contrast * gray.astype('float32') + brightness,
                       0.0, 255.0).astype('uint8')

        binary = cv2.adaptiveThreshold(gray,
                                       255,
                                       cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                       cv2.THRESH_BINARY,
                                       21, 2)

        # crop to the timestamp area: rows are y, columns are x
        crop = binary[coordinates_ocr[0][1]:coordinates_ocr[1][1],
                      coordinates_ocr[0][0]:coordinates_ocr[1][0]]

        crop = multi_ero(crop, 2)
        datetime_clean.append(crop)

        text = pytesseract.image_to_string(crop, lang='eng',
                config='--psm 10 --oem 3 -c tessedit_char_whitelist=0123456789:-')
        text = text.replace(' ', '')

        # A reading is valid only if it contains a well-formed timestamp AND
        # the raw text is exactly 20 characters long.
        # NOTE(review): 20 is presumably the 18 timestamp characters plus the
        # trailing characters tesseract appends -- confirm.
        if datetime_pattern.search(text) is None or len(text) != 20:
            invalid.append(text)
        else:
            valid.append(text)

    return imgs, datetime_clean, valid, invalid

# def date_time_ocr():            

In [None]:
# cv2_imshow(datetime_clean[1])

<p style="text-align:justify">I used <a href="https://pypi.org/project/pytesseract/">Pytesseract</a> to read the text in the image and convert it to a Python datetime object. Pytesseract is a Python wrapper for Google's Tesseract optical character recognition (OCR) engine, which uses deep learning, in particular an LSTM, to predict the text in the image. I used the following configuration: <code>psm</code>=10 so that it will classify per character, and <code>tessedit_char_whitelist=0123456789:-</code> so that the model is forced to choose among these whitelisted characters, which are the only ones expected in our date and time element.</p>

In [None]:
def get_ocr_datetime(img, contrast=3, brightness=60, date_override='2022-02-14'):
    """ get the datetime equivalent based on the image

    Parameters
    ==========
    img           :    np.array
                       BGR image file (full CCTV frame)
    contrast      :    int
                       contrast between 1-3 (values outside are clamped)
    brightness    :    int
                       brightness between 0-100 (values outside are clamped)
    date_override :    str or None
                       when the OCR text is exactly 20 characters long, its
                       first 10 characters are replaced by this string, so
                       only the time part is taken from the OCR result.
                       Pass None to keep the date read by the OCR engine.

    Returns
    =========
    get_ocr_datetime  :   datetime.datetime or None
                          datetime equivalent of the cctv image, or None
                          when no valid timestamp could be read
    """
    # convert to grayscale
    gray = cv2.cvtColor(img.copy(), cv2.COLOR_BGR2GRAY)

    # clamp the adjustment parameters to their supported ranges
    contrast = min(max(contrast, 1.0), 3.0)
    brightness = min(max(brightness, 0.0), 100.0)

    # linear contrast/brightness adjustment, clipped to the 8-bit range
    gray = np.clip(contrast * gray.astype('float32') + brightness,
                   0.0, 255.0).astype('uint8')

    # perform adaptive thresholding
    binary = cv2.adaptiveThreshold(gray,
                                   255,
                                   cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                   cv2.THRESH_BINARY,
                                   21, 2)

    # keep only the region of interest: rows are y, columns are x
    crop = binary[coordinates_ocr[0][1]:coordinates_ocr[1][1],
                  coordinates_ocr[0][0]:coordinates_ocr[1][0]]

    # perform multiple erosion
    crop = multi_ero(crop, 2)

    # get text using pytesseract
    text = pytesseract.image_to_string(crop, lang='eng',
            config='--psm 10 --oem 3 -c tessedit_char_whitelist=0123456789:-')
    text = text.replace(' ', '')

    if date_override is not None and len(text) == 20:
        text = date_override + text[10:]

    # date immediately followed by time (spaces were stripped above);
    # months 10-12 are accepted via ``1[0-2]``
    match = re.search(
        r'(\d{4}-(?:0\d|1[0-2])-(?:[0-2]\d|3[01]))'
        r'([0-5]\d:[0-5]\d:[0-5]\d)',
        text)
    if match is None:
        return None

    timestamp_string = f"{match.group(1)} {match.group(2)}"
    try:
        return datetime.strptime(timestamp_string, "%Y-%m-%d %H:%M:%S")
    except ValueError:
        # the pattern still admits impossible values (e.g. hour 38, day 00)
        # that a misread digit can produce; treat those as unreadable
        return None

In [None]:
# print(f'correct datetime format percentage: {len(valid)/len(imgs) * 100}')

<p style="text-align:justify">I checked the format and found that about 70% of the results are read by the model in a correct datetime format. In particular, it has difficulty recognizing the digit eight when it is next to a six. However, there are about 15 frames per second, so the model has many chances to read the correct time. Here is a sample of the pytesseract output after it is converted to a Python datetime.</p>

In [None]:
# get_ocr_datetime(imgs[1])

### b. Datetime information using fps

<p style="text-align:justify">The camera that I am using records at 15 fps, which is one of the standard frame rates for CCTV. Using that information, we count the number of frames in which an employee is sitting at the station, and every 15 frames are counted as 1 second.</p>

<h2>6. Integration of timer to workstation </h2>

<h3>a. OCR</h3>

In [None]:
# initialize timer per station
# list definition:
# list[0] : total time in work station
# list[1] : last datetime 
# list[2] : debt

def on_ocr():
    """ Annotate the clip and time each station using the OCR timestamp

    For every frame after the first 1600, people are detected, each
    station in ``coordinates`` is marked taken when a detection overlaps
    it with IoU >= 0.3, and the station timer is advanced by the
    difference between consecutive OCR timestamps (at most 3 seconds per
    step). The "debt" flag records that a station still needs a valid
    timestamp, so the OCR is consulted again on the next frame even when
    the station is no longer taken.

    Returns
    ==========
    on_ocr   :   tuple
                 ``(timer, img_array)`` where ``timer`` maps each station
                 name to ``[total time, last datetime, debt]`` and
                 ``img_array`` is the list of annotated frames
    """
    timer = {station: [timedelta(0), None, False] for station in coordinates}

    # %cd /content/
    cap = cv2.VideoCapture('../cctv_footage/Desire_Clip_02.mp4')

    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    print("Total Frame: ",total_frames)

    width =  1600
    height = 900
    resize = True
    thresh = 0.3
    max_step = timedelta(seconds=3)
    img_array =[]
    not_read = object()  # marks "OCR not attempted yet for this frame"

    try:
        # the first frame is read and discarded before the loop
        cap.read()

        for i in tqdm(range(total_frames)):
            # Capture frame-by-frame
            ret, frame = cap.read()
            if not ret or frame is None:
                # the reported frame count can exceed the frames that can
                # actually be read (one frame was already consumed above)
                break

            if i <= 1600:
                continue

            detections, width_ratio, height_ratio = darknet_helper(frame, 
                                                                   width, 
                                                                   height)
            detections = non_max_suppression_fast1(detections, 0.65)
            detections_bb = []
            for label, confidence, bbox in detections:
                left, top, right, bottom = bbox2points(bbox)
                # scale the box by the ratios returned by darknet_helper
                left, top, right, bottom = (int(left * width_ratio), 
                                            int(top * height_ratio), 
                                            int(right * width_ratio), 
                                            int(bottom * height_ratio))
                cv2.rectangle(frame, (left, top), (right, bottom), person_color, 2)
                cv2.putText(frame, "{} [{:.2f}]".format(label, float(confidence)),
                                    (left, top - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5,
                                    person_color, 4)

                detections_bb.append({
                        'x1' : left,
                        'y1' : top,
                        'x2' : right,
                        'y2' : bottom
                    })

            # the timestamp is the same for every station of a frame, so the
            # OCR runs at most once per frame, the first time it is needed
            ocr_time = not_read

            for stations, coordinate in coordinates.items():
                taken = any(get_iou(coordinate, detection) >= thresh
                            for detection in detections_bb)
                state = timer[stations]

                if taken or state[2]:
                    if ocr_time is not_read:
                        ocr_time = get_ocr_datetime(frame)

                    if ocr_time is None:
                        # unreadable timestamp: try again on the next frame
                        state[2] = True
                    else:
                        state[2] = False
                        last_seen = state[1]
                        if last_seen is None:
                            state[1] = ocr_time
                        elif last_seen <= ocr_time and (ocr_time - last_seen) <= max_step:
                            state[0] += ocr_time - last_seen
                            state[1] = ocr_time
                        else:
                            # invalid time: earlier than the last reading or
                            # more than 3 seconds after it.
                            # NOTE(review): the last datetime is not refreshed
                            # here, so after an absence longer than 3 seconds
                            # every later reading also lands in this branch --
                            # confirm whether the session should restart.
                            state[2] = True

                color = taken_color if taken else vacant_color

                # the station overlay is drawn on every frame, including
                # frames whose timestamp could not be read
                cv2.rectangle(frame, (coordinate['x1'], coordinate['y1']), 
                                (coordinate['x2'], coordinate['y2']), color, 2)

                cv2.putText(frame, f"{stations} [{str(state[0])}]",
                            (coordinate['x1'], coordinate['y1'] - 5), 
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5,
                            color, 2)

            if resize:
                frame = cv2.resize(frame, (width, height), 
                                    interpolation=cv2.INTER_AREA)
            img_array.append(frame)

            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # When everything done (or on error), release the capture
        cap.release()
        cv2.destroyAllWindows()

    return timer, img_array
    
# on_ocr()

### b. FPS

In [None]:
# initialize timer per station
# list definition:
# list[0] : total time in work station
# list[1] : number of frames in each work station
timer = {station: [timedelta(0), 0] for station in coordinates}
cap = cv2.VideoCapture('../cctv_footage/1.mp4')

total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
# the first frame is read here and is not annotated
success,frame = cap.read()

# Number of occupied frames that count as one second. Taken from the frame
# rate the clip reports; falls back to 10 when the container reports none.
reported_fps = cap.get(cv2.CAP_PROP_FPS)
frames_per_second = int(round(reported_fps)) if reported_fps and reported_fps > 0 else 10

width =  frame_width
height = frame_height
resize = False
thresh = 0.2
img_array =[]
for i in tqdm(range(total_frames-1)):
    # Capture frame-by-frame
    ret, frame = cap.read()
    if not ret or frame is None:
        # the reported frame count is not guaranteed to be exact
        break

    detections, width_ratio, height_ratio = darknet_helper(frame, 
                                                           width, 
                                                           height)
    detections = non_max_suppression_fast1(detections, 0.65)
    detections_bb = []
    for label, confidence, bbox in detections:
        left, top, right, bottom = bbox2points(bbox)
        # scale the box by the ratios returned by darknet_helper
        left, top, right, bottom = (int(left * width_ratio), 
                                    int(top * height_ratio), 
                                    int(right * width_ratio), 
                                    int(bottom * height_ratio))
        cv2.rectangle(frame, (left, top), (right, bottom), person_color, 2)

        label_text = "{} [{:.2f}]".format(label, float(confidence))
        text_size, _ = cv2.getTextSize(label_text, cv2.FONT_HERSHEY_SIMPLEX, 0.7, 2)
        cv2.rectangle(frame,
              (left, top - text_size[1] - 5),
              (left + text_size[0], top - 5),
              (0, 0, 0),  # Black color for the background
              cv2.FILLED)

        cv2.putText(frame, label_text,
                            (left, top - 7), cv2.FONT_HERSHEY_SIMPLEX, 0.6,
                            person_color, 4)

        detections_bb.append({
                'x1' : left,
                'y1' : top,
                'x2' : right,
                'y2' : bottom
            })

    for stations, coordinate in coordinates.items():
        # a station is taken when any detection overlaps it enough
        taken = any(get_iou(coordinate, detection) >= thresh
                    for detection in detections_bb)

        if taken:
            timer[stations][1] += 1
            # a full second of occupied frames has accumulated
            if timer[stations][1] % frames_per_second == 0:
                timer[stations][1] = 0
                timer[stations][0] += timedelta(seconds=1)

        color = taken_color if taken else vacant_color

        cv2.rectangle(frame, (coordinate['x1'], coordinate['y1']), 
                        (coordinate['x2'], coordinate['y2']), color, 2)

        station_text = f"{stations} [{str(timer[stations][0])}]"
        text_size, _ = cv2.getTextSize(station_text, cv2.FONT_HERSHEY_SIMPLEX, 0.7, 2)

        cv2.rectangle(frame,
              (coordinate['x1'], coordinate['y1'] - text_size[1] - 5),
              (coordinate['x1'] + text_size[0], coordinate['y1'] - 5),
              (0, 0, 0),  # Black color for the background
              cv2.FILLED)

        cv2.putText(frame, station_text,
                    (coordinate['x1'], coordinate['y1'] - 7), 
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6,
                    color, 2)

    # head count banner in the top-left corner
    count_person = len(detections_bb)
    cv2.rectangle(frame, (23, 26), 
                      (208, 63), (0,0,0), -1)

    cv2.putText(frame, f"Count of Person: {count_person:0>2}",
                    (23 + 5,26+ 25), cv2.FONT_HERSHEY_SIMPLEX, 0.5,
                    (255,255, 255), 2)

    if resize:
        frame = cv2.resize(frame, (width, height), 
                            interpolation=cv2.INTER_AREA)
    img_array.append(frame)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# When everything done, release the capture
cap.release()
cv2.destroyAllWindows()

## 7. Save frames as Video

<p style="text-align:justify">Save the frames as an mp4 video and compress it so that it is compatible with Google Colab. The demo video will be submitted separately so that it does not blow up the size of the notebook.</p>

In [None]:
def convert_image_array_to_video(fname='../images/output_video1.mp4', fps=20):
    """ Write the frames collected in `img_array` to an mp4 file

    Reads the notebook-level variables `img_array`, `resize`, `width`,
    `height`, `frame_width` and `frame_height`.

    Parameters
    ==========
    fname     :     str
                    path of the video file to create
    fps       :     int
                    frame rate written into the video file

    Raises
    ==========
    IOError   :     when the video writer cannot be opened for `fname`
    """
    # The output size is kept in its own name: assigning to `width` /
    # `height` here would make them local to the function and leave them
    # unset whenever `resize` is True.
    if resize:
        out_size = (width, height)
    else:
        out_size = (frame_width, frame_height)

    out = cv2.VideoWriter(fname, cv2.VideoWriter_fourcc(*'mp4v'), 
                          fps, out_size)
    if not out.isOpened():
        raise IOError(f"Could not open video writer for {fname}")

    try:
        for frame in tqdm(img_array):
            # every frame must match the writer's size
            out.write(cv2.resize(frame, out_size))
    finally:
        out.release()
    
convert_image_array_to_video()