In [1]:
import sys
sys.path.append("/data/modules")

# Local modules from the AICity2019 winner's GitHub repository (found via the sys.path entry above)
import AICity2019_winner.utils.utils as utils
import AICity2019_winner.src.reid.modeling as modeling
import AICity2019_winner.src.reid.misc as misc


import os
import json
import re

import cv2 as cv
import numpy as np
import pandas as pd

import sklearn.preprocessing as prep
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import linear_model
import skimage
from skimage.measure import label
import scipy
from scipy.ndimage.filters import gaussian_filter
import PIL
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from mmdet.apis import init_detector, inference_detector

import itertools
from collections import OrderedDict

# I/O Helpers

In [2]:
class VideoReader:
    """
    Thin wrapper around cv.VideoCapture for reading frames from a video file.

    filename: path to the video file

    Attributes (filled in on construction):
    nframes: frame count reported by OpenCV (CAP_PROP_FRAME_COUNT)
    framerate: frames per second reported by OpenCV (CAP_PROP_FPS)
    img_shape: numpy shape of the first decoded frame

    Raises IOError if the first frame of the video cannot be read.
    """
    def __init__(self, filename):
        self.filename = filename

        self.nframes = None
        self.framerate = None
        self.img_shape = None
        self._set_video_info()

    def load_video(self, interval=1):
        """
        Loads the images of the video
        Returns a generator with the images, and the corresponding frame numbers.

        interval: Interval between frames returned. eg. 1 = every frame, 20 = every 20th frame.

        The generator stops early if a frame cannot be read, so it may yield
        fewer images than there are frame numbers.
        """

        frame_nums = range(0, self.nframes, interval)

        # Lazily decodes the requested frames with opencv
        def read_frames():
            vid = cv.VideoCapture(self.filename)

            try:
                for i in frame_nums:
                    vid.set(cv.CAP_PROP_POS_FRAMES, i)  # seek to the requested frame

                    has_frame, img = vid.read()
                    if not has_frame:
                        break

                    yield img
            finally:
                # Runs when the generator is exhausted, closed early, or
                # garbage collected, so the capture handle is always released.
                vid.release()

        return read_frames(), frame_nums

    def get_frame(self, n):
        """
        Returns the image at a specific frame number.
        Returns whatever OpenCV hands back (None) when that frame cannot be read.
        """

        vid = cv.VideoCapture(self.filename)
        try:
            vid.set(cv.CAP_PROP_POS_FRAMES, n)
            _, img = vid.read()
        finally:
            vid.release()

        return img

    def _set_video_info(self):
        """
        Reads the first frame to fill in nframes, framerate and img_shape.
        """
        vid = cv.VideoCapture(self.filename)
        try:
            has_frame, img = vid.read()
            if not has_frame or img is None:
                # Fail with a clear message instead of an AttributeError on None.shape
                raise IOError(f"Could not read a frame from video: {self.filename}")

            self.nframes = int(vid.get(cv.CAP_PROP_FRAME_COUNT))
            self.framerate = vid.get(cv.CAP_PROP_FPS)
            self.img_shape = img.shape
        finally:
            vid.release()

In [3]:
class ImageReader:
    """
    Reads the images of a folder in numeric filename order.

    folder: directory whose entries are all named "<integer>.<extension>", eg. "123.jpg"

    filenames: the folder's entries, sorted by their integer stem
    """
    def __init__(self, folder):
        self.folder = folder

        # "123.jpg" -> sort by 123 instead of the full string.
        # splitext copes with extensions of any length ("7.jpeg", "7.png"),
        # where slicing off the last 4 characters only worked for 3-letter ones.
        self.filenames = sorted(os.listdir(folder), key=lambda f: int(os.path.splitext(f)[0]))

    def load_images(self):
        """
        Loads the images, returning them as a generator (same order as self.filenames).

        """
        for filename in self.filenames:
            file_path = os.path.join(self.folder, filename)
            img = cv.imread(file_path)

            yield img

---
# Background Modelling

In [4]:
def calc_background(images, interval=20, alpha=0.1, start_frame=1, threshold=5):
    """
    Calculates the background over all of the images by averaging them.

    images: iterable of numpy images
    interval: number of images between each background calculation
    alpha: weighting of averaging. high = more of new frame, low = more of running average.
    start_frame: frame number to start averaging on
    threshold: Mean Absolute Error threshold between frames. Only calculates if there is a significant difference.

    Yields (background, frame_index) for every frame on which a calculation is due.
    The background is an all-zero (black) image when the frame did not differ
    enough from the previously compared frame. The very first image only seeds
    the running background and is never yielded.
    """

    running_bg = None
    prev_img = None
    for i, img in enumerate(images):
        if running_bg is None: # initial image
            running_bg = img
            prev_img = img
            continue

        if (i - start_frame) % interval != 0: # only do the calcs on frames start_frame, start_frame + interval, ...
            continue

        # Compare in floating point: subtracting uint8 images directly wraps
        # around modulo 256 (10 - 12 -> 254) and badly overstates small changes.
        diff = np.mean(np.abs(np.asarray(prev_img, dtype=np.float32) - np.asarray(img, dtype=np.float32)))
        if diff > threshold: # if new image is significantly different from old
            running_bg = (1 - alpha) * running_bg + alpha * img # new background
            yield running_bg, i

        else:
            yield running_bg * 0, i  # black image

        prev_img = img

In [None]:
def calc_bg_tensor(images, interval=20, alpha=0.1, start_frame=1, threshold=5, device="cuda"):
    """
    Same as calc_background function, uses GPU instead. Doesn't seem to speed it up much.

    images: iterable of images (anything torch.as_tensor accepts)
    interval, alpha, start_frame, threshold: see calc_background function
    device: torch device the averaging runs on (defaults to the GPU)

    Yields (background as a uint8 numpy array, frame_index). The background is
    all zeros when the frame did not differ enough from the previously compared one.
    """

    running_bg = None
    prev_img = None
    for i, img in enumerate(images):
        img = torch.as_tensor(img, device=device, dtype=torch.float16)

        if running_bg is None: # initial image
            running_bg = img.clone()  # clone: running_bg is updated in place below
            prev_img = img
            continue

        if (i - start_frame) % interval != 0: # only do the calcs on frames start_frame, start_frame + interval, ...
            continue

        # Out-of-place on purpose: an in-place sub_ would overwrite prev_img, which
        # can share memory with an image supplied by the caller.
        diff = (prev_img.float() - img.float()).abs().mean().item()
        if diff > threshold: # if new image is significantly different from old
            running_bg.mul_(1 - alpha).add_(img.mul(alpha)) # new background
            # Blocking copy: .numpy() reads the host buffer straight away, so the
            # transfer must have finished before we return it.
            yield running_bg.to(dtype=torch.uint8, device="cpu").numpy(), i

        else:
            yield running_bg.mul(0).to(dtype=torch.uint8, device="cpu").numpy(), i  # black image

        prev_img = img

In [5]:
def calc_bg_full_video(video_path, output_folder, interval=20, alpha=0.1, start_frame=1, threshold=5, verbose=False):
    """
    Create background images for a single video.
    Each background is written to output_folder as "<frame number>.jpg".

    video_path: path to raw video
    output_folder: folder to put background images in (created if it does not exist)
    interval, alpha, start_frame, threshold: see calc_background function
    verbose: print out progress

    """

    vid = VideoReader(video_path)

    # cv.imwrite does not create directories; without this a missing folder
    # means no image is ever written and nothing reports the failure.
    os.makedirs(output_folder, exist_ok=True)

    raw_imgs, _ = vid.load_video()
    bg_images = calc_background(raw_imgs, interval, alpha, start_frame, threshold)

    for bg_img, frame in bg_images:
        filename = os.path.join(output_folder, f"{frame}.jpg")
        cv.imwrite(filename, bg_img)

        if verbose:
            print(f"{frame}/{vid.nframes}")

---
# Object Detection

In [6]:
class Detector:
    """
    Object detector built on an mmdetection model.

    built using: https://github.com/open-mmlab/mmdetection/tree/master/configs/htc
    (HTC + DCN + ResNeXt-101-FPN, mAP=50.7 model)

    Might take some fiddling to make it work with any mmdetection model

    config_path: mmdetection config file, eg. "htc/htc_dconv_c3-c5_mstrain_400_1400_x101_64x4d_fpn_20e.py"
    model_path: checkpoint file, eg. "htc_dconv_c3-c5_mstrain_400_1400_x101_64x4d_fpn_20e_20190408-0e50669c.pth"
    verbose_interval: print progress every this many images. None (or 0) disables printing
    class_restrictions: collection of class indices to keep, others are discarded.
                        if None (or empty), all classes are kept

    """

    def __init__(self, config_path, model_path, verbose_interval=None, class_restrictions=set([2,3,5,7])):
        torch.cuda.empty_cache()
        self.model = init_detector(config_path, model_path, device='cuda:0')
        self.class_labels = self.model.CLASSES

        # Copy into a private set so the shared default argument is never
        # aliased (and therefore never mutated) through an instance.
        self.class_restrictions = set(class_restrictions) if class_restrictions is not None else None

        self.verbose_interval = verbose_interval

    def detect_objects(self, img):
        """
        Runs object detection on an image.
        Returns bounding boxes [x1, y1, x2, y2], class labels, and confidence scores,
        all sorted by descending score.
        """

        results = inference_detector(self.model, img)
        # Models with a mask head return a (bbox results, segmentation results)
        # tuple. Checking the type rather than len(results) == 2 keeps plain
        # 2-class detectors (a list of two per-class arrays) working.
        if isinstance(results, tuple):
            results = results[0]

        # results holds one (n, 5) array per class: [x1, y1, x2, y2, score]
        bbox_and_scores = np.vstack(results)
        bboxes, scores = bbox_and_scores[:, :4], bbox_and_scores[:, 4]
        labels = np.concatenate([[i] * len(bbox) for i, bbox in enumerate(results)]).astype(int)

        order = np.argsort(scores)[::-1] # sort
        return bboxes[order], labels[order], scores[order]

    def detect_crop(self, img, crop_boxes):
        """
        Splits an image into boxes, upscales them, performs detection, downscales detections, merges detections

        img: numpy array of image
        crop_boxes: list of crop bounding boxes [x1, y1, x2, y2]

        returns: bounding boxes (in original image coordinates), class labels and
                 scores, sorted by descending score
        """
        pil_img = PIL.Image.fromarray(img)
        crops = crop_image(pil_img, crop_boxes)
        resized, biggest = resize_crops(crops)

        resized_np = (np.array(img) for img in resized)

        # The "frame" column of crop_results is the index of the crop box
        crop_results = self.detect_images(resized_np, verbose=False)

        bboxes = []
        for i, x1, y1, x2, y2, score, cls in crop_results.values:
            bboxes.append(cropped_detection_to_original((x1, y1, x2, y2), crop_boxes[int(i)], biggest))
        bboxes = np.array(bboxes)

        scores = crop_results["score"].values
        labels = crop_results["class"].values

        order = np.argsort(scores)[::-1] # sort
        return bboxes[order], labels[order], scores[order]

    def _detect_rows(self, img, frame, crop_boxes):
        """
        Detects objects in one image and returns the kept detections as rows
        [frame, x1, y1, x2, y2, score, class].
        """
        if crop_boxes is not None:
            bboxes, labels, scores = self.detect_crop(img, crop_boxes)
        else:
            bboxes, labels, scores = self.detect_objects(img)

        rows = []
        for (x1, y1, x2, y2), cls, score in zip(bboxes, labels, scores):
            if self.class_restrictions and cls not in self.class_restrictions:
                continue

            rows.append([frame, x1, y1, x2, y2, score, cls])

        return rows

    def _report_progress(self, i, frame, verbose):
        """
        Prints the current frame every self.verbose_interval images, unless verbose is False.
        """
        if verbose and self.verbose_interval and (i % self.verbose_interval) == 0:
            print(f"Detecting image: {frame}")

    def detect_images(self, images, frames=None, crop_boxes=None, verbose=True):
        """
        Runs object detection on images, returns all results at the end.

        images: iterable of numpy array images
        frames: indexable sequence of frame numbers/names corresponding to images
        crop_boxes: if provided, will crop and rescale for detection
        verbose: set to False to silence the self.verbose_interval progress printing

        Returns dataframe with detection results
        """

        results = []
        for i, img in enumerate(images):
            frame = i if frames is None else frames[i]

            results.extend(self._detect_rows(img, frame, crop_boxes))
            self._report_progress(i, frame, verbose)

        return pd.DataFrame(data=results, columns=["frame", "x1", "y1", "x2", "y2", "score", "class"])

    def detect_images_generator(self, images, frames=None, crop_boxes=None, verbose=True):
        """
        Runs object detection on images, yielding the results one frame at a time.

        images: iterable of numpy array images
        frames: indexable sequence of frame numbers/names corresponding to images
        crop_boxes: if provided, will crop and rescale for detection
        verbose: set to False to silence the self.verbose_interval progress printing

        Yields frame, dataframe with detection results.
        """

        for i, img in enumerate(images):
            frame = i if frames is None else frames[i]

            results = self._detect_rows(img, frame, crop_boxes)
            self._report_progress(i, frame, verbose)

            yield frame, pd.DataFrame(data=results, columns=["frame", "x1", "y1", "x2", "y2", "score", "class"])

    def label_image(self, img, bboxes, labels, scores, score_thresh=0.1):
        """
        Returns a copy of the image with bounding boxes drawn on it.
        Boxes scoring below score_thresh are not drawn.
        """

        def get_class_color(cls):
            return (0, 0, 255) # todo: every class currently gets the same colour

        img = np.copy(img)

        for (x1, y1, x2, y2), cls, score in zip(bboxes, labels, scores):
            if score < score_thresh:
                continue

            col = get_class_color(cls)
            cv.rectangle(img, (int(x1), int(y1)), (int(x2), int(y2)), col, 1) # draw boxes

        return img

In [7]:
def detect_video(video_path, output_path, model_config_path, model_checkpoint_path):
    """
    Detects objects in every frame of a video and writes the detections to a csv.

    video_path: path to the video
    output_path: path of the csv to write
    model_config_path, model_checkpoint_path: passed to Detector
    """

    frames, frame_numbers = VideoReader(video_path).load_video()

    detector = Detector(model_config_path, model_checkpoint_path)
    detections = detector.detect_images(frames, frame_numbers)

    detections.to_csv(output_path, index=None)

In [8]:
def detect_image_folder(image_folder, output_path, model_config_path, model_checkpoint_path):
    """
    Detects objects in every image of a folder and writes the detections to a csv.
    The image filenames are used as the frame names.

    image_folder: folder read through ImageReader
    output_path: path of the csv to write
    model_config_path, model_checkpoint_path: passed to Detector
    """

    reader = ImageReader(image_folder)
    detector = Detector(model_config_path, model_checkpoint_path)

    detections = detector.detect_images(reader.load_images(), reader.filenames)
    detections.to_csv(output_path, index=None)

---
# Perspective Cropping

In [9]:
def do_regression(df, plot=False):
    """
    Fits a straight line relating vehicle size to vertical image position:
    h = a * y + b

    y is the centre y-coordinate of each bounding box and h is the square root
    of its area. Boxes whose h is not finite are left out of the fit.

    df: DataFrame with columns: [x1, y1, x2, y2], the positions of the bounding boxes
    plot: plot the points and the fitted line, and print the coefficients

    Returns (a, b)
    """

    centre_y = (df["y1"] + df["y2"]) / 2
    size = np.sqrt((df["y2"] - df["y1"]) * (df["x2"] - df["x1"]))

    # Drop boxes with a non-finite size (eg. the square root of a negative area)
    valid = np.isfinite(size)
    centre_y = centre_y[valid]
    size = size[valid]

    model = linear_model.LinearRegression()
    model.fit(
        np.array(centre_y).reshape(-1, 1), # (n, 1)
        np.array(size) # (n)
    )

    a = model.coef_[0]
    b = model.intercept_

    if plot:
        y_end = max(centre_y)

        plt.plot(centre_y, size, "o")                        # data points
        plt.plot([0, y_end], [a * 0 + b, a * y_end + b])     # fitted line
        plt.xlabel("y position")
        plt.ylabel("sqrt bounding box area")
        plt.show()

        print(f"a: {a}, b: {b}")

    return a, b

In [10]:
def generate_box_row(y_min, box_height, box_width, img_width, exclude_last=False, clip_last=True):
    """
    Builds one horizontal row of crop boxes [x1, y1, x2, y2], tiled left to right from x = 0.

    y_min: top y-coordinate shared by every box in the row
    box_height: height of each box
    box_width: width of each box
    img_width: width of the image being tiled
    exclude_last: if true, the final box of the row (the one that usually sticks out past the image) is dropped.
    clip_last: if true, a box whose right edge reaches or passes img_width has that edge set to img_width - 1.

    """

    n_boxes = int(np.ceil(img_width / box_width))
    edges = np.linspace(0, box_width * n_boxes, n_boxes + 1)  # x-coordinates of the box boundaries

    if exclude_last:
        edges = edges[:-1]

    row = []
    for idx in range(len(edges) - 1):
        left = edges[idx]
        right = edges[idx + 1]

        if clip_last and right >= img_width:
            right = img_width - 1

        row.append([int(left), y_min, int(right), y_min + box_height])

    return row


def generate_crop_boxes(min_height, a_reg, b_reg, img_shape, row_capacity=3, box_aspect_ratio=2, exclude_last=False, clip_last=True):
    """
    Creates all crop boxes for the image.
    
    min_height: minimum vehicle height
    a_reg: linear regression coefficient for y-pos vs height
    b_reg: linear regression intercept for y-pos vs height
    img_shape: (height, width) of image
    row_capacity: Vehicle capacity for each row. Not 100% sure how to explain this.
    box_aspect_rato: width/height bounding box ratio
    exclude_last, clip_last: see generate_box_row function.
    
    """
    
    
    def exp_func(x):
        # Modified function, seems to work better/properly
        # capacity space -> height space
#         return int(np.exp(x) / a_reg) # theirs
        return int(np.exp(a_reg * x)) # mine
    
    def log_func(x):
        # height space -> capacity space
        return np.log(x) / a_reg
    
    def f(y):
        # h = a * y + b
        # y position -> vehicle height at that position
        return a_reg * y + b_reg
    
    def f_inv(h):
        # vehicle height -> y position
        return int((h - b_reg) / a_reg)
    
    # k * ln(k*y2+b) - k * ln(k*y1+b) this corresponds to the big integral in the paper
    total_capacity = log_func(f(img_shape[0])) - log_func(min_height)
    
    num_rows = int(np.ceil(total_capacity / row_capacity))
    stride_cap = total_capacity / num_rows # capacity stride
    
    start_capacity = log_func(min_height)
    vert_capacities = np.linspace(start_capacity, start_capacity + stride_cap * num_rows, num_rows + 1)
    
    # convert to y coord
#     y_positions = list(map((lambda x: exp_func(a_reg * x - b_reg)), vert_capacities)) # theirs
    y_heights = list(map(exp_func, vert_capacities)) # mine
    y_positions = list(map(f_inv, y_heights))
    
    
    boxes = []
    for y_min, y_max in zip(y_positions[:-1], y_positions[1:]):
        box_width = (y_max - y_min) * box_aspect_ratio
        
        boxes += generate_box_row(y_min, y_max - y_min, box_width, img_shape[1], exclude_last, clip_last)
    
    return boxes

In [11]:
def crop_image(img, crop_boxes):
    """
    Cuts one crop out of the image per box.

    img: PIL Image
    crop_boxes: list of boxes [x1, y1, x2, y2]

    Returns the crops as a list, in the same order as crop_boxes
    """

    crops = []
    for box in crop_boxes:
        crops.append(img.crop(box))

    return crops

def resize_crops(crops, threshold=0.01):
    """
    Resizes images to the biggest in the list.
    Maintains aspect ratio, pads if necessary.

    Uses area to determine biggest image, might not work well if the biggest image is chopped off a bit

    crops: list of PIL Images
    threshold: aspect ratio difference above which an image is padded instead of stretched

    Returns (list of resized PIL Images, (width, height) they were resized to)
    """

    biggest_size = max((img.size for img in crops), key=np.prod)
    biggest_aspect = biggest_size[0] / biggest_size[1]

    resized = []
    for img in crops:
        aspect = img.size[0] / img.size[1]

        # chopped off image -> scale to full height and pad on a black canvas
        if abs(biggest_aspect - aspect) > threshold:
            # never let the width collapse to 0, which resize() rejects
            scaled_width = max(1, int(biggest_size[0] * aspect / biggest_aspect))
            scaled = img.resize((scaled_width, biggest_size[1]))

            new = PIL.Image.new("RGB", biggest_size, (0,0,0))
            # Paste at the top-left corner. getbbox() is the extent of the
            # non-zero pixels, so using it as the paste box breaks as soon as
            # the crop has a black border (the box no longer matches the image size).
            new.paste(scaled, (0, 0))

        # normal image -> scale
        else:
            new = img.resize(biggest_size)

        resized.append(new)

    return resized, biggest_size

In [12]:
def create_crop_boxes(results_path, crop_boxes_path, img_shape, min_object_size=10, row_capacity=3, crop_box_aspect_ratio=2):
    """
    Builds the perspective crop boxes for a video from its detection results and saves them as a csv.

    results_path: csv of detections with columns x1, y1, x2, y2
    crop_boxes_path: path of the csv to write (columns x1, y1, x2, y2)
    img_shape: (height, width) of image
    min_object_size, row_capacity, crop_box_aspect_ratio: see generate_crop_boxes function
                                                          (min_height, row_capacity, box_aspect_ratio)
    """
    # Fit object size against vertical position from the detections
    detections = pd.read_csv(results_path)
    slope, intercept = do_regression(df=detections, plot=False)

    # Lay out the crop boxes
    boxes = generate_crop_boxes(min_object_size, slope, intercept, img_shape, row_capacity, crop_box_aspect_ratio)

    # Write them out
    boxes_df = pd.DataFrame(boxes, columns=["x1", "y1", "x2", "y2"])
    boxes_df.to_csv(crop_boxes_path, index=False)

In [13]:
def cropped_detection_to_original(bbox, crop_box, resized_shape):
    """
    Maps a bounding box detected on a cropped + resized image back onto the original image.
    Assumes the crop was stretched to fill the whole resized image, ie. the crop box
    was not chopped off at the edge of the picture and then padded. todo


    bbox: detected bounding box [x1, y1, x2, y2] on the image after cropping/resizing
    crop_box: crop box [x1, y1, x2, y2] on the original image (see generate_crop_boxes function)
    resized_shape: size of the image after resizing, in the same (x, y) order as
                   the box corners, ie. (width, height)

    Returns the bounding box [x1, y1, x2, y2] in original image coordinates
    """

    # Work with the two corners of each box as rows: [[x1, y1], [x2, y2]]
    corners = np.reshape(bbox, (2, 2))
    crop_corners = np.reshape(crop_box, (2, 2))

    crop_origin = crop_corners[0]
    crop_extent = crop_corners[1] - crop_origin

    # How much the crop was enlarged along each axis
    scale = np.array(resized_shape) / crop_extent

    # Undo the resize, then shift by the crop's top-left corner
    restored = corners / scale + crop_origin

    return restored.reshape((4,))

---
# Ignore Region

In [14]:
def combine_boxes(fbf_bbox_df, img_height, img_width, score_threshold=0.1, normalize=True):
    """
    Accumulates bounding boxes into a heatmap of how strongly each pixel is covered by detections.

    For each frame, every pixel gets the highest score of the boxes covering it;
    these per-frame maps are then summed over all frames.

    fbf_bbox_df: DataFrame with columns: frame, x1, y1, x2, y2, score
    img_height, img_width: size of the heatmap
    score_threshold: boxes with a score at or below this are ignored
    normalize: min-max scale the result to [0, 1]. A completely flat heatmap
               (eg. no boxes at all) normalises to all zeros.

    Returns an (img_height, img_width) array
    """
    count_matrix = np.zeros((img_height, img_width))
    for _, frame_df in fbf_bbox_df.groupby("frame"):
        frame_score = np.zeros((img_height, img_width))

        for x1, y1, x2, y2, score in frame_df[["x1", "y1", "x2", "y2", "score"]].values:
            if not score > score_threshold:
                continue

            # Clamp to the image: a negative coordinate would otherwise be
            # treated as an index counted from the far edge of the array.
            x1, y1, x2, y2 = (max(int(v), 0) for v in (x1, y1, x2, y2))

            region = frame_score[y1:y2, x1:x2]
            np.maximum(region, score, out=region)  # add all the boxes into one image

        count_matrix += frame_score

    if normalize:
        # scale to [0, 1]
        lowest, highest = count_matrix.min(), count_matrix.max()
        if highest > lowest:
            count_matrix = (count_matrix - lowest) / (highest - lowest)
        else:
            # flat heatmap: avoid 0 / 0 turning everything into NaN
            count_matrix = np.zeros_like(count_matrix)

    return count_matrix

In [15]:
def get_connected_regions(mask, area_threshold=2000):
    """
    Removes small connected regions from a mask.

    mask: 2D boolean array
    area_threshold: regions with fewer pixels than this are cleared

    Returns the mask with every too-small region set to False
    """
    regions = label(mask, connectivity = 1) # label each connected region with its own integer

    region_ids, region_sizes = np.unique(regions, return_counts=True)
    for region_id, size in zip(region_ids, region_sizes):
        # label 0 is the background; only undersized regions get cleared
        if region_id != 0 and size < area_threshold:
            mask = np.where(regions == region_id, False, mask)

    return mask

In [16]:
def create_ignore_mask(frame_by_frame_path, ignore_matrix_path, img_shape, count_threshold=0.08, area_threshold=2000, score_threshold=0.1, gaussian_sigma=3):
    """
    Builds the ignore-region mask for a video from its detection results and saves it with np.save.

    frame_by_frame_path: csv of per-frame detections (see combine_boxes function for the columns)
    ignore_matrix_path: path the boolean mask is saved to
    img_shape: (height, width, ...) of the video frames
    count_threshold: threshold applied to the normalised heatmap, and again to the blurred mask
    area_threshold: see get_connected_regions function
    score_threshold: see combine_boxes function
    gaussian_sigma: sigma of the gaussian blur applied to the mask
    """
    detections = pd.read_csv(frame_by_frame_path)

    # Heatmap of where detections occur, normalised to [0, 1]
    height, width = img_shape[0], img_shape[1]
    heatmap = combine_boxes(detections, height, width, score_threshold)

    # Threshold, drop small regions, then blur and threshold again
    thresholded = heatmap > count_threshold
    cleaned = get_connected_regions(thresholded, area_threshold)
    blurred = gaussian_filter(cleaned.astype(float), gaussian_sigma)
    ignore_mask = blurred > count_threshold

    np.save(ignore_matrix_path, ignore_mask)
    
def create_ignore_mask_generator(fbf_results, img_shape, count_threshold=0.08, area_threshold=2000, score_threshold=0.1, gaussian_sigma=3, alpha=0.1):
    """
    Creates a rolling ignore mask, suitable for live processing.

    fbf_results: iterable of (frame, detections DataFrame) pairs
    img_shape: (height, width, ...) of the video frames
    count_threshold, area_threshold, score_threshold, gaussian_sigma: see create_ignore_mask function
    alpha: weight of the newest frame's heatmap in the running average

    Yields one boolean mask per item of fbf_results
    """

    height, width = img_shape[0], img_shape[1]
    running_heatmap = np.zeros(img_shape[:2])

    for _, results in fbf_results:
        # Exponential moving average of the un-normalised per-frame heatmaps
        frame_heatmap = combine_boxes(results, height, width, score_threshold, normalize=False)
        running_heatmap = frame_heatmap * alpha + (1 - alpha) * running_heatmap

        # Normalise after adding instead of before
        heatmap_norm = (running_heatmap - running_heatmap.min()) / (running_heatmap.max() - running_heatmap.min())

        # Threshold, drop small regions, then blur and threshold again
        thresholded = heatmap_norm > count_threshold
        cleaned = get_connected_regions(thresholded, area_threshold)
        blurred = gaussian_filter(cleaned.astype(float), gaussian_sigma)

        yield blurred > count_threshold

---
# Anomaly Detection

### ReID Model

In [17]:
class ReidExtractor:
    """
    Wraps a re-ID network and turns image regions into L2-normalised feature vectors.

    model_name: model name passed to modeling.build_model
    model_path: checkpoint loaded with torch.load
    image_size: size passed to misc.preprocess when building the input transforms
    """
    def __init__(self, model_name, model_path, image_size=320):
        # NOTE(review): 2000 is presumably the number of identity classes the
        # checkpoint was trained with - confirm against modeling.build_model
        net = modeling.build_model(model_name, 2000)
        net.cuda()
        net = nn.DataParallel(net)

        # strict=False: keys that do not match between checkpoint and model are skipped
        checkpoint = torch.load(model_path)
        net.load_state_dict(checkpoint, strict=False)
        net.eval()

        self.extractor = net
        self.image_size = image_size
        self.transforms = misc.preprocess(misc.normalize_torch, self.image_size)

    def extract(self, img_numpy, region):
        """
        Extracts the feature vector of one region of an image.

        img_numpy: numpy array of the image
        region: box [x1, y1, x2, y2] to crop out before extraction

        Returns the L2-normalised features as a float32 numpy array
        """
        crop = PIL.Image.fromarray(img_numpy).crop(region)

        # add a leading batch dimension of 1
        batch = self.transforms(crop).unsqueeze(0)

        with torch.no_grad():
            # the model returns two outputs; only the second one is used
            _, features_before = self.extractor(batch.cuda())

        raw_features = features_before.cpu().data.numpy()

        return prep.normalize(raw_features, norm="l2").astype("float32")

In [18]:
def anomaly_nms(all_results, iou_thresh=0.8):
    """
    Applies Non-maximal Supression to a list of anomalies. Resolves duplicate anomalies.

    The highest scoring anomaly of each group of overlapping regions is kept, and
    its time window is widened to cover every duplicate it absorbs.

    all_results: list of anomalies [{"region": [x1, y1, x2, y2], "score": _, "start_time": _, "end_time": _}, ...]
    iou_thresh: intersection over union threshold to consider anomalies the same.

    Returns an array with one row [x1, y1, x2, y2, score, start_time, end_time]
    per kept anomaly, highest score first. Empty input gives an array of shape (0, 7).
    """

    if len(all_results) == 0:
        return np.empty((0, 7))

    anomalies = np.array([[*res["region"], res["score"], res["start_time"], res["end_time"]]
                          for res in all_results])

    x1 = anomalies[:, 0]
    y1 = anomalies[:, 1]
    x2 = anomalies[:, 2]
    y2 = anomalies[:, 3]
    scores = anomalies[:, 4]
    start_time = anomalies[:, 5]  # views: they see the updates made to anomalies below
    end_time = anomalies[:, 6]

    areas = (x2 - x1 + 1) * (y2 - y1 + 1)

    order = scores.argsort()[::-1]  # sort by score
    keep = []
    while len(order) > 0:
        i = order[0]
        rest = order[1:]
        keep.append(i)

        # compute IoU between the kept anomaly and all remaining ones
        xx1 = np.maximum(x1[i], x1[rest])
        yy1 = np.maximum(y1[i], y1[rest])
        xx2 = np.minimum(x2[i], x2[rest])
        yy2 = np.minimum(y2[i], y2[rest])

        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        union = areas[i] + areas[rest] - inter
        iou = inter / union

        duplicates = rest[iou > iou_thresh]  # select overlapping boxes
        if len(duplicates) > 0:
            # take the widest time window, including the kept anomaly's own
            # times so that merging can never shrink its window
            anomalies[i, 5] = min(start_time[i], np.min(start_time[duplicates]))
            anomalies[i, 6] = max(end_time[i], np.max(end_time[duplicates]))

        order = rest[iou <= iou_thresh]

    anomalies = anomalies[keep, :]
    return anomalies

In [19]:
def get_overlapping_time(anomaly_results, gap_threshold=1):
    """
    Turns the overlapping anomaly detection results into anomaly event times.

    eg. for anomalies starting/ending at times: (0, 100), (50, 200), (300, 400), (390, 800)
        it will return: (0, 200), (300, 800)


    anomaly_results: DataFrame with columns: "start_time", "end_time"
    gap_threshold: events with gaps less than this are merged into one.

    Returns a list of (start, end) tuples, sorted by start time
    """

    starts = anomaly_results["start_time"].values
    ends = anomaly_results["end_time"].values

    order = starts.argsort() # sort by start time

    events = []
    event_start = None
    event_end = None
    for start, end in zip(starts[order], ends[order]):
        if event_start is not None and start - event_end < gap_threshold:
            # Still inside (or close enough to) the current event. Keep the
            # latest end seen so far: an anomaly nested inside a longer one
            # must not cut the event short.
            event_end = max(event_end, end)
            continue

        if event_start is not None:
            events.append((event_start, event_end))
        event_start, event_end = start, end

    if event_start is not None:
        events.append((event_start, event_end))

    return events

In [20]:
def get_similarity(reid_model, img1, img2, region1, region2):
    """
    Cosine similarity between the re-ID features of a region of img1 and a region of img2.

    reid_model: object with an extract(img, region) method (see ReidExtractor)
    img1, img2: images the regions are taken from
    region1, region2: boxes [x1, y1, x2, y2] on img1 and img2 respectively
    """
    features1 = reid_model.extract(img1, region1)
    features2 = reid_model.extract(img2, region2)

    return cosine_similarity(features1, features2)

In [21]:
def mask(arr, mask):
    """
    Applies a boolean mask to an array by multiplying the two together.
    Exists only so that masking is easy to spot where it happens.

    arr: np.array of data
    mask: boolean np.array with same shape as arr

    Returns arr with every element where mask is False set to zero
    """

    masked = arr * mask
    return masked

In [22]:
def add_boxes(bboxes, ignore_matrix):
    """
    Creates tmp_score and tmp_detect arrays.

    tmp_score: for each pixel, the highest score of the boxes covering it
    tmp_detect: for each pixel, whether any box covers it

    Both are zeroed / cleared wherever ignore_matrix is False.

    bboxes: list of bounding boxes and scores [x1, y1, x2, y2, score]
    ignore_matrix: 2D boolean mask with the shape of the image; only pixels
                   where it is True are kept.
    """
    h, w = ignore_matrix.shape

    tmp_score = np.zeros((h,w))
    tmp_detect = np.zeros((h,w), dtype=bool)

    for x1, y1, x2, y2, score in bboxes: # for each box
        # Clamp to the image: a negative coordinate would otherwise be treated
        # as an index counted from the far edge of the array.
        x1, y1, x2, y2 = (max(int(v), 0) for v in (x1, y1, x2, y2))

        tmp_score[y1:y2, x1:x2] = np.maximum(score, tmp_score[y1:y2, x1:x2]) # add box
        tmp_detect[y1:y2, x1:x2] = True

    tmp_score = mask(tmp_score, ignore_matrix) # get rid of stuff in ignore regions
    tmp_detect &= np.asarray(ignore_matrix, dtype=bool)  # also accepts a 0/1 numeric mask

    return tmp_score, tmp_detect

In [23]:
def get_anomalies(video_path, reid_model_path, frame_by_frame_results_path, static_results_path, ignore_matrix_path=None, 
                  reid_model_name="resnet50", start_frame=1, frame_interval=20, abnormal_duration_thresh=60, detect_thresh=5, 
                  undetect_thresh=8, score_thresh=0.3, light_thresh=0.8, anomaly_score_thresh=0.7, similarity_thresh=0.95,
                  suspicious_time_thresh=18, verbose=False):
    """
    Performs the anomaly detection
    
    video_path: path to raw video
    reid_model_path: path to re-ID model checkpoint
    frame_by_frame_results_path: path to object detection results on raw video (csv)
    static_results_path: path to object detection results on background images (csv)
    ignore_matrix_path: path to ignore region mask (.npy). None = nothing is ignored
    reid_model_name: backbone used for reid model
    start_frame: video frame to start from
    frame_interval: interval between frames to do calculations on
    abnormal_duration_thresh: duration (in seconds) to consider an object abnormal
    detect_thresh: number of processed frames with a detection before a pixel is tracked
    undetect_thresh: number of processed frames without a detection before a pixel stops being tracked
    score_thresh: detection score threshold for bounding boxes
    light_thresh: brightness threshold. The start time is moved back while the brightness of the
                  anomaly region (utils.compute_brightness) stays above this value
    anomaly_score_thresh: threshold on the mean detection score to consider an object an anomaly
    similarity_thresh: threshold for object re-ID
    suspicious_time_thresh: duration (in seconds) for an object to be considered suspicious
    verbose: verbose printing (includes the per-anomaly debug output)
    
    Returns a DataFrame with columns x1, y1, x2, y2, score, start_time, end_time built from the
    output of anomaly_nms, or None if no anomaly was recorded.
    """
    
    # Read result data
    fbf_bbox_df = pd.read_csv(frame_by_frame_results_path)
    static_results_df = pd.read_csv(static_results_path)
    
    
    # Get video data
    vid = VideoReader(video_path)
    num_frames, framerate, image_shape = vid.nframes, vid.framerate, vid.img_shape
    
    # load model
    reid_model = ReidExtractor(reid_model_name, reid_model_path)
    
    # Set up information matrices
    h, w, _ = image_shape
    
    if ignore_matrix_path is None:
        ignore_matrix = np.ones((h, w), dtype=bool) # Dont ignore anything
    else:
        ignore_matrix = np.load(ignore_matrix_path).astype(bool)
    
    detect_count_matrix = np.zeros((h,w))           # per pixel: number of processed frames with a detection
    undetect_count_matrix = np.zeros((h,w))         # per pixel: processed frames since the last detection
    start_time_matrix = np.zeros((h,w))             # per pixel: frame at which the detection count reached 1
    end_time_matrix = np.zeros((h,w))               # per pixel: latest frame with a non-zero detection count
    score_matrix = np.zeros((h,w))                  # per pixel: running sum of detection scores
    state_matrix = np.zeros((h,w), dtype=bool)      # State matrix, 0/1 distinguishes suspicious candidate states
    
    if verbose:
        print(f"total frames: {num_frames}, framerate: {framerate}, height: {h}, width: {w}")
        print("-------------------------")
    
    
    ### Main loop
    start = False       # an anomaly (longer than abnormal_duration_thresh) is currently open
    tmp_start = False   # a suspicious candidate (longer than suspicious_time_thresh) is currently open
    all_results = []
    anomaly_now = {}
    time_delay = None   # stays None if the loop below never runs
    delay_max_idx = None
    for frame in range(start_frame, num_frames, frame_interval):
        # create tmp_score, tmp_detect
        boxes = static_results_df.loc[(static_results_df["score"] > score_thresh) & 
                                      (static_results_df["frame"] == frame), 
                                      ["x1", "y1", "x2", "y2", "score"]].values

        tmp_score, tmp_detect = add_boxes(boxes, ignore_matrix)
        
        if verbose:
            print(f"frame: {frame}")
            
            if len(boxes) > 0:
                print("\tboxes:", len(boxes))

        score_matrix += tmp_score # add running totals
        detect_count_matrix += tmp_detect


        # Update detection matrices
        undetect_count_matrix += ~ tmp_detect
        undetect_count_matrix[tmp_detect] = 0

        # Update time matrices
        start_time_matrix[detect_count_matrix == 1] = -600 if frame == 1 else frame # why -600 for frame 1?
        end_time_matrix[detect_count_matrix > 0] = frame

        # Update state matrices
        state_matrix[detect_count_matrix > detect_thresh] = True

        # Detect anomaly
        # time_delay (in frames) is only non-zero for tracked pixels; delay_max_idx is the pixel tracked the longest.
        # Neither changes for the rest of this iteration.
        time_delay = mask(end_time_matrix - start_time_matrix, state_matrix)
        delay_max_idx = np.unravel_index(time_delay.argmax(), time_delay.shape)
        
        if not start and time_delay.max() / framerate > abnormal_duration_thresh: # and score_matrix[delay_max_idx]/detect_count_matrix[delay_max_idx]>0.8:

            # backtrack the start time
            time_frame = int(start_time_matrix[delay_max_idx] / 5) * 5 + 1 # why 5s and 1?

            G = np.where(detect_count_matrix < detect_count_matrix[delay_max_idx] - 2, 0, 1) # What does G represent?, why -2?
            region = utils.search_region(G, delay_max_idx)

            # vehicle reid
            if 'start_time' in anomaly_now and (time_frame / framerate - anomaly_now['end_time']) < 30: # why 30?
                f1_frame_num = max(1, anomaly_now['start_time'] * framerate)
                f2_frame_num = max(1, time_frame)

                similarity = get_similarity(reid_model, vid.get_frame(f1_frame_num), vid.get_frame(f2_frame_num), anomaly_now["region"], region)

                if similarity > similarity_thresh:
                    # same object as the previous candidate: continue from its start time, keep its region
                    time_frame = int(anomaly_now['start_time'] * framerate / 5) * 5 + 1 # why 5s and 1?
                else:
                    anomaly_now['region'] = region

            else: 
                anomaly_now['region'] = region


            # IoU stuff
            # Step backwards through the raw-video detections, 5 frames at a time, 
            # moving start_time back while a detection overlaps the anomaly region.
            max_iou = 1
            count = 1
            start_time = time_frame
            tmp_len = 1
            ratio = 1 # only read after the first pass, which always runs because max_iou starts at 1
            while (max_iou > 0.1 or tmp_len < 40 or ratio > 0.6) and time_frame > 1: # why 0.1, 40, 0.6?
                ratio = count / tmp_len

                # Select by frame value. (`time_frame in fbf_bbox_df.frame` would test the Series index, not the frame numbers.)
                bboxes = fbf_bbox_df.loc[fbf_bbox_df["frame"] == time_frame, ["x1", "y1", "x2", "y2", "score"]].values
                if len(bboxes) > 0:
                    max_iou = utils.compute_iou(anomaly_now['region'], bboxes)

                else:
                    max_iou = 0

                time_frame -= 5 # why 5?
                if max_iou > 0.3: # why 0.3?
                    count += 1
                    if max_iou > 0.5: # why 0.5?  # they mention 0.5 IoU in the paper for NMS, might not be this 
                        start_time = time_frame

                tmp_len += 1


            # back track start_time, until brightness at that spot falls below a threshold
            # NOTE(review): region is indexed as [x1, y1, x2, y2] here - confirm against utils.search_region
            for time_frame in range(start_time, 1, -5):
                tmp_im = vid.get_frame(time_frame)
                if utils.compute_brightness(tmp_im[region[1]:region[3], region[0]:region[2]]) <= light_thresh:
                    break

                start_time = time_frame


            anomaly_now['start_time'] = max(0, start_time / framerate)
            anomaly_now['end_time'] = max(0, end_time_matrix[delay_max_idx] / framerate)
            start = True



        elif not tmp_start and time_delay.max() > suspicious_time_thresh * framerate:
            time_frame = start_time_matrix[delay_max_idx]

            G = np.where(detect_count_matrix < detect_count_matrix[delay_max_idx] - 2, 0, 1) # what does G represent?
            region = utils.search_region(G, delay_max_idx)


            # vehicle reid
            if 'start_time' in anomaly_now and (time_frame / framerate - anomaly_now['end_time']) < 30: # why 30?
                f1_frame_num = max(1, anomaly_now['start_time'] * framerate)
                f2_frame_num = max(1, time_frame)

                similarity = get_similarity(reid_model, vid.get_frame(f1_frame_num), vid.get_frame(f2_frame_num), anomaly_now["region"], region)

                if similarity > similarity_thresh:
                    time_frame = int(anomaly_now['start_time'] * framerate / 5) * 5 + 1
                    region = anomaly_now['region']

            anomaly_now['region'] = region
            anomaly_now['start_time'] = max(0, time_frame / framerate)
            anomaly_now['end_time'] = max(0, end_time_matrix[delay_max_idx] / framerate)

            tmp_start = True


        if start and time_delay.max() / framerate > abnormal_duration_thresh:

            # close the open anomaly once its pixel has gone undetected for long enough
            if undetect_count_matrix[delay_max_idx] > undetect_thresh:
                anomaly_score = score_matrix[delay_max_idx] / detect_count_matrix[delay_max_idx] # mean detection score
                
                if verbose:
                    print("\t", anomaly_now, anomaly_score)
                if anomaly_score > anomaly_score_thresh:
                    anomaly_now['end_time'] = end_time_matrix[delay_max_idx] / framerate
                    anomaly_now['score'] = anomaly_score

                    all_results.append(anomaly_now)
                    anomaly_now = {}

                start = False


        elif tmp_start and time_delay.max() > suspicious_time_thresh * framerate:
            if undetect_count_matrix[delay_max_idx] > undetect_thresh:

                anomaly_score = score_matrix[delay_max_idx] / detect_count_matrix[delay_max_idx]
                if anomaly_score > anomaly_score_thresh:
                    # the candidate is kept in anomaly_now (not appended), it is only used for the re-ID checks above
                    anomaly_now['end_time'] = end_time_matrix[delay_max_idx] / framerate
                    anomaly_now['score'] = anomaly_score

                tmp_start = False

        # undetect matrix change state_matrix
        state_matrix[undetect_count_matrix > undetect_thresh] = False
        undetect_count_matrix[undetect_count_matrix > undetect_thresh] = 0

        # update matrix
        # counts and scores are reset wherever there is neither a detection in this frame nor a tracked state
        tmp_detect |= state_matrix
        detect_count_matrix = mask(detect_count_matrix, tmp_detect)
        score_matrix = mask(score_matrix, tmp_detect)
        
        
    
    # Add all anomalies to the results list
    # (an anomaly that is still open when the video ends)
    if verbose and time_delay is not None:
        print("---", start, time_delay.max(), score_matrix[delay_max_idx], detect_count_matrix[delay_max_idx])
    if start and time_delay.max() > abnormal_duration_thresh * framerate:
        anomaly_score = score_matrix[delay_max_idx] / detect_count_matrix[delay_max_idx]
        if anomaly_score > anomaly_score_thresh:
            anomaly_now['end_time'] = end_time_matrix[delay_max_idx] / framerate
            anomaly_now['score'] = anomaly_score

            all_results.append(anomaly_now)
            anomaly_now = {}
            start = False
            
    
    # Apply Non-Maximal Supression to the results
    if all_results:
        nms_out = anomaly_nms(all_results)

#         final_result = {'start_time': 892, 'score': 0} # why 892?
#         for nms_start_time, nms_end_time in nms_out[:, 5:7]:
#             if nms_start_time < final_result["start_time"]:
#                 final_result["start_time"] = max(0, int(nms_start_time - 1))
#                 final_result["score"] = 1
#                 final_result["end_time"] = nms_end_time
                
        final_results = pd.DataFrame(nms_out, columns=["x1", "y1", "x2", "y2", "score", "start_time", "end_time"])

        return final_results
    
    return None

In [24]:
class ResultsDict(OrderedDict):
    """
    Accumulates detection results as they are generated.
    
    results_gen: Generator that yields (key, results) pairs
    args, kwargs: extra arguments to pass to the dictionary
    
    Attributes:
        results_gen: the generator new results are pulled from
        max_frame: largest key stored so far (-1 while empty)
    """
    
    def __init__(self, results_gen, *args, **kwargs):
        # Set the attributes first, so that __setitem__ (which reads max_frame) is safe to run
        # if the base class inserts initial items through it.
        self.results_gen = results_gen
        self.max_frame = -1
        super().__init__(*args, **kwargs)
    
    def __getitem__(self, key):
        """
        Retrieve the results for given key.
        If the key is not in the dictionary, generate results up to the key (frame).
        Returns None if the frame is not produced from the generator
        (either it was skipped, or the generator ran out before reaching it).
        """
        assert type(key) == int  # only support integer frame number for now
        
        if key not in self:
            # Generate new results until the requested frame has been reached or passed
            while self.max_frame < key:
                try:
                    frame, results = next(self.results_gen)
                except StopIteration:
                    break  # generator exhausted, the key will never be produced
                    
                print("generated:", frame)
                self[frame] = results
                
            if key not in self:
                print("Key not found:", key)
                return None
            
        return super().__getitem__(key)
            
    def __setitem__(self, key, value):
        """
        Store the results for a key and keep track of the largest key seen.
        """
        super().__setitem__(key, value)
        self.max_frame = max(key, self.max_frame)
        
    def iterator(self):
        """
        Returns an iterator over all items, generates new results too.
        
        Items already stored are yielded first, then the generator is drained; 
        every newly generated item is stored before it is yielded.
        """
        
        for key, value in self.items():
            yield key, value
            
        for frame, results in self.results_gen:
            print("generated:", frame)
            self[frame] = results
            yield frame, results

In [25]:
def get_anomalies_sequential(video_reader, reid_model_path, fbf_results_dict, static_results_dict, ignore_matrix_gen=None, 
                  reid_model_name="resnet50", start_frame=1, frame_interval=20, abnormal_duration_thresh=60, detect_thresh=5, 
                  undetect_thresh=8, score_thresh=0.3, light_thresh=0.8, anomaly_score_thresh=0.7, similarity_thresh=0.95,
                  suspicious_time_thresh=18, verbose=False):
    """
    Performs the anomaly detection. Sequential version
    
    video_reader: VideoReader object for raw video
    reid_model_path: path to re-ID model checkpoint
    fbf_results_dict: ResultsDict object for frame-by-frame/raw video detection results
    static_results_dict: ResultsDict object for static/background detection results
    ignore_matrix_gen: generator yielding ignore matrix, must have the same interval as frame_interval.
                       One matrix is consumed per processed frame. None = nothing is ignored
    reid_model_name: backbone used for reid model
    start_frame: video frame to start from
    frame_interval: interval between frames to do calculations on
    abnormal_duration_thresh: duration (in seconds) to consider an object abnormal
    detect_thresh: number of processed frames with a detection before a pixel is tracked
    undetect_thresh: number of processed frames without a detection before a pixel stops being tracked
    score_thresh: detection score threshold for bounding boxes
    light_thresh: brightness threshold. The start time is moved back while the brightness of the
                  anomaly region (utils.compute_brightness) stays above this value
    anomaly_score_thresh: threshold on the mean detection score to consider an object an anomaly
    similarity_thresh: threshold for object re-ID
    suspicious_time_thresh: duration (in seconds) for an object to be considered suspicious
    verbose: verbose printing (includes the per-anomaly debug output)
    
    Returns a DataFrame with columns x1, y1, x2, y2, score, start_time, end_time built from the
    output of anomaly_nms, or None if no anomaly was recorded.
    """
    
    # Get video data
    vid = video_reader
    num_frames, framerate, image_shape = vid.nframes, vid.framerate, vid.img_shape
    
    # load model
    reid_model = ReidExtractor(reid_model_name, reid_model_path)
    
    # Set up information matrices
    h, w, _ = image_shape
    
    if ignore_matrix_gen is None:
        ign = np.ones((h, w), dtype=bool) # Dont ignore anything, 
        ignore_matrix_gen = (ign for _ in iter(int, 1)) # infinite generator: int() is always 0, never the sentinel 1
    
    detect_count_matrix = np.zeros((h,w))           # per pixel: number of processed frames with a detection
    undetect_count_matrix = np.zeros((h,w))         # per pixel: processed frames since the last detection
    start_time_matrix = np.zeros((h,w))             # per pixel: frame at which the detection count reached 1
    end_time_matrix = np.zeros((h,w))               # per pixel: latest frame with a non-zero detection count
    score_matrix = np.zeros((h,w))                  # per pixel: running sum of detection scores
    state_matrix = np.zeros((h,w), dtype=bool)      # State matrix, 0/1 distinguishes suspicious candidate states
    
    if verbose:
        print(f"total frames: {num_frames}, framerate: {framerate}, height: {h}, width: {w}")
        print("-------------------------")
    
    
    ### Main loop
    start = False       # an anomaly (longer than abnormal_duration_thresh) is currently open
    tmp_start = False   # a suspicious candidate (longer than suspicious_time_thresh) is currently open
    all_results = []
    anomaly_now = {}
    time_delay = None   # stays None if the loop below never runs
    delay_max_idx = None
    for frame in range(start_frame, num_frames, frame_interval):
        ignore_matrix = next(ignore_matrix_gen)
        
        # create tmp_score, tmp_detect
        static_results = static_results_dict[frame] # None when there are no results for this frame
        if static_results is not None:
            boxes = static_results.loc[static_results["score"] > score_thresh, 
                                       ["x1", "y1", "x2", "y2", "score"]].values
        else:
            boxes = []

        tmp_score, tmp_detect = add_boxes(boxes, ignore_matrix)
        
        if verbose:
            print(f"frame: {frame}")
            
            if len(boxes) > 0:
                print("\tboxes:", len(boxes))

        score_matrix += tmp_score # add running totals
        detect_count_matrix += tmp_detect


        # Update detection matrices
        undetect_count_matrix += ~ tmp_detect
        undetect_count_matrix[tmp_detect] = 0

        # Update time matrices
        start_time_matrix[detect_count_matrix == 1] = -600 if frame == 1 else frame # why -600 for frame 1?
        end_time_matrix[detect_count_matrix > 0] = frame

        # Update state matrices
        state_matrix[detect_count_matrix > detect_thresh] = True

        # Detect anomaly
        # time_delay (in frames) is only non-zero for tracked pixels; delay_max_idx is the pixel tracked the longest.
        # Neither changes for the rest of this iteration.
        time_delay = mask(end_time_matrix - start_time_matrix, state_matrix)
        delay_max_idx = np.unravel_index(time_delay.argmax(), time_delay.shape)
        
        if not start and time_delay.max() / framerate > abnormal_duration_thresh: # and score_matrix[delay_max_idx]/detect_count_matrix[delay_max_idx]>0.8:

            # backtrack the start time
            time_frame = int(start_time_matrix[delay_max_idx] / 5) * 5 + 1 # why 5s and 1?

            G = np.where(detect_count_matrix < detect_count_matrix[delay_max_idx] - 2, 0, 1) # What does G represent?, why -2?
            region = utils.search_region(G, delay_max_idx)

            # vehicle reid
            if 'start_time' in anomaly_now and (time_frame / framerate - anomaly_now['end_time']) < 30: # why 30?
                f1_frame_num = max(1, anomaly_now['start_time'] * framerate)
                f2_frame_num = max(1, time_frame)

                similarity = get_similarity(reid_model, vid.get_frame(f1_frame_num), vid.get_frame(f2_frame_num), anomaly_now["region"], region)

                if similarity > similarity_thresh:
                    # same object as the previous candidate: continue from its start time, keep its region
                    time_frame = int(anomaly_now['start_time'] * framerate / 5) * 5 + 1 # why 5s and 1?
                else:
                    anomaly_now['region'] = region

            else: 
                anomaly_now['region'] = region


            # IoU stuff
            # Step backwards through the raw-video detections, 5 frames at a time, 
            # moving start_time back while a detection overlaps the anomaly region.
            max_iou = 1
            count = 1
            start_time = time_frame
            tmp_len = 1
            ratio = 1 # only read after the first pass, which always runs because max_iou starts at 1
            while (max_iou > 0.1 or tmp_len < 40 or ratio > 0.6) and time_frame > 1: # why 0.1, 40, 0.6?
                ratio = count / tmp_len

                fbf_results = fbf_results_dict[time_frame] # None when there are no results for this frame
                if fbf_results is not None:
                    bboxes = fbf_results[["x1", "y1", "x2", "y2", "score"]].values
                    max_iou = utils.compute_iou(anomaly_now['region'], bboxes)

                else:
                    max_iou = 0

                time_frame -= 5 # why 5?
                if max_iou > 0.3: # why 0.3?
                    count += 1
                    if max_iou > 0.5: # why 0.5?  # they mention 0.5 IoU in the paper for NMS, might not be this 
                        start_time = time_frame

                tmp_len += 1


            # back track start_time, until brightness at that spot falls below a threshold
            # NOTE(review): region is indexed as [x1, y1, x2, y2] here - confirm against utils.search_region
            for time_frame in range(start_time, 1, -5):
                tmp_im = vid.get_frame(time_frame)
                if utils.compute_brightness(tmp_im[region[1]:region[3], region[0]:region[2]]) <= light_thresh:
                    break

                start_time = time_frame


            anomaly_now['start_time'] = max(0, start_time / framerate)
            anomaly_now['end_time'] = max(0, end_time_matrix[delay_max_idx] / framerate)
            start = True



        elif not tmp_start and time_delay.max() > suspicious_time_thresh * framerate:
            time_frame = start_time_matrix[delay_max_idx]

            G = np.where(detect_count_matrix < detect_count_matrix[delay_max_idx] - 2, 0, 1) # what does G represent?
            region = utils.search_region(G, delay_max_idx)


            # vehicle reid
            if 'start_time' in anomaly_now and (time_frame / framerate - anomaly_now['end_time']) < 30: # why 30?
                f1_frame_num = max(1, anomaly_now['start_time'] * framerate)
                f2_frame_num = max(1, time_frame)

                similarity = get_similarity(reid_model, vid.get_frame(f1_frame_num), vid.get_frame(f2_frame_num), anomaly_now["region"], region)

                if similarity > similarity_thresh:
                    time_frame = int(anomaly_now['start_time'] * framerate / 5) * 5 + 1
                    region = anomaly_now['region']

            anomaly_now['region'] = region
            anomaly_now['start_time'] = max(0, time_frame / framerate)
            anomaly_now['end_time'] = max(0, end_time_matrix[delay_max_idx] / framerate)

            tmp_start = True


        if start and time_delay.max() / framerate > abnormal_duration_thresh:

            # close the open anomaly once its pixel has gone undetected for long enough
            if undetect_count_matrix[delay_max_idx] > undetect_thresh:
                anomaly_score = score_matrix[delay_max_idx] / detect_count_matrix[delay_max_idx] # mean detection score
                
                if verbose:
                    print("\t", anomaly_now, anomaly_score)
                if anomaly_score > anomaly_score_thresh:
                    anomaly_now['end_time'] = end_time_matrix[delay_max_idx] / framerate
                    anomaly_now['score'] = anomaly_score

                    all_results.append(anomaly_now)
                    anomaly_now = {}

                start = False


        elif tmp_start and time_delay.max() > suspicious_time_thresh * framerate:
            if undetect_count_matrix[delay_max_idx] > undetect_thresh:

                anomaly_score = score_matrix[delay_max_idx] / detect_count_matrix[delay_max_idx]
                if anomaly_score > anomaly_score_thresh:
                    # the candidate is kept in anomaly_now (not appended), it is only used for the re-ID checks above
                    anomaly_now['end_time'] = end_time_matrix[delay_max_idx] / framerate
                    anomaly_now['score'] = anomaly_score

                tmp_start = False

        # undetect matrix change state_matrix
        state_matrix[undetect_count_matrix > undetect_thresh] = False
        undetect_count_matrix[undetect_count_matrix > undetect_thresh] = 0

        # update matrix
        # counts and scores are reset wherever there is neither a detection in this frame nor a tracked state
        tmp_detect |= state_matrix
        detect_count_matrix = mask(detect_count_matrix, tmp_detect)
        score_matrix = mask(score_matrix, tmp_detect)
        
        
    
    # Add all anomalies to the results list
    # (an anomaly that is still open when the video ends)
    if verbose and time_delay is not None:
        print("---", start, time_delay.max(), score_matrix[delay_max_idx], detect_count_matrix[delay_max_idx])
    if start and time_delay.max() > abnormal_duration_thresh * framerate:
        anomaly_score = score_matrix[delay_max_idx] / detect_count_matrix[delay_max_idx]
        if anomaly_score > anomaly_score_thresh:
            anomaly_now['end_time'] = end_time_matrix[delay_max_idx] / framerate
            anomaly_now['score'] = anomaly_score

            all_results.append(anomaly_now)
            anomaly_now = {}
            start = False
            
    
    # Apply Non-Maximal Supression to the results
    if all_results:
        nms_out = anomaly_nms(all_results)

#         final_result = {'start_time': 892, 'score': 0} # why 892?
#         for nms_start_time, nms_end_time in nms_out[:, 5:7]:
#             if nms_start_time < final_result["start_time"]:
#                 final_result["start_time"] = max(0, int(nms_start_time - 1))
#                 final_result["score"] = 1
#                 final_result["end_time"] = nms_end_time
                
        final_results = pd.DataFrame(nms_out, columns=["x1", "y1", "x2", "y2", "score", "start_time", "end_time"])

        return final_results
    
    return None

---
---
# Running everything

In [26]:
def full_run_single(video_id, video_dir, static_dir, frame_by_frame_results_dir, static_results_dir, crop_boxes_dir, 
                    ignore_mask_dir, detector_config_path, detector_model_path, reid_model_path, reid_model_backbone,
                    crop_results_dir, anomaly_results_dir,
                    bg_interval=4, bg_alpha=0.05, bg_start_frame=1, bg_threshold=5, raw_detect_interval=30, 
                    crop_min_obj_size=8, crop_row_capacity=3, crop_box_aspect_ratio=2,
                    ignore_count_thresh=0.08, ignore_area_thresh=2000, ignore_score_thresh=0.1, ignore_gau_sigma=3,
                    abnormal_duration_thresh=60, detect_duration_thresh=6, undetect_duration_thresh=8, bbox_score_thresh=0.3,
                    light_thresh=0.8, anomaly_thresh=0.8, similarity_thresh=0.95, suspicious_duration_thresh=18,
                    detector_verbose_interval=20, verbose=True):
    
    """
    Runs the full anomaly detection pipeline on a video.
    
    Only the anomaly detection stage is currently active. The earlier stages (background modeling, 
    detection, crop boxes, ignore mask) are commented out below, so the detection results csv files
    and the ignore mask are read from the folders given here.
    
    video_id: video id/name (the video is read from "<video_dir>/<video_id>.mp4")
    video_dir: folder the video is in
    static_dir: folder to put the background images in
    frame_by_frame_results_dir: folder for the raw video detection results
    static_results_dir: folder for the background image detection results
    crop_boxes_dir: folder to put the crop boxes in
    ignore_mask_dir: folder for the ignore region mask
    crop_results_dir: folder for the detection results on cropped background images
    anomaly_results_dir: folder the anomaly results csv is written to
    
    detector_config_path: path to detector configuration file
    detector_model_path: path to detector model checkpoint
    reid_model_path: path to re-ID model checkpoint
    reid_model_backbone: re-ID model backbone. eg. "resnet50"
    
    bg_interval, bg_alpha, bg_start_frame, bg_threshold: see calc_bg_full_video function.
        bg_start_frame and bg_interval are also used as start frame and frame interval for get_anomalies
    raw_detect_interval: number of frames between detection on raw video
    crop_min_obj_size, crop_row_capacity, crop_box_aspect_ratio: see create_crop_boxes function
    ignore_count_thresh, ignore_area_thresh, ignore_score_thresh, ignore_gau_sigma: see create_ignore_mask function
    abnormal_duration_thresh, detect_duration_thresh, undetect_duration_thresh, bbox_score_thresh,
        light_thresh, anomaly_thresh, similarity_thresh, suspicious_duration_thresh:
            See get_anomalies function
    
    detector_verbose_interval: detector progress printing interval
    verbose: verbose printing
    
    Returns (anomalies, anomaly_event_times): the get_anomalies results and the merged event
    times from get_overlapping_time. Returns ([], []) when get_anomalies finds nothing.
    """
    
    # Set up file paths (every per-video csv shares the same file name)
    csv_name = f"{video_id}.csv"
    
    video_path = os.path.join(video_dir, f"{video_id}.mp4")
    static_images_folder = os.path.join(static_dir, f"{video_id}")
    ignore_mask_path = os.path.join(ignore_mask_dir, f"{video_id}.npy")
    
    fbf_results_path = os.path.join(frame_by_frame_results_dir, csv_name)
    static_results_path = os.path.join(static_results_dir, csv_name)
    crop_boxes_path = os.path.join(crop_boxes_dir, csv_name)
    crop_results_path = os.path.join(crop_results_dir, csv_name)
    anomaly_results_path = os.path.join(anomaly_results_dir, csv_name)
    
    # Create folders
    for folder in (static_images_folder, frame_by_frame_results_dir, static_results_dir, crop_boxes_dir, 
                   crop_results_dir, ignore_mask_dir, anomaly_results_dir):
        os.makedirs(folder, exist_ok=True)
    
    # Read Video
    raw_video = VideoReader(video_path)
    
    # ---- Stages below are currently disabled (kept for reference) ----
    
#     # bg modeling
#     print("Creating background...")
#     calc_bg_full_video(video_path, static_images_folder, bg_interval, bg_alpha, bg_start_frame, bg_threshold, verbose)
    
#     # Detection
#     detector = Detector(detector_config_path, detector_model_path, detector_verbose_interval)
#     ## Raw Video
#     print("Detecting raw video...")
#     raw_images, raw_frame_nums = raw_video.load_video(raw_detect_interval)
#     fbf_results = detector.detect_images(raw_images, raw_frame_nums)
#     fbf_results.to_csv(fbf_results_path, index=False)
    
    
#     ## Static Images
#     static_reader = ImageReader(static_images_folder)
#     static_frame_names = list(map(lambda f: int(f[:-4]), static_reader.filenames)) # "123.jpg" -> 123
    
#     print("Detecting background...")
#     static_results = detector.detect_images(static_reader.load_images(), static_frame_names)
#     static_results.to_csv(static_results_path, index=False)
    
    
    # Perspective Cropping
#     print("Creating crop boxes...")
#     create_crop_boxes(fbf_results_path, crop_boxes_path, raw_video.img_shape, crop_min_obj_size, crop_row_capacity, crop_box_aspect_ratio) # either static/fbf results should work
    
    
    # Should be able to use this in place of normal static images. Doesn't look feasible at the moment, detection takes far too long
#     crop_boxes = pd.read_csv(crop_boxes_path).values
#     print("Detecting cropped background...") 
#     crop_detect_results = detector.detect_images(static_reader.load_images(), static_frame_names, crop_boxes=crop_boxes)
#     crop_detect_results.to_csv(crop_results_path)

    
    # Ignore Region
    # (the message is still printed although the mask creation itself is disabled)
    print("Creating ingore mask...")
#     create_ignore_mask(fbf_results_path, ignore_mask_path, raw_video.img_shape, ignore_count_thresh, ignore_area_thresh, ignore_score_thresh, ignore_gau_sigma)
    
    # ---- Active stage ----
    
    # Detect anomalies
    print("Detecting anomalies...")
    anomalies = get_anomalies(
        video_path=video_path, 
        reid_model_path=reid_model_path, 
        frame_by_frame_results_path=fbf_results_path, 
        static_results_path=static_results_path, 
        ignore_matrix_path=ignore_mask_path, 
        reid_model_name=reid_model_backbone, 
        start_frame=bg_start_frame, 
        frame_interval=bg_interval, 
        abnormal_duration_thresh=abnormal_duration_thresh, 
        detect_thresh=detect_duration_thresh, 
        undetect_thresh=undetect_duration_thresh, 
        score_thresh=bbox_score_thresh, 
        light_thresh=light_thresh, 
        anomaly_score_thresh=anomaly_thresh, 
        similarity_thresh=similarity_thresh, 
        suspicious_time_thresh=suspicious_duration_thresh, 
        verbose=verbose,
    )
    
    # Nothing found: empty placeholders, nothing is written to disk
    if anomalies is None:
        return [], []
    
    anomaly_event_times = get_overlapping_time(anomalies)
    
    # Save results
    print("Saving Results...")
    anomalies.to_csv(anomaly_results_path, index=False)

    return anomalies, anomaly_event_times

In [34]:
def full_run_sequential(video_id, video_dir, static_dir, frame_by_frame_results_dir, static_results_dir, crop_boxes_dir, 
                    ignore_mask_dir, detector_config_path, detector_model_path, reid_model_path, reid_model_backbone,
                    crop_results_dir, anomaly_results_dir,
                    bg_interval=4, bg_alpha=0.05, bg_start_frame=1, bg_threshold=5, raw_detect_interval=30, 
                    crop_min_obj_size=8, crop_row_capacity=3, crop_box_aspect_ratio=2,
                    ignore_count_thresh=0.08, ignore_area_thresh=2000, ignore_score_thresh=0.1, ignore_gau_sigma=3,
                    abnormal_duration_thresh=60, detect_duration_thresh=6, undetect_duration_thresh=8, bbox_score_thresh=0.3,
                    light_thresh=0.8, anomaly_thresh=0.8, similarity_thresh=0.95, suspicious_duration_thresh=18,
                    detector_verbose_interval=20, verbose=True):
    
    """
    Full run but runs one frame at a time. This should be more suitable for live processing.
    
    The stages are chained through generators instead of intermediate files:
    background modelling -> detection on raw frames and on background images -> 
    ignore mask -> anomaly detection (get_anomalies_sequential).
    
    video_id: The video "<video_dir>/<video_id>.mp4" is processed; anomalies are saved to 
        "<anomaly_results_dir>/<video_id>.csv".
    static_dir, frame_by_frame_results_dir, static_results_dir, crop_boxes_dir, crop_results_dir, ignore_mask_dir:
        Only created here (to mirror the folder layout); nothing is written to them by this function.
    detector_config_path, detector_model_path, detector_verbose_interval: Passed to Detector.
    reid_model_path, reid_model_backbone: Passed to get_anomalies_sequential.
    bg_interval, bg_alpha, bg_start_frame, bg_threshold: Passed to calc_bg_tensor. bg_start_frame and 
        bg_interval also define the frame numbers attached to the background detections.
    raw_detect_interval: Interval between raw video frames sent to the detector.
    crop_min_obj_size, crop_row_capacity, crop_box_aspect_ratio: Currently unused by this function.
    ignore_*: Passed to create_ignore_mask_generator.
    remaining thresholds, verbose: Passed to get_anomalies_sequential.
    
    Returns (anomalies, anomaly_event_times), or ([], []) if get_anomalies_sequential returns None.
    """
    
    # Set up file paths
    video_path = os.path.join(video_dir, f"{video_id}.mp4")
    static_images_folder = os.path.join(static_dir, f"{video_id}")
    anomaly_results_path = os.path.join(anomaly_results_dir, f"{video_id}.csv")
    
    # Create folders
    for folder in (static_images_folder, frame_by_frame_results_dir, static_results_dir, crop_boxes_dir,
                   crop_results_dir, ignore_mask_dir, anomaly_results_dir):
        os.makedirs(folder, exist_ok=True)
    
    
    # Read Video
    raw_video = VideoReader(video_path)
    
    # bg modeling
    print("Creating background...")
    bg_images = calc_bg_tensor(raw_video.load_video()[0], bg_interval, bg_alpha, bg_start_frame, bg_threshold)
    bg_images = (img for img, _ in bg_images) # throw out frame
    
    # Detection
    detector = Detector(detector_config_path, detector_model_path, detector_verbose_interval)
    ## Raw Video
    print("Detecting raw video...")
    raw_images, raw_frame_nums = raw_video.load_video(raw_detect_interval)
    fbf_results_gen = detector.detect_images_generator(raw_images, raw_frame_nums)
    fbf_results = ResultsDict(fbf_results_gen)
    
    ## Background images
    print("Detecting background...")
    static_results_gen = detector.detect_images_generator(bg_images, range(bg_start_frame, raw_video.nframes, bg_interval))
    static_results = ResultsDict(static_results_gen)
    
    # Ignore Region
    print("Creating ingore mask...")
    ignore_alpha = 0.1
    ignore_alpha_2 = 1 - (1 - ignore_alpha) ** bg_interval # adjusted for different intervals
    ignore_mask_gen = create_ignore_mask_generator(static_results.iterator(), raw_video.img_shape, ignore_count_thresh, ignore_area_thresh, ignore_score_thresh, ignore_gau_sigma, alpha=ignore_alpha_2)
    
    # Detect anomalies.
    # This call had been commented out and replaced by a loop that pulled 1000 masks from 
    # ignore_mask_gen, which left `anomalies` undefined (NameError below) and consumed masks 
    # that this stage needs. The mask generator is now handed over untouched.
    anomalies = get_anomalies_sequential(raw_video, reid_model_path, fbf_results, static_results, ignore_mask_gen,
                        reid_model_backbone, bg_start_frame, bg_interval, abnormal_duration_thresh, detect_duration_thresh, 
                        undetect_duration_thresh, bbox_score_thresh, light_thresh, anomaly_thresh, 
                        similarity_thresh, suspicious_duration_thresh, verbose)
    
    # Debug dump of the detections collected during the run.
    # NOTE(review): hard-coded path shared by all videos, so each run overwrites the previous one.
    pd.concat(static_results.values()).to_csv("/data/tmp/static.csv", index=False)
    pd.concat(fbf_results.values()).to_csv("/data/tmp/fbf.csv", index=False)
    
    if anomalies is not None:
        anomaly_event_times = get_overlapping_time(anomalies)
    
        # Save results
        print("Saving Results...")
        anomalies.to_csv(anomaly_results_path, index=False)

        return anomalies, anomaly_event_times
    
    else:
        return [], []

In [28]:
def process_folder(video_dir, static_dir, frame_by_frame_results_dir, static_results_dir, crop_boxes_dir, 
                    ignore_mask_dir, detector_config_path, detector_model_path, reid_model_path, reid_model_backbone,
                    crop_results_dir, anomaly_results_dir,
                    bg_interval=4, bg_alpha=0.05, bg_start_frame=1, bg_threshold=5, raw_detect_interval=30, 
                    crop_min_obj_size=8, crop_row_capacity=3, crop_box_aspect_ratio=2,
                    ignore_count_thresh=0.08, ignore_area_thresh=2000, ignore_score_thresh=0.1, ignore_gau_sigma=3,
                    abnormal_duration_thresh=60, detect_duration_thresh=6, undetect_duration_thresh=8, bbox_score_thresh=0.3,
                    light_thresh=0.8, anomaly_thresh=0.8, similarity_thresh=0.95, suspicious_duration_thresh=18,
                    detector_verbose_interval=20, verbose=True):
    """
    Runs full_run_sequential on every video in video_dir, in ascending video id order.
    
    Only entries named "<integer>.mp4" are processed (full_run_sequential opens "<video_id>.mp4").
    Anything else in the folder (hidden files, checkpoint folders, other extensions) is reported 
    and skipped instead of aborting the whole run.
    
    All other arguments are forwarded unchanged; see full_run_sequential for documentation.
    
    Returns (anomalies_dict, anomaly_times_dict), both keyed by the integer video id.
    """
    
    # Collect ids first so a stray file cannot break the numeric sort.
    video_ids = set()
    for filename in os.listdir(video_dir):
        stem, ext = os.path.splitext(filename)
        if ext == ".mp4" and stem.isdecimal():
            video_ids.add(int(stem))  # "123.mp4" -> 123
        else:
            print("Skipping non-video entry:", filename)
    
    anomalies_dict, anomaly_times_dict = {}, {}
    for video_id in sorted(video_ids):
        print("Processing video:", video_id)
        
        # Sequential by frame. (full_run_single, which is sequential by processing step, 
        # was previously called here with the same argument list.)
        anomalies, anomaly_event_times = full_run_sequential(video_id, video_dir, static_dir, frame_by_frame_results_dir, static_results_dir, crop_boxes_dir, 
                    ignore_mask_dir, detector_config_path, detector_model_path, reid_model_path, reid_model_backbone,
                    crop_results_dir, anomaly_results_dir,
                    bg_interval, bg_alpha, bg_start_frame, bg_threshold, raw_detect_interval, 
                    crop_min_obj_size, crop_row_capacity, crop_box_aspect_ratio,
                    ignore_count_thresh, ignore_area_thresh, ignore_score_thresh, ignore_gau_sigma,
                    abnormal_duration_thresh, detect_duration_thresh, undetect_duration_thresh, bbox_score_thresh,
                    light_thresh, anomaly_thresh, similarity_thresh, suspicious_duration_thresh,
                    detector_verbose_interval, verbose)
    
        anomalies_dict[video_id] = anomalies
        anomaly_times_dict[video_id] = anomaly_event_times
        
        print(video_id, anomalies, anomaly_event_times)
        
        
    return anomalies_dict, anomaly_times_dict        

In [29]:
# Input videos ("<video_id>.mp4") and the single video used by the one-off runs
video_dir = "/data/aicity/test"
video_id = 1

# Shared roots for everything below
_winner_root = "/data/aicity/winner_team"
_detections_root = f"{_winner_root}/detection_results"
_mmdet_root = "/data/modules/mmdetection"

# Intermediate and final outputs
# todo: create all these in a temp directory
static_dir = f"{_winner_root}/background_images/test"
crop_boxes_dir = f"{_winner_root}/crop_boxes/test"
anomaly_results_dir = f"{_winner_root}/anomaly_results/test"
frame_by_frame_results_dir = f"{_detections_root}/test_framebyframe"
static_results_dir = f"{_detections_root}/test_static"
crop_results_dir = f"{_detections_root}/test_crop" # todo: add this to program arguments
ignore_mask_dir = f"{_detections_root}/test_seg_masks"

# Re-identification model
reid_model_backbone = "resnet50"
reid_model_path = "/data/modules/AICity2019_winner/models/reid/reid.pth"

# Detector: HTC config + checkpoint
detector_config_path = f"{_mmdet_root}/configs/htc/htc_dconv_c3-c5_mstrain_400_1400_x101_64x4d_fpn_20e.py"
detector_model_path = f"{_mmdet_root}/checkpoints/htc_dconv_c3-c5_mstrain_400_1400_x101_64x4d_fpn_20e_20190408-0e50669c.pth"

# Detector: SSD512 config + checkpoint
ssd_config_path = f"{_mmdet_root}/configs/ssd512_coco_custom.py"
ssd_model_path = f"{_mmdet_root}/work_dirs/ssd512_coco/latest.pth"

In [43]:
# One-off batch run on a single video with the HTC detector, overriding two thresholds.
# NOTE(review): process_folder unpacks full_run_single's result as (anomalies, anomaly_event_times),
# so `anomalies` here presumably holds that whole pair rather than just the anomalies - confirm.
anomalies = full_run_single(video_id, video_dir, static_dir, frame_by_frame_results_dir, static_results_dir, crop_boxes_dir, 
                            ignore_mask_dir, detector_config_path, detector_model_path, reid_model_path, reid_model_backbone,
                            crop_results_dir, anomaly_results_dir, ignore_area_thresh=500, anomaly_thresh=0.5
                           )

Creating ingore mask...
Detecting anomalies...
total frames: 26760, framerate: 30.0, height: 410, width: 800
-------------------------
frame: 1
	boxes: 8
frame: 21
	boxes: 7
frame: 41
	boxes: 6
frame: 61
	boxes: 7
frame: 81
	boxes: 5
frame: 101
	boxes: 4
frame: 121
	boxes: 3
frame: 141
	boxes: 2
frame: 161
	boxes: 2
frame: 181
	boxes: 1
frame: 201
	boxes: 1
frame: 221
	boxes: 1
frame: 241
	boxes: 1
frame: 261
	boxes: 1
frame: 281
	boxes: 1
frame: 301
	boxes: 1
frame: 321
	boxes: 1
frame: 341
	boxes: 2
frame: 361
	boxes: 2
frame: 381
	boxes: 3
frame: 401
	boxes: 3
frame: 421
	boxes: 3
frame: 441
	boxes: 1
frame: 461
	boxes: 1
frame: 481
	boxes: 1
frame: 501
	boxes: 1
frame: 521
	boxes: 1
frame: 541
	boxes: 1
frame: 561
	boxes: 1
frame: 581
	boxes: 1
frame: 601
	boxes: 1
frame: 621
	boxes: 1
frame: 641
	boxes: 1
frame: 661
	boxes: 1
frame: 681
	boxes: 1
frame: 701
	boxes: 1
frame: 721
	boxes: 1
frame: 741
	boxes: 1
frame: 761
	boxes: 1
frame: 781
	boxes: 1
frame: 801
	boxes: 1
frame: 821

frame: 7881
	boxes: 4
frame: 7901
	boxes: 4
frame: 7921
	boxes: 5
frame: 7941
	boxes: 5
frame: 7961
	boxes: 5
frame: 7981
frame: 8001
frame: 8021
frame: 8041
	boxes: 4
frame: 8061
	boxes: 4
frame: 8081
	boxes: 5
frame: 8101
	boxes: 4
frame: 8121
	boxes: 5
frame: 8141
	boxes: 5
frame: 8161
	boxes: 5
frame: 8181
	boxes: 5
frame: 8201
	boxes: 4
frame: 8221
	boxes: 4
frame: 8241
	boxes: 4
frame: 8261
	boxes: 4
frame: 8281
	boxes: 5
frame: 8301
	boxes: 5
frame: 8321
	boxes: 6
frame: 8341
	boxes: 6
frame: 8361
	boxes: 3
frame: 8381
	boxes: 4
frame: 8401
	boxes: 3
frame: 8421
	boxes: 3
frame: 8441
	boxes: 4
frame: 8461
	boxes: 4
frame: 8481
	boxes: 3
frame: 8501
	boxes: 4
frame: 8521
	boxes: 3
frame: 8541
	boxes: 4
frame: 8561
	boxes: 4
frame: 8581
	boxes: 3
frame: 8601
	boxes: 3
frame: 8621
	boxes: 3
frame: 8641
	boxes: 4
frame: 8661
	boxes: 3
frame: 8681
	boxes: 3
frame: 8701
	boxes: 3
frame: 8721
	boxes: 4
frame: 8741
	boxes: 4
frame: 8761
	boxes: 4
frame: 8781
	boxes: 4
frame: 8801
	boxes

frame: 15181
	boxes: 6
frame: 15201
	boxes: 6
frame: 15221
	boxes: 6
frame: 15241
	boxes: 6
frame: 15261
	boxes: 5
frame: 15281
	boxes: 6
frame: 15301
	boxes: 5
frame: 15321
	boxes: 5
frame: 15341
	boxes: 6
frame: 15361
	boxes: 6
frame: 15381
	boxes: 7
frame: 15401
	boxes: 6
frame: 15421
	boxes: 6
frame: 15441
	boxes: 6
frame: 15461
	boxes: 7
frame: 15481
	boxes: 6
frame: 15501
	boxes: 7
frame: 15521
	boxes: 7
frame: 15541
	boxes: 6
frame: 15561
	boxes: 6
frame: 15581
	boxes: 5
frame: 15601
	boxes: 6
frame: 15621
	boxes: 6
frame: 15641
	boxes: 6
frame: 15661
	boxes: 6
frame: 15681
	boxes: 5
frame: 15701
	boxes: 6
frame: 15721
	boxes: 6
frame: 15741
	boxes: 6
frame: 15761
	boxes: 6
frame: 15781
	boxes: 6
frame: 15801
	boxes: 6
frame: 15821
	boxes: 5
frame: 15841
	boxes: 6
frame: 15861
	boxes: 5
frame: 15881
	boxes: 5
frame: 15901
	boxes: 5
frame: 15921
	boxes: 5
frame: 15941
	boxes: 5
frame: 15961
	boxes: 5
frame: 15981
	boxes: 5
frame: 16001
	boxes: 6
frame: 16021
	boxes: 7
frame: 1604

frame: 22441
	boxes: 11
frame: 22461
	boxes: 11
frame: 22481
	boxes: 8
frame: 22501
	boxes: 9
frame: 22521
	boxes: 10
frame: 22541
	boxes: 9
frame: 22561
	boxes: 9
frame: 22581
	boxes: 9
frame: 22601
	boxes: 9
frame: 22621
	boxes: 8
frame: 22641
	boxes: 9
frame: 22661
	boxes: 8
frame: 22681
	boxes: 9
frame: 22701
	boxes: 9
frame: 22721
	boxes: 10
frame: 22741
	boxes: 10
frame: 22761
	boxes: 9
frame: 22781
	boxes: 9
frame: 22801
	boxes: 9
frame: 22821
	boxes: 8
frame: 22841
	boxes: 7
frame: 22861
	boxes: 7
frame: 22881
	boxes: 7
frame: 22901
	boxes: 7
frame: 22921
	boxes: 8
frame: 22941
	boxes: 8
frame: 22961
	boxes: 9
frame: 22981
	boxes: 8
frame: 23001
	boxes: 8
frame: 23021
	boxes: 9
frame: 23041
	boxes: 10
frame: 23061
	boxes: 10
frame: 23081
	boxes: 11
frame: 23101
	boxes: 10
frame: 23121
	boxes: 9
frame: 23141
	boxes: 9
frame: 23161
	boxes: 10
frame: 23181
frame: 23201
frame: 23221
frame: 23241
	boxes: 10
frame: 23261
	boxes: 8
frame: 23281
	boxes: 10
frame: 23301
	boxes: 13
frame

In [60]:
# One-off frame-by-frame run on a single video with the HTC detector and a coarser background interval.
# full_run_sequential returns a pair, so `anomalies` is bound to the whole tuple.
anomalies = full_run_sequential(
    video_id=video_id,
    video_dir=video_dir,
    static_dir=static_dir,
    frame_by_frame_results_dir=frame_by_frame_results_dir,
    static_results_dir=static_results_dir,
    crop_boxes_dir=crop_boxes_dir,
    ignore_mask_dir=ignore_mask_dir,
    detector_config_path=detector_config_path,
    detector_model_path=detector_model_path,
    reid_model_path=reid_model_path,
    reid_model_backbone=reid_model_backbone,
    crop_results_dir=crop_results_dir,
    anomaly_results_dir=anomaly_results_dir,
    bg_interval=20,
    ignore_area_thresh=500,
    anomaly_thresh=0.5,
)

Creating background...
Detecting raw video...
Detecting background...
Creating ingore mask...
total frames: 26760, framerate: 30.0, height: 410, width: 800
-------------------------
generated: 1
frame: 1
	boxes: 9
generated: 21
frame: 21
	boxes: 9
generated: 41
frame: 41
	boxes: 10
generated: 61
frame: 61
	boxes: 9
generated: 81
frame: 81
	boxes: 8
generated: 101
frame: 101
	boxes: 8
generated: 121
frame: 121
	boxes: 8
generated: 141
frame: 141
	boxes: 7
generated: 161
frame: 161
	boxes: 6
generated: 181
frame: 181
	boxes: 6
generated: 201
frame: 201
	boxes: 6
generated: 221
frame: 221
	boxes: 6
generated: 241
frame: 241
	boxes: 6
generated: 261
frame: 261
	boxes: 7
generated: 281
frame: 281
	boxes: 7
generated: 301
frame: 301
	boxes: 5
generated: 321
frame: 321
	boxes: 5
generated: 341
frame: 341
	boxes: 5
generated: 361
frame: 361
	boxes: 4
generated: 381
frame: 381
	boxes: 4
generated: 401
frame: 401
	boxes: 4
generated: 421
frame: 421
	boxes: 4
generated: 441
frame: 441
	boxes: 4
g

KeyboardInterrupt: 

In [None]:
# Process every video in video_dir, using the SSD512 model as the detector.
anomalies, anomaly_times = process_folder(
    video_dir=video_dir,
    static_dir=static_dir,
    frame_by_frame_results_dir=frame_by_frame_results_dir,
    static_results_dir=static_results_dir,
    crop_boxes_dir=crop_boxes_dir,
    ignore_mask_dir=ignore_mask_dir,
    detector_config_path=ssd_config_path,
    detector_model_path=ssd_model_path,
    reid_model_path=reid_model_path,
    reid_model_backbone=reid_model_backbone,
    crop_results_dir=crop_results_dir,
    anomaly_results_dir=anomaly_results_dir,
    bg_interval=30,
    ignore_area_thresh=500,
    anomaly_thresh=0.5,
)

In [None]:
def update_matrices(mat_detected, mat_undetected, mat_start, mat_end, mat_score, mat_state, frame, ignore_mask, boxes,
                    box_score_thresh=0.3, *, time_thresh, score_thresh, undetected_thresh, detected_thresh):
    """
    One update step of the per-pixel detection matrices, following the algorithm in the paper.
    
    Work in progress: the anomaly start/finish bookkeeping is still a stub, so this only 
    maintains the matrices. It is not used by the pipeline above.
    
    mat_detected: (h, w) array, accumulated detections per pixel.
    mat_undetected: (h, w) array, accumulated misses per pixel; reset to 0 where detected this frame.
    mat_start, mat_end: (h, w) arrays of frame numbers. Updated IN PLACE.
    mat_score: (h, w) array, accumulated detection scores per pixel.
    mat_state: (h, w) boolean array. Updated IN PLACE.
    frame: Current frame number.
    ignore_mask: (h, w) array multiplied element-wise into the per-frame maps, so pixels 
        where it is 0 are excluded from this update.
    boxes: Iterable of ((x1, y1, x2, y2), score). Coordinates are used as slice indices, 
        so they must be integers.
    box_score_thresh: Boxes with score <= this are ignored.
    time_thresh: Minimum (mat_end - mat_start) at the longest-lived pixel to look for an anomaly.
    score_thresh, undetected_thresh: Thresholds for starting/finishing an anomaly (stub).
    detected_thresh: mat_state becomes True where mat_detected exceeds this.
    
    The four thresholds are keyword-only: the original signature listed them without defaults 
    after box_score_thresh=0.3 (a syntax error), misspelled time_thresh, and used 
    detected_thresh without declaring it.
    
    Returns (mat_detected, mat_undetected, mat_start, mat_end, mat_score, mat_state). 
    mat_detected, mat_undetected and mat_score are new arrays.
    """
    def get_connected_region(mat, pos):
        # Connected component of `mat` (connectivity=1) that contains `pos`
        regions = label(mat, connectivity=1)
        return regions == regions[pos]
    
    def mask(mat, mask):
        return mat * mask
    
    
    h, w = mat_detected.shape
    
    # Per-frame maps: where something was detected, and the best score seen there
    tmp_mat_detected = np.zeros((h, w))
    tmp_mat_score = np.zeros((h, w))
    
    for (x1, y1, x2, y2), score in boxes:
        if score > box_score_thresh:
            tmp_mat_detected[y1:y2, x1:x2] = 1
            # Max against the score map. The original compared against tmp_mat_detected, 
            # which is 1 inside the box, so the stored value could never be the box score.
            tmp_mat_score[y1:y2, x1:x2] = np.maximum(tmp_mat_score[y1:y2, x1:x2], score)

            
    tmp_mat_undetected = mask(1 - tmp_mat_score, ignore_mask)
    tmp_mat_detected = mask(tmp_mat_detected, ignore_mask)
    tmp_mat_score = mask(tmp_mat_score, ignore_mask)
    
    # Boolean view for indexing; a float array is not a valid index mask
    detected_now = tmp_mat_detected > 0
    
    mat_detected = mat_detected + tmp_mat_detected
    mat_score = mat_score + tmp_mat_score
    mat_undetected = mat_undetected + tmp_mat_undetected
    mat_undetected[detected_now] = 0
    
    mat_start[mat_detected == 1] = frame
    mat_end[mat_detected > 0] = frame
    
    # Pixel that has been tracked the longest, as a (row, col) index
    tmp_mat_delay = mat_end - mat_start
    pos = np.unravel_index(np.argmax(tmp_mat_delay), tmp_mat_delay.shape)
    
    if tmp_mat_delay[pos] > time_thresh:
        tmp_mat_binary = mat_detected[pos] - mat_detected <= 1
        anomaly_region = get_connected_region(tmp_mat_binary, pos)  # for the anomaly output below
        
        # NOTE(review): the original tested the whole-array ratio mat_score / mat_undetected, 
        # which cannot be used as an `if` condition. It is evaluated at `pos` here, like the 
        # undetected check below - confirm against the paper (dividing by mat_detected may 
        # have been intended).
        with np.errstate(divide="ignore", invalid="ignore"):
            anomaly_score = mat_score[pos] / mat_undetected[pos]
        
        if anomaly_score > score_thresh:
            pass  # TODO: start or keep anomaly status
            
        if mat_undetected[pos] > undetected_thresh:
            pass  # TODO: finish anomaly, output anomaly information
            
            
    mat_state[mat_detected > detected_thresh] = True
    mat_state[mat_undetected > undetected_thresh] = False
    
    # Drop the accumulated count/score wherever the pixel was neither detected this frame 
    # nor in a confirmed state, so an interrupted track starts again from 1.
    # NOTE(review): the original indexed with (tmp_mat_detected | mat_state), i.e. without 
    # the negation, which would wipe every pixel that was just detected - confirm.
    keep = detected_now | mat_state.astype(bool)
    mat_detected[~keep] = 0
    mat_score[~keep] = 0
    
    return mat_detected, mat_undetected, mat_start, mat_end, mat_score, mat_state

---
# Notes

+ Videos with gaps in them (e.g. frames that are black or have no data) that happen during an anomaly will create 2 separate anomaly events, or cause the anomaly to stop being tracked properly.
    + see test vid 1: 4:26 - 4:28, 6:04 - 6:06, 11:21 - 11:23, 12:58 - 13:00
    + Increasing interval between frames seems to help deal with these gaps
+ Produces about 680MB of intermediate data per 15 min video processed, mostly in background images.
    + These are not used after object detection, so the anomaly detection part can be re-run with different hyperparameters without them.
+ Currently the detector is one trained on the COCO dataset. The detector in the paper was trained on UA-DETRAC and VisDrone, with a gaussian blur applied. Ours should be fine-tuned on these datasets.
+ Much of the code on the paper's github just does not work. Most of it is full of errors, and does not reflect the algorithm in the paper. 
+ test vid 11: doesn't pick up on the stopped car, but does when the repair van comes. Seems to work even with large camera movements. Seems to detect an anomaly when brightness is increased. See 7:22
+ test vid 6: Seems to be detecting the car fine, but there are periods of large drops in detection scores (see frames 10600-11000)
+ Most of the time, the anomalies happen in the ignored area, so they are not picked up.
    + Reducing the ignore_area_thresh and ignore_score_thresh parameters should help this.
    + There is also the issue of moving anomalies. eg cars swerving off the road/out of camera view.
+ Increasing the interval between detecting frames does not seem to impact performance significantly.
    + Perhaps some sort of adaptive or 2 step detection would work. Run once with a large interval to produce candidate times, then go back with a finer interval to confirm.
+ I want to write the code to run frame by frame, instead of one processing step at a time. This is needed if I want to run the program in a live setting.
+ Background creation is significantly slow. 
    + Using SSD detecting every 30 frames: 25 fps, every 4 frames: 19 fps, every 600 frames: 27 fps.
    + Background takes 37ms per frame, detection ~67ms per frame.
    + Potential solution: change bg modelling so it only calculates every x frames, instead of calculating every frame and yielding every x frames.
    + Problem was actually a bottleneck in reading images
        + Changed VideoReader to work on a separate thread, and only decode images that it actually needs.
        + Sped up process by ~4x
    + Moving calculations to GPU was actually slower, running on CPU was significantly faster
+ Using HTC model is ~450ms per frame. A 15 min video takes ~55mins to process at bg_interval=4, ~9mins at bg_interval=30