# 1. Install necessary libraries

In [181]:
# # norfair dependencies
# %cd /kaggle/input/norfair031py3/
# !pip install commonmark-0.9.1-py2.py3-none-any.whl -f ./ --no-index
# !pip install rich-9.13.0-py3-none-any.whl

# !mkdir /kaggle/working/tmp
# !cp -r /kaggle/input/norfair031py3/filterpy-1.4.5/filterpy-1.4.5/ /kaggle/working/tmp/
# %cd /kaggle/working/tmp/filterpy-1.4.5/
# !pip install .
# !rm -rf /kaggle/working/tmp

# # norfair
# %cd /kaggle/input/norfair031py3/
# !pip install norfair-0.3.1-py3-none-any.whl -f ./ --no-index

In [182]:
import cv2
import ast
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm.notebook import tqdm
tqdm.pandas()
import torch
import sys
from PIL import Image as Img
from IPython.display import display
from norfair import Detection, Tracker
from sklearn.model_selection import GroupKFold

from fastai.vision.all import *
import timm

sys.path.append('tensorflow-great-barrier-reef')

# 2. Set model parameters

In [198]:
# Run configuration shared by the cells below.
# FOLD is interpolated into the weights filename (f{FOLD}_...) and is later
# compared against df.video_id to select the evaluation frames.
FOLD = 2
ANNS = True
# Filename suffix: empty when ANNS is True, '_na' otherwise.
# NOTE(review): ann_str is not referenced anywhere else in the visible notebook.
ann_str = '' if ANNS else '_na'

# Dataset root (contains train.csv) and the folder holding the per-video frames.
ROOT_DIR  = 'tensorflow-great-barrier-reef'
DATASET_PATH = 'tensorflow-great-barrier-reef/train_images/'

# !mkdir -p /root/.config/Ultralytics
# !cp /kaggle/input/yolov5-font/Arial.ttf /root/.config/Ultralytics/

# YOLOv5 weight files for the selected fold.
# NOTE(review): f_best and f_last resolve to the same file - confirm whether
# f_last was meant to point at a different checkpoint.
f_best = f'yolov5_models/f{FOLD}_na_10_adam_25.pt'
f_last = f'yolov5_models/f{FOLD}_na_10_adam_25.pt'

# 3. Evaluation

## Evaluation utils

In [199]:
def calc_iou(bboxes1, bboxes2, bbox_mode='xywh'):
    """
    Pairwise IoU between two sets of boxes.

    bboxes1: (N, 4) np.array
    bboxes2: (M, 4) np.array
    bbox_mode: 'xywh' converts both inputs to corner format first; any other
               value treats the inputs as already being [x1, y1, x2, y2].

    Returns an (N, M) np.array. Widths and heights are measured inclusively
    (the "+ 1" terms). The inputs are copied, never modified.
    """
    assert bboxes1.ndim == 2 and bboxes1.shape[1] == 4
    assert bboxes2.ndim == 2 and bboxes2.shape[1] == 4

    boxes_a, boxes_b = bboxes1.copy(), bboxes2.copy()
    if bbox_mode == 'xywh':
        # x2 = x + w, y2 = y + h
        for boxes in (boxes_a, boxes_b):
            boxes[:, 2:] += boxes[:, :2]

    # Columns of A stay (N, 1); columns of B become (1, M) so that every
    # binary operation below broadcasts to the full (N, M) grid.
    ax1, ay1, ax2, ay2 = (boxes_a[:, i:i + 1] for i in range(4))
    bx1, by1, bx2, by2 = (boxes_b[:, i:i + 1].T for i in range(4))

    overlap_w = np.maximum(np.minimum(ax2, bx2) - np.maximum(ax1, bx1) + 1, 0)
    overlap_h = np.maximum(np.minimum(ay2, by2) - np.maximum(ay1, by1) + 1, 0)
    overlap = overlap_w * overlap_h

    area_a = (ax2 - ax1 + 1) * (ay2 - ay1 + 1)
    area_b = (bx2 - bx1 + 1) * (by2 - by1 + 1)
    return overlap / (area_a + area_b - overlap)

def f_beta(tp, fp, fn, beta=2):
    """
    F-beta score computed from raw counts.

    tp, fp, fn: true-positive, false-positive and false-negative counts
    beta: recall weight (beta=2 gives the F2 score)

    Returns 0.0 when all counts are zero instead of dividing by zero.
    """
    beta_sq = beta ** 2
    denominator = (1 + beta_sq) * tp + beta_sq * fn + fp
    if denominator == 0:
        # Nothing to detect and nothing predicted: the score is undefined.
        return 0.0
    return (1 + beta_sq) * tp / denominator

def calc_is_correct_at_iou_th(gt_bboxes, pred_bboxes, iou_th, verbose=False):
    """
    Greedy matching of predictions to ground truth at one IoU threshold.

    gt_bboxes: (N, 4) np.array in xywh format
    pred_bboxes: (M, 5) np.array in conf+xywh format, visited in the given order
    iou_th: a prediction is a true positive when its best IoU with the
            still-unmatched ground truth is strictly above this value
    verbose: accepted for interface compatibility; not used

    Returns (tp, fp, fn).
    """
    unmatched_gt = gt_bboxes.copy()
    candidates = pred_bboxes.copy()
    n_candidates = len(candidates)

    tp = fp = 0
    for n_seen, candidate in enumerate(candidates, start=1):
        # Column 0 is the confidence; the remaining four are the box.
        overlaps = calc_iou(unmatched_gt, candidate[None, 1:])
        if overlaps.max() > iou_th:
            tp += 1
            # A ground-truth box can be matched only once.
            unmatched_gt = np.delete(unmatched_gt, overlaps.argmax(), axis=0)
        else:
            fp += 1
        if len(unmatched_gt) == 0:
            # All ground truth consumed: every remaining prediction is a false positive.
            fp += n_candidates - n_seen
            break

    return tp, fp, len(unmatched_gt)

def calc_is_correct(gt_bboxes, pred_bboxes):
    """
    Accumulate TP/FP/FN for one image over the IoU thresholds 0.30, 0.35, ..., 0.80.

    gt_bboxes: (N, 4) np.array in xywh format
    pred_bboxes: (N, 5) np.array in conf+xywh format

    Returns (tps, fps, fns) summed over all thresholds. When one side is empty
    the counts are the number of boxes on the other side multiplied by the
    number of thresholds, so every image contributes on the same scale.
    """
    iou_thresholds = np.arange(0.3, 0.85, 0.05)
    # Derived from the threshold array so the empty-side shortcuts can never
    # drift out of sync with the loop below.
    n_thresholds = len(iou_thresholds)
    n_gt, n_pred = len(gt_bboxes), len(pred_bboxes)

    if n_gt == 0 and n_pred == 0:
        return 0, 0, 0
    if n_gt == 0:
        return 0, n_pred * n_thresholds, 0
    if n_pred == 0:
        return 0, 0, n_gt * n_thresholds

    # Most confident predictions get the first chance to claim a ground-truth box.
    ranked_preds = pred_bboxes[pred_bboxes[:, 0].argsort()[::-1]]

    tps, fps, fns = 0, 0, 0
    for iou_th in iou_thresholds:
        tp, fp, fn = calc_is_correct_at_iou_th(gt_bboxes, ranked_preds, iou_th)
        tps += tp
        fps += fp
        fns += fn
    return tps, fps, fns

def calc_f2_score(gt_bboxes_list, pred_bboxes_list, verbose=False):
    """
    F2, precision and recall over a set of images.

    gt_bboxes_list: list of (N, 4) np.array in xywh format
    pred_bboxes_list: list of (N, 5) np.array in conf+xywh format
    verbose: print the per-image counts

    Returns (f2, precision, recall). A ratio whose denominator is zero
    (no predictions, no ground truth, or both) is reported as 0.0 rather
    than raising ZeroDivisionError.
    """
    tps, fps, fns = 0, 0, 0
    for gt_bboxes, pred_bboxes in zip(gt_bboxes_list, pred_bboxes_list):
        tp, fp, fn = calc_is_correct(gt_bboxes, pred_bboxes)
        tps += tp
        fps += fp
        fns += fn
        if verbose:
            num_gt = len(gt_bboxes)
            num_pred = len(pred_bboxes)
            print(f'num_gt:{num_gt:<3} num_pred:{num_pred:<3} tp:{tp:<3} fp:{fp:<3} fn:{fn:<3}')

    # Guard each ratio separately: a detector that predicts nothing must yield
    # a score, not crash the whole evaluation run.
    f2 = f_beta(tps, fps, fns, beta=2) if (tps + fps + fns) > 0 else 0.0
    precision = tps / (tps + fps) if (tps + fps) > 0 else 0.0
    recall = tps / (tps + fns) if (tps + fns) > 0 else 0.0
    return f2, precision, recall

## Tracking utils

In [200]:
from norfair import Detection, Tracker

# Helper to convert bbox in format [x_min, y_min, x_max, y_max, score] to norfair.Detection class
def to_norfair(detects, frame_id):
    """
    Wrap raw detections into norfair.Detection objects.

    detects: iterable of [x_min, y_min, x_max, y_max, score]
    frame_id: stored in each Detection's data so callers can later tell on
              which frame a track was last observed

    Each Detection carries the box centre as its point, the score, and
    data = [width, height, frame_id].
    """
    detections = []
    for det in detects:
        x_min, y_min, x_max, y_max, score = det
        centre = np.array([(x_min + x_max) / 2, (y_min + y_max) / 2])
        size_and_frame = np.array([x_max - x_min, y_max - y_min, frame_id])
        detections.append(
            Detection(points=centre, scores=np.array([score]), data=size_and_frame)
        )
    return detections

# Euclidean distance function to match detections on this frame with tracked_objects from previous frames
def euclidean_distance(detection, tracked_object):
    """
    Distance between a detection's points and a tracked object's estimate.

    Used as the tracker's distance_function to match detections on the current
    frame with tracked objects from previous frames.
    """
    offset = detection.points - tracked_object.estimate
    return np.linalg.norm(offset)
        

## Visualization utils

In [201]:
def show_prediction(img, bboxes, gts, show=True):
    """
    Draw predicted and ground-truth boxes onto an image.

    img: image array; the boxes are drawn directly onto it
    bboxes: iterable of predictions in [conf, x, y, w, h] format
            (drawn in (255, 0, 0) with the confidence as a label)
    gts: iterable of ground-truth boxes in [x, y, w, h] format
         (drawn in (0, 255, 0))
    show: when True, return a PIL image resized to 960x540; otherwise
          return the annotated array itself
    """
    pred_color, gt_color = (255, 0, 0), (0, 255, 0)

    for box in bboxes:
        left, top = int(box[1]), int(box[2])
        right, bottom = int(box[1] + box[3]), int(box[2] + box[4])
        cv2.rectangle(img, (left, top), (right, bottom), pred_color, 2)
        # Label sits 3 px above the top-left corner of the box.
        cv2.putText(img, f'{box[0]}', (left, top - 3), cv2.FONT_HERSHEY_SIMPLEX, 0.5, pred_color, 1, cv2.LINE_AA)

    for gt in gts:
        cv2.rectangle(img, (int(gt[0]), int(gt[1])), (int(gt[0]+gt[2]), int(gt[1]+gt[3])), gt_color, 2)

    if show:
        img = Img.fromarray(img).resize((960, 540))
    return img


## Inference utils

In [202]:
def is_cots(learner, img, bbox, bbox_conf, conf_ratio=0.5, visualize=False):
    """
    Re-score a detection using an image classifier on the cropped box.

    learner: classifier exposing no_bar() and predict()
    img: image array indexed as img[y, x]
    bbox: [x_min, y_min, x_max, y_max] integer pixel coordinates
    bbox_conf: detector confidence for this box
    conf_ratio: how strongly the classifier shifts the confidence
    visualize: display the crop and print the intermediate values

    The classifier value preds[2][0] (presumably the probability of the COTS
    class - confirm against the learner's vocab) is centred at 0.5, scaled by
    conf_ratio and added to bbox_conf. The result is clamped to [0, 1].
    """
    left, top, right, bottom = bbox[0], bbox[1], bbox[2], bbox[3]
    patch = img[top:bottom, left:right]

    with learner.no_bar():
        prediction = learner.predict(patch)
    classifier_conf = float(prediction[2][0])

    # Above 0.5 raises the detector confidence, below 0.5 lowers it.
    adjusted = bbox_conf + (classifier_conf - 0.5) * conf_ratio
    adjusted = min(max(adjusted, 0), 1)

    if visualize:
        display(Img.fromarray(patch))
        print(bbox_conf, bbox)
        print(prediction)
        print(adjusted)
        print('+'*80)

    return adjusted

## Get GT bboxes from the dataset

In [203]:
# Hand-picked sample frames, expressed as paths under DATASET_PATH.
# NOTE(review): `dir` shadows the Python builtin of the same name, and `imgs`
# is not referenced anywhere else in the visible notebook - confirm whether
# this list is still needed.
dir = f'{DATASET_PATH}'
imgs = [dir + f for f in ('video_2/5748.jpg',
                          'video_2/5772.jpg',
                          'video_2/5820.jpg',
                          'video_1/4159.jpg', 
                          'video_1/4183.jpg', 
                          'video_1/4501.jpg', 
                          'video_1/5375.jpg', 
                          'video_1/5414.jpg',
                          'video_1/5495.jpg',
                          'video_1/4775.jpg', 
                          'video_0/9794.jpg', 
                          'video_0/4502.jpg', 
                          'video_0/9651.jpg', 
                          'video_0/9700.jpg',  
                          'video_0/9674.jpg',
                          'video_0/20.jpg', 
                          'video_0/17.jpg', 
                          'video_1/5474.jpg', 
                          'video_0/0.jpg')]

def get_path(row):
    """
    Add an 'image_path' entry to a train.csv row.

    The path is built from ROOT_DIR plus the row's video_id and video_frame.
    The row is modified in place and also returned so the function can be
    used with DataFrame.apply(axis=1).
    """
    frame_file = f'{ROOT_DIR}/train_images/video_{row.video_id}/{row.video_frame}.jpg'
    row['image_path'] = frame_file
    return row

def get_bbox(annots):
    """
    Convert a list of annotation dicts into a list of value lists.

    annots: list of dicts, one per box
    Returns one list per annotation holding that dict's values in the
    dict's own key order.
    """
    boxes = []
    for annotation in annots:
        boxes.append(list(annotation.values()))
    return boxes

# Train Data
df = pd.read_csv(f'{ROOT_DIR}/train.csv')
# df = df[df.video_id == FOLD]
df = df.progress_apply(get_path, axis=1)
df['annotations'] = df['annotations'].progress_apply(lambda x: ast.literal_eval(x))
df.head(2)

df['num_bbox'] = df['annotations'].progress_apply(lambda x: len(x))
data = (df.num_bbox>0).value_counts(normalize=True)*100
print(f"No BBox: {data[0]:0.2f}% | With BBox: {data[1]:0.2f}%")

df['bboxes'] = df.annotations.progress_apply(get_bbox)

  0%|          | 0/23501 [00:00<?, ?it/s]

  0%|          | 0/23501 [00:00<?, ?it/s]

  0%|          | 0/23501 [00:00<?, ?it/s]

No BBox: 79.07% | With BBox: 20.93%


  0%|          | 0/23501 [00:00<?, ?it/s]

In [204]:
# Number of cross-validation folds.
FOLD_NUM = 10

# Only frames that have at least one annotated box take part in the split.
train = df[df.num_bbox>0]
kf = GroupKFold(n_splits = FOLD_NUM) 
# Reset to a 0..n-1 index so the positional indices returned by kf.split()
# can be used as labels with .loc below.
train = train.reset_index(drop=True)
train['fold'] = -1
# Group by sequence so that all frames of one sequence land in the same fold.
for f, (train_idx, val_idx) in enumerate(kf.split(train, y = train.video_id.tolist(), groups=train.sequence)):
    train.loc[val_idx, 'fold'] = f

# Number of annotated frames per fold.
# NOTE(review): the evaluation cells below select frames by video_id, not by
# this 'fold' column - confirm whether the split is still used.
train.groupby('fold').agg({'num_bbox': 'count'})

Unnamed: 0_level_0,num_bbox
fold,Unnamed: 1_level_1
0,1100
1,704
2,654
3,577
4,564
5,285
6,238
7,252
8,274
9,271


## Evaluate model

### Utils

In [205]:
def clahe_hsv(img):
    """
    Apply CLAHE to the V channel of an image in HSV space.

    img: image converted with cv2.COLOR_BGR2HSV, i.e. treated as BGR on input
    Returns the enhanced image converted with cv2.COLOR_HSV2RGB, i.e. in RGB
    channel order. Hue and saturation are left untouched.
    """
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    equalizer = cv2.createCLAHE(clipLimit = 15.0, tileGridSize = (20,20))

    hue, saturation, value = (hsv[:, :, channel] for channel in range(3))
    enhanced = np.dstack((hue, saturation, equalizer.apply(value)))

    return cv2.cvtColor(enhanced, cv2.COLOR_HSV2RGB)

def _load_model(path, conf, iou):
    """
    Load custom YOLOv5 weights from the local 'yolov5' repo and configure them.

    path: weights file passed to torch.hub.load
    conf: value assigned to model.conf
    iou: value assigned to model.iou

    Returns the loaded model with class filtering disabled, single-label NMS
    and at most 1000 detections per image.
    """
    detector = torch.hub.load('yolov5', 'custom', path=path, source='local', force_reload=True)  # local repo

    settings = {
        'conf': conf,
        'iou': iou,
        'classes': None,        # (optional list) filter by class
        'multi_label': False,   # NMS multiple labels per box
        'max_det': 1000,        # maximum number of detections per image
    }
    for attribute, value in settings.items():
        setattr(detector, attribute, value)
    return detector

def evaluate(path, test_df, conf, iou, img_size, area_thr, conf_thr=0.5, conf_ratio=0.5, augment=True, tracking=True, visualize=False):
    """
    Run detector + classifier re-scoring (+ optional tracking) over test_df
    and score the result.

    path: YOLOv5 weights file, passed to _load_model
    test_df: DataFrame whose rows provide image_path and bboxes (GT, xywh);
             rows are processed in order and share a single tracker instance
    conf, iou: assigned to the detector's conf / iou settings by _load_model
    img_size: inference size passed to the detector
    area_thr: minimum width*height for boxes contributed by the tracker;
              boxes coming directly from the detector are not area-filtered
    conf_thr: a detection is kept when its classifier-adjusted confidence
              (see is_cots) is strictly above this value
    conf_ratio: weight of the classifier in the adjusted confidence
    augment: passed to the detector call
    tracking: also predict boxes for tracked objects that have no detection
              on the current frame
    visualize: display the annotated first three frames

    Returns (f2_score, precision, recall) from calc_f2_score.
    Raises FileNotFoundError when an image cannot be read.
    """
    # Tracker will update tracks based on detections from current frame
    tracker = Tracker(
        distance_function=euclidean_distance,
        distance_threshold=30,
        hit_inertia_min=3,
        hit_inertia_max=6,
        initialization_delay=1,
    )

    model = _load_model(path, conf, iou)
    # Loop-invariant: load the classifier once, not once per frame.
    learner_resnet18 = load_learner('fastai_models/resnet18.pkl')

    gt_bboxes_list, prd_bboxes_list = [], []

    # frame_id is saved into each detection to know which tracks have no
    # detection on the current frame.
    for frame_id, (_, frame_row) in enumerate(tqdm(test_df.iterrows(), total=len(test_df))):
        img = cv2.imread(frame_row.image_path)
        if img is None:
            raise FileNotFoundError(f'Could not read image: {frame_row.image_path}')
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # get GT bboxes for evaluation
        gt_bboxes = [np.array(list(map(float, gt))) for gt in frame_row.bboxes]
        gt_bboxes_list.append(np.array(gt_bboxes))

        # One row per detection: xmin, ymin, xmax, ymax, confidence.
        detections_df = model(img, size=img_size, augment=augment).pandas().xyxy[0]
        bboxes = detections_df[['xmin', 'ymin', 'xmax', 'ymax', 'confidence']].to_numpy(dtype=float)

        detects, pred_bboxes = [], []

        # if image classifier recognize cots - increase bbox confidence, else decrease it
        for bbox in bboxes:
            score = bbox[4]
            xmin, ymin, xmax, ymax = int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3])
            bbox_conf = is_cots(learner_resnet18, img, [xmin, ymin, xmax, ymax], score, conf_ratio, visualize=False)

            if bbox_conf > conf_thr:
                detects.append(bbox)
                # The adjusted confidence only gates the box; the detector's
                # own score is what gets reported.
                pred_bboxes.append(np.array([score, xmin, ymin, xmax - xmin, ymax - ymin]))

        #  Tracking
        if tracking:
            # Update tracks using detects from current frame
            tracked_objects = tracker.update(detections=to_norfair(detects, frame_id))
            for tobj in tracked_objects:
                bbox_width, bbox_height, last_detected_frame_id = tobj.last_detection.data
                if last_detected_frame_id == frame_id:  # Skip objects that were detected on current frame
                    continue

                # Add objects that have no detections on current frame to predictions
                xc, yc = tobj.estimate[0]
                x_min, y_min = int(round(xc - bbox_width / 2)), int(round(yc - bbox_height / 2))
                score = tobj.last_detection.scores[0]
                if bbox_width * bbox_height >= area_thr:
                    pred_bboxes.append(np.array([score, x_min, y_min, bbox_width, bbox_height]))

        # Position in the evaluation run, not the DataFrame index label,
        # decides which frames are shown.
        if visualize and frame_id < 3:
            display(show_prediction(img, pred_bboxes, gt_bboxes))

        # get pred bboxes for evaluation
        prd_bboxes_list.append(np.array(pred_bboxes))

    f2_score, precision, recall = calc_f2_score(gt_bboxes_list, prd_bboxes_list, verbose=False)

    return f2_score, precision, recall

### Search optimal hyperparameters

In [207]:
# sequences = train.loc[train.fold == FOLD, 'sequence'].unique()
test_df = df[df.video_id == FOLD]

area_thr = 0
img_size = 2700
conf = 0.1
iou = 0.4
conf_thr = 0.6
conf_ratio = 0.6

augment = True
tracking = True

f2, p, r = evaluate(f_best, test_df, conf, iou, img_size, area_thr, conf_thr, conf_ratio, augment, tracking, visualize=False)
print(f'F2 is {f2}, Precision is {p}, Recall is {r}')

# # search for the best image size
# size_dict = {}
# best_size = 0
# best_f2 = 0

# for img_size in np.arange(2600, 3800, 200):
#     f2, p, r = evaluate(f_best, test_df, conf, iou, img_size, area_thr, conf_thr, conf_ratio, augment, tracking, visualize=False)
#     size_dict[img_size] = f2
#     print(80*'=')
#     print(f'Image size is {img_size}, F2 is {f2}, Precision is {p}, Recall is {r}')
#     print(80*'=')
#     if f2 > best_f2:
#         best_f2 = f2
#         best_size = img_size
        
# print(f'Best image size is {best_size}, best F2 is {best_f2}')
    
# # search for the best confidence threshold
# conf_dict = {}
# best_conf = 0
# best_f2 = 0

# for conf_thr in np.arange(0.3, 0.7, 0.05):
#     f2, p, r = evaluate(f_best, test_df, conf, iou, img_size, area_thr, conf_thr, conf_ratio, augment, tracking, visualize=False)
#     conf_dict[conf_thr] = f2
#     print(80*'=')
#     print(f'Confidence threshold is {conf_thr}, F2 is {f2}, Precision is {p}, Recall is {r}')
#     print(80*'=')
#     if f2 > best_f2:
#         best_f2 = f2
#         best_conf = conf_thr
        
# print(f'Best confidence threshold is {best_conf}, best F2 is {best_f2}')

# # search for the best IOU
# iou_dict = {}
# best_iou = 0
# best_f2 = 0
# conf_thr = 0.5
# conf_ratio = 0.3

# for iou in np.arange(0.35, 0.5, 0.05):
#     f2, p, r = evaluate(f_best, test_df, conf, iou, img_size, area_thr, conf_thr, conf_ratio, augment, tracking, visualize=False)
#     print(80*'=')
#     print(f'IOU is {iou}, F2 is {f2}, Precision is {p}, Recall is {r}')
#     print(80*'=')
#     if f2 > best_f2:
#         best_f2 = f2
#         best_iou = iou

# print(f'Best IOU is {best_iou}, best F2 is {best_f2}')

# # search for the best area
# area_dict = {}
# best_area = 0
# best_f2 = 0

# for area_thr in np.arange(0, 450, 50):
#     f2, p, r = evaluate(f_best, test_df, conf, iou, img_size, area_thr, conf_thr, conf_ratio, augment, tracking, visualize=False)
#     area_dict[area_thr] = f2
#     print(80*'=')
#     print(f'Area threshold is {area_thr}, F2 is {f2}, Precision is {p}, Recall is {r}')
#     print(80*'=')
#     if f2 > best_f2:
#         best_f2 = f2
#         best_area = area_thr

# print(f'Best area is {best_area}, best F2 is {best_f2}')

YOLOv5 🚀 v6.0-244-g9cf80b7 torch 1.10.2 CUDA:0 (NVIDIA GeForce GTX 1070, 8112MiB)

Fusing layers... 
Model Summary: 280 layers, 12308200 parameters, 0 gradients, 16.2 GFLOPs
Adding AutoShape... 


  0%|          | 0/8561 [00:00<?, ?it/s]

F2 is 0.7208568523647583, Precision is 0.5898333285521673, Recall is 0.7632428820668918


In [208]:
# Re-run the evaluation with conf_ratio lowered from 0.6 to 0.5.
# conf, iou, img_size, area_thr and test_df are reused from the earlier cell.
conf_thr = 0.6
conf_ratio = 0.5

augment = True
tracking = True

f2, p, r = evaluate(f_best, test_df, conf, iou, img_size, area_thr, conf_thr, conf_ratio, augment, tracking, visualize=False)
print(f'F2 is {f2}, Precision is {p}, Recall is {r}')

YOLOv5 🚀 v6.0-244-g9cf80b7 torch 1.10.2 CUDA:0 (NVIDIA GeForce GTX 1070, 8112MiB)

Fusing layers... 
Model Summary: 280 layers, 12308200 parameters, 0 gradients, 16.2 GFLOPs
Adding AutoShape... 


  0%|          | 0/8561 [00:00<?, ?it/s]

F2 is 0.7275503231500726, Precision is 0.6244806940371457, Recall is 0.75886261553881


In [209]:
# Re-run the evaluation with conf_ratio raised to 0.7.
# conf, iou, img_size, area_thr and test_df are reused from the earlier cell.
conf_thr = 0.6
conf_ratio = 0.7

augment = True
tracking = True

f2, p, r = evaluate(f_best, test_df, conf, iou, img_size, area_thr, conf_thr, conf_ratio, augment, tracking, visualize=False)
print(f'F2 is {f2}, Precision is {p}, Recall is {r}')

YOLOv5 🚀 v6.0-244-g9cf80b7 torch 1.10.2 CUDA:0 (NVIDIA GeForce GTX 1070, 8112MiB)

Fusing layers... 
Model Summary: 280 layers, 12308200 parameters, 0 gradients, 16.2 GFLOPs
Adding AutoShape... 


  0%|          | 0/8561 [00:00<?, ?it/s]

F2 is 0.7105877506179621, Precision is 0.546352021961778, Recall is 0.7683284457478006


In [210]:
# Re-run the evaluation with conf_ratio 0.4 and tracking switched off.
# conf, iou, img_size, area_thr and test_df are reused from the earlier cell.
conf_thr = 0.6
conf_ratio = 0.4

augment = True
tracking = False

f2, p, r = evaluate(f_best, test_df, conf, iou, img_size, area_thr, conf_thr, conf_ratio, augment, tracking, visualize=False)
print(f'F2 is {f2}, Precision is {p}, Recall is {r}')

YOLOv5 🚀 v6.0-244-g9cf80b7 torch 1.10.2 CUDA:0 (NVIDIA GeForce GTX 1070, 8112MiB)

Fusing layers... 
Model Summary: 280 layers, 12308200 parameters, 0 gradients, 16.2 GFLOPs
Adding AutoShape... 


  0%|          | 0/8561 [00:00<?, ?it/s]

F2 is 0.7375072784006927, Precision is 0.754102740248836, Recall is 0.7334719180370467


- F2_na_10_adam, conf_thr 0.5, conf_ratio 0.3, image size is 2700, F2 = 0.725
- F2_na_10_adam, conf_thr 0.5, conf_ratio 0.3, image size is 2800, F2 = 0.720
- F2_na_10_adam, conf_thr 0.5, conf_ratio 0.3, image size is 2900, F2 = 0.719
- F2_na_10_adam, conf_thr 0.5, conf_ratio 0.3, image size is 3000, F2 = 0.717
- F2_na_10_adam, conf_thr 0.5, conf_ratio 0.3, image size is 3200, F2 = 0.712
- F2_na_10_adam, conf_thr 0.5, conf_ratio 0.3, image size is 3400, F2 = 0.703

- F0_best - Conf 0.28, image size is 3200, F2 is 0.578
- F1_best - Conf 0.28, image size is 2560, F2 is 0.558
- F2_best - Conf 0.28, image size is 3200, F2 is 0.700


- F0_na_20_best - Conf 0.28, image size is 3200, F2 is 0.581
- F1_na_20_best - Conf 0.28, image size is 2560, F2 is 0.558
- F2_na_20_best - Conf 0.28, image size is 3200, F2 is 0.678


- F0_na_20_last - Conf 0.28, image size is 3200, F2 is 0.58
- F1_na_20_last - Conf 0.28, image size is 2560, F2 is 0.545 (0.544 at image size 3200)
- F2_na_20_last - Conf 0.28, image size is 3200, F2 is 0.706


- F0_na_10_best - Conf 0.28, image size is 3200, F2 is 
- F1_na_10_best - Conf 0.28, image size is 2560, F2 is
- F2_na_10_best - Conf 0.28, image size is 3200, F2 is 


- F0_na_10_last - Conf 0.28, image size is 3200, F2 is 
- F1_na_10_last - Conf 0.28, image size is 2560, F2 is
- F2_na_10_last - Conf 0.28, image size is 3200, F2 is 

Recommended image size is 3200

# 4. Inference

## Initialize environment

In [None]:
# Competition API: create the environment and the iterator that the inference loop consumes.
import greatbarrierreef
env = greatbarrierreef.make_env()# initialize the environment
iter_test = env.iter_test()      # an iterator which loops over the test set and sample submission

## Make predictions with tracking

In [None]:
# NOTE(review): submission_dict is not filled anywhere in the visible notebook;
# predictions are handed to env.predict() instead.
submission_dict = {
    'id': [],
    'prediction_string': [],
}

# Use the weights that were evaluated above. The name previously passed here
# is not defined anywhere in this notebook and raised NameError.
model = _load_model(f_best, conf, iou)

#######################################################
#                      Tracking                       #
#######################################################

# Tracker will update tracks based on detections from current frame
# Matching based on euclidean distance between bbox centers of detections 
# from current frame and tracked_objects based on previous frames
# You can check its parameters in norfair docs
# https://github.com/tryolabs/norfair/blob/master/docs/README.md
tracker = Tracker(
    distance_function=euclidean_distance, 
    distance_threshold=30,
    hit_inertia_min=3,
    hit_inertia_max=6,
    initialization_delay=1,
)

# Save frame_id into detection to know which tracks have no detections on current frame
frame_id = 0
#######################################################

# Predict every test frame, extend the predictions with tracked objects that
# have no detection on the current frame, and submit one prediction string
# per frame.
# NOTE(review): unlike evaluate(), this loop does not re-score detections with
# the image classifier (is_cots / conf_thr) and it area-filters raw detections
# - confirm which pipeline is intended for submission.
for img, pred_df in tqdm(iter_test):
    predictions, detects = [], []

    # One row per detection: xmin, ymin, xmax, ymax, confidence.
    detections_df = model(img, size=img_size, augment=augment).pandas().xyxy[0]
    bboxes = detections_df[['xmin', 'ymin', 'xmax', 'ymax', 'confidence']].to_numpy(dtype=float)

    for bbox in bboxes:
        # Every detection feeds the tracker, even if it is area-filtered below.
        detects.append(bbox)
        width, height = int(bbox[2]-bbox[0]), int(bbox[3]-bbox[1])
        area = width * height
        if area >= area_thr:
            predictions.append('{:.2f} {} {} {} {}'.format(bbox[4], int(bbox[0]), int(bbox[1]), width, height))

    #######################################################
    #                      Tracking                       #
    #######################################################

    # Update tracks using detects from current frame
    tracked_objects = tracker.update(detections=to_norfair(detects, frame_id))
    for tobj in tracked_objects:
        bbox_width, bbox_height, last_detected_frame_id = tobj.last_detection.data
        if last_detected_frame_id == frame_id:  # Skip objects that were detected on current frame
            continue

        # Add objects that have no detections on current frame to predictions
        xc, yc = tobj.estimate[0]
        x_min, y_min = int(round(xc - bbox_width / 2)), int(round(yc - bbox_height / 2))
        score = tobj.last_detection.scores[0]
        area = bbox_width * bbox_height
        if area >= area_thr:
            predictions.append('{:.2f} {} {} {} {}'.format(score, x_min, y_min, bbox_width, bbox_height))
    #######################################################

    prediction_str = ' '.join(predictions)
    pred_df['annotations'] = prediction_str
    env.predict(pred_df)

    print('Prediction:', prediction_str)
    frame_id += 1

## Check submission

In [None]:
# Load submission.csv (presumably written by the env.predict() calls above -
# confirm) and preview its first 10 rows.
sub_df = pd.read_csv('submission.csv')
sub_df.head(10)