In [1]:
import os
import random

import numpy as np
import torch as th
import torch.nn as nn
from torch.utils.data import DataLoader
from tqdm import tqdm

from dataset import YC2ProcNetDataset
from models.model import ProcNet

In [2]:
feature_root = "/disk/scratch_fast/s2004019/youcook2/raw_videos"
yc2_dur_file = "/disk/scratch_fast/s2004019/youcook2/yc2/yc2_duration_frame.csv"
yc2_annotation_file = "/disk/scratch_fast/s2004019/youcook2/yc2/yc2_new_annotations_trainval_test.json"
checkpoint_dir = "/disk/scratch_fast/s2004019/youcook2/checkpoints/procnet/run7_iou"
post_fix = "_iou"
# post_fix = "_iou_run3_aug5_exp_scheduler"
# post_fix = "_epoch8_iou_run3_aug5_exp_scheduler"
frames_per_video = 500

In [3]:
checkpoint_path = os.path.join(checkpoint_dir, "model_best{}.pth.tar".format(post_fix))

In [4]:
test_dataset = YC2ProcNetDataset(feature_root=feature_root,
                                 data_file="validation_frames.json",
                                 dur_file=yc2_dur_file,
                                 annotation_file=yc2_annotation_file,
                                 split="testing", frames_per_video=frames_per_video, max_augs=1)
test_dataloader = DataLoader(
    test_dataset, batch_size=1, shuffle=False, num_workers=16)

In [5]:
model = ProcNet(input_encoding_size=512, rnn_size=512, clip_number=16,
                    kernelInfo=[3, 123, 8], frames_per_video=frames_per_video,
                    mp_scale=(8, 5), video_feat=512)

model.eval()
model.cuda()

ProcNet(
  (core1): LSTM(512, 512, batch_first=True, bidirectional=True)
  (mlp): Sequential(
    (0): Linear(in_features=1536, out_features=512, bias=True)
    (1): ReLU()
  )
  (temporal_segment): TemporalSegmentation(
    (ts): ModuleList(
      (0): Conv1d(512, 3, kernel_size=(3,), stride=(1,))
      (1): Conv1d(512, 3, kernel_size=(11,), stride=(1,))
      (2): Conv1d(512, 3, kernel_size=(19,), stride=(1,))
      (3): Conv1d(512, 3, kernel_size=(27,), stride=(1,))
      (4): Conv1d(512, 3, kernel_size=(35,), stride=(1,))
      (5): Conv1d(512, 3, kernel_size=(43,), stride=(1,))
      (6): Conv1d(512, 3, kernel_size=(51,), stride=(1,))
      (7): Conv1d(512, 3, kernel_size=(59,), stride=(1,))
      (8): Conv1d(512, 3, kernel_size=(67,), stride=(1,))
      (9): Conv1d(512, 3, kernel_size=(75,), stride=(1,))
      (10): Conv1d(512, 3, kernel_size=(83,), stride=(1,))
      (11): Conv1d(512, 3, kernel_size=(91,), stride=(1,))
      (12): Conv1d(512, 3, kernel_size=(99,), stride=(1,))
 

In [6]:
def getParllelNetworkStateDict(state_dict):
    from collections import OrderedDict

    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        name = k[7:]  # remove `module.`
        new_state_dict[name] = v
    return new_state_dict

checkpoint = th.load(checkpoint_path)
model_state_dict = getParllelNetworkStateDict(checkpoint["state_dict"])
model.load_state_dict(model_state_dict)
best_iou = checkpoint["miou"]
print("=> loaded checkpoint '{}' (epoch {}) with best mIoU : {}".format(checkpoint_path, checkpoint["epoch"], best_iou))

=> loaded checkpoint '/disk/scratch_fast/s2004019/youcook2/checkpoints/procnet/run7_iou/model_best_iou.pth.tar' (epoch 18) with best mIoU : 0.3983739769977072


In [11]:
def compute_iou(c2, c1):
    intersection = max(0, min(c1[1], c2[1])-max(c1[0], c2[0]))
    if intersection == 0:
        return 0
    else:
        union = max(c1[1], c2[1]) - min(c1[0], c2[0])
        return intersection/union
    
    
def compute_jacc(c2, c1):
    intersection = max(0, min(c1[1], c2[1])-max(c1[0], c2[0]))
    if intersection == 0:
        return 0
    else:
        union = c2[1]-c2[0]
        return intersection/union


def validate_epoch(eval_dataloader, model):
    model.eval()
    running_loss = 0.0
    valid_bar = tqdm(enumerate(eval_dataloader), total=len(eval_dataloader))
    avg_miou = []
    avg_jacc = []
    results = {}
    raw_results = {}
    with th.no_grad():
        for i_batch, data in valid_bar:
            feat = data["feature"].float().cuda()
            gt_segments = data["segments"].float()
            task = data["task"]
            vid = data["vid"][0]
            
            _, maxpool_boundaries, pred_seg_index, _ = model(feat, None)
            l = maxpool_boundaries.size(-1)
            iou_clip = 0.
            jacc_clip = 0.
            n_segments = gt_segments.size(1)
            per_video_result = {}

            per_video_raw_preds = {}
            keys_to_remove = []
            for j in range(len(pred_seg_index)):
                column = pred_seg_index[j] % l
                row = pred_seg_index[j] // l
                clip_boundary = maxpool_boundaries[0, :, row, column]
                if j == 0:
                    per_video_raw_preds[j] = clip_boundary.cpu().numpy()
                    continue
                per_video_raw_preds[j] = clip_boundary.cpu().numpy()
                if np.sum(clip_boundary.cpu().numpy() - per_video_raw_preds[j-1]) == 0:
                    keys_to_remove.append(j)
            
            per_video_raw_preds = {k:v for k,v in per_video_raw_preds.items() if k not in keys_to_remove}
            raw_results[vid] = per_video_raw_preds
            hits = []
            for i in range(n_segments):
                best_iou = 0.
                best_jacc = 0.
                best_clip = []
                best_index = -1
                for j in range(len(pred_seg_index)):
                    if j in hits:
                        continue
#                     if pred_seg_index[j] == model.clip_prop_encoding:
#                         print("breaking")
#                         break
                    column = pred_seg_index[j] % l
                    row = pred_seg_index[j] // l
                    clip_boundary = maxpool_boundaries[0, :, row, column]
                    current_iou = compute_iou(
                        clip_boundary.cpu().numpy(), gt_segments[0][i].cpu().numpy())
                    current_jacc = compute_jacc(
                        clip_boundary.cpu().numpy(), gt_segments[0][i].cpu().numpy())
                    if current_iou > best_iou:
                        best_iou = current_iou
                        best_clip = clip_boundary.cpu().numpy()
                        best_index = j
                    if current_jacc > best_jacc:
                        best_jacc = current_jacc
                hits.append(best_index)
                per_video_result[str(i)] = {"iou": best_iou, "gt":list(gt_segments[0][i].cpu().numpy()), "pred":list(best_clip), "best_index":best_index}
                iou_clip += best_iou
                jacc_clip += best_jacc
            results[vid] = per_video_result   
            avg_miou.append(iou_clip/n_segments)
            avg_jacc.append(jacc_clip/n_segments)
    return avg_miou, avg_jacc, results, raw_results

In [12]:
avg_miou, avg_jacc, results, raw_results = validate_epoch(test_dataloader, model)
print(f"Testing mIoU is {np.nanmean(avg_miou)} and mJacc is {np.nanmean(avg_jacc)}")

100%|██████████| 180/180 [00:30<00:00,  5.97it/s]

Testing mIoU is 0.3462829431923256 and mJacc is 0.47650047140824014





In [14]:
# Validation mIoU is 0.36315514330913995 and mJacc is 0.5091106183977766 -- One Proposal is used only once
# Validation mIoU is 0.398204407727742 and mJacc is 0.5452657957634546 -- One Proposal is allowed to be reused

# Testing mIoU is 0.3462829431923256 and mJacc is 0.47650047140824014 -- One Proposal is used only once
# Testing mIoU is 0.38121907241187913 and mJacc is 0.5120001695887446 -- One Proposal is allowed to be reused

In [13]:
len(results), model.clip_prop_encoding-1

(180, 199)

In [17]:
ind_ = 165
# list(results.keys())[ind_], 
results[list(results.keys())[ind_]],  raw_results[list(results.keys())[ind_]]

({'0': {'iou': 0.0,
   'gt': [32.902515, 38.702972],
   'pred': [],
   'best_index': -1},
  '1': {'iou': 0.337534,
   'gt': [43.53669, 144.07796],
   'pred': [103.776184, 137.71228],
   'best_index': 1},
  '2': {'iou': 0.3752818,
   'gt': [145.0447, 197.24881],
   'pred': [134.71849, 168.51118],
   'best_index': 2},
  '3': {'iou': 0.31203666,
   'gt': [201.11578, 223.35088],
   'pred': [178.84769, 215.00241],
   'best_index': 3},
  '4': {'iou': 0.48605427,
   'gt': [224.31761, 259.12036],
   'pred': [212.6751, 246.89253],
   'best_index': 4},
  '5': {'iou': 0.56015575,
   'gt': [260.0871, 282.32217],
   'pred': [261.6448, 297.0007],
   'best_index': 5},
  '6': {'iou': 0.0, 'gt': [283.28894, 293.9231], 'pred': [], 'best_index': -1},
  '7': {'iou': 0.6313663,
   'gt': [294.88983, 320.02518],
   'pred': [296.63025, 331.94427],
   'best_index': 6},
  '8': {'iou': 0.18529533,
   'gt': [345.1605, 351.92767],
   'pred': [331.04663, 367.5677],
   'best_index': 7},
  '9': {'iou': 0.854404,
   '

In [19]:
ind_ = 98
list(results.keys())[ind_], results[list(results.keys())[ind_]], raw_results[list(results.keys())[ind_]]

('yxjnWx6TaQ8',
 {'0': {'iou': 0.27841088,
   'gt': [92.93016, 116.91194],
   'pred': [53.93296, 140.07104],
   'best_index': 1},
  '1': {'iou': 0.31207353,
   'gt': [132.8998, 212.83907],
   'pred': [135.52855, 160.47548],
   'best_index': 3},
  '2': {'iou': 0.0030800733,
   'gt': [216.83603, 264.7996],
   'pred': [196.95905, 217.04498],
   'best_index': 4},
  '3': {'iou': 0.0,
   'gt': [268.79657, 292.77835],
   'pred': [],
   'best_index': -1},
  '4': {'iou': 0.0,
   'gt': [312.76315, 348.73584],
   'pred': [],
   'best_index': -1},
  '5': {'iou': 0.030165005,
   'gt': [352.7328, 396.6994],
   'pred': [393.44788, 460.5241],
   'best_index': 5}},
 {0: array([ 54.743393, 107.23659 ], dtype=float32),
  1: array([ 53.93296, 140.07104], dtype=float32),
  2: array([108.563705, 131.44029 ], dtype=float32),
  3: array([135.52855, 160.47548], dtype=float32),
  4: array([196.95905, 217.04498], dtype=float32),
  5: array([393.44788, 460.5241 ], dtype=float32)})

In [22]:
ind_ = 56
list(results.keys())[ind_], results[list(results.keys())[ind_]], raw_results[list(results.keys())[ind_]]

('NYhsc9ikk4I',
 {'0': {'iou': 0.5825819,
   'gt': [66.99063, 95.48658],
   'pred': [73.33563, 105.012665],
   'best_index': 0},
  '1': {'iou': 0.624274,
   'gt': [146.47934, 221.46869],
   'pred': [154.82823, 201.64214],
   'best_index': 3},
  '2': {'iou': 0.3235407,
   'gt': [224.46826, 258.96335],
   'pred': [206.76297, 241.35721],
   'best_index': 4},
  '3': {'iou': 0.25168076,
   'gt': [317.45505, 336.9523],
   'pred': [325.43417, 363.21988],
   'best_index': 7},
  '4': {'iou': 0.0, 'gt': [336.9523, 365.44824], 'pred': [], 'best_index': -1},
  '5': {'iou': 0.0,
   'gt': [365.44824, 374.44696],
   'pred': [],
   'best_index': -1},
  '6': {'iou': 0.26737636,
   'gt': [420.94037, 431.43887],
   'pred': [405.79724, 445.06213],
   'best_index': 8},
  '7': {'iou': 0.18885782,
   'gt': [431.43887, 477.93225],
   'pred': [405.79724, 445.06213],
   'best_index': 9}},
 {0: array([ 73.33563 , 105.012665], dtype=float32),
  1: array([ 96.35262, 133.82199], dtype=float32),
  2: array([132.4213

In [14]:
import json
test = raw_results[list(results.keys())[ind_]]
test = {str(k):str(list(v)) for k,v in test.items()}
print(json.dumps(test, indent=4))

{
    "0": "[68.14183, 117.15561]",
    "1": "[84.14755, 134.9491]",
    "2": "[135.89635, 156.42366]",
    "3": "[181.16602, 216.93011]",
    "4": "[215.98392, 257.88126]",
    "5": "[269.7658, 311.92828]",
    "6": "[291.0036, 326.789]",
    "7": "[326.0857, 361.1903]",
    "8": "[378.85477, 402.00937]",
    "9": "[401.35873, 432.45865]",
    "10": "[430.66473, 476.51892]",
    "11": "[430.66473, 476.51892]",
    "12": "[430.66473, 476.51892]",
    "13": "[430.66473, 476.51892]",
    "14": "[430.66473, 476.51892]",
    "15": "[430.66473, 476.51892]"
}


In [26]:
def Average(lst):
    return sum(lst) / len(lst)

results_vid_iou = {vid:Average([v['iou'] for k,v in result.items()]) for vid, result in results.items()} 

In [54]:
results_vid_iou_good = {k:v for k,v in results_vid_iou.items() if v > 0.64}
results_vid_iou_bad = {k:v for k,v in results_vid_iou.items() if v < 0.29}

In [55]:
results_vid_iou_good, results_vid_iou_bad

({'vLcBGs389k4': 0.6491352468729019,
  'zPCtV7YcmkA': 0.6616055568059286,
  '4apR0YypAGc': 0.6538130715489388,
  'WlHWRPyA7_g': 0.6866569618384043,
  'DHpQOhQhW3A': 0.6465556025505066},
 {'sBJJ0Cj0GG4': 0.2877267375588417,
  'fn9anlEL4FI': 0.28432218823581934,
  '-dh_uGahzYo': 0.170203671273258,
  '2-mxsib6pJo': 0.28805758378335405,
  'sGzBQrg1adY': 0.19011149276047945,
  '6seOEuK0ojg': 0.20878860460860388})

In [108]:
# https://youtu.be/-AwyG1JcMp8
raw_results["-AwyG1JcMp8"]

{0: array([29.105045, 71.41049 ], dtype=float32),
 1: array([ 59.09948, 101.40493], dtype=float32),
 2: array([ 89.09893, 131.40439], dtype=float32),
 3: array([119.10808, 161.41353], dtype=float32),
 4: array([149.10504, 191.41049], dtype=float32),
 5: array([184.10504, 226.41049], dtype=float32),
 6: array([214.10504, 256.4105 ], dtype=float32),
 7: array([249.08904, 291.3945 ], dtype=float32),
 8: array([274.08902, 316.3945 ], dtype=float32),
 9: array([319.08902, 361.3945 ], dtype=float32),
 10: array([344.08902, 386.3945 ], dtype=float32),
 11: array([369.08905, 411.39447], dtype=float32),
 12: array([394.08905, 436.39447], dtype=float32),
 13: array([414.08902, 456.3945 ], dtype=float32),
 14: array([439.08902, 481.3945 ], dtype=float32),
 15: array([439.08902, 481.3945 ], dtype=float32)}

In [101]:
with open("/disk/scratch_fast/s2004019/youcook2/features/transcript_per_sec_all.json", "r") as f:
    data = json.load(f)

In [24]:
def levenstein(p, y, norm=False):
    m_row = len(p)    
    n_col = len(y)
    D = np.zeros([m_row+1, n_col+1], float)
    for i in range(m_row+1):
        D[i, 0] = i
    for i in range(n_col+1):
        D[0, i] = i

    for j in range(1, n_col+1):
        for i in range(1, m_row+1):
            if y[j-1] == p[i-1]:
                D[i, j] = D[i-1, j-1]
            else:
                D[i, j] = min(D[i-1, j] + 1,
                              D[i, j-1] + 1,
                              D[i-1, j-1] + 1)
    
    if norm:
        score = (1 - D[-1, -1]/max(m_row, n_col)) * 100
    else:
        score = D[-1, -1]

    return score


In [25]:
y_labels = [1,2,3,4]
p_labels = [3,5,4]
levenstein(p_labels, y_labels, True)

25.0

In [26]:
edit_score = []
for vid, result in results.items():
    y_label= []
    p_label = []
    for k,v in result.items():
        y_label.append(str(int(k) + 1))
        if v["best_index"]+1 > 0:
            p_label.append(str(v["best_index"]+1))
    
    edit = levenstein(p_label, y_label, True)
    print(p_label, y_label, edit)
    edit_score.append(edit)
    break
print(np.nanmean(edit_score))

['3', '5', '6'] ['1', '2', '3', '4'] 0.0
0.0


In [24]:
result

{'0': {'iou': 0.0, 'gt': [125.99558, 149.3281], 'pred': [], 'best_index': -1},
 '1': {'iou': 0.85044336,
  'gt': [150.99469, 182.66025],
  'pred': [153.35834, 185.44955],
  'best_index': 2},
 '2': {'iou': 0.5899308,
  'gt': [184.32684, 220.99222],
  'pred': [194.06398, 229.97328],
  'best_index': 4},
 '3': {'iou': 0.40845892,
  'gt': [222.65883, 300.98938],
  'pred': [244.71954, 276.71436],
  'best_index': 5}}

In [None]:
# Testing : 31.9064438647772
# Validation : 32.12256322912061