In [1]:
import os
import random

import numpy as np
import torch as th
import torch.nn as nn
from torch.utils.data import DataLoader
from tqdm import tqdm

from dataset import YC2ProcNetDataset
from models.model import ProcNet

In [2]:
# Root of the local YouCook2 working tree. The default reproduces the original
# hard-coded locations; set the YC2_ROOT environment variable to run this
# notebook on another machine without editing the cell.
data_root = os.environ.get("YC2_ROOT") or "/disk/scratch_fast/s2004019/youcook2"

# Inputs handed to YC2ProcNetDataset.
feature_root = os.path.join(data_root, "raw_videos")
yc2_dur_file = os.path.join(data_root, "yc2", "yc2_duration_frame.csv")
yc2_annotation_file = os.path.join(data_root, "yc2", "yc2_new_annotations_trainval_test.json")

# Directory of the checkpoint to evaluate; `post_fix` is inserted into the
# checkpoint file name ("model_best{post_fix}.pth.tar").
checkpoint_dir = os.path.join(data_root, "checkpoints", "procnet", "run7_iou")
post_fix = "_iou"
# post_fix = "_iou_run3_aug5_exp_scheduler"
# post_fix = "_epoch8_iou_run3_aug5_exp_scheduler"

# Passed as `frames_per_video` to both the dataset and the model.
frames_per_video = 500

In [3]:
# Full path of the checkpoint file selected by `checkpoint_dir` and `post_fix`.
checkpoint_name = "model_best{}.pth.tar".format(post_fix)
checkpoint_path = os.path.join(checkpoint_dir, checkpoint_name)

In [4]:
# Evaluation data: the "validation" split, served one video per batch in a
# fixed order. validate_epoch reads only element [0] of each batch, so
# batch_size must stay at 1.
test_dataset = YC2ProcNetDataset(
    feature_root=feature_root,
    data_file="validation_frames.json",
    dur_file=yc2_dur_file,
    annotation_file=yc2_annotation_file,
    split="validation",
    frames_per_video=frames_per_video,
    max_augs=1,
)
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=1,
    shuffle=False,
    num_workers=16,
)

In [5]:
# Build the network, switch it to evaluation mode and move it to the GPU.
# The weights are still freshly initialised here; the checkpoint is loaded
# in a later cell.
model = ProcNet(
    input_encoding_size=512,
    rnn_size=512,
    clip_number=16,
    kernelInfo=[3, 123, 8],
    frames_per_video=frames_per_video,
    mp_scale=(8, 5),
    video_feat=512,
)

model.eval()
model.cuda()

ProcNet(
  (core1): LSTM(512, 512, batch_first=True, bidirectional=True)
  (mlp): Sequential(
    (0): Linear(in_features=1536, out_features=512, bias=True)
    (1): ReLU()
  )
  (temporal_segment): TemporalSegmentation(
    (ts): ModuleList(
      (0): Conv1d(512, 3, kernel_size=(3,), stride=(1,))
      (1): Conv1d(512, 3, kernel_size=(11,), stride=(1,))
      (2): Conv1d(512, 3, kernel_size=(19,), stride=(1,))
      (3): Conv1d(512, 3, kernel_size=(27,), stride=(1,))
      (4): Conv1d(512, 3, kernel_size=(35,), stride=(1,))
      (5): Conv1d(512, 3, kernel_size=(43,), stride=(1,))
      (6): Conv1d(512, 3, kernel_size=(51,), stride=(1,))
      (7): Conv1d(512, 3, kernel_size=(59,), stride=(1,))
      (8): Conv1d(512, 3, kernel_size=(67,), stride=(1,))
      (9): Conv1d(512, 3, kernel_size=(75,), stride=(1,))
      (10): Conv1d(512, 3, kernel_size=(83,), stride=(1,))
      (11): Conv1d(512, 3, kernel_size=(91,), stride=(1,))
      (12): Conv1d(512, 3, kernel_size=(99,), stride=(1,))
 

In [6]:
def getParllelNetworkStateDict(state_dict):
    """Return a copy of ``state_dict`` whose keys have the ``module.`` prefix removed.

    Keys that start with ``module.`` (the prefix typically present when the
    weights were saved from a model wrapped in ``torch.nn.DataParallel``) are
    stripped of it. Keys without the prefix are copied unchanged, so a
    checkpoint saved from an unwrapped model is not corrupted.

    Args:
        state_dict: Mapping from parameter name to value.

    Returns:
        A new ``OrderedDict`` with the same values in the same order; the
        input mapping is not modified.
    """
    from collections import OrderedDict

    prefix = "module."
    new_state_dict = OrderedDict()
    for key, value in state_dict.items():
        # Strip only a real prefix; blindly slicing 7 characters would mangle
        # keys such as "mlp.0.bias".
        name = key[len(prefix):] if key.startswith(prefix) else key
        new_state_dict[name] = value
    return new_state_dict

# Deserialize onto the CPU so the file can be read regardless of which GPU
# index it was saved from; load_state_dict then copies the values into the
# parameters `model` already owns.
# NOTE: th.load unpickles the file, so only point it at trusted checkpoints.
checkpoint = th.load(checkpoint_path, map_location="cpu")
model_state_dict = getParllelNetworkStateDict(checkpoint["state_dict"])
model.load_state_dict(model_state_dict)
# Score stored in the checkpoint under "miou"; reported here for reference.
best_iou = checkpoint["miou"]
print("=> loaded checkpoint '{}' (epoch {}) with best mIoU : {}".format(checkpoint_path, checkpoint["epoch"], best_iou))

=> loaded checkpoint '/disk/scratch_fast/s2004019/youcook2/checkpoints/procnet/run7_iou/model_best_iou.pth.tar' (epoch 3) with best mIoU : 0.3617170168005902


In [7]:
def compute_iou(c2, c1):
    """Intersection-over-union of two ``[start, end]`` intervals.

    Returns ``0`` when the intervals do not overlap (or only touch);
    otherwise the overlap length divided by the distance from the earliest
    start to the latest end.
    """
    overlap_start = max(c1[0], c2[0])
    overlap_end = min(c1[1], c2[1])
    overlap = max(0, overlap_end - overlap_start)
    if overlap == 0:
        return 0
    # For overlapping intervals the union is one contiguous span.
    span = max(c1[1], c2[1]) - min(c1[0], c2[0])
    return overlap / span
    
    
def compute_jacc(c2, c1):
    """Overlap of two ``[start, end]`` intervals relative to the length of ``c2``.

    Returns ``0`` when the intervals do not overlap; otherwise the overlap
    length divided by ``c2[1] - c2[0]``. The denominator is the length of the
    FIRST argument only, not the union of both intervals.
    """
    overlap = max(0, min(c1[1], c2[1]) - max(c1[0], c2[0]))
    return overlap / (c2[1] - c2[0]) if overlap != 0 else 0


def validate_epoch(eval_dataloader, model):
    """Score the model's predicted segments against the ground truth of every video.

    For each batch the model is called as ``model(feat, None)``. Every entry of
    the returned ``pred_seg_index`` is decoded into a boundary by splitting it
    into a row (``index // L``) and column (``index % L``) of
    ``maxpool_boundaries``, where ``L`` is the size of its last dimension.
    Each ground-truth segment is then matched with the prediction giving the
    highest ``compute_iou`` and, independently, the highest ``compute_jacc``.

    Only element ``[0]`` of each batch is read, so the loader is expected to
    use ``batch_size=1``. All predictions are scored; an earlier revision had
    a (disabled) early exit when an index equalled ``model.clip_prop_encoding``.

    Args:
        eval_dataloader: Sized iterable of dicts with the keys ``"feature"``,
            ``"segments"`` and ``"vid"``.
        model: Network to evaluate; it is put into eval mode and its input is
            moved to the GPU.

    Returns:
        A tuple ``(avg_miou, avg_jacc, results, raw_results)``:

        * ``avg_miou`` / ``avg_jacc`` - one value per video: the mean over its
          ground-truth segments of the best IoU / best ``compute_jacc`` score.
          A video without ground-truth segments contributes ``nan``.
        * ``results`` - ``{vid: {str(i): {"iou", "gt", "pred", "best_index"}}}``;
          ``"pred"`` is ``[]`` and ``"best_index"`` is ``-1`` when no
          prediction overlaps ground-truth segment ``i``.
        * ``raw_results`` - ``{vid: {j: boundary array of prediction j}}``.
    """
    model.eval()
    avg_miou = []
    avg_jacc = []
    results = {}
    raw_results = {}
    with th.no_grad():
        for data in tqdm(eval_dataloader, total=len(eval_dataloader)):
            feat = data["feature"].float().cuda()
            gt_segments = data["segments"].float()
            vid = data["vid"][0]

            _, maxpool_boundaries, pred_seg_index, _ = model(feat, None)
            n_columns = maxpool_boundaries.size(-1)

            # Decode every predicted index exactly once and move it to host
            # memory here, instead of repeating the device-to-host copy for
            # each (ground truth, prediction) pair below.
            pred_clips = []
            for j in range(len(pred_seg_index)):
                column = pred_seg_index[j] % n_columns
                row = pred_seg_index[j] // n_columns
                pred_clips.append(maxpool_boundaries[0, :, row, column].cpu().numpy())
            raw_results[vid] = dict(enumerate(pred_clips))

            gt_clips = gt_segments[0].cpu().numpy()
            n_segments = gt_segments.size(1)

            iou_clip = 0.
            jacc_clip = 0.
            per_video_result = {}
            for i in range(n_segments):
                gt_clip = gt_clips[i]
                best_iou = 0.
                best_jacc = 0.
                best_clip = []
                best_index = -1
                for j, clip_boundary in enumerate(pred_clips):
                    current_iou = compute_iou(clip_boundary, gt_clip)
                    current_jacc = compute_jacc(clip_boundary, gt_clip)
                    if current_iou > best_iou:
                        best_iou = current_iou
                        best_clip = clip_boundary
                        best_index = j
                    # The best jacc score is tracked on its own and may come
                    # from a different prediction than the best IoU.
                    if current_jacc > best_jacc:
                        best_jacc = current_jacc
                per_video_result[str(i)] = {"iou": best_iou, "gt": list(gt_clip), "pred": list(best_clip), "best_index": best_index}
                iou_clip += best_iou
                jacc_clip += best_jacc
            results[vid] = per_video_result

            if n_segments > 0:
                avg_miou.append(iou_clip / n_segments)
                avg_jacc.append(jacc_clip / n_segments)
            else:
                # No ground truth to score against: record NaN rather than
                # dividing by zero, so NaN-aware aggregation can skip the video.
                avg_miou.append(float("nan"))
                avg_jacc.append(float("nan"))
    return avg_miou, avg_jacc, results, raw_results

In [8]:
# Run the evaluation over the validation loader and report the dataset-level
# scores. np.nanmean averages the per-video values while ignoring NaN entries.
avg_miou, avg_jacc, results, raw_results = validate_epoch(test_dataloader, model)
print(f"Testing mIoU is {np.nanmean(avg_miou)} and mJacc is {np.nanmean(avg_jacc)}")

100%|██████████| 180/180 [00:32<00:00,  5.49it/s]

Testing mIoU is 0.347653728331278 and mJacc is 0.5317526235371772





In [9]:
# Result recorded from a different run (checkpoint not noted here): Testing mIoU is 0.4844937084824859 and mJacc is 0.6039762745414246

In [10]:
# Show the number of videos with results next to model.clip_prop_encoding - 1.
len(results), model.clip_prop_encoding-1

(180, 199)

In [9]:
# Raw predicted boundaries of the video at position `ind_` in `results`.
ind_ = 14
vid_ = list(results)[ind_]
raw_results[vid_]

{0: array([60.321068, 93.694954], dtype=float32),
 1: array([65.32107 , 98.694954], dtype=float32),
 2: array([ 95.32107, 128.69495], dtype=float32),
 3: array([102.32107, 135.69495], dtype=float32),
 4: array([132.32108, 165.69495], dtype=float32),
 5: array([164.32108, 197.69495], dtype=float32),
 6: array([205.32108, 238.69495], dtype=float32),
 7: array([237.32108, 270.69495], dtype=float32),
 8: array([255.32104, 288.69495], dtype=float32),
 9: array([325.32104, 358.69495], dtype=float32),
 10: array([325.32104, 358.69495], dtype=float32),
 11: array([325.32104, 358.69495], dtype=float32),
 12: array([325.32104, 358.69495], dtype=float32),
 13: array([325.32104, 358.69495], dtype=float32),
 14: array([428.65292, 450.2576 ], dtype=float32),
 15: array([325.32104, 358.69495], dtype=float32)}

In [10]:
# Raw predicted boundaries of the video at position `ind_` in `results`.
ind_ = 98
vid_ = list(results)[ind_]
raw_results[vid_]

{0: array([61.321068, 94.694954], dtype=float32),
 1: array([66.32107 , 99.694954], dtype=float32),
 2: array([ 93.32107 , 126.694954], dtype=float32),
 3: array([101.32107, 134.69495], dtype=float32),
 4: array([131.32108, 164.69495], dtype=float32),
 5: array([167.32108, 200.69495], dtype=float32),
 6: array([205.32108, 238.69495], dtype=float32),
 7: array([237.32108, 270.69495], dtype=float32),
 8: array([257.32104, 290.69495], dtype=float32),
 9: array([326.32104, 359.69495], dtype=float32),
 10: array([326.32104, 359.69495], dtype=float32),
 11: array([326.32104, 359.69495], dtype=float32),
 12: array([326.32104, 359.69495], dtype=float32),
 13: array([326.32104, 359.69495], dtype=float32),
 14: array([374.1284, 446.7816], dtype=float32),
 15: array([326.32104, 359.69495], dtype=float32)}

In [11]:
# Raw predicted boundaries of the video at position `ind_` in `results`.
ind_ = 156
vid_ = list(results)[ind_]
raw_results[vid_]

{0: array([58.321068, 91.694954], dtype=float32),
 1: array([ 77.32107 , 110.694954], dtype=float32),
 2: array([100.32107, 133.69495], dtype=float32),
 3: array([129.32108, 162.69495], dtype=float32),
 4: array([164.32108, 197.69495], dtype=float32),
 5: array([203.32108, 236.69495], dtype=float32),
 6: array([234.32108, 267.69495], dtype=float32),
 7: array([256.32104, 289.69495], dtype=float32),
 8: array([327.32104, 360.69495], dtype=float32),
 9: array([327.32104, 360.69495], dtype=float32),
 10: array([327.32104, 360.69495], dtype=float32),
 11: array([327.32104, 360.69495], dtype=float32),
 12: array([357.8494, 386.8795], dtype=float32),
 13: array([327.32104, 360.69495], dtype=float32),
 14: array([327.32104, 360.69495], dtype=float32),
 15: array([357.8494, 386.8795], dtype=float32)}

In [14]:
import json

# Pretty-print the raw predictions of the video at position `ind_` (set in an
# earlier cell). Indices and boundary lists are turned into strings so that
# json.dumps can serialize them.
selected_vid = list(results)[ind_]
test = {
    str(pred_idx): str(list(boundary))
    for pred_idx, boundary in raw_results[selected_vid].items()
}
print(json.dumps(test, indent=4))

{
    "0": "[68.14183, 117.15561]",
    "1": "[84.14755, 134.9491]",
    "2": "[135.89635, 156.42366]",
    "3": "[181.16602, 216.93011]",
    "4": "[215.98392, 257.88126]",
    "5": "[269.7658, 311.92828]",
    "6": "[291.0036, 326.789]",
    "7": "[326.0857, 361.1903]",
    "8": "[378.85477, 402.00937]",
    "9": "[401.35873, 432.45865]",
    "10": "[430.66473, 476.51892]",
    "11": "[430.66473, 476.51892]",
    "12": "[430.66473, 476.51892]",
    "13": "[430.66473, 476.51892]",
    "14": "[430.66473, 476.51892]",
    "15": "[430.66473, 476.51892]"
}


In [26]:
def Average(lst):
    """Return the arithmetic mean of ``lst``, or NaN when ``lst`` is empty.

    Args:
        lst: Sized collection of numbers.

    Returns:
        ``sum(lst) / len(lst)``; ``float("nan")`` for an empty collection, so
        that an item without any scores does not abort the whole computation
        with a ``ZeroDivisionError``.
    """
    if len(lst) == 0:
        return float("nan")
    return sum(lst) / len(lst)

# Per-video score: the mean of the "iou" values stored for its ground-truth segments.
results_vid_iou = {
    vid: Average([segment["iou"] for segment in per_segment.values()])
    for vid, per_segment in results.items()
}

In [54]:
# Select the videos whose mean IoU lies above / below the chosen cut-offs.
good_iou_threshold = 0.64
bad_iou_threshold = 0.29
results_vid_iou_good = {
    vid: iou for vid, iou in results_vid_iou.items() if iou > good_iou_threshold
}
results_vid_iou_bad = {
    vid: iou for vid, iou in results_vid_iou.items() if iou < bad_iou_threshold
}

In [55]:
# Display both selections together as a (good, bad) tuple.
results_vid_iou_good, results_vid_iou_bad

({'vLcBGs389k4': 0.6491352468729019,
  'zPCtV7YcmkA': 0.6616055568059286,
  '4apR0YypAGc': 0.6538130715489388,
  'WlHWRPyA7_g': 0.6866569618384043,
  'DHpQOhQhW3A': 0.6465556025505066},
 {'sBJJ0Cj0GG4': 0.2877267375588417,
  'fn9anlEL4FI': 0.28432218823581934,
  '-dh_uGahzYo': 0.170203671273258,
  '2-mxsib6pJo': 0.28805758378335405,
  'sGzBQrg1adY': 0.19011149276047945,
  '6seOEuK0ojg': 0.20878860460860388})

In [108]:
# Raw predicted boundaries for video -AwyG1JcMp8.
# https://youtu.be/-AwyG1JcMp8
raw_results["-AwyG1JcMp8"]

{0: array([29.105045, 71.41049 ], dtype=float32),
 1: array([ 59.09948, 101.40493], dtype=float32),
 2: array([ 89.09893, 131.40439], dtype=float32),
 3: array([119.10808, 161.41353], dtype=float32),
 4: array([149.10504, 191.41049], dtype=float32),
 5: array([184.10504, 226.41049], dtype=float32),
 6: array([214.10504, 256.4105 ], dtype=float32),
 7: array([249.08904, 291.3945 ], dtype=float32),
 8: array([274.08902, 316.3945 ], dtype=float32),
 9: array([319.08902, 361.3945 ], dtype=float32),
 10: array([344.08902, 386.3945 ], dtype=float32),
 11: array([369.08905, 411.39447], dtype=float32),
 12: array([394.08905, 436.39447], dtype=float32),
 13: array([414.08902, 456.3945 ], dtype=float32),
 14: array([439.08902, 481.3945 ], dtype=float32),
 15: array([439.08902, 481.3945 ], dtype=float32)}

In [101]:
# Per-video transcript data, keyed by video id.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's default encoding.
with open("/disk/scratch_fast/s2004019/youcook2/features/transcript_per_sec_all.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [109]:
# Transcript entry for video -AwyG1JcMp8 (the output shows the fields 'set', 'task' and 'text').
data["-AwyG1JcMp8"]

{'set': 'validation',
 'task': '309',
 'text': ['',
  '',
  '',
  '',
  '',
  '',
  "yes especially monster today we're going",
  "yes especially monster today we're going",
  "yes especially monster today we're going",
  'to do spicy sausage and huge eat',
  'to do spicy sausage and huge eat',
  'dumplings the meat quick simple',
  'dumplings the meat quick simple',
  "delicious and if you don't want this to",
  "delicious and if you don't want this to",
  'be me variety you can always have stupid',
  'be me variety you can always have stupid',
  "couple tofu if you don't want this to be",
  "couple tofu if you don't want this to be",
  "couple tofu if you don't want this to be",
  'spicy you can just omit the red pepper',
  "flakes but you can steam this Friday's",
  "flakes but you can steam this Friday's",
  "flakes but you can steam this Friday's",
  'make this into soup and all those',
  'make this into soup and all those',
  'different varieties to be post on my',
  'blog the su

In [110]:
# Per-segment matching results for video -AwyG1JcMp8: best IoU, ground-truth
# segment, best-matching prediction and that prediction's index.
results["-AwyG1JcMp8"]

{'0': {'iou': 0.5588634,
  'gt': [70.39085, 146.08995],
  'pred': [89.09893, 131.40439],
  'best_index': 2},
 '1': {'iou': 0.5964487,
  'gt': [160.28352, 185.51656],
  'pred': [149.10504, 191.41049],
  'best_index': 4},
 '2': {'iou': 0.7828389,
  'gt': [253.33034, 286.4487],
  'pred': [249.08904, 291.3945],
  'best_index': 7},
 '3': {'iou': 0.72501254,
  'gt': [303.79642, 362.1478],
  'pred': [319.08902, 361.3945],
  'best_index': 9},
 '4': {'iou': 0.1863909,
  'gt': [414.19092, 422.07626],
  'pred': [394.08905, 436.39447],
  'best_index': 12}}