In [1]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell executes, so edits to local .py files are picked up live.
%load_ext autoreload
%autoreload 2

import sys
#sys.path.append('../src/processing/')
# Make the sibling `video_features` directory importable.
# NOTE(review): presumably `models.resnet.extract_resnet` and `utils.utils`
# (imported in the next cell) are resolved from this directory — confirm.
sys.path.append('../../video_features')

## Video Pipeline

In [2]:
import os
from video_pipeline import downld_vids, process_video
from models.resnet.extract_resnet import ExtractResNet
from utils.utils import build_cfg_path
from omegaconf import OmegaConf
import torch
import numpy as np
import pandas as pd

In [3]:
# Shell out to report the CUDA compiler version and the GPU/driver status.
!nvcc --version
!nvidia-smi

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2021 NVIDIA Corporation
Built on Sun_Feb_14_21:12:58_PST_2021
Cuda compilation tools, release 11.2, V11.2.152
Build cuda_11.2.r11.2/compiler.29618528_0
/bin/bash: nvidia-smi: command not found


In [None]:
# Select CUDA when PyTorch can see a GPU, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Only ask for the GPU name when a GPU exists: torch.cuda.get_device_name
# raises on CPU-only machines, which would abort a Restart & Run All.
torch.cuda.get_device_name(0) if device == 'cuda' else 'cpu (no CUDA device available)'

### Extract feature representation from video

In [7]:
def extract_vid_features(video_path, fps):
    """Run the ResNet-50 extractor on one video and return its features.

    Args:
        video_path: Path of the video; stored in the config's ``video_paths``
            and passed to ``ExtractResNet.extract``.
        fps: Value written to the config's ``extraction_fps`` field.

    Returns:
        A ``(features, timestamp)`` tuple: the ``'resnet'`` entry of the
        extractor's output dict, and its ``'timestamps_ms'`` entry divided
        by 1000 (i.e. expressed in seconds).
    """
    feature_type = 'resnet'

    # Start from the config that build_cfg_path resolves for this feature
    # type, then override the fields this notebook controls.
    cfg = OmegaConf.load(build_cfg_path(feature_type))
    cfg.feature_type = feature_type
    cfg.model_name = 'resnet50'
    cfg.video_paths = [video_path]
    cfg.batch_size = 32
    cfg.extraction_fps = fps

    # A fresh extractor is constructed on every call.
    resnet = ExtractResNet(cfg)

    print(f'Extracting for {video_path}')
    extracted = resnet.extract(video_path)
    features = extracted[feature_type]
    timestamp = extracted['timestamps_ms'] / 1000
    print(features, timestamp, sep='\n')
    return features, timestamp

def to_dataframe(features, timestamp):
    """Pack a 2-D feature array and its timestamps into one DataFrame.

    Args:
        features: 2-D array-like exposing ``.shape``; one row per sample,
            one column per feature.
        timestamp: Values for the trailing ``'timestamp'`` column, one per row.

    Returns:
        DataFrame whose feature columns are labelled ``0 .. n_features - 1``
        followed by a ``'timestamp'`` column.
    """
    n_features = features.shape[1]
    feature_cols = [idx for idx in range(n_features)]
    frame = pd.DataFrame(data=features, columns=feature_cols)
    return frame.assign(timestamp=timestamp)

In [8]:
def run_video_pipeline(yt_ids_file, dest_path, fps=2):
    """Download videos, extract ResNet-50 features and store them as CSV.

    For every ``(yt_id, vid_path)`` pair yielded by ``downld_vids`` the video
    is run through ``extract_vid_features``, the result is written to
    ``<dest_path>/features/<yt_id>/resnet50_features.csv`` and the downloaded
    video file is deleted.

    Args:
        yt_ids_file: Passed to ``downld_vids`` as the list of videos to fetch.
        dest_path: Root folder; CSVs go under its ``features`` subfolder.
        fps: Extraction frame rate forwarded to ``extract_vid_features``
            (defaults to 2, the value previously hard-coded here).
    """
    vid_dump_folder = 'tmp/videos'
    csv_root = os.path.join(dest_path, 'features')
    for yt_id, vid_path in downld_vids(yt_ids_file, vid_dump_folder):
        print(yt_id, vid_path)
        features, timestamp = extract_vid_features(vid_path, fps)
        df = to_dataframe(features, timestamp)

        # to_csv does not create missing folders, so make the per-video one.
        out_dir = os.path.join(csv_root, yt_id)
        os.makedirs(out_dir, exist_ok=True)
        # `header=False` belongs to to_csv; it used to be passed to
        # os.path.join, which raised TypeError before anything was written.
        # The row index is still written as the first column (pandas default).
        df.to_csv(os.path.join(out_dir, 'resnet50_features.csv'), header=False)

        # The video is only needed for extraction; remove it to save disk.
        os.remove(vid_path)
        print(yt_id, 'is complete!')

### Get YouTube IDs

In [9]:
def dump_yt_ids(key_clip_training_file, dest, step=10):
    """Write the unique video ids of a clip sheet into numbered batch files.

    The ids are taken as the last 11 characters of each ``VideoUrl`` value,
    de-duplicated in order of first appearance, and written ``step`` per file
    to ``<dest>/0.txt``, ``<dest>/1.txt``, ... (one id per line).

    Args:
        key_clip_training_file: CSV file containing a ``VideoUrl`` column.
        dest: Output folder; created (including parents) when missing.
        step: Number of ids per batch file (defaults to 10).
    """
    dest_path = os.path.abspath(dest)
    # makedirs handles nested destinations such as 'tmp/youtube_ids', where
    # os.mkdir fails if the parent folder does not exist yet.
    os.makedirs(dest_path, exist_ok=True)

    df = pd.read_csv(key_clip_training_file)
    yt_ids = df['VideoUrl'].str[-11:].unique().tolist()

    for cnt, start in enumerate(range(0, len(yt_ids), step)):
        ids_file = os.path.join(dest_path, f'{cnt}.txt')
        # Slicing past the end is safe, so no explicit min() is needed.
        with open(ids_file, 'w') as f:
            f.write('\n'.join(yt_ids[start:start + step]))

In [91]:
# Split the video ids found in the reviewed clip sheet into batch files
# (0.txt, 1.txt, ...) under tmp/youtube_ids.
key_clip_training_file = 'reviewed_0812_vid_5s.csv'
yt_id_dump_folder = 'tmp/youtube_ids'
dump_yt_ids(key_clip_training_file, yt_id_dump_folder)

### Run the pipeline

In [None]:
# Run the pipeline on one batch file ('1.txt', i.e. the second batch since
# numbering starts at 0); the pipeline targets tmp/features/<yt_id>/.
run_video_pipeline('tmp/youtube_ids/1.txt', 'tmp')

In [None]:
# Stand-alone run of the extractor on the sample video from the
# video_features directory, using ResNet-152 at 5 fps.
feature_type = 'resnet'
model_name = 'resnet152'

# Load the config for this feature type and override the relevant fields
args = OmegaConf.load(build_cfg_path(feature_type))
args.feature_type = feature_type
args.model_name = model_name
args.video_paths = ['../../video_features/sample/v_GGSY1Qvo990.mp4']
args.batch_size = 32
args.extraction_fps = 5

# Build the extractor from the patched config
extractor = ExtractResNet(args)

# Extract features and print every entry of the result: key, shape, values
for video_path in args.video_paths:
    print(f'Extracting for {video_path}')
    feature_dict = extractor.extract(video_path)
    for k, v in feature_dict.items():
        print(k)
        print(v.shape)
        print(v)

### Align and map transcripts to video

In [11]:
def to_seconds(timestamp):
    """Convert a ``[[H:]M:]S[.fff]`` timestamp string to seconds.

    Accepts ``"S"``, ``"M:S"`` and ``"H:M:S"`` (seconds may be fractional);
    surrounding whitespace is ignored.

    Args:
        timestamp: Timestamp string such as ``"00:01.5"`` or ``"1:02:03"``.

    Returns:
        The timestamp as a float number of seconds.

    Raises:
        ValueError: If there are more than three ``:``-separated fields or a
            field is not numeric.
    """
    fields = timestamp.strip().split(':')
    if len(fields) > 3:
        raise ValueError(f'Unsupported timestamp format: {timestamp!r}')
    # Left-pad with zeros so "S", "M:S" and "H:M:S" all unpack the same way
    # (a seconds-only value used to raise IndexError).
    h, m, s = ['0'] * (3 - len(fields)) + fields
    return int(h) * 3600 + int(m) * 60 + float(s)


# Element-wise version for arrays / pandas Series of timestamp strings.
t2sec = np.vectorize(to_seconds)

In [55]:
def align_text_video(text_df, v_df, ext):
    """Attach transcript row ids and labels to the video frames they cover.

    Each transcript row covers the window
    ``[start_time - ext, end_time + ext)``. Every row of ``v_df`` whose
    timestamp (the LAST column of ``v_df``) falls inside that window is
    emitted once for that transcript row, so a frame appears several times
    when neighbouring windows overlap.

    Args:
        text_df: DataFrame with ``RowNumber``, ``IsUsefulSentence``,
            ``start_time`` and ``end_time`` columns.
        v_df: Frame table whose last column holds the frame timestamps.
            NOTE(review): the sweep assumes these are sorted ascending and
            that ``text_df`` rows are in time order — confirm with callers.
        ext: Amount subtracted from each start and added to each end.

    Returns:
        DataFrame made of the selected ``v_df`` rows (index reset) with two
        extra columns, ``row_number`` and ``label``. The four transcript
        values are read through ``iterrows``, so with mixed int/float columns
        ``row_number`` and ``label`` come back as floats.
    """
    timestamps = v_df[v_df.columns[-1]].tolist()
    n = len(timestamps)
    res = []
    indices = []
    i = 0  # frame cursor, carried over from one transcript row to the next
    wanted = ['RowNumber', 'IsUsefulSentence', 'start_time', 'end_time']
    for _, row in text_df[wanted].iterrows():
        row_id, label, st, et = row.tolist()
        st -= ext
        et += ext
        # The cursor may sit one past the end after the previous row; pull it
        # back in range. max() keeps it at 0 for an empty frame table, where
        # the cursor used to underflow to -1 and raise IndexError.
        i = max(min(i, n - 1), 0)
        # Rewind: the extended window may start before the previous one ended.
        while i > 0 and timestamps[i] > st:
            i -= 1
        # Skip frames that precede the window.
        while i < n and timestamps[i] < st:
            i += 1
        # Collect every frame inside [st, et).
        while i < n and st <= timestamps[i] < et:
            res.append((row_id, label))
            indices.append(i)
            i += 1
    assert len(indices) == len(res)
    df = pd.DataFrame(res, columns = ['row_number', 'label'])
    return pd.concat([v_df.iloc[indices].reset_index(drop=True), df], axis = 1)

In [15]:
# Read the reviewed transcript sheet
key_clip_training_file = 'reviewed_0812_vid_5s.csv'
df = pd.read_csv(key_clip_training_file)

# Video id: everything after the first '=' of the URL
df['yt_id'] = df['VideoUrl'].str.extract('=(.+)')[0]

# Sentence start in seconds
df['start_time'] = t2sec(df['TimeStamp'])

# Sentence end = start of the next sentence of the same video; the last
# sentence of each video has no successor (filled with 0 by the shift) and
# is given an end 5 seconds after its start instead.
start_time = df['start_time']
end_time = df.groupby('yt_id')['start_time'].shift(-1, fill_value = 0)
df['end_time'] = end_time.where(end_time != 0, 5 + start_time)
df.head(5)

Unnamed: 0,No,Title,VideoUrl,TimeStamp,Sentence,RowNumber,IsUsefulSentence,Key steps,Verb,Object(directly related with Verb),Location,Time,Temperature,Other important phrase(like with,Video Pred,Clip IDs,yt_id,start_time,end_time
0,1,pizza marghetta,https://www.youtube.com/watch?v=FHvZgt3ExDI,00:00.1,"guys , jason hill here today.",0,0,,,,,,,,look grape,FHvZgt3ExDI_0001,FHvZgt3ExDI,0.1,1.5
1,2,pizza marghetta,https://www.youtube.com/watch?v=FHvZgt3ExDI,00:01.5,and i 'm with chef great stillman at repor res...,1,0,,,,,,,,"look grape, close grape","FHvZgt3ExDI_0001, FHvZgt3ExDI_0002",FHvZgt3ExDI,1.5,7.3
2,3,pizza marghetta,https://www.youtube.com/watch?v=FHvZgt3ExDI,00:07.3,what are we going to have?,2,0,,,,,,,,close grape,FHvZgt3ExDI_0002,FHvZgt3ExDI,7.3,8.2
3,4,pizza marghetta,https://www.youtube.com/watch?v=FHvZgt3ExDI,00:08.2,we 're going to have a traditional margherita ...,3,0,,,,,,,,"close grape, take pizza, close cupboard","FHvZgt3ExDI_0002, FHvZgt3ExDI_0003, FHvZgt3ExD...",FHvZgt3ExDI,8.2,15.9
4,5,pizza marghetta,https://www.youtube.com/watch?v=FHvZgt3ExDI,00:15.9,so this is the dough.,4,0,,,,,,,,close cupboard,FHvZgt3ExDI_0004,FHvZgt3ExDI,15.9,17.4


In [None]:
#Align text and video
# For every video folder under `features`, label its frame table with the
# transcript rows from `df` and write the result next to the source file.
# NOTE(review): `ext` (the window extension passed to align_text_video) is
# not defined anywhere in the visible notebook, so this cell raises NameError
# on a fresh kernel — define it explicitly before running.
# NOTE(review): `features` is a hard-coded absolute path.
features = '/network/scratch/s/subhrajyoti.dasgupta/yc2/features/'
yt_ids = os.listdir(features)
src_name = 'master_features_with_ts_v2.csv'
dest_name = 'master_features_with_label.csv'

# 1-based progress counter used only in the log line
cnt = 1
for yt_id in yt_ids:
    #get text corresponding to yt_id
    text_by_yt_id = df[df['yt_id'] == yt_id]
    #load video df corresponding to yt_id
    src_path = os.path.join(features, yt_id, src_name)
    df_v = pd.read_csv(src_path, header = None)
    
    #align
    # The first two columns (labels 0 and 1) are dropped before alignment;
    # align_text_video reads the timestamps from the last remaining column.
    df_nv = align_text_video(text_by_yt_id, df_v.drop([0, 1], axis = 1), ext)
    dest_path = os.path.join(features, yt_id, dest_name)
    # Written without header row and without index column
    df_nv.to_csv(dest_path, header = False, index = False)
    print(f"{cnt}: {yt_id} - alignment is complete!")
    cnt += 1

### Video to Clips (ResNet features)

#### Dataframe to Tensor

In [None]:
# Check the zero-padded file-name pattern used for clip tensors
# (a bare `str.format()` call raises TypeError and stops Run All).
'clip_{:03}.pt'.format(7)

In [None]:
# Split each video's labelled frame table into one tensor file per group.
# NOTE(review): both roots are hard-coded absolute paths.
features = '/network/scratch/s/subhrajyoti.dasgupta/yc2/features/'
clips = '/network/scratch/s/subhrajyoti.dasgupta/yc2/clips/'
yt_ids = os.listdir(features)
src_name = 'master_features_with_label.csv'
dest_name = 'clip_{:03}.pt'

# Columns 0..511 are saved as the feature matrix; column 513 is the group key.
# NOTE(review): presumably column 513 is the `row_number` appended by
# align_text_video (after 512 features + timestamp) — confirm.
cols = list(range(512))
for cnt, yt_id in enumerate(yt_ids, start=1):
    src_path = os.path.join(features, yt_id, src_name)
    df = pd.read_csv(src_path, header = None)

    # One output folder per video; makedirs also creates the `clips` root
    # when it does not exist yet (os.mkdir would fail in that case).
    dest_fldr = os.path.join(clips, yt_id)
    os.makedirs(dest_fldr, exist_ok=True)

    for i, d in df.groupby(513):
        # The group key is cast to int before being zero-padded into the name.
        dest_path = os.path.join(dest_fldr, dest_name.format(int(i)))
        tensor = torch.from_numpy(d[cols].values)
        torch.save(tensor, dest_path)
    print(f'{cnt}: {yt_id} clips saved!')

#### clip reference path in reviewed_0812_vid_5s

In [107]:
# Add a relative `clip_path` column (clips/<yt_id>/clip_NNN.pt) to the
# reviewed sheet and save the result as a new CSV.
key_clip_training_file = 'reviewed_0812_vid_5s.csv'
save_path = '/network/scratch/s/subhrajyoti.dasgupta/yc2/reviewed_0812_with_clip_path.csv'
df = pd.read_csv(key_clip_training_file)

# Video id: everything after the first '=' of the URL
yt_id = df['VideoUrl'].str.extract('=(.+)')[0]
# Clip file name: the row number, zero-padded to three digits
clip_num = df['RowNumber'].map(lambda row_number: 'clip_{:03}.pt'.format(int(row_number)))

df['clip_path'] = 'clips/' + yt_id + '/' + clip_num
df.to_csv(save_path)

### Generate n-second clips

In [None]:
# vid_path = './tmp/videos/FHvZgt3ExDI.mp4'
# dest = './tmp/video_frames/'
# import time
# s = time.time()
# process_video(vid_path, dest, 5)
# e = time.time()
# print(e - s)

### Object & Action detection from clips