In [113]:
import sys, os

# Repository root. Set the FINE_VLA_ROOT environment variable to point at a
# checkout in another location; when it is unset the original path is used.
root = os.environ.get('FINE_VLA_ROOT', r'C:\Users\user\Documents\Fine-VLA')

# Put the repository root first on the import path so the project packages
# imported right after this block can be found.
if root not in sys.path:
    sys.path.insert(0, root)

# Later cells use repo-relative paths (e.g. 'configs/ntu.json',
# 'data/nturgbd_rgb'), so the working directory must be the repository root.
os.chdir(root)

import data_loader.data_loader as module_data
import model.metric as module_metric
import model.model as module_arch
from model.model import compute_similarity
from parse_config import ConfigParser
from trainer.trainer import verbose
from utils.util import state_dict_data_parallel_fix

In [114]:
import argparse

import pandas as pd
import torch
import transformers
from sacred import Experiment
from tqdm import tqdm
import glob

import numpy as np
import os
import copy
import pathlib
import platform

In [115]:
import sys

# Keep a copy of the kernel's own argv, then replace it with a bare program
# name so argparse does not see the notebook launcher's arguments.
original_argv = sys.argv.copy()
sys.argv = ['ipython']

parser = argparse.ArgumentParser(description='PyTorch Template')
add_arg = parser.add_argument  # short alias for the registrations below

# Checkpoint, device and config selection.
add_arg(
    '-r', '--resume',
    type=str,
    default=None,
    help='path to latest checkpoint (default: None)',
)
add_arg(
    '-d', '--device',
    type=str,
    default=None,
    help='indices of GPUs to enable (default: all)',
)
add_arg(
    '-c', '--config',
    type=str,
    default=None,
    help='config file path (default: None)',
)

# Evaluation-time options.
add_arg(
    '-s', '--sliding_window_stride',
    type=int,
    default=-1,
    help='test time temporal augmentation, repeat samples with different start times.',
)
add_arg(
    '--save_feats',
    default=None,
    help='path to store text & video feats, this is for saving embeddings if you want to do offline retrieval.',
)
add_arg(
    '--save_type',
    choices=['both', 'text', 'video'],
    default='both',
    help='Whether to save video, text or both feats. If running on inference videos, text is just a placeholder',
)
add_arg('--vis_token_similarity', action='store_true')
add_arg(
    '--split',
    choices=['train', 'val', 'test'],
    default='test',
    help='split to evaluate on.',
)
# Left as the final expression so the cell still displays the returned action.
add_arg(
    '--batch_size',
    type=int,
    default=16,
    help='size of batch',
)

_StoreAction(option_strings=['--batch_size'], dest='batch_size', nargs=None, const=None, default=16, type=<class 'int'>, choices=None, required=False, help='size of batch', metavar=None)

In [116]:
# Command-line arguments to present to the argument parser.
cli_args = [
    '-c', 'configs/ntu.json',
    '-r', 'exps/pretrained/cc-webvid2m-4f_stformer_b_16_224.pth.tar',
    '--split', 'test',
    '--batch_size', '16',
]

# Rebuild argv from the program name instead of extending it in place, so
# re-running this cell does not append the same flags a second time.
sys.argv = sys.argv[:1] + cli_args
print(sys.argv)


['ipython', '-c', 'configs/ntu.json', '-r', 'exps/pretrained/cc-webvid2m-4f_stformer_b_16_224.pth.tar', '--split', 'test', '--batch_size', '16']


In [118]:
# Build the run configuration from the argument parser, with test=True.
config = ConfigParser(parser, test=True)

# Echo the data-loader settings that identify what is being evaluated.
loader_args = config['data_loader']['args']
print(f"Dataset name: {loader_args['dataset_name']}")
print(f"Data dir: {loader_args['data_dir']}")
print(f"Cut: {loader_args['cut']}")

# Parse the same argv directly to get the plain argparse namespace.
args = parser.parse_args()

Dataset name: NTU
Data dir: data/nturgbd_rgb
Cut: standard


In [121]:
# Evaluation-time overrides for the data-loader arguments, applied in the
# order listed.
loader_overrides = {
    'split': args.split,
    'tsfm_split': 'test',
    'shuffle': False,
    'batch_size': args.batch_size,
    'sliding_window_stride': args.sliding_window_stride,
}
for override_key, override_value in loader_overrides.items():
    config._config['data_loader']['args'][override_key] = override_value

In [119]:
# Build the data loader from the 'data_loader' config section, passing
# module_data as the module to resolve it from (presumably looked up by the
# configured 'type' name — confirm in ConfigParser.initialize).
data_loader = config.initialize('data_loader', module_data)

TextVideoDataLoader


' 이거 한번 만들어보자'

In [132]:
# Pull the loader's class name and a plain-dict copy of its arguments.
loader_section = config['data_loader']
module_name = loader_section['type']
module_args = dict(loader_section['args'])

# Summarise the arguments: nested dicts are reduced to their key lists.
print(module_name)
for key, value in module_args.items():
    line = f"  {key}: {list(value.keys())}" if isinstance(value, dict) else f"  {key}: {value}"
    print(line)

TextVideoDataLoader
  dataset_name: NTU
  data_dir: data/nturgbd_rgb
  shuffle: False
  num_workers: 16
  batch_size: 16
  split: test
  cut: standard
  subsample: 1
  text_params: ['input']
  video_params: ['extraction_fps', 'extraction_res', 'input_res', 'num_frames', 'stride']
  tsfm_split: test
  sliding_window_stride: -1


In [133]:
# Resolve the loader class object from its configured name.
TextVideoDataLoader = getattr(module_data, module_name)

# NOTE(review): the braces build one-element sets, so each value prints wrapped
# in {...}; possibly left over from an f-string. Kept to preserve the output.
for shown in (TextVideoDataLoader, type(TextVideoDataLoader)):
    print({shown})

{<class 'data_loader.data_loader.TextVideoDataLoader'>}
{<class 'type'>}


In [135]:
# Collect the subset of loader arguments listed below. Keys read with .get()
# are optional: a missing entry becomes None (or 1 for 'subsample').
dataset_kwargs = dict(
    dataset_name=module_args['dataset_name'],
    text_params=module_args['text_params'],
    video_params=module_args['video_params'],
    data_dir=module_args['data_dir'],
    metadata_dir=module_args.get('metadata_dir'),
    split=module_args['split'],
    cut=module_args.get('cut'),
    subsample=module_args.get('subsample', 1),
)

# Show the collected arguments, one per line.
for item in dataset_kwargs.items():
    key, value = item
    print(f"  {key}: {value}")

  dataset_name: NTU
  text_params: OrderedDict([('input', 'text')])
  video_params: OrderedDict([('extraction_fps', 25), ('extraction_res', 256), ('input_res', 224), ('num_frames', 4), ('stride', 1)])
  data_dir: data/nturgbd_rgb
  metadata_dir: None
  split: test
  cut: standard
  subsample: 1


In [138]:
from data_loader.NTU_dataset import NTU
from base.base_dataset import TextVideoDataset



dataset_name: 'NTU'
NTU 클래스: <class 'data_loader.NTU_dataset.NTU'>
NTU 부모 클래스: (<class 'base.base_dataset.TextVideoDataset'>,)


In [141]:
# Locations of the NTU annotation table and the split lists.
ntu_metadata_dir = "data/nturgbd_rgb"
ntu_split_dir = os.path.join(ntu_metadata_dir, 'splits')
csv_fp = os.path.join(ntu_metadata_dir, 'annotations.csv')
ntu_train_list_path, ntu_test_list_path = "train_list.txt", "test_list.txt"

# Annotation table first, then the train and test split lists; each list is a
# header-less, one-column file read into a 'videoid' column.
ntu_df = pd.read_csv(csv_fp)
ntu_train_df, ntu_test_df = (
    pd.read_csv(os.path.join(ntu_split_dir, list_name), names=['videoid'])
    for list_name in (ntu_train_list_path, ntu_test_list_path)
)

# Preview the first rows of each split.
for split_df in (ntu_train_df, ntu_test_df):
    print(split_df.head())

                    videoid
0  S017C001P003R001A001_rgb
1  S017C001P003R001A002_rgb
2  S017C001P003R001A003_rgb
3  S017C001P003R001A004_rgb
4  S017C001P003R001A005_rgb
                    videoid
0  S017C001P008R001A001_rgb
1  S017C001P008R001A002_rgb
2  S017C001P008R001A003_rgb
3  S017C001P008R001A004_rgb
4  S017C001P008R001A005_rgb


In [142]:
# Group the caption column by video id, then collapse each group into a list
# so every video maps to all of its captions.
captions_by_video = ntu_df.groupby(['video_id'])['caption']
ntu_metadata = captions_by_video.apply(list)
print(ntu_metadata.head())

video_id
S017C001P003R001A001_rgb    [drink_water]
S017C001P003R001A002_rgb       [eat_meal]
S017C001P003R001A003_rgb    [brush_teeth]
S017C001P003R001A004_rgb     [brush_hair]
S017C001P003R001A005_rgb           [drop]
Name: caption, dtype: object


In [147]:
# Build the data loader from the 'data_loader' config section again; this is
# the same call made in an earlier cell.
# NOTE(review): confirm the repeated initialisation is intentional.
data_loader = config.initialize('data_loader', module_data)

TextVideoDataLoader


In [151]:
# Inspect the dataset object behind the loader: metadata shape, then the
# first three metadata rows.
dataset = data_loader.dataset
for report_line in (
    f"Metadata shape: {dataset.metadata.shape}",
    f"Metadata 첫 3행:\n{dataset.metadata.head(3)}",
):
    print(report_line)

Metadata shape: (600,)
Metadata 첫 3행:
video_id
S017C001P008R001A001_rgb    [drink_water]
S017C001P008R001A002_rgb       [eat_meal]
S017C001P008R001A003_rgb    [brush_teeth]
Name: caption, dtype: object


In [None]:
# Pull the first batch from the data loader by creating a fresh iterator over
# it and advancing that iterator once.
first_batch = next(iter(data_loader))