In [None]:
import sys, os

root = r'C:\Users\user\Documents\Fine-VLA'

if root not in sys.path:
    sys.path.insert(0, root)

os.chdir(root)

import data.data_loader as module_data
import model.metric as module_metric
import model.model as module_arch
from model.model import compute_similarity
from scripts.parse_config import ConfigParser
from trainer.trainer import verbose
from utils.util import state_dict_data_parallel_fix

In [181]:
import argparse

import pandas as pd
import torch
import transformers
from sacred import Experiment
from tqdm import tqdm
import glob

import numpy as np
import os
import copy
import pathlib
import platform

In [182]:
import sys

original_argv = sys.argv.copy()
sys.argv = ['ipython']

parser = argparse.ArgumentParser(description='PyTorch Template')

parser.add_argument('-r', '--resume', default=None, type=str,
                    help='path to latest checkpoint (default: None)')
parser.add_argument('-d', '--device', default=None, type=str,
                    help='indices of GPUs to enable (default: all)')
parser.add_argument('-c', '--config', default=None, type=str,
                    help='config file path (default: None)')
parser.add_argument('-s', '--sliding_window_stride', default=-1, type=int,
                    help='test time temporal augmentation, repeat samples with different start times.')
parser.add_argument('--save_feats', default=None,
                    help='path to store text & video feats, this is for saving embeddings if you want to do offline retrieval.')
parser.add_argument('--save_type', default='both', choices=['both', 'text', 'video'],
                    help='Whether to save video, text or both feats. If running on inference videos, text is just a placeholder')
parser.add_argument('--vis_token_similarity', action='store_true')
parser.add_argument('--split', default='test', choices=['train', 'val', 'test'],
                    help='split to evaluate on.')
parser.add_argument('--batch_size', default=16, type=int,
                    help='size of batch')

_StoreAction(option_strings=['--batch_size'], dest='batch_size', nargs=None, const=None, default=16, type=<class 'int'>, choices=None, required=False, help='size of batch', metavar=None)

In [183]:
sys.argv.extend([
    '-c', 'configs/ntu.json',
    '-r', 'exps/pretrained/cc-webvid2m-4f_stformer_b_16_224.pth.tar',
    '--split', 'test',
    '--batch_size', '16'
])

In [184]:
config = ConfigParser(parser, test=True) # parse config

print(f"Dataset name: {config['data_loader']['args']['dataset_name']}")
print(f"Data dir: {config['data_loader']['args']['data_dir']}")
print(f"Cut: {config['data_loader']['args']['cut']}")

args = parser.parse_args()

Dataset name: NTU
Data dir: C:/Users/user/Documents/data/nturgbd_rgb
Cut: standard


In [185]:
config._config['data_loader']['args']['split'] = args.split
config._config['data_loader']['args']['tsfm_split'] = 'test'
config._config['data_loader']['args']['shuffle'] = False
config._config['data_loader']['args']['batch_size'] = args.batch_size
config._config['data_loader']['args']['sliding_window_stride'] = args.sliding_window_stride

In [186]:
data_loader = config.initialize('data_loader', module_data)
# config['data_loader']['type'] -> "TextVideoDataLoader"
# getattr(module_data, config['data_loader']['type'])
# TextVideoDataLoader(**module_args) -> data_loader.py

TextVideoDataLoader


In [187]:
module_name = config['data_loader']['type']
print(module_name)

module_args = dict(config['data_loader']['args'])
for key, value in module_args.items():
    if isinstance(value, dict):
        print(f"  {key}: {list(value.keys())}")
    else:
        print(f"  {key}: {value}")

TextVideoDataLoader
  dataset_name: NTU
  data_dir: C:/Users/user/Documents/data/nturgbd_rgb
  shuffle: False
  num_workers: 16
  batch_size: 16
  split: test
  cut: standard
  subsample: 1
  text_params: ['input']
  video_params: ['extraction_fps', 'extraction_res', 'input_res', 'num_frames', 'stride']
  tsfm_split: test
  sliding_window_stride: -1


In [189]:
from data_loader.NTU_dataset import NTU
from data_loader.transforms import init_transform_dict

dataset_name = 'NTU'
text_params = config['data_loader']['args']['text_params']
video_params = config['data_loader']['args']['video_params']
data_dir = config['data_loader']['args']['data_dir']
metadata_dir = config['data_loader']['args'].get('metadata_dir')
split = config['data_loader']['args']['split']
cut = config['data_loader']['args'].get('cut')
subsample = config['data_loader']['args'].get('subsample', 1)
sliding_window_stride = config['data_loader']['args'].get('sliding_window_stride', -1)
reader = config['data_loader']['args'].get('reader', 'decord')

tsfm_params = config['data_loader']['args'].get('tsfm_params', {})
tsfm_dict = init_transform_dict(**tsfm_params)
tsfm_split = config['data_loader']['args'].get('tsfm_split', split)
tsfm = tsfm_dict[tsfm_split]

In [190]:
ntu_dataset = NTU(
    dataset_name=dataset_name,
    text_params=text_params, 
    video_params=video_params,
    data_dir=data_dir,
    metadata_dir=metadata_dir,
    split=split,
    tsfms=tsfm,
    cut=cut,
    subsample=subsample,
    sliding_window_stride=sliding_window_stride,
    reader=reader
)

print(ntu_dataset)
print(f"dataset size: {len(ntu_dataset)}")
print(f"metadata shape: {ntu_dataset.metadata.shape}")
print(f"first video id: {ntu_dataset.metadata.index[0]}")
print(f"dataset_name: {ntu_dataset.dataset_name}")
print(f"split: {ntu_dataset.split}")
print(f"data directory: {ntu_dataset.data_dir}")
print(f"metadata directory: {ntu_dataset.metadata_dir}")

<data_loader.NTU_dataset.NTU object at 0x000001D88613D120>
dataset size: 600
metadata shape: (600,)
first video id: S017C001P008R001A001_rgb
dataset_name: NTU
split: test
data directory: C:/Users/user/Documents/data/nturgbd_rgb
metadata directory: C:/Users/user/Documents/data/nturgbd_rgb


In [None]:
ntu_metadata_dir = "data/nturgbd"
csv_fp = os.path.join(ntu_metadata_dir, 'annotations.csv')
ntu_df = pd.read_csv(csv_fp)

ntu_split_dir = os.path.join(ntu_metadata_dir, 'splits')
ntu_train_list_path = "train_list.txt"
ntu_test_list_path = "test_list.txt"

        
ntu_train_df = pd.read_csv(os.path.join(ntu_split_dir, ntu_train_list_path), names=['videoid'])
ntu_test_df = pd.read_csv(os.path.join(ntu_split_dir, ntu_test_list_path), names=['videoid'])

print(ntu_train_df.head())
print(ntu_test_df.head())

                    videoid
0  S017C001P003R001A001_rgb
1  S017C001P003R001A002_rgb
2  S017C001P003R001A003_rgb
3  S017C001P003R001A004_rgb
4  S017C001P003R001A005_rgb
                    videoid
0  S017C001P008R001A001_rgb
1  S017C001P008R001A002_rgb
2  S017C001P008R001A003_rgb
3  S017C001P008R001A004_rgb
4  S017C001P008R001A005_rgb


In [192]:
ntu_metadata = ntu_df.groupby(['video_id'])['caption'].apply(list)
print(ntu_metadata.head())

video_id
S017C001P003R001A001_rgb    [drink_water]
S017C001P003R001A002_rgb       [eat_meal]
S017C001P003R001A003_rgb    [brush_teeth]
S017C001P003R001A004_rgb     [brush_hair]
S017C001P003R001A005_rgb           [drop]
Name: caption, dtype: object


In [195]:
dataset = TextVideoDataLoader.dataset
print(f"Metadata shape: {dataset.metadata.shape}")
print(f"Metadata 첫 3행:\n{dataset.metadata.head(3)}")

AttributeError: type object 'TextVideoDataLoader' has no attribute 'dataset'

In [194]:
first_batch = next(iter(data_loader))

TypeError: Caught TypeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "c:\Users\user\Documents\Fine-VLA\.venv\lib\site-packages\torch\utils\data\_utils\worker.py", line 349, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
  File "c:\Users\user\Documents\Fine-VLA\.venv\lib\site-packages\torch\utils\data\_utils\fetch.py", line 52, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "c:\Users\user\Documents\Fine-VLA\.venv\lib\site-packages\torch\utils\data\_utils\fetch.py", line 52, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "C:\Users\user\Documents\Fine-VLA\base\base_dataset.py", line 85, in __getitem__
    video_fp, rel_fp = self._get_video_path(sample) # 비디오 경로 가져오기
  File "C:\Users\user\Documents\Fine-VLA\data_loader\NTU_dataset.py", line 37, in _get_video_path
    rel_path = sample['video_path']
TypeError: list indices must be integers or slices, not str


In [None]:
text_model_name = config['arch']['args']['text_params']['model']
print("Text model: {text_model_name}")

if "openai/clip" in text_model_name:
    tokenizer_builder = transformers.CLIPTokenizer
else:
    tokenizer_builder = transformers.AutoTokenizer
tokenizer = tokenizer_builder.from_pretrained(
    text_model_name,
    model_max_length=int(config['arch']['args']['text_params'].get('max_length', 1e6)),
    TOKENIZERS_PARALLELISM=False
)

In [196]:
import copy

data = copy.deepcopy(first_batch)

# text_embeds

if tokenizer is not None:
    print(f"원본 텍스트: {data['text'][:2]}")
    data['text'] = tokenizer(data['text'], return_tensors='pt', padding=True, truncation=True)
    print(f"토큰화 후 키들:{data['text'].keys()}")
    print(f"Input IDs shape: {data['text']['input_ids'].shape}")

NameError: name 'first_batch' is not defined

In [None]:
import copy
import matplotlib.pyplot as plt
import numpy as np
import torch

# 배치 데이터 복사
data = copy.deepcopy(first_batch)

# 1. 텍스트 토큰화 시각화
print("\n===== 텍스트 토큰화 =====")
print(f"원본 텍스트: {data['text'][:3]}")

if tokenizer is not None:
    # 토큰화
    data['text'] = tokenizer(data['text'], return_tensors='pt', padding=True, truncation=True)
    print(f"토큰화 후 키들: {data['text'].keys()}")
    print(f"Input IDs shape: {data['text']['input_ids'].shape}")
    print(f"Attention mask shape: {data['text']['attention_mask'].shape}")
    
    # 토큰 ID 시각화
    token_ids = data['text']['input_ids'][:3].numpy()  # 처음 3개 샘플만
    
    print("\n토큰 ID 매트릭스 (처음 3개 샘플):")
    for i, ids in enumerate(token_ids):
        print(f"샘플 {i+1}: {ids}")
    
    # 토큰 디코딩
    print("\n토큰 디코딩 결과:")
    for i, ids in enumerate(token_ids):
        tokens = tokenizer.convert_ids_to_tokens(ids)
        print(f"샘플 {i+1} 토큰: {tokens}")

# 2. 비디오 토큰화 시각화
print("\n===== 비디오 프레임 및 패치 시각화 =====")

# 원본 비디오 프레임 시각화
plt.figure(figsize=(15, 8))

# 상단 행: 원본 프레임
for i in range(min(4, data['video'].shape[1])):
    plt.subplot(2, 4, i+1)
    frame = data['video'][0, i].permute(1, 2, 0).numpy()
    if frame.min() < 0 or frame.max() > 1:
        frame = (frame - frame.min()) / (frame.max() - frame.min())
    plt.imshow(frame)
    plt.title(f"원본 프레임 {i+1}")
    plt.axis('off')

# 하단 행: 패치 그리드 시각화
patch_size = 16  # 일반적인 ViT 패치 크기
for i in range(min(4, data['video'].shape[1])):
    plt.subplot(2, 4, i+5)
    frame = data['video'][0, i].permute(1, 2, 0).numpy()
    if frame.min() < 0 or frame.max() > 1:
        frame = (frame - frame.min()) / (frame.max() - frame.min())
    
    plt.imshow(frame)
    
    # 패치 그리드 오버레이
    h, w = frame.shape[0], frame.shape[1]
    for y in range(0, h, patch_size):
        plt.axhline(y=y, color='r', linestyle='-', alpha=0.3)
    for x in range(0, w, patch_size):
        plt.axvline(x=x, color='r', linestyle='-', alpha=0.3)
        
    plt.title(f"프레임 {i+1} 패치 그리드")
    plt.axis('off')

plt.tight_layout()
plt.show()

# 3. 패치 토큰 샘플 시각화
frame_idx = 0
frame = data['video'][0, frame_idx].permute(1, 2, 0).numpy()
if frame.min() < 0 or frame.max() > 1:
    frame = (frame - frame.min()) / (frame.max() - frame.min())

h, w = frame.shape[0], frame.shape[1]
num_patches_h = h // patch_size
num_patches_w = w // patch_size
total_patches = num_patches_h * num_patches_w

print(f"\n프레임당 패치 수: {total_patches} ({num_patches_h}x{num_patches_w})")
print(f"비디오 패치 토큰 수: {total_patches * data['video'].shape[1]} (프레임당 {total_patches} x {data['video'].shape[1]}프레임)")

# 패치 샘플 추출 및 시각화
plt.figure(figsize=(15, 3))
patch_indices = [0, total_patches//4, total_patches//2, 3*total_patches//4]

for idx, i in enumerate(patch_indices):
    row = (i // num_patches_w) * patch_size
    col = (i % num_patches_w) * patch_size
    
    patch = frame[row:row+patch_size, col:col+patch_size, :]
    
    plt.subplot(1, 4, idx+1)
    plt.imshow(patch)
    plt.title(f"패치 토큰 #{i+1}")
    plt.axis('off')

plt.suptitle(f"비디오 패치 토큰 샘플 (프레임 {frame_idx+1})", fontsize=16)
plt.tight_layout()
plt.show()
