In [None]:
import pandas as pd
from pathlib import Path

# Merge every per-source "*_metadata.csv" in training_dataset into one master file.
output_dir = Path("../training_dataset")
master_path = output_dir / "master_metadata.csv"

# The output file's own name matches "*_metadata.csv". Exclude it so that
# re-running this cell does not fold the previous master back into the merge
# (which would duplicate every row). Sorting keeps the row order reproducible,
# since glob order depends on the filesystem.
csv_files = sorted(
    f for f in output_dir.glob("*_metadata.csv") if f.name != master_path.name
)

if csv_files:
    dfs = [pd.read_csv(f) for f in csv_files]
    master_df = pd.concat(dfs, ignore_index=True)
    # utf-8-sig writes a BOM at the start of the file.
    master_df.to_csv(master_path, index=False, encoding='utf-8-sig')
    print(f"Combined {len(csv_files)} files into master_metadata.csv")
else:
    # pd.concat raises ValueError on an empty list; report the real cause instead.
    print(f"No *_metadata.csv files found in {output_dir}; nothing to merge")

In [None]:
import sys
from pathlib import Path

# Make the parent of the current working directory importable so the
# `src.*` imports that follow can be resolved.
PROJECT_ROOT = str(Path.cwd().parent)
# Guard the append so re-running this cell does not pile up duplicate entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from src.data_preprocessing.tokenizer import NSLTokenizer
from src.data_preprocessing.dataset import NSLDataset, nsl_collate_fn
from torch.utils.data import DataLoader

# Build the tokenizer and load its vocabulary from the saved JSON file.
tokenizer = NSLTokenizer()
tokenizer.load_vocab("../vocab.json")

# Dataset over the merged metadata file and the sequences directory.
dataset = NSLDataset(
    tokenizer=tokenizer,
    sequences_root="../training_dataset/sequences",
    metadata_path="../training_dataset/master_metadata.csv",
)

# Small shuffled batches, assembled by the project's collate function.
loader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=nsl_collate_fn)

# Pull a single batch as a smoke test of the whole pipeline.
batch = next(iter(loader))

# Inspect the three entries read from the collated batch.
print("Batch Motion Shape:", batch["features"].shape)
print("Batch Tokens Shape:", batch["token_ids"].shape)
print("Actual lengths:", batch["lengths"])

In [1]:
import torch

# Report the installed PyTorch build and whether a CUDA device is usable.
print("--- GPU Confirmation ---")
print(f"PyTorch Version: {torch.__version__}")

cuda_ready = torch.cuda.is_available()
print(f"Is CUDA available? {cuda_ready}")

if not cuda_ready:
    print("CUDA is NOT available. The model will run on CPU (Slow).")
else:
    # Device details are only queried when CUDA is actually present.
    device_name = torch.cuda.get_device_name(0)
    print(f"GPU Device Name: {device_name}")
    device_id = torch.cuda.current_device()
    print(f"Current GPU Device ID: {device_id}")
    print(f"CUDA Version: {torch.version.cuda}")

--- GPU Confirmation ---
PyTorch Version: 2.7.1+cu118
Is CUDA available? True
GPU Device Name: NVIDIA GeForce RTX 4070 Laptop GPU
Current GPU Device ID: 0
CUDA Version: 11.8
