# **DATA TRAINER USING BiLSTM**

## **x.1 Setup and Reproducibility**

#### x.1.1 Import Libraries

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import defaultdict

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

#### x.1.2 Device and Seed

In [2]:
# Use CUDA when torch reports it as available; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Device:", device)

def set_seed(seed=42):
    """Seed every random number generator used here and pin cuDNN settings.

    The same ``seed`` is handed, in order, to ``random.seed``,
    ``np.random.seed``, ``torch.manual_seed`` and
    ``torch.cuda.manual_seed_all``. Afterwards cuDNN is switched to
    deterministic mode and its benchmark mode is turned off.

    Args:
        seed: Integer seed shared by all generators (default 42).
    """
    import random

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


# Seed once when this cell runs.
set_seed(42)

Device: cpu


## **x.2 Dataset Configuration and Utilities**

#### x.2.1 Dataset Config

In [3]:
# Folder whose *.csv files are indexed by build_file_index.
csv_folder: Path = Path("input/")
# Number of rows each sequence is padded/truncated to
# (presumably passed as pad_len to SignCSVDataset - confirm at the call site).
pad_len: int = 83
# Feature columns per row; matches the default n_features of SignCSVDataset.
n_features: int = 118

#### x.2.2 Filename Parsing

In [None]:
def parse_label_from_filename(csv_path):
    """Derive the class label from a CSV path.

    The label is the file stem with its last underscore-separated token
    removed, e.g. ``thank_you_12.csv`` -> ``thank_you``. A stem that
    contains no underscore is returned unchanged.

    Args:
        csv_path: ``pathlib.Path``-like object exposing ``.stem``.

    Returns:
        The label string.
    """
    stem = csv_path.stem
    head, underscore, _tail = stem.rpartition("_")
    # rpartition yields an empty separator when no underscore is present.
    return head if underscore else stem

#### x.2.3 Padding/Truncation

In [None]:
def pad_or_truncate_with_length(sequence_2d, target_len, n_features):
    """Fit a 2-D sequence to exactly ``target_len`` rows.

    Rows beyond ``target_len`` are dropped; missing rows are filled with
    zeros at the end. The result is always ``float32``.

    Args:
        sequence_2d: 2-D NumPy array (rows x columns).
        target_len: Number of rows in the returned array.
        n_features: Accepted for interface compatibility but not used;
            the output width is taken from ``sequence_2d`` itself.

    Returns:
        Tuple ``(X, length)`` where ``X`` has shape
        ``(target_len, sequence_2d.shape[1])`` and ``length`` is the number
        of original rows that were kept (``min(rows, target_len)``).
    """
    n_rows, n_cols = sequence_2d.shape
    kept = target_len if n_rows > target_len else n_rows

    padded = np.zeros((target_len, n_cols), dtype=np.float32)
    # Only the leading `kept` rows carry data; the remainder stays zero.
    padded[:kept] = sequence_2d[:kept].astype(np.float32)

    return padded, kept

## **x.3 Dataset Indexing and DataLoaders**

#### x.3.1 Build File Index

In [None]:
def build_file_index(csv_folder: Path):
    """Scan ``csv_folder`` for ``*.csv`` files and build the label vocabulary.

    Files are listed in sorted order (non-recursive glob). Each file's label
    comes from ``parse_label_from_filename``; class ids are assigned by
    enumerating the sorted set of distinct labels.

    Args:
        csv_folder: Directory to scan.

    Returns:
        Tuple ``(file_paths, labels, label_to_id, id_to_label)``:
          - file_paths: list[Path], sorted
          - labels: list[str], aligned with ``file_paths``
          - label_to_id: dict[str, int]
          - id_to_label: dict[int, str]

    Raises:
        FileNotFoundError: If the folder contains no ``*.csv`` file.
    """
    csv_files = sorted(csv_folder.glob("*.csv"))
    if not csv_files:
        raise FileNotFoundError(f"No CSV files found in: {csv_folder}")

    labels = list(map(parse_label_from_filename, csv_files))

    # Build both directions of the mapping in one pass over the sorted names.
    label_to_id = {}
    id_to_label = {}
    for class_id, name in enumerate(sorted(set(labels))):
        label_to_id[name] = class_id
        id_to_label[class_id] = name

    return csv_files, labels, label_to_id, id_to_label


# Index the CSV folder once; the split below reuses file_paths and labels.
file_paths, labels, label_to_id, id_to_label = build_file_index(csv_folder)

# Quick sanity report: dataset size, class count and the first ten labels.
print("Total sequences:", len(file_paths))
print("Total classes  :", len(label_to_id))
print("Example labels :", list(label_to_id)[:10])

#### x.3.2 Train/Val/Test Split

In [None]:
def stratified_split(file_paths, labels, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1, seed=42):
    """Split sample indices into train/val/test, class by class.

    For every distinct label the sample indices are shuffled, the first
    ``round(n * train_ratio)`` go to train, the next ``round(n * val_ratio)``
    go to val, and whatever is left goes to test. Each split is shuffled
    once more before being returned.

    Args:
        file_paths: Not read by this function; only ``labels`` drives the split.
        labels: Sequence of class labels, one per sample.
        train_ratio: Fraction of each class assigned to train.
        val_ratio: Fraction of each class assigned to val.
        test_ratio: Only used to check that the three ratios sum to 1.
        seed: Seed for ``np.random.default_rng``.

    Returns:
        Dict with keys ``"train"``, ``"val"``, ``"test"``; each value is a
        list of integer indices into ``labels``.
    """
    assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-9

    generator = np.random.default_rng(seed)

    # Group sample positions by class, keeping first-seen class order.
    positions_by_class = defaultdict(list)
    for position, name in enumerate(labels):
        positions_by_class[name].append(position)

    buckets = {"train": [], "val": [], "test": []}

    for members in positions_by_class.values():
        order = np.array(members)
        generator.shuffle(order)

        total = len(order)
        train_end = int(round(total * train_ratio))
        val_end = train_end + int(round(total * val_ratio))

        # Test receives the remainder so the three parts always cover the class.
        buckets["train"].extend(order[:train_end].tolist())
        buckets["val"].extend(order[train_end:val_end].tolist())
        buckets["test"].extend(order[val_end:].tolist())

    # Mix the classes inside each split (train, then val, then test).
    for split_name in ("train", "val", "test"):
        generator.shuffle(buckets[split_name])

    return buckets


# Stratified 80/10/10 split of the indexed files. Each value in `splits` is a
# list of positions into file_paths/labels; stratified_split's default seed is used.
splits = stratified_split(file_paths, labels, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1)

# Report how many sequences landed in each split.
print("Train:", len(splits["train"]))
print("Val  :", len(splits["val"]))
print("Test :", len(splits["test"]))

#### x.3.3 PyTorch Dataset

In [None]:
class SignCSVDataset(Dataset):
    """Dataset that reads one CSV file per sample, on demand.

    Nothing is loaded at construction time. Each ``__getitem__`` call reads
    the CSV at the requested position with ``pd.read_csv``, pads or truncates
    it to ``pad_len`` rows and returns ``(X, y, length, filename)``.
    """

    def __init__(self, file_paths, labels, label_to_id, pad_len, n_features=118):
        """Store the sample index and padding settings.

        Args:
            file_paths: Sequence of ``Path`` objects, one per sample.
            labels: Label strings aligned with ``file_paths``.
            label_to_id: Mapping from label string to integer class id.
            pad_len: Number of rows every returned sequence has.
            n_features: Forwarded to ``pad_or_truncate_with_length``.
        """
        self.file_paths = file_paths
        self.labels = labels
        self.label_to_id = label_to_id
        self.pad_len = pad_len
        self.n_features = n_features

    def __len__(self):
        """Return the number of indexed CSV files."""
        return len(self.file_paths)

    def __getitem__(self, idx):
        """Load, pad and tensorise the sample at position ``idx``.

        Returns:
            Tuple ``(X, y, length, filename)``: ``X`` is a float32 tensor
            with ``pad_len`` rows, ``y`` the class id as a long tensor,
            ``length`` the number of original rows kept as a long tensor,
            and ``filename`` the CSV file name.
        """
        source = self.file_paths[idx]
        class_id = self.label_to_id[self.labels[idx]]

        # Read the whole file; read_csv's default header handling applies.
        frames = pd.read_csv(source).values

        padded, n_valid = pad_or_truncate_with_length(frames, self.pad_len, self.n_features)

        return (
            torch.from_numpy(padded),
            torch.tensor(class_id, dtype=torch.long),
            torch.tensor(n_valid, dtype=torch.long),
            source.name,
        )

#### x.3.4 Collate Function

In [None]:
def collate_fn(batch):
    """Merge a list of ``(X, y, length, filename)`` samples into one batch.

    Args:
        batch: Iterable of 4-tuples whose first three items are tensors
            and whose fourth is the file name.

    Returns:
        Tuple ``(X_batch, y_batch, lengths, filenames)`` where the three
        tensors are stacked along a new leading dimension and
        ``filenames`` is a plain list.
    """
    features, targets, seq_lengths, names = zip(*batch)

    return (
        torch.stack(features),     # (B, T, F)
        torch.stack(targets),      # (B,)
        torch.stack(seq_lengths),  # (B,)
        list(names),
    )

#### x.3.5 DataLoaders