## Package Management

In [None]:
from src.utils.helpers import is_installed, install_package, uninstall_package
# =========================
# Step 1: Package inventory
# =========================

# Distributions that are removed first (Step 2) when they are present.
conflicting_packages = [
    "torch",
    "torchvision",
    "torchaudio",
    "transformers",
    "evaluate",
    "pyarrow",
    "spacy",
    "pytorch_lightning",
]

# Distributions checked and installed in Step 3, in this exact order.
# Kept as a whitespace-separated block and split into a list of names.
required_packages = """
    absl-py aiohappyeyeballs aiohttp aiosignal annotated-types anyascii
    attrs blis catalogue certifi charset-normalizer click cloudpathlib
    colorama confection contractions cramjam cymem datasets decorator
    dill evaluate fastparquet filelock frozenlist fsspec hf-xet
    huggingface-hub idna imageio imageio-ffmpeg Jinja2 joblib kagglehub
    langcodes language_data lightning-utilities lxml marisa-trie markdown-it-py
    MarkupSafe mdurl moviepy mpmath multidict multiprocess murmurhash
    networkx nltk numpy packaging pandas pathlib pillow portalocker
    preshed proglog propcache pyahocorasick pyarrow pydantic pydantic_core
    Pygments python-dateutil python-dotenv pytorch-lightning pytz PyYAML
    regex requests rich rouge_score sacrebleu safetensors setuptools
    shellingham six smart_open spacy spacy-legacy spacy-loggers srsly
    sympy tabulate textsearch thinc tokenizers torch torchmetrics
    tqdm transformers typer typing-inspection typing_extensions tzdata
    urllib3 wasabi weasel wrapt xxhash yarl
""".split()

# Wheel of the small English spaCy pipeline, installed in Step 4 when
# `en_core_web_sm` cannot be imported.
spacy_model_url = (
    "https://github.com/explosion/spacy-models/releases/download/"
    "en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl"
)

# -------------------------
# Execution summary
# -------------------------
# Each list collects package names by outcome:
#   uninstalled       - conflicting packages that were found and removed
#   not_present       - conflicting packages that were not installed (no action)
#   installed         - required packages / spaCy model installed by this cell
#   already_installed - required packages / spaCy model that were already there
summary = {
    "uninstalled": [],
    "not_present": [],
    "installed": [],
    "already_installed": [],
}

# -------------------------
# Step 2: Uninstall conflicting packages
# -------------------------
for pkg in conflicting_packages:
    if is_installed(pkg):
        uninstall_package(pkg)
        summary["uninstalled"].append(pkg)
    else:
        # Nothing to remove. Recorded in its own bucket: filing an absent
        # package under "already_installed" would report the opposite of
        # what was observed.
        summary["not_present"].append(pkg)

# -------------------------
# Step 3: Install required packages
# -------------------------
# Runs after Step 2, so required packages that were just removed as
# conflicting are installed again here.
for pkg in required_packages:
    if is_installed(pkg):
        summary["already_installed"].append(pkg)
    else:
        install_package(pkg)
        summary["installed"].append(pkg)

# -------------------------
# Step 4: Install spaCy English model
# -------------------------
# The model is detected by importing it; only a failed import triggers the
# installation from the wheel URL.
try:
    import en_core_web_sm
except ImportError:
    install_package(spacy_model_url)
    summary["installed"].append("en_core_web_sm")
else:
    summary["already_installed"].append("en_core_web_sm")

# -------------------------
# Step 5: Print summary
# -------------------------
print("\n=== Package Installation Summary ===")
print(f"Uninstalled packages: {summary['uninstalled']}")
print(f"Conflicting packages not present: {summary['not_present']}")
print(f"Installed packages: {summary['installed']}")
print(f"Already installed/skipped: {summary['already_installed']}")


## Imports

In [2]:
# ======================
# Standard library
# ======================
from pathlib import Path
from datetime import datetime
import shutil
import json
import os
import random

# ======================
# Third-party libraries
# ======================
import jsonlines
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from transformers import AutoTokenizer, T5ForConditionalGeneration, T5TokenizerFast
import evaluate
from datasets import load_dataset
import kagglehub
import re
import cv2

# ======================
# Project utilities
# ======================
from src.utils.helpers import (
    get_next_run_id,
    save_run_metadata,
    add_step_to_metadata,
    add_artifact_to_metadata,
    Artifact,
    clean_text,
    install_package,
)
from src.utils.artifact_names import (
    ASLG_PC12_PARQUET,
    WLASL_JSON,
    ASLG_PC12_CLEAN_JSONL,
    GLOSS_TO_VIDEOID_MAP_JSON,
    ASLG_PC12_TOKENIZED_PT,
    ASLG_PC12_CLEAN_LIMIT_JSONL,
    PREDICTIONS_CSV,
    CONF_JSON,
    RUN_INFO_JSON,
    RAW_DATA_FOLDER,
    ASLG_FOLDER,
    WLASL_FOLDER,
    ARTIFACTS_FOLDER,
    #CHECKPOINTS_FOLDER,
)
from src.utils import logging
from src.preprocessing import data_preprocessing
from src.preprocessing import tokenize_aslg_pc12
from src.data_loading import prepare_dataloader
from src.data_loading.aslg_pc12_dataset import ASLGPC12Dataset
from src.models import train
from src.models.train import TextToGlossModel
from src.utils import metrics 

  from .autonotebook import tqdm as notebook_tqdm


## Load Data

In [None]:
# Fail fast with a clear message when the pipeline configuration is missing.
if not Path(CONF_JSON).exists():
    raise FileNotFoundError(f"{CONF_JSON} not found. Please provide it before running.")

with open(CONF_JSON, "r", encoding="utf-8") as f:
    conf = json.load(f)

# Pull dataset names/paths dynamically, fall back to defaults if not provided
wlasl_kaggle_path = conf.get("wlasl_kaggle_path", "sttaseen/wlasl2000-resized")
aslg_hf_name = conf.get("aslg_hf_name", "achrafothman/aslg_pc12")
# One revision for everything read from the Hub, so the copy saved to disk
# and the exported train parquet come from the same snapshot.
aslg_hf_revision = conf.get("aslg_hf_revision", "cb7cd272db8fcd4004ee04ddf50e194c15ea24d6")

# Local storage dirs. The WLASL folder is spelled "wlasl2000" (lower case)
# to match the path the preprocessing cell reads
# ({RAW_DATA_FOLDER}/wlasl2000/wlasl-complete/...); a different letter case
# is a different directory on case-sensitive filesystems.
# NOTE(review): assumes RAW_DATA_FOLDER is "data/raw" and that app.py uses the
# same lower-case spelling - confirm against src/utils/artifact_names.
wlasl_dest_dir = Path("data/raw/wlasl2000/wlasl-complete")
aslg_dest_dir = Path("data/raw/aslg_pc12/data")

wlasl_dest_dir.parent.mkdir(parents=True, exist_ok=True)
aslg_dest_dir.parent.mkdir(parents=True, exist_ok=True)

# === DOWNLOAD WLASL FROM KAGGLE ===
print(f"📥 Downloading WLASL dataset from Kaggle: {wlasl_kaggle_path}")
wlasl_cache = kagglehub.dataset_download(wlasl_kaggle_path)
# Copy out of the kagglehub download location into the pipeline's folder;
# dirs_exist_ok lets the cell be re-run over an existing copy.
shutil.copytree(wlasl_cache, wlasl_dest_dir, dirs_exist_ok=True)
print(f"✅ WLASL saved to {wlasl_dest_dir}")

# === DOWNLOAD ASLG FROM HUGGINGFACE ===
print(f"📥 Downloading ASLG dataset from Hugging Face: {aslg_hf_name}")
aslg_dataset = load_dataset(aslg_hf_name, revision=aslg_hf_revision)
aslg_dataset.save_to_disk(str(aslg_dest_dir))
print(f"✅ ASLG dataset saved to {aslg_dest_dir}")

# Export only the train split as parquet, reusing the dataset loaded above
# instead of loading it from the Hub a second time.
dataset = aslg_dataset["train"]

output_path = aslg_dest_dir / "train-00000-of-00001.parquet"
dataset.to_parquet(str(output_path))

print(f"✅ Train parquet saved to {output_path}")
print("🎉 All datasets downloaded and stored according to pipeline structure.")


## Preprocess Data

In [None]:
# === CONFIGURATION ===
# Inputs: raw files under RAW_DATA_FOLDER.
wlasl_json_path = Path(
    f"{RAW_DATA_FOLDER}/wlasl2000/wlasl-complete/{WLASL_JSON}"
)
aslg_pc12_parquet_path = Path(
    f"{RAW_DATA_FOLDER}/aslg_pc12/data/{ASLG_PC12_PARQUET}"
)

# Outputs: both files are written directly under ARTIFACTS_FOLDER.
gloss_video_map_path, aslg_pc12_cleaned_path = (
    Path(f"{ARTIFACTS_FOLDER}/{artifact_name}")
    for artifact_name in (GLOSS_TO_VIDEOID_MAP_JSON, ASLG_PC12_CLEAN_JSONL)
)

# === PRE-PROCESSING WLASL DATASET ===
# load_wlasl_data returns three values; only the third (gloss -> videos) is
# passed on to be saved as the mapping file.
wlasl_tables = data_preprocessing.load_wlasl_data(wlasl_json_path)
df_gloss, df_gloss_to_video, gloss_to_videos = wlasl_tables
data_preprocessing.save_gloss_video_mapping(gloss_to_videos, gloss_video_map_path)

# === PROCESSING ASLG-PC12 PARQUET -> CLEANED JSONL ===
data_preprocessing.preprocess_aslg_pc12(
    aslg_pc12_parquet_path,
    aslg_pc12_cleaned_path,
)

# Report where each artifact ended up.
print("✅ Gloss-video mapping saved to", gloss_video_map_path)
print("✅ Cleaned ASLG-PC12 saved to", aslg_pc12_cleaned_path)


## Limit Dataset

In [None]:

# Paths
aslg_pc12_cleaned_limit_path = Path(f"{ARTIFACTS_FOLDER}/{ASLG_PC12_CLEAN_LIMIT_JSONL}")

# Load gloss-to-video map; its keys are used as the set of glosses that
# have videos.
with open(gloss_video_map_path, "r", encoding="utf-8") as f:
    gloss_to_vids = json.load(f)

available_glosses = set(gloss_to_vids.keys())
print(f"Number of glosses with videos: {len(available_glosses)}")

# Filter JSONL: keep only records whose "gloss" value is one of the glosses
# with videos.
total_samples = 0
kept_samples = 0
invalid_lines = 0
filtered_samples = []

with jsonlines.open(aslg_pc12_cleaned_path) as reader:
    line_no = 0
    while True:
        line_no += 1
        # Read one line at a time so a malformed line can be skipped.
        # With `for item in reader`, the decode error is raised by the
        # iterator itself, outside any try block placed in the loop body,
        # and aborts the whole run.
        try:
            item = reader.read()
        except EOFError:
            # jsonlines signals end of input with EOFError.
            break
        except jsonlines.InvalidLineError as e:
            invalid_lines += 1
            print(f"Skipping invalid line {line_no}: {e}")
            continue

        total_samples += 1
        # Guard the lookups: a record that is not an object, or whose gloss
        # is not a string, cannot match a key of the gloss map.
        gloss = item.get("gloss") if isinstance(item, dict) else None
        if isinstance(gloss, str) and gloss in available_glosses:
            filtered_samples.append(item)
            kept_samples += 1

print(f"Total samples in original dataset: {total_samples}")
print(f"Samples kept after filtering: {kept_samples}")
if invalid_lines:
    print(f"Invalid lines skipped: {invalid_lines}")

# Ensure output folder exists
aslg_pc12_cleaned_limit_path.parent.mkdir(parents=True, exist_ok=True)

# Save filtered JSONL
with jsonlines.open(aslg_pc12_cleaned_limit_path, mode='w') as writer:
    writer.write_all(filtered_samples)

print(f"Filtered dataset saved to: {aslg_pc12_cleaned_limit_path}")


## Tokenize

In [None]:
# === TOKENIZING ===
# Call tokenize_aslg_pc12.main with an argv-style list: --input is the
# filtered JSONL path, --output is the target file under ARTIFACTS_FOLDER.
tokenized_path = Path(f"{ARTIFACTS_FOLDER}/{ASLG_PC12_TOKENIZED_PT}")

tokenize_cli_args = ["--input", str(aslg_pc12_cleaned_limit_path)]
tokenize_cli_args += ["--output", str(tokenized_path)]
tokenize_aslg_pc12.main(tokenize_cli_args)

## Prepare DataLoader

In [13]:
# === PREPARING DATA LOADER ===
# Settings passed to get_dataloader further down in this cell.
tokenized_path = Path(f"{ARTIFACTS_FOLDER}/{ASLG_PC12_TOKENIZED_PT}")
tokenizer_name, batch_size = "t5-small", 32
shuffle, num_workers = True, 2

# ==== FUNCTION ====
def get_dataloader(data_path: Path = None, tokenized_path: Path = None,
                   tokenizer_name='t5-small', batch_size=32,
                   shuffle=True, num_workers=2, pin_memory=None):
    """Build a DataLoader over an ``ASLGPC12Dataset``.

    Args:
        data_path: Forwarded to ``ASLGPC12Dataset`` as ``data_path``.
            NOTE(review): presumably the raw/cleaned data file - confirm
            against the dataset class.
        tokenized_path: Forwarded to ``ASLGPC12Dataset`` as
            ``tokenized_path``.
        tokenizer_name: Forwarded to ``ASLGPC12Dataset`` as
            ``tokenizer_name``.
        batch_size: Number of samples per batch.
        shuffle: Whether the DataLoader shuffles the samples.
        num_workers: Number of DataLoader worker processes.
        pin_memory: Whether batches are placed in pinned host memory.
            ``None`` (the default) enables it only when CUDA is available,
            which avoids the warning PyTorch emits when pinning is requested
            on a host without an accelerator. Pass ``True`` or ``False`` to
            force a value.

    Returns:
        A ``torch.utils.data.DataLoader`` wrapping the dataset.
    """
    if pin_memory is None:
        # Pinned memory only speeds up host-to-GPU copies.
        pin_memory = torch.cuda.is_available()

    dataset = ASLGPC12Dataset(
        data_path=data_path,
        tokenizer_name=tokenizer_name,
        tokenized_path=tokenized_path
    )
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        pin_memory=pin_memory
    )

# ==== BUILD DATA LOADER ====
# Normalise to a Path; a falsy value means "no pre-tokenized file".
if tokenized_path:
    tokenized_path = Path(tokenized_path)
else:
    tokenized_path = None

dataloader = get_dataloader(
    tokenized_path=tokenized_path,
    tokenizer_name=tokenizer_name,
    batch_size=batch_size,
    shuffle=shuffle,
    num_workers=num_workers,
)

# ==== QUICK CHECK ====
# Look at the first batch only, then stop iterating.
for batch in dataloader:
    print("Batch keys:", batch.keys())
    for field in ("input_ids", "labels"):
        print(f"{field} shape:", batch[field].shape)
    break

INFO:root:Loading pre-tokenized dataset from artifacts/aslg_pc12_tokenized.pt
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling para

Batch keys: dict_keys(['input_ids', 'attention_mask', 'labels'])
input_ids shape: torch.Size([32, 128])
labels shape: torch.Size([32, 128])


## Training

In [None]:
# Toggle: set to True to run training in this cell. When False, training is
# skipped and only the message below is printed.
run = False
checkpoint_dir = Path(f"{ARTIFACTS_FOLDER}/checkpoints")
# Created in both cases, so the folder exists even when training is skipped.
checkpoint_dir.mkdir(parents=True, exist_ok=True)
if run:
    # No try/except here: a handler that only re-raises adds nothing, so
    # training errors propagate with their original traceback.
    train.main()
else:
    print("Training not executed. Using Pre-trained model for inference.")

## Check Whether Checkpoints Contain Non-Zero Weights

In [3]:

# Gather every *.ckpt file in the checkpoints folder, most recently
# modified first.
checkpoint_dir = Path(f"{ARTIFACTS_FOLDER}/checkpoints")
checkpoint_files = list(checkpoint_dir.glob("*.ckpt"))
checkpoint_files.sort(key=lambda ckpt: ckpt.stat().st_mtime, reverse=True)

def is_state_dict_nonzero(state_dict):
    """Return True if at least one tensor in ``state_dict`` has a non-zero value.

    Args:
        state_dict: Mapping whose values are inspected; the keys are not used.

    Returns:
        ``True`` as soon as a tensor containing any non-zero element is
        found, ``False`` otherwise (including for an empty mapping).

    Entries that are not tensors are ignored. Without that guard a plain
    Python value makes ``value != 0`` a bool, which ``torch.any`` rejects
    with a ``TypeError``.
    """
    return any(
        bool(torch.any(value != 0))
        for value in state_dict.values()
        if torch.is_tensor(value)
    )

# Print a short report for every checkpoint file found above.
for ckpt_path in checkpoint_files:
    print(f"--- Inspecting: {ckpt_path.name} ---")
    # Load on CPU so no GPU is needed for the inspection.
    # NOTE(review): torch.load unpickles the file - only inspect checkpoints
    # from a trusted source.
    checkpoint = torch.load(ckpt_path, map_location='cpu')

    if 'state_dict' not in checkpoint:
        print("No state_dict found. Marked as DEAD")
    else:
        state_dict = checkpoint['state_dict']
        verdict = 'ALIVE' if is_state_dict_nonzero(state_dict) else 'DEAD'
        print(f"Checkpoint is {verdict}")
        print(f"state_dict contains {len(state_dict)} tensors")
        print("Sample state_dict keys:", list(state_dict)[:5])

    # Optional training-progress fields; printed only when present.
    for field, label in (('epoch', 'Epoch'), ('global_step', 'Global step')):
        if field in checkpoint:
            print(f"{label}: {checkpoint[field]}")

    print("\n")


--- Inspecting: text_to_gloss-epoch=epoch=02-val_loss=val_loss=0.0042.ckpt ---
Checkpoint is ALIVE
state_dict contains 134 tensors
Sample state_dict keys: ['model.shared.weight', 'model.encoder.embed_tokens.weight', 'model.encoder.block.0.layer.0.SelfAttention.q.weight', 'model.encoder.block.0.layer.0.SelfAttention.k.weight', 'model.encoder.block.0.layer.0.SelfAttention.v.weight']
Epoch: 2
Global step: 1539


--- Inspecting: text_to_gloss-epoch=epoch=03-val_loss=val_loss=0.0025.ckpt ---
Checkpoint is ALIVE
state_dict contains 134 tensors
Sample state_dict keys: ['model.shared.weight', 'model.encoder.embed_tokens.weight', 'model.encoder.block.0.layer.0.SelfAttention.q.weight', 'model.encoder.block.0.layer.0.SelfAttention.k.weight', 'model.encoder.block.0.layer.0.SelfAttention.v.weight']
Epoch: 3
Global step: 2052


--- Inspecting: text_to_gloss-epoch=epoch=04-val_loss=val_loss=0.0015.ckpt ---
Checkpoint is ALIVE
state_dict contains 134 tensors
Sample state_dict keys: ['model.shared.weig