# ASL Word Recognition V2 — CNN-BiLSTM + Transformer Ensemble (Max Accuracy)

**Ground-up redesign** for maximum word-level ASL accuracy using WLASL + MediaPipe two-hand landmarks.

### Architecture Highlights:

- **CNN-BiLSTM + TransformerBlock + TemporalAttention** — deep hybrid architecture
- **Sequence length 60** (2× the baseline) — more temporal context
- **Wrist-relative + scale normalization** — translation- and scale-invariant features
- **Categorical focal cross-entropy (γ=2)** — handles class imbalance far better than plain cross-entropy (CE)
- **Cosine Decay + Linear Warmup** LR schedule — stable convergence
- **Ensemble of 3 models**: CNN-BiLSTM, Pure Transformer, TCN
- **Aggressive augmentation**: noise, shift, frame dropout, scale, hand swap, rotation, speed perturbation
- **Mixed float16 precision** on GPU (global `mixed_float16` policy; the softmax output layer is kept in float32)

### Output Artifacts:

- `asl_word_lstm_v2_best.h5` — best single CNN-BiLSTM model
- `asl_word_ensemble_final.h5` — final ensemble model
- `scaler.pkl`, `encoder.pkl` — preprocessing objects
- `asl_word_classes_v2.csv` — class mapping

### Setup:

1. Upload metadata CSVs/JSONs and WLASL videos as Kaggle datasets (or set local paths in Cell 3)
2. Enable GPU accelerator
3. Run all cells in order


In [None]:
# ===============================================================
# CELL 1: INSTALL DEPENDENCIES & IMPORTS
# ===============================================================
import subprocess, sys

def pip_install(pkg):
    """Install ``pkg`` quietly with pip, using the interpreter running this notebook.

    Raises ``subprocess.CalledProcessError`` when pip exits with a non-zero status.
    """
    command = [sys.executable, '-m', 'pip', 'install', pkg, '-q']
    subprocess.check_call(command)

# MediaPipe is pinned to an exact version; joblib is used further down to
# persist the fitted scaler and label encoder.
pip_install('mediapipe==0.10.13')
pip_install('joblib')

import json, os, time, math, pickle
from pathlib import Path
from collections import Counter

import cv2
import mediapipe as mp
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.decomposition import PCA

from tensorflow.keras import mixed_precision
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (
    Input, Dense, Dropout, BatchNormalization, LayerNormalization,
    LSTM, Bidirectional, Conv1D, DepthwiseConv2D,
    GlobalAveragePooling1D, SpatialDropout1D,
    MultiHeadAttention, Add, Reshape, Lambda
)
from tensorflow.keras.callbacks import (
    ModelCheckpoint, EarlyStopping, ReduceLROnPlateau,
    TerminateOnNaN, CSVLogger
)
from tensorflow.keras.utils import to_categorical
from tqdm import tqdm

# Print the versions of the core libraries that were just imported.
print('=' * 65)
print('‚úÖ All libraries imported successfully')
print(f'   TensorFlow : {tf.__version__}')
print(f'   MediaPipe  : {mp.__version__}')
print(f'   NumPy      : {np.__version__}')
print('=' * 65)


In [None]:
# ===============================================================
# CELL 2: GPU CONFIGURATION & MIXED PRECISION SETUP
# ===============================================================
# Report the TensorFlow build and every device TensorFlow can see.
print('=' * 65)
print('üîç GPU DETECTION & MIXED PRECISION CONFIGURATION')
print('=' * 65)

print(f'\nTensorFlow  : {tf.__version__}')
print(f'Built w/CUDA: {tf.test.is_built_with_cuda()}')
print(f'All devices : {tf.config.list_physical_devices()}')

gpus = tf.config.list_physical_devices('GPU')
print(f'\nüéÆ GPU Devices: {len(gpus)}')

# CPU defaults; overwritten below only when a GPU is configured.
USE_GPU = False
DEVICE  = '/CPU:0'

if gpus:
    try:
        # Enable memory growth on every GPU, then make only the first one visible.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        tf.config.set_visible_devices(gpus[0], 'GPU')
        # NOTE: USE_GPU / DEVICE are switched to the GPU here, so a RuntimeError
        # raised later in this try block (e.g. by the benchmark) does not reset them.
        USE_GPU = True
        DEVICE  = '/GPU:0'
        print(f'‚úÖ GPU enabled: {gpus[0].name}')
        try:
            d = tf.config.experimental.get_device_details(gpus[0])
            print(f'   Device        : {d.get("device_name", "N/A")}')
            print(f'   Compute Cap.  : {d.get("compute_capability", "N/A")}')
        except Exception:
            # Device details are informational only; any failure here is ignored.
            pass

        # GPU benchmark: time one 4096x4096 matmul, including the copy back to host.
        print('\nüß™ GPU Benchmark (matmul 4096√ó4096)...')
        with tf.device('/GPU:0'):
            a = tf.random.normal([4096, 4096])
            t0 = time.time()
            c = tf.matmul(a, tf.transpose(a))
            _ = c.numpy()
            print(f'   ‚úÖ {time.time()-t0:.3f}s  |  result shape: {c.shape}')
    except RuntimeError as e:
        print(f'‚ö†Ô∏è  GPU config error: {e}')
else:
    print('‚ö†Ô∏è  No GPU ‚Äî CPU only (training will be slow)')

# Mixed precision: when a GPU was configured, the global Keras policy is set to
# 'mixed_float16'; on CPU, or if setting the policy fails, it is 'float32'.
# The policy is global, so it applies to every layer that does not override its dtype.
ENABLE_MIXED_PRECISION = USE_GPU  # only on GPU
if ENABLE_MIXED_PRECISION:
    try:
        mixed_precision.set_global_policy('mixed_float16')
        print(f'\n‚ö° Mixed precision: mixed_float16 enabled')
    except Exception as e:
        # Fall back to full precision rather than aborting the notebook.
        ENABLE_MIXED_PRECISION = False
        mixed_precision.set_global_policy('float32')
        print(f'\n‚ö†Ô∏è  Mixed precision failed ({e}), using float32')
else:
    mixed_precision.set_global_policy('float32')
    print(f'\nüìê Precision: float32')

print(f'\n‚úÖ Device ready: {DEVICE}')
print('=' * 65)


In [None]:
# ===============================================================
# CELL 3: PATH CONFIGURATION & HYPERPARAMETERS (MAX SETTINGS)
# ===============================================================
# Treat the run as a Kaggle kernel when the /kaggle/input directory exists.
IS_KAGGLE = os.path.exists('/kaggle/input')

if IS_KAGGLE:
    print('üîç Kaggle environment ‚Äî auto-searching for files...')
    KAGGLE_INPUT = Path('/kaggle/input')
    OUTPUT_DIR   = Path('/kaggle/working')
    try:
        # next() raises StopIteration when a file is not found anywhere under the input dir.
        SHARED_CSV  = next(KAGGLE_INPUT.rglob('shared_word_vocabulary.csv'))
        WLASL_JSON  = next(KAGGLE_INPUT.rglob('WLASL_v0.3.json'))
        NSLT_SPLIT  = next(KAGGLE_INPUT.rglob('nslt_2000.json'))
        MISSING_TXT = next(KAGGLE_INPUT.rglob('missing.txt'))
        # The first directory named 'videos' is used; None when there is none.
        video_dirs  = [p for p in KAGGLE_INPUT.rglob('videos') if p.is_dir()]
        VIDEO_DIR   = video_dirs[0] if video_dirs else None
    except StopIteration:
        print('‚ùå Missing files ‚Äî check dataset attachments')
        # Every input is reset to None, so the verification below reports each as missing.
        SHARED_CSV = WLASL_JSON = NSLT_SPLIT = MISSING_TXT = VIDEO_DIR = None
else:
    # -- LOCAL PATHS ----------------------------------------------
    PROJECT_ROOT = Path(r'D:\My_Graduation_Project')
    SHARED_CSV   = PROJECT_ROOT / 'shared_word_vocabulary.csv'
    WLASL_JSON   = PROJECT_ROOT / 'WLASL_v0.3.json'
    NSLT_SPLIT   = PROJECT_ROOT / 'nslt_2000.json'
    MISSING_TXT  = PROJECT_ROOT / 'missing.txt'
    VIDEO_DIR    = PROJECT_ROOT / 'videos'
    OUTPUT_DIR   = PROJECT_ROOT / 'ASL_Word_Output_V2'
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# -- HYPERPARAMETERS (MAXIMUM VIABLE SETTINGS) --------------------
SEQUENCE_LENGTH     = 60          # 2x baseline - richer temporal context
NUM_HANDS           = 2
LANDMARKS_PER_HAND  = 63          # 21 landmarks x 3 (x, y, z)
NUM_FEATURES        = NUM_HANDS * LANDMARKS_PER_HAND   # 126

# Model capacity
LSTM_UNITS_1        = 768
LSTM_UNITS_2        = 512
LSTM_UNITS_3        = 256
DENSE_UNITS         = 1024
CNN_FILTERS_1       = 256
CNN_FILTERS_2       = 256
CNN_FILTERS_3       = 512
NUM_TRANSFORMER_HEADS = 8
TRANSFORMER_FF_DIM  = 512
NUM_TRANSFORMER_BLOCKS = 3        # stacked transformer blocks

# Training
BATCH_SIZE          = 128 if USE_GPU else 32
EPOCHS              = 300
LEARNING_RATE       = 3e-4
WARMUP_EPOCHS       = 10
DROPOUT_RATE        = 0.4
SPATIAL_DROPOUT     = 0.2
LABEL_SMOOTH        = 0.05
GRAD_CLIP_NORM      = 0.5
L2_REG              = 5e-5
WEIGHT_DECAY        = 1e-4
FOCAL_GAMMA         = 2.0         # Focal loss gamma
TEST_SIZE           = 0.3         # held-out fraction, later halved into val/test (70/15/15)

# -- VERIFY -------------------------------------------------------
# A path counts as OK only when it is set (not None) and exists on disk.
print('\n‚îÄ‚îÄ PATH VERIFICATION ‚îÄ‚îÄ')
for name, path in [('Shared CSV', SHARED_CSV), ('WLASL JSON', WLASL_JSON),
                   ('NSLT Split', NSLT_SPLIT), ('Missing TXT', MISSING_TXT)]:
    ok = path and Path(path).exists()
    print(f'  {"‚úÖ" if ok else "‚ùå"} {name}: {path}')
ok_v = VIDEO_DIR and Path(VIDEO_DIR).exists()
print(f'  {"‚úÖ" if ok_v else "‚ùå"} Video dir : {VIDEO_DIR}')
print(f'  üìÅ Output dir : {OUTPUT_DIR}')

print('\n‚îÄ‚îÄ HYPERPARAMETERS ‚îÄ‚îÄ')
print(f'  Sequence length   : {SEQUENCE_LENGTH}')
print(f'  Features/frame    : {NUM_FEATURES}')
print(f'  LSTM units        : {LSTM_UNITS_1}/{LSTM_UNITS_2}/{LSTM_UNITS_3}')
print(f'  Dense units       : {DENSE_UNITS}')
print(f'  Transformer heads : {NUM_TRANSFORMER_HEADS}  (blocks: {NUM_TRANSFORMER_BLOCKS})')
print(f'  Batch size        : {BATCH_SIZE}  |  Epochs: {EPOCHS}')
print(f'  LR                : {LEARNING_RATE}  (warmup {WARMUP_EPOCHS} ep)')
print(f'  Dropout           : {DROPOUT_RATE}  |  Focal Œ≥: {FOCAL_GAMMA}')


In [None]:
# ===============================================================
# CELL 4: LOAD VOCABULARY & WLASL METADATA
# ===============================================================
# Vocabulary: keep only the rows that map onto a WLASL class id.
vocab_df = pd.read_csv(SHARED_CSV).dropna(subset=['wlasl_class'])
vocab_df['wlasl_class'] = vocab_df['wlasl_class'].astype(int)

# Lookup tables: WLASL class -> word id, and word id -> English gloss.
matched_wlasl_classes = set(vocab_df['wlasl_class'].tolist())
wlasl_to_wordid = dict(zip(vocab_df['wlasl_class'], vocab_df['word_id']))
id_to_english   = dict(zip(vocab_df['word_id'].astype(int), vocab_df['english']))

# Metadata files: split assignment, ids of missing videos, full WLASL index.
with open(NSLT_SPLIT, 'r', encoding='utf-8') as fh:
    nslt = json.load(fh)
with open(MISSING_TXT, 'r', encoding='utf-8') as fh:
    missing_ids = {line.strip() for line in fh if line.strip()}
with open(WLASL_JSON, 'r', encoding='utf-8') as fh:
    wlasl_data = json.load(fh)

# Collect every video instance that is in the split file, is not listed as
# missing, and belongs to a class present in the shared vocabulary.
download_list = []
for entry in wlasl_data:
    gloss = entry.get('gloss', '')
    for instance in entry.get('instances', []):
        vid = instance.get('video_id')
        usable = bool(vid) and vid in nslt and vid not in missing_ids
        if not usable:
            continue
        split_info = nslt[vid]
        class_id = int(split_info['action'][0])
        if class_id in matched_wlasl_classes:
            download_list.append({
                'video_id': vid,
                'url':       instance.get('url'),
                'class_id':  class_id,
                'word_id':   int(wlasl_to_wordid[class_id]),
                'gloss':     gloss,
                'subset':    split_info['subset'],
            })

dl_df = pd.DataFrame(download_list)
print(f'üì• Total video candidates : {len(dl_df)}')
print(f'üè∑Ô∏è  Unique WLASL classes   : {dl_df["class_id"].nunique()}')
print(f'üìñ Vocabulary size         : {len(vocab_df)} words')

# The category column is optional in the vocabulary CSV.
if 'category' in vocab_df.columns:
    cat_counts = vocab_df['category'].value_counts()
    print(f'\nüìã Category breakdown:\n{cat_counts.to_string()}')

subset_counts = dl_df['subset'].value_counts()
print(f'\nüìÇ Subset breakdown:\n{subset_counts.to_string()}')


In [None]:
# ===============================================================
# CELL 5: ENHANCED TWO-HAND LANDMARK EXTRACTION WITH NORMALIZATION
# ===============================================================
# Features: wrist-relative + scale normalization for translation/scale invariance.
# Quality filter: a video is skipped when fewer than 30% of its frames contain a detected hand.
# Sequence length = 60 frames (2x baseline).
# The extracted arrays are cached in asl_word_sequences_2hand_v2.npz; extraction is
# skipped when that file already exists.

NPZ_PATH = OUTPUT_DIR / 'asl_word_sequences_2hand_v2.npz'

def normalize_hand(landmarks_21x3: np.ndarray) -> np.ndarray:
    """
    Make one hand's landmarks translation- and scale-invariant.

    The wrist (landmark 0) is moved to the origin, then every point is divided
    by the wrist -> middle-finger-MCP (landmark 9) distance. When that distance
    is not above 1e-6 the points are only translated, not scaled.

    landmarks_21x3: array of shape (21, 3)
    Returns: flattened vector of shape (63,)
    """
    origin = landmarks_21x3[0]
    centered = landmarks_21x3 - origin          # new array; the input is not modified

    palm_length = np.linalg.norm(centered[9])
    scaled = centered / palm_length if palm_length > 1e-6 else centered
    return scaled.flatten()

# Load the cached dataset when it exists; otherwise run MediaPipe over every
# video in VIDEO_DIR, build X / y and write the cache.
if NPZ_PATH.exists():
    print(f'‚è© Dataset already exists ‚Äî loading from cache')
    data = np.load(NPZ_PATH)
    X, y = data['X'], data['y']
    print(f'   X: {X.shape} | y: {y.shape} | classes: {len(np.unique(y))}')
else:
    mp_hands = mp.solutions.hands
    # One detector instance is shared by all frames of all videos.
    detector = mp_hands.Hands(
        static_image_mode=False,
        max_num_hands=2,
        min_detection_confidence=0.5,
        min_tracking_confidence=0.5
    )

    def extract_frame(frame):
        """Return (features, saw_hand) for one BGR frame.

        features is the concatenation [left hand (63), right hand (63)]; a hand
        that was not detected stays all-zero. A hand labelled 'Left' fills the
        left slot and any other label fills the right slot, so if two detections
        share a label the later one overwrites the earlier one.
        saw_hand is truthy when at least one of the two vectors is non-zero.
        """
        rgb    = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        result = detector.process(rgb)
        left_v  = np.zeros(LANDMARKS_PER_HAND, dtype=np.float32)
        right_v = np.zeros(LANDMARKS_PER_HAND, dtype=np.float32)
        if result.multi_hand_landmarks and result.multi_handedness:
            for lm, hd in zip(result.multi_hand_landmarks, result.multi_handedness):
                pts = np.array([[p.x, p.y, p.z] for p in lm.landmark])  # (21, 3)
                vec = normalize_hand(pts)
                if hd.classification[0].label == 'Left':
                    left_v = vec
                else:
                    right_v = vec
        return np.concatenate([left_v, right_v]), (left_v.any() or right_v.any())

    def extract_video(path):
        """Return (sequence, detect_rate) for one video file.

        sequence has shape (SEQUENCE_LENGTH, NUM_FEATURES); detect_rate is the
        fraction of decoded frames in which a hand was seen. Returns (None, 0)
        when the file cannot be opened or yields no frames.
        """
        cap  = cv2.VideoCapture(str(path))
        if not cap.isOpened():
            return None, 0
        frames, detected = [], 0
        while True:
            ret, f = cap.read()
            if not ret:
                break
            vec, saw = extract_frame(f)
            frames.append(vec)
            if saw:
                detected += 1
        cap.release()
        if not frames:
            return None, 0
        arr = np.array(frames, dtype=np.float32)
        detect_rate = detected / len(arr)

        # Bring the clip to SEQUENCE_LENGTH frames: long clips are subsampled at
        # evenly spaced frame indices (no interpolation between frames); short
        # clips are zero-padded at the end.
        if len(arr) >= SEQUENCE_LENGTH:
            idx = np.linspace(0, len(arr) - 1, SEQUENCE_LENGTH, dtype=int)
            arr = arr[idx]
        else:
            pad = np.zeros((SEQUENCE_LENGTH - len(arr), NUM_FEATURES), dtype=np.float32)
            arr = np.concatenate([arr, pad], axis=0)
        return arr, detect_rate

    # Videos are matched to metadata by file stem == video_id.
    meta_by_id  = {d['video_id']: d for d in download_list}
    video_files = sorted(Path(VIDEO_DIR).glob('*.mp4'))
    print(f'üìÅ Found {len(video_files)} video files')

    X_list, y_list = [], []
    skipped_missing, skipped_quality = 0, 0
    t0 = time.time()

    for vf in tqdm(video_files, desc='Extracting landmarks'):
        vid = vf.stem
        if vid not in meta_by_id:
            skipped_missing += 1
            continue
        seq, dr = extract_video(vf)
        if seq is None or dr < 0.30:   # strict: require hands in >= 30% of frames
            skipped_quality += 1
            continue
        X_list.append(seq)
        y_list.append(meta_by_id[vid]['word_id'])

    detector.close()
    X = np.array(X_list, dtype=np.float32)
    y = np.array(y_list,  dtype=np.int32)

    elapsed = time.time() - t0
    print(f'\nüìä X: {X.shape} | y: {y.shape} | classes: {len(np.unique(y))}')
    print(f'‚è±Ô∏è  {elapsed:.0f}s ({elapsed/60:.1f} min)')
    print(f'   Skipped (not in vocab): {skipped_missing}')
    print(f'   Skipped (low quality) : {skipped_quality}')

    np.savez_compressed(NPZ_PATH, X=X, y=y)
    print(f'üíæ Saved ‚Üí {NPZ_PATH}')


In [None]:
# ===============================================================
# CELL 6: ADVANCED DATA EXPLORATION & QUALITY ANALYSIS
# ===============================================================
print('=' * 65)
print('üìä DATA EXPLORATION')
print('=' * 65)

# Reload from the cache if X / y are not defined in this session.
if 'X' not in dir() or 'y' not in dir():
    data = np.load(NPZ_PATH)
    X, y = data['X'], data['y']

# Per-class sample counts, with class names sorted from most to least frequent.
unique_ids, counts = np.unique(y, return_counts=True)
word_names = [id_to_english.get(int(uid), str(uid)) for uid in unique_ids]
sort_idx   = np.argsort(counts)[::-1]
s_names    = [word_names[i] for i in sort_idx]
s_counts   = counts[sort_idx]

# --- Plot 1: Class distribution ---
fig, ax = plt.subplots(figsize=(26, 7))
ax.bar(range(len(s_names)), s_counts, color='steelblue', edgecolor='navy', linewidth=0.3)
ax.set_xticks(range(len(s_names)))
ax.set_xticklabels(s_names, rotation=90, fontsize=5.5)
ax.axhline(np.mean(s_counts),   color='red',    ls='--', alpha=0.7, label=f'Mean: {np.mean(s_counts):.1f}')
ax.axhline(np.median(s_counts), color='orange', ls=':',  alpha=0.7, label=f'Median: {np.median(s_counts):.1f}')
ax.set_title(f'Class Distribution ‚Äî {len(unique_ids)} classes, {len(y)} samples | seq={SEQUENCE_LENGTH}', fontsize=14)
ax.set_xlabel('Word'); ax.set_ylabel('Samples'); ax.legend()
plt.tight_layout(); plt.show()

# --- Plot 2: Quality ---
fig, axes = plt.subplots(1, 3, figsize=(22, 5))
axes[0].hist(s_counts, bins=25, color='steelblue', edgecolor='navy', alpha=0.85)
axes[0].axvline(np.mean(s_counts), color='red', ls='--', label=f'Mean: {np.mean(s_counts):.1f}')
axes[0].set_title('Samples per Class'); axes[0].legend()

# For the first `sample_n` sequences, measure per hand the percentage of frames
# whose features are all zero, and the percentage of frames where both hands are non-zero.
left_zero, right_zero, both_rate = [], [], []
sample_n = min(len(X), 1000)
for i in range(sample_n):
    l_sum = np.sum(np.abs(X[i, :, :LANDMARKS_PER_HAND]), axis=1)
    r_sum = np.sum(np.abs(X[i, :, LANDMARKS_PER_HAND:]), axis=1)
    left_zero.append(np.sum(l_sum == 0) / SEQUENCE_LENGTH * 100)
    right_zero.append(np.sum(r_sum == 0) / SEQUENCE_LENGTH * 100)
    l_act = l_sum != 0;  r_act = r_sum != 0
    both_rate.append(np.sum(l_act & r_act) / SEQUENCE_LENGTH * 100)

axes[1].hist(left_zero,  bins=20, alpha=0.7, color='#2196F3', label='Left')
axes[1].hist(right_zero, bins=20, alpha=0.7, color='#FF9800', label='Right')
axes[1].set_title('Zero-Frame Rate per Hand'); axes[1].legend()

axes[2].hist(both_rate, bins=20, color='#4CAF50', edgecolor='darkgreen', alpha=0.85)
axes[2].set_title(f'Two-Hand Detection (mean: {np.mean(both_rate):.1f}%)')
plt.tight_layout(); plt.show()

# --- Plot 3: PCA of mean sequence per class ---
# Each class is reduced to the mean of its feature vectors over all samples and
# frames, projected to 2-D with PCA, and coloured by vocabulary category.
if 'category' in vocab_df.columns:
    cat_map = dict(zip(vocab_df['word_id'].astype(int), vocab_df['category']))
    mean_vecs, cat_labels = [], []
    for uid in unique_ids:
        mask = y == uid
        mean_vecs.append(X[mask].mean(axis=(0, 1)))
        cat_labels.append(cat_map.get(int(uid), 'other'))
    pca = PCA(n_components=2)
    pts = pca.fit_transform(np.array(mean_vecs))
    # Sorted so that colours and legend order are the same on every run
    # (iteration order of a plain set is not stable across sessions).
    cats_unique = sorted(set(cat_labels), key=str)
    # plt.cm.get_cmap was removed in Matplotlib 3.9; plt.get_cmap works on old and new versions.
    cmap_pca = plt.get_cmap('tab10', len(cats_unique))
    fig, ax = plt.subplots(figsize=(10, 8))
    for ci, cat in enumerate(cats_unique):
        mask = np.array([c == cat for c in cat_labels], dtype=bool)
        ax.scatter(pts[mask, 0], pts[mask, 1], color=cmap_pca(ci), label=cat, s=30, alpha=0.7)
    ax.set_title('PCA of Mean Landmark Vectors per Class (colored by category)', fontsize=13)
    ax.legend(fontsize=9, ncol=2); ax.grid(True, alpha=0.3)
    plt.tight_layout(); plt.show()
    ev = pca.explained_variance_ratio_
    print(f'PCA explained variance: PC1={ev[0]*100:.1f}%  PC2={ev[1]*100:.1f}%')

# Text summary of the dataset; word_names is aligned with counts, so
# argmin/argmax index straight into it.
print(f'\nüìä Summary:')
print(f'   Total samples     : {len(y)}')
print(f'   Classes           : {len(unique_ids)}')
print(f'   Sequence length   : {SEQUENCE_LENGTH}')
print(f'   Features/frame    : {NUM_FEATURES}')
print(f'   Min samples/class : {counts.min()} ({word_names[counts.argmin()]})')
print(f'   Max samples/class : {counts.max()} ({word_names[counts.argmax()]})')
print(f'   Mean              : {counts.mean():.1f}')
print(f'   Median            : {np.median(counts):.1f}')


In [None]:
# ===============================================================
# CELL 7: PREPROCESSING, AUGMENTATION & DATA SPLITS
# ===============================================================
print('=' * 65)
print('üîß PREPROCESSING & TRAIN/VAL/TEST SPLIT')
print('=' * 65)

# 1. Reload the raw (unscaled) sequences
data = np.load(NPZ_PATH)
X, y = data['X'], data['y']
orig_shape  = X.shape

# 2. Label encoding
encoder     = LabelEncoder()
y_encoded   = encoder.fit_transform(y)
num_classes = len(encoder.classes_)
y_onehot    = to_categorical(y_encoded, num_classes=num_classes)
joblib.dump(encoder, OUTPUT_DIR / 'encoder.pkl')
print(f'  ‚úÖ LabelEncoder saved  ({num_classes} classes)')

# 3. Stratified 70/15/15 split (done on the raw data, before any scaling)
try:
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y_onehot, test_size=TEST_SIZE, random_state=42, stratify=y_encoded)
    tl = np.argmax(y_temp, axis=1)
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.5, random_state=42, stratify=tl)
except ValueError:
    # Stratification fails when a class has too few samples to appear in every split.
    print('  ‚ö†Ô∏è  Falling back to random split')
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y_onehot, test_size=TEST_SIZE, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.5, random_state=42)


def _scale_sequences(seqs, fitted_scaler):
    """Apply an already-fitted scaler frame-wise to an (N, T, F) array; returns float32."""
    flat = seqs.reshape(-1, NUM_FEATURES)
    return fitted_scaler.transform(flat).reshape(seqs.shape).astype(np.float32)


# 4. Standardize. The scaler is fitted on TRAINING frames only, so no statistics
#    from the validation/test sets leak into preprocessing; the same fitted
#    scaler is then applied to every split (and is the one saved for inference).
scaler      = StandardScaler()
scaler.fit(X_train.reshape(-1, NUM_FEATURES))
X_train     = _scale_sequences(X_train, scaler)
X_temp      = _scale_sequences(X_temp, scaler)
X_val       = _scale_sequences(X_val, scaler)
X_test      = _scale_sequences(X_test, scaler)
X_flat      = scaler.transform(X.reshape(-1, NUM_FEATURES))
X           = X_flat.reshape(orig_shape).astype(np.float32)
joblib.dump(scaler, OUTPUT_DIR / 'scaler.pkl')
print(f'  ‚úÖ StandardScaler fitted & saved  (shape: {X.shape})')

# 5. Class weights ('balanced', clipped to 0.3-8.0 for extreme imbalance).
#    Classes absent from the training split keep a neutral weight of 1.0.
train_lbl       = np.argmax(y_train, axis=1)
present_cls     = np.unique(train_lbl)
cw_vals         = compute_class_weight('balanced', classes=present_cls, y=train_lbl)
cw_vals         = np.clip(cw_vals, 0.3, 8.0)
class_weights   = {i: 1.0 for i in range(num_classes)}
for i, ci in enumerate(present_cls):
    class_weights[ci] = float(cw_vals[i])

print(f'\n  üìä Split:')
print(f'     Classes     : {num_classes}')
print(f'     Train       : {X_train.shape[0]:>6}  ({X_train.shape[0]/len(X)*100:.0f}%)')
print(f'     Validation  : {X_val.shape[0]:>6}  ({X_val.shape[0]/len(X)*100:.0f}%)')
print(f'     Test        : {X_test.shape[0]:>6}  ({X_test.shape[0]/len(X)*100:.0f}%)')
print(f'     Input shape : {X_train.shape[1:]}')
print(f'     Max CW      : {max(class_weights.values()):.2f}')

# 5. Augmentation function (rich pipeline)
@tf.function
def augment_sequence(x, y_label):
    """
    Randomly augment one landmark sequence; the label is passed through unchanged.

    x is reshaped below as (SEQUENCE_LENGTH, NUM_FEATURES), so it must have
    exactly that many elements. Steps, applied in this order:
    1. Gaussian noise (std=0.003)
    2. Temporal shift of -5..+5 frames (circular roll)
    3. Frame dropout: each frame is zeroed with probability ~8%
    4. Random global scaling 0.8-1.2
    5. Left/right hand block swap (25% probability)
    6. Time-axis resize by a 0.8-1.2 factor, then resample back to SEQUENCE_LENGTH
    7. In-plane rotation of the x/y coordinates by up to +/-15 degrees

    Returns: (augmented x, y_label)
    """
    # 1. Noise
    x = x + tf.random.normal(tf.shape(x), stddev=0.003)

    # 2. Temporal shift
    # NOTE(review): tf.roll is circular, so frames pushed past one end re-enter
    # at the other end of the sequence - confirm this wrap-around is intended.
    shift = tf.random.uniform([], -5, 6, dtype=tf.int32)
    x = tf.roll(x, shift=shift, axis=0)

    # 3. Frame dropout
    # Whole frames are multiplied by 0; if x was standardized upstream, 0 is the
    # feature mean rather than "no hand" - TODO confirm against the caller.
    mask = tf.cast(tf.random.uniform([SEQUENCE_LENGTH, 1]) > 0.08, tf.float32)
    x = x * mask

    # 4. Scale
    scale = tf.random.uniform([], 0.80, 1.20)
    x = x * scale

    # 5. Hand swap
    # NOTE(review): the two 63-feature blocks are exchanged as-is; the x
    # coordinates are not mirrored - confirm this is the intended augmentation.
    def swap_hands():
        left  = x[:, :LANDMARKS_PER_HAND]
        right = x[:, LANDMARKS_PER_HAND:]
        return tf.concat([right, left], axis=1)
    x = tf.cond(tf.random.uniform([]) < 0.25, swap_hands, lambda: x)

    # 6. Speed perturbation: squeeze/stretch the time axis
    # NOTE(review): the sequence is resized to new_len frames and then sampled
    # evenly over its FULL new length back to SEQUENCE_LENGTH, so the output
    # still spans the whole input; this looks closer to temporal smoothing than
    # to a tempo change - confirm intent.
    new_len = tf.cast(
        tf.cast(SEQUENCE_LENGTH, tf.float32) * tf.random.uniform([], 0.80, 1.20),
        tf.int32)
    new_len = tf.clip_by_value(new_len, SEQUENCE_LENGTH // 2, SEQUENCE_LENGTH * 2)
    x_exp   = tf.expand_dims(tf.expand_dims(x, 0), 3)     # (1, T, F, 1)
    x_res   = tf.image.resize(x_exp, [new_len, NUM_FEATURES])  # resize along time
    x_res   = tf.squeeze(x_res, axis=[0, 3])               # (new_len, F)
    # Resample back to SEQUENCE_LENGTH (fractional positions are truncated to int)
    indices = tf.cast(
        tf.linspace(0.0, tf.cast(new_len - 1, tf.float32), SEQUENCE_LENGTH), tf.int32)
    x = tf.gather(x_res, indices)

    # 7. In-plane rotation of x/y coords (first 2 of every 3 features)
    theta = tf.random.uniform([], -math.pi/12, math.pi/12)   # +/-15 degrees
    cos_t = tf.cos(theta);  sin_t = tf.sin(theta)
    # reshape into (T, num_pts, 3) and rotate x,y; z is left untouched
    num_pts = NUM_FEATURES // 3
    x3 = tf.reshape(x, [SEQUENCE_LENGTH, num_pts, 3])
    xs, ys, zs = x3[:, :, 0], x3[:, :, 1], x3[:, :, 2]
    xr = xs * cos_t - ys * sin_t
    yr = xs * sin_t + ys * cos_t
    x3 = tf.stack([xr, yr, zs], axis=2)
    x  = tf.reshape(x3, [SEQUENCE_LENGTH, NUM_FEATURES])

    return x, y_label

print('  ‚úÖ Augmentation pipeline defined')


In [None]:
# ===============================================================
# CELL 8: CUSTOM ARCHITECTURE ‚Äî CNN-BiLSTM + TRANSFORMER + ATTENTION
# ===============================================================
print('=' * 65)
print('üèóÔ∏è  BUILDING CNN-BiLSTM + TRANSFORMER ENSEMBLE BACKBONE')
print('=' * 65)

# Reset Keras global state before the model is built below.
tf.keras.backend.clear_session()

# ‚îÄ‚îÄ‚îÄ Custom Layers ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ

class TransformerBlock(tf.keras.layers.Layer):
    """Transformer encoder block: Multi-Head Self-Attention + FFN + residual.

    Post-norm layout: ``LN(x + Dropout(MHA(x, x)))`` followed by
    ``LN(x + Dropout(FFN(x)))``, where the FFN is Dense(ff_dim, gelu) -> Dense(embed_dim).

    Args:
        embed_dim: size of the last axis of the input (and of the output).
        num_heads: number of attention heads; each head uses ``embed_dim // num_heads`` dims.
        ff_dim: hidden width of the feed-forward network.
        dropout: rate used inside attention and after both sub-layers.
    """
    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments on the layer itself so get_config() does
        # not depend on attribute names of the wrapped Keras layers, which differ
        # between Keras versions.
        self.embed_dim    = int(embed_dim)
        self.num_heads    = int(num_heads)
        self.ff_dim       = int(ff_dim)
        self.dropout_rate = float(dropout)

        self.attn  = MultiHeadAttention(num_heads=self.num_heads,
                                         key_dim=self.embed_dim // self.num_heads,
                                         dropout=self.dropout_rate)
        self.ffn1  = Dense(self.ff_dim,    activation='gelu')
        self.ffn2  = Dense(self.embed_dim)
        self.ln1   = LayerNormalization(epsilon=1e-6)
        self.ln2   = LayerNormalization(epsilon=1e-6)
        self.drop1 = Dropout(self.dropout_rate)
        self.drop2 = Dropout(self.dropout_rate)

    def call(self, x, training=False):
        # Self-attention sub-layer with residual connection.
        attn_out = self.attn(x, x, training=training)
        attn_out = self.drop1(attn_out, training=training)
        x = self.ln1(x + attn_out)
        # Position-wise feed-forward sub-layer with residual connection.
        ffn_out = self.ffn2(self.ffn1(x))
        ffn_out = self.drop2(ffn_out, training=training)
        return self.ln2(x + ffn_out)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created on load."""
        cfg = super().get_config()
        cfg.update(dict(
            embed_dim=self.embed_dim,
            num_heads=self.num_heads,
            ff_dim=self.ff_dim,
            dropout=self.dropout_rate
        ))
        return cfg


class TemporalAttention(tf.keras.layers.Layer):
    """Weighted temporal pooling over the time axis.

    Scores every time step with ``tanh(x @ W + b)``, turns the scores into
    weights with a softmax over time, and returns the weighted sum of the
    inputs: (B, T, F) -> (B, F). The bias has one entry per time step, so the
    layer is tied to the sequence length it was built with.
    """
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        # `name` is passed by keyword: Keras 2 takes it as the first positional
        # argument of add_weight, Keras 3 takes `shape` first.
        self.W = self.add_weight(name='W', shape=(input_shape[-1], 1),
                                 initializer='glorot_uniform', trainable=True)
        self.b = self.add_weight(name='b', shape=(input_shape[1], 1),
                                 initializer='zeros',          trainable=True)

    def call(self, x):
        e = tf.nn.tanh(tf.matmul(x, self.W) + self.b)   # (B, T, 1)
        a = tf.nn.softmax(e, axis=1)                     # (B, T, 1)
        return tf.reduce_sum(x * a, axis=1)              # (B, F)

    def get_config(self):
        # No constructor arguments beyond the base Layer ones.
        return super().get_config()


# ‚îÄ‚îÄ‚îÄ Model Builder ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ

def build_main_model(seq_len, num_feat, n_classes):
    """
    Build the CNN-BiLSTM + Transformer encoder + temporal-attention classifier.

    Stage 1: Local motion extractor  (3x Conv1D -> BatchNorm -> GELU)
    Stage 2: Temporal sequence model (3x Bi-LSTM -> BatchNorm)
    Stage 3: Global context          (NUM_TRANSFORMER_BLOCKS Transformer blocks)
    Stage 4: Attention pooling over time
    Stage 5: Dense classifier head with a float32 softmax output

    Takes the sequence length, the per-frame feature count and the number of
    classes; returns an uncompiled Keras ``Model``.
    """
    reg = tf.keras.regularizers.l2(L2_REG)
    inp = Input(shape=(seq_len, num_feat), name='landmarks')

    # Stage 1: Conv1D feature extractor
    x = inp
    for n_filters in (CNN_FILTERS_1, CNN_FILTERS_2, CNN_FILTERS_3):
        x = Conv1D(n_filters, 3, padding='same', use_bias=False)(x)
        x = BatchNormalization()(x)
        x = tf.keras.layers.Activation('gelu')(x)
    x = SpatialDropout1D(SPATIAL_DROPOUT)(x)

    # Stage 2: BiLSTM stack (spatial dropout after every block except the last)
    lstm_sizes = (LSTM_UNITS_1, LSTM_UNITS_2, LSTM_UNITS_3)
    for level, units in enumerate(lstm_sizes, start=1):
        recurrent = LSTM(units, return_sequences=True,
                         recurrent_dropout=0.0,
                         kernel_regularizer=reg)
        x = Bidirectional(recurrent, name=f'bilstm_{level}')(x)
        x = BatchNormalization()(x)
        if level < len(lstm_sizes):
            x = SpatialDropout1D(SPATIAL_DROPOUT)(x)

    # Stage 3: Transformer encoder blocks
    embed_dim = x.shape[-1]   # 2 x LSTM_UNITS_3 (bidirectional)
    for block_idx in range(NUM_TRANSFORMER_BLOCKS):
        block = TransformerBlock(
            embed_dim=embed_dim,
            num_heads=NUM_TRANSFORMER_HEADS,
            ff_dim=TRANSFORMER_FF_DIM,
            dropout=DROPOUT_RATE * 0.5,
            name=f'transformer_{block_idx}'
        )
        x = block(x)

    # Stage 4: Attention pooling
    x = TemporalAttention(name='temporal_attention')(x)
    x = Dropout(DROPOUT_RATE)(x)

    # Stage 5: Dense head (two Dense -> BatchNorm -> GELU -> Dropout stanzas)
    head_spec = (
        (DENSE_UNITS,      DROPOUT_RATE),
        (DENSE_UNITS // 2, DROPOUT_RATE * 0.5),
    )
    for width, drop_rate in head_spec:
        x = Dense(width, use_bias=False, kernel_regularizer=reg)(x)
        x = BatchNormalization()(x)
        x = tf.keras.layers.Activation('gelu')(x)
        x = Dropout(drop_rate)(x)

    out = Dense(n_classes, activation='softmax', dtype='float32', name='output')(x)

    return Model(inp, out, name='ASL_CNN_BiLSTM_Transformer_V2')


# Build the model under the device selected during GPU configuration.
with tf.device(DEVICE):
    main_model = build_main_model(SEQUENCE_LENGTH, NUM_FEATURES, num_classes)

# ─── Focal loss ──────────────────────────────────────────────────
# Manually implemented so it works on all TF versions
def categorical_focal_loss(gamma=2.0, label_smoothing=LABEL_SMOOTH):
    """Build a categorical focal loss with label smoothing.

    Args:
        gamma: Exponent applied to ``(1 - p_t)``; larger values down-weight
            samples that are already predicted with high probability.
        label_smoothing: Fraction of the target mass spread uniformly over
            all classes before the loss is computed.

    Returns:
        A ``loss_fn(y_true, y_pred)`` callable taking one-hot targets and
        predicted probabilities. It returns ONE loss value per sample
        (the class axis is reduced, the batch axis is kept).
    """
    def loss_fn(y_true, y_pred):
        # Keep probabilities away from 0 so log() stays finite.
        y_pred = tf.clip_by_value(y_pred, 1e-7, 1.0)
        # Targets may arrive in another float/int dtype than the predictions.
        y_true = tf.cast(y_true, y_pred.dtype)

        # Label smoothing
        n_cls = tf.cast(tf.shape(y_true)[-1], y_pred.dtype)
        y_true = y_true * (1 - label_smoothing) + label_smoothing / n_cls

        ce = -tf.reduce_sum(y_true * tf.math.log(y_pred), axis=-1)
        p_t = tf.reduce_sum(y_true * y_pred, axis=-1)
        focal_w = tf.pow(1.0 - p_t, gamma)

        # Do NOT reduce over the batch here: Keras multiplies the returned
        # values by the per-sample weights derived from `class_weight` /
        # `sample_weight` and then averages. Returning a pre-averaged scalar
        # would make every sample share the same loss value, so the weights
        # could no longer emphasise individual (minority-class) samples.
        return focal_w * ce
    return loss_fn

# ─── AdamW optimizer ─────────────────────────────────────────────
# Prefer AdamW (which additionally receives WEIGHT_DECAY). When
# tf.keras.optimizers has no such attribute, plain Adam is used instead.
try:
    optimizer = tf.keras.optimizers.AdamW(
        weight_decay=WEIGHT_DECAY,
        clipnorm=GRAD_CLIP_NORM,
        learning_rate=LEARNING_RATE,
    )
except AttributeError:
    optimizer = tf.keras.optimizers.Adam(
        clipnorm=GRAD_CLIP_NORM,
        learning_rate=LEARNING_RATE,
    )
    print('  ‚ÑπÔ∏è  AdamW not available ‚Äî falling back to Adam')
else:
    print('  ‚úÖ Using AdamW optimizer')

# Top-5 accuracy, reported under the metric name 'top5_acc'
top5_metric = tf.keras.metrics.TopKCategoricalAccuracy(name='top5_acc', k=5)

main_model.compile(
    loss=categorical_focal_loss(gamma=FOCAL_GAMMA),
    metrics=['accuracy', top5_metric],
    optimizer=optimizer,
)

# Architecture report: layer summary followed by the I/O sizes and the
# total parameter count.
total_params = main_model.count_params()
print('\nüìê Architecture:')
main_model.summary()
print(f'\n  Input : ({SEQUENCE_LENGTH} frames, {NUM_FEATURES} features)')
print(f'  Output: {num_classes} classes')
print(f'  Params: {total_params:,}')


In [None]:
# ===============================================================
# CELL 9: TRAINING PIPELINE WITH COSINE WARMUP LR SCHEDULE
# ===============================================================
print('=' * 65)
print('üöÄ TRAINING ‚Äî CosineDecay + Warmup | Focal Loss | AdamW')
print('=' * 65)

AUTOTUNE = tf.data.AUTOTUNE

# ─── tf.data pipelines ───────────────────────────────────────────
# Training stream: shuffle (buffer capped at 15000 samples, reshuffled every
# epoch) -> per-sample augmentation -> batch -> prefetch.
train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train))
train_ds = train_ds.shuffle(min(len(X_train), 15000), reshuffle_each_iteration=True)
train_ds = train_ds.map(augment_sequence, num_parallel_calls=AUTOTUNE)
train_ds = train_ds.batch(BATCH_SIZE)
train_ds = train_ds.prefetch(AUTOTUNE)

# Validation / test streams are only batched and prefetched (no shuffling,
# no augmentation).
val_ds = tf.data.Dataset.from_tensor_slices((X_val, y_val))
val_ds = val_ds.batch(BATCH_SIZE).prefetch(AUTOTUNE)

test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test))
test_ds = test_ds.batch(BATCH_SIZE).prefetch(AUTOTUNE)

print(f'  ‚úÖ tf.data pipelines ready  (batch={BATCH_SIZE})')

# ─── Cosine Decay with Linear Warmup ─────────────────────────────
class CosineWarmupSchedule(tf.keras.callbacks.Callback):
    """Set the optimizer LR once per epoch: linear warmup, then cosine decay.

    While ``epoch < warmup_epochs`` the LR is
    ``base_lr * (epoch + 1) / warmup_epochs``. Afterwards it follows half a
    cosine from ``base_lr`` towards ``min_lr`` over
    ``total_epochs - warmup_epochs`` epochs, and is held at ``min_lr`` for any
    epoch beyond that span.

    Args:
        total_epochs: Number of epochs the whole schedule spans.
        warmup_epochs: Number of leading epochs used for the linear warmup.
        base_lr: Peak learning rate reached at the end of the warmup.
        min_lr: Floor the cosine decays towards.
    """
    def __init__(self, total_epochs, warmup_epochs, base_lr, min_lr=1e-7):
        super().__init__()
        self.total   = total_epochs
        self.warmup  = warmup_epochs
        self.base_lr = base_lr
        self.min_lr  = min_lr

    def _lr_for_epoch(self, epoch):
        """Return the learning rate scheduled for the 0-based `epoch`."""
        if epoch < self.warmup:
            return self.base_lr * (epoch + 1) / self.warmup

        decay_steps = self.total - self.warmup
        if decay_steps <= 0:
            # No decay phase configured (warmup spans the whole schedule):
            # hold the peak LR instead of dividing by zero.
            return self.base_lr

        # Clamp so that epochs past `total_epochs` stay at min_lr rather than
        # riding the cosine back up.
        progress  = min(1.0, (epoch - self.warmup) / decay_steps)
        cos_decay = 0.5 * (1 + math.cos(math.pi * progress))
        return self.min_lr + (self.base_lr - self.min_lr) * cos_decay

    def on_epoch_begin(self, epoch, logs=None):
        lr = self._lr_for_epoch(epoch)
        tf.keras.backend.set_value(self.model.optimizer.learning_rate, lr)

    def on_epoch_end(self, epoch, logs=None):
        # Write into the dict that was passed in (even when it is empty) so the
        # value is visible to whoever else reads these logs; a missing dict
        # leaves nothing to record into.
        if logs is not None:
            logs['lr'] = float(tf.keras.backend.get_value(self.model.optimizer.learning_rate))

# ─── Callbacks ───────────────────────────────────────────────────
# Order matters: CosineWarmupSchedule writes 'lr' into the epoch logs before
# CSVLogger (listed after it) runs.
# NOTE(review): the checkpoint tracks val_accuracy while early stopping (and its
# weight restore) tracks val_loss, so the two may pick different epochs.
callbacks = [
    ModelCheckpoint(
        str(OUTPUT_DIR / 'asl_word_lstm_v2_best.h5'),
        monitor='val_accuracy', save_best_only=True, mode='max', verbose=1
    ),
    EarlyStopping(
        monitor='val_loss', patience=40, restore_best_weights=True, verbose=1
    ),
    CosineWarmupSchedule(
        total_epochs=EPOCHS, warmup_epochs=WARMUP_EPOCHS,
        base_lr=LEARNING_RATE, min_lr=1e-7
    ),
    CSVLogger(str(OUTPUT_DIR / 'training_log_v2.csv')),
    TerminateOnNaN(),
]

# TensorBoard (optional — works on Kaggle too)
# Best-effort: training proceeds without it, but the reason is reported
# instead of being silently discarded.
try:
    tb = tf.keras.callbacks.TensorBoard(
        log_dir=str(OUTPUT_DIR / 'tb_logs'), histogram_freq=0, update_freq='epoch')
    callbacks.append(tb)
except Exception as tb_err:
    print(f'  [warn] TensorBoard logging disabled: {tb_err!r}')

print(f'  ‚úÖ Callbacks ready  (patience=40, warmup={WARMUP_EPOCHS}ep)')

# ─── Train ───────────────────────────────────────────────────────
# Echo the run configuration, then fit the main model on train_ds with
# val_ds as validation data, the callbacks assembled above and class_weights.
print(f'\nüöÄ Training on {DEVICE} | {EPOCHS} epochs | bs={BATCH_SIZE}')
print(f'   LR: {LEARNING_RATE} ‚Üí warmup {WARMUP_EPOCHS}ep ‚Üí cosine ‚Üí 1e-7')
print(f'   Focal Œ≥={FOCAL_GAMMA} | label_smooth={LABEL_SMOOTH} | grad_clip={GRAD_CLIP_NORM}')

t0 = time.time()
with tf.device(DEVICE):
    history = main_model.fit(
        train_ds,
        validation_data=val_ds,
        epochs=EPOCHS,
        callbacks=callbacks,
        class_weight=class_weights,
        verbose=1
    )
elapsed = time.time() - t0
# Best epoch is reported 1-based and chosen by validation accuracy.
best_epoch = int(np.argmax(history.history['val_accuracy'])) + 1
best_val   = max(history.history['val_accuracy'])
print(f'\n‚úÖ Training done in {elapsed:.0f}s ({elapsed/60:.1f} min)')
print(f'   Best epoch     : {best_epoch}')
print(f'   Best val_acc   : {best_val*100:.2f}%')

# ─── Save final model + class mapping ────────────────────────────
# The "final" file holds whatever weights main_model carries after fit();
# the best-val_accuracy checkpoint is written separately by ModelCheckpoint.
main_model.save(str(OUTPUT_DIR / 'asl_word_lstm_v2_final.h5'))
# One row per output index: model index -> encoder class (word_id) -> English word.
class_df = pd.DataFrame({
    'model_class_index': range(num_classes),
    'word_id':           encoder.classes_.tolist()
})
# NOTE(review): assumes id_to_english is keyed by the same type as
# encoder.classes_ values; unmatched ids become NaN — confirm.
class_df['english'] = class_df['word_id'].map(id_to_english)
class_df.to_csv(OUTPUT_DIR / 'asl_word_classes_v2.csv', index=False)
print(f'\nüíæ Saved final model & class CSV ‚Üí {OUTPUT_DIR}')


In [None]:
# ===============================================================
# CELL 10: ENSEMBLE ‚Äî PURE TRANSFORMER + TCN + CNN-BiLSTM
# ===============================================================
# Banner for the ensemble cell.
print('=' * 65)
print('üé≠ BUILDING & TRAINING ENSEMBLE (Transformer + TCN + main)')
print('=' * 65)

# ─── Model 2: Pure Transformer Encoder ───────────────────────────
def build_transformer_model(seq_len, num_feat, n_classes,
                              num_blocks=4, num_heads=8, ff_dim=512,
                              embed_dim=256, dropout=0.3):
    """Build the pure-Transformer classifier used as an ensemble member.

    Pipeline: Dense projection to `embed_dim` -> BatchNorm -> learned
    positional embedding -> `num_blocks` TransformerBlock layers ->
    global average pooling over time -> LayerNorm -> Dense(512)+BN+GELU ->
    Dropout -> softmax over `n_classes` (float32 output).

    Args:
        seq_len: Number of frames per input sequence.
        num_feat: Number of features per frame.
        n_classes: Size of the softmax output.
        num_blocks: Number of stacked TransformerBlock layers.
        num_heads: Attention heads passed to each TransformerBlock.
        ff_dim: Feed-forward width passed to each TransformerBlock.
        embed_dim: Width of the projected token representation.
        dropout: Dropout rate for the blocks and the dense head.

    Returns:
        An uncompiled Keras `Model` named 'ASL_PureTransformer'.
    """
    reg = tf.keras.regularizers.l2(L2_REG)
    inp = Input(shape=(seq_len, num_feat))

    # Project input to embed_dim
    x = Dense(embed_dim, use_bias=False)(inp)
    x = BatchNormalization()(x)

    # Learnable positional embeddings. The table lives inside a layer so its
    # weights are tracked by the model and receive gradients; looking an
    # Embedding up on a plain tf.range tensor outside the functional graph
    # would bake the (random, initial) lookup result in as a constant.
    x = LearnedPositionalEmbedding(seq_len, embed_dim, name='pos_embedding')(x)

    for i in range(num_blocks):
        x = TransformerBlock(embed_dim, num_heads, ff_dim, dropout, name=f'tb_{i}')(x)

    x = GlobalAveragePooling1D()(x)
    x = LayerNormalization()(x)
    x = Dense(512, use_bias=False, kernel_regularizer=reg)(x)
    x = BatchNormalization()(x); x = tf.keras.layers.Activation('gelu')(x)
    x = Dropout(dropout)(x)
    out = Dense(n_classes, activation='softmax', dtype='float32')(x)
    return Model(inp, out, name='ASL_PureTransformer')


class LearnedPositionalEmbedding(tf.keras.layers.Layer):
    """Add a trainable per-position vector to every time step of the input.

    Args:
        seq_len: Number of positions in the embedding table.
        embed_dim: Width of each position vector (must match the input width).
    """

    def __init__(self, seq_len, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.seq_len = seq_len
        self.embed_dim = embed_dim
        self.pos_emb = tf.keras.layers.Embedding(seq_len, embed_dim)

    def call(self, inputs):
        positions = tf.range(start=0, limit=self.seq_len, delta=1)
        # Cast so the sum also works when the two dtypes differ.
        return inputs + tf.cast(self.pos_emb(positions), inputs.dtype)

    def get_config(self):
        config = super().get_config()
        config.update({'seq_len': self.seq_len, 'embed_dim': self.embed_dim})
        return config


# Register by name so a saved model containing this layer can be reloaded
# without listing it in `custom_objects`. Plain dict assignment keeps the
# cell safe to re-run.
tf.keras.utils.get_custom_objects()['LearnedPositionalEmbedding'] = LearnedPositionalEmbedding

# ─── Model 3: TCN (Temporal Conv Network with dilations) ─────────
def build_tcn_model(seq_len, num_feat, n_classes,
                     filters=256, kernel_size=3, dilations=(1,2,4,8,16), dropout=0.3):
    """Build the dilated causal-convolution classifier used as an ensemble member.

    A 1x1 convolution lifts the input to `filters` channels; then, for every
    dilation rate, a residual unit of two causal Conv1D+BN+GELU stages (with
    spatial dropout in between) is applied. The sequence is average-pooled
    over time and classified by a Dense(512)+BN+GELU head with a float32
    softmax output.

    Args:
        seq_len: Number of frames per input sequence.
        num_feat: Number of features per frame.
        n_classes: Size of the softmax output.
        filters: Channel width of every convolution.
        kernel_size: Kernel size of the dilated convolutions.
        dilations: Dilation rate of each residual unit, in order.
        dropout: Rate for the spatial dropout and the head dropout.

    Returns:
        An uncompiled Keras `Model` named 'ASL_TCN'.
    """
    l2_pen = tf.keras.regularizers.l2(L2_REG)

    def causal_stage(tensor, rate):
        """Dilated causal Conv1D -> BatchNorm -> GELU."""
        tensor = Conv1D(filters, kernel_size, dilation_rate=rate, padding='causal',
                        use_bias=False, kernel_regularizer=l2_pen)(tensor)
        tensor = BatchNormalization()(tensor)
        return tf.keras.layers.Activation('gelu')(tensor)

    inp = Input(shape=(seq_len, num_feat))
    feats = Conv1D(filters, 1, padding='causal', use_bias=False)(inp)
    feats = BatchNormalization()(feats)

    for rate in dilations:
        shortcut = feats
        feats = causal_stage(feats, rate)
        feats = SpatialDropout1D(dropout)(feats)
        feats = causal_stage(feats, rate)
        # Residual (match channels)
        if shortcut.shape[-1] != filters:
            shortcut = Conv1D(filters, 1, padding='same', use_bias=False)(shortcut)
        feats = Add()([feats, shortcut])

    pooled = GlobalAveragePooling1D()(feats)
    head = Dense(512, use_bias=False, kernel_regularizer=l2_pen)(pooled)
    head = BatchNormalization()(head)
    head = tf.keras.layers.Activation('gelu')(head)
    head = Dropout(dropout)(head)
    out = Dense(n_classes, activation='softmax', dtype='float32')(head)
    return Model(inp, out, name='ASL_TCN')

# Instantiate & compile sub-models
def _new_sub_optimizer():
    """Return a fresh AdamW, or plain Adam when AdamW is not available."""
    try:
        return tf.keras.optimizers.AdamW(learning_rate=LEARNING_RATE,
                                         weight_decay=WEIGHT_DECAY,
                                         clipnorm=GRAD_CLIP_NORM)
    except AttributeError:
        return tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE,
                                        clipnorm=GRAD_CLIP_NORM)


# Each sub-model gets its own optimizer instance; both are compiled with
# the same focal loss and the same metrics as the main model.
print('  Building Transformer model...')
opt2 = _new_sub_optimizer()
transformer_model = build_transformer_model(SEQUENCE_LENGTH, NUM_FEATURES, num_classes)
transformer_model.compile(
    optimizer=opt2,
    loss=categorical_focal_loss(gamma=FOCAL_GAMMA),
    metrics=['accuracy', top5_metric],
)

print('  Building TCN model...')
opt3 = _new_sub_optimizer()
tcn_model = build_tcn_model(SEQUENCE_LENGTH, NUM_FEATURES, num_classes)
tcn_model.compile(
    optimizer=opt3,
    loss=categorical_focal_loss(gamma=FOCAL_GAMMA),
    metrics=['accuracy', top5_metric],
)

print(f'  Transformer params : {transformer_model.count_params():,}')
print(f'  TCN params         : {tcn_model.count_params():,}')

# ─── Train sub-models ────────────────────────────────────────────
def _sub_model_callbacks(checkpoint_name):
    """Return the callback list for one sub-model.

    The list holds: a best-val_accuracy checkpoint written to
    OUTPUT_DIR / `checkpoint_name`, early stopping on val_loss (patience 30,
    best weights restored), the cosine/warmup LR schedule and a NaN guard.
    """
    return [
        ModelCheckpoint(str(OUTPUT_DIR / checkpoint_name),
                        monitor='val_accuracy', save_best_only=True,
                        mode='max', verbose=0),
        EarlyStopping(monitor='val_loss', patience=30,
                      restore_best_weights=True, verbose=0),
        CosineWarmupSchedule(EPOCHS, WARMUP_EPOCHS, LEARNING_RATE),
        TerminateOnNaN(),
    ]


sub_cbs_1 = _sub_model_callbacks('transformer_best.h5')
sub_cbs_2 = _sub_model_callbacks('tcn_best.h5')

print('\nüöÄ Training Transformer sub-model...')
with tf.device(DEVICE):
    hist_t = transformer_model.fit(train_ds, validation_data=val_ds,
                                    epochs=EPOCHS, callbacks=sub_cbs_1,
                                    class_weight=class_weights, verbose=0)
best_t = max(hist_t.history['val_accuracy'])
print(f'   ‚úÖ Best val_acc (Transformer): {best_t*100:.2f}%')

print('\nüöÄ Training TCN sub-model...')
with tf.device(DEVICE):
    hist_tc = tcn_model.fit(train_ds, validation_data=val_ds,
                             epochs=EPOCHS, callbacks=sub_cbs_2,
                             class_weight=class_weights, verbose=0)
best_tc = max(hist_tc.history['val_accuracy'])
print(f'   ‚úÖ Best val_acc (TCN): {best_tc*100:.2f}%')

# ─── Build learned-weight ensemble ───────────────────────────────
print('\nüîó Building learned ensemble...')

# Single input shared by all three ensemble members.
inp_ens = Input(shape=(SEQUENCE_LENGTH, NUM_FEATURES), name='ensemble_input')

# Use best checkpoints
# Each member is reloaded from the file written by its ModelCheckpoint
# (best val_accuracy), not taken from the in-memory model. compile=False
# loads architecture and weights only; the custom layer classes are supplied
# so the saved configs can be resolved.
main_loaded = tf.keras.models.load_model(
    str(OUTPUT_DIR / 'asl_word_lstm_v2_best.h5'),
    custom_objects={'TransformerBlock': TransformerBlock,
                    'TemporalAttention': TemporalAttention,
                    'loss_fn': categorical_focal_loss(FOCAL_GAMMA)},
    compile=False
)
trans_loaded = tf.keras.models.load_model(
    str(OUTPUT_DIR / 'transformer_best.h5'),
    custom_objects={'TransformerBlock': TransformerBlock,
                    'loss_fn': categorical_focal_loss(FOCAL_GAMMA)},
    compile=False
)
tcn_loaded = tf.keras.models.load_model(
    str(OUTPUT_DIR / 'tcn_best.h5'),
    custom_objects={'loss_fn': categorical_focal_loss(FOCAL_GAMMA)},
    compile=False
)

# Freeze backbone weights — only train the ensemble head
for m in [main_loaded, trans_loaded, tcn_loaded]:
    m.trainable = False

# Call the frozen members in inference mode. `trainable = False` alone does
# not switch off Dropout, so without training=False the head would be fitted
# on randomly perturbed member predictions.
p1 = main_loaded(inp_ens, training=False)
p2 = trans_loaded(inp_ens, training=False)
p3 = tcn_loaded(inp_ens, training=False)

# Learnable mixing weights: one non-negative scalar per member, initialised
# to 1/3 (= plain averaging). The members already emit probabilities, so the
# weighted sum is renormalised to sum to 1 rather than being pushed through
# another softmax, which would treat probabilities as logits and flatten
# the output distribution.
ens_raw    = tf.keras.layers.Concatenate(axis=-1)([
    tf.keras.layers.Reshape((num_classes, 1))(p1),
    tf.keras.layers.Reshape((num_classes, 1))(p2),
    tf.keras.layers.Reshape((num_classes, 1))(p3),
])   # (B, num_classes, 3)
ens_mix    = tf.keras.layers.Dense(
    1, use_bias=False, dtype='float32',
    kernel_initializer=tf.keras.initializers.Constant(1.0 / 3.0),
    kernel_constraint=tf.keras.constraints.NonNeg(),
    name='ensemble_weights')(ens_raw)  # (B, C, 1)
ens_mix    = tf.keras.layers.Lambda(lambda x: tf.squeeze(x, -1),
                                    dtype='float32')(ens_mix)  # (B, C)
# The small epsilon guards the division if every weight is clamped to 0.
ens_out    = tf.keras.layers.Lambda(
    lambda x: x / (tf.reduce_sum(x, axis=-1, keepdims=True) + 1e-7),
    dtype='float32', name='ensemble_probs')(ens_mix)

ensemble_model = Model(inp_ens, ens_out, name='ASL_Ensemble')
try:
    opt_ens = tf.keras.optimizers.AdamW(learning_rate=1e-4,
                                         weight_decay=1e-5, clipnorm=1.0)
except AttributeError:
    opt_ens = tf.keras.optimizers.Adam(learning_rate=1e-4, clipnorm=1.0)

ensemble_model.compile(
    optimizer=opt_ens,
    loss=categorical_focal_loss(FOCAL_GAMMA),
    metrics=['accuracy', top5_metric]
)

# Fit the ensemble for at most 20 epochs; early stopping (patience 10)
# restores the best weights and TerminateOnNaN aborts on a NaN loss.
print('  Fine-tuning ensemble head (20 epochs)...')
with tf.device(DEVICE):
    hist_ens = ensemble_model.fit(
        train_ds, validation_data=val_ds,
        epochs=20,
        callbacks=[EarlyStopping(patience=10, restore_best_weights=True, verbose=0),
                   TerminateOnNaN()],
        class_weight=class_weights,
        verbose=1
    )

# Persist the ensemble and report its best validation accuracy.
ensemble_model.save(str(OUTPUT_DIR / 'asl_word_ensemble_final.h5'))
best_ens_val = max(hist_ens.history['val_accuracy'])
print(f'\n‚úÖ Ensemble val_acc : {best_ens_val*100:.2f}%')
print(f'üíæ Saved ‚Üí {OUTPUT_DIR / "asl_word_ensemble_final.h5"}')


In [None]:
# ===============================================================
# CELL 11: COMPREHENSIVE EVALUATION DASHBOARD
# ===============================================================
print('=' * 65)
print('üìà EVALUATION DASHBOARD  (Ensemble Model)')
print('=' * 65)

# Display label for every output index: the English word for the encoder's
# class id, falling back to the id itself (as text) when no word is known.
word_labels = [id_to_english.get(int(encoder.classes_[i]), str(encoder.classes_[i]))
               for i in range(num_classes)]

# ── Predictions ──────────────────────────────────────────────────
# Features-only dataset (no labels), batched like the training pipelines.
eval_ds = (tf.data.Dataset.from_tensor_slices((X_test,))
           .batch(BATCH_SIZE).prefetch(AUTOTUNE))

with tf.device(DEVICE):
    proba = ensemble_model.predict(eval_ds, verbose=0)

# Class indices: predicted = arg-max of the ensemble output, true = arg-max
# of the one-hot test targets.
y_pred = np.argmax(proba, axis=1)
y_true = np.argmax(y_test, axis=1)

# ── Top-K Accuracy ───────────────────────────────────────────────
def topk_acc(proba, y_true, k):
    """Return the fraction of samples whose true class is among the top-k scores.

    Args:
        proba: Array-like of shape (n_samples, n_classes) with per-class scores.
        y_true: Array-like of n_samples integer class indices.
        k: Number of highest-scoring classes to accept (>= 1). Values larger
            than n_classes accept every class.

    Returns:
        Accuracy as a float in [0, 1]; 0.0 when `y_true` is empty.

    Raises:
        ValueError: If `k` is smaller than 1.
    """
    if k < 1:
        # A slice of [-0:] would select every class and report 100 %.
        raise ValueError(f'k must be >= 1, got {k}')

    y_true = np.asarray(y_true)
    n_samples = len(y_true)
    if n_samples == 0:
        return 0.0

    # Only the rows that have a label are scored.
    scores = np.asarray(proba)[:n_samples]
    # Indices of the k largest scores per row (ascending sort, take the tail).
    top_k = np.argsort(scores, axis=1)[:, -k:]
    hits = (top_k == y_true[:, None]).any(axis=1)
    return int(hits.sum()) / n_samples

# Test-set accuracy of the ensemble at k = 1, 3 and 5.
top1 = (y_pred == y_true).mean()
top3 = topk_acc(proba, y_true, 3)
top5 = topk_acc(proba, y_true, 5)

print(f'\nüéØ Ensemble Test Results:')
print(f'   Top-1 : {top1*100:.2f}%')
print(f'   Top-3 : {top3*100:.2f}%')
print(f'   Top-5 : {top5*100:.2f}%')
print(f'   N test: {len(y_true)}  | classes: {num_classes}')

# ── PLOT 1: Training Dashboard (main model) ──────────────────────
# 2x2 grid built from the main model's fit history:
# accuracy | loss / learning rate | train-minus-val accuracy gap.
fig, axes = plt.subplots(2, 2, figsize=(18, 12))
h = history.history

# Top-left: train vs. validation accuracy, with the best validation epoch
# marked (0-based on the axis, 1-based in the legend).
axes[0,0].plot(h['accuracy'],     lw=2, color='#2196F3', label='Train')
axes[0,0].plot(h['val_accuracy'], lw=2, color='#FF9800', label='Val')
best_ep = int(np.argmax(h['val_accuracy']))
axes[0,0].axvline(best_ep, color='green', ls=':', alpha=0.6, label=f'Best ep {best_ep+1}')
axes[0,0].fill_between(range(len(h['accuracy'])), h['accuracy'], h['val_accuracy'],
                        alpha=0.08, color='red')
axes[0,0].set_title('Accuracy', fontsize=14, fontweight='bold')
axes[0,0].set_ylim([0, 1.05]); axes[0,0].legend(); axes[0,0].grid(alpha=0.3)

# Top-right: train vs. validation loss.
axes[0,1].plot(h['loss'],     lw=2, color='#2196F3', label='Train')
axes[0,1].plot(h['val_loss'], lw=2, color='#FF9800', label='Val')
axes[0,1].set_title('Loss', fontsize=14, fontweight='bold')
axes[0,1].legend(); axes[0,1].grid(alpha=0.3)

# Bottom-left: per-epoch learning rate; when the history has no 'lr' entry a
# flat line at LEARNING_RATE is drawn instead.
lr_vals = h.get('lr', [LEARNING_RATE] * len(h['loss']))
axes[1,0].plot(lr_vals, lw=2, color='#4CAF50', marker='o', markersize=3)
axes[1,0].set_yscale('log'); axes[1,0].set_title('Learning Rate Schedule', fontsize=14, fontweight='bold')
axes[1,0].grid(alpha=0.3)

# Bottom-right: accuracy gap per epoch, coloured green (< 0.05),
# orange (< 0.15) or red (otherwise).
gap = np.array(h['accuracy']) - np.array(h['val_accuracy'])
bar_colors = ['green' if g < 0.05 else 'orange' if g < 0.15 else 'red' for g in gap]
axes[1,1].bar(range(len(gap)), gap, color=bar_colors, alpha=0.8, linewidth=0.3, edgecolor='black')
axes[1,1].axhline(0.05, color='green', ls='--', alpha=0.5, label='Healthy (5%)')
axes[1,1].axhline(0.15, color='red',   ls='--', alpha=0.5, label='Overfit (15%)')
axes[1,1].set_title('Overfitting Monitor', fontsize=14, fontweight='bold')
axes[1,1].legend(); axes[1,1].grid(alpha=0.3)

# NOTE(review): the curves are the main model's, but top1/top5 in the title
# are the ensemble's test scores computed above — confirm this is intended.
plt.suptitle(f'CNN-BiLSTM+Transformer V2 | Top-1: {top1*100:.1f}%  Top-5: {top5*100:.1f}%',
             fontsize=15, fontweight='bold', y=1.01)
plt.tight_layout(); plt.show()

# ── PLOT 2: Confidence distribution ──────────────────────────────
# Left: histogram of the top predicted probability, split into correct and
# wrong predictions. Right: margin between the two highest probabilities.
correct_mask = y_pred == y_true
fig, axes = plt.subplots(1, 2, figsize=(16, 5))
axes[0].hist(np.max(proba[correct_mask],  axis=1), bins=30, alpha=0.7,
             color='#4CAF50', edgecolor='darkgreen', label=f'Correct ({correct_mask.sum()})')
# The "wrong" histogram is skipped when every prediction is correct.
if (~correct_mask).sum() > 0:
    axes[0].hist(np.max(proba[~correct_mask], axis=1), bins=30, alpha=0.7,
                 color='#F44336', edgecolor='darkred', label=f'Wrong ({(~correct_mask).sum()})')
axes[0].set_title('Confidence: Correct vs Wrong'); axes[0].legend()
# Sort each row descending so columns 0 and 1 hold the best and runner-up scores.
sorted_p = np.sort(proba, axis=1)[:, ::-1]
margin   = sorted_p[:, 0] - sorted_p[:, 1]
axes[1].hist(margin, bins=30, color='#9C27B0', edgecolor='purple', alpha=0.8)
axes[1].set_title(f'Decision Margin (mean={np.mean(margin):.3f})')
plt.tight_layout(); plt.show()

# ── Classification Report ────────────────────────────────────────
# The report is computed twice: once as a dict (used by the plots below) and
# once as text for printing. labels=range(num_classes) keeps classes that are
# absent from the test set in the report; zero_division=0 scores them as 0.
print('\nüìã Classification Report:')
report = classification_report(y_true, y_pred, labels=range(num_classes),
                                target_names=word_labels, zero_division=0, output_dict=True)
print(classification_report(y_true, y_pred, labels=range(num_classes),
                             target_names=word_labels, zero_division=0))

# ── PLOT 3: Per-class F1 ─────────────────────────────────────────
# NOTE(review): entries are keyed by display label, so two classes sharing
# the same English word would collapse into one bar — confirm labels are unique.
class_f1 = {w: report[w]['f1-score'] for w in word_labels if w in report}
# Bars sorted from best to worst F1, coloured by threshold (>= 0.7, >= 0.4, else).
sf1 = sorted(class_f1.items(), key=lambda x: x[1], reverse=True)
fig, ax = plt.subplots(figsize=(26, 6))
f1_cols = ['#4CAF50' if v >= 0.7 else '#FF9800' if v >= 0.4 else '#F44336'
           for _, v in sf1]
ax.bar(range(len(sf1)), [v for _, v in sf1], color=f1_cols, edgecolor='black', lw=0.3)
ax.set_xticks(range(len(sf1)))
ax.set_xticklabels([n for n, _ in sf1], rotation=90, fontsize=5.5)
mean_f1 = np.mean([v for _, v in sf1])
ax.axhline(mean_f1, color='blue', ls='--', alpha=0.5, label=f'Mean F1: {mean_f1:.3f}')
ax.set_title(f'Per-Class F1 Score ‚Äî Mean: {mean_f1:.3f}', fontsize=14)
ax.set_ylim([0, 1.05]); ax.legend(); plt.tight_layout(); plt.show()

# ── PLOT 4: Normalized confusion matrix ──────────────────────────
cm = confusion_matrix(y_true, y_pred, labels=range(num_classes))
# Row-normalise (each true class sums to ~1); the epsilon avoids dividing by
# zero for classes with no test samples.
cm_norm = cm.astype(float) / (cm.sum(axis=1, keepdims=True) + 1e-9)
fig, ax = plt.subplots(figsize=(20, 18))
# Cell annotations are only drawn for up to 50 classes.
sns.heatmap(cm_norm, cmap='Blues', ax=ax,
            xticklabels=word_labels, yticklabels=word_labels,
            annot=(num_classes <= 50), fmt='.2f' if num_classes <= 50 else '')
ax.set_title(f'Normalized Confusion Matrix | Top-1: {top1*100:.1f}%', fontsize=15)
ax.set_xlabel('Predicted', fontsize=13); ax.set_ylabel('True', fontsize=13)
plt.xticks(rotation=90, fontsize=5); plt.yticks(fontsize=5)
plt.tight_layout(); plt.show()

# ── PLOT 5: Top-15 confused pairs ────────────────────────────────
# Work on a copy of the raw confusion counts with the diagonal (correct
# predictions) zeroed, so only true -> predicted mistakes remain.
cm_off = cm.copy()
np.fill_diagonal(cm_off, 0)

# argwhere walks the matrix row by row, and the sort is stable, so pairs with
# equal counts keep their (true, predicted) index order.
pairs = sorted(
    [(word_labels[r], word_labels[c], cm_off[r, c]) for r, c in np.argwhere(cm_off > 0)],
    key=lambda item: item[2],
    reverse=True,
)
top15 = pairs[:15]

# Nothing is drawn when the model made no mistakes at all.
if top15:
    labels15 = [f'{true_lbl} ‚Üí {pred_lbl}' for true_lbl, pred_lbl, _ in top15]
    vals15   = [count for _, _, count in top15]
    rows15   = range(len(labels15))

    fig, ax = plt.subplots(figsize=(14, 7))
    bars = ax.barh(rows15, vals15, color='#E91E63', edgecolor='darkred', alpha=0.85)
    ax.set_yticks(rows15)
    ax.set_yticklabels(labels15, fontsize=9)
    ax.invert_yaxis()
    # Print the count just right of each bar.
    for bar, v in zip(bars, vals15):
        ax.text(bar.get_width() + 0.3, bar.get_y() + bar.get_height()/2,
                str(v), va='center', fontsize=9, fontweight='bold')
    ax.set_title('Top-15 Most Confused Pairs', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()

# ── PLOT 6: Per-category accuracy ────────────────────────────────
# Only drawn when the vocabulary table carries a 'category' column.
if 'category' in vocab_df.columns:
    cat_map = dict(zip(vocab_df['word_id'].astype(int), vocab_df['category']))

    # Category of every test sample (via its true class id); ids without a
    # category are grouped under 'other'.
    sample_cats = [cat_map.get(int(encoder.classes_[t]), 'other') for t in y_true]
    cat_tot = Counter(sample_cats)
    cat_ok = Counter(c for c, hit in zip(sample_cats, y_pred == y_true) if hit)

    cats = sorted(cat_tot)
    cat_accs = [cat_ok[c] / cat_tot[c] for c in cats]
    x_pos = range(len(cats))

    fig, ax = plt.subplots(figsize=(14, 6))
    bars = ax.bar(x_pos, [a*100 for a in cat_accs],
                  color='#2196F3', edgecolor='navy', alpha=0.85)
    ax.set_xticks(x_pos)
    ax.set_xticklabels(cats, rotation=45, ha='right')
    # Annotate each bar with its accuracy and sample count.
    for bar, acc, cat in zip(bars, cat_accs, cats):
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
                f'{acc*100:.1f}%\n(n={cat_tot[cat]})', ha='center', fontsize=8, fontweight='bold')
    ax.axhline(top1*100, color='red', ls='--', alpha=0.5, label=f'Overall: {top1*100:.1f}%')
    ax.set_ylim([0, 110])
    ax.set_title('Per-Category Accuracy', fontsize=14, fontweight='bold')
    ax.legend()
    plt.tight_layout()
    plt.show()

# ── PLOT 7: Best vs Worst 10 classes ─────────────────────────────
# Per-class accuracy (recall) for every class that has test samples.
# NOTE(review): keyed by display label, so duplicate labels overwrite each other.
pcacc = {}
for i in range(num_classes):
    m = y_true == i
    if m.sum() > 0:
        pcacc[word_labels[i]] = (y_pred[m] == i).mean()
# Ascending by accuracy: head = worst classes, tail = best classes.
sorted_pc = sorted(pcacc.items(), key=lambda x: x[1])
n_show = min(10, len(sorted_pc))
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 6))
worst = sorted_pc[:n_show]
ax1.barh(range(n_show), [w[1]*100 for w in worst], color='#F44336', alpha=0.85, edgecolor='darkred')
ax1.set_yticks(range(n_show)); ax1.set_yticklabels([w[0] for w in worst])
ax1.set_xlabel('Accuracy (%)'); ax1.set_title(f'Bottom {n_show}', fontsize=14, color='#F44336')
for i, w in enumerate(worst):
    ax1.text(w[1]*100 + 0.5, i, f'{w[1]*100:.1f}%', va='center')
# Reverse the tail so the best class comes first.
best = sorted_pc[-n_show:][::-1]
ax2.barh(range(n_show), [b[1]*100 for b in best], color='#4CAF50', alpha=0.85, edgecolor='darkgreen')
ax2.set_yticks(range(n_show)); ax2.set_yticklabels([b[0] for b in best])
ax2.set_xlabel('Accuracy (%)'); ax2.set_title(f'Top {n_show}', fontsize=14, color='#4CAF50')
for i, b in enumerate(best):
    ax2.text(b[1]*100 + 0.5, i, f'{b[1]*100:.1f}%', va='center')
plt.suptitle('Best vs Worst Classes ‚Äî Ensemble', fontsize=15, fontweight='bold', y=1.01)
plt.tight_layout(); plt.show()

# ── PLOT 8: Precision-Recall scatter (F1 color) ──────────────────
# Four parallel lists taken from the report dict, in word_labels order; all
# use the same `w in report` filter so their indices line up.
precs = [report[w]['precision']  for w in word_labels if w in report]
recs  = [report[w]['recall']     for w in word_labels if w in report]
f1s   = [report[w]['f1-score']   for w in word_labels if w in report]
wl_r  = [w for w in word_labels if w in report]
fig, ax = plt.subplots(figsize=(10, 8))
sc = ax.scatter(recs, precs, c=f1s, cmap='RdYlGn', s=55, edgecolors='black', lw=0.5, alpha=0.85)
plt.colorbar(sc, label='F1 Score', ax=ax)
ax.set_xlabel('Recall', fontsize=13); ax.set_ylabel('Precision', fontsize=13)
ax.set_title('Precision vs Recall (color = F1)', fontsize=14, fontweight='bold')
ax.set_xlim([-0.05, 1.05]); ax.set_ylim([-0.05, 1.05])
# Dashed diagonal marks precision == recall.
ax.plot([0, 1], [0, 1], 'k--', alpha=0.2); ax.grid(alpha=0.3)
# Only classes with F1 below 0.3 get a text label.
for i, lbl in enumerate(wl_r):
    if f1s[i] < 0.3:
        ax.annotate(lbl, (recs[i], precs[i]), fontsize=7, alpha=0.8, xytext=(4, 4),
                    textcoords='offset points')
plt.tight_layout(); plt.show()

# Closing summary: repeat the ensemble's test top-1/3/5 accuracy.
print('\n' + '=' * 65)
print(f'‚úÖ FINAL RESULTS')
print(f'   Top-1 : {top1*100:.2f}%')
print(f'   Top-3 : {top3*100:.2f}%')
print(f'   Top-5 : {top5*100:.2f}%')
print('=' * 65)


In [None]:
# ===============================================================
# CELL 12: EXPORT ARTIFACTS & DOWNLOAD
# ===============================================================
print('=' * 65)
print('üíæ EXPORT ARTIFACTS')
print('=' * 65)

# Expected output files, keyed by a human-readable description.
artifacts = {
    'Best CNN-BiLSTM model'   : OUTPUT_DIR / 'asl_word_lstm_v2_best.h5',
    'Final CNN-BiLSTM model'  : OUTPUT_DIR / 'asl_word_lstm_v2_final.h5',
    'Transformer sub-model'   : OUTPUT_DIR / 'transformer_best.h5',
    'TCN sub-model'           : OUTPUT_DIR / 'tcn_best.h5',
    'Ensemble model'          : OUTPUT_DIR / 'asl_word_ensemble_final.h5',
    'Scaler (sklearn)'        : OUTPUT_DIR / 'scaler.pkl',
    'Label encoder (sklearn)' : OUTPUT_DIR / 'encoder.pkl',
    'Class mapping CSV'       : OUTPUT_DIR / 'asl_word_classes_v2.csv',
    'Training log CSV'        : OUTPUT_DIR / 'training_log_v2.csv',
    'Sequence dataset'        : NPZ_PATH,
}

# Inventory: one line per artifact with its size in MiB, or a NOT FOUND
# marker; only files that exist count towards the total.
print('\nFile inventory:')
total_mb = 0.0
for desc, path in artifacts.items():
    p = Path(path)
    if not p.exists():
        print(f'  ‚ùå  {str(p.name):<42}  NOT FOUND  [{desc}]')
        continue
    mb = p.stat().st_size / (1024 * 1024)
    total_mb += mb
    print(f'  ‚úÖ  {p.name:<42}  {mb:7.1f} MB   [{desc}]')

print(f'\n  Total   : {total_mb:.1f} MB')
print(f'  Location: {OUTPUT_DIR}')

# Tell the user where to pick the files up, depending on the environment.
if not IS_KAGGLE:
    print(f'\nüìÇ Local path: {OUTPUT_DIR.resolve()}')
else:
    print('\nüí° Kaggle: go to Output tab and click ‚ñ∂ DownloadAll to grab all files.')
    print('   Place asl_word_lstm_v2_best.h5 + scaler.pkl + encoder.pkl')
    print('   + asl_word_classes_v2.csv into your live-test folder.')

print('\n' + '=' * 65)
print('‚úÖ  ALL DONE')
print('=' * 65)
