In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import pickle
import json
import os
import math
import time
from pathlib import Path
from tqdm import tqdm

# Configuration
CONFIG = {
    # Raw data paths
    "raw_data_dir": r"C:\Users\engmr\Downloads\public_si_dat",
    "train_csv": r"C:\Users\engmr\Downloads\public_si_dat\train.csv",
    "dev_csv": r"C:\Users\engmr\Downloads\public_si_dat\dev.csv",
    "pickle_file": r"C:\Users\engmr\Downloads\public_si_dat\pose_data_isharah1000_hands_lips_body_May12.pkl",
    
    # Output paths
    "output_dir": "isharah_clean",
    "features_dir": "isharah_clean/features",
    "vocab_file": "isharah_clean/vocab.json",
    "labels_dir": "isharah_clean/labels",
    "models_dir": "isharah_clean/models",
    
    # Model hyperparameters
    "d_model": 256,
    "nhead": 4,
    "num_encoder_layers": 4,
    "num_decoder_layers": 4,
    "dim_feedforward": 1024,
    "dropout": 0.3,
    
    # Training
    "batch_size": 32,
    "learning_rate": 3e-4,
    "num_epochs": 50,
    "device": "cuda" if torch.cuda.is_available() else "cpu"
}

# Create directories
for key in ["output_dir", "features_dir", "labels_dir", "models_dir"]:
    os.makedirs(CONFIG[key], exist_ok=True)

print("Configuration loaded:")
for k, v in CONFIG.items():
    print(f"  {k}: {v}")

Configuration loaded:
  raw_data_dir: C:\Users\engmr\Downloads\public_si_dat
  train_csv: C:\Users\engmr\Downloads\public_si_dat\train.csv
  dev_csv: C:\Users\engmr\Downloads\public_si_dat\dev.csv
  pickle_file: C:\Users\engmr\Downloads\public_si_dat\pose_data_isharah1000_hands_lips_body_May12.pkl
  output_dir: isharah_clean
  features_dir: isharah_clean/features
  vocab_file: isharah_clean/vocab.json
  labels_dir: isharah_clean/labels
  models_dir: isharah_clean/models
  d_model: 256
  nhead: 4
  num_encoder_layers: 4
  num_decoder_layers: 4
  dim_feedforward: 1024
  dropout: 0.3
  batch_size: 32
  learning_rate: 0.0003
  num_epochs: 50
  device: cuda


In [2]:
def normalize_keypoints(keypoints):
    """
    Normalize keypoints to standard scale.
    
    Args:
        keypoints: (T, 86, 2) array of (x, y) coordinates
    
    Returns:
        (T, 172) flattened normalized array
    """
    arr = np.array(keypoints, dtype=np.float32)
    
    if arr.ndim != 3 or arr.shape[2] != 2:
        raise ValueError(f"Expected shape (T, 86, 2), got {arr.shape}")
    
    T, num_kps, _ = arr.shape
    
    # Center: subtract mean across all keypoints and frames
    center = np.nanmean(arr.reshape(-1, 2), axis=0)
    arr = arr - center
    
    # Scale: use median of per-frame standard deviations
    per_frame_std = np.nanstd(arr, axis=1)  # (T, 2)
    median_std = np.nanmedian(per_frame_std, axis=0)  # (2,)
    scale = max(1e-6, np.mean(median_std))
    arr = arr / scale
    
    # Flatten to (T, 172)
    flattened = arr.reshape(T, -1)
    
    return flattened.astype(np.float32)


# Test normalization
print("\nTesting normalization...")
test_kps = np.random.randn(10, 86, 2) * 100 + 500  # Random keypoints
normalized = normalize_keypoints(test_kps)
print(f"Input shape: {test_kps.shape}")
print(f"Output shape: {normalized.shape}")
print(f"Output dtype: {normalized.dtype}")
print(f"Output range: [{normalized.min():.2f}, {normalized.max():.2f}]")



Testing normalization...
Input shape: (10, 86, 2)
Output shape: (10, 172)
Output dtype: float32
Output range: [-3.22, 3.23]


In [3]:
def build_vocabulary(train_csv, dev_csv):
    """Build vocabulary from gloss texts."""
    all_tokens = set()
    
    # Process train
    df_train = pd.read_csv(train_csv)
    for gloss in df_train["gloss"].astype(str):
        tokens = gloss.strip().split()
        all_tokens.update(tokens)
    
    # Process dev
    df_dev = pd.read_csv(dev_csv)
    for gloss in df_dev["gloss"].astype(str):
        tokens = gloss.strip().split()
        all_tokens.update(tokens)
    
    # Create vocabulary with special tokens
    vocab = ["<pad>", "<sos>", "<eos>"] + sorted(list(all_tokens))
    vocab_map = {token: idx for idx, token in enumerate(vocab)}
    
    return vocab, vocab_map


print("Building vocabulary...")
vocab, vocab_map = build_vocabulary(CONFIG["train_csv"], CONFIG["dev_csv"])

# Save vocabulary
vocab_data = {"vocab": vocab, "vocab_map": vocab_map}
with open(CONFIG["vocab_file"], "w", encoding="utf-8") as f:
    json.dump(vocab_data, f, ensure_ascii=False, indent=2)

print(f"Vocabulary size: {len(vocab)}")
print(f"Special tokens: <pad>={vocab_map['<pad>']}, <sos>={vocab_map['<sos>']}, <eos>={vocab_map['<eos>']}")
print(f"Sample tokens: {vocab[3:10]}")

Building vocabulary...
Vocabulary size: 686
Special tokens: <pad>=0, <sos>=1, <eos>=2
Sample tokens: ['ا', 'اب', 'ابتسامه', 'ابن', 'ابها', 'ابيض', 'اتصال']


In [4]:
def process_split(csv_file, pickle_data, vocab_map, split_name):
    """
    Process one split (train/dev) and save features + labels.
    """
    df = pd.read_csv(csv_file)
    
    features_dir = Path(CONFIG["features_dir"]) / split_name
    features_dir.mkdir(parents=True, exist_ok=True)
    
    processed_samples = []
    skipped = 0
    
    print(f"\nProcessing {split_name} split...")
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        sample_id = str(row["id"])
        gloss = str(row["gloss"]).strip()
        
        # Check if sample exists in pickle
        if sample_id not in pickle_data:
            skipped += 1
            continue
        
        try:
            # Get keypoints
            keypoints = pickle_data[sample_id]["keypoints"]
            
            # Normalize
            features = normalize_keypoints(keypoints)
            
            # Save features as numpy array
            feat_path = features_dir / f"{sample_id}.npy"
            np.save(feat_path, features)
            
            # Create target indices: [<sos>, token1, token2, ..., <eos>]
            tokens = gloss.split()
            target_idx = [vocab_map["<sos>"]]
            target_idx.extend([vocab_map[t] for t in tokens if t in vocab_map])
            target_idx.append(vocab_map["<eos>"])
            
            # Store sample info
            processed_samples.append({
                "sample_id": sample_id,
                "gloss": gloss,
                "target_idx": json.dumps(target_idx),
                "num_frames": len(features),
                "num_tokens": len(target_idx)
            })
            
        except Exception as e:
            print(f"Error processing {sample_id}: {e}")
            skipped += 1
            continue
    
    # Save labels CSV
    labels_df = pd.DataFrame(processed_samples)
    labels_path = Path(CONFIG["labels_dir"]) / f"{split_name}_labels.csv"
    labels_df.to_csv(labels_path, index=False)
    
    print(f"✓ {split_name}: Processed {len(processed_samples)} samples, skipped {skipped}")
    return labels_df


# Load pickle file
print("Loading pickle file...")
with open(CONFIG["pickle_file"], "rb") as f:
    pickle_data = pickle.load(f)
print(f"Loaded {len(pickle_data)} samples from pickle")

# Process both splits
train_labels = process_split(CONFIG["train_csv"], pickle_data, vocab_map, "train")
dev_labels = process_split(CONFIG["dev_csv"], pickle_data, vocab_map, "dev")

print("\n" + "="*60)
print("PREPROCESSING COMPLETE")
print("="*60)
print(f"Train samples: {len(train_labels)}")
print(f"Dev samples: {len(dev_labels)}")
print(f"Feature dimension: 172 (86 keypoints × 2 coordinates)")
print("="*60)

Loading pickle file...
Loaded 10450 samples from pickle

Processing train split...


100%|███████████████████████████████████████████████████████████████████████████| 10000/10000 [00:21<00:00, 463.18it/s]


✓ train: Processed 9500 samples, skipped 500

Processing dev split...


100%|███████████████████████████████████████████████████████████████████████████████| 949/949 [00:02<00:00, 379.08it/s]

✓ dev: Processed 949 samples, skipped 0

PREPROCESSING COMPLETE
Train samples: 9500
Dev samples: 949
Feature dimension: 172 (86 keypoints × 2 coordinates)





In [5]:
class SignLanguageDataset(Dataset):
    """Dataset for sign language recognition."""
    
    def __init__(self, features_dir, labels_csv, vocab_json):
        self.features_dir = Path(features_dir)
        
        # Load labels
        self.labels = pd.read_csv(labels_csv)
        
        # Load vocabulary
        with open(vocab_json, "r", encoding="utf-8") as f:
            vocab_data = json.load(f)
        self.vocab = vocab_data["vocab"]
        self.vocab_map = vocab_data["vocab_map"]
        
        # Special tokens
        self.pad_idx = self.vocab_map["<pad>"]
        self.sos_idx = self.vocab_map["<sos>"]
        self.eos_idx = self.vocab_map["<eos>"]
        
        print(f"Loaded {len(self.labels)} samples")
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        row = self.labels.iloc[idx]
        sample_id = str(row["sample_id"])
        
        # Load features
        feat_path = self.features_dir / f"{sample_id}.npy"
        features = np.load(feat_path)
        
        # Load target
        target = np.array(json.loads(row["target_idx"]), dtype=np.int64)
        
        return {
            "features": features,
            "target": target,
            "sample_id": sample_id
        }


def collate_fn(batch):
    """Collate function with padding."""
    # Sort by length
    batch = sorted(batch, key=lambda x: len(x["features"]), reverse=True)
    
    # Get max lengths
    max_src_len = max(len(b["features"]) for b in batch)
    max_tgt_len = max(len(b["target"]) for b in batch)
    feat_dim = batch[0]["features"].shape[1]
    
    # Pad features
    src_padded = np.zeros((len(batch), max_src_len, feat_dim), dtype=np.float32)
    src_lengths = []
    for i, b in enumerate(batch):
        seq_len = len(b["features"])
        src_padded[i, :seq_len, :] = b["features"]
        src_lengths.append(seq_len)
    
    # Pad targets
    tgt_padded = np.full((len(batch), max_tgt_len), -100, dtype=np.int64)
    tgt_lengths = []
    for i, b in enumerate(batch):
        tgt_len = len(b["target"])
        tgt_padded[i, :tgt_len] = b["target"]
        tgt_lengths.append(tgt_len)
    
    return {
        "src": torch.from_numpy(src_padded),
        "src_lengths": torch.tensor(src_lengths, dtype=torch.long),
        "tgt": torch.from_numpy(tgt_padded),
        "tgt_lengths": torch.tensor(tgt_lengths, dtype=torch.long),
        "sample_ids": [b["sample_id"] for b in batch]
    }

In [6]:
class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding."""
    
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer("pe", pe.unsqueeze(0))
    
    def forward(self, x):
        return x + self.pe[:, :x.size(1), :]


class Seq2SeqTransformer(nn.Module):
    """Sequence-to-sequence transformer."""
    
    def __init__(self, src_feature_dim, tgt_vocab_size, d_model=256, nhead=4,
                 num_encoder_layers=4, num_decoder_layers=4, dim_feedforward=1024,
                 dropout=0.1, pad_idx=0):
        super().__init__()
        
        self.d_model = d_model
        self.pad_idx = pad_idx
        
        # Source embedding (project features to d_model)
        self.src_embedding = nn.Linear(src_feature_dim, d_model)
        
        # Target embedding
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model, padding_idx=pad_idx)
        
        # Positional encoding
        self.pos_encoder = PositionalEncoding(d_model)
        
        # Transformer
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True
        )
        
        # Output projection
        self.output_proj = nn.Linear(d_model, tgt_vocab_size)
        
        # Initialize
        self._init_weights()
    
    def _init_weights(self):
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
    
    def forward(self, src, tgt, src_key_padding_mask=None, tgt_key_padding_mask=None):
        # Embed and encode
        src = self.src_embedding(src) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        
        tgt = self.tgt_embedding(tgt) * math.sqrt(self.d_model)
        tgt = self.pos_encoder(tgt)
        
        # Causal mask
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt.size(1)).to(tgt.device)
        
        # Transform
        output = self.transformer(
            src=src,
            tgt=tgt,
            tgt_mask=tgt_mask,
            src_key_padding_mask=src_key_padding_mask,
            tgt_key_padding_mask=tgt_key_padding_mask
        )
        
        # Project
        logits = self.output_proj(output)
        return logits

In [7]:
def train_epoch(model, dataloader, optimizer, criterion, device, epoch):
    model.train()
    total_loss = 0
    total_tokens = 0
    start_time = time.time()
    
    for batch_idx, batch in enumerate(dataloader):
        src = batch["src"].to(device)
        src_lengths = batch["src_lengths"].to(device)
        tgt = batch["tgt"].to(device)
        
        # Create masks
        src_key_padding_mask = (
            torch.arange(src.size(1), device=device).unsqueeze(0) >= src_lengths.unsqueeze(1)
        )
        
        # Teacher forcing: input vs output
        tgt_input = tgt[:, :-1].clone()
        tgt_output = tgt[:, 1:].clone()
        tgt_input[tgt_input == -100] = model.pad_idx
        tgt_key_padding_mask = (tgt[:, :-1] == -100)
        
        # Forward
        optimizer.zero_grad()
        logits = model(src, tgt_input, src_key_padding_mask, tgt_key_padding_mask)
        loss = criterion(logits.reshape(-1, logits.size(-1)), tgt_output.reshape(-1))
        
        # Backward
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        
        # Stats
        num_tokens = (tgt_output != -100).sum().item()
        total_loss += loss.item() * num_tokens
        total_tokens += num_tokens
        
        if (batch_idx + 1) % 10 == 0:
            elapsed = time.time() - start_time
            print(f"Epoch {epoch} | Batch {batch_idx+1}/{len(dataloader)} | "
                  f"Loss: {total_loss/total_tokens:.4f} | Time: {elapsed:.1f}s")
    
    return total_loss / total_tokens


def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    total_tokens = 0
    
    with torch.no_grad():
        for batch in dataloader:
            src = batch["src"].to(device)
            src_lengths = batch["src_lengths"].to(device)
            tgt = batch["tgt"].to(device)
            
            src_key_padding_mask = (
                torch.arange(src.size(1), device=device).unsqueeze(0) >= src_lengths.unsqueeze(1)
            )
            
            tgt_input = tgt[:, :-1].clone()
            tgt_output = tgt[:, 1:].clone()
            tgt_input[tgt_input == -100] = model.pad_idx
            tgt_key_padding_mask = (tgt[:, :-1] == -100)
            
            logits = model(src, tgt_input, src_key_padding_mask, tgt_key_padding_mask)
            loss = criterion(logits.reshape(-1, logits.size(-1)), tgt_output.reshape(-1))
            
            num_tokens = (tgt_output != -100).sum().item()
            total_loss += loss.item() * num_tokens
            total_tokens += num_tokens
    
    return total_loss / total_tokens

In [8]:
print("Loading datasets...")
train_dataset = SignLanguageDataset(
    CONFIG["features_dir"] + "/train",
    CONFIG["labels_dir"] + "/train_labels.csv",
    CONFIG["vocab_file"]
)
dev_dataset = SignLanguageDataset(
    CONFIG["features_dir"] + "/dev",
    CONFIG["labels_dir"] + "/dev_labels.csv",
    CONFIG["vocab_file"]
)

# Create dataloaders
train_loader = DataLoader(
    train_dataset, batch_size=CONFIG["batch_size"], 
    shuffle=True, collate_fn=collate_fn, num_workers=0
)
dev_loader = DataLoader(
    dev_dataset, batch_size=CONFIG["batch_size"],
    shuffle=False, collate_fn=collate_fn, num_workers=0
)

print(f"Train batches: {len(train_loader)}")
print(f"Dev batches: {len(dev_loader)}")

# Create model
print("\nCreating model...")
model = Seq2SeqTransformer(
    src_feature_dim=172,
    tgt_vocab_size=len(train_dataset.vocab),
    d_model=CONFIG["d_model"],
    nhead=CONFIG["nhead"],
    num_encoder_layers=CONFIG["num_encoder_layers"],
    num_decoder_layers=CONFIG["num_decoder_layers"],
    dim_feedforward=CONFIG["dim_feedforward"],
    dropout=CONFIG["dropout"],
    pad_idx=train_dataset.pad_idx
).to(CONFIG["device"])

print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")

# Setup training
criterion = nn.CrossEntropyLoss(ignore_index=-100)
optimizer = optim.Adam(model.parameters(), lr=CONFIG["learning_rate"])
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=2, factor=0.5)


Loading datasets...
Loaded 9500 samples
Loaded 949 samples
Train batches: 297
Dev batches: 30

Creating model...
Model parameters: 7,770,030


In [9]:
print("\n" + "="*60)
print("STARTING TRAINING")
print("="*60 + "\n")
checkpoint = torch.load(f"{CONFIG['models_dir']}/best_model.pt")
model.load_state_dict(checkpoint['model_state_dict'])
best_val_loss = float('inf')

for epoch in range(1, CONFIG["num_epochs"] + 1):
    print(f"\nEpoch {epoch}/{CONFIG['num_epochs']}")
    print("-" * 60)
    
    # Train
    train_loss = train_epoch(model, train_loader, optimizer, criterion, CONFIG["device"], epoch)
    print(f"Train Loss: {train_loss:.4f}")
    
    # Evaluate
    val_loss = evaluate(model, dev_loader, criterion, CONFIG["device"])
    print(f"Val Loss: {val_loss:.4f}")
    
    # Schedule
    scheduler.step(val_loss)
    lr = optimizer.param_groups[0]['lr']
    print(f"Learning Rate: {lr:.6f}")
    
    # Save best
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'val_loss': val_loss
        }, f"{CONFIG['models_dir']}/best_model.pt")
        print(f"✓ Saved best model")

print("\n" + "="*60)
print(f"TRAINING COMPLETE - Best val loss: {best_val_loss:.4f}")
print("="*60)



STARTING TRAINING


Epoch 1/50
------------------------------------------------------------


  checkpoint = torch.load(f"{CONFIG['models_dir']}/best_model.pt")


Epoch 1 | Batch 10/297 | Loss: 0.1297 | Time: 4.1s
Epoch 1 | Batch 20/297 | Loss: 0.1291 | Time: 8.0s
Epoch 1 | Batch 30/297 | Loss: 0.1237 | Time: 11.7s
Epoch 1 | Batch 40/297 | Loss: 0.1210 | Time: 15.5s
Epoch 1 | Batch 50/297 | Loss: 0.1169 | Time: 19.3s
Epoch 1 | Batch 60/297 | Loss: 0.1196 | Time: 23.0s
Epoch 1 | Batch 70/297 | Loss: 0.1214 | Time: 26.9s
Epoch 1 | Batch 80/297 | Loss: 0.1211 | Time: 30.7s
Epoch 1 | Batch 90/297 | Loss: 0.1203 | Time: 34.8s
Epoch 1 | Batch 100/297 | Loss: 0.1210 | Time: 38.9s
Epoch 1 | Batch 110/297 | Loss: 0.1187 | Time: 42.8s
Epoch 1 | Batch 120/297 | Loss: 0.1179 | Time: 46.5s
Epoch 1 | Batch 130/297 | Loss: 0.1176 | Time: 50.3s
Epoch 1 | Batch 140/297 | Loss: 0.1188 | Time: 54.3s
Epoch 1 | Batch 150/297 | Loss: 0.1190 | Time: 58.3s
Epoch 1 | Batch 160/297 | Loss: 0.1199 | Time: 62.0s
Epoch 1 | Batch 170/297 | Loss: 0.1203 | Time: 65.8s
Epoch 1 | Batch 180/297 | Loss: 0.1203 | Time: 69.6s
Epoch 1 | Batch 190/297 | Loss: 0.1203 | Time: 73.3s
Epoc

  output = torch._nested_tensor_from_mask(


Val Loss: 0.4153
Learning Rate: 0.000300
✓ Saved best model

Epoch 2/50
------------------------------------------------------------
Epoch 2 | Batch 10/297 | Loss: 0.0922 | Time: 2.1s
Epoch 2 | Batch 20/297 | Loss: 0.0924 | Time: 4.0s
Epoch 2 | Batch 30/297 | Loss: 0.0997 | Time: 5.7s
Epoch 2 | Batch 40/297 | Loss: 0.1057 | Time: 7.4s
Epoch 2 | Batch 50/297 | Loss: 0.1102 | Time: 9.2s
Epoch 2 | Batch 60/297 | Loss: 0.1096 | Time: 11.0s
Epoch 2 | Batch 70/297 | Loss: 0.1117 | Time: 12.8s
Epoch 2 | Batch 80/297 | Loss: 0.1123 | Time: 14.5s
Epoch 2 | Batch 90/297 | Loss: 0.1098 | Time: 16.2s
Epoch 2 | Batch 100/297 | Loss: 0.1107 | Time: 18.0s
Epoch 2 | Batch 110/297 | Loss: 0.1105 | Time: 19.7s
Epoch 2 | Batch 120/297 | Loss: 0.1097 | Time: 21.2s
Epoch 2 | Batch 130/297 | Loss: 0.1094 | Time: 22.8s
Epoch 2 | Batch 140/297 | Loss: 0.1102 | Time: 24.9s
Epoch 2 | Batch 150/297 | Loss: 0.1122 | Time: 26.6s
Epoch 2 | Batch 160/297 | Loss: 0.1128 | Time: 28.3s
Epoch 2 | Batch 170/297 | Loss: 0

Epoch 6 | Batch 280/297 | Loss: 0.1090 | Time: 49.6s
Epoch 6 | Batch 290/297 | Loss: 0.1093 | Time: 51.3s
Train Loss: 0.1086
Val Loss: 0.3927
Learning Rate: 0.000150

Epoch 7/50
------------------------------------------------------------
Epoch 7 | Batch 10/297 | Loss: 0.0826 | Time: 1.7s
Epoch 7 | Batch 20/297 | Loss: 0.0757 | Time: 3.4s
Epoch 7 | Batch 30/297 | Loss: 0.0690 | Time: 5.2s
Epoch 7 | Batch 40/297 | Loss: 0.0691 | Time: 6.8s
Epoch 7 | Batch 50/297 | Loss: 0.0693 | Time: 8.7s
Epoch 7 | Batch 60/297 | Loss: 0.0670 | Time: 10.4s
Epoch 7 | Batch 70/297 | Loss: 0.0656 | Time: 12.3s
Epoch 7 | Batch 80/297 | Loss: 0.0646 | Time: 14.1s
Epoch 7 | Batch 90/297 | Loss: 0.0628 | Time: 15.7s
Epoch 7 | Batch 100/297 | Loss: 0.0627 | Time: 17.3s
Epoch 7 | Batch 110/297 | Loss: 0.0621 | Time: 19.0s
Epoch 7 | Batch 120/297 | Loss: 0.0618 | Time: 20.9s
Epoch 7 | Batch 130/297 | Loss: 0.0608 | Time: 22.6s
Epoch 7 | Batch 140/297 | Loss: 0.0602 | Time: 24.4s
Epoch 7 | Batch 150/297 | Loss: 0

Epoch 11 | Batch 250/297 | Loss: 0.0325 | Time: 44.6s
Epoch 11 | Batch 260/297 | Loss: 0.0327 | Time: 46.2s
Epoch 11 | Batch 270/297 | Loss: 0.0329 | Time: 48.1s
Epoch 11 | Batch 280/297 | Loss: 0.0329 | Time: 49.9s
Epoch 11 | Batch 290/297 | Loss: 0.0327 | Time: 51.6s
Train Loss: 0.0331
Val Loss: 0.3589
Learning Rate: 0.000150

Epoch 12/50
------------------------------------------------------------
Epoch 12 | Batch 10/297 | Loss: 0.0274 | Time: 1.7s
Epoch 12 | Batch 20/297 | Loss: 0.0342 | Time: 3.3s
Epoch 12 | Batch 30/297 | Loss: 0.0341 | Time: 5.3s
Epoch 12 | Batch 40/297 | Loss: 0.0362 | Time: 7.1s
Epoch 12 | Batch 50/297 | Loss: 0.0351 | Time: 9.1s
Epoch 12 | Batch 60/297 | Loss: 0.0340 | Time: 10.9s
Epoch 12 | Batch 70/297 | Loss: 0.0332 | Time: 12.6s
Epoch 12 | Batch 80/297 | Loss: 0.0326 | Time: 14.3s
Epoch 12 | Batch 90/297 | Loss: 0.0314 | Time: 16.0s
Epoch 12 | Batch 100/297 | Loss: 0.0316 | Time: 17.8s
Epoch 12 | Batch 110/297 | Loss: 0.0311 | Time: 19.5s
Epoch 12 | Batch

Epoch 16 | Batch 210/297 | Loss: 0.0108 | Time: 37.9s
Epoch 16 | Batch 220/297 | Loss: 0.0109 | Time: 39.6s
Epoch 16 | Batch 230/297 | Loss: 0.0110 | Time: 41.4s
Epoch 16 | Batch 240/297 | Loss: 0.0115 | Time: 43.2s
Epoch 16 | Batch 250/297 | Loss: 0.0116 | Time: 45.1s
Epoch 16 | Batch 260/297 | Loss: 0.0114 | Time: 46.8s
Epoch 16 | Batch 270/297 | Loss: 0.0114 | Time: 48.4s
Epoch 16 | Batch 280/297 | Loss: 0.0114 | Time: 50.3s
Epoch 16 | Batch 290/297 | Loss: 0.0113 | Time: 52.2s
Train Loss: 0.0112
Val Loss: 0.3559
Learning Rate: 0.000037

Epoch 17/50
------------------------------------------------------------
Epoch 17 | Batch 10/297 | Loss: 0.0111 | Time: 1.7s
Epoch 17 | Batch 20/297 | Loss: 0.0126 | Time: 3.5s
Epoch 17 | Batch 30/297 | Loss: 0.0101 | Time: 5.3s
Epoch 17 | Batch 40/297 | Loss: 0.0108 | Time: 6.9s
Epoch 17 | Batch 50/297 | Loss: 0.0110 | Time: 8.5s
Epoch 17 | Batch 60/297 | Loss: 0.0107 | Time: 10.3s
Epoch 17 | Batch 70/297 | Loss: 0.0112 | Time: 11.9s
Epoch 17 | Bat

Epoch 21 | Batch 170/297 | Loss: 0.0049 | Time: 30.2s
Epoch 21 | Batch 180/297 | Loss: 0.0050 | Time: 31.9s
Epoch 21 | Batch 190/297 | Loss: 0.0050 | Time: 34.0s
Epoch 21 | Batch 200/297 | Loss: 0.0051 | Time: 35.6s
Epoch 21 | Batch 210/297 | Loss: 0.0051 | Time: 37.5s
Epoch 21 | Batch 220/297 | Loss: 0.0050 | Time: 39.3s
Epoch 21 | Batch 230/297 | Loss: 0.0050 | Time: 41.0s
Epoch 21 | Batch 240/297 | Loss: 0.0050 | Time: 42.7s
Epoch 21 | Batch 250/297 | Loss: 0.0049 | Time: 44.5s
Epoch 21 | Batch 260/297 | Loss: 0.0050 | Time: 46.3s
Epoch 21 | Batch 270/297 | Loss: 0.0050 | Time: 48.0s
Epoch 21 | Batch 280/297 | Loss: 0.0051 | Time: 49.9s
Epoch 21 | Batch 290/297 | Loss: 0.0050 | Time: 51.7s
Train Loss: 0.0050
Val Loss: 0.3288
Learning Rate: 0.000019

Epoch 22/50
------------------------------------------------------------
Epoch 22 | Batch 10/297 | Loss: 0.0038 | Time: 1.6s
Epoch 22 | Batch 20/297 | Loss: 0.0037 | Time: 3.4s
Epoch 22 | Batch 30/297 | Loss: 0.0040 | Time: 5.3s
Epoch 22

Epoch 26 | Batch 130/297 | Loss: 0.0039 | Time: 23.1s
Epoch 26 | Batch 140/297 | Loss: 0.0038 | Time: 24.9s
Epoch 26 | Batch 150/297 | Loss: 0.0038 | Time: 26.8s
Epoch 26 | Batch 160/297 | Loss: 0.0038 | Time: 28.6s
Epoch 26 | Batch 170/297 | Loss: 0.0038 | Time: 30.2s
Epoch 26 | Batch 180/297 | Loss: 0.0037 | Time: 31.9s
Epoch 26 | Batch 190/297 | Loss: 0.0037 | Time: 33.6s
Epoch 26 | Batch 200/297 | Loss: 0.0037 | Time: 35.2s
Epoch 26 | Batch 210/297 | Loss: 0.0037 | Time: 37.0s
Epoch 26 | Batch 220/297 | Loss: 0.0037 | Time: 38.8s
Epoch 26 | Batch 230/297 | Loss: 0.0038 | Time: 40.5s
Epoch 26 | Batch 240/297 | Loss: 0.0038 | Time: 42.3s
Epoch 26 | Batch 250/297 | Loss: 0.0038 | Time: 44.1s
Epoch 26 | Batch 260/297 | Loss: 0.0040 | Time: 46.1s
Epoch 26 | Batch 270/297 | Loss: 0.0040 | Time: 47.9s
Epoch 26 | Batch 280/297 | Loss: 0.0040 | Time: 49.7s
Epoch 26 | Batch 290/297 | Loss: 0.0040 | Time: 51.6s
Train Loss: 0.0041
Val Loss: 0.3268
Learning Rate: 0.000005

Epoch 27/50
---------

Epoch 31 | Batch 90/297 | Loss: 0.0037 | Time: 15.9s
Epoch 31 | Batch 100/297 | Loss: 0.0035 | Time: 17.8s
Epoch 31 | Batch 110/297 | Loss: 0.0034 | Time: 19.6s
Epoch 31 | Batch 120/297 | Loss: 0.0034 | Time: 21.3s
Epoch 31 | Batch 130/297 | Loss: 0.0034 | Time: 23.1s
Epoch 31 | Batch 140/297 | Loss: 0.0034 | Time: 24.8s
Epoch 31 | Batch 150/297 | Loss: 0.0033 | Time: 26.5s
Epoch 31 | Batch 160/297 | Loss: 0.0034 | Time: 28.3s
Epoch 31 | Batch 170/297 | Loss: 0.0034 | Time: 30.2s
Epoch 31 | Batch 180/297 | Loss: 0.0034 | Time: 31.7s
Epoch 31 | Batch 190/297 | Loss: 0.0034 | Time: 33.6s
Epoch 31 | Batch 200/297 | Loss: 0.0034 | Time: 35.2s
Epoch 31 | Batch 210/297 | Loss: 0.0033 | Time: 37.1s
Epoch 31 | Batch 220/297 | Loss: 0.0033 | Time: 38.8s
Epoch 31 | Batch 230/297 | Loss: 0.0033 | Time: 40.7s
Epoch 31 | Batch 240/297 | Loss: 0.0033 | Time: 42.6s
Epoch 31 | Batch 250/297 | Loss: 0.0034 | Time: 44.5s
Epoch 31 | Batch 260/297 | Loss: 0.0033 | Time: 46.4s
Epoch 31 | Batch 270/297 | Lo

Epoch 36 | Batch 50/297 | Loss: 0.0043 | Time: 8.8s
Epoch 36 | Batch 60/297 | Loss: 0.0045 | Time: 10.4s
Epoch 36 | Batch 70/297 | Loss: 0.0043 | Time: 12.5s
Epoch 36 | Batch 80/297 | Loss: 0.0043 | Time: 14.3s
Epoch 36 | Batch 90/297 | Loss: 0.0041 | Time: 16.1s
Epoch 36 | Batch 100/297 | Loss: 0.0041 | Time: 17.9s
Epoch 36 | Batch 110/297 | Loss: 0.0040 | Time: 19.6s
Epoch 36 | Batch 120/297 | Loss: 0.0039 | Time: 21.2s
Epoch 36 | Batch 130/297 | Loss: 0.0039 | Time: 23.2s
Epoch 36 | Batch 140/297 | Loss: 0.0042 | Time: 25.1s
Epoch 36 | Batch 150/297 | Loss: 0.0041 | Time: 27.0s
Epoch 36 | Batch 160/297 | Loss: 0.0040 | Time: 28.8s
Epoch 36 | Batch 170/297 | Loss: 0.0040 | Time: 30.4s
Epoch 36 | Batch 180/297 | Loss: 0.0039 | Time: 32.1s
Epoch 36 | Batch 190/297 | Loss: 0.0040 | Time: 33.7s
Epoch 36 | Batch 200/297 | Loss: 0.0039 | Time: 35.6s
Epoch 36 | Batch 210/297 | Loss: 0.0038 | Time: 37.2s
Epoch 36 | Batch 220/297 | Loss: 0.0038 | Time: 39.0s
Epoch 36 | Batch 230/297 | Loss: 0

Epoch 41 | Batch 10/297 | Loss: 0.0026 | Time: 1.7s
Epoch 41 | Batch 20/297 | Loss: 0.0027 | Time: 3.4s
Epoch 41 | Batch 30/297 | Loss: 0.0036 | Time: 5.1s
Epoch 41 | Batch 40/297 | Loss: 0.0037 | Time: 6.8s
Epoch 41 | Batch 50/297 | Loss: 0.0042 | Time: 8.9s
Epoch 41 | Batch 60/297 | Loss: 0.0040 | Time: 10.6s
Epoch 41 | Batch 70/297 | Loss: 0.0039 | Time: 12.5s
Epoch 41 | Batch 80/297 | Loss: 0.0038 | Time: 14.3s
Epoch 41 | Batch 90/297 | Loss: 0.0036 | Time: 16.1s
Epoch 41 | Batch 100/297 | Loss: 0.0038 | Time: 17.7s
Epoch 41 | Batch 110/297 | Loss: 0.0037 | Time: 19.5s
Epoch 41 | Batch 120/297 | Loss: 0.0037 | Time: 21.3s
Epoch 41 | Batch 130/297 | Loss: 0.0036 | Time: 22.9s
Epoch 41 | Batch 140/297 | Loss: 0.0038 | Time: 24.6s
Epoch 41 | Batch 150/297 | Loss: 0.0037 | Time: 26.3s
Epoch 41 | Batch 160/297 | Loss: 0.0039 | Time: 28.3s
Epoch 41 | Batch 170/297 | Loss: 0.0038 | Time: 30.1s
Epoch 41 | Batch 180/297 | Loss: 0.0039 | Time: 31.8s
Epoch 41 | Batch 190/297 | Loss: 0.0039 | 

Epoch 45 | Batch 290/297 | Loss: 0.0032 | Time: 51.4s
Train Loss: 0.0032
Val Loss: 0.3260
Learning Rate: 0.000000

Epoch 46/50
------------------------------------------------------------
Epoch 46 | Batch 10/297 | Loss: 0.0028 | Time: 2.1s
Epoch 46 | Batch 20/297 | Loss: 0.0030 | Time: 3.8s
Epoch 46 | Batch 30/297 | Loss: 0.0025 | Time: 5.5s
Epoch 46 | Batch 40/297 | Loss: 0.0030 | Time: 7.2s
Epoch 46 | Batch 50/297 | Loss: 0.0030 | Time: 9.2s
Epoch 46 | Batch 60/297 | Loss: 0.0032 | Time: 10.9s
Epoch 46 | Batch 70/297 | Loss: 0.0034 | Time: 12.8s
Epoch 46 | Batch 80/297 | Loss: 0.0032 | Time: 14.3s
Epoch 46 | Batch 90/297 | Loss: 0.0032 | Time: 16.1s
Epoch 46 | Batch 100/297 | Loss: 0.0033 | Time: 17.8s
Epoch 46 | Batch 110/297 | Loss: 0.0031 | Time: 19.5s
Epoch 46 | Batch 120/297 | Loss: 0.0032 | Time: 21.2s
Epoch 46 | Batch 130/297 | Loss: 0.0033 | Time: 23.1s
Epoch 46 | Batch 140/297 | Loss: 0.0032 | Time: 24.8s
Epoch 46 | Batch 150/297 | Loss: 0.0033 | Time: 26.6s
Epoch 46 | Batch

Epoch 50 | Batch 250/297 | Loss: 0.0032 | Time: 44.7s
Epoch 50 | Batch 260/297 | Loss: 0.0031 | Time: 46.3s
Epoch 50 | Batch 270/297 | Loss: 0.0032 | Time: 47.9s
Epoch 50 | Batch 280/297 | Loss: 0.0032 | Time: 49.8s
Epoch 50 | Batch 290/297 | Loss: 0.0032 | Time: 51.4s
Train Loss: 0.0032
Val Loss: 0.3261
Learning Rate: 0.000000

TRAINING COMPLETE - Best val loss: 0.3219


In [10]:
# Load best model
print("Loading best model...")
checkpoint = torch.load(f"{CONFIG['models_dir']}/best_model.pt")
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()

# Inference function
def decode_predictions(model, dataloader, vocab, sos_idx, eos_idx, device):
    """Generate predictions for entire dataset."""
    all_preds = []
    all_refs = []
    all_ids = []
    
    for batch in tqdm(dataloader, desc="Generating predictions"):
        src = batch["src"].to(device)
        src_lengths = batch["src_lengths"].to(device)
        tgt = batch["tgt"].to(device)
        
        # Create source mask
        src_key_padding_mask = (
            torch.arange(src.size(1), device=device).unsqueeze(0) >= src_lengths.unsqueeze(1)
        )
        
        # Greedy decode
        with torch.no_grad():
            # Generate sequences
            predictions = []
            for i in range(src.size(0)):
                pred = []
                tgt_tokens = torch.full((1, 1), sos_idx, dtype=torch.long, device=device)
                
                for _ in range(100):  # max length
                    logits = model(src[i:i+1], tgt_tokens, 
                                  src_key_padding_mask[i:i+1], None)
                    next_token = logits[0, -1].argmax().item()
                    
                    if next_token == eos_idx:
                        break
                    pred.append(next_token)
                    tgt_tokens = torch.cat([tgt_tokens, 
                                           torch.tensor([[next_token]], device=device)], dim=1)
                
                predictions.append(pred)
        
        # Decode to text
        for i, pred_indices in enumerate(predictions):
            # Prediction
            pred_text = " ".join([vocab[idx] for idx in pred_indices 
                                 if idx < len(vocab) and vocab[idx] not in ["<pad>", "<sos>", "<eos>"]])
            
            # Reference
            ref_indices = tgt[i][tgt[i] != -100].cpu().tolist()
            ref_text = " ".join([vocab[idx] for idx in ref_indices 
                                if idx < len(vocab) and vocab[idx] not in ["<pad>", "<sos>", "<eos>"]])
            
            all_preds.append(pred_text)
            all_refs.append(ref_text)
            all_ids.append(batch["sample_ids"][i])
    
    return all_ids, all_preds, all_refs

# Generate predictions on dev set
print("\nGenerating predictions on dev set...")
sample_ids, predictions, references = decode_predictions(
    model, dev_loader, dev_dataset.vocab, 
    dev_dataset.sos_idx, dev_dataset.eos_idx, CONFIG["device"]
)

# Save results
results_df = pd.DataFrame({
    "sample_id": sample_ids,
    "reference": references,
    "prediction": predictions
})
results_df.to_csv(f"{CONFIG['output_dir']}/predictions.csv", index=False, encoding='utf-8')
print(f"✓ Saved predictions to {CONFIG['output_dir']}/predictions.csv")

# Show examples
print("\n" + "="*60)
print("SAMPLE PREDICTIONS")
print("="*60)
for i in range(min(10, len(results_df))):
    print(f"\nSample {results_df.iloc[i]['sample_id']}")
    print(f"REF: {results_df.iloc[i]['reference']}")
    print(f"PRED: {results_df.iloc[i]['prediction']}")
    print("-" * 40)

# Calculate accuracy (exact match)
exact_matches = (results_df["reference"] == results_df["prediction"]).sum()
accuracy = exact_matches / len(results_df) * 100
print(f"\n{'='*60}")
print(f"Exact Match Accuracy: {accuracy:.2f}% ({exact_matches}/{len(results_df)})")
print(f"{'='*60}")

  checkpoint = torch.load(f"{CONFIG['models_dir']}/best_model.pt")


Loading best model...

Generating predictions on dev set...


Generating predictions: 100%|██████████████████████████████████████████████████████████| 30/30 [00:39<00:00,  1.32s/it]

✓ Saved predictions to isharah_clean/predictions.csv

SAMPLE PREDICTIONS

Sample 02_0032
REF: انا رغبه كوب شراء
PRED: انا رغبه كوب شراء
----------------------------------------

Sample 02_0021
REF: الان واحد شهر عشر
PRED: الان واحد شهر عشر
----------------------------------------

Sample 02_0022
REF: انا ذهاب مدرسه سبب كره_قدم
PRED: انا ذهاب مدرسه سبب كره_قدم
----------------------------------------

Sample 02_0004
REF: هو معلم لا انا مدرسه
PRED: انا ذهاب صيدليه مع اب
----------------------------------------

Sample 02_0030
REF: انا رغبه ذهاب لوحه بقاله
PRED: انا رغبه ذهاب لوحه بقاله
----------------------------------------

Sample 02_0008
REF: انا اسره رقم ثلاث اشخاص
PRED: اسره ام اب اخ ولد ثلاث
----------------------------------------

Sample 02_0028
REF: انا رغبه ذهاب مكتبه بعد صيف
PRED: انا رغبه ذهاب مكتبه بعد صيف
----------------------------------------

Sample 02_0007
REF: استفهام هو صديق مدرسه
PRED: هو عوده مع هو
----------------------------------------

Sample 02_0016
REF: هو م




In [11]:
def beam_search(model, src, src_mask, sos_idx, eos_idx, device, beam_size=10, max_len=100):
    """
    src: (1, T)
    src_mask: (1, T)
    """
    # Each beam entry: (log_prob, sequence_tensor)
    beams = [(0.0, torch.tensor([[sos_idx]], dtype=torch.long, device=device))]

    finished = []

    for _ in range(max_len):
        new_beams = []
        for log_prob, seq in beams:
            # Stop if already ended
            if seq[0, -1].item() == eos_idx:
                finished.append((log_prob, seq))
                continue

            # Model forward
            logits = model(src, seq, src_mask, None)  # shape: (1, L, vocab)
            next_logits = logits[0, -1]               # last timestep
            log_probs = torch.log_softmax(next_logits, dim=-1)

            # Get top beam_size candidates
            topk_log_probs, topk_ids = torch.topk(log_probs, beam_size)

            for i in range(beam_size):
                new_log_prob = log_prob + topk_log_probs[i].item()
                new_seq = torch.cat([seq, topk_ids[i].view(1,1)], dim=1)
                new_beams.append((new_log_prob, new_seq))

        # Keep the top beams for the next step
        new_beams.sort(key=lambda x: x[0], reverse=True)
        beams = new_beams[:beam_size]

        # If all beams ended → stop early
        if len(finished) == beam_size:
            break

    # Combine finished + ongoing beams
    all_candidates = finished + beams
    all_candidates.sort(key=lambda x: x[0], reverse=True)
    
    # Return best sequence (tensor of shape (1, length))
    return all_candidates[0][1]

In [12]:
# Load best model
print("Loading best model...")
checkpoint = torch.load(f"{CONFIG['models_dir']}/best_model.pt")
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()

# Inference function
def decode_predictions(model, dataloader, vocab, sos_idx, eos_idx, device):
    """Generate predictions for entire dataset."""
    all_preds = []
    all_refs = []
    all_ids = []
    
    for batch in tqdm(dataloader, desc="Generating predictions"):
        src = batch["src"].to(device)
        src_lengths = batch["src_lengths"].to(device)
        tgt = batch["tgt"].to(device)

        # Create source mask
        src_key_padding_mask = (
            torch.arange(src.size(1), device=device).unsqueeze(0) >= src_lengths.unsqueeze(1)
        )

        predictions = []   # <-- FIXED (this was missing)

        with torch.no_grad():
            # Beam search for each sample in batch
            for i in range(src.size(0)):
                seq = beam_search(
                    model,
                    src[i:i+1],
                    src_key_padding_mask[i:i+1],
                    sos_idx,
                    eos_idx,
                    device,
                    beam_size=10,
                    max_len=100
                )

                # Convert tensor sequence to python list (skip <sos>)
                pred_indices = seq[0, 1:].tolist()
                predictions.append(pred_indices)

        # Decode predictions + references
        for i, pred_indices in enumerate(predictions):

            # Decode prediction -> text
            pred_text = " ".join([
                vocab[idx] for idx in pred_indices
                if idx < len(vocab) and vocab[idx] not in ["<pad>", "<sos>", "<eos>"]
            ])

            # Decode reference (ignore -100 because of CE loss)
            ref_indices = tgt[i][tgt[i] != -100].cpu().tolist()
            ref_text = " ".join([
                vocab[idx] for idx in ref_indices
                if idx < len(vocab) and vocab[idx] not in ["<pad>", "<sos>", "<eos>"]
            ])

            all_preds.append(pred_text)
            all_refs.append(ref_text)
            all_ids.append(batch["sample_ids"][i])

    return all_ids, all_preds, all_refs


# Generate predictions on dev set
print("\nGenerating predictions on dev set...")
sample_ids, predictions, references = decode_predictions(
    model, dev_loader, dev_dataset.vocab,
    dev_dataset.sos_idx, dev_dataset.eos_idx, CONFIG["device"]
)

# Save results
results_df = pd.DataFrame({
    "sample_id": sample_ids,
    "reference": references,
    "prediction": predictions
})
results_df.to_csv(f"{CONFIG['output_dir']}/predictions.csv", index=False, encoding='utf-8')
print(f"✓ Saved predictions to {CONFIG['output_dir']}/predictions.csv")

# Show examples
print("\n" + "="*60)
print("SAMPLE PREDICTIONS")
print("="*60)
for i in range(min(10, len(results_df))):
    print(f"\nSample {results_df.iloc[i]['sample_id']}")
    print(f"REF: {results_df.iloc[i]['reference']}")
    print(f"PRED: {results_df.iloc[i]['prediction']}")
    print("-" * 40)

# Calculate accuracy (exact match)
exact_matches = (results_df["reference"] == results_df["prediction"]).sum()
accuracy = exact_matches / len(results_df) * 100
print(f"\n{'='*60}")
print(f"Exact Match Accuracy: {accuracy:.2f}% ({exact_matches}/{len(results_df)})")
print(f"{'='*60}")


  checkpoint = torch.load(f"{CONFIG['models_dir']}/best_model.pt")


Loading best model...

Generating predictions on dev set...


Generating predictions: 100%|█████████████████████████████████████████████████████████| 30/30 [50:10<00:00, 100.33s/it]

✓ Saved predictions to isharah_clean/predictions.csv

SAMPLE PREDICTIONS

Sample 02_0032
REF: انا رغبه كوب شراء
PRED: انا رغبه كوب شراء
----------------------------------------

Sample 02_0021
REF: الان واحد شهر عشر
PRED: الان واحد شهر عشر
----------------------------------------

Sample 02_0022
REF: انا ذهاب مدرسه سبب كره_قدم
PRED: انا ذهاب مدرسه سبب كره_قدم
----------------------------------------

Sample 02_0004
REF: هو معلم لا انا مدرسه
PRED: انا ذهاب صيدليه مع اب
----------------------------------------

Sample 02_0030
REF: انا رغبه ذهاب لوحه بقاله
PRED: انا رغبه ذهاب لوحه بقاله
----------------------------------------

Sample 02_0008
REF: انا اسره رقم ثلاث اشخاص
PRED: اسره ام اب اخ ولد ثلاث
----------------------------------------

Sample 02_0028
REF: انا رغبه ذهاب مكتبه بعد صيف
PRED: انا رغبه ذهاب مكتبه بعد صيف
----------------------------------------

Sample 02_0007
REF: استفهام هو صديق مدرسه
PRED: هو عوده مع هو
----------------------------------------

Sample 02_0016
REF: هو م




In [23]:
# Load test pickle
print("Loading test data...")
test_pickle_path = r"C:\Users\engmr\Downloads\public_si_dat\pose_data_isharah1000_SI_test.pkl"
with open(test_pickle_path, "rb") as f:
    test_data = pickle.load(f)

print(f"Loaded {len(test_data)} test samples")

# Process test features (same as train/dev preprocessing)
def process_test_features(pickle_data, output_dir):
    """Process and save test features."""
    features_dir = Path(output_dir) / "features" / "test"
    features_dir.mkdir(parents=True, exist_ok=True)
    
    processed = []
    print("\nProcessing test features...")
    for sample_id in tqdm(pickle_data.keys()):
        try:
            keypoints = pickle_data[sample_id]["keypoints"]
            features = normalize_keypoints(keypoints)
            
            # Save
            feat_path = features_dir / f"{sample_id}.npy"
            np.save(feat_path, features)
            
            processed.append({
                "sample_id": sample_id,
                "num_frames": len(features)
            })
        except Exception as e:
            print(f"Error processing {sample_id}: {e}")
    
    # Save metadata
    meta_df = pd.DataFrame(processed)
    meta_path = Path(output_dir) / "labels" / "test_metadata.csv"
    meta_df.to_csv(meta_path, index=False)
    
    print(f"✓ Processed {len(processed)} test samples")
    return meta_df

# Process test data
test_meta = process_test_features(test_data, CONFIG["output_dir"])

# Create test dataset (no labels, so different structure)
class TestDataset(Dataset):
    def __init__(self, features_dir, metadata_csv):
        self.features_dir = Path(features_dir)
        self.metadata = pd.read_csv(metadata_csv)
    
    def __len__(self):
        return len(self.metadata)
    
    def __getitem__(self, idx):
        row = self.metadata.iloc[idx]
        sample_id = str(row["sample_id"])
        
        feat_path = self.features_dir / f"{sample_id}.npy"
        features = np.load(feat_path)
        
        return {
            "features": features,
            "sample_id": sample_id
        }

def test_collate_fn(batch):
    """Collate for test (no targets)."""
    batch = sorted(batch, key=lambda x: len(x["features"]), reverse=True)
    
    max_src_len = max(len(b["features"]) for b in batch)
    feat_dim = batch[0]["features"].shape[1]
    
    src_padded = np.zeros((len(batch), max_src_len, feat_dim), dtype=np.float32)
    src_lengths = []
    
    for i, b in enumerate(batch):
        seq_len = len(b["features"])
        src_padded[i, :seq_len, :] = b["features"]
        src_lengths.append(seq_len)
    
    return {
        "src": torch.from_numpy(src_padded),
        "src_lengths": torch.tensor(src_lengths, dtype=torch.long),
        "sample_ids": [b["sample_id"] for b in batch]
    }

# Create test dataloader
test_dataset = TestDataset(
    CONFIG["features_dir"] + "/test",
    CONFIG["labels_dir"] + "/test_metadata.csv"
)
test_loader = DataLoader(
    test_dataset, batch_size=CONFIG["batch_size"],
    shuffle=False, collate_fn=test_collate_fn, num_workers=0
)

print(f"Test samples: {len(test_dataset)}")
print(f"Test batches: {len(test_loader)}")

# Generate test predictions WITH PRESERVED ORDER
def decode_test_predictions(model, dataset, vocab, sos_idx, eos_idx, device, beam_size=2):
    """Generate predictions for test set - PRESERVES ORIGINAL ORDER."""
    all_preds = []
    all_ids = []
    
    model.eval()
    
    # Process ONE sample at a time to preserve order
    for idx in tqdm(range(len(dataset)), desc="Generating test predictions"):
        sample = dataset[idx]
        sample_id = sample['sample_id']
        
        # Prepare input
        features = torch.from_numpy(sample['features']).unsqueeze(0).to(device)
        src_key_padding_mask = torch.zeros(1, features.size(1), dtype=torch.bool, device=device)
        
        # Beam search
        with torch.no_grad():
            seq = beam_search(
                model,
                features,
                src_key_padding_mask,
                sos_idx,
                eos_idx,
                device,
                beam_size=beam_size,
                max_len=100
            )
        
        # Decode
        pred_indices = seq[0, 1:].tolist()
        pred_text = " ".join([
            vocab[idx] for idx in pred_indices
            if idx < len(vocab) and vocab[idx] not in ["<pad>", "<sos>", "<eos>"]
        ])
        
        all_ids.append(sample_id)
        all_preds.append(pred_text)
    
    return all_ids, all_preds

Loading test data...
Loaded 3800 test samples

Processing test features...


100%|█████████████████████████████████████████████████████████████████████████████| 3800/3800 [00:08<00:00, 453.41it/s]

✓ Processed 3800 test samples
Test samples: 3800
Test batches: 119





In [27]:
# Generate
test_ids, test_predictions = decode_test_predictions(
    model, test_dataset, dev_dataset.vocab,
    dev_dataset.sos_idx, dev_dataset.eos_idx,
    CONFIG["device"], beam_size=5
)

# Save
test_results = pd.DataFrame({
    "id": test_ids,
    "gloss": test_predictions
})
test_results.to_csv(f"{CONFIG['output_dir']}/test.csv", index=False, encoding='utf-8')

print(f"✓ Saved {len(test_results)} predictions")

Generating test predictions: 100%|███████████████████████████████████████████████| 3800/3800 [1:21:44<00:00,  1.29s/it]

✓ Saved 3800 predictions



