# Pre-Processing & Setup

## GPU, Imports & Load Data

In [2]:
# ============================================================================
# URDU SQUAD TRANSLATION - 7-PART APPROACH WITH QUALITY REVIEW
# Complete 130K dataset | VS Code | RTX 4070 8GB
# ============================================================================

# ============================================================================
# CELL 1: DEVICE CHECK (CUDA / MPS / CPU)
# ============================================================================
import torch

print("DEVICE CHECK")
print("="*80)

if torch.cuda.is_available():
    device = "cuda"
    gpu_name = torch.cuda.get_device_name(0)
    gpu_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3)
    print(f"GPU: {gpu_name} | VRAM: {gpu_memory:.1f} GB | CUDA: {torch.version.cuda}")
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    device = "mps"
    print("Apple Silicon GPU (MPS) available")
else:
    device = "cpu"
    print("Falling back to CPU")

print(f"Using device: {device}")
print("="*80)

DEVICE CHECK
Apple Silicon GPU (MPS) available
Using device: mps


In [3]:
# ============================================================================
# CELL 2: IMPORTS
# ============================================================================

import pandas as pd
from contextlib import nullcontext
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from torch.cuda.amp import autocast
from tqdm import tqdm
import warnings
import gc
from datetime import datetime
warnings.filterwarnings('ignore')

print("Libraries imported")

  from .autonotebook import tqdm as notebook_tqdm


Libraries imported


In [4]:
# ============================================================================
# CELL 3: LOAD CSV FILES
# ============================================================================

print("\nLOADING DATA")
print("="*80)

TRAIN_CSV = "/Users/basusmac/Desktop/Github Repositories/NLP Project/SQuAD/train-v2.0-clean.csv"
DEV_CSV = "/Users/basusmac/Desktop/Github Repositories/NLP Project/SQuAD/dev-v2.0-clean.csv"

df_train_full = pd.read_csv(TRAIN_CSV)
df_dev_full = pd.read_csv(DEV_CSV)

print(f"Train: {len(df_train_full):,} rows")
print(f"Dev: {len(df_dev_full):,} rows")
print("="*80)


LOADING DATA
Train: 130,319 rows
Dev: 17,692 rows


In [5]:
# === PATCH A: Normalize SQuAD schema (answers -> answer_text, title, is_impossible) ===
import ast
import pandas as pd

def _safe_parse_answers(x):
    if isinstance(x, dict):
        return x
    if pd.isna(x):
        return {"text": [], "answer_start": []}
    if isinstance(x, str):
        try:
            d = ast.literal_eval(x)
            if isinstance(d, dict) and "text" in d and "answer_start" in d:
                return d
        except Exception:
            pass
    return {"text": [], "answer_start": []}

def normalize_squad_df(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    # unify title field name
    for old in ["article_title", "article", "doc_title"]:
        if old in df.columns and "title" not in df.columns:
            df = df.rename(columns={old: "title"})
            break

    # flatten HF 'answers' -> answer_text/answer_start
    if "answers" in df.columns and "answer_text" not in df.columns:
        ans = df["answers"].apply(_safe_parse_answers)
        df["answer_text"]  = ans.apply(lambda d: d.get("text", [None])[0] if d.get("text") else None)
        df["answer_start"] = ans.apply(lambda d: d.get("answer_start", [None])[0] if d.get("answer_start") else None)

    # infer is_impossible if missing
    if "is_impossible" not in df.columns:
        if "answers" in df.columns:
            ans = df["answers"].apply(_safe_parse_answers)
            df["is_impossible"] = ans.apply(lambda d: len(d.get("text", [])) == 0)
        else:
            df["is_impossible"] = False

    # ensure required columns exist
    for col in ["title","context","question","answer_text","answer_start","is_impossible"]:
        if col not in df.columns:
            df[col] = None

    return df

df_train_full = normalize_squad_df(df_train_full)
df_dev_full   = normalize_squad_df(df_dev_full)

# HOTFIX: guarantee optional column exists (prevents KeyError downstream)
for _df in (df_train_full, df_dev_full):
    if "plausible_answer_text" not in _df.columns:
        _df["plausible_answer_text"] = None

print("Normalized columns (train):", list(df_train_full.columns)[:20], "‚Ä¶")

Normalized columns (train): ['Unnamed: 0', 'data_num', 'paragraph_num', 'id', 'title', 'context', 'question', 'answer', 'is_impossible', 'answer_text', 'answer_start', 'plausible_answer_text'] ‚Ä¶


In [6]:
# === PATCH A: Normalize SQuAD schema (answers -> answer_text, title, is_impossible) ===
import ast
import pandas as pd

def _safe_parse_answers(x):
    if isinstance(x, dict):
        return x
    if pd.isna(x):
        return {"text": [], "answer_start": []}
    if isinstance(x, str):
        try:
            d = ast.literal_eval(x)
            if isinstance(d, dict) and "text" in d and "answer_start" in d:
                return d
        except Exception:
            pass
    return {"text": [], "answer_start": []}

def normalize_squad_df(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    # unify title field name
    for old in ["article_title", "article", "doc_title"]:
        if old in df.columns and "title" not in df.columns:
            df = df.rename(columns={old: "title"})
            break

    # flatten HF 'answers' -> answer_text/answer_start
    if "answers" in df.columns and "answer_text" not in df.columns:
        ans = df["answers"].apply(_safe_parse_answers)
        df["answer_text"]  = ans.apply(lambda d: d.get("text", [None])[0] if d.get("text") else None)
        df["answer_start"] = ans.apply(lambda d: d.get("answer_start", [None])[0] if d.get("answer_start") else None)

    # infer is_impossible if missing
    if "is_impossible" not in df.columns:
        if "answers" in df.columns:
            ans = df["answers"].apply(_safe_parse_answers)
            df["is_impossible"] = ans.apply(lambda d: len(d.get("text", [])) == 0)
        else:
            # if not available, create placeholder
            df["is_impossible"] = False

    # ensure required columns exist
    for col in ["title","context","question","answer_text","answer_start","is_impossible"]:
        if col not in df.columns:
            df[col] = None

    return df

df_train_full = normalize_squad_df(df_train_full)
df_dev_full   = normalize_squad_df(df_dev_full)
print("Normalized columns:", list(df_train_full.columns)[:15], "‚Ä¶")

Normalized columns: ['Unnamed: 0', 'data_num', 'paragraph_num', 'id', 'title', 'context', 'question', 'answer', 'is_impossible', 'answer_text', 'answer_start', 'plausible_answer_text'] ‚Ä¶


## 7 Parts with 1st part 10k, and 6-20k parts split into 5 parts.

In [7]:
# ============================================================================
# CELL 4: SPLIT TRAIN INTO 7 PARTS
# ============================================================================

print("\nSPLITTING TRAIN DATA INTO 7 PARTS")
print("="*80)

# Define splits
splits = [
    ('A', 0, 10000, 10000),
    ('B', 10000, 30000, 20000),
    ('C', 30000, 50000, 20000),
    ('D', 50000, 70000, 20000),
    ('E', 70000, 90000, 20000),
    ('F', 90000, 110000, 20000),
    ('G', 110000, 130319, 20319)
]

# Create part DataFrames
parts = {}
for part_name, start, end, size in splits:
    parts[part_name] = df_train_full.iloc[start:end].copy()
    print(f"Part {part_name}: Rows {start:,}-{end-1:,} ({size:,} rows)")

print(f"\nTotal: {sum([s[3] for s in splits]):,} rows")
print("="*80)


SPLITTING TRAIN DATA INTO 7 PARTS
Part A: Rows 0-9,999 (10,000 rows)
Part B: Rows 10,000-29,999 (20,000 rows)
Part C: Rows 30,000-49,999 (20,000 rows)
Part D: Rows 50,000-69,999 (20,000 rows)
Part E: Rows 70,000-89,999 (20,000 rows)
Part F: Rows 90,000-109,999 (20,000 rows)
Part G: Rows 110,000-130,318 (20,319 rows)

Total: 130,319 rows


## Configurations, Model and Translation Function

In [8]:
# ============================================================================
# CELL 5: CONFIGURATION
# ============================================================================

BATCH_SIZE = 64
MAX_LENGTH = 384
TARGET_LANG = "urd_Arab"
SOURCE_LANG = "eng_Latn"
MODEL_NAME = "facebook/nllb-200-distilled-600M"

print("\nCONFIGURATION")
print("="*80)
print(f"Batch Size: {BATCH_SIZE}")
print(f"Max Length: {MAX_LENGTH}")
print(f"Model: {MODEL_NAME}")
print(f"Target: Urdu")
print("="*80)


CONFIGURATION
Batch Size: 64
Max Length: 384
Model: facebook/nllb-200-distilled-600M
Target: Urdu


In [9]:
# ============================================================================
# CELL 6: LOAD MODEL
# ============================================================================

print("\nLOADING MODEL")
print("="*80)

# Use fp16 only on CUDA; otherwise use fp32
dtype = torch.float16 if device == "cuda" else torch.float32

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=dtype,
)

model.to(device)
model.eval()

# autocast context that works everywhere
if device == "cuda":
    from torch.cuda.amp import autocast as amp_autocast
    amp_ctx = amp_autocast
else:
    amp_ctx = nullcontext  # no autocast on MPS/CPU

print(f"Model on {device} | dtype: {dtype}")
if device == "cuda":
    torch.backends.cudnn.benchmark = True
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    print("CUDA optimizations enabled")
print("="*80)


LOADING MODEL
Model on mps | dtype: torch.float32


In [10]:
# ============================================================================
# CELL 7: TRANSLATION FUNCTIONS
# ============================================================================

@torch.inference_mode()
def translate_batch(texts, target_lang, source_lang="eng_Latn"):
    valid_texts = [str(t) if pd.notna(t) and str(t).strip() != "" else "" for t in texts]
    
    indices_to_translate = [i for i, t in enumerate(valid_texts) if t != ""]
    texts_to_translate = [valid_texts[i] for i in indices_to_translate]
    
    if not texts_to_translate:
        return valid_texts
    
    tokenizer.src_lang = source_lang
    inputs = tokenizer(
        texts_to_translate,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=MAX_LENGTH
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    forced_bos_token_id = tokenizer.convert_tokens_to_ids(target_lang)
    
    with amp_ctx():
        translated_tokens = model.generate(
            **inputs,
            forced_bos_token_id=forced_bos_token_id,
            max_length=MAX_LENGTH,
            num_beams=1,
            do_sample=False,
            use_cache=True,
        )
    
    translations = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
    
    result = valid_texts.copy()
    for idx, translation in zip(indices_to_translate, translations):
        result[idx] = translation
    
    return result


def translate_dataframe(df, part_name, output_file, batch_size=BATCH_SIZE):
    print(f"\nTRANSLATING PART {part_name}")
    print("="*80)
    print(f"Started at: {datetime.now().strftime('%H:%M:%S')}")
    
    num_batches = (len(df) + batch_size - 1) // batch_size
    
    df_urdu = df.copy()
    df_urdu['context_urdu'] = ""
    df_urdu['question_urdu'] = ""
    df_urdu['answer_text_urdu'] = ""
    df_urdu['plausible_answer_text_urdu'] = ""
    
    columns_to_translate = [
        ('context', 'context_urdu'),
        ('question', 'question_urdu'),
        ('answer_text', 'answer_text_urdu'),
        ('plausible_answer_text', 'plausible_answer_text_urdu')
    ]
    
    start_time = datetime.now()
    
    for source_col, target_col in columns_to_translate:
        print(f"\nTranslating: {source_col}")
        
        for batch_idx in tqdm(range(num_batches), desc=f"{source_col}"):
            start_idx = batch_idx * batch_size
            end_idx = min(start_idx + batch_size, len(df))
            
            batch_texts = df[source_col].iloc[start_idx:end_idx].tolist()
            translations = translate_batch(batch_texts, TARGET_LANG, SOURCE_LANG)
            df_urdu.loc[df_urdu.index[start_idx:end_idx], target_col] = translations
            
            if batch_idx % 50 == 0 and batch_idx > 0:
                torch.cuda.empty_cache()
    
    df_urdu.to_csv(output_file, index=False, encoding='utf-8-sig')
    
    elapsed = (datetime.now() - start_time).total_seconds()
    
    print(f"\nPART {part_name} COMPLETE")
    print(f"Time: {elapsed/60:.1f} min")
    print(f"Saved: {output_file}")
    print("="*80)
    
    return df_urdu

print("Functions defined")

Functions defined


In [11]:
# ============================================================================
# CELL 8: QUICK SUMMARY (robust to different SQuAD schemas)
# ============================================================================
import pandas as pd

print("\nQUICK DATA SUMMARY")
print("="*80)
print("Columns:", list(df_train_full.columns))

# Pick title/article column if present
title_col = None
for c in ["title", "article_title", "article", "doc_title"]:
    if c in df_train_full.columns:
        title_col = c
        break

sample = df_train_full.iloc[0]

title_val = sample[title_col] if title_col else "N/A"
question_val = str(sample["question"]) if "question" in df_train_full.columns else "N/A"

# Handle answer in either flat columns or HF 'answers' dict
if "answer_text" in df_train_full.columns:
    answer_val = sample["answer_text"]
elif "answers" in df_train_full.columns:
    # HF squad_v2 style: answers is a dict with lists
    try:
        a = sample["answers"]
        if isinstance(a, dict) and "text" in a and len(a["text"]) > 0:
            answer_val = a["text"][0]
        else:
            answer_val = "N/A"
    except Exception:
        answer_val = "N/A"
else:
    answer_val = "N/A"

print("Sample row from train:")
print(f"  Article:  {title_val}")
print(f"  Question: {question_val[:80]}...")
print(f"  Answer:   {str(answer_val)[:50]}...")

# Safe averages (only if columns exist)
ctx_mean = df_train_full["context"].astype(str).str.len().mean() if "context" in df_train_full.columns else float("nan")
q_mean   = df_train_full["question"].astype(str).str.len().mean() if "question" in df_train_full.columns else float("nan")

print(f"\nAvg context length: {ctx_mean:.0f} chars" if ctx_mean == ctx_mean else "Avg context length: N/A")
print(f"Avg question length: {q_mean:.0f} chars" if q_mean == q_mean else "Avg question length: N/A")

print("\n" + "="*80)
print("Ready to translate!")
print("="*80)


QUICK DATA SUMMARY
Columns: ['Unnamed: 0', 'data_num', 'paragraph_num', 'id', 'title', 'context', 'question', 'answer', 'is_impossible', 'answer_text', 'answer_start', 'plausible_answer_text']
Sample row from train:
  Article:  Beyonc√©
  Question: When did Beyonce start becoming popular?...
  Answer:   None...

Avg context length: 762 chars
Avg question length: 59 chars

Ready to translate!


# Translation Runs

## 9A

In [12]:
# ============================================================================
# CELL 9A: TRANSLATE PART A (10K) - WITH QUALITY REVIEW
# ============================================================================

print(f"\n{'='*80}")
print("PART A - QUALITY REVIEW CHECKPOINT")
print("="*80)

df_partA_Urdu = translate_dataframe(
    parts['A'], 
    'A', 
    'train_urdu_partA.csv', 
    BATCH_SIZE
)

torch.cuda.empty_cache()
gc.collect()


PART A - QUALITY REVIEW CHECKPOINT

TRANSLATING PART A
Started at: 17:42:39

Translating: context


context: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 157/157 [56:51<00:00, 21.73s/it]   



Translating: question


question: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 157/157 [04:37<00:00,  1.77s/it]



Translating: answer_text


answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 157/157 [00:00<00:00, 9207.68it/s]



Translating: plausible_answer_text


plausible_answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 157/157 [00:00<00:00, 8582.56it/s]



PART A COMPLETE
Time: 61.5 min
Saved: train_urdu_partA.csv


80

In [14]:
# ============================================================================
# CELL 9A-REVIEW: COMPREHENSIVE QUALITY CHECK ON PART A
# ============================================================================

print("\n" + "="*80)
print("QUALITY REVIEW - PART A (10,000 SAMPLES)")
print("="*80)

# --- Make the review cell robust if 'parts' wasn't defined (e.g., rerun order / kernel reset) ---
def _rebuild_parts_if_needed():
    global parts
    if 'parts' in globals() and isinstance(parts, dict) and 'A' in parts:
        return
    # Try to rebuild from df_train_full using the same static splits you used earlier
    try:
        _splits = [
            ('A', 0, 10000, 10000),
            ('B', 10000, 30000, 20000),
            ('C', 30000, 50000, 20000),
            ('D', 50000, 70000, 20000),
            ('E', 70000, 90000, 20000),
            ('F', 90000, 110000, 20000),
            ('G', 110000, 130319, 20319),
        ]
        parts = {}
        for _name, _start, _end, _size in _splits:
            parts[_name] = df_train_full.iloc[_start:_end].copy()
    except Exception:
        parts = {}

_rebuild_parts_if_needed()

# ============================================================================
# 1. ROW COUNT VERIFICATION
# ============================================================================
print("\n1. ROW COUNT VERIFICATION:")

# If parts['A'] is available, use it; otherwise fall back gracefully
if 'parts' in globals() and isinstance(parts, dict) and 'A' in parts:
    part_a_original = len(parts['A'])
else:
    # Fallback to the intended slice size for Part A in your split plan
    part_a_original = 10000

part_a_translated = len(df_partA_Urdu)
print(f"   Original Part A (expected): {part_a_original:,} rows")
print(f"   Translated Part A:          {part_a_translated:,} rows")
print(f"   Difference:                 {part_a_original - part_a_translated:,}")

if part_a_original == part_a_translated:
    print("   ‚úÖ Row counts match (expected vs translated)")
else:
    print("   ‚ùå Row count mismatch ‚Äî check the split or translation loop")

# ============================================================================
# 3. EMPTY/NULL TRANSLATION CHECK
# ============================================================================
print("\n3. EMPTY TRANSLATION CHECK:")

empty_contexts = (df_partA_Urdu['context_urdu'].isna() | 
                  (df_partA_Urdu['context_urdu'] == '')).sum()
empty_questions = (df_partA_Urdu['question_urdu'].isna() | 
                   (df_partA_Urdu['question_urdu'] == '')).sum()
empty_answers = (df_partA_Urdu['answer_text_urdu'].isna() | 
                 (df_partA_Urdu['answer_text_urdu'] == '')).sum()

print(f"   Empty contexts: {empty_contexts:,} ({empty_contexts/len(df_partA_Urdu)*100:.2f}%)")
print(f"   Empty questions: {empty_questions:,} ({empty_questions/len(df_partA_Urdu)*100:.2f}%)")
print(f"   Empty answers: {empty_answers:,} ({empty_answers/len(df_partA_Urdu)*100:.2f}%)")

if empty_contexts == 0 and empty_questions == 0:
    print("   ‚úÖ No critical empty translations")
elif empty_contexts + empty_questions < 10:
    print("   ‚ö†Ô∏è  Minor: Few empty translations (acceptable)")
else:
    print("   ‚ùå CRITICAL: Too many empty translations!")

# ============================================================================
# 4. ORIGINAL DATA PRESERVATION CHECK
# ============================================================================
print("\n4. ORIGINAL DATA PRESERVATION:")

# Check if original columns exist and unchanged
base_cols = ['context', 'question']
if 'answer_text' in df_partA_Urdu.columns:
    base_cols.append('answer_text')
title_col = next((c for c in ['title','article_title','article','doc_title'] if c in df_partA_Urdu.columns), None)
if title_col:
    base_cols.append(title_col)

preserved = all(col in df_partA_Urdu.columns for col in base_cols)
print(f"   Original columns present: {preserved}")

print(f"   Original columns present: {preserved}")

if preserved:
    # Check if data actually unchanged (compare first row)
    orig_first = parts['A'].iloc[0]['context']
    trans_first = df_partA_Urdu.iloc[0]['context']
    
    if orig_first == trans_first:
        print("   ‚úÖ Original English data preserved")
    else:
        print("   ‚ùå CRITICAL: Original data modified!")
else:
    print("   ‚ùå CRITICAL: Original columns missing!")

# ============================================================================
# 5. LENGTH RATIO ANALYSIS
# ============================================================================
print("\n5. LENGTH RATIO ANALYSIS:")

df_partA_Urdu['context_len'] = df_partA_Urdu['context'].str.len()
df_partA_Urdu['context_urdu_len'] = df_partA_Urdu['context_urdu'].str.len()
df_partA_Urdu['question_len'] = df_partA_Urdu['question'].str.len()
df_partA_Urdu['question_urdu_len'] = df_partA_Urdu['question_urdu'].str.len()

df_partA_Urdu['context_ratio'] = df_partA_Urdu['context_urdu_len'] / df_partA_Urdu['context_len']
df_partA_Urdu['question_ratio'] = df_partA_Urdu['question_urdu_len'] / df_partA_Urdu['question_len']

context_ratio_median = df_partA_Urdu['context_ratio'].median()
context_ratio_mean = df_partA_Urdu['context_ratio'].mean()
question_ratio_median = df_partA_Urdu['question_ratio'].median()
question_ratio_mean = df_partA_Urdu['question_ratio'].mean()

print(f"   Context ratio (Urdu/English):")
print(f"      Median: {context_ratio_median:.2f}x")
print(f"      Mean: {context_ratio_mean:.2f}x")
print(f"      Range: {df_partA_Urdu['context_ratio'].min():.2f}x - {df_partA_Urdu['context_ratio'].max():.2f}x")

print(f"   Question ratio (Urdu/English):")
print(f"      Median: {question_ratio_median:.2f}x")
print(f"      Mean: {question_ratio_mean:.2f}x")
print(f"      Range: {df_partA_Urdu['question_ratio'].min():.2f}x - {df_partA_Urdu['question_ratio'].max():.2f}x")

if 0.8 < context_ratio_median < 2.0 and 0.8 < question_ratio_median < 2.0:
    print("   ‚úÖ Ratios look normal")
elif 0.5 < context_ratio_median < 2.5:
    print("   ‚ö†Ô∏è  Ratios slightly unusual but acceptable")
else:
    print("   ‚ùå WARNING: Unusual ratios - check translations!")

# ============================================================================
# 6. CHARACTER SET VALIDATION
# ============================================================================
print("\n6. CHARACTER SET VALIDATION:")

# Check for Urdu Unicode characters
import re
# Arabic script ranges covering Urdu
urdu_pattern = re.compile(r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]+')  # Urdu Unicode range

contexts_with_urdu = df_partA_Urdu['context_urdu'].apply(lambda x: bool(urdu_pattern.search(str(x)))).sum()
questions_with_urdu = df_partA_Urdu['question_urdu'].apply(lambda x: bool(urdu_pattern.search(str(x)))).sum()

print(f"   Contexts with Urdu script: {contexts_with_urdu:,}/{len(df_partA_Urdu):,} ({contexts_with_urdu/len(df_partA_Urdu)*100:.1f}%)")
print(f"   Questions with Urdu script: {questions_with_urdu:,}/{len(df_partA_Urdu):,} ({questions_with_urdu/len(df_partA_Urdu)*100:.1f}%)")

if contexts_with_urdu > len(df_partA_Urdu) * 0.95:
    print("   ‚úÖ Urdu characters detected in most rows")
else:
    print("   ‚ùå WARNING: Many rows missing Urdu characters!")

# Check for English characters remaining in Urdu columns
english_pattern = re.compile(r'[a-zA-Z]{5,}')  # 5+ consecutive English letters
contexts_with_english = df_partA_Urdu['context_urdu'].apply(lambda x: bool(english_pattern.search(str(x)))).sum()
questions_with_english = df_partA_Urdu['question_urdu'].apply(lambda x: bool(english_pattern.search(str(x)))).sum()

print(f"\n   English words in Urdu contexts: {contexts_with_english:,} ({contexts_with_english/len(df_partA_Urdu)*100:.1f}%)")
print(f"   English words in Urdu questions: {questions_with_english:,} ({questions_with_english/len(df_partA_Urdu)*100:.1f}%)")

if contexts_with_english < len(df_partA_Urdu) * 0.1:
    print("   ‚úÖ Minimal English in Urdu columns (names/entities expected)")
else:
    print("   ‚ö†Ô∏è  High English content in Urdu columns")

# ============================================================================
# 7. DUPLICATE CHECK
# ============================================================================
print("\n7. DUPLICATE CHECK:")

orig_dupes = parts['A'].duplicated().sum()
trans_dupes = df_partA_Urdu.duplicated().sum()

print(f"   Duplicates in original: {orig_dupes:,}")
print(f"   Duplicates in translated: {trans_dupes:,}")

if orig_dupes == trans_dupes:
    print("   ‚úÖ Duplicate count unchanged")
else:
    print("   ‚ö†Ô∏è  Duplicate count changed (may be okay)")

# ============================================================================
# 8. SAMPLE TRANSLATIONS (VISUAL CHECK)
# ============================================================================
print("\n8. SAMPLE TRANSLATIONS (MANUAL REVIEW):")
print("="*80)

# Samples from different parts of Part A
sample_indices = [0, 2500, 5000, 7500, 9999]

for idx in sample_indices:
    row = df_partA_Urdu.iloc[idx]
    print(f"\nüìå Sample {idx+1} (Row {idx}):")
    title_val = (
    row['title'] if 'title' in df_partA_Urdu.columns else
    row['article_title'] if 'article_title' in df_partA_Urdu.columns else
    row.get('article', "N/A") if 'article' in df_partA_Urdu.columns else
    row.get('doc_title', "N/A") if 'doc_title' in df_partA_Urdu.columns else
    "N/A"
    )
    print(f"   Article: {title_val}")
    
    print(f"\n   English Context (first 150 chars):")
    print(f"      {row['context'][:150]}...")
    print(f"   Urdu Context (first 150 chars):")
    print(f"      {row['context_urdu'][:150]}...")
    
    print(f"\n   English Question:")
    print(f"      {row['question']}")
    print(f"   Urdu Question:")
    print(f"      {row['question_urdu']}")
    
    print(f"\n   English Answer:")
    print(f"      {row['answer_text'] if pd.notna(row['answer_text']) else 'N/A'}")
    print(f"   Urdu Answer:")
    print(f"      {row['answer_text_urdu'] if pd.notna(row['answer_text_urdu']) else 'N/A'}")
    
    print(f"\n   Impossible: {row['is_impossible']}")
    print(f"   Length ratios: Context={row['context_ratio']:.2f}x, Question={row['question_ratio']:.2f}x")
    print("-"*80)

# ============================================================================
# 9. VERIFICATION SUMMARY
# ============================================================================
print("\n" + "="*80)
print("VERIFICATION SUMMARY - PART A")
print("="*80)

# --- Recompute expected/actual Urdu columns for this summary block ---
expected_new_cols = ['context_urdu', 'question_urdu']
if 'answer_text_urdu' in df_partA_Urdu.columns:
    expected_new_cols.append('answer_text_urdu')
if 'plausible_answer_text_urdu' in df_partA_Urdu.columns:
    expected_new_cols.append('plausible_answer_text_urdu')

# any column containing "urdu" in its name counts as a translated column
actual_new_cols = [c for c in df_partA_Urdu.columns if 'urdu' in c.lower()]

checks = {
    'Row count matches': part_a_original == part_a_translated,
    'All columns created': set(expected_new_cols).issubset(set(actual_new_cols)),
    'No critical empty translations': empty_contexts == 0 and empty_questions == 0,
    'Original data preserved': preserved,
    'Length ratios normal': 0.8 < context_ratio_median < 2.0,
    'Urdu characters present': contexts_with_urdu > len(df_partA_Urdu) * 0.95
}

passed = sum(checks.values())
total = len(checks)

print(f"\nAutomated checks passed: {passed}/{total}\n")

for check_name, result in checks.items():
    status = "‚úÖ PASS" if result else "‚ùå FAIL"
    print(f"  {status}: {check_name}")

print("\n" + "="*80)

if passed == total:
    print("üéâ ALL AUTOMATED CHECKS PASSED!")
    print("="*80)
    print("\nüëÄ NOW DO MANUAL REVIEW:")
    print("   1. Read the 5 sample translations above")
    print("   2. Verify Urdu looks correct (not gibberish)")
    print("   3. Verify Urdu characters render properly")
    print("   4. Verify translations make sense")
    print("\n‚úÖ If manual review GOOD ‚Üí Continue to Cell 9B")
    print("‚ùå If manual review BAD  ‚Üí Stop and debug")
else:
    print("‚ùå SOME AUTOMATED CHECKS FAILED")
    print("="*80)
    print("\n‚ö†Ô∏è  DO NOT CONTINUE - FIX ISSUES FIRST:")
    print("   1. Review failed checks above")
    print("   2. Debug the translation code")
    print("   3. Re-run Cell 9A")
    print("   4. Re-run this review cell")
    print("\n‚ùå DO NOT proceed to Cell 9B until all checks pass")

print("\n" + "="*80)


QUALITY REVIEW - PART A (10,000 SAMPLES)

1. ROW COUNT VERIFICATION:
   Original Part A (expected): 10,000 rows
   Translated Part A:          10,000 rows
   Difference:                 0
   ‚úÖ Row counts match (expected vs translated)

3. EMPTY TRANSLATION CHECK:
   Empty contexts: 0 (0.00%)
   Empty questions: 0 (0.00%)
   Empty answers: 10,000 (100.00%)
   ‚úÖ No critical empty translations

4. ORIGINAL DATA PRESERVATION:
   Original columns present: True
   Original columns present: True
   ‚úÖ Original English data preserved

5. LENGTH RATIO ANALYSIS:
   Context ratio (Urdu/English):
      Median: 0.93x
      Mean: 0.92x
      Range: 0.26x - 4.81x
   Question ratio (Urdu/English):
      Median: 0.90x
      Mean: 0.92x
      Range: 0.24x - 28.20x
   ‚úÖ Ratios look normal

6. CHARACTER SET VALIDATION:
   Contexts with Urdu script: 9,998/10,000 (100.0%)
   Questions with Urdu script: 10,000/10,000 (100.0%)
   ‚úÖ Urdu characters detected in most rows

   English words in Urdu cont

In [15]:
# ============================================================================
# CELL 9B-SPLIT: SPLIT REMAINING PARTS INTO SUB-PARTS
# ============================================================================

print("\nSPLITTING REMAINING PARTS FOR THERMAL MANAGEMENT")
print("="*80)

# Split Part B (20K) into 5 sub-parts of 4K each
parts_B = {
    'B1': parts['B'].iloc[0:4000].copy(),
    'B2': parts['B'].iloc[4000:8000].copy(),
    'B3': parts['B'].iloc[8000:12000].copy(),
    'B4': parts['B'].iloc[12000:16000].copy(),
    'B5': parts['B'].iloc[16000:20000].copy()
}

# Split Part C (20K) into 5 sub-parts of 4K each
parts_C = {
    'C1': parts['C'].iloc[0:4000].copy(),
    'C2': parts['C'].iloc[4000:8000].copy(),
    'C3': parts['C'].iloc[8000:12000].copy(),
    'C4': parts['C'].iloc[12000:16000].copy(),
    'C5': parts['C'].iloc[16000:20000].copy()
}

# Split Part D (20K) into 5 sub-parts of 4K each
parts_D = {
    'D1': parts['D'].iloc[0:4000].copy(),
    'D2': parts['D'].iloc[4000:8000].copy(),
    'D3': parts['D'].iloc[8000:12000].copy(),
    'D4': parts['D'].iloc[12000:16000].copy(),
    'D5': parts['D'].iloc[16000:20000].copy()
}

# Split Part E (20K) into 5 sub-parts of 4K each
parts_E = {
    'E1': parts['E'].iloc[0:4000].copy(),
    'E2': parts['E'].iloc[4000:8000].copy(),
    'E3': parts['E'].iloc[8000:12000].copy(),
    'E4': parts['E'].iloc[12000:16000].copy(),
    'E5': parts['E'].iloc[16000:20000].copy()
}

# Split Part F (20K) into 5 sub-parts of 4K each
parts_F = {
    'F1': parts['F'].iloc[0:4000].copy(),
    'F2': parts['F'].iloc[4000:8000].copy(),
    'F3': parts['F'].iloc[8000:12000].copy(),
    'F4': parts['F'].iloc[12000:16000].copy(),
    'F5': parts['F'].iloc[16000:20000].copy()
}

# Split Part G (20,319) into 5 sub-parts
parts_G = {
    'G1': parts['G'].iloc[0:4063].copy(),
    'G2': parts['G'].iloc[4063:8126].copy(),
    'G3': parts['G'].iloc[8126:12189].copy(),
    'G4': parts['G'].iloc[12189:16252].copy(),
    'G5': parts['G'].iloc[16252:20319].copy()
}

# Split Dev (11,873) into 3 sub-parts
parts_Dev = {
    'Dev1': df_dev_full.iloc[0:3957].copy(),
    'Dev2': df_dev_full.iloc[3957:7914].copy(),
    'Dev3': df_dev_full.iloc[7914:11873].copy()
}

print("\nPart B sub-parts:")
for name, df in parts_B.items():
    print(f"  {name}: {len(df):,} rows")

print("\nPart C sub-parts:")
for name, df in parts_C.items():
    print(f"  {name}: {len(df):,} rows")

print("\nPart D sub-parts:")
for name, df in parts_D.items():
    print(f"  {name}: {len(df):,} rows")

print("\nPart E sub-parts:")
for name, df in parts_E.items():
    print(f"  {name}: {len(df):,} rows")

print("\nPart F sub-parts:")
for name, df in parts_F.items():
    print(f"  {name}: {len(df):,} rows")

print("\nPart G sub-parts:")
for name, df in parts_G.items():
    print(f"  {name}: {len(df):,} rows")

print("\nDev sub-parts:")
for name, df in parts_Dev.items():
    print(f"  {name}: {len(df):,} rows")

print("\n" + "="*80)
print("SPLITTING COMPLETE - Ready for thermal-safe translation")
print("="*80)


SPLITTING REMAINING PARTS FOR THERMAL MANAGEMENT

Part B sub-parts:
  B1: 4,000 rows
  B2: 4,000 rows
  B3: 4,000 rows
  B4: 4,000 rows
  B5: 4,000 rows

Part C sub-parts:
  C1: 4,000 rows
  C2: 4,000 rows
  C3: 4,000 rows
  C4: 4,000 rows
  C5: 4,000 rows

Part D sub-parts:
  D1: 4,000 rows
  D2: 4,000 rows
  D3: 4,000 rows
  D4: 4,000 rows
  D5: 4,000 rows

Part E sub-parts:
  E1: 4,000 rows
  E2: 4,000 rows
  E3: 4,000 rows
  E4: 4,000 rows
  E5: 4,000 rows

Part F sub-parts:
  F1: 4,000 rows
  F2: 4,000 rows
  F3: 4,000 rows
  F4: 4,000 rows
  F5: 4,000 rows

Part G sub-parts:
  G1: 4,063 rows
  G2: 4,063 rows
  G3: 4,063 rows
  G4: 4,063 rows
  G5: 4,067 rows

Dev sub-parts:
  Dev1: 3,957 rows
  Dev2: 3,957 rows
  Dev3: 3,959 rows

SPLITTING COMPLETE - Ready for thermal-safe translation


## 9B

In [16]:
# ============================================================================
# CELL 9B: TRANSLATE PART B (20K)
# ============================================================================

# Cell 9B1
df_partB1_urdu = translate_dataframe(parts_B['B1'], 'B1', 'train_urdu_partB1.csv', BATCH_SIZE)

torch.cuda.empty_cache()
gc.collect()


TRANSLATING PART B1
Started at: 18:47:39

Translating: context


context: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [17:19<00:00, 16.49s/it]



Translating: question


question: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [02:10<00:00,  2.07s/it]



Translating: answer_text


answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 9153.11it/s]



Translating: plausible_answer_text


plausible_answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 9218.57it/s]


PART B1 COMPLETE
Time: 19.5 min
Saved: train_urdu_partB1.csv





450

In [17]:
# ============================================================================
# CELL 9B: TRANSLATE PART B (20K)
# ============================================================================

# Cell 9B2
df_partB2_urdu = translate_dataframe(parts_B['B2'], 'B2', 'train_urdu_partB2.csv', BATCH_SIZE)

torch.cuda.empty_cache()
gc.collect()


TRANSLATING PART B2
Started at: 20:07:38

Translating: context


context: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [17:50<00:00, 16.99s/it]



Translating: question


question: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [02:14<00:00,  2.13s/it]



Translating: answer_text


answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 9746.28it/s]



Translating: plausible_answer_text


plausible_answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 7955.72it/s]


PART B2 COMPLETE
Time: 20.1 min
Saved: train_urdu_partB2.csv





0

In [18]:
# ============================================================================
# CELL 9B: TRANSLATE PART B (20K)
# ============================================================================

# Cell 9B3
df_partB3_urdu = translate_dataframe(parts_B['B3'], 'B3', 'train_urdu_partB3.csv', BATCH_SIZE)

torch.cuda.empty_cache()
gc.collect()


TRANSLATING PART B3
Started at: 20:34:30

Translating: context


context: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [16:43<00:00, 15.92s/it]



Translating: question


question: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [03:04<00:00,  2.94s/it]



Translating: answer_text


answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 9721.90it/s]



Translating: plausible_answer_text


plausible_answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 9466.60it/s]


PART B3 COMPLETE
Time: 19.8 min
Saved: train_urdu_partB3.csv





0

In [19]:
# ============================================================================
# CELL 9B: TRANSLATE PART B (20K)
# ============================================================================

# Cell 9B4
df_partB4_urdu = translate_dataframe(parts_B['B4'], 'B4', 'train_urdu_partB4.csv', BATCH_SIZE)

torch.cuda.empty_cache()
gc.collect()


TRANSLATING PART B4
Started at: 21:01:49

Translating: context


context: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [20:07<00:00, 19.17s/it]



Translating: question


question: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [01:38<00:00,  1.57s/it]



Translating: answer_text


answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 9238.88it/s]



Translating: plausible_answer_text


plausible_answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 8315.48it/s]


PART B4 COMPLETE
Time: 21.8 min
Saved: train_urdu_partB4.csv





0

In [20]:
# ============================================================================
# CELL 9B: TRANSLATE PART B (20K)
# ============================================================================

# Cell 9B5
df_partB5_urdu = translate_dataframe(parts_B['B5'], 'B5', 'train_urdu_partB5.csv', BATCH_SIZE)

torch.cuda.empty_cache()
gc.collect()


TRANSLATING PART B5
Started at: 21:35:52

Translating: context


context: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [18:15<00:00, 17.39s/it]



Translating: question


question: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [01:46<00:00,  1.69s/it]



Translating: answer_text


answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 9174.40it/s]



Translating: plausible_answer_text


plausible_answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 9156.92it/s]


PART B5 COMPLETE
Time: 20.0 min
Saved: train_urdu_partB5.csv





0

## 9C

In [21]:
# ============================================================================
# CELL 9C: TRANSLATE PART C (20K)
# ============================================================================

# Cell 9C1
df_partC1_urdu = translate_dataframe(parts_C['C1'], 'C1', 'train_urdu_partC1.csv', BATCH_SIZE)

torch.cuda.empty_cache()
gc.collect()



TRANSLATING PART C1
Started at: 22:17:09

Translating: context


context: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [20:47<00:00, 19.80s/it]



Translating: question


question: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [02:34<00:00,  2.45s/it]



Translating: answer_text


answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 9006.17it/s]



Translating: plausible_answer_text


plausible_answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 9195.47it/s]


PART C1 COMPLETE
Time: 23.4 min
Saved: train_urdu_partC1.csv





0

In [22]:
# ============================================================================
# CELL 9C: TRANSLATE PART C (20K)
# ============================================================================

# Cell 9C2
df_partC2_urdu = translate_dataframe(parts_C['C2'], 'C2', 'train_urdu_partC2.csv', BATCH_SIZE)

torch.cuda.empty_cache()
gc.collect()


TRANSLATING PART C2
Started at: 22:41:16

Translating: context


context: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [21:20<00:00, 20.32s/it]



Translating: question


question: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [02:13<00:00,  2.12s/it]



Translating: answer_text


answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 9211.82it/s]



Translating: plausible_answer_text


plausible_answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 8308.16it/s]


PART C2 COMPLETE
Time: 23.6 min
Saved: train_urdu_partC2.csv





0

In [24]:
# ============================================================================
# CELL 9C: TRANSLATE PART C (20K)
# ============================================================================

# Cell 9C3
df_partC3_urdu = translate_dataframe(parts_C['C3'], 'C3', 'train_urdu_partC3.csv', BATCH_SIZE)

torch.cuda.empty_cache()
gc.collect()


TRANSLATING PART C3
Started at: 23:23:13

Translating: context


context: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [41:24<00:00, 39.44s/it] 



Translating: question


question: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [02:48<00:00,  2.67s/it]



Translating: answer_text


answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 8538.78it/s]



Translating: plausible_answer_text


plausible_answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 8567.85it/s]


PART C3 COMPLETE
Time: 44.2 min
Saved: train_urdu_partC3.csv





1262

In [25]:
# ============================================================================
# CELL 9C: TRANSLATE PART C (20K)
# ============================================================================

# Cell 9C4
df_partC4_urdu = translate_dataframe(parts_C['C4'], 'C4', 'train_urdu_partC4.csv', BATCH_SIZE)

torch.cuda.empty_cache()
gc.collect()


TRANSLATING PART C4
Started at: 00:07:26

Translating: context


context: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [1:06:31<00:00, 63.36s/it] 



Translating: question


question: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [02:11<00:00,  2.09s/it]



Translating: answer_text


answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 7616.77it/s]



Translating: plausible_answer_text


plausible_answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 7754.69it/s]


PART C4 COMPLETE
Time: 68.7 min
Saved: train_urdu_partC4.csv





0

In [26]:
# ============================================================================
# CELL 9C: TRANSLATE PART C (20K)
# ============================================================================

# Cell 9C5
df_partC5_urdu = translate_dataframe(parts_C['C5'], 'C5', 'train_urdu_partC5.csv', BATCH_SIZE)

torch.cuda.empty_cache()
gc.collect()


TRANSLATING PART C5
Started at: 01:16:09

Translating: context


context: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [44:11<00:00, 42.09s/it]   



Translating: question


question: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [03:01<00:00,  2.88s/it]



Translating: answer_text


answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 3677.78it/s]



Translating: plausible_answer_text


plausible_answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 3508.20it/s]



PART C5 COMPLETE
Time: 47.2 min
Saved: train_urdu_partC5.csv


0

## 9D

In [27]:
# ============================================================================
# CELL 9D: TRANSLATE PART D (20K)
# ============================================================================

# Cell 9D1
df_partD1_urdu = translate_dataframe(parts_D['D1'], 'D1', 'train_urdu_partD1.csv', BATCH_SIZE)

torch.cuda.empty_cache()
gc.collect()


TRANSLATING PART D1
Started at: 12:50:22

Translating: context


context: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [20:32<00:00, 19.56s/it]



Translating: question


question: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [01:48<00:00,  1.72s/it]



Translating: answer_text


answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 9077.96it/s]



Translating: plausible_answer_text


plausible_answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 9248.58it/s]


PART D1 COMPLETE
Time: 22.3 min
Saved: train_urdu_partD1.csv





0

In [28]:
# ============================================================================
# CELL 9D: TRANSLATE PART D (20K)
# ============================================================================

# Cell 9D2
df_partD2_urdu = translate_dataframe(parts_D['D2'], 'D2', 'train_urdu_partD2.csv', BATCH_SIZE)

torch.cuda.empty_cache()
gc.collect()


TRANSLATING PART D2
Started at: 13:12:42

Translating: context


context: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [22:49<00:00, 21.74s/it]



Translating: question


question: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [01:41<00:00,  1.60s/it]



Translating: answer_text


answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 9064.57it/s]



Translating: plausible_answer_text


plausible_answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 9045.02it/s]


PART D2 COMPLETE
Time: 24.5 min
Saved: train_urdu_partD2.csv





0

In [29]:
# ============================================================================
# CELL 9D: TRANSLATE PART D (20K)
# ============================================================================

# Cell 9D3
df_partD3_urdu = translate_dataframe(parts_D['D3'], 'D3', 'train_urdu_partD3.csv', BATCH_SIZE)

torch.cuda.empty_cache()
gc.collect()


TRANSLATING PART D3
Started at: 13:37:13

Translating: context


context: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [1:12:01<00:00, 68.59s/it] 



Translating: question


question: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [03:12<00:00,  3.06s/it]



Translating: answer_text


answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 8600.76it/s]



Translating: plausible_answer_text


plausible_answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 6392.83it/s]


PART D3 COMPLETE
Time: 75.2 min
Saved: train_urdu_partD3.csv





0

In [30]:
# ============================================================================
# CELL 9D: TRANSLATE PART D (20K)
# ============================================================================

# Cell 9D4
df_partD4_urdu = translate_dataframe(parts_D['D4'], 'D4', 'train_urdu_partD4.csv', BATCH_SIZE)

torch.cuda.empty_cache()
gc.collect()


TRANSLATING PART D4
Started at: 14:52:27

Translating: context


context: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [37:10<00:00, 35.40s/it]



Translating: question


question: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [02:35<00:00,  2.46s/it]



Translating: answer_text


answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 7111.29it/s]



Translating: plausible_answer_text


plausible_answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 7205.73it/s]


PART D4 COMPLETE
Time: 39.8 min
Saved: train_urdu_partD4.csv





0

In [31]:
# ============================================================================
# CELL 9D: TRANSLATE PART D (20K)
# ============================================================================

# Cell 9D5
df_partD5_urdu = translate_dataframe(parts_D['D5'], 'D5', 'train_urdu_partD5.csv', BATCH_SIZE)

torch.cuda.empty_cache()
gc.collect()


TRANSLATING PART D5
Started at: 15:32:13

Translating: context


context: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [55:12<00:00, 52.58s/it]   



Translating: question


question: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [04:10<00:00,  3.97s/it]



Translating: answer_text


answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 3461.33it/s]



Translating: plausible_answer_text


plausible_answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 3160.59it/s]



PART D5 COMPLETE
Time: 59.4 min
Saved: train_urdu_partD5.csv


0

## 9E

In [32]:
# ============================================================================
# CELL 9E: TRANSLATE PART E (20K)
# ============================================================================

# Cell 9E1
df_partE1_urdu = translate_dataframe(parts_E['E1'], 'E1', 'train_urdu_partE1.csv', BATCH_SIZE)

torch.cuda.empty_cache()
gc.collect()


TRANSLATING PART E1
Started at: 18:00:39

Translating: context


context: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [22:36<00:00, 21.54s/it]



Translating: question


question: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [01:38<00:00,  1.56s/it]



Translating: answer_text


answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 9114.59it/s]



Translating: plausible_answer_text


plausible_answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 9216.96it/s]


PART E1 COMPLETE
Time: 24.2 min
Saved: train_urdu_partE1.csv





0

In [33]:
# ============================================================================
# CELL 9E: TRANSLATE PART E (20K)
# ============================================================================

# Cell 9E2
df_partE2_urdu = translate_dataframe(parts_E['E2'], 'E2', 'train_urdu_partE2.csv', BATCH_SIZE)

torch.cuda.empty_cache()
gc.collect()


TRANSLATING PART E2
Started at: 18:24:54

Translating: context


context: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [21:22<00:00, 20.35s/it]



Translating: question


question: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [01:47<00:00,  1.71s/it]



Translating: answer_text


answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 9148.04it/s]



Translating: plausible_answer_text


plausible_answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 9442.92it/s]


PART E2 COMPLETE
Time: 23.2 min
Saved: train_urdu_partE2.csv





0

In [34]:
# ============================================================================
# CELL 9E: TRANSLATE PART E (20K)
# ============================================================================

# Cell 9E3
df_partE3_urdu = translate_dataframe(parts_E['E3'], 'E3', 'train_urdu_partE3.csv', BATCH_SIZE)

torch.cuda.empty_cache()
gc.collect()


TRANSLATING PART E3
Started at: 18:48:04

Translating: context


context: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [38:19<00:00, 36.49s/it]   



Translating: question


question: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [01:34<00:00,  1.50s/it]



Translating: answer_text


answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 9167.40it/s]



Translating: plausible_answer_text


plausible_answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 9125.29it/s]


PART E3 COMPLETE
Time: 39.9 min
Saved: train_urdu_partE3.csv





0

In [35]:
# ============================================================================
# CELL 9E: TRANSLATE PART E (20K)
# ============================================================================

# Cell 9E4
df_partE4_urdu = translate_dataframe(parts_E['E4'], 'E4', 'train_urdu_partE4.csv', BATCH_SIZE)

torch.cuda.empty_cache()
gc.collect()


TRANSLATING PART E4
Started at: 19:27:58

Translating: context


context: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [20:02<00:00, 19.09s/it]



Translating: question


question: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [01:25<00:00,  1.35s/it]



Translating: answer_text


answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 9205.08it/s]



Translating: plausible_answer_text


plausible_answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 9235.97it/s]


PART E4 COMPLETE
Time: 21.5 min
Saved: train_urdu_partE4.csv





0

In [36]:
# ============================================================================
# CELL 9E: TRANSLATE PART E (20K)
# ============================================================================

# Cell 9E5
df_partE5_urdu = translate_dataframe(parts_E['E5'], 'E5', 'train_urdu_partE5.csv', BATCH_SIZE)

torch.cuda.empty_cache()
gc.collect()


TRANSLATING PART E5
Started at: 19:49:26

Translating: context


context: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [19:32<00:00, 18.62s/it]



Translating: question


question: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [02:20<00:00,  2.22s/it]



Translating: answer_text


answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 9091.07it/s]



Translating: plausible_answer_text


plausible_answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 9272.92it/s]


PART E5 COMPLETE
Time: 21.9 min
Saved: train_urdu_partE5.csv





0

## 9F

In [37]:
# ============================================================================
# CELL 9F: TRANSLATE PART F (20K)
# ============================================================================

# Cell 9F1
df_partF1_urdu = translate_dataframe(parts_F['F1'], 'F1', 'train_urdu_partF1.csv', BATCH_SIZE)

torch.cuda.empty_cache()
gc.collect()


TRANSLATING PART F1
Started at: 22:59:54

Translating: context


context: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [20:56<00:00, 19.94s/it]



Translating: question


question: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [01:38<00:00,  1.57s/it]



Translating: answer_text


answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 9130.65it/s]



Translating: plausible_answer_text


plausible_answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 9275.52it/s]


PART F1 COMPLETE
Time: 22.6 min
Saved: train_urdu_partF1.csv





0

In [38]:
# ============================================================================
# CELL 9F: TRANSLATE PART F (20K)
# ============================================================================

# Cell 9F2
df_partF2_urdu = translate_dataframe(parts_F['F2'], 'F2', 'train_urdu_partF2.csv', BATCH_SIZE)

torch.cuda.empty_cache()
gc.collect()


TRANSLATING PART F2
Started at: 23:22:29

Translating: context


context: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [20:16<00:00, 19.31s/it]



Translating: question


question: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [01:36<00:00,  1.53s/it]



Translating: answer_text


answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 9084.51it/s]



Translating: plausible_answer_text


plausible_answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 9027.10it/s]


PART F2 COMPLETE
Time: 21.9 min
Saved: train_urdu_partF2.csv





0

In [39]:
# ============================================================================
# CELL 9F: TRANSLATE PART F (20K)
# ============================================================================

# Cell 9F3
df_partF3_urdu = translate_dataframe(parts_F['F3'], 'F3', 'train_urdu_partF3.csv', BATCH_SIZE)

torch.cuda.empty_cache()
gc.collect()


TRANSLATING PART F3
Started at: 23:44:22

Translating: context


context: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [20:28<00:00, 19.50s/it]



Translating: question


question: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [01:26<00:00,  1.37s/it]



Translating: answer_text


answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 9260.57it/s]



Translating: plausible_answer_text


plausible_answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 9432.80it/s]


PART F3 COMPLETE
Time: 21.9 min
Saved: train_urdu_partF3.csv





0

In [40]:
# ============================================================================
# CELL 9F: TRANSLATE PART F (20K)
# ============================================================================

# Cell 9F4
df_partF4_urdu = translate_dataframe(parts_F['F4'], 'F4', 'train_urdu_partF4.csv', BATCH_SIZE)

torch.cuda.empty_cache()
gc.collect()


TRANSLATING PART F4
Started at: 00:06:17

Translating: context


context: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [20:38<00:00, 19.66s/it]



Translating: question


question: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [01:52<00:00,  1.78s/it]



Translating: answer_text


answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 8999.12it/s]



Translating: plausible_answer_text


plausible_answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 9244.37it/s]


PART F4 COMPLETE
Time: 22.5 min
Saved: train_urdu_partF4.csv





0

In [41]:
# ============================================================================
# CELL 9F: TRANSLATE PART F (20K)
# ============================================================================

# Cell 9F5
df_partF5_urdu = translate_dataframe(parts_F['F5'], 'F5', 'train_urdu_partF5.csv', BATCH_SIZE)

torch.cuda.empty_cache()
gc.collect()


TRANSLATING PART F5
Started at: 00:28:48

Translating: context


context: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [1:04:50<00:00, 61.76s/it] 



Translating: question


question: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [02:44<00:00,  2.61s/it]



Translating: answer_text


answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 8985.96it/s]



Translating: plausible_answer_text


plausible_answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 9047.81it/s]


PART F5 COMPLETE
Time: 67.6 min
Saved: train_urdu_partF5.csv





0

## 9G

In [42]:
# ============================================================================
# CELL 9G: TRANSLATE PART G (20,319)
# ============================================================================

# Cell 9G1
df_partG1_urdu = translate_dataframe(parts_G['G1'], 'G1', 'train_urdu_partG1.csv', BATCH_SIZE)


torch.cuda.empty_cache()
gc.collect()


TRANSLATING PART G1
Started at: 01:36:23

Translating: context


context: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 64/64 [59:00<00:00, 55.32s/it]   



Translating: question


question: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 64/64 [02:31<00:00,  2.37s/it]



Translating: answer_text


answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 64/64 [00:00<00:00, 7708.79it/s]



Translating: plausible_answer_text


plausible_answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 64/64 [00:00<00:00, 7837.30it/s]


PART G1 COMPLETE
Time: 61.5 min
Saved: train_urdu_partG1.csv





0

In [43]:
# ============================================================================
# CELL 9G: TRANSLATE PART G (20,319)
# ============================================================================

# Cell 9G2
df_partG2_urdu = translate_dataframe(parts_G['G2'], 'G2', 'train_urdu_partG2.csv', BATCH_SIZE)

torch.cuda.empty_cache()
gc.collect()


TRANSLATING PART G2
Started at: 02:37:55

Translating: context


context: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 64/64 [35:03<00:00, 32.87s/it]



Translating: question


question: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 64/64 [02:31<00:00,  2.37s/it]



Translating: answer_text


answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 64/64 [00:00<00:00, 7662.80it/s]



Translating: plausible_answer_text


plausible_answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 64/64 [00:00<00:00, 7594.94it/s]


PART G2 COMPLETE
Time: 37.6 min
Saved: train_urdu_partG2.csv





0

In [44]:
# ============================================================================
# CELL 9G: TRANSLATE PART G (20,319)
# ============================================================================

# Cell 9G3
df_partG3_urdu = translate_dataframe(parts_G['G3'], 'G3', 'train_urdu_partG3.csv', BATCH_SIZE)

torch.cuda.empty_cache()
gc.collect()


TRANSLATING PART G3
Started at: 03:15:31

Translating: context


context: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 64/64 [1:19:54<00:00, 74.92s/it]   



Translating: question


question: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 64/64 [02:51<00:00,  2.68s/it]



Translating: answer_text


answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 64/64 [00:00<00:00, 7697.74it/s]



Translating: plausible_answer_text


plausible_answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 64/64 [00:00<00:00, 7835.70it/s]


PART G3 COMPLETE
Time: 82.8 min
Saved: train_urdu_partG3.csv





0

In [45]:
# ============================================================================
# CELL 9G: TRANSLATE PART G (20,319)
# ============================================================================

# Cell 9G4
df_partG4_urdu = translate_dataframe(parts_G['G4'], 'G4', 'train_urdu_partG4.csv', BATCH_SIZE)

torch.cuda.empty_cache()
gc.collect()


TRANSLATING PART G4
Started at: 04:38:17

Translating: context


context: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 64/64 [38:04<00:00, 35.69s/it]



Translating: question


question: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 64/64 [02:13<00:00,  2.09s/it]



Translating: answer_text


answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 64/64 [00:00<00:00, 7734.11it/s]



Translating: plausible_answer_text


plausible_answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 64/64 [00:00<00:00, 7911.22it/s]


PART G4 COMPLETE
Time: 40.3 min
Saved: train_urdu_partG4.csv





0

In [46]:
# ============================================================================
# CELL 9G: TRANSLATE PART G (20,319)
# ============================================================================

# Cell 9G5
df_partG5_urdu = translate_dataframe(parts_G['G5'], 'G5', 'train_urdu_partG5.csv', BATCH_SIZE)

torch.cuda.empty_cache()
gc.collect()


TRANSLATING PART G5
Started at: 05:18:36

Translating: context


context: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 64/64 [50:09<00:00, 47.02s/it]   



Translating: question


question: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 64/64 [03:07<00:00,  2.92s/it]



Translating: answer_text


answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 64/64 [00:00<00:00, 6253.01it/s]



Translating: plausible_answer_text


plausible_answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 64/64 [00:00<00:00, 6065.65it/s]


PART G5 COMPLETE
Time: 53.3 min
Saved: train_urdu_partG5.csv





0

In [47]:
print("\n" + "="*80)
print("ALL TRAIN PARTS COMPLETE!")
print("="*80)


ALL TRAIN PARTS COMPLETE!


## Dev

In [48]:
print("\nTRANSLATING DEV SPLIT - Part 1")
print("="*80)

# Cell 9Dev1
df_partDev1_urdu = translate_dataframe(parts_Dev['Dev1'], 'Dev1', 'dev_urdu_partDev1.csv', BATCH_SIZE)

torch.cuda.empty_cache()
gc.collect()

print(f"\nDev complete: {len(df_partDev1_urdu):,} rows")


TRANSLATING DEV SPLIT - Part 1

TRANSLATING PART Dev1
Started at: 15:38:52

Translating: context


context: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 62/62 [1:04:22<00:00, 62.30s/it] 



Translating: question


question: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 62/62 [02:56<00:00,  2.85s/it]



Translating: answer_text


answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 62/62 [00:00<00:00, 7721.11it/s]



Translating: plausible_answer_text


plausible_answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 62/62 [00:00<00:00, 7712.40it/s]


PART Dev1 COMPLETE
Time: 67.3 min
Saved: dev_urdu_partDev1.csv

Dev complete: 3,957 rows





In [49]:
print("\nTRANSLATING DEV SPLIT - Part 2")
print("="*80)

# Cell 9Dev2
df_partDev2_urdu = translate_dataframe(parts_Dev['Dev2'], 'Dev2', 'dev_urdu_partDev2.csv', BATCH_SIZE)

torch.cuda.empty_cache()
gc.collect()

print(f"\nDev complete: {len(df_partDev2_urdu):,} rows")


TRANSLATING DEV SPLIT - Part 2

TRANSLATING PART Dev2
Started at: 16:46:12

Translating: context


context: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 62/62 [37:24<00:00, 36.21s/it]



Translating: question


question: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 62/62 [02:20<00:00,  2.26s/it]



Translating: answer_text


answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 62/62 [00:00<00:00, 4124.59it/s]



Translating: plausible_answer_text


plausible_answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 62/62 [00:00<00:00, 2751.76it/s]



PART Dev2 COMPLETE
Time: 39.7 min
Saved: dev_urdu_partDev2.csv

Dev complete: 3,957 rows


In [50]:
print("\nTRANSLATING DEV SPLIT - Part 3")
print("="*80)

# Cell 9Dev3
df_partDev3_urdu = translate_dataframe(parts_Dev['Dev3'], 'Dev3', 'dev_urdu_partDev3.csv', BATCH_SIZE)

torch.cuda.empty_cache()
gc.collect()

print(f"\nDev complete: {len(df_partDev3_urdu):,} rows")


TRANSLATING DEV SPLIT - Part 3

TRANSLATING PART Dev3
Started at: 17:25:57

Translating: context


context: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 62/62 [58:27<00:00, 56.57s/it]  



Translating: question


question: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 62/62 [03:51<00:00,  3.74s/it]



Translating: answer_text


answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 62/62 [00:00<00:00, 5707.41it/s]



Translating: plausible_answer_text


plausible_answer_text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 62/62 [00:00<00:00, 5838.11it/s]


PART Dev3 COMPLETE
Time: 62.3 min
Saved: dev_urdu_partDev3.csv

Dev complete: 3,959 rows





# Post-Processing

## Concatenate

In [51]:
# ============================================================================
# CELL 11: CONCATENATE ALL SUB-PARTS
# ============================================================================

print("\nCONCATENATING ALL PARTS")
print("="*80)

# Concatenate train parts
df_train_urdu_complete = pd.concat([
    df_partA_Urdu,
    df_partB1_urdu, df_partB2_urdu, df_partB3_urdu, df_partB4_urdu, df_partB5_urdu,
    df_partC1_urdu, df_partC2_urdu, df_partC3_urdu, df_partC4_urdu, df_partC5_urdu,
    df_partD1_urdu, df_partD2_urdu, df_partD3_urdu, df_partD4_urdu, df_partD5_urdu,
    df_partE1_urdu, df_partE2_urdu, df_partE3_urdu, df_partE4_urdu, df_partE5_urdu,
    df_partF1_urdu, df_partF2_urdu, df_partF3_urdu, df_partF4_urdu, df_partF5_urdu,
    df_partG1_urdu, df_partG2_urdu, df_partG3_urdu, df_partG4_urdu, df_partG5_urdu
], ignore_index=True)

# Concatenate dev parts
df_dev_urdu_complete = pd.concat([
    df_partDev1_urdu,
    df_partDev2_urdu,
    df_partDev3_urdu
], ignore_index=True)

# Save
df_train_urdu_complete.to_csv('squad2_train_urdu_complete.csv', index=False, encoding='utf-8-sig')
df_dev_urdu_complete.to_csv('squad2_dev_urdu_complete.csv', index=False, encoding='utf-8-sig')

print(f"Train: {len(df_train_urdu_complete):,} rows")
print(f"Dev: {len(df_dev_urdu_complete):,} rows")
print("="*80)


CONCATENATING ALL PARTS
Train: 130,319 rows
Dev: 11,873 rows


In [52]:
# ============================================================================
# CELL 12: FINAL STATISTICS
# ============================================================================

print("\nFINAL STATISTICS")
print("="*80)

print(f"\nCOMPLETED TRANSLATION:")
print(f"  Train: {len(df_train_urdu_complete):,} rows")
print(f"  Dev: {len(df_dev_urdu_complete):,} rows")
print(f"  Total: {len(df_train_urdu_complete) + len(df_dev_urdu_complete):,} rows")

# Calculate ratios
df_train_urdu_complete['context_ratio'] = (
    df_train_urdu_complete['context_urdu'].str.len() / 
    df_train_urdu_complete['context'].str.len()
)
df_train_urdu_complete['question_ratio'] = (
    df_train_urdu_complete['question_urdu'].str.len() / 
    df_train_urdu_complete['question'].str.len()
)

print(f"\nLENGTH RATIOS (Urdu/English):")
print(f"  Context median: {df_train_urdu_complete['context_ratio'].median():.2f}x")
print(f"  Question median: {df_train_urdu_complete['question_ratio'].median():.2f}x")

print(f"\nOUTPUT FILES:")
print(f"  ‚úÖ squad2_train_urdu_complete.csv ({len(df_train_urdu_complete):,} rows)")
print(f"  ‚úÖ squad2_dev_urdu_complete.csv ({len(df_dev_urdu_complete):,} rows)")
print(f"  ‚úÖ train_urdu_partA.csv through partG.csv (backups)")

print("\n" + "="*80)
print("TRANSLATION COMPLETE!")
print("="*80)


FINAL STATISTICS

COMPLETED TRANSLATION:
  Train: 130,319 rows
  Dev: 11,873 rows
  Total: 142,192 rows

LENGTH RATIOS (Urdu/English):
  Context median: 0.92x
  Question median: 0.90x

OUTPUT FILES:
  ‚úÖ squad2_train_urdu_complete.csv (130,319 rows)
  ‚úÖ squad2_dev_urdu_complete.csv (11,873 rows)
  ‚úÖ train_urdu_partA.csv through partG.csv (backups)

TRANSLATION COMPLETE!


In [53]:
# ============================================================================
# CELL 13: SAMPLE TRANSLATIONS
# ============================================================================

print("\nSAMPLE TRANSLATIONS FROM COMPLETE DATASET")
print("="*80)

# One sample from each part
sample_indices = [0, 15000, 40000, 60000, 80000, 100000, 125000]

for idx in sample_indices:
    row = df_train_urdu_complete.iloc[idx]
    print(f"\nSample {idx+1} (Row {idx}):")
    title_val = (
    row['title'] if 'title' in df_train_urdu_complete.columns else
    row['article_title'] if 'article_title' in df_train_urdu_complete.columns else
    row.get('article', "N/A") if 'article' in df_train_urdu_complete.columns else
    row.get('doc_title', "N/A") if 'doc_title' in df_train_urdu_complete.columns else
    "N/A"
    )
    print(f"  Article: {title_val}")
    print(f"  \n  English Q: {row['question']}")
    print(f"  Urdu Q:   {row['question_urdu']}")
    print(f"  \n  English A: {row['answer_text'] if pd.notna(row['answer_text']) else 'N/A'}")
    print(f"  Urdu A:   {row['answer_text_urdu']}")
    print("-"*80)

print("\n" + "="*80)
print("READY FOR BENCHMARKING!")
print("="*80)


SAMPLE TRANSLATIONS FROM COMPLETE DATASET

Sample 1 (Row 0):
  Article: Beyonc√©
  
  English Q: When did Beyonce start becoming popular?
  Urdu Q:   ÿ®€åŸàŸÜÿ≥€å ⁄©ÿ® ŸÖŸÇÿ®ŸàŸÑ €ÅŸà ÿ±€Å€å ÿ™⁄æ€åÿü
  
  English A: N/A
  Urdu A:   
--------------------------------------------------------------------------------

Sample 15001 (Row 15000):
  Article: Comics
  
  English Q: Storyboards are not used a lot by what?
  Urdu Q:   ⁄©€ÅÿßŸÜ€å ÿ®Ÿàÿ±⁄à ⁄©Ÿà ⁄©€åÿß ÿ≤€åÿßÿØ€Å ÿßÿ≥ÿ™ÿπŸÖÿßŸÑ ŸÜ€Å€å⁄∫ ⁄©€åÿß ÿ¨ÿßÿ™ÿß €Å€íÿü
  
  English A: N/A
  Urdu A:   
--------------------------------------------------------------------------------

Sample 40001 (Row 40000):
  Article: Macintosh
  
  English Q: What is the standard amount of RAM shipped with most Mac models?
  Urdu Q:   ÿ≤€åÿßÿØ€Å ÿ™ÿ± ŸÖ€å⁄© ŸÖÿß⁄àŸÑÿ≤ ŸÖ€å⁄∫ ÿ±€åŸÖ ⁄©€å ŸÖÿπ€åÿßÿ±€å ŸÖŸÇÿØÿßÿ± ⁄©ÿ™ŸÜ€å €Å€íÿü
  
  English A: N/A
  Urdu A:   
--------------------------------------------------------------------------------

Sample 60001 (Row 6