# Pre-Processing & Setup

## GPU, Imports & Load Data

In [None]:
import torch

# Announce the check, then abort immediately when no CUDA device is visible.
print("GPU CHECK", "=" * 80, sep="\n")

if not torch.cuda.is_available():
    raise RuntimeError("CUDA not available!")

# Device string used when moving tokenized batches; properties are read
# from CUDA device index 0.
device = "cuda"
gpu_name = torch.cuda.get_device_name(0)
gpu_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3)  # bytes -> GiB

print(
    f"GPU: {gpu_name}",
    f"VRAM: {gpu_memory:.1f} GB",
    f"CUDA: {torch.version.cuda}",
    f"Device: {device}",
    "=" * 80,
    sep="\n",
)

In [None]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from torch.cuda.amp import autocast
from tqdm import tqdm
import warnings
import gc
from datetime import datetime
# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

print("Libraries imported")

In [None]:
print("\nLOADING DATA", "=" * 80, sep="\n")

# Input CSV paths, relative to the working directory.
TRAIN_CSV = "squad2_train.csv"
DEV_CSV = "squad2_dev.csv"

# Read train first, then dev, into full in-memory frames.
df_train_full, df_dev_full = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, DEV_CSV))

print(
    f"Train: {len(df_train_full):,} rows",
    f"Dev: {len(df_dev_full):,} rows",
    "=" * 80,
    sep="\n",
)

## Split Train into 7 Parts: Part A (10k rows) and Parts B–G (~20k rows each, later split into 5 sub-parts)

In [None]:
print("\nSPLITTING TRAIN DATA INTO 7 PARTS")
print("="*80)

# Part A is a 10k-row slice and B-G are 20k-row slices. The last boundary is
# taken from the frame itself instead of a hard-coded 130,319, so no trailing
# rows are dropped (and no phantom rows are reported) when the CSV has a
# different length. Nominal starts are clamped for frames shorter than 110k.
total_rows = len(df_train_full)
nominal_starts = [0, 10000, 30000, 50000, 70000, 90000, 110000]
bounds = [min(b, total_rows) for b in nominal_starts] + [total_rows]

# `splits` keeps its original (name, start, end, size) tuple layout, but the
# size is now measured from the actual slice.
splits = []
parts = {}
for part_name, start, end in zip("ABCDEFG", bounds, bounds[1:]):
    parts[part_name] = df_train_full.iloc[start:end].copy()
    size = len(parts[part_name])
    splits.append((part_name, start, end, size))
    print(f"Part {part_name}: Rows {start:,}-{end-1:,} ({size:,} rows)")

print(f"\nTotal: {sum(s[3] for s in splits):,} rows")
print("="*80)

## Configurations, Model and Translation Function

In [None]:
# Checkpoint and language pair (NLLB-style language codes).
MODEL_NAME = "facebook/nllb-200-distilled-600M"
SOURCE_LANG = "eng_Latn"
TARGET_LANG = "tam_Taml"

# Rows per batch, and the token cap used for both tokenization and generation.
BATCH_SIZE = 64
MAX_LENGTH = 384

print(
    "\nCONFIGURATION",
    "=" * 80,
    f"Batch Size: {BATCH_SIZE}",
    f"Max Length: {MAX_LENGTH}",
    f"Model: {MODEL_NAME}",
    "Target: Tamil",
    "=" * 80,
    sep="\n",
)

In [None]:
print("\nLOADING MODEL", "=" * 80, sep="\n")

# The tokenizer is configured with the source language at construction time.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, src_lang=SOURCE_LANG)

# Half-precision weights; device placement is delegated to device_map="auto".
load_options = {"torch_dtype": torch.float16, "device_map": "auto"}
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME, **load_options)
model.eval()  # inference only

print("Model loaded successfully", "=" * 80, sep="\n")

In [None]:
def translate_batch(texts, batch_size=BATCH_SIZE):
    """Translate a sequence of strings into TARGET_LANG, preserving order.

    Inputs are tokenized (truncated to MAX_LENGTH tokens), decoded greedily
    with the module-level ``model`` and detokenized, ``batch_size`` strings
    at a time.

    Args:
        texts: Sequence of ``str`` to translate.
        batch_size: Number of strings sent to the model per forward pass.

    Returns:
        A list with one output string per input string, in input order.
        Blank (empty or whitespace-only) inputs are returned unchanged rather
        than being sent to the model, which could otherwise emit invented
        text for them.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    texts = list(texts)

    # Language codes are regular vocabulary tokens, so convert_tokens_to_ids
    # resolves them; the older `lang_code_to_id` mapping is deprecated and
    # missing in newer transformers releases.
    target_lang_id = tokenizer.convert_tokens_to_ids(TARGET_LANG)

    # Start from a copy of the inputs so blank entries pass through as-is,
    # and only queue the positions that actually contain text.
    translations = list(texts)
    pending = [pos for pos, text in enumerate(texts) if text.strip()]

    for batch_no, offset in enumerate(range(0, len(pending), batch_size)):
        positions = pending[offset:offset + batch_size]
        batch = [texts[pos] for pos in positions]

        inputs = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=MAX_LENGTH
        ).to(device)

        # torch.autocast replaces the deprecated torch.cuda.amp.autocast.
        with torch.no_grad(), torch.autocast(device_type="cuda"):
            # Greedy decoding (num_beams=1); `early_stopping` only applies to
            # beam search, so it is intentionally not passed.
            outputs = model.generate(
                **inputs,
                forced_bos_token_id=target_lang_id,
                max_length=MAX_LENGTH,
                num_beams=1
            )

        decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        for pos, translated in zip(positions, decoded):
            translations[pos] = translated

        # Release cached CUDA blocks every 10th batch (including the first).
        if batch_no % 10 == 0:
            torch.cuda.empty_cache()

    return translations

print("Translation function defined")

In [None]:
def translate_dataframe(df, part_name, output_file, batch_size=BATCH_SIZE):
    """Translate the text columns of ``df`` and save the result as CSV.

    The ``context``, ``question`` and ``answer_text`` columns are translated
    with ``translate_batch`` and stored in new ``context_tamil``,
    ``question_tamil`` and ``answer_text_tamil`` columns. Missing values are
    treated as empty strings. Each distinct string in a column is translated
    only once and the result is mapped back onto every row that contains it,
    so repeated texts are not re-translated.

    Args:
        df: Input frame containing the three source columns.
        part_name: Label used in progress messages.
        output_file: Path of the CSV written with ``utf-8-sig`` encoding.
        batch_size: Batch size forwarded to ``translate_batch``.

    Returns:
        A copy of ``df`` with the three translated columns appended.

    Raises:
        KeyError: If a required source column is missing.
        ValueError: If ``translate_batch`` returns a different number of
            strings than it was given.
    """
    print(f"\nTranslating {part_name}: {len(df):,} rows")
    print("="*80)

    # (source column, plural label used in the progress message)
    column_labels = (
        ('context', 'contexts'),
        ('question', 'questions'),
        ('answer_text', 'answers'),
    )

    # Fail before any GPU work if the frame is not shaped as expected.
    missing = [column for column, _ in column_labels if column not in df.columns]
    if missing:
        raise KeyError(f"Missing required column(s): {missing}")

    start_time = datetime.now()
    df_result = df.copy()

    for column, label in column_labels:
        texts = df[column].fillna('').astype(str).tolist()

        print(f"Translating {label}...")

        # dict.fromkeys de-duplicates while keeping first-seen order.
        unique_texts = list(dict.fromkeys(texts))
        unique_translations = translate_batch(unique_texts, batch_size)
        if len(unique_translations) != len(unique_texts):
            raise ValueError(
                f"translate_batch returned {len(unique_translations)} strings "
                f"for {len(unique_texts)} inputs"
            )

        lookup = dict(zip(unique_texts, unique_translations))
        df_result[f'{column}_tamil'] = [lookup[text] for text in texts]

    df_result.to_csv(output_file, index=False, encoding='utf-8-sig')

    elapsed = datetime.now() - start_time
    print(f"\nCompleted {part_name}")
    print(f"Time: {elapsed}")
    print(f"Saved: {output_file}")
    print("="*80)

    return df_result

print("Dataframe translation function defined")

## Further Split B-G into 5 Sub-parts Each

In [None]:
print("\nSPLITTING B-G INTO 5 SUB-PARTS EACH", "=" * 80, sep="\n")

def split_into_5(df, part_name):
    """Cut ``df`` into five consecutive row chunks keyed ``<part_name>1..5``.

    The first four chunks each hold ``len(df) // 5`` rows; the fifth takes
    everything that is left. Every chunk is an independent copy, and its row
    count is printed as it is created.

    Returns:
        dict mapping ``f"{part_name}{i}"`` (i = 1..5) to the chunk frame.
    """
    total_rows = len(df)
    step = total_rows // 5

    # Six cut points: 0, step, 2*step, 3*step, 4*step, then the true end.
    cuts = [k * step for k in range(5)] + [total_rows]

    parts = {}
    for number, (lo, hi) in enumerate(zip(cuts, cuts[1:]), start=1):
        key = f"{part_name}{number}"
        chunk = df.iloc[lo:hi].copy()
        parts[key] = chunk
        print(f"  {key}: {len(chunk):,} rows")

    return parts

# Split parts B through G, in that order, into five sub-parts each.
parts_B, parts_C, parts_D, parts_E, parts_F, parts_G = (
    split_into_5(parts[letter], letter) for letter in "BCDEFG"
)

print("=" * 80)

## Split Dev into 3 Sub-parts

In [None]:
print("\nSPLITTING DEV INTO 3 SUB-PARTS", "=" * 80, sep="\n")

dev_size = len(df_dev_full)
dev_chunk = dev_size // 3

# Dev1 and Dev2 each hold exactly dev_chunk rows; Dev3 is open-ended so it
# also takes whatever remains after integer division.
parts_Dev = {}
parts_Dev['Dev1'] = df_dev_full.iloc[:dev_chunk].copy()
parts_Dev['Dev2'] = df_dev_full.iloc[dev_chunk:dev_chunk * 2].copy()
parts_Dev['Dev3'] = df_dev_full.iloc[dev_chunk * 2:].copy()

for name, df in parts_Dev.items():
    print(f"{name}: {len(df):,} rows")

print("=" * 80)

# Translation

## Part A

In [None]:
# Translate part A and write its CSV.
df_partA_tamil = translate_dataframe(
    df=parts['A'],
    part_name='A',
    output_file='train_tamil_partA.csv',
    batch_size=BATCH_SIZE,
)

# Release cached CUDA memory, then run the garbage collector.
torch.cuda.empty_cache()
gc.collect()

## Part B

In [None]:
# Translate sub-part B1 and write its CSV.
df_partB1_tamil = translate_dataframe(
    df=parts_B['B1'],
    part_name='B1',
    output_file='train_tamil_partB1.csv',
    batch_size=BATCH_SIZE,
)

# Release cached CUDA memory, then run the garbage collector.
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Translate sub-part B2 and write its CSV.
df_partB2_tamil = translate_dataframe(
    df=parts_B['B2'],
    part_name='B2',
    output_file='train_tamil_partB2.csv',
    batch_size=BATCH_SIZE,
)

# Release cached CUDA memory, then run the garbage collector.
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Translate sub-part B3 and write its CSV.
df_partB3_tamil = translate_dataframe(
    df=parts_B['B3'],
    part_name='B3',
    output_file='train_tamil_partB3.csv',
    batch_size=BATCH_SIZE,
)

# Release cached CUDA memory, then run the garbage collector.
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Translate sub-part B4 and write its CSV.
df_partB4_tamil = translate_dataframe(
    df=parts_B['B4'],
    part_name='B4',
    output_file='train_tamil_partB4.csv',
    batch_size=BATCH_SIZE,
)

# Release cached CUDA memory, then run the garbage collector.
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Translate sub-part B5 and write its CSV.
df_partB5_tamil = translate_dataframe(
    df=parts_B['B5'],
    part_name='B5',
    output_file='train_tamil_partB5.csv',
    batch_size=BATCH_SIZE,
)

# Release cached CUDA memory, then run the garbage collector.
torch.cuda.empty_cache()
gc.collect()

## Part C

In [None]:
# Translate sub-part C1 and write its CSV.
df_partC1_tamil = translate_dataframe(
    df=parts_C['C1'],
    part_name='C1',
    output_file='train_tamil_partC1.csv',
    batch_size=BATCH_SIZE,
)

# Release cached CUDA memory, then run the garbage collector.
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Translate sub-part C2 and write its CSV.
df_partC2_tamil = translate_dataframe(
    df=parts_C['C2'],
    part_name='C2',
    output_file='train_tamil_partC2.csv',
    batch_size=BATCH_SIZE,
)

# Release cached CUDA memory, then run the garbage collector.
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Translate sub-part C3 and write its CSV.
df_partC3_tamil = translate_dataframe(
    df=parts_C['C3'],
    part_name='C3',
    output_file='train_tamil_partC3.csv',
    batch_size=BATCH_SIZE,
)

# Release cached CUDA memory, then run the garbage collector.
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Translate sub-part C4 and write its CSV.
df_partC4_tamil = translate_dataframe(
    df=parts_C['C4'],
    part_name='C4',
    output_file='train_tamil_partC4.csv',
    batch_size=BATCH_SIZE,
)

# Release cached CUDA memory, then run the garbage collector.
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Translate sub-part C5 and write its CSV.
df_partC5_tamil = translate_dataframe(
    df=parts_C['C5'],
    part_name='C5',
    output_file='train_tamil_partC5.csv',
    batch_size=BATCH_SIZE,
)

# Release cached CUDA memory, then run the garbage collector.
torch.cuda.empty_cache()
gc.collect()

## Part D

In [None]:
# Translate sub-part D1 and write its CSV.
df_partD1_tamil = translate_dataframe(
    df=parts_D['D1'],
    part_name='D1',
    output_file='train_tamil_partD1.csv',
    batch_size=BATCH_SIZE,
)

# Release cached CUDA memory, then run the garbage collector.
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Translate sub-part D2 and write its CSV.
df_partD2_tamil = translate_dataframe(
    df=parts_D['D2'],
    part_name='D2',
    output_file='train_tamil_partD2.csv',
    batch_size=BATCH_SIZE,
)

# Release cached CUDA memory, then run the garbage collector.
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Translate sub-part D3 and write its CSV.
df_partD3_tamil = translate_dataframe(
    df=parts_D['D3'],
    part_name='D3',
    output_file='train_tamil_partD3.csv',
    batch_size=BATCH_SIZE,
)

# Release cached CUDA memory, then run the garbage collector.
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Translate sub-part D4 and write its CSV.
df_partD4_tamil = translate_dataframe(
    df=parts_D['D4'],
    part_name='D4',
    output_file='train_tamil_partD4.csv',
    batch_size=BATCH_SIZE,
)

# Release cached CUDA memory, then run the garbage collector.
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Translate sub-part D5 and write its CSV.
df_partD5_tamil = translate_dataframe(
    df=parts_D['D5'],
    part_name='D5',
    output_file='train_tamil_partD5.csv',
    batch_size=BATCH_SIZE,
)

# Release cached CUDA memory, then run the garbage collector.
torch.cuda.empty_cache()
gc.collect()

## Part E

In [None]:
# Translate sub-part E1 and write its CSV.
df_partE1_tamil = translate_dataframe(
    df=parts_E['E1'],
    part_name='E1',
    output_file='train_tamil_partE1.csv',
    batch_size=BATCH_SIZE,
)

# Release cached CUDA memory, then run the garbage collector.
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Translate sub-part E2 and write its CSV.
df_partE2_tamil = translate_dataframe(
    df=parts_E['E2'],
    part_name='E2',
    output_file='train_tamil_partE2.csv',
    batch_size=BATCH_SIZE,
)

# Release cached CUDA memory, then run the garbage collector.
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Translate sub-part E3 and write its CSV.
df_partE3_tamil = translate_dataframe(
    df=parts_E['E3'],
    part_name='E3',
    output_file='train_tamil_partE3.csv',
    batch_size=BATCH_SIZE,
)

# Release cached CUDA memory, then run the garbage collector.
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Translate sub-part E4 and write its CSV.
df_partE4_tamil = translate_dataframe(
    df=parts_E['E4'],
    part_name='E4',
    output_file='train_tamil_partE4.csv',
    batch_size=BATCH_SIZE,
)

# Release cached CUDA memory, then run the garbage collector.
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Translate sub-part E5 and write its CSV.
df_partE5_tamil = translate_dataframe(
    df=parts_E['E5'],
    part_name='E5',
    output_file='train_tamil_partE5.csv',
    batch_size=BATCH_SIZE,
)

# Release cached CUDA memory, then run the garbage collector.
torch.cuda.empty_cache()
gc.collect()

## Part F

In [None]:
# Translate sub-part F1 and write its CSV.
df_partF1_tamil = translate_dataframe(
    df=parts_F['F1'],
    part_name='F1',
    output_file='train_tamil_partF1.csv',
    batch_size=BATCH_SIZE,
)

# Release cached CUDA memory, then run the garbage collector.
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Translate sub-part F2 and write its CSV.
df_partF2_tamil = translate_dataframe(
    df=parts_F['F2'],
    part_name='F2',
    output_file='train_tamil_partF2.csv',
    batch_size=BATCH_SIZE,
)

# Release cached CUDA memory, then run the garbage collector.
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Translate sub-part F3 and write its CSV.
df_partF3_tamil = translate_dataframe(
    df=parts_F['F3'],
    part_name='F3',
    output_file='train_tamil_partF3.csv',
    batch_size=BATCH_SIZE,
)

# Release cached CUDA memory, then run the garbage collector.
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Translate sub-part F4 and write its CSV.
df_partF4_tamil = translate_dataframe(
    df=parts_F['F4'],
    part_name='F4',
    output_file='train_tamil_partF4.csv',
    batch_size=BATCH_SIZE,
)

# Release cached CUDA memory, then run the garbage collector.
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Translate sub-part F5 and write its CSV.
df_partF5_tamil = translate_dataframe(
    df=parts_F['F5'],
    part_name='F5',
    output_file='train_tamil_partF5.csv',
    batch_size=BATCH_SIZE,
)

# Release cached CUDA memory, then run the garbage collector.
torch.cuda.empty_cache()
gc.collect()

## Part G

In [None]:
# Translate sub-part G1 and write its CSV.
df_partG1_tamil = translate_dataframe(
    df=parts_G['G1'],
    part_name='G1',
    output_file='train_tamil_partG1.csv',
    batch_size=BATCH_SIZE,
)

# Release cached CUDA memory, then run the garbage collector.
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Translate sub-part G2 and write its CSV.
df_partG2_tamil = translate_dataframe(
    df=parts_G['G2'],
    part_name='G2',
    output_file='train_tamil_partG2.csv',
    batch_size=BATCH_SIZE,
)

# Release cached CUDA memory, then run the garbage collector.
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Translate sub-part G3 and write its CSV.
df_partG3_tamil = translate_dataframe(
    df=parts_G['G3'],
    part_name='G3',
    output_file='train_tamil_partG3.csv',
    batch_size=BATCH_SIZE,
)

# Release cached CUDA memory, then run the garbage collector.
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Translate sub-part G4 and write its CSV.
df_partG4_tamil = translate_dataframe(
    df=parts_G['G4'],
    part_name='G4',
    output_file='train_tamil_partG4.csv',
    batch_size=BATCH_SIZE,
)

# Release cached CUDA memory, then run the garbage collector.
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Translate sub-part G5 and write its CSV.
df_partG5_tamil = translate_dataframe(
    df=parts_G['G5'],
    part_name='G5',
    output_file='train_tamil_partG5.csv',
    batch_size=BATCH_SIZE,
)

# Release cached CUDA memory, then run the garbage collector.
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Banner closing the train-translation section.
print("\n" + "=" * 80, "ALL TRAIN PARTS COMPLETE", "=" * 80, sep="\n")

## Dev

In [None]:
print("\nTRANSLATING DEV SPLIT - Part 1", "=" * 80, sep="\n")

# Translate dev sub-part 1 and write its CSV.
df_partDev1_tamil = translate_dataframe(
    df=parts_Dev['Dev1'],
    part_name='Dev1',
    output_file='dev_tamil_partDev1.csv',
    batch_size=BATCH_SIZE,
)

# Release cached CUDA memory, then run the garbage collector.
torch.cuda.empty_cache()
gc.collect()

print(f"\nDev complete: {len(df_partDev1_tamil):,} rows")

In [None]:
print("\nTRANSLATING DEV SPLIT - Part 2", "=" * 80, sep="\n")

# Translate dev sub-part 2 and write its CSV.
df_partDev2_tamil = translate_dataframe(
    df=parts_Dev['Dev2'],
    part_name='Dev2',
    output_file='dev_tamil_partDev2.csv',
    batch_size=BATCH_SIZE,
)

# Release cached CUDA memory, then run the garbage collector.
torch.cuda.empty_cache()
gc.collect()

print(f"\nDev complete: {len(df_partDev2_tamil):,} rows")

In [None]:
print("\nTRANSLATING DEV SPLIT - Part 3", "=" * 80, sep="\n")

# Translate dev sub-part 3 and write its CSV.
df_partDev3_tamil = translate_dataframe(
    df=parts_Dev['Dev3'],
    part_name='Dev3',
    output_file='dev_tamil_partDev3.csv',
    batch_size=BATCH_SIZE,
)

# Release cached CUDA memory, then run the garbage collector.
torch.cuda.empty_cache()
gc.collect()

print(f"\nDev complete: {len(df_partDev3_tamil):,} rows")

# Post-Processing

## Concatenate

In [None]:
print("\nCONCATENATING ALL PARTS", "=" * 80, sep="\n")

# Translated train frames in original row order: A, then B1..G5.
train_frames = [
    df_partA_tamil,
    df_partB1_tamil, df_partB2_tamil, df_partB3_tamil, df_partB4_tamil, df_partB5_tamil,
    df_partC1_tamil, df_partC2_tamil, df_partC3_tamil, df_partC4_tamil, df_partC5_tamil,
    df_partD1_tamil, df_partD2_tamil, df_partD3_tamil, df_partD4_tamil, df_partD5_tamil,
    df_partE1_tamil, df_partE2_tamil, df_partE3_tamil, df_partE4_tamil, df_partE5_tamil,
    df_partF1_tamil, df_partF2_tamil, df_partF3_tamil, df_partF4_tamil, df_partF5_tamil,
    df_partG1_tamil, df_partG2_tamil, df_partG3_tamil, df_partG4_tamil, df_partG5_tamil,
]
dev_frames = [df_partDev1_tamil, df_partDev2_tamil, df_partDev3_tamil]

# ignore_index=True renumbers rows 0..n-1 across the stitched frames.
df_train_tamil_complete = pd.concat(train_frames, ignore_index=True)
df_dev_tamil_complete = pd.concat(dev_frames, ignore_index=True)

# Same CSV options as the per-part files written by translate_dataframe.
csv_options = {"index": False, "encoding": 'utf-8-sig'}
df_train_tamil_complete.to_csv('squad2_train_tamil_complete.csv', **csv_options)
df_dev_tamil_complete.to_csv('squad2_dev_tamil_complete.csv', **csv_options)

print(
    f"Train: {len(df_train_tamil_complete):,} rows",
    f"Dev: {len(df_dev_tamil_complete):,} rows",
    "=" * 80,
    sep="\n",
)

In [None]:
print("\nFINAL STATISTICS")
print("="*80)

# The concatenated dev frame is named `df_dev_tamil_complete`; the previous
# references to `df_dev_tamil` raised NameError because no such variable is
# ever assigned.
train_rows = len(df_train_tamil_complete)
dev_rows = len(df_dev_tamil_complete)

print(f"\nCOMPLETED TRANSLATION:")
print(f"  Train: {train_rows:,} rows")
print(f"  Dev: {dev_rows:,} rows")
print(f"  Total: {train_rows + dev_rows:,} rows")

# Character-count ratio of each translated field to its English source.
# These columns are added to the in-memory frame only; the complete CSVs
# were already written before this cell runs.
df_train_tamil_complete['context_ratio'] = (
    df_train_tamil_complete['context_tamil'].str.len() /
    df_train_tamil_complete['context'].str.len()
)
df_train_tamil_complete['question_ratio'] = (
    df_train_tamil_complete['question_tamil'].str.len() /
    df_train_tamil_complete['question'].str.len()
)

print(f"\nLENGTH RATIOS (Tamil/English):")
print(f"  Context median: {df_train_tamil_complete['context_ratio'].median():.2f}x")
print(f"  Question median: {df_train_tamil_complete['question_ratio'].median():.2f}x")

print(f"\nOUTPUT FILES:")
print(f"  squad2_train_tamil_complete.csv ({train_rows:,} rows)")
print(f"  squad2_dev_tamil_complete.csv ({dev_rows:,} rows)")
# List the backup files that are actually written (A, B1..G5, Dev1..Dev3).
print(f"  train_tamil_partA.csv, train_tamil_partB1.csv ... train_tamil_partG5.csv, dev_tamil_partDev1-3.csv (backups)")

print("\n" + "="*80)
print("TRANSLATION COMPLETE")
print("="*80)

In [None]:
print("\nSAMPLE TRANSLATIONS FROM COMPLETE DATASET")
print("="*80)

# Spot-check rows spread across the train parts. Indices beyond the end of
# the frame are skipped so a shorter dataset does not raise IndexError
# partway through the printout.
available_rows = len(df_train_tamil_complete)
sample_indices = [
    idx for idx in [0, 15000, 40000, 60000, 80000, 100000, 125000]
    if idx < available_rows
]

for idx in sample_indices:
    row = df_train_tamil_complete.iloc[idx]
    print(f"\nSample {idx+1} (Row {idx}):")
    print(f"  Article: {row['article_title']}")
    print(f"  \n  English Q: {row['question']}")
    print(f"  Tamil Q:   {row['question_tamil']}")
    # A missing English answer is shown as N/A.
    print(f"  \n  English A: {row['answer_text'] if pd.notna(row['answer_text']) else 'N/A'}")
    print(f"  Tamil A:   {row['answer_text_tamil']}")
    print("-"*80)

print("\n" + "="*80)
print("READY FOR BENCHMARKING")
print("="*80)