In [None]:
!pip install --upgrade torch==2.3.0
!pip install --upgrade transformers==4.41.2
!pip install accelerate sentencepiece

Collecting torch==2.3.0
  Downloading torch-2.3.0-cp312-cp312-manylinux1_x86_64.whl.metadata (26 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.3.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.3.0)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.3.0)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch==2.3.0)
  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch==2.3.0)
  Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch==2.3.0)
  Downloading nvidia_cufft_cu12-11.0.2.54-py3-none-manylin

Collecting transformers==4.41.2
  Downloading transformers-4.41.2-py3-none-any.whl.metadata (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.41.2-py3-none-any.whl (9.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m111.1 MB/s[0m eta [36m0:00:00[0m
[?25hTraceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/pip/_internal/cli/base_command.py", line 179, in exc_logging_wrapper
    status = run_func(*args)
             ^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pip/_internal/cli/req_command.py", line 67, in wrapper
^C


In [None]:
# %% === Auto-Continue TRAIN Translator (after 15k) ===
!pip -q install transformers==4.44.2 torch tqdm pandas

import json, pandas as pd, torch, gc, os, time
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from google.colab import drive
# Mount Google Drive; every path configured below lives under this mount point.
drive.mount('/content/drive')

# === CONFIG ===
# Project folder on Drive: holds the input JSON and receives the output CSVs.
ROOT = "/content/drive/MyDrive/msu-npl"
# SQuAD-style training file that gets flattened and translated.
TRAIN_PATH = f"{ROOT}/train-v2.0.json"
# Checkpoint name handed to the tokenizer/model `from_pretrained` calls.
MODEL_NAME = "facebook/nllb-200-distilled-600M"
# Language codes: the source is set on the tokenizer, the target selects the
# token generation is forced to start with.
SRC_LANG = "eng_Latn"
TGT_LANG = "hin_Deva"
# Number of rows translated and written per output CSV.
CHUNK_TRAIN = 5000
# Row index at which this run starts translating.
RESUME_FROM_TRAIN = 15000   # resume after last completed chunk

# === Load Model ===
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("✅ Device:", device)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)
tokenizer.src_lang = SRC_LANG

# Resolve the target-language token through the regular vocabulary lookup.
# This does not rely on the tokenizer exposing `lang_code_to_id`, so no
# AttributeError fallback is required.
bos_id = tokenizer.convert_tokens_to_ids(TGT_LANG)
if bos_id is None or bos_id == tokenizer.unk_token_id:
    # Without this check a mistyped language code would silently force the
    # unknown token as the first generated token.
    raise ValueError(f"Unknown target language code for {MODEL_NAME}: {TGT_LANG!r}")
print("✅ Model loaded successfully!")

# === Helper: flatten SQuAD JSON ===
def flatten_squad(path):
    """Flatten a SQuAD-style JSON file into one DataFrame row per question.

    Parameters
    ----------
    path : str or os.PathLike
        JSON file whose top-level ``"data"`` list holds articles; each article
        has ``"paragraphs"``, and each paragraph has a ``"context"`` string and
        a ``"qas"`` list of question entries.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``context``, ``question``, ``answer_text`` (in that
        order), one row per question. ``title`` is ``""`` when the article has
        no title. ``answer_text`` is the text of the first listed answer, or
        ``""`` when the question has no answers. The columns are present even
        when the file contains no questions.

    Raises
    ------
    KeyError
        If a required key (``data``, ``paragraphs``, ``context``, ``qas``,
        ``question``) is missing.
    """
    columns = ["title", "context", "question", "answer_text"]

    def _first_answer_text(qa):
        # "answers" may be absent or empty; both cases map to an empty string.
        answers = qa.get("answers", [])
        return answers[0]["text"] if answers else ""

    # Decode explicitly as UTF-8 so reading does not depend on the platform's
    # default locale encoding.
    with open(path, encoding="utf-8") as f:
        data = json.load(f)

    recs = [
        {
            "title": art.get("title", ""),
            "context": para["context"],
            "question": qa["question"],
            "answer_text": _first_answer_text(qa),
        }
        for art in data["data"]
        for para in art["paragraphs"]
        for qa in para["qas"]
    ]
    # Passing `columns` keeps the schema stable even when `recs` is empty.
    return pd.DataFrame(recs, columns=columns)

# Flatten the training file into one row per question and report the row count.
df_train = flatten_squad(path=TRAIN_PATH)
print(f"📊 Train dataset loaded | Total: {len(df_train)} samples")

# === Translator ===
def batch_translate(texts, batch_size=8, max_len=512):
    """Translate a list of strings with the module-level tokenizer and model.

    Parameters
    ----------
    texts : list
        Items to translate. Any item that is not a ``str`` is replaced by an
        empty string before tokenization.
    batch_size : int
        Number of items tokenized and generated together per step.
    max_len : int
        Used both as the tokenizer truncation length for the inputs and as
        ``max_length`` for generation.

    Returns
    -------
    list of str
        One decoded output per input item, in input order, with special tokens
        removed.
    """
    translations = []
    batch_starts = range(0, len(texts), batch_size)
    for start in tqdm(batch_starts, desc="Translating"):
        window = texts[start:start + batch_size]
        # Non-string entries are fed to the model as empty strings.
        prepared = [str(item) if isinstance(item, str) else "" for item in window]
        encoded = tokenizer(
            prepared,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_len,
        ).to(device)
        # Inference only: no gradients are needed.
        with torch.no_grad():
            generated = model.generate(
                **encoded,
                forced_bos_token_id=bos_id,
                max_length=max_len,
            )
        for sequence in generated:
            translations.append(tokenizer.decode(sequence, skip_special_tokens=True))
        # Release cached GPU blocks and collect garbage after every batch.
        torch.cuda.empty_cache()
        gc.collect()
    return translations

# === Loop through remaining train chunks ===
# Every chunk of CHUNK_TRAIN rows is translated and written to its own CSV.
# A chunk whose CSV already exists is skipped, so re-running this cell after an
# interruption continues with the first chunk that has not been saved yet.
for TRAIN_START in range(RESUME_FROM_TRAIN, len(df_train), CHUNK_TRAIN):
    TRAIN_END = min(TRAIN_START + CHUNK_TRAIN, len(df_train))
    train_out = f"{ROOT}/train_translated_{TRAIN_START}-{TRAIN_END}.csv"

    if os.path.exists(train_out):
        print(f"⏭️ Skipping Train {TRAIN_START}-{TRAIN_END} (already saved: {train_out})")
        continue

    print(f"\n=== 🔹 Translating Train {TRAIN_START}-{TRAIN_END} ===")

    train_chunk = df_train.iloc[TRAIN_START:TRAIN_END].copy().reset_index(drop=True)

    # Questions from the same paragraph share one context string, so translate
    # each distinct context once and map the result back onto every row.
    unique_contexts = train_chunk["context"].drop_duplicates().tolist()
    context_hi = dict(zip(unique_contexts, batch_translate(unique_contexts)))
    train_chunk["context_hi"]  = train_chunk["context"].map(context_hi)
    train_chunk["question_hi"] = batch_translate(train_chunk["question"].tolist())

    # Write under a temporary name and rename afterwards, so an interrupted
    # write never leaves a partial file under the final name (which the
    # existence check above would otherwise treat as a finished chunk).
    tmp_out = train_out + ".tmp"
    train_chunk.to_csv(tmp_out, index=False)
    os.replace(tmp_out, train_out)
    print(f"✅ Saved: {train_out}")

    # brief pause between chunks to prevent GPU timeouts
    # (not needed once the final chunk has been written)
    if TRAIN_END < len(df_train):
        time.sleep(30)

print("\n🎉 All remaining training chunks processed successfully!")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Device: cuda




✅ Model loaded successfully!
📊 Train dataset loaded | Total: 130319 samples

=== 🔹 Translating Train 15000-20000 ===


Translating:   0%|          | 1/625 [00:04<50:00,  4.81s/it]