In [1]:
import pandas as pd
import numpy as np 
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(


In [2]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("translation", model="facebook/nllb-200-3.3B")

  _torch_pytree._register_pytree_node(
Loading checkpoint shards: 100%|██████████| 3/3 [01:50<00:00, 36.92s/it]


In [3]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")

In [4]:
src_lang = "eng_Latn"
tgt_lang = "tel_Telu"   # Telugu

INPUT_CSV  = "train_dataset.csv"          
OUTPUT_CSV = "dataset_telugu.csv"   
QUESTION_COL = "qa_question"              
NEW_COL = "question_in_telugu"            
MAX_LEN = 200

In [5]:
df = pd.read_csv('train_dataset.csv')

if QUESTION_COL not in df.columns:
    raise ValueError(f"Column '{QUESTION_COL}' not found in {INPUT_CSV}")
assert df[QUESTION_COL].isna().sum() == 0, "Found missing values in questions."

texts = df[QUESTION_COL].astype(str).tolist()

In [6]:
tokenizer.src_lang = src_lang

In [8]:
import torch

In [9]:
def get_forced_bos_id(tok, lang_code: str):
    # 1) Preferred (newer transformers)
    if hasattr(tok, "lang_code_to_id") and isinstance(getattr(tok, "lang_code_to_id"), dict):
        if lang_code in tok.lang_code_to_id:
            return tok.lang_code_to_id[lang_code]
    # 2) Convert token -> id (works in many versions)
    try:
        tid = tok.convert_tokens_to_ids(lang_code)
        if isinstance(tid, int) and tid != tok.unk_token_id:
            return tid
    except Exception:
        pass
    # 3) Look up in vocab directly
    vocab = tok.get_vocab() if hasattr(tok, "get_vocab") else {}
    if lang_code in vocab:
        return vocab[lang_code]
    # 4) As a last resort, raise a clear error
    raise RuntimeError(
        f"Could not resolve BOS id for language '{lang_code}'. "
        "Try updating transformers: pip install -U transformers"
    )

FORCED_BOS_ID = get_forced_bos_id(tokenizer, tgt_lang)

device = torch.device("mps")
model.to(device)
model.eval()

M2M100ForConditionalGeneration(
  (model): M2M100Model(
    (shared): Embedding(256206, 1024, padding_idx=1)
    (encoder): M2M100Encoder(
      (embed_tokens): Embedding(256206, 1024, padding_idx=1)
      (embed_positions): M2M100SinusoidalPositionalEmbedding()
      (layers): ModuleList(
        (0-11): 12 x M2M100EncoderLayer(
          (self_attn): M2M100Attention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerNorm

In [10]:
def translate_batch(batch_texts):
    enc = tokenizer(
        batch_texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=MAX_LEN,
    ).to(device)
    
    # FIX: Use tokenizer.convert_tokens_to_ids() instead
    # Set the target language properly
    tokenizer.src_lang = "eng_Latn"  # Source language (English)
    forced_bos_token_id = tokenizer.convert_tokens_to_ids(tgt_lang)
    
    with torch.no_grad():
        gen = model.generate(
            **enc,
            forced_bos_token_id=forced_bos_token_id,
            max_length=MAX_LEN,
        )
    
    return [tokenizer.decode(g, skip_special_tokens=True) for g in gen]

In [11]:
from tqdm import tqdm
import torch


translated = []
for i in tqdm(range(0, len(texts), 32), desc="Translating to Telugu"):
    batch = texts[i:i+32]
    translated.extend(translate_batch(batch))

Translating to Telugu: 100%|██████████| 4073/4073 [6:10:24<00:00,  5.46s/it]   


In [12]:
len(translated)

130319

In [13]:
df[NEW_COL] = translated

In [14]:
df.head()

Unnamed: 0,answer_answer_start,answer_text,context,paragraph_index,plausible_answer_answer_start,plausible_answer_text,qa_id,qa_is_impossible,qa_question,qas_index,title,version,question_in_telugu
0,269.0,in the late 1990s,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,0,,,56be85543aeaaa14008c9063,False,When did Beyonce start becoming popular?,0,Beyoncé,v2.0,బీయన్స్ ఎప్పుడు ప్రజాదరణ పొందడం ప్రారంభించింది?
1,207.0,singing and dancing,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,0,,,56be85543aeaaa14008c9065,False,What areas did Beyonce compete in when she was...,1,Beyoncé,v2.0,బీయన్స్ ఏ రంగాల్లో పోటీ పడ్డాడు?
2,526.0,2003,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,0,,,56be85543aeaaa14008c9066,False,When did Beyonce leave Destiny's Child and bec...,2,Beyoncé,v2.0,బీయన్స్ ఎప్పుడు డెస్టినీ చైల్డ్ను విడిచిపెట్టి...
3,166.0,"Houston, Texas",Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,0,,,56bf6b0f3aeaaa14008c9601,False,In what city and state did Beyonce grow up?,3,Beyoncé,v2.0,"బీయన్స్ ఏ నగరంలో, ఏ రాష్ట్రంలో పెరిగాడు?"
4,276.0,late 1990s,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,0,,,56bf6b0f3aeaaa14008c9602,False,In which decade did Beyonce become famous?,4,Beyoncé,v2.0,ఏ దశాబ్దంలో బీయన్స్ ప్రసిద్ధి చెందింది?


In [16]:
df.to_csv('translated_to_telugu_train.csv', index=False)