In [1]:
!pip install -q transformers datasets sentencepiece accelerate evaluate sacrebleu rouge-score

import os
import json
import gc
import pandas as pd
from dataclasses import dataclass
from typing import Dict, List

import torch
from torch import nn

import sentencepiece as spm
from sentencepiece import sentencepiece_model_pb2 as sp_proto

from datasets import Dataset, DatasetDict
from transformers import (
    AutoModelForSeq2SeqLM,
    NllbTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m80.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m68.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

2025-09-20 06:59:36.449620: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1758351576.634558      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758351576.687320      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
BASE_MODEL = "facebook/nllb-200-distilled-600M"
WORKDIR = "/kaggle/working/"
os.makedirs(WORKDIR, exist_ok=True)

CSV_PATH = "/kaggle/input/supervise-nllb/top_3000_rows.csv"  # expects columns: Hindi, Bhili
SPM_PREFIX = os.path.join(WORKDIR, "spm_bhili")
MERGED_SPM_PATH = os.path.join(WORKDIR, "sentencepiece.bpe.model")

SRC_LANG = "hin_Deva"   # Hindi (existing NLLB tag)
NEW_LANG = "bhb_Deva"   # Bhili in Devanagari; use "bhb_Gujr" if Gujarati script
MAX_LENGTH = 256
BATCH_SIZE = 2
NUM_EPOCHS = 3
LR = 5e-5
OUTPUT_DIR = os.path.join(WORKDIR, "finetuned-hin-bhb")


# Loading Dataset

In [3]:
df = pd.read_csv(CSV_PATH)
df = df.rename(columns={c: c.strip() for c in df.columns})
df = df.dropna(subset=["Hindi", "Bhili"])
df = df[["Hindi", "Bhili"]].reset_index(drop=True)
len(df), df.head()


(999,
                                                Hindi  \
 0  वे महाराष्ट्र के चंद्रपुर संसदीय क्षेत्र से 16...   
 1  उन्होंने 1972 बैच के आंध्रप्रदेश कैडर के भारती...   
 2  वे आंध्रप्रदेश कैडर के 1972 बैच के भारतीय प्रश...   
 3  उन्होंने 1975 के बैच के भारतीय रेल सेवा के सिग...   
 4  1975 के बैच के सिग्‍नल अभियंताओं के भारतीय रेल...   
 
                                                Bhili  
 0  त्यां महारास्ट्र नां सन्द्रपुर संसदीय ईलाका थी...  
 1  तिनायीं 1972 पाळी ना आंध्रप्रदेस कैडर ना भारती...  
 2  त्यां आंध्रप्रदेस कैडर ना 1972 पाळी नां भारतीय...  
 3  तिहुयें 1975 नी पाळी ना भारतीय रेल सेवा ना सिग...  
 4  1975 नां बैस ना सिग्नल इनजींनीयर नीं भारतीय रे...  )

#  Building a Custom Tokenizer

In [4]:
bhili_corpus_path = os.path.join(WORKDIR, "bhili_corpus.txt")
df["Bhili"].to_csv(bhili_corpus_path, index=False, header=False)

spm.SentencePieceTrainer.train(
    input=bhili_corpus_path,
    model_prefix=SPM_PREFIX,
    vocab_size=2100,
    character_coverage=1.0,
    model_type="unigram",
    shuffle_input_sentence=True,
    num_threads=4,
    bos_id=0,
    eos_id=2,
    pad_id=1,
    unk_id=3,
    add_dummy_prefix=True,
)


sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: /kaggle/working/bhili_corpus.txt
  input_format: 
  model_prefix: /kaggle/working/spm_bhili
  model_type: UNIGRAM
  vocab_size: 2100
  self_test_sample_size: 0
  character_coverage: 1
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 4
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 3
  bos_id: 0
  eos_id: 2
  pad_id: 1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_d

# Training SentencePiece and Adding New-Tag

In [5]:
tok_old = NllbTokenizer.from_pretrained(BASE_MODEL)
sp_old = sp_proto.ModelProto()
sp_old.ParseFromString(tok_old.sp_model.serialized_model_proto())

sp_trained = spm.SentencePieceProcessor(model_file=f"{SPM_PREFIX}.model")
sp_new = sp_proto.ModelProto()
sp_new.ParseFromString(sp_trained.serialized_model_proto())

existing = {p.piece for p in sp_old.pieces}
min_score = min(p.score for p in sp_old.pieces)

added = 0
for p in sp_new.pieces:
    if p.type != 1:  # normal pieces only
        continue
    if p.piece not in existing:
        new_p = sp_proto.ModelProto().SentencePiece()
        new_p.piece = p.piece
        new_p.score = min_score - 1.0
        sp_old.pieces.append(new_p)
        added += 1

with open(MERGED_SPM_PATH, "wb") as f:
    f.write(sp_old.SerializeToString())

print(f"Added {added} new pieces into merged SentencePiece model at {MERGED_SPM_PATH}")


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

Added 941 new pieces into merged SentencePiece model at /kaggle/working/sentencepiece.bpe.model


# Merging Tokenizer

In [6]:
tok_merged = NllbTokenizer.from_pretrained(
    BASE_MODEL,
    vocab_file=MERGED_SPM_PATH,
)

additional = list(tok_merged.additional_special_tokens)
if NEW_LANG not in additional:
    additional.append(NEW_LANG)
tok_merged.add_special_tokens({"additional_special_tokens": additional})

print("Tokenizer size after SP merge:", len(tok_merged))
print("New language token id:", tok_merged.convert_tokens_to_ids(NEW_LANG))


Tokenizer size after SP merge: 256943
New language token id: 256942


In [7]:
model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL)
model.resize_token_embeddings(len(tok_merged))

with torch.no_grad():
    new_id = tok_merged.convert_tokens_to_ids(NEW_LANG)
    hin_id = tok_merged.convert_tokens_to_ids(SRC_LANG)
    if new_id is not None and hin_id is not None:
        model.model.shared.weight[new_id] = model.model.shared.weight[hin_id].clone()


config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [8]:
tok_merged.src_lang = SRC_LANG
tok_merged.tgt_lang = NEW_LANG

def preprocess(batch):
    return tok_merged(
        batch["Hindi"],
        text_target=batch["Bhili"],
        max_length=MAX_LENGTH,
        truncation=True,
    )


In [9]:
split_idx = int(0.9 * len(df))
train_df = df.iloc[:split_idx].copy()
val_df = df.iloc[split_idx:].copy()

ds = DatasetDict({
    "train": Dataset.from_pandas(train_df, preserve_index=False),
    "validation": Dataset.from_pandas(val_df, preserve_index=False),
})

tokenized = ds.map(preprocess, batched=True, remove_columns=["Hindi", "Bhili"])
tokenized


Map:   0%|          | 0/899 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 899
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
})

In [10]:
collator = DataCollatorForSeq2Seq(tokenizer=tok_merged, model=model, padding=True)


# Training

In [11]:
args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    learning_rate=LR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    logging_steps=50,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),
    report_to=[],
)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    data_collator=collator,
    tokenizer=tok_merged,
)

trainer.train()
trainer.save_model(OUTPUT_DIR)
tok_merged.save_pretrained(OUTPUT_DIR)


  trainer = Seq2SeqTrainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,2.472,3.3564
2,1.8642,3.096367
3,1.7341,3.016486




RuntimeError: [enforce fail at inline_container.cc:626] . unexpected pos 3662915840 vs 3662915728

In [19]:
# !pip install -q transformers sacrebleu pandas

import os, re, torch, pandas as pd
from typing import List
from transformers import AutoModelForSeq2SeqLM, NllbTokenizer
import sacrebleu

# --------------------------
# Paths and language settings
# --------------------------
# If uploaded as Kaggle dataset:
# BASE_DIR = "/kaggle/input/my-finetuned-nllb"  # contains checkpoint-XXXX or the saved model files
# If using current working directory outputs:
# BASE_DIR = "/kaggle/working/finetuned-hin-bhb"

BASE_DIR = "/kaggle/working/finetuned-hin-bhb/checkpoint-1350"   # change me
SRC_LANG = "hin_Deva"   # source language tag present in NLLB
TGT_LANG = "bhb_Deva"   # target tag you used during fine-tuning (e.g., Bhili in Devanagari)

# Evaluation CSV with columns: source and reference
EVAL_CSV = "/kaggle/input/supervise-nllb2/1k_test.csv"  # change me
SRC_COL = "Hindi"      # source column name
REF_COL = "Bhili"      # reference column name

MAX_LENGTH = 256
BATCH_SIZE = 4
NUM_BEAMS = 4

# --------------------------
# Pick checkpoint directory
# --------------------------
ckpts = [d for d in os.listdir(BASE_DIR) if d.startswith("checkpoint-") and os.path.isdir(os.path.join(BASE_DIR, d))]
if not ckpts:
    CHECKPOINT_DIR = BASE_DIR  # model saved directly at root
else:
    steps = [(int(re.findall(r"checkpoint-(\d+)", d)[0]), d) for d in ckpts if re.findall(r"checkpoint-(\d+)", d)]
    steps.sort()
    CHECKPOINT_DIR = os.path.join(BASE_DIR, steps[-1][1])
print("Using checkpoint:", CHECKPOINT_DIR)

# --------------------------
# Load model and tokenizer
# --------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForSeq2SeqLM.from_pretrained(CHECKPOINT_DIR).to(device)
tokenizer = NllbTokenizer.from_pretrained(CHECKPOINT_DIR, src_lang=SRC_LANG, tgt_lang=TGT_LANG)
forced_bos_id = tokenizer.convert_tokens_to_ids(TGT_LANG)
assert forced_bos_id is not None, f"Target language token {TGT_LANG} not found in tokenizer."

# --------------------------
# Generation function
# --------------------------
@torch.no_grad()
def translate(texts: List[str]) -> List[str]:
    enc = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=MAX_LENGTH).to(device)
    gen = model.generate(
        **enc,
        forced_bos_token_id=forced_bos_id,  # critical for NLLB target control
        max_length=MAX_LENGTH,
        num_beams=NUM_BEAMS
    )
    return tokenizer.batch_decode(gen, skip_special_tokens=True)

# --------------------------
# Load eval data and translate
# --------------------------
df = pd.read_csv(EVAL_CSV)
sources = df[SRC_COL].fillna("").astype(str).tolist()
references = df[REF_COL].fillna("").astype(str).tolist()

predictions = []
for i in range(0, len(sources), BATCH_SIZE):
    batch = sources[i:i+BATCH_SIZE]
    predictions.extend(translate(batch))

# --------------------------
# Scoring: BLEU and chrF2
# --------------------------
# Feed detokenized strings to SacreBLEU and let it handle tokenization for BLEU.
# chrF2 is the default for chrF in SacreBLEU (beta=2).
bleu = sacrebleu.corpus_bleu(predictions, [references])         # BLEU
chrf2 = sacrebleu.CHRF(word_order=0, beta=2).corpus_score(       # chrF2 (character n-grams only)
    predictions, [references]
)

print(f"BLEU = {bleu.score:.2f}")
print(f"chrF2 = {chrf2.score:.2f}")


Using checkpoint: /kaggle/working/finetuned-hin-bhb/checkpoint-1350
BLEU = 8.41
chrF2 = 34.60
