## Part 0: Environment Set Up

Run the following cells to load the necessary dependencies and the model Llama 3.2 1b. These should be very similar to the steps in a3.

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
%pip install huggingface_hub
%pip install sacrebleu
%pip install -U bitsandbytes
!hf auth login

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.2.0 sacrebleu-2.5.1

    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|       

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer,StoppingCriteria, StoppingCriteriaList
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B", dtype="auto", device_map="auto")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

## Part 0: Baseline Score Evaluation

We first get baseline BLEU scores for the model with no changes.

First, load our datasets.

In [5]:
from datasets import load_dataset

# REPLACE WITH YOUR OWN FILE PATH "/content/drive/{path}"
project_directory = "/content/drive/MyDrive/2025-2026/NLP/project"

data_files = {
    "train": f"{project_directory}/datasets/native_train.jsonl",
    "validation": f"{project_directory}/datasets/native_val.jsonl",
    "test": f"{project_directory}/datasets/native_test.jsonl"
}

ds = load_dataset("json", data_files=data_files)

train_ds = ds["train"]
val_ds   = ds["validation"]
test_ds  = ds["test"]


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [11]:
# --- Ensure tokenizer has a pad token for baseline evaluation ---
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# BASELINE BLEU EVALUATION
import sacrebleu
import torch

def baseline_generate(hindi_sentences, model, tok, max_new_tokens=80):
    # No system prompt — pure baseline ability
    inputs = tok(hindi_sentences, return_tensors="pt", padding=True).to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False
    )
    return tok.batch_decode(outputs, skip_special_tokens=True)

baseline_preds = []
baseline_refs = []

# TODO: Determine good test data set size
for i in range(5):
    ex = test_ds[i]
    pred = baseline_generate([ex["hi"]], model, tokenizer)[0]
    baseline_preds.append(pred.strip())
    baseline_refs.append(ex["en"].strip())

baseline_bleu = sacrebleu.corpus_bleu(baseline_preds, [baseline_refs])
print("=== BASELINE BLEU (before SFT) ===")
print(baseline_bleu.score)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


=== BASELINE BLEU (before SFT) ===
0.1543677125206915


## Part 1: Supervised Fine Tuning

Tokenizer and prompt prefix.

In [None]:
# System prompt for SFT
PROMPT = (
    "You are a translation assistant. Translate the Hindi text into English. "
    "Do not add explanations or context. Output only the English translation.\n"
)

# Reuse tokenizer
tok = tokenizer
tok.padding_side = "right"
if tok.pad_token is None:
    tok.pad_token = tok.eos_token
tok.truncation_side = "left"

MAX_LEN = 400
SYS_IDS = tok(PROMPT, add_special_tokens=False)["input_ids"]

In [None]:
"""
TOKENIZATION FUNCTION
"""

def tokenize_batch(batch, include_answer=True):
    # Input: Hindi text in column "hi"
    qs = [q.rstrip() for q in batch["hi"]]
    enc_q = tok(qs, add_special_tokens=False, padding=False)

    # Target: English translation in column "en"
    if include_answer:
        ans = [a.rstrip() for a in batch["en"]]
        enc_a = tok(ans, add_special_tokens=False, padding=False)
    else:
        enc_a = {"input_ids": [[] for _ in qs]}

    input_ids_list, prompt_len_list = [], []

    for q_ids, a_ids in zip(enc_q["input_ids"], enc_a["input_ids"]):
        # prompt + hindi + english + eos
        ids = SYS_IDS + q_ids + a_ids + [tok.eos_token_id]

        if len(ids) > MAX_LEN:
            ids = ids[-MAX_LEN:]

        input_ids_list.append(ids)
        prompt_len_list.append(len(SYS_IDS) + len(q_ids))

    return {
        "input_ids": input_ids_list,
        "prompt_len": prompt_len_list,
    }

In [None]:
"""
APPLY TOKENIZATION
"""
train_tok = train_ds.map(
    tokenize_batch,
    batched=True,
    batch_size=512,
    remove_columns=train_ds.column_names,
)

val_tok = val_ds.map(
    tokenize_batch,
    batched=True,
    batch_size=512,
    remove_columns=val_ds.column_names,
)

test_tok = test_ds.map(
    tokenize_batch,
    batched=True,
    batch_size=512,
    remove_columns=test_ds.column_names,
)

In [None]:
"""
PROMPT MASKED COLLATOR
"""

import torch

class PromptMaskedCollator:
    def __init__(self, tokenizer, pad_to_multiple_of=8):
        self.tok = tokenizer
        self.pad_to_multiple_of = pad_to_multiple_of

    def __call__(self, features):
        prompt_len = torch.tensor([f["prompt_len"] for f in features], dtype=torch.long)
        feats = [{k: v for k, v in f.items() if k != "prompt_len"} for f in features]

        batch = self.tok.pad(
            feats,
            padding=True,
            return_tensors="pt",
            pad_to_multiple_of=self.pad_to_multiple_of,
        )

        input_ids = batch["input_ids"]
        attn = batch["attention_mask"]

        T = input_ids.shape[1]
        ar = torch.arange(T).unsqueeze(0)

        labels = input_ids.clone()
        labels[ar < prompt_len.unsqueeze(1)] = -100
        labels[attn == 0] = -100

        batch["labels"] = labels
        return batch

collator = PromptMaskedCollator(tok)


In [None]:
"""
LOAD LORA MODEL
"""

from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-1B",
    device_map="auto",
    torch_dtype="auto",
    attn_implementation="sdpa",
)

model.config.use_cache = False

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
"""
TRAIN
"""

from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir="./hindi_translation_sft",
    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    logging_steps=5,
    eval_strategy="steps",
    eval_steps=50,
    save_steps=300,
    save_total_limit=2,
    fp16=True,
    remove_unused_columns=False,
    gradient_checkpointing=True,
    group_by_length=True,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=val_tok.select(range(100)),
    data_collator=collator,
)

trainer.train()

trainer.save_model()
tok.save_pretrained("./hindi_translation_sft")


In [None]:
"""
BLEU EVALUATION
"""

import sacrebleu

def generate_translation(model, tok, hindi_sentences, max_new_tokens=80):
    inputs = tok(hindi_sentences, return_tensors="pt", padding=True).to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tok.batch_decode(outputs, skip_special_tokens=True)

preds = []
refs = []

for ex in test_ds:
    pred = generate_translation(model, tok, [ex["hi"]])[0]
    preds.append(pred.strip())
    refs.append(ex["en"].strip())

bleu = sacrebleu.corpus_bleu(preds, [refs])
print("BLEU:", bleu.score)