## Part 0: Environment Set Up

Run the following cells to load the necessary dependencies and the model Llama 3.2 1b. These should be very similar to the steps in a3.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%pip install tinker

Collecting tinker
  Downloading tinker-0.5.1-py3-none-any.whl.metadata (1.9 kB)
Downloading tinker-0.5.1-py3-none-any.whl (160 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m160.1/160.1 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tinker
Successfully installed tinker-0.5.1


In [3]:
import tinker
from google.colab import userdata

tinker_api_key = userdata.get('tinker-key')
service_client = tinker.ServiceClient(api_key=tinker_api_key)

In [4]:
base_model = "meta-llama/Llama-3.2-1B"

training_client = service_client.create_lora_training_client(
    base_model=base_model
)

tokenizer = training_client.get_tokenizer()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

In [21]:
from tinker import types
import json

# REPLACE WITH YOUR OWN FILE PATH "/content/drive/{path}"
project_directory = "/content/drive/MyDrive/2025-2026/NLP/project"
script_type = "romanized"

#native to english
dataset_path = f"{project_directory}/datasets/{script_type}/train.jsonl"

def process_example(example, tokenizer):
    hi = example["hi"].strip()
    en = example["en"].strip()

    if not en:
        return None

    # Input is JUST Hindi
    prompt = hi

    # Tokenization
    prompt_tokens = tokenizer.encode(prompt, add_special_tokens=True)
    prompt_weights = [0] * len(prompt_tokens)

    completion_text = en.strip() + " <end_of_text>"
    completion_tokens = tokenizer.encode(" " + en, add_special_tokens=False)
    completion_weights = [1] * len(completion_tokens)

    tokens = prompt_tokens + completion_tokens
    weights = prompt_weights + completion_weights

    input_tokens = tokens[:-1]
    target_tokens = tokens[1:]
    weights = weights[1:]

    return types.Datum(
        model_input=types.ModelInput.from_ints(tokens=input_tokens),
        loss_fn_inputs=dict(
            weights=np.array(weights, dtype=np.float32),
            target_tokens=np.array(target_tokens, dtype=np.int32)
        )
    )

# Load dataset
dataset = []
with open(dataset_path, "r") as f:
    for line in f:
        try:
            ex = process_example(json.loads(line), tokenizer)
            if ex:
                dataset.append(ex)
        except:
            continue

print(f"Loaded {len(dataset)} examples")

Loaded 9991 examples


In [22]:
from tqdm import tqdm
import numpy as np

learning_rate = 2e-5
epochs = 2
batch_size = 32  # adjust based on GPU memory

for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")

    # Iterate in batches
    for start_idx in tqdm(range(0, len(dataset), batch_size), desc="Training", unit="batch"):
        batch = dataset[start_idx:start_idx + batch_size]

        # Forward/backward pass
        fwdbwd_future = training_client.forward_backward(batch, "cross_entropy")
        optim_future = training_client.optim_step(types.AdamParams(learning_rate=learning_rate))

        # Wait for results
        fwdbwd = fwdbwd_future.result()
        optim = optim_future.result()

        # Optional: compute weighted loss for monitoring
        logprobs_list = [np.array(o["logprobs"].data) for o in fwdbwd.loss_fn_outputs]
        logprobs = np.concatenate(logprobs_list)

        weights_list = [np.array(d.loss_fn_inputs["weights"].data) for d in batch]
        weights_list = [w for w in weights_list if w.size > 0]
        weights = np.concatenate(weights_list)

        loss = -np.dot(logprobs, weights) / weights.sum()
        if (start_idx // batch_size) % 100 == 0:
            tqdm.write(
                f"Batch {start_idx//batch_size + 1}/"
                f"{len(dataset)//batch_size + 1}, Loss: {loss:.4f}"
            )

Epoch 1/2


Training:   0%|          | 1/313 [00:01<06:06,  1.17s/batch]

Batch 1/313, Loss: 1.7397


Training:  32%|‚ñà‚ñà‚ñà‚ñè      | 101/313 [06:11<38:04, 10.78s/batch]

Batch 101/313, Loss: 2.1598


Training:  64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 201/313 [10:34<11:26,  6.13s/batch]

Batch 201/313, Loss: 2.1961


Training:  96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 301/313 [14:41<00:13,  1.13s/batch]

Batch 301/313, Loss: 2.1143


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 313/313 [16:18<00:00,  3.13s/batch]


Epoch 2/2


Training:   0%|          | 1/313 [00:08<43:44,  8.41s/batch]

Batch 1/313, Loss: 1.6609


Training:  32%|‚ñà‚ñà‚ñà‚ñè      | 101/313 [05:11<16:38,  4.71s/batch]

Batch 101/313, Loss: 2.0524


Training:  64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 201/313 [09:25<06:17,  3.37s/batch]

Batch 201/313, Loss: 2.0985


Training:  96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 301/313 [14:12<00:23,  1.97s/batch]

Batch 301/313, Loss: 2.0298


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 313/313 [15:04<00:00,  2.89s/batch]


In [23]:
sampling_client = training_client.save_weights_and_get_sampling_client(
    name="llama-hi-en-translation"
)

In [35]:
from concurrent.futures import ThreadPoolExecutor

def translate_one(text):
    prompt_tokens = tokenizer.encode(text.strip(), add_special_tokens=True)
    model_input = types.ModelInput.from_ints(prompt_tokens)

    sampling_params = types.SamplingParams(
        max_tokens=60,
        temperature=0.2,
        stop=["\n", "<end_of_text>"]
    )

    result = sampling_client.sample(
        prompt=model_input,
        num_samples=1,
        sampling_params=sampling_params
    ).result()

    decoded = tokenizer.decode(result.sequences[0].tokens)
    return decoded.replace("<end_of_text>", "").strip()


In [36]:
import json
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

test_dataset_path = f"{project_directory}/datasets/{script_type}/test.jsonl"
output_path = f"{project_directory}/romanized_predictions.jsonl"

# Load lines
with open(test_dataset_path, "r") as f:
    lines = [json.loads(l) for l in f]

hi_texts = [ex["hi"].strip() for ex in lines]
refs =     [ex["en"].strip() for ex in lines]

predictions = []

# Number of parallel workers
num_workers = 16   # 8‚Äì32 works well
with ThreadPoolExecutor(max_workers=num_workers) as executor:
    futures = {executor.submit(translate_one, txt): idx for idx, txt in enumerate(hi_texts)}

    for future in tqdm(as_completed(futures), total=len(futures), desc="Translating test set", ncols=100):
        i = futures[future]
        pred = future.result()

        predictions.append({
            "hi": hi_texts[i],
            "reference": refs[i],
            "prediction": pred
        })

# Save predictions
with open(output_path, "w") as f:
    for p in predictions:
        f.write(json.dumps(p, ensure_ascii=False) + "\n")

print(f"Predictions saved to {output_path}")


Translating test set: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2507/2507 [09:52<00:00,  4.23it/s]

Predictions saved to /content/drive/MyDrive/2025-2026/NLP/project/romanized_predictions.jsonl





### Calculate BLEU score for fine-tuned model

In [37]:
!pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m51.8/51.8 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m104.1/104.1 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.

In [38]:
import sacrebleu

refs = []
hyps = []

with open(output_path, "r") as f:
    for line in f:
        entry = json.loads(line)
        hyps.append(entry["prediction"])
        refs.append([entry["reference"]])  # sacreBLEU expects list of references

bleu = sacrebleu.corpus_bleu(hyps, refs)
print("BLEU score:", bleu.score)

BLEU score: 14.095612952919945


## OLD TINKER CODE BELOW

In [24]:
import json
from tqdm import tqdm
from tinker import types

test_dataset_path = f"{project_directory}/datasets/{script_type}/test.jsonl"
output_path = f"{project_directory}/results/romanized_predictions.jsonl"

predictions = []

def translate_hindi_to_english(hindi_text, sampling_client, tokenizer, max_tokens=60, temperature=0.2):
    """
    Translate a single Hindi sentence to English using the trained LoRA model.
    """
    prompt_tokens = tokenizer.encode(hindi_text.strip(), add_special_tokens=True)
    model_input = types.ModelInput.from_ints(prompt_tokens)
    sampling_params = types.SamplingParams(max_tokens=60, temperature=0.2, stop=["<end_of_text>"])
    result = sampling_client.sample(prompt=model_input, sampling_params=sampling_params, num_samples=1).result()
    translation = tokenizer.decode(result.sequences[0].tokens)
    return translation.replace("<end_of_text>", "").strip()

# Loop over test dataset
for line in tqdm(open(test_dataset_path), desc="Generating predictions"):
    example = json.loads(line)
    hi = example["hi"].strip()
    reference = example["en"].strip()

    pred = translate_hindi_to_english(hi, sampling_client, tokenizer, max_tokens=100, temperature=0.0)

    predictions.append({
        "hi": hi,
        "reference": reference,
        "prediction": pred
    })

# Save predictions for BLEU evaluation
with open(output_path, "w") as f:
    for p in predictions:
        f.write(json.dumps(p, ensure_ascii=False) + "\n")

print(f"Predictions saved to {output_path}")


Generating predictions: 34it [00:48,  1.42s/it]


KeyboardInterrupt: 

In [25]:
def translate_hindi_to_english(hindi_text, sampling_client, tokenizer, max_tokens=60, temperature=0.2):
    """
    Translate a single Hindi sentence to English using the trained LoRA model.
    """
    prompt_tokens = tokenizer.encode(hindi_text.strip(), add_special_tokens=True)
    model_input = types.ModelInput.from_ints(prompt_tokens)
    sampling_params = types.SamplingParams(max_tokens=60, temperature=0.2, stop=["\n", "<end_of_text>"])
    result = sampling_client.sample(prompt=model_input, sampling_params=sampling_params, num_samples=1).result()
    translation = tokenizer.decode(result.sequences[0].tokens)
    return translation.replace("<end_of_text>", "").strip()

In [33]:
import json
from tqdm import tqdm

test_dataset_path = f"{project_directory}/datasets/{script_type}/test.jsonl"
batch_size = 16  # Tinker handles multiple prompts per request if desired

predictions = []

with open(test_dataset_path, "r") as f:
    lines = f.readlines()

for start_idx in tqdm(range(0, len(lines), batch_size), desc="Translating test dataset", ncols=100):
    batch_lines = lines[start_idx:start_idx+batch_size]
    batch_inputs = [json.loads(line)["hi"].strip() for line in batch_lines]
    batch_refs = [json.loads(line)["en"].strip() for line in batch_lines]

    # Translate each sentence individually
    for hi_text, ref_text in zip(batch_inputs, batch_refs):
        pred = translate_hindi_to_english(hi_text, sampling_client, tokenizer)
        predictions.append({
            "hi": hi_text,
            "reference": ref_text,
            "prediction": pred
        })

# Save predictions to file
output_path = f"{project_directory}/romanized_predictions.jsonl"
with open(output_path, "w") as f:
    for p in predictions:
        f.write(json.dumps(p, ensure_ascii=False) + "\n")

print(f"Predictions saved to {output_path}")

Translating test dataset:   0%|                                             | 0/157 [00:00<?, ?it/s]


TypeError: SamplingClient.sample() got an unexpected keyword argument 'batch'

# PREVIOUS CODE BELOW

In [None]:
%pip install huggingface_hub
%pip install sacrebleu
%pip install -U bitsandbytes
!hf auth login

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m51.8/51.8 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m104.1/104.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.2.0 sacrebl

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer,StoppingCriteria, StoppingCriteriaList
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B", dtype="auto", device_map="auto")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

## Part 0: Baseline Score Evaluation

We first get baseline BLEU scores for the model with no changes.

First, load our datasets.

In [None]:
from datasets import load_dataset

# REPLACE WITH YOUR OWN FILE PATH "/content/drive/{path}"
project_directory = "/content/drive/MyDrive/2025-2026/NLP/project"

data_files = {
    "train": f"{project_directory}/datasets/native_train.jsonl",
    "validation": f"{project_directory}/datasets/native_val.jsonl",
    "test": f"{project_directory}/datasets/native_test.jsonl"
}

ds = load_dataset("json", data_files=data_files)

train_ds = ds["train"]
val_ds   = ds["validation"]
test_ds  = ds["test"]


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [None]:
# --- Ensure tokenizer has a pad token for baseline evaluation ---
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# BASELINE BLEU EVALUATION
import sacrebleu
import torch

def baseline_generate(hindi_sentences, model, tok, max_new_tokens=80):
    # No system prompt ‚Äî pure baseline ability
    inputs = tok(hindi_sentences, return_tensors="pt", padding=True).to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False
    )
    return tok.batch_decode(outputs, skip_special_tokens=True)

baseline_preds = []
baseline_refs = []

# TODO: Determine good test data set size
for i in range(5):
    ex = test_ds[i]
    pred = baseline_generate([ex["hi"]], model, tokenizer)[0]
    baseline_preds.append(pred.strip())
    baseline_refs.append(ex["en"].strip())

baseline_bleu = sacrebleu.corpus_bleu(baseline_preds, [baseline_refs])
print("=== BASELINE BLEU (before SFT) ===")
print(baseline_bleu.score)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


=== BASELINE BLEU (before SFT) ===
0.1543677125206915


## Part 1: Supervised Fine Tuning

Tokenizer and prompt prefix.

In [None]:
# System prompt for SFT
PROMPT = (
    "You are a translation assistant. Translate the Hindi text into English. "
    "Do not add explanations or context. Output only the English translation.\n"
)

# Reuse tokenizer
tok = tokenizer
tok.padding_side = "right"
if tok.pad_token is None:
    tok.pad_token = tok.eos_token
tok.truncation_side = "left"

MAX_LEN = 400
SYS_IDS = tok(PROMPT, add_special_tokens=False)["input_ids"]

In [None]:
"""
TOKENIZATION FUNCTION
"""

def tokenize_batch(batch, include_answer=True):
    # Input: Hindi text in column "hi"
    qs = [q.rstrip() for q in batch["hi"]]
    enc_q = tok(qs, add_special_tokens=False, padding=False)

    # Target: English translation in column "en"
    if include_answer:
        ans = [a.rstrip() for a in batch["en"]]
        enc_a = tok(ans, add_special_tokens=False, padding=False)
    else:
        enc_a = {"input_ids": [[] for _ in qs]}

    input_ids_list, prompt_len_list = [], []

    for q_ids, a_ids in zip(enc_q["input_ids"], enc_a["input_ids"]):
        # prompt + hindi + english + eos
        ids = SYS_IDS + q_ids + a_ids + [tok.eos_token_id]

        if len(ids) > MAX_LEN:
            ids = ids[-MAX_LEN:]

        input_ids_list.append(ids)
        prompt_len_list.append(len(SYS_IDS) + len(q_ids))

    return {
        "input_ids": input_ids_list,
        "prompt_len": prompt_len_list,
    }

In [None]:
"""
APPLY TOKENIZATION
"""
train_tok = train_ds.map(
    tokenize_batch,
    batched=True,
    batch_size=512,
    remove_columns=train_ds.column_names,
)

val_tok = val_ds.map(
    tokenize_batch,
    batched=True,
    batch_size=512,
    remove_columns=val_ds.column_names,
)

test_tok = test_ds.map(
    tokenize_batch,
    batched=True,
    batch_size=512,
    remove_columns=test_ds.column_names,
)

In [None]:
"""
PROMPT MASKED COLLATOR
"""

import torch

class PromptMaskedCollator:
    def __init__(self, tokenizer, pad_to_multiple_of=8):
        self.tok = tokenizer
        self.pad_to_multiple_of = pad_to_multiple_of

    def __call__(self, features):
        prompt_len = torch.tensor([f["prompt_len"] for f in features], dtype=torch.long)
        feats = [{k: v for k, v in f.items() if k != "prompt_len"} for f in features]

        batch = self.tok.pad(
            feats,
            padding=True,
            return_tensors="pt",
            pad_to_multiple_of=self.pad_to_multiple_of,
        )

        input_ids = batch["input_ids"]
        attn = batch["attention_mask"]

        T = input_ids.shape[1]
        ar = torch.arange(T).unsqueeze(0)

        labels = input_ids.clone()
        labels[ar < prompt_len.unsqueeze(1)] = -100
        labels[attn == 0] = -100

        batch["labels"] = labels
        return batch

collator = PromptMaskedCollator(tok)


In [None]:
"""
LOAD LORA MODEL
"""

from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-1B",
    device_map="auto",
    torch_dtype="auto",
    attn_implementation="sdpa",
)

model.config.use_cache = False

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
"""
TRAIN
"""

from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir="./hindi_translation_sft",
    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    logging_steps=5,
    eval_strategy="steps",
    eval_steps=50,
    save_steps=300,
    save_total_limit=2,
    fp16=True,
    remove_unused_columns=False,
    gradient_checkpointing=True,
    group_by_length=True,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=val_tok.select(range(100)),
    data_collator=collator,
)

trainer.train()

trainer.save_model()
tok.save_pretrained("./hindi_translation_sft")


In [None]:
"""
BLEU EVALUATION
"""

import sacrebleu

def generate_translation(model, tok, hindi_sentences, max_new_tokens=80):
    inputs = tok(hindi_sentences, return_tensors="pt", padding=True).to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tok.batch_decode(outputs, skip_special_tokens=True)

preds = []
refs = []

for ex in test_ds:
    pred = generate_translation(model, tok, [ex["hi"]])[0]
    preds.append(pred.strip())
    refs.append(ex["en"].strip())

bleu = sacrebleu.corpus_bleu(preds, [refs])
print("BLEU:", bleu.score)