In [1]:
import os
import torch

# Set new cache directories
os.environ["HF_DATASETS_CACHE"] = "/ephemeral/hf_cache"
os.environ["HF_HOME"] = "/ephemeral/transformers_cache"
os.environ["TMPDIR"] = "/ephemeral/tmp"

# Ensure the directories exist
os.makedirs("/ephemeral/hf_cache", exist_ok=True)
os.makedirs("/ephemeral/transformers_cache", exist_ok=True)
os.makedirs("/ephemeral/tmp", exist_ok=True)

import torch
import wandb
from transformers import AutoModelForCausalLM,AutoTokenizer,Trainer,TrainingArguments,DataCollatorForLanguageModeling,DataCollatorWithPadding
from datasets import Dataset, DatasetDict, load_dataset
from huggingface_hub import login

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = "cpu"

In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import PeftModel
import torch

# Initialize the tokenizer with the source language (Mongolian in Cyrillic)
tokenizer = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-1.3B",
    src_lang="khk_Cyrl"
)

# Load the base model directly onto the CPU
base_model = AutoModelForSeq2SeqLM.from_pretrained(
    "facebook/nllb-200-distilled-1.3B",
    device_map={"": "cpu"}  # Explicitly set the device map to CPU
)

# Load the fine-tuned LoRA model onto the CPU
model = PeftModel.from_pretrained(
    base_model,
    "Billyyy/mon_nllb_3B",
    torch_device="cpu"  # Ensure the adapter is loaded on the CPU
)

# Confirm the model is on CPU
device = torch.device("cpu")
model.to(device)

PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): M2M100ForConditionalGeneration(
      (model): M2M100Model(
        (shared): M2M100ScaledWordEmbedding(256206, 1024, padding_idx=1)
        (encoder): M2M100Encoder(
          (embed_tokens): M2M100ScaledWordEmbedding(256206, 1024, padding_idx=1)
          (embed_positions): M2M100SinusoidalPositionalEmbedding()
          (layers): ModuleList(
            (0-23): 24 x M2M100EncoderLayer(
              (self_attn): M2M100SdpaAttention(
                (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (v_proj): lora.Linear(
                  (base_layer): Linear(in_features=1024, out_features=1024, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.1, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=1024, out_features=16, bias=False)
                  )
                  

In [376]:
from datasets import load_dataset

# Load the FLORES-200 dataset
dataset = load_dataset("facebook/flores", "eng_Latn-khk_Cyrl", split="devtest")

In [383]:
dataset[4]

{'id': 5,
 'URL': 'https://en.wikinews.org/wiki/Nobel_Prize_in_Literature_Committee_abandons_efforts_to_contact_Bob_Dylan',
 'domain': 'wikinews',
 'topic': 'music',
 'has_image': 0,
 'has_hyperlink': 0,
 'sentence_eng_Latn': 'Danius said, "Right now we are doing nothing. I have called and sent emails to his closest collaborator and received very friendly replies. For now, that is certainly enough."',
 'sentence_khk_Cyrl': 'Даниус хэлэхдээ "Яг одоо бид юу ч хийхгүй байгаа. Бид түүний хамгийн ойрын хамтрагчтай ярьж, цахим шуудан илгээсэн ба маш нөхөрсөг хариулт хүлээн авсан. Одоохондоо тэр л хангалттай" гэв.'}

In [7]:
import os

os.environ["OMP_NUM_THREADS"] = "8"  # Adjust based on testing
os.environ["OPENBLAS_NUM_THREADS"] = "8"
os.environ["MKL_NUM_THREADS"] = "8"
os.environ["VECLIB_MAXIMUM_THREADS"] = "8"
os.environ["NUMEXPR_NUM_THREADS"] = "8"

import torch
torch.set_num_threads(8)


In [19]:
import json

def load_questions(filename):
    """
    Load multiple-choice questions from a JSON file.

    Args:
        filename (str): The path to the JSON file containing questions.

    Returns:
        list: A list of dictionaries, each representing a question.
    """
    with open(filename, 'r', encoding='utf-8') as file:
        questions = json.load(file)
    return questions

def extract_questions_answers_choices(questions):
    """
    Extract questions, their corresponding answer keys, and choices with labels.

    Args:
        questions (list): A list of dictionaries, each containing a question and its details.

    Returns:
        tuple: Three lists - questions, answer keys, and choices with labels.
    """
    question_texts = []
    answer_keys = []
    choices_with_labels = []

    for q in questions:
        question_texts.append(q['question'])
        answer_keys.append(q['answerKey'])
        choices = {choice['label']: choice['text'] for choice in q['choices']}
        choices_with_labels.append(choices)

    return question_texts, answer_keys, choices_with_labels

# Path to your JSON file
filename = 'evaluation/MM-Eval/knowledge_eval.json'

# Load questions from the JSON file
questions = load_questions(filename)

# Extract questions, answer keys, and choices with labels
question_texts, answer_keys, choices_with_labels = extract_questions_answers_choices(questions)

# Example usage: print the first question, its answer key, and choices
print(f"First Question: {question_texts[0]}")
print(f"Answer Key: {answer_keys[0]}")
print("Choices:")
for label, text in choices_with_labels[0].items():
    print(f"  {label}: {text}")



First Question: бахамад ямар мөнгө авах вэ?
Answer Key: A
Choices:
  A: багамын доллар
  B: британийн фунт
  C: евро
  D: ам.доллар


In [18]:
answer_keys

['A',
 'B',
 'C',
 'B',
 'B',
 'A',
 'B',
 'D',
 'B',
 'B',
 'A',
 'D',
 'C',
 'C',
 'A',
 'A',
 'D',
 'B',
 'C',
 'C',
 'B',
 'B',
 'D',
 'A',
 'A',
 'D',
 'B',
 'C',
 'D',
 'C',
 'D',
 'D',
 'C',
 'C',
 'D',
 'D',
 'B',
 'A',
 'B',
 'B',
 'D',
 'D',
 'B',
 'C',
 'B',
 'A',
 'D',
 'A',
 'C',
 'C',
 'D',
 'B',
 'C',
 'D',
 'D',
 'C',
 'A',
 'A',
 'D',
 'A',
 'B',
 'C',
 'C',
 'C',
 'C',
 'A',
 'C',
 'C',
 'D',
 'B',
 'A',
 'A',
 'C',
 'A',
 'B',
 'B',
 'A',
 'C',
 'A',
 'D',
 'D',
 'A',
 'C',
 'B',
 'A',
 'A',
 'D',
 'C',
 'D',
 'D',
 'A',
 'A',
 'A',
 'B',
 'A',
 'B',
 'C',
 'A',
 'B',
 'D',
 'D',
 'A',
 'C',
 'B',
 'C',
 'B',
 'A',
 'B',
 'C',
 'A',
 'D',
 'B',
 'D',
 'D',
 'C',
 'A',
 'B',
 'B',
 'B',
 'A',
 'A',
 'D',
 'D',
 'C',
 'D',
 'B',
 'A',
 'D',
 'A',
 'A',
 'A',
 'C',
 'A',
 'D',
 'A',
 'D',
 'D',
 'A',
 'A',
 'A',
 'D',
 'D',
 'A',
 'A',
 'C',
 'D',
 'B',
 'D',
 'A',
 'B',
 'A',
 'B',
 'C',
 'B',
 'A',
 'C',
 'A',
 'B',
 'B',
 'D',
 'B',
 'D',
 'B',
 'C',
 'A',
 'B',
 'C'

In [8]:
import sacrebleu
import tqdm

def translate_text(text):
    """Translates English text to Mongolian using the pre-trained model."""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=256).to(device)
    with torch.no_grad():
        output_tokens = model.generate(**inputs, max_length=256)
    return tokenizer.decode(output_tokens[0], skip_special_tokens=True)

# Collect model predictions and references
predictions = []
references = []

for example in tqdm.tqdm(dataset):
    src_text = example["sentence_khk_Cyrl"]  # English input
    ref_text = example["sentence_eng_Latn"]  # Mongolian reference
    
    # Translate using the pre-trained model
    translated_text = translate_text(src_text)
    
    # Store for BLEU evaluation
    predictions.append(translated_text)
    references.append([ref_text])  # sacrebleu expects a list of references

# Compute BLEU score
import sacrebleu
bleu = sacrebleu.corpus_bleu(predictions, references, tokenize="intl")
print(f"BLEU Score (Pre-trained NLLB-200): {bleu.score:.2f}")

# Compute chrF++ score
chrf = sacrebleu.corpus_chrf(predictions, references, beta=2)  # chrF++ (β=2 for recall emphasis)
print(f"chrF++ Score (Pre-trained NLLB-200): {chrf.score:.2f}")


  1%|▏         | 15/1012 [01:34<1:45:01,  6.32s/it]


KeyboardInterrupt: 

In [2]:
def show_first_lines_text(file_path, num_lines=5):
    """
    Reads and prints the first `num_lines` lines from a plain text file.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        for i in range(num_lines):
            line = f.readline()
            if not line:
                break  # Stop if file has fewer than num_lines
            print(f"Line {i+1}: {line.strip()}")


In [386]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import re

model_name = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
device = "cuda"

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.48s/it]


In [392]:
def get_numeric_answer(question):
    """Generate a response and extract only the number from DeepSeek."""
    prompt = f"You are a AI problem solver working with very bad quality questions. Read the problem carefully and answer based on the variation of question that makes more sense and output the answer in following format ///YOUR NUMBER/// the numerical answer, without explanation or reasoning.\n\nProblem: {question}"
    
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    
    with torch.no_grad():
        output_tokens = model.generate(
            **inputs,
            max_new_tokens=500,  # Small limit to avoid extra words
            do_sample=False  # Greedy decoding for deterministic output
        )
    
    output_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
    

    return output_text

# Example usage
question = "Janet's half-day egg is 16 eggs. He eats three in the morning and uses four daily for his friends to make muffins. The remaining eggs are sold at $ 2 per fresh egg in the farmer's market every day. How much money does he make every day at the farmer's market?"
numeric_answer = get_numeric_answer(question)
print("Extracted Answer:", numeric_answer)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Extracted Answer: You are a AI problem solver working with very bad quality questions. Read the problem carefully and answer based on the variation of question that makes more sense and output the answer in following format ///YOUR NUMBER/// the numerical answer, without explanation or reasoning.

Problem: Janet's half-day egg is 16 eggs. He eats three in the morning and uses four daily for his friends to make muffins. The remaining eggs are sold at $ 2 per fresh egg in the farmer's market every day. How much money does he make every day at the farmer's market?

Answer: 0

Explanation: Janet's half-day egg is 16 eggs. He eats three in the morning and uses four daily for his friends to make muffins. The remaining eggs are sold at $2 per fresh egg in the farmer's market every day. So, 16 - 3 - 4 = 9 eggs left. 9 * 2 = $18. So, he makes $18 every day.

Wait, but the problem says "Janet's half-day egg is 16 eggs." That seems a bit confusing. Maybe it's a typo and should be "Janet has 16 eg

In [46]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from huggingface_hub import login

# Log in to the Hugging Face Hub
login(token="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")


model_name = "meta-llama/Llama-3.1-8B-Instruct"

# 1. Create a quantization config for 4-bit
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                    # Enable 4-bit quantization
    bnb_4bit_quant_type="nf4",            # Options: 'nf4' (recommended) or 'fp4'
    bnb_4bit_use_double_quant=True,       # Nested quantization for higher accuracy
    bnb_4bit_compute_dtype="bfloat16"     # Mixed-precision type: 'fp16', 'bf16', etc.
)

# 3. Load model in 4-bit
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"  # auto-distributes across GPU(s) if available
)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [9]:
from transformers import PreTrainedTokenizerFast
# 2. Load tokenizer
tokenizer = PreTrainedTokenizerFast.from_pretrained("Billyyy/llama_8K_extended")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BloomTokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [7]:
from datasets import load_dataset
merged_dataset = load_dataset("Billyyy/llama_8K_extended")

In [14]:

def merge_sequences_with_text(dataset, tokenizer, max_length=1024):
    """
    Merge sequences in a dataset while keeping 'text' and 'input_ids' aligned.
    
    Args:
        dataset (Dataset): Hugging Face Dataset with 'text' and 'input_ids'.
        tokenizer: Tokenizer used to tokenize the text.
        max_length (int): Maximum length for merged sequences.

    Returns:
        dict: A dictionary with merged 'text' and 'input_ids'.
    """
    merged_texts = []
    merged_input_ids = []
    current_text = []
    current_input_ids = []

    for text, input_ids in zip(dataset["text"], dataset["input_ids"]):
        # Check if adding the current sequence exceeds the max length
        if len(current_input_ids) + len(input_ids) > max_length:
            # Append the merged sequences
            merged_texts.append(" ".join(current_text))
            merged_input_ids.append(current_input_ids[:max_length])
            # Reset for the next sequence
            current_text = []
            current_input_ids = []
        
        # Extend the current sequence
        current_text.append(text)
        current_input_ids.extend(input_ids)
    
    # Add the final batch if it exists
    if current_input_ids:
        merged_texts.append(" ".join(current_text))
        merged_input_ids.append(current_input_ids[:max_length])

    return {"text": merged_texts, "input_ids": merged_input_ids}


In [15]:
dataset = dataset.remove_columns("attention_mask")

# Add 'input_ids' column if not already in the dataset
if "input_ids" not in dataset.column_names:
    dataset = dataset.map(
        lambda examples: {"input_ids": tokenizer(examples["text"], truncation=True, padding=False, max_length=256)["input_ids"]},
        batched=True,
        num_proc=27
    )

# Apply the merging function
merged_dataset = dataset.map(
    lambda batch: merge_sequences_with_text(batch, tokenizer, max_length=256),
    batched=True, 
    num_proc=27)

Map (num_proc=27): 100%|██████████| 955519/955519 [00:13<00:00, 72378.89 examples/s]
Map (num_proc=27): 100%|██████████| 50291/50291 [00:00<00:00, 51975.37 examples/s]
Map (num_proc=27): 100%|██████████| 955519/955519 [00:04<00:00, 221512.98 examples/s]
Map (num_proc=27): 100%|██████████| 50291/50291 [00:00<00:00, 96506.03 examples/s] 


In [30]:
merged_dataset.remove_columns("text")

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 645131
    })
    eval: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 34004
    })
})

In [11]:
merged_dataset.push_to_hub("Billyyy/llama_8K_extended")

Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/216 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/216 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/216 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/35 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/423 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Billyyy/llama_8K_extended/commit/298dc49a0351e7853c5357e4cc5971b82ce2eed8', commit_message='Upload dataset', commit_description='', oid='298dc49a0351e7853c5357e4cc5971b82ce2eed8', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Billyyy/llama_8K_extended', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Billyyy/llama_8K_extended'), pr_revision=None, pr_num=None)

In [7]:
def tokenize_function(examples):
    """Tokenizes each line in the dataset."""
    return tokenizer(
        examples["text"], truncation=True, padding=True, max_length=256
    )

In [10]:
merged_dataset = merged_dataset.map(tokenize_function, batched=True, num_proc=27)

Map (num_proc=27):   0%|          | 0/645131 [00:00<?, ? examples/s]

Map (num_proc=27):   0%|          | 0/34004 [00:00<?, ? examples/s]

In [29]:
merged_dataset['train'][60000]['text']

'<mn> Хамгийн анх "Халуун сэтгэл" киноны найруулагч Л.Энхбаяр ах 2006 онд намайг кинонд уриад, гол дүрээ өгөхөд нь маш их баярлаж байлаа. <en>The first director of "Hot Soul", L.Enkhbayar, was excited to thank me for the role of the film in 2006. <mn> Манай хойд хєршийг хувьд ч гэсэн сурталчилгааны зорилгоор хуурамч мєнгийг ашиглах туршлагыг хэрэгжvvлж байжээ. <en>In addition, our northern neighbors have been involved in the use of counterfeit money for advertising.'

In [None]:
# 5. Training Arguments -------------------------------------------
training_args = TrainingArguments(
    output_dir="/ephemeral/llama_3B_translation",  
    eval_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=2,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    num_train_epochs=1,
    weight_decay=0.01,
    fp16=True,
    logging_dir="/workspace/logs",
    logging_steps=10,
    report_to="wandb",
    remove_unused_columns=False,
    group_by_length=False,
    
    # A100-specific optimizations
    gradient_checkpointing=True,
    optim="adamw_torch",
    ddp_find_unused_parameters=False,

)

# 6. Data Collator for Masked Language Modeling -------------------
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# 7. Trainer Initialization ---------------------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=merged_dataset["train"],
    eval_dataset=merged_dataset["eval"], 
    data_collator=data_collator
)

In [10]:
merged_dataset['train']

Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 645131
})

In [3]:

# 2. Load Pretrained Tokenizer & Model -----------------------------
MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"  # Change if using another model
TOKENIZER_PATH = "Billyyy/llama_8K_extended" # Path to your trained tokenizer
DATASET_PATH = "/workspace/labeled_corpus.txt" 

# Load model
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto")


# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)
vocab_size = len(tokenizer)

# 3. Resize Model's Embedding Layer -------------------------------
old_vocab_size = model.get_input_embeddings().num_embeddings


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
old_vocab_size

128256

In [5]:

if vocab_size > old_vocab_size:
    print(f"Resizing model embeddings from {old_vocab_size} → {vocab_size}")
    model.resize_token_embeddings(vocab_size)

# Freeze all layers except embedding
for param in model.parameters():
    param.requires_grad = False  # Freeze entire model

# Enable training for embedding layer only
model.get_input_embeddings().weight.requires_grad = True

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Resizing model embeddings from 128256 → 135126


The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [6]:
import torch

def count_trainable_parameters(model):
    """Returns the number of trainable parameters in the model."""
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    
    print(f"Total parameters: {total_params:,}")
    print(f"Trainable parameters: {trainable_params:,}")
    print(f"Percentage of trainable parameters: {100 * trainable_params / total_params:.2f}%")
    
    return trainable_params

# Example Usage (after loading your model)
trainable_params = count_trainable_parameters(model)


Total parameters: 8,086,540,288
Trainable parameters: 553,476,096
Percentage of trainable parameters: 6.84%


In [39]:
trainer.train()

ValueError: Caught ValueError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/workspace/.venv/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 777, in convert_to_tensors
    tensor = as_tensor(value)
  File "/workspace/.venv/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 739, in as_tensor
    return torch.tensor(value)
ValueError: too many dimensions 'str'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/workspace/.venv/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 349, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
  File "/workspace/.venv/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 55, in fetch
    return self.collate_fn(data)
  File "/workspace/.venv/lib/python3.10/site-packages/transformers/data/data_collator.py", line 45, in __call__
    return self.torch_call(features)
  File "/workspace/.venv/lib/python3.10/site-packages/transformers/data/data_collator.py", line 943, in torch_call
    batch = pad_without_fast_tokenizer_warning(
  File "/workspace/.venv/lib/python3.10/site-packages/transformers/data/data_collator.py", line 66, in pad_without_fast_tokenizer_warning
    padded = tokenizer.pad(*pad_args, **pad_kwargs)
  File "/workspace/.venv/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 3397, in pad
    return BatchEncoding(batch_outputs, tensor_type=return_tensors)
  File "/workspace/.venv/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 241, in __init__
    self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis)
  File "/workspace/.venv/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 793, in convert_to_tensors
    raise ValueError(
ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`text` in this case) have excessive nesting (inputs type `list` where type `int` is expected).


In [11]:
merged_dataset = merged_dataset.remove_columns(["text"])

In [375]:
tokenizer.decode(2)

'</s>'

In [42]:
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader

def show_collated_batch(dataset, tokenizer, data_collator, batch_size=4):
    """Fetches and prints a sample batch after applying the DataCollator."""
    
    # Create DataLoader with DataCollator
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=data_collator)
    
    # Get one batch
    batch = next(iter(dataloader))
    
    for i in range(batch_size):
        print(f"\n📝 **Sample {i+1}:**")
        print("Input IDs:", batch["input_ids"][i].tolist())
        print("Attention Mask:", batch["attention_mask"][i].tolist())

        # Decode tokenized input back to text
        decoded_text = tokenizer.decode(batch["input_ids"][i], skip_special_tokens=True)
        print("Decoded Text:", decoded_text)

        if "labels" in batch:
            print("Labels:", batch["labels"][i].tolist())

# Initialize Data Collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, 
    mlm=False
)

# Example Usage
show_collated_batch(merged_dataset['train'], tokenizer, data_collator)



📝 **Sample 1:**
Input IDs: [128000, 14066, 77, 29, 81529, 129063, 220, 128972, 129025, 130337, 220, 128594, 129051, 220, 128665, 134437, 130829, 129613, 220, 128989, 129461, 135069, 220, 129807, 131538, 220, 131143, 133635, 132778, 131651, 220, 131061, 128480, 129051, 134429, 128530, 129461, 129126, 220, 128813, 131160, 220, 128480, 133572, 130829, 131208, 131636, 128480, 129461, 131814, 129113, 45458, 130226, 128594, 130829, 130226, 129461, 220, 132551, 128989, 129461, 133545, 130829, 131143, 134429, 131061, 134429, 220, 128480, 131613, 128594, 129051, 220, 128697, 133208, 128594, 220, 130337, 134600, 129325, 220, 130829, 131061, 134429, 131763, 220, 134801, 128594, 130829, 133635, 129461, 133635, 129461, 130337, 220, 134899, 128480, 128382, 129461, 129461, 134076, 134429, 129561, 220, 134429, 128812, 128594, 220, 128972, 129025, 130337, 13, 366, 268, 43093, 3786, 374, 7373, 18641, 449, 44539, 89108, 6444, 889, 527, 1101, 39075, 18250, 1534, 449, 264, 11336, 3786, 13, 128257, 128257,

In [44]:
old_vocab_size

128256

In [335]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import PeftModel

class LogitLensAnalyzer:
    def __init__(self, peft_model=None, device="cpu", top_k=5, num_tokens=5):
        """
        Initializes the Logit Lens analysis for NLLB models.
        """
        self.device = device
        self.top_k = top_k  # Number of top tokens to extract per layer
        self.num_tokens = num_tokens  # Number of tokens to analyze
        self.tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-1.3B", src_lang="khk_Cyrl")
        self.hidden_states_per_step = []  # Store hidden states at each step
        self.peft_model = peft_model

        if self.peft_model:
            # Load the base model and LoRA-adapted model
            base_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-1.3B").to(device)
            self.model = PeftModel.from_pretrained(base_model, peft_model).to(device)
        else:
            # Load standard NLLB model
            self.model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-1.3B").to(device)

        # Automatically detect where the decoder layers are located
        self.decoder_layers = self._find_decoder_layers()
        self.lm_head = self.model.lm_head  # The final layer that maps hidden states to logits

    def _find_decoder_layers(self):
        """Finds the correct path to the decoder layers dynamically."""
        if self.peft_model:
            print("LoRA model detected: Using `self.model.base_model.model.decoder.layers`")
            return self.model.model.model.decoder.layers  # LoRA-adapted model
        else:
            print("Base model detected: Using `self.model.model.decoder.layers`")
            return self.model.model.decoder.layers  # Standard model

    def hook_fn(self, module, input, output):
        """Captures hidden states from all decoder layers."""
        self.hidden_states_per_step[-1].append(output[0])  # Extract the main hidden state

    def register_hooks(self):
        """Registers hooks for ALL decoder layers."""
        self.hooks = []
        for layer in self.decoder_layers:
            hook = layer.register_forward_hook(self.hook_fn)
            self.hooks.append(hook)

    def remove_hooks(self):
        """Removes hooks after execution."""
        for hook in self.hooks:
            hook.remove()

    def extract_logits_autoregressive(self, input_text):
        """
        Runs input text through the model and extracts hidden states while generating tokens autoregressively.
        """
        self.hidden_states_per_step = []
        self.register_hooks()

        # Tokenize the input and move to device
        inputs = self.tokenizer(input_text, return_tensors="pt").to(self.device)
        
        # Prepare encoder outputs (do this once to avoid redundant computation)
        with torch.no_grad():
            encoder_outputs = self.model.get_encoder()(**inputs, return_dict=True)

        # Initialize decoder with the correct start token
        if self.model.config.decoder_start_token_id is not None:
            decoder_input_ids = torch.tensor([[self.model.config.decoder_start_token_id]], device=self.device)
        else:
            decoder_input_ids = torch.tensor([[self.tokenizer.bos_token_id]], device=self.device)  # Fallback

        past_key_values = None  # Store cached past key values for efficient decoding
        generated_tokens = []

        print(f"\n🔹 **Starting autoregressive generation for input:** '{input_text}'\n")

        with torch.no_grad():
            for step in range(self.num_tokens):
                self.hidden_states_per_step.append([])  # Prepare storage for this step

                # Forward pass with past key values
                outputs = self.model(
                    encoder_outputs=encoder_outputs,
                    decoder_input_ids=decoder_input_ids,
                    past_key_values=past_key_values,
                    use_cache=True,  # Enables faster decoding
                    output_hidden_states=True,  # Ensures hidden states are returned
                    return_dict=True
                )

                logits = outputs.logits[:, -1, :]  # Logits for the last generated token
                past_key_values = outputs.past_key_values  # Update cache for next step

                # Store hidden states for this step
                self.hidden_states_per_step[-1] = outputs.decoder_hidden_states  # Capture hidden states for this step

                # Select next token (greedy decoding)
                next_token = torch.argmax(logits, dim=-1, keepdim=True)
                generated_tokens.append(next_token.item())

                # Convert token ID to text and print debug info
                generated_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
                print(f"🔹 **Step {step+1}: Generated Token ID:** {next_token.item()} -> '{self.tokenizer.decode(next_token.item())}'")
                print(f"   **Current Generated Text:** '{generated_text}'\n")

                # Append new token to decoder input for next step
                decoder_input_ids = next_token  # Keep only the latest token

                # Stop if EOS token is generated
                if next_token.item() == self.tokenizer.eos_token_id:
                    print("🔹 **EOS token detected. Stopping generation.**")
                    break

        self.remove_hooks()
        return self.hidden_states_per_step, generated_tokens  # Return all recorded hidden states and generated tokens



    def get_top_k_predictions(self, hidden_states):
        """Extracts top-k predictions for each layer at each step."""
        top_k_predictions = []

        for layer_states in hidden_states:  # Iterate over decoder layers
            logits = self.lm_head(layer_states[:, -1, :])  # Compute logits
            top_k_values, top_k_indices = torch.topk(logits, self.top_k, dim=-1)
            decoded_tokens = [self.tokenizer.decode(idx.item()) for idx in top_k_indices[0]]
            top_k_predictions.append(decoded_tokens)

        return top_k_predictions  # List of [layer][top-k words]

    def compare_models(self, base_hidden_states, lora_hidden_states):
        """
        Compares hidden states from base and LoRA models and extracts logit predictions.
        """
        min_steps = min(len(base_hidden_states), len(lora_hidden_states))  # Ensure matching lengths
        similarities = []
        entropies = []
        base_top_k_tokens = []
        lora_top_k_tokens = []

        for step in range(min_steps):  # Step-wise comparison
            base_step_states = base_hidden_states[step]
            lora_step_states = lora_hidden_states[step]

            step_similarities = []
            step_entropies = []

            # Compute top-k token predictions
            base_top_k_tokens.append(self.get_top_k_predictions(base_step_states))
            lora_top_k_tokens.append(self.get_top_k_predictions(lora_step_states))

            for base_h, lora_h in zip(base_step_states, lora_step_states):
                # Compute cosine similarity
                cos_sim = F.cosine_similarity(base_h, lora_h, dim=-1).mean().item()
                step_similarities.append(cos_sim)

                # Compute entropy
                probs = torch.nn.functional.softmax(lora_h, dim=-1)
                entropy = -torch.sum(probs * torch.log(probs + 1e-9), dim=-1).mean().item()
                step_entropies.append(entropy)

            similarities.append(step_similarities)
            entropies.append(step_entropies)

        return similarities, entropies, base_top_k_tokens, lora_top_k_tokens

# Load models
base_model_analyzer = LogitLensAnalyzer(num_tokens=5)  # Analyze 5 tokens
lora_model_analyzer = LogitLensAnalyzer(peft_model="Billyyy/mon_nllb_3B")

Base model detected: Using `self.model.model.decoder.layers`
LoRA model detected: Using `self.model.base_model.model.decoder.layers`


In [365]:
# Sample input
input_text = "британийн фунт"

# Extract hidden states for both models while preserving autoregressive behavior
base_hidden_states, base_generated_tokens = base_model_analyzer.extract_logits_autoregressive(input_text)
lora_hidden_states, lora_generated_tokens = lora_model_analyzer.extract_logits_autoregressive(input_text)

# Compare models
similarities, entropies, base_top_k, lora_top_k = base_model_analyzer.compare_models(base_hidden_states, lora_hidden_states)





🔹 **Starting autoregressive generation for input:** 'британийн фунт'

🔹 **Step 1: Generated Token ID:** 256047 -> 'eng_Latn'
   **Current Generated Text:** ''

🔹 **Step 2: Generated Token ID:** 43504 -> 'British'
   **Current Generated Text:** 'British'

🔹 **Step 3: Generated Token ID:** 218790 -> 'pound'
   **Current Generated Text:** 'British pound'

🔹 **Step 4: Generated Token ID:** 2 -> '</s>'
   **Current Generated Text:** 'British pound'

🔹 **EOS token detected. Stopping generation.**

🔹 **Starting autoregressive generation for input:** 'британийн фунт'

🔹 **Step 1: Generated Token ID:** 256047 -> 'eng_Latn'
   **Current Generated Text:** ''

🔹 **Step 2: Generated Token ID:** 43504 -> 'British'
   **Current Generated Text:** 'British'

🔹 **Step 3: Generated Token ID:** 1920 -> 'Po'
   **Current Generated Text:** 'British Po'

🔹 **Step 4: Generated Token ID:** 514 -> 'und'
   **Current Generated Text:** 'British Pound'

🔹 **Step 5: Generated Token ID:** 248075 -> '.'
   **Current

In [370]:
len(entropies)

4

In [367]:
similarities[0]

[1.0000001192092896,
 0.9999422430992126,
 0.9999804496765137,
 0.9999713897705078,
 0.9999773502349854,
 0.9999727010726929,
 0.9999706149101257,
 0.9999663829803467,
 0.9999510049819946,
 0.9999452233314514,
 0.9999325275421143,
 0.9999133944511414,
 0.9999029636383057,
 0.9998898506164551,
 0.9998372197151184,
 0.9997361302375793,
 0.9988932609558105,
 0.9970133304595947,
 0.9954288601875305,
 0.9949801564216614,
 0.993058979511261,
 0.989745020866394,
 0.983683705329895,
 0.9715161919593811,
 0.7704209685325623]

In [356]:
base_top_k[4]

[['ys', 'YS', 'yst', 'yd', 'yse'],
 ['ys', 'YS', 'yd', 'yn', 'yst'],
 ['ys', 'yd', 'yst', 'yn', 'YS'],
 ['ys', 'ays', 'ying', 'yl', 'yst'],
 ['ys', 'laid', 'lay', 'ays', 'ying'],
 ['ys', 'laid', 'lay', 'Lay', 'lies'],
 ['laid', 'ys', 'lay', 'Lay', 'play'],
 ['laid', 'lay', 'ys', 'play', 'Lay'],
 ['laid', 'lay', 'down', 'Lay', 'ys'],
 ['laid', 'lay', 'down', 'around', 'Lay'],
 ['laid', 'lay', 'down', 'around', 'up'],
 ['up', 'around', 'down', 'lay', 'approximately'],
 ['up', 'only', 'out', 'approximately', 'around'],
 ['up', 'about', 'approximately', 'around', 'out'],
 ['up', 'about', 'approximately', 'more', 'around'],
 ['up', 'about', 'approximately', 'around', 'more'],
 ['up', 'about', 'approximately', 'only', 'more'],
 ['up', 'about', 'approximately', 'more', 'nearly'],
 ['up', 'about', 'approximately', 'more', 'around'],
 ['up', 'about', 'approximately', 'more', 'nearly'],
 ['up', 'about', 'a', 'approximately', '16'],
 ['up', 'about', '16', 'a', 'approximately'],
 ['up', '16', 'abo

In [362]:
lora_top_k[5]

IndexError: list index out of range