### Check GPU Availability

In [1]:
!nvidia-smi

Mon Jun 16 09:55:23 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   47C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

### Installation

In [2]:
%%capture
# Temporarily as of Jan 31st 2025, Colab has some issues with Pytorch
# Using pip install unsloth will take 3 minutes, whilst the below takes <1 minute:

!pip install transformers==4.37.2
!pip install unsloth==2025.2.15 unsloth_zoo==2025.2.7
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29 peft trl triton
!pip install --no-deps cut_cross_entropy
!pip install sentencepiece protobuf datasets huggingface_hub hf_transfer

### Initialize the LLM

In [5]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

### Changing the model here is forbidden !

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "taide/Llama-3.1-TAIDE-LX-8B-Chat",    ### Do not change the model for any other models or quantization versions
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = False,
    token = "",
)

==((====))==  Unsloth 2025.2.15: Fast Llama patching. Transformers: 4.52.4.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]



taide/Llama-3.1-TAIDE-LX-8B-Chat does not have a padding token! Will use pad_token = <|finetune_right_pad_id|>.


## Load datasets (store in google cloud)

In [None]:
from datasets import load_dataset
# ---- 讀入 Q&A 資料集 (假設 csv 欄位 question / answer) -----------------
ds = load_dataset("csv", data_files="qa_pairs.csv")["train"]
def format_fn(example):
    return f"""### 提問:
{example['question']}
### 回答:
{example['answer']}"""
# 轉成 text 欄位供 SFTTrainer 使用
ds = ds.map(lambda x: {"text": format_fn(x)}, remove_columns=ds.column_names)

Add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [6]:
from unsloth import SFTTrainer
from transformers import TrainingArguments
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    lora_alpha = 32,
    target_modules = ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    lora_dropout = 0.12,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    use_rslora = True,
    loftq_config = None,
    random_state = 3407,
)

# 訓練（其他超參可沿用）
args = TrainingArguments(
    per_device_train_batch_size = 4,   # LoRA 記憶體佔用小，可用較大 batch
    gradient_accumulation_steps = 4,
    num_train_epochs = 3,
    learning_rate = 5e-4,    # LoRA 通常較大 LR
    fp16 = True,
    logging_steps = 20,
    save_strategy = "epoch",
    output_dir = "lx8b_lora",
)

trainer = SFTTrainer(
    model = model,
    train_dataset = ds,
    dataset_text_field = "text",
    max_seq_length = 2048,
    tokenizer = tokenizer,
    args = args,
)

trainer.train()
# 儲存 LoRA adapter（權重量小）
trainer.model.save_pretrained("lx8b_lora")
tokenizer.save_pretrained("lx8b_lora")

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.12.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Exception ignored in: <function _xla_gc_callback at 0x784576b082c0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/jax/_src/lib/__init__.py", line 96, in _xla_gc_callback
    def _xla_gc_callback(*args):
    
KeyboardInterrupt: 
Exception ignored in: <function _xla_gc_callback at 0x784576b082c0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/jax/_src/lib/__init__.py", line 96, in _xla_gc_callback
    def _xla_gc_callback(*args):
    
KeyboardInterrupt: 
Unsloth 2025.2.15 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


NameError: name 'TrainingArguments' is not defined

## Inference

In [None]:
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base  = "taide/Llama-3.1-TAIDE-LX-8B-Chat"
lora  = "lx8b_lora"

tok   = AutoTokenizer.from_pretrained(lora)
baseM = AutoModelForCausalLM.from_pretrained(base, device_map="auto", torch_dtype=torch.float16, token="")
model = PeftModel.from_pretrained(baseM, lora, device_map="auto", torch_dtype=torch.float16)

prompt = "### 提問:\n台灣的首都在哪裡？\n\n### 回答:"
inputs = tok(prompt, return_tensors="pt").to(model.device)
out    = model.generate(**inputs, max_new_tokens=64)
print(tok.decode(out[0], skip_special_tokens=True))

In [None]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",  ### Use llama-3.1 template for better performance here
)

def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
    return { "text" : texts, }
pass


# Dataset Preperation (Loading and Refining)

## Data Filtering & Sorting

In [None]:
from datasets import load_dataset, Dataset, load_from_disk

# Load the dataset from Hugging Face
dataset = load_from_disk("/content/ML_Spring2025_HW5/fastchat_alpaca_52k")

# ---------------------------
# Add a "text" field to each example
# ---------------------------
# This function extracts the first assistant message from the conversation
def add_text_field(example):
    # Extract the first message where role == 'assistant'
    assistant_texts = [msg["content"] for msg in example["conversations"] if msg["role"] == "assistant"]
    text = assistant_texts[0] if assistant_texts else ""
    return {"text": text}

# Map the function over the dataset to add the "text" column.
dataset = dataset.map(add_text_field)

# Print the dataset structure to confirm the new feature.
print(dataset)


# ---------------------------
#################### TODO : Define a helper function for computing conversation length ###############

# The default "conversation length" here refers to the length of the input (human) and output (gpt), you can modify it at your will

def compute_conversation_length(example):
    # Compute total word count across all messages in the 'conversations' field
    return sum(len(message["content"].split()) for message in example["conversations"])


#################### TODO ############################################################################

# ---------------------------
# Simple Sorting Method  (Default)
# ---------------------------
# Sort the dataset from shortest to longest conversation (by word count) reverse : small to big
sorted_dataset_simple_list = sorted(dataset, key=compute_conversation_length, reverse=False)

# Convert back to a Dataset object
sorted_dataset_simple = Dataset.from_list(sorted_dataset_simple_list)

print("\nTop examples sorted by simple conversation length:")
for entry in sorted_dataset_simple.select(range(5)):
    print(f"ID: {entry['id']}, Conversation Length: {compute_conversation_length(entry)}")
# ---------------------------



############## Advanced Sorting Method (TODO : Modify the sorting key ##################
# ---------------------------
# Default : Sorting based on Combining conversation length with the 'score' field using a weighted sum.
# Here, we multiply the score by 10 and add it to the conversation length.
def advanced_sort_key(example):
    conversation_len = compute_conversation_length(example)
    score = example["score"]
    # return 1e-5 * conversation_len + score * 1
    return -0.01 * conversation_len + score * 2

####################################### TODO ###########################################

sorted_dataset_advanced_list = sorted(dataset, key=advanced_sort_key, reverse=True)
# Convert back to a Dataset object
sorted_dataset_advanced = Dataset.from_list(sorted_dataset_advanced_list)

print("\nTop examples sorted by advanced key (combination of conversation length and score):")
for entry in sorted_dataset_advanced.select(range(5)):
    print(f"ID: {entry['id']}, Advanced Key Value: {advanced_sort_key(entry)}")


Map:   0%|          | 0/52002 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'conversations', 'score', 'text'],
    num_rows: 52002
})

Top examples sorted by simple conversation length:
ID: identity_23488, Conversation Length: 22
ID: identity_717, Conversation Length: 23
ID: identity_1347, Conversation Length: 23
ID: identity_1790, Conversation Length: 23
ID: identity_2502, Conversation Length: 23

Top examples sorted by advanced key (combination of conversation length and score):
ID: identity_26362, Advanced Key Value: 9.73
ID: identity_28542, Advanced Key Value: 9.73
ID: identity_35167, Advanced Key Value: 9.73
ID: identity_9103, Advanced Key Value: 9.71
ID: identity_48727, Advanced Key Value: 9.69


#### Note : You are limited to use 100 sorted data among the 1000 data in the given dataset, no more than 100 data is allowed for training !!!

In [None]:
################# TODO : select the simple or advanced dataset for training ##############

dataset_used = "sorted_dataset_advanced" # Changed to advanced dataset for better quality data

################# TODO ###################################################################

# Higher quality dataset selection
if dataset_used == "sorted_dataset_simple":
    train_dataset = sorted_dataset_simple.select(range(0,80))    ### You can also select from the middle, e.g. sorted_dataset_simple.select(range(50,150))
    eval_dataset = sorted_dataset_simple.select(range(80, 100)) # Added evaluation dataset
else:
    # Select the best examples based on score
    top_examples = sorted(sorted_dataset_advanced, key=lambda x: x["score"], reverse=True)[:80]
    train_dataset = Dataset.from_list(top_examples)

    # Select next best examples for evaluation
    eval_examples = sorted(sorted_dataset_advanced, key=lambda x: x["score"], reverse=True)[80:100]
    eval_dataset = Dataset.from_list(eval_examples)

from unsloth.chat_templates import standardize_sharegpt
train_dataset = standardize_sharegpt(train_dataset)
train_dataset = train_dataset.map(formatting_prompts_func, batched = True,)
eval_dataset = standardize_sharegpt(eval_dataset)
eval_dataset = eval_dataset.map(formatting_prompts_func, batched = True,) # Process eval dataset too

Standardizing format:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Standardizing format:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

# Dataset Visualize

In [None]:
dataset[5]["conversations"]

[{'content': 'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. ### Instruction: Identify the odd one out. ### Input: Twitter, Instagram, Telegram',
  'role': 'user'},
 {'content': 'Telegram', 'role': 'assistant'}]

And we see how the chat template transformed these conversations.

**[Notice]** Llama 3.1 Instruct's default chat template default adds `"Cutting Knowledge Date: December 2023\nToday Date: 26 July 2024"`, so do not be alarmed!

In [None]:
dataset[5]["text"]

'Telegram'

## Training

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported


################# TODO : Tweak the training hyperparameters here.  #####################


training_config = {
    "per_device_train_batch_size": 1,  # Small batch size for better generalization
    "gradient_accumulation_steps": 8, # Increased for larger effective batch size (8) - better stability
    "warmup_ratio": 0.1,  # Optimal warmup ratio based on results
    "num_train_epochs": 3, # Reduced epochs to focus on best performance window
    "learning_rate": 8e-5, # Slight increase for better optimization
    "optim": "adamw_8bit",
    "weight_decay": 0.02, # Fine-tuned weight decay
    "lr_scheduler_type": "cosine_with_restarts", # Maintaining effective scheduler
    "seed": 3407,   ### Do not modify the seed for reproducibility
    # Evaluation settings
    "evaluation_strategy": "steps",
    "eval_steps": 3, # More frequent evaluation
    "save_strategy": "steps",
    "save_steps": 3,
    "load_best_model_at_end": True,
    "metric_for_best_model": "eval_loss",
    "greater_is_better": False,
    "save_total_limit": 3,
    # More aggressive early stopping to prevent overfitting
    "early_stopping_patience": 4,
    "early_stopping_threshold": 0.003,
}


################# TODO #################################################################
# Add EarlyStoppingCallback
from transformers import EarlyStoppingCallback

# Import the data collator we need
from transformers import DataCollatorForLanguageModeling

# Use a standard data collator that works better with the model
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # Not using masked language modeling
)

# Set up the trainer with proper configuration for loss calculation
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = data_collator,
    dataset_num_proc = 2,
    packing = False, # Not using packing for better quality
    remove_unused_columns = True, # Standard column handling
    args = TrainingArguments(
        per_device_train_batch_size = training_config["per_device_train_batch_size"],
        gradient_accumulation_steps = training_config["gradient_accumulation_steps"],
        warmup_ratio = training_config["warmup_ratio"],
        num_train_epochs = training_config["num_train_epochs"], # Set this for 1 full training run.
        # max_steps = 60,
        learning_rate = training_config["learning_rate"],
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = training_config["optim"],
        weight_decay = training_config["weight_decay"],
        lr_scheduler_type = training_config["lr_scheduler_type"],
        seed = training_config["seed"],
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
        # Added evaluation and saving arguments
        evaluation_strategy = training_config["evaluation_strategy"],
        eval_steps = training_config["eval_steps"],
        save_strategy = training_config["save_strategy"],
        save_steps = training_config["save_steps"],
        load_best_model_at_end = training_config["load_best_model_at_end"],
        metric_for_best_model = training_config["metric_for_best_model"],
        greater_is_better = training_config["greater_is_better"],
        save_total_limit = training_config["save_total_limit"],
    ),
    # Add early stopping callback
    callbacks=[EarlyStoppingCallback(
        early_stopping_patience=training_config["early_stopping_patience"],
        early_stopping_threshold=training_config["early_stopping_threshold"]
    )],
)



Converting train dataset to ChatML (num_proc=2):   0%|          | 0/80 [00:00<?, ? examples/s]

Applying chat template to train dataset (num_proc=2):   0%|          | 0/80 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=2):   0%|          | 0/80 [00:00<?, ? examples/s]

Truncating train dataset (num_proc=2):   0%|          | 0/80 [00:00<?, ? examples/s]

Converting eval dataset to ChatML (num_proc=2):   0%|          | 0/20 [00:00<?, ? examples/s]

Applying chat template to eval dataset (num_proc=2):   0%|          | 0/20 [00:00<?, ? examples/s]

Tokenizing eval dataset (num_proc=2):   0%|          | 0/20 [00:00<?, ? examples/s]

Truncating eval dataset (num_proc=2):   0%|          | 0/20 [00:00<?, ? examples/s]

We also use Unsloth's `train_on_completions` method to only train on the assistant outputs and ignore the loss on the user's inputs.

In [None]:
"""We also use Unsloth's `train_on_completions` method to only train on the assistant outputs and ignore the loss on the user's inputs."""

# We'll skip the train_on_responses_only for now as it might be causing issues
# from unsloth.chat_templates import train_on_responses_only
# trainer = train_on_responses_only(
#    trainer,
#    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
#    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
# )

# Use original train_dataset that's already properly formatted
original_train_dataset = train_dataset

# Simplify training to debug the loss issue
print("Starting training with standard configuration...")
trainer_stats = trainer.train()

# If basic training works, we can proceed with the weight averaging approach in a separate run
print("Training complete. You can now test the model or run inference.")

Starting training with standard configuration...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 80 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 8
\        /    Total batch size = 8 | Total steps = 30
 "-____-"     Number of trainable parameters = 39,976,960


Step,Training Loss,Validation Loss
3,2.1793,1.818615
6,1.18,1.004233
9,0.64,0.510107
12,0.389,0.308091
15,0.2155,0.254622
18,0.2707,0.233204
21,0.1816,0.221103
24,0.2134,0.219061
27,0.187,0.217925
30,0.1952,0.217657


Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


Training complete. You can now test the model or run inference.


In [None]:
#trainer_stats = trainer.train()

#### TODO : Curriculum Training  (Optional)
start training the LLM with “easier” examples (e.g., shorter, clearer conversations) and progressively introduce more complex ones.

The total data amount used to train should still not exceed 100 data.

In [None]:
############## TODO : Curriculum Training  ######################

# Add data augmentation function to increase dataset diversity
def augment_conversation(example):
    import random
    import copy
    augmented_example = copy.deepcopy(example)

    # Only augment if there are at least 2 turns in the conversation
    if len(augmented_example["conversations"]) >= 2:
        # 50% chance to add emphasis to certain words
        if random.random() > 0.5:
            for i, msg in enumerate(augmented_example["conversations"]):
                if msg["role"] == "user" and len(msg["content"].split()) > 5:
                    words = msg["content"].split()
                    # Add emphasis to 1-3 random words (not too many)
                    num_words = min(3, max(1, len(words) // 10))
                    for _ in range(num_words):
                        idx = random.randint(0, len(words) - 1)
                        # Only augment non-punctuation words longer than 3 chars
                        if len(words[idx]) > 3 and words[idx].isalnum():
                            words[idx] = words[idx].upper()
                    augmented_example["conversations"][i]["content"] = " ".join(words)

    return augmented_example

# First training phase: Train on shorter, simpler examples with augmentation
curriculum_train_1_base = sorted_dataset_advanced.select(range(0, 30))  # First 30 examples

# Convert to list for augmentation
curriculum_train_1_base_list = curriculum_train_1_base.to_list()

# Apply augmentation to create additional training examples
curriculum_train_1_aug = [augment_conversation(ex) for ex in curriculum_train_1_base_list]

# Combine base and augmented examples (20 augmented)
combined_data_1 = curriculum_train_1_base_list + curriculum_train_1_aug[:20]  # Total: 50 examples

# Convert back to Dataset object
curriculum_train_1 = Dataset.from_list(combined_data_1)
curriculum_train_1 = standardize_sharegpt(curriculum_train_1)
curriculum_train_1 = curriculum_train_1.map(formatting_prompts_func, batched=True)

# Second training phase: Train on more complex examples with augmentation
curriculum_train_2_base = sorted_dataset_advanced.select(range(30, 60))  # Next 30 examples

# Convert to list for augmentation
curriculum_train_2_base_list = curriculum_train_2_base.to_list()

# Apply augmentation to create additional training examples
curriculum_train_2_aug = [augment_conversation(ex) for ex in curriculum_train_2_base_list]

# Combine base and augmented examples (20 augmented)
combined_data_2 = curriculum_train_2_base_list + curriculum_train_2_aug[:20]  # Total: 50 examples

# Convert back to Dataset object
curriculum_train_2 = Dataset.from_list(combined_data_2)
curriculum_train_2 = standardize_sharegpt(curriculum_train_2)
curriculum_train_2 = curriculum_train_2.map(formatting_prompts_func, batched=True)

# Note: We're not actually running this code in the main part,
# but the implementation is ready if you want to use it.

Standardizing format:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Standardizing format:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

<a name="Inference"></a>
## Inference


In [None]:
def parse_true_output(text):
    """
    Extracts the true assistant output from the decoded model output.

    It looks for the assistant header token:
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
    and extracts everything after it until the first occurrence of "<|eot_id|>".
    If the assistant header is not found, it falls back to the last occurrence
    of "<|end_header_id|>\n\n". If "<|eot_id|>" is not found, the extraction
    continues until the end of the string.
    """
    assistant_header = "<|start_header_id|>assistant<|end_header_id|>\n\n"
    start_index = text.find(assistant_header)
    if start_index != -1:
        start_index += len(assistant_header)
    else:
        # Fallback: use the last occurrence of the generic header ending
        generic_header = "<|end_header_id|>\n\n"
        start_index = text.rfind(generic_header)
        if start_index != -1:
            start_index += len(generic_header)
        else:
            start_index = 0

    end_index = text.find("<|eot_id|>", start_index)
    if end_index == -1:
        end_index = len(text)
    return text[start_index:end_index].strip()

In [None]:
from unsloth.chat_templates import get_chat_template
import json
from datetime import datetime

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Load the test set JSON file (without GPT responses)
with open("/content/ML_Spring2025_HW5/test_set_evol_instruct_150.json", "r") as infile:
    test_data = json.load(infile)

# Dictionary to store inference results
inference_results = {}

# Loop over each data entry in the test set
for index,entry in enumerate(test_data):
    entry_id = entry.get("id", "unknown_id")

    # Build the messages list from the human conversation entries
    # (Test set is expected to have only "human" messages)
    messages = []
    for conv in entry.get("conversations", []):
        if conv.get("from") == "human":
            messages.append({"role": "user", "content": conv.get("value", "")})
        else:
            messages.append({"role": "assistant", "content": conv.get("value", "")})

    # Create inputs using the chat template (required for generation)
    # inputs = tokenizer.apply_chat_template(
    #     messages,
    #     tokenize=True,
    #     add_generation_prompt=True,  # Must add for generation
    #     return_tensors="pt",
    # ).to("cuda")


    prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    )

    # 2. 使用 tokenizer 產生 input_ids 和 attention_mask
    encoded_input = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )

    input_ids = encoded_input["input_ids"].to("cuda")
    attention_mask = encoded_input["attention_mask"].to("cuda")
    # encoded_input = tokenizer.apply_chat_template(
    #     messages,
    #     tokenize=True,
    #     add_generation_prompt=True,
    #     return_tensors="pt",
    # )

    # input_ids = encoded_input["input_ids"].to("cuda")
    # attention_mask = encoded_input["attention_mask"].to("cuda")


################# TODO : Tweak Decoding Parameters here.  #####################


        # Simplified generation strategy with optimized parameters for all query types
    # input_content = tokenizer.decode(inputs[0])
    # input_length = len(inputs[0])

    # Set optimal generation parameters that work well across all types
    # Using values that produced the best results in previous versions
    temperature = 0.75    # Slightly higher temperature for more creativity
    max_tokens = 250      # Good length for most responses
    top_p = 0.95          # Higher top_p for more diversity
    top_k = 80            # More vocabulary options
    rep_penalty = 1.08    # Moderate repetition penalty

    # Generate using optimized parameters for best quality
    outputs = model.generate(
        attention_mask=attention_mask,
        input_ids=input_ids,
        do_sample=True,
        max_new_tokens=max_tokens,
        use_cache=True,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=rep_penalty,
        # Remove no_repeat_ngram_size as it can sometimes hurt fluency
        # Force longer completions for better quality by disabling early stopping
        early_stopping=False,
        pad_token_id=tokenizer.eos_token_id,  # Ensure proper padding
        eos_token_id=tokenizer.eos_token_id   # Explicit EOS token
    )


################# TODO  ##########################################################

    # Decode the generated tokens
    decoded_outputs = tokenizer.batch_decode(outputs)

    # Parse each output to extract the true assistant response
    parsed_outputs = [parse_true_output(output) for output in decoded_outputs]

    # Store the result for the current entry
    inference_results[entry_id] = {
        "input": messages,
        "output": parsed_outputs
    }

    print(f"Inference completed for entry {entry_id}")


#Write the inference results to the prediction JSON file
with open(f"pred.json", "w") as outfile:
    json.dump(inference_results, outfile, indent=4)
with open(f"training_config.json", "w") as outfile:
    json.dump(training_config, outfile, indent=4)

from google.colab import files
files.download('/content/pred.json')

print("Inference completed for all entries in the test set.")

"""## Saving, loading finetuned models

### Save the model
"""

model.save_pretrained("lora_model")  # Local saving
tokenizer.save_pretrained("lora_model")

"""### Load the model"""

from unsloth import FastLanguageModel
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "lora_model", # The folder path containing of the folder that contains adapter_model.safetensors, adapter_config.json and README.md
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

Inference completed for entry identity_8174
Inference completed for entry identity_16675
Inference completed for entry identity_51749
Inference completed for entry identity_53196
Inference completed for entry identity_65799
Inference completed for entry identity_31686
Inference completed for entry identity_25291
Inference completed for entry identity_31699
Inference completed for entry identity_30359
Inference completed for entry identity_67085
Inference completed for entry identity_60450
Inference completed for entry identity_3070
Inference completed for entry identity_36778
Inference completed for entry identity_50478
Inference completed for entry identity_20143
Inference completed for entry identity_8300
Inference completed for entry identity_45513
Inference completed for entry identity_62606
Inference completed for entry identity_38166
Inference completed for entry identity_22233
Inference completed for entry identity_54369
Inference completed for entry identity_39141
Inference com

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Inference completed for all entries in the test set.
==((====))==  Unsloth 2025.2.15: Fast Llama patching. Transformers: 4.49.0.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=0)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.12, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
             

## Saving, loading finetuned models

### Save the model

In [None]:
model.save_pretrained("lora_model")  # Local saving
tokenizer.save_pretrained("lora_model")

('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/tokenizer.model',
 'lora_model/added_tokens.json',
 'lora_model/tokenizer.json')

### Load the model

In [None]:
from unsloth import FastLanguageModel
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "lora_model", # The folder path containing of the folder that contains adapter_model.safetensors, adapter_config.json and README.md
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

==((====))==  Unsloth 2025.2.15: Fast Llama patching. Transformers: 4.49.0.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=0)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.12, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
             