# Fine-Tune the Llama-3.2-1B Model on BMW Press Releases

In [2]:
import os
import sys
import json
import random

import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from datasets import load_dataset, Dataset
from unsloth import FastLanguageModel
from trl import SFTConfig, SFTTrainer
from transformers import DataCollatorForSeq2Seq


# --- Base model -------------------------------------------------------------
MODEL_NAME = "unsloth/Llama-3.2-1B-Instruct"
print(MODEL_NAME)

# --- Training settings ------------------------------------------------------
# Sequence-length cap handed to the model loader and the trainer.
# (Original note: Llama 3.2 supports 128k tokens; Unsloth handles RoPE scaling.)
MAX_SEQ_LENGTH = 4096
# Set to True to load the base model 4-bit quantized and reduce memory usage.
LOAD_IN_4BIT = False

# --- Weights & Biases run identity ------------------------------------------
WANDB_PROJECT = "BMW-Llama-3.2-1B"
WANDB_ENTITY = None
WANDB_RUN_NAME = "BMW-Llama-3.2-1B-2000Articles2"

# --- Chat-format data files (one JSONL file per split) ----------------------
TRAIN_CHAT_DATA_FILE_NAME = '../datasets/chat_data_2000/train_chat.jsonl'
VAL_CHAT_DATA_FILE_NAME = '../datasets/chat_data_2000/val_chat.jsonl'
TEST_CHAT_DATA_FILE_NAME = '../datasets/chat_data_2000/test_chat.jsonl'

# --- Output locations -------------------------------------------------------
# Checkpoints live in a directory named after the W&B run; the LoRA adapter
# and the merged model each get their own sub-directory inside it.
CHECKPOINT_DIR = f"../{WANDB_RUN_NAME}"
LORA_MODEL_PATH = os.path.join(CHECKPOINT_DIR, "lora_model")
MERGED_MODEL_PATH = os.path.join(CHECKPOINT_DIR, "merged_model")

# Create every output directory up front (no error if it already exists).
for _output_dir in (CHECKPOINT_DIR, LORA_MODEL_PATH, MERGED_MODEL_PATH):
    os.makedirs(_output_dir, exist_ok=True)

ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
ü¶• Unsloth Zoo will now patch everything to make training faster!
unsloth/Llama-3.2-1B-Instruct


In [2]:
# Read each JSONL chat file. `load_dataset` files everything under the
# "train" split when given a bare data file, hence split="train" for all three.
dataset_train, dataset_val, dataset_test = (
    load_dataset("json", data_files=data_file, split="train")
    for data_file in (
        TRAIN_CHAT_DATA_FILE_NAME,
        VAL_CHAT_DATA_FILE_NAME,
        TEST_CHAT_DATA_FILE_NAME,
    )
)

# Rebuild each split as a Dataset from its rows.
train_ds, val_ds, test_ds = (
    Dataset.from_list(loaded_split)
    for loaded_split in (dataset_train, dataset_val, dataset_test)
)

# Summarize the splits and the shared schema.
print("HuggingFace Chat Datasets:")
for split_label, split_ds in (
    ("Train", train_ds),
    ("Validation", val_ds),
    ("Test", test_ds),
):
    print(f"  {split_label}: {split_ds}")

print(f"\nDataset features: {train_ds.features}")
print("\nFirst sample structure:")
train_ds[0]

HuggingFace Chat Datasets:
  Train: Dataset({
    features: ['messages'],
    num_rows: 6264
})
  Validation: Dataset({
    features: ['messages'],
    num_rows: 787
})
  Test: Dataset({
    features: ['messages'],
    num_rows: 787
})

Dataset features: {'messages': List({'content': Value('string'), 'role': Value('string')})}

First sample structure:


{'messages': [{'content': 'You are an expert at summarizing BMW news articles. Provide concise, informative summaries that capture the key points.',
   'role': 'system'},
  {'content': 'Summarize the following BMW news article in a concise way.\n\n‚ÄúWe are delighted to be joining BMW M in celebrating the 25th anniversary season of our partnership in 2023,‚Äù said Carmelo Ezpeleta, CEO of Dorna Sports. ‚ÄúIn BMW M we have had a strong partner at our side for a quarter of a century; a partner with whom we have enjoyed superb collaboration in many different areas. We are very proud of this long-standing partnership that is never at a standstill, it gives plenty of fresh momentum each year. With the most innovative technologies, BMW M is taking care of safety in our sport for the 25th year now, and is a firm fixture in the MotoGP paddock with a wide range of activities. Here‚Äôs to a fantastic anniversary season in 2023!‚Äù\n\n‚Äú2023 is our 25th season as Official Car of MotoGP ‚Äì a lon

## Load Model 

In [3]:
# Options for loading the base model and tokenizer through Unsloth.
model_load_options = dict(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    # None = auto detection (original note: Float16 for Tesla T4/V100,
    # Bfloat16 for Ampere+).
    dtype=None,
    load_in_4bit=LOAD_IN_4BIT,
    # Original note: 8-bit is a bit more accurate but uses 2x memory.
    load_in_8bit=False,
    # LoRA adapters are attached later, so full fine-tuning stays off.
    full_finetuning=False,
    # For gated models, additionally pass token="hf_...".
)
model, tokenizer = FastLanguageModel.from_pretrained(**model_load_options)

==((====))==  Unsloth 2026.1.2: Fast Llama patching. Transformers: 4.57.3.
   \\   /|    NVIDIA GeForce RTX 5090. Num GPUs = 1. Max memory: 31.348 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 12.0. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


### Define format function

In [4]:
from unsloth.chat_templates import get_chat_template

# Attach the "llama-3.2" conversational template to the tokenizer so that
# apply_chat_template renders messages in the Llama-3.2 Instruct format.
tokenizer = get_chat_template(tokenizer, chat_template="llama-3.2")

def formatting_chat_prompts_func(examples):
    """
    Render a batch of chat conversations into plain training strings.

    Args:
        examples: batched mapping whose 'messages' entry holds one list of
            {role, content} dicts per conversation.

    Returns:
        dict with a single 'text' key: one chat-template-formatted string per
        conversation, in input order.
    """
    # NOTE(review): the decoded training sample shown later in this notebook
    # starts with two <|begin_of_text|> tokens; confirm whether the BOS emitted
    # by the template here is being duplicated when the trainer tokenizes.
    rendered = [
        tokenizer.apply_chat_template(
            conversation,
            tokenize=False,               # return a string, not token ids
            add_generation_prompt=False,  # no trailing assistant header
        )
        for conversation in examples["messages"]
    ]
    return {"text": rendered}

In [5]:
# Apply chat template formatting to create 'text' field for training
# Both splits are formatted in batches of 100 conversations; the mapping adds
# the 'text' field that the trainer reads.
batched_map_options = dict(batched=True, batch_size=100)

train_formatted_ds = train_ds.map(
    formatting_chat_prompts_func,
    desc="Formatting train dataset",
    **batched_map_options,
)
val_formatted_ds = val_ds.map(
    formatting_chat_prompts_func,
    desc="Formatting validation dataset",
    **batched_map_options,
)

print("‚úì Chat template applied to datasets")
for split_label, formatted_split in (
    ("Train", train_formatted_ds),
    ("Validation", val_formatted_ds),
):
    print(f"{split_label} formatted: {len(formatted_split)} samples")

Formatting train dataset:   0%|          | 0/6264 [00:00<?, ? examples/s]

Formatting validation dataset:   0%|          | 0/787 [00:00<?, ? examples/s]

‚úì Chat template applied to datasets
Train formatted: 6264 samples
Validation formatted: 787 samples


In [6]:
# Preview the first formatted training sample, capped at 2000 characters.
PREVIEW_CHARS = 2000
first_formatted_text = train_formatted_ds[0]['text']
banner = "=" * 60

print(banner)
print("Example formatted text (with Llama-3.2 chat template):")
print(banner)
print(first_formatted_text[:PREVIEW_CHARS])
# Trailing ellipsis only when the preview was actually cut short.
print("..." if len(first_formatted_text) > PREVIEW_CHARS else "")

Example formatted text (with Llama-3.2 chat template):
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

You are an expert at summarizing BMW news articles. Provide concise, informative summaries that capture the key points.<|eot_id|><|start_header_id|>user<|end_header_id|>

Summarize the following BMW news article in a concise way.

‚ÄúWe are delighted to be joining BMW M in celebrating the 25th anniversary season of our partnership in 2023,‚Äù said Carmelo Ezpeleta, CEO of Dorna Sports. ‚ÄúIn BMW M we have had a strong partner at our side for a quarter of a century; a partner with whom we have enjoyed superb collaboration in many different areas. We are very proud of this long-standing partnership that is never at a standstill, it gives plenty of fresh momentum each year. With the most innovative technologies, BMW M is taking care of safety in our sport for the 25th year now, and is a firm fixture in the MotoG

### Filter long samples

In [7]:
# Chat-template headers that open a user turn and an assistant turn.
# (Fix: a stray trailing comma previously turned `instruction_part` into a
# one-element tuple instead of a string.)
instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n"
response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n"
# Token ids of the assistant header, tokenized without adding special tokens,
# so the sequence can be searched for inside a tokenized sample.
response_tokens = tokenizer(response_part, add_special_tokens=False)["input_ids"]

def filter_long_samples(example):
    """
    Decide whether a formatted sample is usable at MAX_SEQ_LENGTH.

    A sample is kept when either
      * its whole tokenized text fits within MAX_SEQ_LENGTH, or
      * it is longer, but the first assistant header ends before position
        MAX_SEQ_LENGTH, so at least one response token remains once the
        sequence is cut to that length.

    Over-long samples whose assistant header starts at or straddles the
    cut-off are dropped: after truncation they would contain no response
    tokens to train on.

    Args:
        example: a single row with a 'text' field (chat-template string).

    Returns:
        bool: True to keep the sample, False to drop it.
    """
    tokens = tokenizer(example["text"], add_special_tokens=False)["input_ids"]
    if len(tokens) <= MAX_SEQ_LENGTH:
        return True

    marker_len = len(response_tokens)
    # "+ 1" so the final possible window (marker at the very end) is checked too.
    for start in range(len(tokens) - marker_len + 1):
        if tokens[start:start + marker_len] == response_tokens:
            # The complete header plus at least one following token must lie
            # inside the first MAX_SEQ_LENGTH positions.
            return start + marker_len < MAX_SEQ_LENGTH
    return True  # Keep if no assistant marker found (shouldn't happen)

# Record the pre-filter sizes so the number of dropped samples can be reported.
train_before, val_before = len(train_formatted_ds), len(val_formatted_ds)

train_formatted_ds = train_formatted_ds.filter(
    filter_long_samples,
    desc="Filtering long train samples",
)
val_formatted_ds = val_formatted_ds.filter(
    filter_long_samples,
    desc="Filtering long val samples",
)

# How many rows each split lost to the length filter.
train_filtered = train_before - len(train_formatted_ds)
val_filtered = val_before - len(val_formatted_ds)
if any(dropped > 0 for dropped in (train_filtered, val_filtered)):
    print(f"Filtered out {train_filtered} train and {val_filtered} val samples (too long, assistant would be truncated)")


Filtering long train samples:   0%|          | 0/6264 [00:00<?, ? examples/s]

Filtering long val samples:   0%|          | 0/787 [00:00<?, ? examples/s]

Filtered out 112 train and 20 val samples (too long, assistant would be truncated)


# Train
---
---

### Add LoRA adapters for fine-tuning

In [8]:
# Wrap the base model with LoRA adapters so that only the adapter weights are
# trained (original note: roughly 1 to 10% of all parameters).
lora_options = dict(
    r=32,  # LoRA rank; any value > 0 (suggested: 8, 16, 32, 64, 128)
    # Attention and MLP projection layers that receive adapters.
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,  # any value is supported, 0 is the optimized path
    bias="none",     # any value is supported, "none" is the optimized path
    # True or "unsloth"; original note: "unsloth" uses 30% less VRAM and
    # suits very long context.
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,   # rank-stabilized LoRA is available but not used
    loftq_config=None,  # LoftQ is available but not used
)
model = FastLanguageModel.get_peft_model(model, **lora_options)

Unsloth 2026.1.2 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


In [None]:
import wandb
import weave
from dotenv import load_dotenv

load_dotenv()  # Load WANDB_API_KEY from .env file

# Start the Weights & Biases run that the trainer reports to.
# The logged dataset entries point at the files this notebook actually loads,
# replacing a hard-coded file name that matched none of them.
wandb.init(
    project=WANDB_PROJECT,
    name=WANDB_RUN_NAME,
    entity=WANDB_ENTITY,  # None means use the default account
    config={
        "model": "Llama-3.2-1B-Instruct",
        "task": "BMW News Fine-tuning",
        "method": "LoRA",
        "dataset": TRAIN_CHAT_DATA_FILE_NAME,
        "val_dataset": VAL_CHAT_DATA_FILE_NAME,
    },
)


In [10]:
# Training hyperparameters, evaluation cadence and checkpoint policy.
training_args = SFTConfig(
    # Batching
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    # Optimization
    num_train_epochs=20,
    learning_rate=2e-4,
    warmup_steps=10,
    lr_scheduler_type="linear",
    optim="adamw_8bit",
    weight_decay=0.001,
    seed=3407,
    # Evaluation and logging
    eval_strategy="steps",
    eval_steps=50,
    logging_steps=10,
    report_to="wandb",
    # Checkpoints: saved on the same 50-step cadence as evaluation, at most
    # three kept, and the one with the lowest eval_loss restored at the end.
    output_dir=CHECKPOINT_DIR,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# Supervised fine-tuning on the chat-template-formatted datasets
# (train_formatted_ds / val_formatted_ds), reading their 'text' field.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_formatted_ds,
    eval_dataset=val_formatted_ds,
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LENGTH,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    packing=False,  # original note: packing can speed up short sequences
    args=training_args,
)

print("‚úì SFTTrainer configured")
for split_label, split_ds in (
    ("Train", train_formatted_ds),
    ("Validation", val_formatted_ds),
):
    print(f"  {split_label} samples: {len(split_ds)}")

Unsloth: Tokenizing ["text"] (num_proc=36):   0%|          | 0/6152 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=36):   0%|          | 0/767 [00:00<?, ? examples/s]

‚úì SFTTrainer configured
  Train samples: 6152
  Validation samples: 767


We also use Unsloth's `train_on_responses_only` function so that the loss is computed only on the assistant outputs and the system and user prompts are ignored.

In [11]:
from unsloth.chat_templates import train_on_responses_only
# Headers that open a user turn and an assistant turn in the Llama-3.2
# template; they tell the helper where instructions end and responses begin.
user_turn_header = "<|start_header_id|>user<|end_header_id|>\n\n"
assistant_turn_header = "<|start_header_id|>assistant<|end_header_id|>\n\n"

trainer = train_on_responses_only(
    trainer,
    instruction_part=user_turn_header,
    response_part=assistant_turn_header,
)

Map (num_proc=36):   0%|          | 0/6152 [00:00<?, ? examples/s]

Map (num_proc=36):   0%|          | 0/767 [00:00<?, ? examples/s]

We verify masking is actually done:

In [12]:
# Decode the full token sequence of one training sample for inspection.
sample_input_ids = trainer.train_dataset[5]["input_ids"]
tokenizer.decode(sample_input_ids)

'<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\nYou are an expert at creating headlines for BMW news articles. Generate concise, informative, and engaging titles.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nGenerate a concise and informative title for the following BMW news article.\n\n+++ Call for Metaverse solutions to solve specific industrial challenges +++ Opportunity of a partnership with the BMW Group for the winning teams +++Munich. The BMW Group Supplierthon with a focus on the ‚ÄúMetaverse and other Virtual Experiences‚Äù is starting today. The aim is to attract researchers, startups and pioneering tech leaders from within the global Metaverse community in order to gain an outside-in perspective. Application here: Link to Metaverse Supplierthon The Metaverse is the next iteration of the internet: a single, shared, persistent, immersive, 3D virtual space where humans ex

In [13]:
# Labels equal to -100 are swapped for the id of a single space so the
# sequence can be decoded and the masked span shows up as blank padding.
space = tokenizer(" ", add_special_tokens = False).input_ids[0]
sample_labels = trainer.train_dataset[5]["labels"]
visible_label_ids = [
    space if label_id == -100 else label_id
    for label_id in sample_labels
]
tokenizer.decode(visible_label_ids)

'                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       Launch of a Supplierthon for the ‚ÄûMetaverse and other Virtual Experiences‚Äù to crowd-source Innovation ‚Äì apply now!<|eot_id|>'

We can see the System and Instruction prompts are successfully masked!

In [14]:
# @title Show current memory stats
BYTES_PER_GB = 1024 ** 3

gpu_stats = torch.cuda.get_device_properties(0)
# Peak memory reserved so far and total device memory, both in GB.
start_gpu_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GB, 3)
max_memory = round(gpu_stats.total_memory / BYTES_PER_GB, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 5090. Max memory = 31.348 GB.
2.41 GB of memory reserved.


In [15]:
from transformers import EarlyStoppingCallback
# Early stopping settings:
#   patience  - how many evaluation rounds without improvement are tolerated
#               before training stops (the loss may rise briefly, then fall).
#   threshold - minimum change that counts as an improvement; 0.0 accepts any
#               decrease, a higher value demands a larger one.
early_stopping_options = {
    "early_stopping_patience": 10,
    "early_stopping_threshold": 0.0,
}
early_stopping_callback = EarlyStoppingCallback(**early_stopping_options)
trainer.add_callback(early_stopping_callback)

In [None]:
# Run fine-tuning with the configured trainer and keep the value returned by
# train() in `trainer_stats`.
trainer_stats = trainer.train()

### Save model

In [None]:
# Persist the LoRA adapter and then the tokenizer into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained(LORA_MODEL_PATH)

# Fold the LoRA weights into the base model and write the merged result.
# save_method options: "merged_16bit", "merged_4bit", "lora".
merged_save_method = "merged_16bit"
model.save_pretrained_merged(
    MERGED_MODEL_PATH,
    tokenizer,
    save_method=merged_save_method,
)


Found HuggingFace hub cache directory: /home/zewen/.cache/huggingface/hub
Checking cache directory for required files...


Unsloth: Copying 1 files from cache to `../bmw-llama-3.2-1b-1500articles/merged_model`: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  2.17it/s]


Successfully copied all 1 files from cache to `../bmw-llama-3.2-1b-1500articles/merged_model`
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 29330.80it/s]
Unsloth: Merging weights into 16bit: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:06<00:00,  6.56s/it]


Unsloth: Merge process complete. Saved to `/home/zewen/alwinyang91/ChatBMW/bmw-llama-3.2-1b-1500articles/merged_model`
