# Fine-Tune a Reduced (Layer-Pruned) Llama 3.2 1B Model

In [1]:
import os
import sys
import json
import random

import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from datasets import load_dataset, Dataset
from transformers import DataCollatorForSeq2Seq, EarlyStoppingCallback, AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, BitsAndBytesConfig
from trl import SFTConfig, SFTTrainer
from peft import LoraConfig, get_peft_model, PeftModel, PeftConfig

# Base checkpoint that gets pruned and fine-tuned in this notebook.
MODEL_NAME = "unsloth/Llama-3.2-1B-Instruct"
print(MODEL_NAME)

# --- Training settings -------------------------------------------------------
# Token cap chosen for this run (the original note says Llama 3.2 supports
# up to 128k tokens).
MAX_SEQ_LENGTH = 4096
# Switch for 4-bit quantization to reduce memory usage; False keeps it off.
LOAD_IN_4BIT = False

# --- Weights & Biases run identification --------------------------------------
WANDB_PROJECT = "BMW-Llama-3.2-1B"
WANDB_ENTITY = None
WANDB_RUN_NAME = "BMW-Llama-3.2-1B-2000Articles_pruned"

# --- Chat-formatted JSONL splits ----------------------------------------------
TRAIN_CHAT_DATA_FILE_NAME = '../datasets/chat_data_2000/train_chat.jsonl'
VAL_CHAT_DATA_FILE_NAME = '../datasets/chat_data_2000/val_chat.jsonl'
TEST_CHAT_DATA_FILE_NAME = '../datasets/chat_data_2000/test_chat.jsonl'

# --- Output locations ---------------------------------------------------------
# Everything produced by the run lives under a directory named after the run:
# the LoRA adapter in one sub-directory, the merged model in another.
CHECKPOINT_DIR = f"../{WANDB_RUN_NAME}"
LORA_MODEL_PATH = os.path.join(CHECKPOINT_DIR, "lora_model")
MERGED_MODEL_PATH = os.path.join(CHECKPOINT_DIR, "merged_model")

# Create the directories up front; existing ones are left untouched.
for output_dir in (CHECKPOINT_DIR, LORA_MODEL_PATH, MERGED_MODEL_PATH):
    os.makedirs(output_dir, exist_ok=True)

unsloth/Llama-3.2-1B-Instruct


## Load Model 

In [2]:
# Tokenizer for the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    # No dedicated padding token configured: reuse end-of-sequence for padding.
    tokenizer.pad_token = tokenizer.eos_token

# Model weights; device placement is delegated to device_map='auto'.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map='auto')

def formatting_function(examples):
    """Render a record's chat ``messages`` through the tokenizer's chat template.

    The template is applied with ``tokenize=False`` and
    ``add_generation_prompt=False``, i.e. the call asks for templated text
    rather than token ids and does not append an assistant generation prompt.

    Args:
        examples: Mapping with a ``"messages"`` entry holding the chat turns.

    Returns:
        Whatever ``tokenizer.apply_chat_template`` yields for those messages.
    """
    conversation = examples["messages"]
    return tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=False,
    )

In [3]:
# Keep a handle on the decoder-layer stack; the pruning cell slices this same list.
layers = model.model.layers
print(f"Original layer count: {len(layers)}")

Original layer count: 16


In [4]:
# Prune the network: keep every decoder layer except the final one, then
# bring the config's layer count in line with the shortened stack.
model.model.layers = torch.nn.ModuleList(layers[:-1])
model.config.num_hidden_layers = len(model.model.layers)

pruned_layers = model.model.layers

# Summary of the pruned stack
print(f"Number of layers: {len(pruned_layers)}")
print(f"Config num_hidden_layers: {model.config.num_hidden_layers}")
print(f"\nLayer structure:")

# Sub-modules reported per layer, in display order; missing ones are skipped.
submodule_names = ('self_attn', 'mlp', 'input_layernorm', 'post_attention_layernorm')
for idx, block in enumerate(pruned_layers):
    print(f"  Layer {idx}: {type(block).__name__}")
    for sub_name in submodule_names:
        if hasattr(block, sub_name):
            print(f"    - {sub_name}: {type(getattr(block, sub_name)).__name__}")

# Full module repr of the first layer as a representative example
if len(pruned_layers) > 0:
    print(f"\nDetailed info for first layer (Layer 0):")
    print(pruned_layers[0])

Number of layers: 15
Config num_hidden_layers: 15

Layer structure:
  Layer 0: LlamaDecoderLayer
    - self_attn: LlamaAttention
    - mlp: LlamaMLP
    - input_layernorm: LlamaRMSNorm
    - post_attention_layernorm: LlamaRMSNorm
  Layer 1: LlamaDecoderLayer
    - self_attn: LlamaAttention
    - mlp: LlamaMLP
    - input_layernorm: LlamaRMSNorm
    - post_attention_layernorm: LlamaRMSNorm
  Layer 2: LlamaDecoderLayer
    - self_attn: LlamaAttention
    - mlp: LlamaMLP
    - input_layernorm: LlamaRMSNorm
    - post_attention_layernorm: LlamaRMSNorm
  Layer 3: LlamaDecoderLayer
    - self_attn: LlamaAttention
    - mlp: LlamaMLP
    - input_layernorm: LlamaRMSNorm
    - post_attention_layernorm: LlamaRMSNorm
  Layer 4: LlamaDecoderLayer
    - self_attn: LlamaAttention
    - mlp: LlamaMLP
    - input_layernorm: LlamaRMSNorm
    - post_attention_layernorm: LlamaRMSNorm
  Layer 5: LlamaDecoderLayer
    - self_attn: LlamaAttention
    - mlp: LlamaMLP
    - input_layernorm: LlamaRMSNorm
    -

In [5]:
# Sanity check after pruning: the module list length and the config value should agree.
print(f"New layer count: {len(model.model.layers)}")
print(f"Config num_layers: {model.config.num_hidden_layers}")

New layer count: 15
Config num_layers: 15


In [6]:
# LoRA setup: adapters are attached to the attention projections (q/k/v/o)
# and to the MLP projections (gate/up/down) of the pruned model.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=32,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

# Wrap the model; from here on `model` is the PEFT-wrapped version.
model = get_peft_model(model, lora_config)


In [7]:
# Read the three JSONL splits. Each file is loaded on its own, so the loader
# exposes its rows under the "train" split name regardless of which file it is.
dataset_train, dataset_val, dataset_test = (
    load_dataset("json", data_files=split_file, split="train")
    for split_file in (
        TRAIN_CHAT_DATA_FILE_NAME,
        VAL_CHAT_DATA_FILE_NAME,
        TEST_CHAT_DATA_FILE_NAME,
    )
)

# -----------------------------------------------------------------------------
# Build the HuggingFace Dataset objects used by the trainer from the chat data
# -----------------------------------------------------------------------------
train_ds, val_ds, test_ds = map(
    Dataset.from_list,
    (dataset_train, dataset_val, dataset_test),
)

print("HuggingFace Chat Datasets:")
for split_label, split_ds in (
    ("Train", train_ds),
    ("Validation", val_ds),
    ("Test", test_ds),
):
    print(f"  {split_label}: {split_ds}")

print(f"\nDataset features: {train_ds.features}")
print(f"\nFirst sample structure:")
# Left as the cell's final expression so the notebook displays the record.
train_ds[0]

HuggingFace Chat Datasets:
  Train: Dataset({
    features: ['messages'],
    num_rows: 6264
})
  Validation: Dataset({
    features: ['messages'],
    num_rows: 787
})
  Test: Dataset({
    features: ['messages'],
    num_rows: 787
})

Dataset features: {'messages': List({'content': Value('string'), 'role': Value('string')})}

First sample structure:


{'messages': [{'content': 'You are an expert at summarizing BMW news articles. Provide concise, informative summaries that capture the key points.',
   'role': 'system'},
  {'content': 'Summarize the following BMW news article in a concise way.\n\n“We are delighted to be joining BMW M in celebrating the 25th anniversary season of our partnership in 2023,” said Carmelo Ezpeleta, CEO of Dorna Sports. “In BMW M we have had a strong partner at our side for a quarter of a century; a partner with whom we have enjoyed superb collaboration in many different areas. We are very proud of this long-standing partnership that is never at a standstill, it gives plenty of fresh momentum each year. With the most innovative technologies, BMW M is taking care of safety in our sport for the 25th year now, and is a firm fixture in the MotoGP paddock with a wide range of activities. Here’s to a fantastic anniversary season in 2023!”\n\n“2023 is our 25th season as Official Car of MotoGP – a long-standing, cl

# Train
---
---

### Fine-tune the LoRA adapters

In [None]:
import wandb
import weave
from dotenv import load_dotenv

# Read environment variables (WANDB_API_KEY is expected there) from a .env file.
load_dotenv()

# Start the Weights & Biases run that the trainer reports to.
wandb.init(
    entity=WANDB_ENTITY,  # None means: use the default account
    project=WANDB_PROJECT,
    name=WANDB_RUN_NAME,
)

In [9]:
# Supervised fine-tuning trainer for the pruned, LoRA-wrapped model.
#
# The datasets keep their original "messages" field; `formatting_function`
# renders each record through the chat template before tokenization.
trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    formatting_func=formatting_function,
    args=SFTConfig(
        # Sequence cap. Without this the notebook's MAX_SEQ_LENGTH setting is
        # never applied and SFTConfig's own (shorter) default is used, which
        # truncates long article + summary samples.
        max_length=MAX_SEQ_LENGTH,
        # Batching
        per_device_train_batch_size=2,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=1,
        # Optimisation
        num_train_epochs=20,
        learning_rate=2e-4,
        warmup_steps=10,
        lr_scheduler_type="linear",
        optim="adamw_8bit",
        weight_decay=0.001,
        seed=3407,
        # Evaluation and checkpointing share the same 50-step cadence, so every
        # evaluated step has a checkpoint that can be restored as "best".
        eval_strategy="steps",
        eval_steps=50,
        save_strategy="steps",
        save_steps=50,
        save_total_limit=3,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,  # lower eval loss is better
        # Logging
        logging_steps=10,
        report_to="wandb",
        output_dir=CHECKPOINT_DIR,
    ),
)

Applying formatting function to train dataset:   0%|          | 0/6264 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/6264 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/6264 [00:00<?, ? examples/s]

Applying formatting function to eval dataset:   0%|          | 0/787 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/787 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/787 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


In [10]:
# @title Show current memory stats
# Report the properties of CUDA device 0 and the peak memory reserved so far,
# both converted from bytes to GB (1024**3 bytes) and rounded to 3 decimals.
gpu_stats = torch.cuda.get_device_properties(0)

reserved_bytes = torch.cuda.max_memory_reserved()
total_bytes = gpu_stats.total_memory

start_gpu_memory = round(reserved_bytes / 1024 ** 3, 3)
max_memory = round(total_bytes / 1024 ** 3, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 5090. Max memory = 31.348 GB.
4.746 GB of memory reserved.


In [11]:
# Stop training early once the monitored metric (eval_loss, per the trainer's
# metric_for_best_model) stops improving.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience = 20,     # Number of consecutive evaluation calls (not training steps)
                                     # without improvement that are tolerated before stopping.
                                     # NOTE(review): with eval_steps=50 this is 20 * 50 = 1000 steps.
    early_stopping_threshold = 0.0,  # Minimum change in the metric that counts as an improvement.
                                     # 0.0 means any decrease in eval loss resets the patience
                                     # counter; e.g. 0.01 would require a drop of more than 0.01.
)
trainer.add_callback(early_stopping_callback)

In [None]:
# Run fine-tuning; the value returned by train() is stored in trainer_stats.
trainer_stats = trainer.train()

### Save model

In [None]:
# Save the LoRA model and the tokenizer side by side in LORA_MODEL_PATH
# (trainer.model already contains the trained LoRA weights).
trainer.model.save_pretrained(LORA_MODEL_PATH)
tokenizer.save_pretrained(LORA_MODEL_PATH)

In [None]:
# Reload a clean copy of the base model to merge the trained adapters into.
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    dtype=torch.bfloat16,
    device_map='auto'
    )

# The adapters were trained on the pruned network (final decoder layer
# removed), so the freshly loaded base must be pruned the same way before they
# are attached. Otherwise the merged model keeps an extra, un-adapted layer
# and no longer matches what was trained. Updating the config makes the saved
# merged checkpoint reload with the reduced depth.
base_model.model.layers = torch.nn.ModuleList(base_model.model.layers[:-1])
base_model.config.num_hidden_layers = len(base_model.model.layers)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Ensure pad token is set (must match training configuration)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the adapters saved by this run (LORA_MODEL_PATH) rather than a
# hard-coded checkpoint directory belonging to a different experiment.
model_with_lora = PeftModel.from_pretrained(
    base_model,
    LORA_MODEL_PATH,
    )

# Fold the LoRA weights into the base weights and switch to inference mode.
merged_model = model_with_lora.merge_and_unload()
merged_model.eval()

# Export the merged model (safetensors) together with its tokenizer.
merged_model.save_pretrained(MERGED_MODEL_PATH, safe_serialization=True)
tokenizer.save_pretrained(MERGED_MODEL_PATH)