# Evaluate a Reduced Llama-3.2-1B Model

In [None]:
import os
import sys
import json
import random

import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from datasets import load_dataset, Dataset
from transformers import DataCollatorForSeq2Seq, EarlyStoppingCallback, AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, BitsAndBytesConfig
from trl import SFTConfig, SFTTrainer
from peft import LoraConfig, get_peft_model, PeftModel, PeftConfig

# Hugging Face hub id of the base checkpoint; printed so the cell output records it.
MODEL_NAME = "unsloth/Llama-3.2-1B-Instruct"
print(MODEL_NAME)

# Training settings.
# Llama 3.2 supports contexts of up to 128k tokens; 4096 is the limit chosen here.
MAX_SEQ_LENGTH = 4096
# Set to True to use 4-bit quantization and reduce memory usage.
LOAD_IN_4BIT = False

# Weights & Biases identifiers. The run name doubles as the checkpoint folder name.
WANDB_PROJECT = "BMW-Llama-3.2-1B"
WANDB_ENTITY = None
WANDB_RUN_NAME = "BMW-Llama-3.2-1B-2000Articles_pruned"

# Chat-formatted JSONL splits (train / validation / test).
TRAIN_CHAT_DATA_FILE_NAME = '../datasets/chat_data_2000/train_chat.jsonl'
VAL_CHAT_DATA_FILE_NAME = '../datasets/chat_data_2000/val_chat.jsonl'
TEST_CHAT_DATA_FILE_NAME = '../datasets/chat_data_2000/test_chat.jsonl'

# Checkpoint root for this run, plus the sub-folders for the LoRA adapter
# and the merged model.
CHECKPOINT_DIR = f"../{WANDB_RUN_NAME}"
LORA_MODEL_PATH = os.path.join(CHECKPOINT_DIR, "lora_model")
MERGED_MODEL_PATH = os.path.join(CHECKPOINT_DIR, "merged_model")

# Make sure every directory exists; `exist_ok=True` makes re-running the cell safe.
for _directory in (CHECKPOINT_DIR, LORA_MODEL_PATH, MERGED_MODEL_PATH):
    os.makedirs(_directory, exist_ok=True)

unsloth/Llama-3.2-1B-Instruct


In [None]:
# Load the held-out chat test split. The "json" builder puts everything from
# `data_files` under a single split named "train", hence `split="train"`.
dataset_test = load_dataset("json", data_files=TEST_CHAT_DATA_FILE_NAME, split="train")

# `load_dataset(..., split=...)` already returns a `datasets.Dataset`, so the
# previous `Dataset.from_list(dataset_test)` only rebuilt the same rows in
# memory. Reuse the loaded object; both names stay bound for later cells.
test_ds = dataset_test

print("HuggingFace Chat Datasets:")
print(f"  Test: {test_ds}")

✓ Loaded final test dataset: 444 samples
Features: {'messages': List({'role': Value('string'), 'content': Value('string')})}


# Test Model
---
---

In [3]:
# Tokenizer that belongs to the base checkpoint.
base_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Base checkpoint in bfloat16; device placement is delegated to `device_map='auto'`.
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    dtype=torch.bfloat16,
    device_map='auto',
)

# Remove the final decoder layer and keep the config's layer count in sync
# with the shortened stack.
base_model.model.layers = nn.ModuleList(list(base_model.model.layers)[:-1])
base_model.config.num_hidden_layers = len(base_model.model.layers)

# Switch to inference mode; as the last expression this also displays the
# module structure.
base_model.eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048, padding_idx=128004)
    (layers): ModuleList(
      (0-14): 15 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,)

In [5]:
# Tokenizer saved alongside the merged model.
test_tokenizer = AutoTokenizer.from_pretrained(MERGED_MODEL_PATH)

# Fall back to the EOS token if no pad token was stored with the tokenizer.
if test_tokenizer.pad_token is None:
    test_tokenizer.pad_token = test_tokenizer.eos_token

# Merged model under test, loaded in bfloat16 with automatic device placement.
test_model = AutoModelForCausalLM.from_pretrained(
    MERGED_MODEL_PATH,
    dtype=torch.bfloat16,
    device_map='auto',
)

# Switch to inference mode; as the last expression this also displays the
# module structure.
test_model.eval()

The tokenizer you are loading from '../bmw-llama-3.2-1b-2000articles_stretch_lessLayer/merged_model' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048, padding_idx=128004)
    (layers): ModuleList(
      (0-14): 15 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,)

# Evaluate Model Performance
---
---


Refer to the notebook `3.Evaulate_Llama3.2-1B.ipynb`