In [None]:
import sys

# Make two ancestor directories importable (relative to the notebook's
# working directory); order matches the original two appends.
sys.path.extend(["../../../../", "../../"])

In [None]:
!pip install -q --upgrade bitsandbytes==0.48.2 trl==0.25.1 peft
!wget -q https://raw.githubusercontent.com/ed-donner/llm_engineering/main/week7/util.py -O util.py

In [None]:
import os
import re
import math
from tqdm import tqdm
from huggingface_hub import login
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, set_seed
from datasets import load_dataset, Dataset, DatasetDict
from datetime import datetime
from peft import PeftModel
from util import evaluate


In [None]:
# Naming for a run stamped with the current time.
# NOTE: the "# Constants" cell further down reassigns BASE_MODEL,
# PROJECT_NAME, DATA_USER, DATASET_NAME, RUN_NAME, PROJECT_RUN_NAME and
# HUB_MODEL_NAME, so after a top-to-bottom run only USERNAME keeps the value
# set here.

# Dataset location on the Hugging Face Hub
DATA_USER = "ed-donner"
DATASET_NAME = f"{DATA_USER}/items_prompts_lite"

# Base checkpoint and Hub account / project naming
BASE_MODEL = "meta-llama/Llama-3.2-3B"
PROJECT_NAME = "price-prediction"
USERNAME = "ujalaarshad17"

# Timestamped run identifier, e.g. 2025-12-15_14.35.02
RUN_NAME = datetime.now().strftime("%Y-%m-%d_%H.%M.%S")
PROJECT_RUN_NAME = f"{PROJECT_NAME}-{RUN_NAME}"
HUB_MODEL_NAME = f"{USERNAME}/{PROJECT_RUN_NAME}"

In [None]:
from dotenv import load_dotenv

# Read the .env file; override=True lets its values replace variables that
# are already set in the process environment.
load_dotenv(override=True)

In [None]:
# Constants

# Hugging Face account that hosts the fine-tuned adapter
HF_USER = "ujalaarshad17" # your HF name here!

# Base checkpoint the adapter is applied to
BASE_MODEL = "meta-llama/Llama-3.2-3B"

# NOTE(review): LITE_MODE is not read anywhere in the visible cells;
# DATASET_NAME below always points at the "lite" dataset.
LITE_MODE = True
DATA_USER = "ed-donner"
DATASET_NAME = f"{DATA_USER}/items_prompts_lite"

# Identify the training run to evaluate: project + run timestamp give the
# Hub repo name, and REVISION is passed as `revision` when the adapter loads.
PROJECT_NAME = "my-second-price-p"
RUN_NAME = "2025-12-15_14.35.02"
REVISION = "aa26229b1f08fccf94580f83df5f5107a1ce1bc5"

PROJECT_RUN_NAME = f"{PROJECT_NAME}-{RUN_NAME}"
HUB_MODEL_NAME = f"{HF_USER}/{PROJECT_RUN_NAME}"


# Hyper-parameters - QLoRA

# True selects the 4-bit NF4 config, False the 8-bit config
# (see the "pick the right quantization" cell).
QUANT_4_BIT = False

# (major, minor) compute capability of the current CUDA device.
# get_device_capability() raises when no CUDA device is visible, so fall back
# to (0, 0) in that case; this keeps the config cell runnable and simply
# leaves bf16 disabled.
capability = torch.cuda.get_device_capability() if torch.cuda.is_available() else (0, 0)

# bfloat16 is used only when the device's major capability is 8 or higher;
# otherwise the notebook falls back to float16.
use_bf16 = capability[0] >= 8

In [None]:
# Authenticate with the Hugging Face Hub using the HF_TOKEN environment
# variable, and also store the token as a git credential.
hf_token = os.environ.get("HF_TOKEN")
login(token=hf_token, add_to_git_credential=True)

In [None]:
# Load the prompt dataset from the Hub and keep a handle on its "test"
# split, which is what the evaluation at the end of the notebook consumes.
dataset = load_dataset(path=DATASET_NAME)
test = dataset["test"]

In [None]:
# Show one record of the test split (index 1) as a sanity check on its fields.
test[1]

In [None]:
# pick the right quantization

if QUANT_4_BIT:
    # 4-bit NF4 weights with double quantization; compute runs in bfloat16
    # when use_bf16 is set, float16 otherwise.
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16 if use_bf16 else torch.float16,
        bnb_4bit_quant_type="nf4",
    )
else:
    # 8-bit loading. BitsAndBytesConfig has no 8-bit counterpart of
    # `bnb_4bit_compute_dtype`: a `bnb_8bit_compute_dtype` argument is
    # swallowed by **kwargs and ignored, so none is passed here.
    quant_config = BitsAndBytesConfig(load_in_8bit=True)

In [None]:
# Load the Tokenizer and the Model

# The Llama tokenizer is a built-in transformers class, so there is no need
# to opt in to executing code from the model repository (trust_remote_code).
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
# Reuse the end-of-sequence token for padding, and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Base model, quantized per quant_config and placed on devices automatically.
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=quant_config,
    device_map="auto",
)
# Keep generate() consistent with the tokenizer's padding token.
base_model.generation_config.pad_token_id = tokenizer.pad_token_id

# Wrap the base model with the fine-tuned adapter, pinned to an exact
# revision of the Hub repo so the evaluated weights are reproducible.
fine_tuned_model = PeftModel.from_pretrained(base_model, HUB_MODEL_NAME, revision=REVISION)

print(f"Memory footprint: {fine_tuned_model.get_memory_footprint() / 1e6:.1f} MB")

In [None]:
def model_predict(item):
    """Generate the fine-tuned model's continuation for a single item.

    Args:
        item: Mapping with a "prompt" entry holding the text to complete.

    Returns:
        The decoded text of the newly generated tokens only (at most 8);
        the prompt tokens are stripped before decoding.
    """
    # The model is placed with device_map="auto", so move the inputs to the
    # model's own device instead of assuming a device literally named "cuda".
    inputs = tokenizer(item["prompt"], return_tensors="pt").to(fine_tuned_model.device)
    with torch.no_grad():
        output_ids = fine_tuned_model.generate(**inputs, max_new_tokens=8)
    # generate() returns prompt + continuation; keep only what follows the prompt.
    prompt_len = inputs["input_ids"].shape[1]
    generated_ids = output_ids[0, prompt_len:]
    return tokenizer.decode(generated_ids)

In [None]:
# Seed the random number generators so repeated runs produce the same
# generations, then evaluate model_predict over the test split.
set_seed(seed=42)
evaluate(model_predict, test)