In [None]:
from huggingface_hub import login
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, set_seed, BitsAndBytesConfig
from datasets import load_dataset, Dataset, DatasetDict
import wandb
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig
from datetime import datetime
import matplotlib.pyplot as plt
import os

In [None]:
# Load variables from a local .env file into the process environment so the
# later cells can read HF_TOKEN and WANDB_API_KEY with os.getenv().
# override=True asks dotenv to replace variables that are already set.
from dotenv import load_dotenv
load_dotenv(override=True)

In [None]:
# Ask PyTorch to release its cached CUDA memory before the model is loaded
# (presumably to help when the notebook is re-run in the same kernel).
torch.cuda.empty_cache()

In [None]:
# Tracking / publishing switches - keep both off for quick experiments
LOG_TO_WANDB = False
PUSH_TO_HUB = False

# --- Model, project and run identity ---

BASE_MODEL = "google/gemma-3-270m"
PROJECT_NAME = "test-finetune-local"
HF_USER = "adamsarok"

# Timestamp identifying this run; extend it with a short note
# (e.g. "more dropout") to tell experiments apart.
RUN_NAME = datetime.now().strftime("%Y-%m-%d_%H.%M.%S")

PROJECT_RUN_NAME = "-".join((PROJECT_NAME, RUN_NAME))
HUB_MODEL_NAME = "/".join((HF_USER, PROJECT_RUN_NAME))

# --- Hyper-parameters: overall ---

EPOCHS = 2
BATCH_SIZE = 8
MAX_SEQUENCE_LENGTH = 128  # max token length per input
GRADIENT_ACCUMULATION_STEPS = 2

# --- Hyper-parameters: QLoRA ---

QUANT_4_BIT = True
LORA_R = 16
LORA_ALPHA = 2 * LORA_R
ATTENTION_LAYERS = ["q_proj", "v_proj", "k_proj", "o_proj"]
MLP_LAYERS = ["gate_proj", "up_proj", "down_proj"]
# Adapt the attention projections only; use ATTENTION_LAYERS + MLP_LAYERS
# to adapt the MLP projections as well.
TARGET_MODULES = ATTENTION_LAYERS
LORA_DROPOUT = 0.1

# --- Hyper-parameters: training ---

LEARNING_RATE = 1e-4
WARMUP_RATIO = 0.01  # fraction of training spent ramping up to LEARNING_RATE
LR_SCHEDULER_TYPE = 'cosine'  # starts high and drops off
WEIGHT_DECAY = 0.001
OPTIMIZER = "paged_adamw_8bit"  # alternative: "paged_adamw_32bit"

# Pick the mixed-precision mode from the GPU's compute capability:
# bf16 is enabled for major version 8 or newer, otherwise fp16 is used.
# get_device_capability() raises when no CUDA device is present, so fall back
# to (0, 0) in that case instead of failing the whole configuration cell.
if torch.cuda.is_available():
    capability = torch.cuda.get_device_capability()
else:
    capability = (0, 0)
use_bf16 = capability[0] >= 8

# Tracking

VAL_SIZE = 500   # max number of validation rows used from a Hub dataset
LOG_STEPS = 5    # log training metrics every N optimizer steps
SAVE_STEPS = 25  # save a checkpoint (and evaluate) every N optimizer steps

In [None]:
# Authenticate against the Hugging Face Hub with the token from the environment
# and also store it as a git credential.
hf_token = os.environ.get('HF_TOKEN')
login(token=hf_token, add_to_git_credential=True)

In [None]:
# Log in to Weights & Biases - only when tracking is switched on, so the
# notebook also runs without a W&B account.
wandb_api_key = os.getenv('WANDB_API_KEY')
if LOG_TO_WANDB:
    if not wandb_api_key:
        # Assigning None into os.environ would raise an opaque TypeError;
        # fail with an actionable message instead.
        raise RuntimeError(
            "LOG_TO_WANDB is True but WANDB_API_KEY is not set "
            "(add it to your .env file or environment)."
        )
    os.environ["WANDB_API_KEY"] = wandb_api_key
    wandb.login()

# Configure Weights & Biases to record against our project
os.environ["WANDB_PROJECT"] = PROJECT_NAME
os.environ["WANDB_LOG_MODEL"] = "false"
os.environ["WANDB_WATCH"] = "false"

In [None]:
from datasets import Dataset

# Tiny in-memory corpus: four short sentences, each wrapped as a {"text": ...} record
dataset = [
    {"text": sentence}
    for sentence in (
        "The sky is blue.",
        "Cats are pets.",
        "Water is wet.",
        "The sun is hot.",
    )
]

# NOTE: val and test reuse sentences that also appear in train, so they only
# smoke-test the pipeline and do not measure generalisation.
train = Dataset.from_list(dataset * 100)  # 4 sentences x 100 repeats = 400 samples
val = Dataset.from_list(dataset[:2])      # first 2 sentences
test = Dataset.from_list(dataset[2:])     # last 2 sentences

print(f"Train: {len(train)}, Val: {len(val)}, Test: {len(test)}")

print(f"Train example: {train[0]}")
print(f"Val example: {val[0]}")
print(f"Test example: {test[0]}")

In [None]:
# Optional: name of a dataset repo under HF_USER on the Hugging Face Hub.
# Leave empty to keep the train/val/test splits that are already defined.
INPUT_DATASET = ""
if INPUT_DATASET:
    DATASET_NAME = f"{HF_USER}/{INPUT_DATASET}" # hugging face dataset name

    dataset = load_dataset(DATASET_NAME)
    train = dataset['train']
    # Cap validation at VAL_SIZE rows, but never request more rows than the
    # split holds (select() raises on out-of-range indices).
    val = dataset['val']
    val = val.select(range(min(VAL_SIZE, len(val))))
    test = dataset['test']

In [None]:
# Open the W&B run for this training session (skipped when tracking is off)
if LOG_TO_WANDB:
    wandb.init(
        name=RUN_NAME,
        project=PROJECT_NAME,
    )

In [None]:
# Quantization settings used when loading the base model.
# Compute dtype for the 4-bit path: bf16 where the GPU supports it, else fp16.
compute_dtype = torch.bfloat16 if use_bf16 else torch.float16

if QUANT_4_BIT:
    # 4-bit NF4 weights with double quantization (the QLoRA setup)
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_quant_type="nf4",
    )
else:
    # 8-bit weights. BitsAndBytesConfig documents no `bnb_8bit_compute_dtype`
    # argument (only the bnb_4bit_* and llm_int8_* families), so the previous
    # keyword was dropped as an unused kwarg and had no effect.
    quant_config = BitsAndBytesConfig(load_in_8bit=True)

In [None]:
# Load the Tokenizer and the Model

# trust_remote_code is not passed: the evaluation cell at the end of the
# notebook loads the same tokenizer without it, so no repository-supplied
# code is needed.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
# Only fall back to EOS for padding when the tokenizer ships without a pad
# token; overwriting an existing pad token makes padding indistinguishable
# from the end-of-sequence marker.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Quantized base model; device_map="auto" lets the library place the weights
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=quant_config,
    device_map="auto",
)
# Keep generation padding consistent with the tokenizer
base_model.generation_config.pad_token_id = tokenizer.pad_token_id

print(f"Memory footprint: {base_model.get_memory_footprint() / 1e6:.1f} MB")

In [None]:
# LoRA adapter configuration: rank, scaling and dropout come from the
# hyper-parameter constants; only the modules in TARGET_MODULES are adapted
# and bias terms are left untouched.
lora_parameters = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=TARGET_MODULES,
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
)

In [None]:
# Training parameters
#
# All tunables come from the constants defined in the configuration cell so
# that the switches there (LOG_TO_WANDB, PUSH_TO_HUB, WEIGHT_DECAY, ...) are
# the single source of truth.

train_parameters = SFTConfig(
    # --- run identity / output ---
    output_dir=PROJECT_RUN_NAME,
    run_name=RUN_NAME,
    # --- schedule and batch shape ---
    num_train_epochs=EPOCHS,
    max_steps=-1,  # -1: train for num_train_epochs instead of a fixed step count
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    max_length=MAX_SEQUENCE_LENGTH,
    group_by_length=True,
    # --- optimisation ---
    optim=OPTIMIZER,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,  # was hard-coded, ignoring the constant
    max_grad_norm=0.5,
    warmup_ratio=WARMUP_RATIO,
    lr_scheduler_type=LR_SCHEDULER_TYPE,
    # Exactly one mixed-precision mode is enabled, chosen from the GPU capability
    fp16=not use_bf16,
    bf16=use_bf16,
    # --- logging ---
    logging_steps=LOG_STEPS,
    # Use the explicit string "none": in transformers 4.x `report_to=None`
    # falls back to "all" installed integrations, so W&B would still log.
    report_to="wandb" if LOG_TO_WANDB else "none",
    # --- checkpointing / evaluation ---
    save_strategy="steps",
    save_steps=SAVE_STEPS,
    save_total_limit=10,
    eval_strategy="steps",
    eval_steps=SAVE_STEPS,
    # --- Hub upload: honour the PUSH_TO_HUB switch (was always True) ---
    push_to_hub=PUSH_TO_HUB,
    hub_strategy="every_save",
    hub_model_id=HUB_MODEL_NAME,
    hub_private_repo=True,
    # --- memory optimisation ---
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={'use_reentrant': False},
)

In [None]:
# Build the supervised fine-tuning trainer: quantized base model + LoRA adapter.
# The tokenizer prepared when the model was loaded is passed explicitly so its
# padding settings are the ones used for training; otherwise the trainer
# creates its own tokenizer and that configuration is never applied.
fine_tuning = SFTTrainer(
    model=base_model,
    processing_class=tokenizer,
    train_dataset=train,
    eval_dataset=val,
    peft_config=lora_parameters,
    args=train_parameters,
)

In [None]:
# Fine-tune! Runs training with the trainer built in the previous cell;
# the call's return value is displayed as this cell's output.
fine_tuning.train()


In [None]:
# Push our fine-tuned model to Hugging Face.
# Use the fully qualified repo id (HF_USER/PROJECT_RUN_NAME) so the upload goes
# to the same repository that the training config names as hub_model_id,
# rather than depending on whichever account is logged in.
if PUSH_TO_HUB:
    fine_tuning.model.push_to_hub(HUB_MODEL_NAME, private=True)
    print(f"Saved to the hub: {HUB_MODEL_NAME}")

In [None]:
# Close the W&B run once training is done (no-op when tracking is off)
if LOG_TO_WANDB:
    wandb.finish()
    print(f"Finished logging to WandB for run: {PROJECT_RUN_NAME}")

In [None]:
# Load and test our fine-tuned model from local - final

from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.trainer_utils import get_last_checkpoint
from peft import PeftModel

# Folder that holds the LoRA adapter to test. Defaults to this run's output
# directory so the notebook works under Restart & Run All; point it at an
# earlier run's folder (e.g. "test-finetune-local-2025-12-21_19.05.57") to
# test that run instead.
ADAPTER_RUN_DIR = os.path.abspath(PROJECT_RUN_NAME)

# Prefer an adapter saved at the top of the run folder; otherwise fall back to
# the most recent checkpoint-N sub-folder written during training.
if os.path.isfile(os.path.join(ADAPTER_RUN_DIR, "adapter_config.json")):
    checkpoint_path = ADAPTER_RUN_DIR
elif os.path.isdir(ADAPTER_RUN_DIR):
    checkpoint_path = get_last_checkpoint(ADAPTER_RUN_DIR)
else:
    checkpoint_path = None

if checkpoint_path is None:
    raise FileNotFoundError(
        f"No LoRA adapter or checkpoint found in {ADAPTER_RUN_DIR}; "
        "run training first or set ADAPTER_RUN_DIR to an existing run folder."
    )

# Load base model (unquantized) and let the library choose the device
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto"
)

# Load LoRA adapter from local checkpoint
model = PeftModel.from_pretrained(
    base_model,
    checkpoint_path
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

# Generate text. Inputs are moved to the model's own device instead of a
# hard-coded "cuda", matching device_map="auto" above.
inputs = tokenizer("The sky is", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_length=50)
print(tokenizer.decode(outputs[0]))