<a href="https://colab.research.google.com/github/calvinli2024/CS614-genai/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Packages

In [1]:
%pip install evaluate peft huggingface-hub

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
from dataclasses import dataclass
from typing import Dict, List, Optional, Any

import torch
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    AutoModelForCausalLM,
    pipeline,
    EarlyStoppingCallback
)

import evaluate
from evaluate import evaluator

import itertools
import json
from copy import deepcopy

from peft import LoraConfig, TaskType, get_peft_model

from IPython.display import display, Markdown

import random

from huggingface_hub import HfApi

from kaggle_secrets import UserSecretsClient

2026-02-21 10:49:10.189077: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1771670950.212913     906 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1771670950.220513     906 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1771670950.240412     906 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1771670950.240434     906 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1771670950.240437     906 computation_placer.cc:177] computation placer alr

In [3]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Dataset

In [4]:
dataset = load_dataset("sh0416/ag_news")

dataset = {
    "train": dataset["train"].shuffle(seed=42).select(range(5000)),
    "valid": dataset["train"].shuffle(seed=42).select(range(5000, 6000)),
    "test": dataset["test"].shuffle(seed=42).select(range(2000))
}

num_labels = len(set(dataset["train"]["label"]))

# Tokenizer

In [5]:
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Model

In [6]:
model = AutoModelForSequenceClassification.from_pretrained(
    "Qwen/Qwen3-0.6B",
    num_labels=num_labels,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    pad_token_id=tokenizer.pad_token_id,
)

lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False, # set to False for training
    r=8, # dimension of the smaller matrices
    lora_alpha=32, # scaling factor
    lora_dropout=0.1 # dropout of LoRA layers
)

model = get_peft_model(model, lora_config)

device = "cuda" if torch.cuda.is_available() else "cpu"

model.to(device)

`torch_dtype` is deprecated! Use `dtype` instead!
Some weights of Qwen3ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen3-0.6B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): Qwen3ForSequenceClassification(
      (model): Qwen3Model(
        (embed_tokens): Embedding(151936, 1024, padding_idx=151643)
        (layers): ModuleList(
          (0-27): 28 x Qwen3DecoderLayer(
            (self_attn): Qwen3Attention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=1024, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1024, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
      

# Preprocess

In [7]:
max_length = 256

def preprocess(dataset_split, select_columns: List[str]):
    def run_tokenizer(row):
      return tokenizer(
          row["text"],
          padding="max_length",
          truncation=True,
          max_length=max_length,
      )

    def prepare_columns(row):
      row["text"] = row["title"] + " " + row["description"]
      row["label"] = row["label"] - 1

      return row

    dataset_split = dataset_split.map(prepare_columns)

    encoded_dataset = dataset_split.map(run_tokenizer, batched=True)

    encoded_dataset = encoded_dataset.remove_columns(
        [col for col in encoded_dataset.column_names if col not in select_columns]
    )

    return encoded_dataset.with_format("torch")

train_dataset = preprocess(dataset['train'], ["input_ids", "attention_mask", "label"])
valid_dataset = preprocess(dataset['valid'], ["input_ids", "attention_mask", "label"])
test_dataset = preprocess(dataset['test'], ["label", "text"])

# Metrics

In [None]:
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    results = accuracy.compute(predictions=preds, references=labels)
    results.update(
        f1.compute(predictions=preds, references=labels, average="macro")
    )

    return results

# Initial Performance

In [None]:
def test_performance(model: str):
    model = AutoModelForSequenceClassification.from_pretrained(
        model,
        torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
        pad_token_id=tokenizer.pad_token_id,
        num_labels=num_labels
    )

    classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

    label_mapping={f"LABEL_{i}": i for i in range(num_labels)}

    predictions_raw = classifier(list(test_dataset["text"]), batch_size=32)

    predicted_labels_str = [pr["label"] for pr in predictions_raw]

    predictions = [label_mapping[str_label] for str_label in predicted_labels_str]

    final_f1 = evaluate.load("f1").compute(predictions=predictions, references=list(test_dataset["label"]), average="macro")["f1"]

    return final_f1

initial_f1 = test_performance("Qwen/Qwen3-0.6B")

# Hyperparameter Tuning

## Training Arguments

In [None]:
training_arguments = TrainingArguments(
    output_dir="qwen3-0.6-finetuned",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=100,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=0,
    lr_scheduler_type="cosine",
    gradient_accumulation_steps=4,
    fp16=False,
    report_to="none",
    save_steps=50,
    eval_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True
)

early_stop = EarlyStoppingCallback(early_stopping_patience=2, early_stopping_threshold=0.02)

trainer_kwargs = dict(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stop]
)

## Search Space

In [None]:
hyperparameters = []

for learning_rate in [1e-4, 1e-3]:
    for num_train_epochs in [1, 2]:
        for lr_scheduler_type in ["linear", "cosine"]:
            for gradient_accumulation_steps in [4, 8]:
                for weight_decay in [0.01, 0.1]:
                    hyperparameters.append({
                        "learning_rate": learning_rate,
                        "num_train_epochs": num_train_epochs,
                        "lr_scheduler_type": lr_scheduler_type,
                        "gradient_accumulation_steps": gradient_accumulation_steps,
                        "weight_decay": weight_decay
                    })

random.seed(42)
random.shuffle(hyperparameters)
hyperparameters = hyperparameters[:len(hyperparameters) // 2]

## Helper Functions

In [None]:
def run_single_experiment(
    base_training_args: TrainingArguments,
    trainer_cls,
    trainer_kwargs: Dict[str, Any],
    hp_config: Dict[str, float|int],
    idx: int
) -> Dict[str, Any]:
    args_dict = base_training_args.to_dict()
    for k, v in hp_config.items():
        args_dict[k] = v

    training_args = TrainingArguments(**args_dict)

    trainer = trainer_cls(
        args=training_args,
        **trainer_kwargs,
    )

    train_output = trainer.train()
    eval_metrics = trainer.evaluate()

    trainer.save_model(f"checkpoint_{idx}")

    trainer.push_to_hub(
        repo_id="cli08/qwen3-0/6-finetuned",
        token=UserSecretsClient().get_secret("HF_TOKEN")
    )

    result = {
        "hp_config": hp_config,
        "train_samples": train_output.metrics.get("train_samples", None),
        "eval_metrics": eval_metrics,
    }

    return result

def grid_search_hyperparams(
    base_training_args: TrainingArguments,
    trainer_cls,
    trainer_kwargs: Dict[str, Any],
    hyperparameters: Dict[int, Dict[str, int|float]],
    results_path: str = "grid_search_results.jsonl",
) -> List[Dict[str, Any]]:
    all_results = {}

    os.makedirs(os.path.dirname(results_path) or ".", exist_ok=True)

    with open(results_path, "w", encoding="utf-8") as f_out:
        for idx, combo in enumerate(hyperparameters):
            print("\n=== Running config:", combo, "===")

            result = run_single_experiment(
                base_training_args=base_training_args,
                trainer_cls=trainer_cls,
                trainer_kwargs=deepcopy(trainer_kwargs),
                hp_config=combo,
                idx=idx
            )

            # Persist each result as one JSON line
            f_out.write(json.dumps(result) + "\n")
            f_out.flush()

            all_results[idx] = result

    return all_results

# Train

In [None]:
results = grid_search_hyperparams(
    base_training_args=training_arguments,
    trainer_cls=Trainer,
    trainer_kwargs=trainer_kwargs,
    hyperparameters=hyperparameters,
    results_path="grid_search_results.jsonl",
)

In [None]:
best_config = max(results.items(), key=lambda r: r[1]["eval_metrics"].get("eval_f1", 0.0))

print("Best index:", best_config[0])
print("Best config:", best_config[1]["hp_config"])
print("Best metrics:", best_config[1]["eval_metrics"])

# Post Fine-Tuning Performance

In [None]:
finetuned_f1 = test_performance(f"./checkpoint_{best_config[0]}")

In [None]:
display(Markdown("""
|Initial|Post|
|-------|----|
|{initial_f1}|{finetuned_f1}|
""").format(initial_f1=initial_f1, finetuned_f1=finetuned_f1))