In [None]:
# IMPORTANT: some Kaggle data sources used by this notebook are private.
# Run this cell first so kagglehub is authenticated before any data is downloaded.
import kagglehub
# Start kagglehub's login flow for this session.
kagglehub.login()


In [None]:
# IMPORTANT: run this cell to download the competition data through kagglehub.
# Do NOT delete it: the path defined here is used again further down to locate
# 'test_unlabelled.pkl' for inference.
# NOTE: this notebook environment differs from Kaggle's Python
# environment, so libraries used by the notebook may be missing.

# Location reported by kagglehub for the downloaded competition files.
deep_learning_spring_2025_project_2_path = kagglehub.competition_download('deep-learning-spring-2025-project-2')

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install -U transformers
!pip install datasets
!pip install peft
!pip install evaluate

In [None]:
# -----------------------------
# 1. Import libraries
# -----------------------------
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset
from peft import get_peft_model, LoraConfig, TaskType
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
import pickle

# -----------------------------
# 2. Pick the compute device (CUDA GPU when present, otherwise CPU)
# -----------------------------
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# -----------------------------
# 3. Load and preprocess AGNEWS dataset
# -----------------------------
# Fetch the "ag_news" dataset; the cells below use its "train" and "test"
# splits and its "text" / "label" columns.
dataset = load_dataset("ag_news")
# Tokenizer for the same "roberta-base" checkpoint that is fine-tuned later.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Sequences are truncated and padded to a fixed length of 128 tokens.

    Args:
        examples: Mapping with a "text" entry (a batch handed over by `map`).

    Returns:
        The output of the module-level `tokenizer` for that text.
    """
    fixed_length_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(examples["text"], **fixed_length_options)

# Tokenize every split in batches, then rename the "label" column to "labels".
model_input_columns = ["input_ids", "attention_mask", "labels"]
tokenized_dataset = dataset.map(tokenize_function, batched=True).rename_column("label", "labels")
# Expose only the model input columns, formatted as torch tensors.
tokenized_dataset.set_format("torch", columns=model_input_columns)


In [None]:
# -----------------------------
# 4. Load RoBERTa model with LoRA adapters
# -----------------------------
# Adapter settings for this first model: rank 8, alpha 16, applied to the
# modules named "query" and "value". The sweep cell further down rebinds
# `model` with its own configurations.
lora_settings = {
    "r": 8,
    "lora_alpha": 16,
    "target_modules": ["query", "value"],
    "lora_dropout": 0.1,
    "bias": "none",
    "task_type": TaskType.SEQ_CLS,
}
lora_config = LoraConfig(**lora_settings)

# Wrap a fresh 4-label roberta-base classifier with the adapters.
model = get_peft_model(
    AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=4),
    lora_config,
)
model.to(device)
model.print_trainable_parameters()

In [None]:
# Environment sanity check: the module that defines TrainingArguments and the
# installed transformers version, one per line.
for diagnostic in (TrainingArguments.__module__, transformers.__version__):
    print(diagnostic)

In [None]:
# -----------------------------
# 5. Define training arguments
# -----------------------------
# Optimizer, schedule and batch-size settings shared by every training run.
optimization_settings = dict(
    optim="adamw_torch",
    learning_rate=3e-5,
    weight_decay=0.01,
    lr_scheduler_type="linear",
    warmup_steps=100,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
)
# Output locations and evaluation / checkpoint / reporting behaviour.
run_settings = dict(
    output_dir="./results",
    logging_dir="./logs",
    eval_strategy="epoch",
    save_strategy="no",
    report_to="none",
)
training_args = transformers.TrainingArguments(**run_settings, **optimization_settings)

def compute_metrics(eval_pred):
    """Compute classification accuracy for an evaluation pass.

    Args:
        eval_pred: Object that unpacks into a (logits, labels) pair.

    Returns:
        A dict with a single "accuracy" entry, comparing the arg-max over the
        last logits axis against the labels.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return dict(accuracy=accuracy_score(gold, predicted))

In [None]:
def get_trainer(model):
    """Build a `Trainer` for `model` from the notebook-wide training setup.

    Args:
        model: The model to train (the notebook passes LoRA-wrapped models).

    Returns:
        A `Trainer` using the module-level `training_args`, the tokenized
        "train" split for training, the tokenized "test" split for
        evaluation, the shared `tokenizer`, and `compute_metrics`.
    """
    import inspect

    # Newer transformers releases renamed Trainer's `tokenizer` argument to
    # `processing_class` (the old name is deprecated and scheduled for
    # removal). The notebook installs transformers unpinned, so use whichever
    # keyword the installed Trainer actually accepts.
    init_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_kwarg = "processing_class" if "processing_class" in init_params else "tokenizer"

    return Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        compute_metrics=compute_metrics,
        **{tokenizer_kwarg: tokenizer},
    )


In [None]:
# -----------------------------
# 6. Train the model with Hyperparameter sweep
# -----------------------------
# Sweeps LoRA rank `r` and `lora_alpha`, keeps only configurations within the
# trainable-parameter budget, trains each one and selects the most accurate.
# NOTE(review): selection uses accuracy on the AG News "test" split, so the
# reported accuracy of the selected model is optimistically biased.

import gc

from peft import LoraConfig, get_peft_model, TaskType
import pandas as pd

EXPERIMENT = True  # True to sweep, False for default single run
MAX_TRAINABLE_PARAMS = 1_000_000  # configurations above this budget are skipped

results = []
# Holds only the best trainer found so far, keyed by (r, alpha); keeping every
# trained model alive would accumulate GPU memory across the sweep.
trained_trainers = {}

grid = [(r, alpha)
        for r in [1, 2, 3, 4, 5]
        for alpha in [4, 8, 16, 32]]


def _count_trainable(module):
    """Return the number of parameters of `module` that require gradients."""
    return sum(p.numel() for p in module.parameters() if p.requires_grad)


def _build_lora_model(r, alpha):
    """Load a fresh 4-label roberta-base classifier wrapped with LoRA adapters."""
    cfg = LoraConfig(
        r=r,
        lora_alpha=alpha,
        lora_dropout=0.1,
        bias="none",
        target_modules=["query", "key", "value", "dense"],
        task_type=TaskType.SEQ_CLS,
    )
    base_model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=4)
    return get_peft_model(base_model, cfg)


def _train_and_evaluate(r, alpha):
    """Train one LoRA configuration and score it on the tokenized "test" split.

    Returns:
        A `(trainer, record)` pair where `record` is a dict with the keys
        "r", "alpha", "trainable_params" and "accuracy".
    """
    # Seed before the model is built so every configuration starts from the
    # same random state for its newly initialised weights.
    transformers.set_seed(training_args.seed)
    lora_model = _build_lora_model(r, alpha)
    lora_model.print_trainable_parameters()

    run_trainer = get_trainer(lora_model)
    run_trainer.train()
    run_metrics = run_trainer.evaluate(tokenized_dataset["test"])
    record = {
        "r": r,
        "alpha": alpha,
        "trainable_params": _count_trainable(lora_model),
        "accuracy": run_metrics["eval_accuracy"],
    }
    return run_trainer, record


def _release_unused_memory():
    """Collect unreachable objects and return cached GPU memory, if any."""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


if EXPERIMENT:
    # 1) Filter valid configs.
    # lora_alpha is only a scaling factor and adds no parameters, so the
    # trainable-parameter count is measured once per distinct r.
    trainable_by_r = {}
    for r, alpha in grid:
        if r not in trainable_by_r:
            probe = _build_lora_model(r, alpha)
            trainable_by_r[r] = _count_trainable(probe)
            del probe
            _release_unused_memory()
    valid_configs = [(r, alpha) for r, alpha in grid
                     if trainable_by_r[r] <= MAX_TRAINABLE_PARAMS]
    if not valid_configs:
        raise RuntimeError(
            f"No LoRA configuration in the grid has at most {MAX_TRAINABLE_PARAMS} trainable parameters."
        )

    # 2) Train & evaluate each valid config, keeping only the best trainer.
    best_trainer, best_acc = None, float("-inf")
    for r, alpha in valid_configs:
        print(f"→ Training r={r}, α={alpha}")
        trainer, record = _train_and_evaluate(r, alpha)
        results.append(record)

        # Strict ">" keeps the first configuration on ties, matching the
        # stable descending sort used for the results table below.
        if record["accuracy"] > best_acc:
            best_acc = record["accuracy"]
            best_r, best_alpha, best_trainer = r, alpha, trainer
            trained_trainers.clear()
            trained_trainers[(r, alpha)] = trainer
        del trainer
        _release_unused_memory()

    # 3) Aggregate results (best first) and expose the selected model.
    df = (
        pd.DataFrame(results)
        .sort_values("accuracy", ascending=False, kind="stable")
        .reset_index(drop=True)
    )
    model = best_trainer.model  # final `model`

else:
    # Default single run: r=1, alpha=32
    best_r, best_alpha = 1, 32
    print("Default run parameters:")
    best_trainer, record = _train_and_evaluate(best_r, best_alpha)

    results = [record]
    df = pd.DataFrame(results)
    model = best_trainer.model

# 4) Summary
print("Sweep/Default results:")
print(df)
print(f"\nSelected best → r={best_r}, α={best_alpha}, acc={df.loc[0,'accuracy']:.4f}")
# `model` and `best_trainer` now hold your final LoRA‑adapted, trained model


In [None]:
# -----------------------------
# 6.2 Visualization
# -----------------------------
import matplotlib.pyplot as plt

# Log records collected by the selected trainer during training/evaluation.
history = best_trainer.state.log_history

# Training-loss records carry "loss" but not "eval_loss". Steps and losses are
# read from the same filtered records so the two lists always line up.
train_entries = [
    e for e in history
    if "loss" in e and "eval_loss" not in e and "step" in e
]
train_steps = [e["step"] for e in train_entries]
train_loss  = [e["loss"] for e in train_entries]

# Evaluation records carry "eval_loss"; records without a step are skipped
# instead of raising a KeyError.
eval_entries = [e for e in history if "eval_loss" in e and "step" in e]
eval_steps = [e["step"] for e in eval_entries]
eval_loss  = [e["eval_loss"] for e in eval_entries]

plt.figure()
plt.plot(train_steps, train_loss, label="train_loss")
plt.plot(eval_steps,  eval_loss,  label="eval_loss")
plt.xlabel("Step")
plt.ylabel("Loss")
plt.title("Training vs. Validation Loss")
plt.legend()
plt.show()

# Accuracy of every trained configuration, laid out as r (rows) x alpha (columns).
pivot = df.pivot(index="r", columns="alpha", values="accuracy")

plt.figure()
plt.imshow(pivot, aspect="auto")
plt.xticks(range(len(pivot.columns)), pivot.columns)
plt.yticks(range(len(pivot.index)),   pivot.index)
plt.xlabel("alpha")
plt.ylabel("r")
plt.title("Sweep Accuracy Heatmap")
plt.colorbar(label="accuracy")
plt.show()

In [None]:
# -----------------------------
# 7. Final evaluation of the selected trainer
# -----------------------------
# Evaluate with the trainer's own evaluation dataset and report the accuracy.
eval_results = best_trainer.evaluate()
final_accuracy = eval_results["eval_accuracy"]
print("Final Evaluation Accuracy:", final_accuracy)

In [None]:
# -----------------------------
# 8. Check trainable parameter count
# -----------------------------
# Add up the element counts of all parameters that still require gradients.
trainable_params = 0
for param in model.parameters():
    if param.requires_grad:
        trainable_params += param.numel()
print(f"Trainable parameters: {trainable_params}")

In [None]:
# -----------------------------
# 9. Predict labels for the unlabelled competition test set
# -----------------------------
from datasets import Dataset
from torch.utils.data import DataLoader

pickle_path = os.path.join(
    deep_learning_spring_2025_project_2_path,
    'test_unlabelled.pkl'
)
# SECURITY NOTE: pickle.load can execute arbitrary code while loading; only
# use it on files from a trusted source (here: the competition download).
with open(pickle_path, "rb") as f:
    raw_test_data = pickle.load(f)

# Rebuild a Dataset that contains only the "text" column.
# NOTE(review): presumably the pickle already holds a Dataset-like object
# exposing a "text" column — confirm against the competition files.
test_dataset = Dataset.from_dict({"text": raw_test_data["text"]})

def preprocess_function(examples):
    """Tokenize the "text" field with the same fixed-length settings used for training."""
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

# Apply tokenizer
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

# Create PyTorch DataLoader for batching
test_dataloader = DataLoader(tokenized_test_dataset, batch_size=64)

# Prediction loop
# Make sure the model lives on the same device the batches are moved to.
model.to(device)
model.eval()
all_predictions = []

with torch.no_grad():
    for batch in test_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        preds = torch.argmax(outputs.logits, dim=-1)
        # Store plain Python ints rather than numpy scalars.
        all_predictions.extend(preds.cpu().tolist())
print("First 10 predictions:", all_predictions[:10])

In [None]:
# -----------------------------
# 10. Save predictions to CSV
# -----------------------------
# Use a dedicated name instead of rebinding `df`, so the sweep results table
# stays available and the visualization cell can still be re-run afterwards.
submission_df = pd.DataFrame({
    "ID": list(range(len(all_predictions))),   # sequential row IDs starting at 0
    "label": all_predictions
})
submission_df.to_csv("submission.csv", index=False)
print("✅ Batched predictions complete. Saved to submission.csv.")