In [1]:
# Clone the project repository, then switch the working directory into the checkout
# (presumably so later cells can import its `model`, `train` and `inference` modules).
!git clone https://github.com/timothycao/agnews-classifier.git
%cd agnews-classifier

Cloning into 'agnews-classifier'...
remote: Enumerating objects: 37, done.
remote: Counting objects: 100% (37/37), done.
remote: Compressing objects: 100% (26/26), done.
remote: Total 37 (delta 17), reused 28 (delta 9), pack-reused 0 (from 0)
Receiving objects: 100% (37/37), 10.68 KiB | 3.56 MiB/s, done.
Resolving deltas: 100% (17/17), done.
/kaggle/working/agnews-classifier


In [16]:
# Model

from peft import LoraConfig
from model import create_lora_model

# LoRA adapter settings for the sequence-classification task.
lora_config = LoraConfig(
    task_type='SEQ_CLS',
    target_modules=['query', 'key', 'value', 'attention.output.dense'],
    r=1,
    lora_alpha=3,
    # Scaling (a normalization technique): lora_alpha/r for regular LoRA,
    # lora_alpha/sqrt(r) for rs-LoRA. Regular LoRA is selected here.
    use_rslora=False,
    lora_dropout=0.05,
    bias='none',
)

# Build the model from the config with the helper imported from the local `model` module.
model = create_lora_model(lora_config)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 667,396 || all params: 125,316,104 || trainable%: 0.5326


In [17]:
# Display the repr of the object returned by create_lora_model (its module tree).
model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSdpaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): Mo

In [18]:
# Training

from transformers import TrainingArguments
from train import main as train

training_args = TrainingArguments(
    # Output location for the run
    output_dir='/kaggle/working/saved_models',

    # Optimisation: optimizer, learning-rate schedule, step budget, batch sizes
    optim='adamw_torch',
    learning_rate=5e-5,
    lr_scheduler_type='linear',
    max_steps=3000,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,

    # Logging, evaluation and checkpointing all use the same 100-step cadence
    logging_strategy='steps',
    logging_steps=100,
    eval_strategy='steps',
    eval_steps=100,
    save_strategy='steps',
    save_steps=100,

    # Miscellaneous
    report_to='none',
    dataloader_num_workers=4,
    # Gradient checkpointing is switched off; the kwargs are kept alongside the flag.
    gradient_checkpointing=False,
    gradient_checkpointing_kwargs={'use_reentrant': True},
)

# Hand the model and the arguments to the training entry point from the local `train` module.
train(model, training_args)

Starting training...


Step,Training Loss,Validation Loss,Accuracy
100,1.3642,1.324727,0.760938
200,1.1235,0.557416,0.889062
300,0.4291,0.35355,0.890625
400,0.3035,0.350469,0.889062
500,0.3234,0.32866,0.904687
600,0.314,0.323941,0.901563
700,0.3104,0.328202,0.903125
800,0.3342,0.312822,0.90625
900,0.3049,0.311037,0.903125
1000,0.2978,0.309046,0.903125


In [21]:
# Inference

from inference import main as inference

# Checkpoint directory (step 2700) to run predictions with.
checkpoint = '/kaggle/working/saved_models/checkpoint-2700'
# Unlabelled test data, a pickle file under /kaggle/input.
data_path = '/kaggle/input/deep-learning-spring-2025-project-2/test_unlabelled.pkl'
# Directory handed to the inference entry point for its predictions.
output_dir = '/kaggle/working/saved_predictions'

# Arguments stay positional: (data_path, checkpoint, output_dir).
inference(data_path, checkpoint, output_dir)

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Running inference...


100%|██████████| 250/250 [01:00<00:00,  4.14it/s]

Predictions saved to /kaggle/working/saved_predictions/predictions_checkpoint-2700.csv





In [19]:
import pandas as pd

# Processed training log stored inside the step-3000 checkpoint directory.
csv_path = '/kaggle/working/saved_models/checkpoint-3000/processed_log_history.csv'

# The CSV's first column is the row index written when the file was saved; without
# index_col it is read back as a spurious 'Unnamed: 0' data column. Use it as the
# DataFrame index instead so only the real metric columns remain.
df = pd.read_csv(csv_path, index_col=0)
df

Unnamed: 0,Step,Train Loss,Test Loss,Train Acc,Test Acc,Learning Rate,Epochs,Loss Spread,Loss Ratio,Acc Spread,Acc Ratio
0,100,1.3642,1.324727,0.380625,0.760938,4.8e-05,0.013405,0.039473,1.029797,-0.380312,0.500205
1,200,1.1235,0.557416,0.736161,0.889062,4.7e-05,0.02681,0.566084,2.01555,-0.152902,0.828019
2,300,0.4291,0.35355,0.873661,0.890625,4.5e-05,0.040214,0.07555,1.213691,-0.016964,0.980952
3,400,0.3035,0.350469,0.9,0.889062,4.3e-05,0.053619,-0.046969,0.865983,0.010937,1.012302
4,500,0.3234,0.32866,0.888839,0.904687,4.2e-05,0.067024,-0.00526,0.983995,-0.015848,0.982482
5,600,0.314,0.323941,0.899554,0.901563,4e-05,0.080429,-0.009941,0.969312,-0.002009,0.997772
6,700,0.3104,0.328202,0.899107,0.903125,3.8e-05,0.093834,-0.017802,0.945758,-0.004018,0.995551
7,800,0.3342,0.312822,0.8875,0.90625,3.7e-05,0.107239,0.021378,1.068338,-0.01875,0.97931
8,900,0.3049,0.311037,0.900446,0.903125,3.5e-05,0.120643,-0.006137,0.980271,-0.002679,0.997034
9,1000,0.2978,0.309046,0.901786,0.903125,3.3e-05,0.134048,-0.011246,0.963611,-0.001339,0.998517
