In [None]:
!pip install torchinfo

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl.metadata (21 kB)
Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0


In [None]:
import os
import torch
from torch import nn
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import sklearn
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd
from torch.optim import AdamW
from torchinfo import summary
import numpy as np

In [None]:
def load_news_data(data_file):
    """Load the news-category dataset and convert it to model-ready lists.

    Reads a JSON-lines file, folds the "THE WORLDPOST" category into
    "WORLDPOST", lower-cases the headline and short description, joins them
    into a single text field and integer-encodes the category.

    Args:
        data_file: Path to a JSON-lines file containing ``category``,
            ``headline`` and ``short_description`` columns.

    Returns:
        A tuple ``(texts, labels, class_names)``: the combined lower-cased
        texts, the integer label of every row, and the list of category
        names taken from the fitted encoder's ``classes_``.
    """
    df = pd.read_json(data_file, lines=True)

    # Fold the "THE WORLDPOST" spelling into "WORLDPOST" so both share one label.
    df['category'] = df['category'].replace({"THE WORLDPOST": "WORLDPOST"})

    # str() first so non-string values (e.g. missing entries) do not break .lower().
    for column in ('headline', 'short_description'):
        df[column] = df[column].apply(lambda x: str(x).lower())

    df['text'] = df['headline'] + " " + df['short_description']
    encoder = LabelEncoder()
    df['label'] = encoder.fit_transform(df['category'])
    print(f"The dataset contains {df['category'].nunique()} unique categories.")

    return df['text'].tolist(), df['label'].tolist(), encoder.classes_.tolist()

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the dataset file can be read from it.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the JSON-lines dataset on the mounted Drive.
data_file = "/content/drive/MyDrive/News_Category_Dataset_v2.json"
# texts: combined headline + description strings; labels: integer class ids;
# label_names: category names as returned by load_news_data.
texts, labels, label_names = load_news_data(data_file)

The dataset contains 40 unique categories.


In [None]:
# Show which integer id corresponds to each category name.
for idx, name in enumerate(label_names):
    print(f"{idx} → {name}")

0 → ARTS
1 → ARTS & CULTURE
2 → BLACK VOICES
3 → BUSINESS
4 → COLLEGE
5 → COMEDY
6 → CRIME
7 → CULTURE & ARTS
8 → DIVORCE
9 → EDUCATION
10 → ENTERTAINMENT
11 → ENVIRONMENT
12 → FIFTY
13 → FOOD & DRINK
14 → GOOD NEWS
15 → GREEN
16 → HEALTHY LIVING
17 → HOME & LIVING
18 → IMPACT
19 → LATINO VOICES
20 → MEDIA
21 → MONEY
22 → PARENTING
23 → PARENTS
24 → POLITICS
25 → QUEER VOICES
26 → RELIGION
27 → SCIENCE
28 → SPORTS
29 → STYLE
30 → STYLE & BEAUTY
31 → TASTE
32 → TECH
33 → TRAVEL
34 → WEDDINGS
35 → WEIRD NEWS
36 → WELLNESS
37 → WOMEN
38 → WORLD NEWS
39 → WORLDPOST


In [None]:
# Set up parameters
bert_model_name = 'prajjwal1/bert-tiny'  # Hugging Face checkpoint used for both tokenizer and classifier
num_classes = len(label_names)  # one output class per category name
max_length = 256  # token length passed to TextClassificationDataset for padding/truncation
batch_size = 32  # batch size of the training and validation DataLoaders
num_epochs = 10  # number of training epochs
learning_rate = 3e-5  # learning rate handed to AdamW

In [None]:
class TextClassificationDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on demand.

    Every item is a dict holding ``input_ids`` and ``attention_mask``
    (both flattened to 1-D) plus ``label`` as a tensor.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        # Only references are stored; tokenization happens in __getitem__.
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        raw_text, raw_label = self.texts[idx], self.labels[idx]
        tokenized = self.tokenizer(
            raw_text,
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        # The tokenizer output is flattened so each field is a 1-D tensor.
        sample = {field: tokenized[field].flatten() for field in ('input_ids', 'attention_mask')}
        sample['label'] = torch.tensor(raw_label)
        return sample

In [None]:
def train(model, dataloader, optimizer, scheduler, device):
    """Run one training epoch and report the mean batch loss.

    Args:
        model: Module whose forward accepts ``input_ids`` and
            ``attention_mask`` keyword arguments and returns an object with
            a ``logits`` attribute.
        dataloader: Iterable of batches; each batch is a dict with
            ``input_ids``, ``attention_mask`` and ``label`` tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        The average training loss over the batches seen in this epoch
        (``nan`` when the dataloader yields no batches).
    """
    model.train()
    # Build the criterion once rather than once per batch.
    criterion = nn.CrossEntropyLoss()
    total_loss = 0.0
    num_batches = 0

    for batch in tqdm(dataloader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        # Labels are intentionally not passed to the model; the loss is computed here.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)

        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        num_batches += 1

    # Count batches explicitly so an empty loader does not raise ZeroDivisionError.
    avg_loss = total_loss / num_batches if num_batches else float("nan")
    print(f"Average training loss: {avg_loss:.4f}")
    return avg_loss


In [None]:
def evaluate(model, data_loader, device):
    """Score the model on every batch of ``data_loader`` without gradients.

    Args:
        model: Module returning an object with a ``logits`` attribute.
        data_loader: Iterable of dict batches with ``input_ids``,
            ``attention_mask`` and ``label`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        ``(accuracy, macro_f1, weighted_f1)`` over all examples seen.
    """
    model.eval()
    predictions, actual_labels = [], []

    with torch.no_grad():
        for batch in data_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['label'].to(device)

            result = model(input_ids=ids, attention_mask=mask)
            # The position of the largest logit in each row is the predicted class id.
            batch_preds = torch.max(result.logits, dim=1).indices

            predictions += batch_preds.cpu().tolist()
            actual_labels += targets.cpu().tolist()

    return (
        accuracy_score(actual_labels, predictions),
        f1_score(actual_labels, predictions, average='macro'),
        f1_score(actual_labels, predictions, average='weighted'),
    )

In [None]:
def predict_news_category(text, model, tokenizer, device, encoder, max_length=128):
    """Return the predicted category name for a single piece of text.

    Args:
        text: The text to classify.
        model: Module returning an object with a ``logits`` attribute.
        tokenizer: Callable producing ``input_ids`` and ``attention_mask``.
        device: Device the encoded tensors are moved to.
        encoder: Object whose ``inverse_transform`` maps class ids to names.
        max_length: Length the text is padded/truncated to.

    Returns:
        The first (and only) decoded category.
    """
    model.eval()
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=max_length)

    with torch.no_grad():
        result = model(
            input_ids=encoded["input_ids"].to(device),
            attention_mask=encoded["attention_mask"].to(device),
        )
        best_index = torch.max(result.logits, dim=1).indices

    # Decode the class id back to its name and unwrap the single-element result.
    decoded = encoder.inverse_transform(best_index.cpu().numpy())
    return decoded[0]


In [None]:
# Stratify on the labels so each category keeps the same proportion in the
# training and validation splits (the categories are far from balanced).
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)

In [None]:
# Tokenizer loaded from the same checkpoint name as the model.
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
# The datasets tokenize each example on access, padding/truncating to max_length.
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)
# Only the training loader shuffles; the validation loader keeps dataset order.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

In [None]:
from transformers import AutoModelForSequenceClassification
from peft import LoraConfig, get_peft_model, TaskType

# Prefer the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained encoder with a sequence-classification head sized to num_classes.
base_model = AutoModelForSequenceClassification.from_pretrained(
    bert_model_name,
    num_labels=num_classes
)
# LoRA adapter settings for sequence classification: rank 8, alpha 32, no
# adapter dropout. bias="lora_only" leaves the biases of the LoRA-wrapped
# layers trainable (see the requires_grad listing printed further down).
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=8,
    lora_alpha=32,
    lora_dropout=0.0,
    bias="lora_only"
)

# Wrap the base model with the adapter and move it to the selected device.
model = get_peft_model(base_model, lora_config).to(device)

print(f"Using device: {device}")

pytorch_model.bin:   0%|          | 0.00/17.8M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda


In [None]:
optimizer = AdamW(model.parameters(), lr=learning_rate)
# train() steps the scheduler once per batch, so the schedule spans
# (batches per epoch) * (number of epochs) steps.
total_steps = len(train_dataloader) * num_epochs
# Linear learning-rate schedule with no warmup steps.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [None]:
# Dummy batch used only to trace the model for the summary. It gets its own
# name so the training hyperparameter `batch_size` is not overwritten by this
# inspection cell; `device` is reused from the model-setup cell.
summary_batch_size = 16
seq_len = 128

# Create the tensors directly on the target device instead of copying them over.
input_data = {
    "input_ids": torch.zeros((summary_batch_size, seq_len), dtype=torch.long, device=device),
    "attention_mask": torch.ones((summary_batch_size, seq_len), dtype=torch.long, device=device),
}

summary(model, input_data=input_data)

Layer (type:depth-idx)                                                      Output Shape              Param #
PeftModelForSequenceClassification                                          [16, 40]                  --
├─LoraModel: 1-1                                                            [16, 40]                  --
│    └─BertForSequenceClassification: 2-1                                   --                        --
│    │    └─BertModel: 3-1                                                  [16, 128]                 4,394,112
│    │    └─Dropout: 3-2                                                    [16, 128]                 --
│    │    └─ModulesToSaveWrapper: 3-3                                       [16, 40]                  10,320
Total params: 4,404,432
Trainable params: 13,864
Non-trainable params: 4,390,568
Total mult-adds (Units.MEGABYTES): 69.41
Input size (MB): 0.03
Forward/backward pass size (MB): 61.49
Params size (MB): 17.60
Estimated Total Size (MB): 79.12

In [None]:
# Sanity check: pull one batch and list the fields the dataset produces.
batch = next(iter(train_dataloader))
print(batch.keys())


dict_keys(['input_ids', 'attention_mask', 'label'])


In [None]:
# List every bias parameter with its requires_grad flag to see which biases
# are trainable after wrapping the model with the LoRA config.
for name, param in model.named_parameters():
    if "bias" in name:
        print(f"{name}: requires_grad={param.requires_grad}")

base_model.model.bert.embeddings.LayerNorm.bias: requires_grad=False
base_model.model.bert.encoder.layer.0.attention.self.query.base_layer.bias: requires_grad=True
base_model.model.bert.encoder.layer.0.attention.self.key.bias: requires_grad=False
base_model.model.bert.encoder.layer.0.attention.self.value.base_layer.bias: requires_grad=True
base_model.model.bert.encoder.layer.0.attention.output.dense.bias: requires_grad=False
base_model.model.bert.encoder.layer.0.attention.output.LayerNorm.bias: requires_grad=False
base_model.model.bert.encoder.layer.0.intermediate.dense.bias: requires_grad=False
base_model.model.bert.encoder.layer.0.output.dense.bias: requires_grad=False
base_model.model.bert.encoder.layer.0.output.LayerNorm.bias: requires_grad=False
base_model.model.bert.encoder.layer.1.attention.self.query.base_layer.bias: requires_grad=True
base_model.model.bert.encoder.layer.1.attention.self.key.bias: requires_grad=False
base_model.model.bert.encoder.layer.1.attention.self.value.ba

In [None]:
from itertools import product
def quick_grid_search_lora(train_dataloader, val_dataloader, bert_model_name, num_classes, device, num_epochs=1,
                           r_list=(4, 8), alpha_list=(16, 32), dropout_list=(0.0, 0.1), lr_list=(2e-5, 3e-5)):
    """Grid-search LoRA hyperparameters and return the best configuration.

    For every combination of rank, alpha, dropout and learning rate a fresh
    model is built, trained for ``num_epochs`` epochs and scored on the
    validation loader; the combination with the highest macro F1 wins.

    Args:
        train_dataloader: Loader passed to ``train`` for each trial.
        val_dataloader: Loader passed to ``evaluate`` after each trial.
        bert_model_name: Checkpoint name used to build each trial's model.
        num_classes: Number of output labels of the classifier.
        device: Device each trial's model is moved to.
        num_epochs: Training epochs per trial.
        r_list: LoRA ranks to try.
        alpha_list: ``lora_alpha`` values to try.
        dropout_list: ``lora_dropout`` values to try.
        lr_list: Learning rates to try.

    Returns:
        A dict with keys ``r``, ``lora_alpha``, ``lora_dropout`` and
        ``learning_rate`` for the best trial, or ``None`` when the search
        space is empty.
    """
    best_config = None
    # Start below any achievable score so the first trial is always recorded,
    # even when its macro F1 is exactly 0.
    best_macro_f1 = float("-inf")

    search_space = list(product(r_list, alpha_list, dropout_list, lr_list))

    for i, (r, alpha, dropout, lr) in enumerate(search_space):
        print(f"\n[{i+1}/{len(search_space)}] Testing: r={r}, alpha={alpha}, dropout={dropout}, lr={lr}")

        lora_config = LoraConfig(
            task_type=TaskType.SEQ_CLS,
            r=r,
            lora_alpha=alpha,
            lora_dropout=dropout,
            bias="lora_only"
        )

        # Each trial starts from a freshly loaded base model.
        base_model = AutoModelForSequenceClassification.from_pretrained(
            bert_model_name,
            num_labels=num_classes
        )
        model = get_peft_model(base_model, lora_config).to(device)

        optimizer = AdamW(model.parameters(), lr=lr)
        # The scheduler is stepped per batch inside train().
        total_steps = len(train_dataloader) * num_epochs
        scheduler = get_linear_schedule_with_warmup(optimizer, 0, total_steps)

        for _ in range(num_epochs):
            train(model, train_dataloader, optimizer, scheduler, device)

        _, macro_f1, _ = evaluate(model, val_dataloader, device)
        print(f"Macro F1: {macro_f1:.4f}")

        if macro_f1 > best_macro_f1:
            best_macro_f1 = macro_f1
            best_config = {
                "r": r,
                "lora_alpha": alpha,
                "lora_dropout": dropout,
                "learning_rate": lr
            }

    print("\n best config:", best_config)
    print(f"best Macro F1: {best_macro_f1:.4f}")
    return best_config


In [None]:
# Run the LoRA hyperparameter sweep with 2 training epochs per configuration;
# the returned dict is the configuration with the best validation macro F1.
best_lora_config = quick_grid_search_lora(
    train_dataloader,
    val_dataloader,
    bert_model_name=bert_model_name,
    num_classes=num_classes,
    device=device,
    num_epochs=2
)


[1/16] Testing: r=4, alpha=16, dropout=0.0, lr=2e-05


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training: 100%|██████████| 5022/5022 [02:58<00:00, 28.14it/s]


Average training loss: 3.2305


Training: 100%|██████████| 5022/5022 [02:58<00:00, 28.09it/s]


Average training loss: 2.9941


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Macro F1: 0.0290

[2/16] Testing: r=4, alpha=16, dropout=0.0, lr=3e-05


Training: 100%|██████████| 5022/5022 [02:58<00:00, 28.09it/s]


Average training loss: 3.1299


Training: 100%|██████████| 5022/5022 [02:59<00:00, 28.05it/s]


Average training loss: 2.7678


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Macro F1: 0.0571

[3/16] Testing: r=4, alpha=16, dropout=0.1, lr=2e-05


Training: 100%|██████████| 5022/5022 [02:59<00:00, 27.92it/s]


Average training loss: 3.2267


Training: 100%|██████████| 5022/5022 [02:59<00:00, 27.90it/s]


Average training loss: 2.9452


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Macro F1: 0.0309

[4/16] Testing: r=4, alpha=16, dropout=0.1, lr=3e-05


Training: 100%|██████████| 5022/5022 [02:59<00:00, 27.99it/s]


Average training loss: 3.1525


Training: 100%|██████████| 5022/5022 [02:59<00:00, 27.93it/s]


Average training loss: 2.8159


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Macro F1: 0.0482

[5/16] Testing: r=4, alpha=32, dropout=0.0, lr=2e-05


Training: 100%|██████████| 5022/5022 [02:58<00:00, 28.07it/s]


Average training loss: 3.2324


Training: 100%|██████████| 5022/5022 [02:58<00:00, 28.16it/s]


Average training loss: 2.9748


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Macro F1: 0.0286

[6/16] Testing: r=4, alpha=32, dropout=0.0, lr=3e-05


Training: 100%|██████████| 5022/5022 [02:58<00:00, 28.20it/s]


Average training loss: 3.1117


Training: 100%|██████████| 5022/5022 [02:58<00:00, 28.06it/s]


Average training loss: 2.7577
Macro F1: 0.0558

[7/16] Testing: r=4, alpha=32, dropout=0.1, lr=2e-05


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training: 100%|██████████| 5022/5022 [02:59<00:00, 28.00it/s]


Average training loss: 3.1891


Training: 100%|██████████| 5022/5022 [02:59<00:00, 27.95it/s]


Average training loss: 2.9052


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Macro F1: 0.0308

[8/16] Testing: r=4, alpha=32, dropout=0.1, lr=3e-05


Training: 100%|██████████| 5022/5022 [02:59<00:00, 27.97it/s]


Average training loss: 3.0839


Training: 100%|██████████| 5022/5022 [02:59<00:00, 27.95it/s]


Average training loss: 2.7660


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Macro F1: 0.0397

[9/16] Testing: r=8, alpha=16, dropout=0.0, lr=2e-05


Training: 100%|██████████| 5022/5022 [02:57<00:00, 28.25it/s]


Average training loss: 3.1963


Training: 100%|██████████| 5022/5022 [02:57<00:00, 28.24it/s]


Average training loss: 2.9248


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Macro F1: 0.0311

[10/16] Testing: r=8, alpha=16, dropout=0.0, lr=3e-05


Training: 100%|██████████| 5022/5022 [02:57<00:00, 28.26it/s]


Average training loss: 3.1015


Training: 100%|██████████| 5022/5022 [02:57<00:00, 28.25it/s]


Average training loss: 2.7443


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Macro F1: 0.0558

[11/16] Testing: r=8, alpha=16, dropout=0.1, lr=2e-05


Training: 100%|██████████| 5022/5022 [02:59<00:00, 27.98it/s]


Average training loss: 3.2183


Training: 100%|██████████| 5022/5022 [02:58<00:00, 28.06it/s]


Average training loss: 2.9846


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Macro F1: 0.0293

[12/16] Testing: r=8, alpha=16, dropout=0.1, lr=3e-05


Training: 100%|██████████| 5022/5022 [02:59<00:00, 28.01it/s]


Average training loss: 3.1275


Training: 100%|██████████| 5022/5022 [03:00<00:00, 27.81it/s]


Average training loss: 2.7758


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Macro F1: 0.0548

[13/16] Testing: r=8, alpha=32, dropout=0.0, lr=2e-05


Training: 100%|██████████| 5022/5022 [02:59<00:00, 27.96it/s]


Average training loss: 3.2176


Training: 100%|██████████| 5022/5022 [02:58<00:00, 28.08it/s]


Average training loss: 2.9282


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Macro F1: 0.0309

[14/16] Testing: r=8, alpha=32, dropout=0.0, lr=3e-05


Training: 100%|██████████| 5022/5022 [02:58<00:00, 28.12it/s]


Average training loss: 3.0665


Training: 100%|██████████| 5022/5022 [02:58<00:00, 28.20it/s]


Average training loss: 2.6841


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Macro F1: 0.0728

[15/16] Testing: r=8, alpha=32, dropout=0.1, lr=2e-05


Training: 100%|██████████| 5022/5022 [02:59<00:00, 28.03it/s]


Average training loss: 3.2070


Training: 100%|██████████| 5022/5022 [02:59<00:00, 28.02it/s]


Average training loss: 2.9247


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Macro F1: 0.0290

[16/16] Testing: r=8, alpha=32, dropout=0.1, lr=3e-05


Training: 100%|██████████| 5022/5022 [02:59<00:00, 28.01it/s]


Average training loss: 3.0800


Training: 100%|██████████| 5022/5022 [02:58<00:00, 28.06it/s]


Average training loss: 2.6973
Macro F1: 0.0558

✅ 最佳配置: {'r': 8, 'lora_alpha': 32, 'lora_dropout': 0.0, 'learning_rate': 3e-05}
🎯 最佳 Macro F1: 0.0728


In [None]:
# Train the notebook-level model for num_epochs epochs, validating after each one.
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(model, train_dataloader, optimizer, scheduler, device)

    # Validation metrics for the epoch that just finished.
    accuracy, macro_f1, weighted_f1 = evaluate(model, val_dataloader, device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(f"Macro F1: {macro_f1:.4f}")
    print(f"Weighted F1: {weighted_f1:.4f}")

Epoch 1/10


Training:   4%|▍         | 223/5022 [00:08<02:53, 27.68it/s]

model.safetensors:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

Training: 100%|██████████| 5022/5022 [02:57<00:00, 28.29it/s]


Average training loss: 3.0357
Validation Accuracy: 0.3287
Macro F1: 0.0642
Weighted F1: 0.2070
Epoch 2/10


Training: 100%|██████████| 5022/5022 [02:55<00:00, 28.54it/s]


Average training loss: 2.4968
Validation Accuracy: 0.3990
Macro F1: 0.1156
Weighted F1: 0.2924
Epoch 3/10


Training: 100%|██████████| 5022/5022 [02:56<00:00, 28.45it/s]


Average training loss: 2.2663
Validation Accuracy: 0.4344
Macro F1: 0.1585
Weighted F1: 0.3421
Epoch 4/10


Training: 100%|██████████| 5022/5022 [02:56<00:00, 28.42it/s]


Average training loss: 2.1589
Validation Accuracy: 0.4568
Macro F1: 0.1922
Weighted F1: 0.3748
Epoch 5/10


Training: 100%|██████████| 5022/5022 [02:56<00:00, 28.45it/s]


Average training loss: 2.0956
Validation Accuracy: 0.4712
Macro F1: 0.2117
Weighted F1: 0.3945
Epoch 6/10


Training: 100%|██████████| 5022/5022 [02:56<00:00, 28.46it/s]


Average training loss: 2.0547
Validation Accuracy: 0.4785
Macro F1: 0.2223
Weighted F1: 0.4046
Epoch 7/10


Training: 100%|██████████| 5022/5022 [02:56<00:00, 28.50it/s]


Average training loss: 2.0281
Validation Accuracy: 0.4836
Macro F1: 0.2305
Weighted F1: 0.4119
Epoch 8/10


Training: 100%|██████████| 5022/5022 [02:56<00:00, 28.43it/s]


Average training loss: 2.0117
Validation Accuracy: 0.4864
Macro F1: 0.2360
Weighted F1: 0.4162
Epoch 9/10


Training: 100%|██████████| 5022/5022 [02:56<00:00, 28.52it/s]


Average training loss: 1.9980
Validation Accuracy: 0.4874
Macro F1: 0.2380
Weighted F1: 0.4180
Epoch 10/10


Training: 100%|██████████| 5022/5022 [02:56<00:00, 28.51it/s]


Average training loss: 1.9921
Validation Accuracy: 0.4884
Macro F1: 0.2390
Weighted F1: 0.4189


In [None]:
# Persist the trained model's state_dict to a local file.
torch.save(model.state_dict(), "bert_classifier.pth")

In [None]:
# Rebuild the label decoder from the class names already returned by the first
# load_news_data call. Re-reading the whole dataset here is unnecessary and
# would overwrite `texts` and `labels`.
encoder = LabelEncoder()
encoder.classes_ = np.array(label_names)

test_text = "NASA launches new space telescope to explore exoplanets."
# Tokenize with the same max_length that was used for the training data.
predicted_category = predict_news_category(
    test_text, model, tokenizer, device, encoder, max_length=max_length
)

print(f"Headline: {test_text}")
print(f"Predicted Category: {predicted_category}")

The dataset contains 40 unique categories.
Headline: NASA launches new space telescope to explore exoplanets.
Predicted Category: SCIENCE


In [None]:

from transformers import AutoModelForSequenceClassification
from peft import LoraConfig, get_peft_model, TaskType
from copy import deepcopy

best_model = None
# Start below any achievable accuracy so the first trial is always kept and
# best_model cannot remain None after the sweep.
best_accuracy = float("-inf")
best_config = None

# Sweep rank, alpha and dropout; each trial trains a fresh model for num_epochs
# epochs. Trial-local names are used so the notebook-level `model`,
# `optimizer` and `scheduler` are not replaced by the last (not best) trial.
for r in [4, 8, 16]:
    for alpha in [16, 32]:
        for dropout in [0.0, 0.1]:
            print(f"Trying LoRA config: r={r}, alpha={alpha}, dropout={dropout}")
            trial_base_model = AutoModelForSequenceClassification.from_pretrained(
                bert_model_name, num_labels=num_classes
            )
            trial_config = LoraConfig(
                task_type=TaskType.SEQ_CLS,
                r=r,
                lora_alpha=alpha,
                lora_dropout=dropout,
                bias="lora_only"
            )
            trial_model = get_peft_model(trial_base_model, trial_config).to(device)

            trial_optimizer = AdamW(trial_model.parameters(), lr=learning_rate)
            # train() steps the scheduler once per batch.
            trial_scheduler = get_linear_schedule_with_warmup(
                trial_optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader)*num_epochs
            )

            for epoch in range(num_epochs):
                print(f"Epoch {epoch + 1}/{num_epochs}")
                train(trial_model, train_dataloader, trial_optimizer, trial_scheduler, device)

            acc, macro_f1, weighted_f1 = evaluate(trial_model, val_dataloader, device)
            print(f"Validation Accuracy: {acc:.4f} | F1: {macro_f1:.4f}")

            if acc > best_accuracy:
                best_accuracy = acc
                # Every trial builds brand-new config and model objects, so
                # keeping a reference is enough; a deepcopy would only
                # duplicate the weights in memory.
                best_config = trial_config
                best_model = trial_model


In [None]:

# Save the best model from the sweep and the tokenizer into the same directory.
best_model.save_pretrained("best_lora_model")
tokenizer.save_pretrained("best_lora_model")


In [None]:

from transformers import AutoModelForSequenceClassification
from peft import PeftModel

# Load the base model
base_model = AutoModelForSequenceClassification.from_pretrained(bert_model_name, num_labels=num_classes)

# Load the model trained with the best LoRA configuration
model = PeftModel.from_pretrained(base_model, "best_lora_model").to(device)
print("Loaded model with best LoRA configuration.")
