In [1]:
!pip install torchinfo

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl.metadata (21 kB)
Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0


In [2]:
import os
import torch
from torch import nn
from tqdm import tqdm
import numpy as np
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup
from torch.ao.quantization.qconfig import float_qparams_weight_only_qconfig
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import sklearn
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd
from torch.optim import AdamW
from torchinfo import summary
from transformers import AutoModelForSequenceClassification
from peft import LoraConfig, get_peft_model, TaskType
import wandb
import torch.nn.utils.prune as prune
import torch.quantization as quantization

# Authenticate this session with Weights & Biases before any run is created.
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mhl6151[0m ([33mhl6151-new-york-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
# preprocessing for the raw data
def load_news_data(data_file):
    """Read the news-category JSON-lines file and turn it into classifier inputs.

    Args:
        data_file: Path of a JSON-lines file whose records provide the
            ``category``, ``headline`` and ``short_description`` fields.

    Returns:
        tuple: ``(texts, labels, class_names)`` where ``texts`` is a list of
        lower-cased "headline short_description" strings, ``labels`` is the list
        of integer ids produced by the label encoder, and ``class_names`` is the
        encoder's list of category names (position == label id).
    """
    frame = pd.read_json(data_file, lines=True)

    # "THE WORLDPOST" and "WORLDPOST" are merged into a single category.
    frame['category'] = frame['category'].map(
        lambda cat: "WORLDPOST" if cat == "THE WORLDPOST" else cat
    )

    # Lower-case both text fields; values are stringified first.
    for column in ('headline', 'short_description'):
        frame[column] = frame[column].apply(lambda value: str(value).lower())

    # The model input is the headline followed by the short description.
    frame['text'] = frame['headline'] + " " + frame['short_description']

    label_encoder = LabelEncoder()
    frame['label'] = label_encoder.fit_transform(frame['category'])

    n_categories = frame['category'].nunique()
    print(f"The dataset contains {n_categories} unique categories.")

    text_list = frame['text'].tolist()
    label_list = frame['label'].tolist()
    class_names = label_encoder.classes_.tolist()
    return text_list, label_list, class_names

In [4]:
# Mount Google Drive under /content/drive; the dataset file is read from there below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Load the dataset from Drive: input texts, integer labels, and the category
# names indexed by label id.
data_file = "/content/drive/MyDrive/News_Category_Dataset_v2.json"
texts, labels, label_names = load_news_data(data_file)

# Print the integer id that was assigned to each category name
for idx, name in enumerate(label_names):
    print(f"{idx} → {name}")

The dataset contains 40 unique categories.
0 → ARTS
1 → ARTS & CULTURE
2 → BLACK VOICES
3 → BUSINESS
4 → COLLEGE
5 → COMEDY
6 → CRIME
7 → CULTURE & ARTS
8 → DIVORCE
9 → EDUCATION
10 → ENTERTAINMENT
11 → ENVIRONMENT
12 → FIFTY
13 → FOOD & DRINK
14 → GOOD NEWS
15 → GREEN
16 → HEALTHY LIVING
17 → HOME & LIVING
18 → IMPACT
19 → LATINO VOICES
20 → MEDIA
21 → MONEY
22 → PARENTING
23 → PARENTS
24 → POLITICS
25 → QUEER VOICES
26 → RELIGION
27 → SCIENCE
28 → SPORTS
29 → STYLE
30 → STYLE & BEAUTY
31 → TASTE
32 → TECH
33 → TRAVEL
34 → WEDDINGS
35 → WEIRD NEWS
36 → WELLNESS
37 → WOMEN
38 → WORLD NEWS
39 → WORLDPOST


In [6]:
# Hyperparameters and model choice for the fine-tuning run
bert_model_name = 'bert-base-uncased'  # checkpoint name used for both tokenizer and model
num_classes = len(label_names)         # one output class per category name
max_length = 256                       # token length each training sample is padded/truncated to
batch_size = 32                        # samples per DataLoader batch
num_epochs = 1                         # passes over the training split
learning_rate = 2e-5                   # initial learning rate given to AdamW

In [7]:
# Start a Weights & Biases run and store the hyperparameters in its config
wandb.init(project="News classification", name="v3", config={
    "Model_name": bert_model_name,
    "Epoch": num_epochs,
    "Batch_size": batch_size,
    "Learning_rate": learning_rate,
    "Max_length": max_length
})

In [8]:
class TextClassificationDataset(Dataset):
    """Map-style dataset that tokenizes one text sample per lookup.

    Tokenization happens lazily inside ``__getitem__``; the tokenizer is asked to
    pad or truncate every sample to ``max_length`` tokens.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        """
        Dataset for text classification tasks.

        Args:
            texts (List[str]): List of input texts.
            labels (List[int]): List of corresponding labels.
            tokenizer: Tokenizer instance (e.g., from HuggingFace Transformers).
            max_length (int): Maximum sequence length after tokenization.

        Raises:
            ValueError: If ``texts`` and ``labels`` do not have the same length.
        """
        # Fail early instead of raising an IndexError at some arbitrary sample
        # (or silently ignoring surplus labels) later during training.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """
        Returns:
            int: Total number of samples in the dataset.
        """
        return len(self.texts)

    def __getitem__(self, idx):
        """
        Retrieves the tokenized representation and label for a given index.

        Args:
            idx (int): Index of the sample.

        Returns:
            dict: Dictionary with 'input_ids', 'attention_mask', and 'label'.
            The first two are flattened to 1D tensors; 'label' is a scalar
            ``torch.long`` tensor.
        """
        text = self.texts[idx]
        label = self.labels[idx]

        # Tokenize the text with padding and truncation
        encoding = self.tokenizer(
            text,
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True
        )

        return {
            # return_tensors='pt' yields a leading batch dimension; flatten drops it
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # Force int64 so the label dtype does not depend on how the label was
            # stored (e.g. a numpy int32 would otherwise produce an int32 tensor).
            'label': torch.tensor(label, dtype=torch.long)
        }

In [9]:
# Without mixed precision
# wandb logging has been added to all of these functions to record training; do one final 10-epoch run at the end
from torch.profiler import profile, record_function, ProfilerActivity

def train_original(model, dataloader, optimizer, scheduler, device, epoch=None):
    """Run one full-precision training epoch while profiling its first steps.

    Each batch is moved to ``device``, passed through ``model`` and optimised with
    cross-entropy loss; the scheduler is stepped once per batch. Per-batch and
    per-epoch losses are logged to Weights & Biases, and three profiler summary
    tables are printed after the loop.

    Args:
        model: Model whose forward call accepts ``input_ids``/``attention_mask``
            and returns an object exposing ``.logits``.
        dataloader: Iterable of batches with 'input_ids', 'attention_mask' and
            'label' entries; must support ``len()``.
        optimizer: Optimizer updating the model parameters.
        scheduler: Learning-rate scheduler, stepped after every batch.
        device (torch.device): Device the batches are moved to.
        epoch (int, optional): Epoch number shown in the progress-bar label.

    Returns:
        float: Mean training loss over the epoch (previously nothing was returned).
    """
    model.train()
    total_loss = 0.0
    # Build the loss module once instead of on every step.
    criterion = nn.CrossEntropyLoss()

    # CUDA activity is only traced when the run actually uses a CUDA device.
    activities = [ProfilerActivity.CPU]
    if device.type == "cuda":
        activities.append(ProfilerActivity.CUDA)

    with profile(
        activities=activities,
        # wait=1, warmup=1, active=2, repeat=1: only the first four steps are
        # involved in profiling; the rest of the epoch is not recorded.
        schedule=torch.profiler.schedule(wait=1, warmup=1, active=2, repeat=1),
        on_trace_ready=torch.profiler.tensorboard_trace_handler("./log_train_profiler"),
        record_shapes=True,
        profile_memory=True,
        with_stack=True
    ) as profiler:

        for batch in tqdm(dataloader, desc=f"Training Epoch {epoch if epoch is not None else ''}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()

            with record_function("forward_pass"):
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                loss = criterion(outputs.logits, labels)

            with record_function("backward_pass"):
                loss.backward()
                optimizer.step()

            scheduler.step()
            profiler.step()  # marks the end of this step for the profiler schedule

            # Read the scalar once; each .item() call copies the value to the host.
            batch_loss = loss.item()
            total_loss += batch_loss
            wandb.log({"train/loss_batch": batch_loss})

    for sort_key in ("cuda_time_total", "self_cuda_memory_usage", "cpu_time_total"):
        print(profiler.key_averages().table(sort_by=sort_key, row_limit=20))

    avg_loss = total_loss / len(dataloader)
    print(f"Average training loss: {avg_loss:.4f}")
    wandb.log({"train/loss_epoch": avg_loss})
    return avg_loss


In [10]:
# use Mixed Precision
from torch.profiler import profile, record_function, ProfilerActivity
from torch.cuda.amp import GradScaler

# Gradient scaler shared by every call to train(); being module-level, it keeps its
# state across epochs.
scaler = torch.amp.GradScaler('cuda')

def train(model, dataloader, optimizer, scheduler, device, epoch=None):
    """Run one mixed-precision training epoch while profiling its first steps.

    Same loop as the full-precision variant, except that the forward pass runs
    under autocast and the backward/optimizer step goes through the module-level
    ``scaler``. Per-batch and per-epoch losses are logged to Weights & Biases and
    three profiler summary tables are printed after the loop.

    Args:
        model: Model whose forward call accepts ``input_ids``/``attention_mask``
            and returns an object exposing ``.logits``.
        dataloader: Iterable of batches with 'input_ids', 'attention_mask' and
            'label' entries; must support ``len()``.
        optimizer: Optimizer updating the model parameters.
        scheduler: Learning-rate scheduler, stepped once per applied optimizer step.
        device (torch.device): Device the batches are moved to.
        epoch (int, optional): Epoch number shown in the progress-bar label.

    Returns:
        float: Mean training loss over the epoch (previously nothing was returned).
    """
    model.train()
    total_loss = 0.0
    # Build the loss module once instead of on every step.
    criterion = nn.CrossEntropyLoss()

    use_cuda = device.type == "cuda"
    activities = [ProfilerActivity.CPU]
    if use_cuda:
        activities.append(ProfilerActivity.CUDA)

    with profile(
        activities=activities,
        # wait=1, warmup=1, active=2, repeat=1: only the first four steps are
        # involved in profiling; the rest of the epoch is not recorded.
        schedule=torch.profiler.schedule(wait=1, warmup=1, active=2, repeat=1),
        on_trace_ready=torch.profiler.tensorboard_trace_handler('./log'),
        record_shapes=True,
        profile_memory=True,
        with_stack=True
    ) as profiler:

        for batch in tqdm(dataloader, desc=f"Training Epoch {epoch if epoch is not None else ''}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()

            # Autocast follows the device actually in use: enabled on CUDA, an
            # explicit no-op elsewhere (instead of requesting 'cuda' unconditionally).
            with torch.amp.autocast("cuda" if use_cuda else "cpu", enabled=use_cuda):
                with record_function("forward_pass"):
                    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                    loss = criterion(outputs.logits, labels)

            # Remember the scale so a skipped optimizer step can be detected below.
            scale_before = scaler.get_scale()

            with record_function("backward_pass"):
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()

            # GradScaler skips optimizer.step() when it finds inf/NaN gradients and
            # then lowers its scale; only advance the LR schedule when the step was
            # really applied so scheduler and optimizer stay in lock-step.
            if scaler.get_scale() >= scale_before:
                scheduler.step()
            profiler.step()

            # Read the scalar once; each .item() call copies the value to the host.
            batch_loss = loss.item()
            total_loss += batch_loss
            wandb.log({"train/loss_batch": batch_loss})

    for sort_key in ("cuda_time_total", "self_cuda_memory_usage", "cpu_time_total"):
        print(profiler.key_averages().table(sort_by=sort_key, row_limit=20))

    avg_loss = total_loss / len(dataloader)
    print(f"Average training loss: {avg_loss:.4f}")
    wandb.log({"train/loss_epoch": avg_loss})
    return avg_loss


In [11]:
# Add wandb to track loss and metrics for evaluation function
def evaluate(model, data_loader, device, epoch=None):
    """
    Evaluate the model on the validation or test set and log metrics using Weights & Biases (wandb).

    Args:
        model (nn.Module): Trained model to evaluate.
        data_loader (DataLoader): DataLoader for validation/test dataset.
        device (torch.device): Device to run evaluation on ('cuda' or 'cpu').
        epoch (int, optional): Current epoch number. When given it is added to
            the logged metrics under the "epoch" key; when omitted, only the
            three metrics are logged.

    Returns:
        Tuple[float, float, float]: accuracy, macro F1 score, weighted F1 score
    """
    model.eval()  # Set model to evaluation mode
    predictions = []
    actual_labels = []

    with torch.no_grad():  # Disable gradient calculation for evaluation
        for batch in data_loader:
            # Only the model inputs need to live on the device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Forward pass
            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

            # Predicted class = index of the largest logit
            predictions.extend(logits.argmax(dim=1).cpu().tolist())

            # Labels are only consumed by the metric functions, so they are read
            # straight from the batch instead of being moved to the device and back.
            actual_labels.extend(batch['label'].tolist())

    # Compute evaluation metrics
    acc = accuracy_score(actual_labels, predictions)
    macro_f1 = f1_score(actual_labels, predictions, average='macro')
    weighted_f1 = f1_score(actual_labels, predictions, average='weighted')

    # Log metrics to Weights & Biases
    metrics = {
        "eval/accuracy": acc,
        "eval/macro_f1": macro_f1,
        "eval/weighted_f1": weighted_f1
    }
    if epoch is not None:
        metrics["epoch"] = epoch
    wandb.log(metrics)

    return acc, macro_f1, weighted_f1


In [27]:
from torch.profiler import profile, record_function, ProfilerActivity, schedule, tensorboard_trace_handler
import time
def predict_news_category(text, model, tokenizer, device, encoder, max_length=128):
    """Classify one text while profiling repeated full-precision forward passes.

    The tokenized input is run through ``model`` seven times under the profiler
    (1 skipped step, 1 warm-up step, 5 recorded steps). Three profiler summary
    tables are printed, and the class predicted by the last pass is returned.

    Args:
        text: Input text to classify.
        model: Model whose forward call accepts ``input_ids``/``attention_mask``
            and returns an object exposing ``.logits``.
        tokenizer: Callable tokenizer returning 'input_ids' and 'attention_mask'.
        device (torch.device): Device the inputs are moved to.
        encoder: Object whose ``inverse_transform`` maps label ids to names.
        max_length (int): Length the tokenizer pads/truncates to.

    Returns:
        The category name for the predicted label.
    """
    model.eval()

    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=max_length
    )
    ids = encoded["input_ids"].to(device)
    mask = encoded["attention_mask"].to(device)

    # CUDA activity is only traced when running on a CUDA device.
    traced_activities = [ProfilerActivity.CPU]
    if device.type == "cuda":
        traced_activities.append(ProfilerActivity.CUDA)

    # Skip the first run, warm up on the second, record the remaining five.
    n_wait, n_warmup, n_active = 1, 1, 5
    n_runs = n_wait + n_warmup + n_active  # 7 forward passes in total

    with profile(
        activities=traced_activities,
        schedule=schedule(wait=n_wait, warmup=n_warmup, active=n_active, repeat=1),
        record_shapes=True,
        profile_memory=True,
        with_stack=True,
        on_trace_ready=tensorboard_trace_handler("./log_predict_base_warmup")
    ) as profiler:
        for i in range(n_runs):
            with torch.no_grad(), record_function(f"model_inference_{i}"):
                logits = model(input_ids=ids, attention_mask=mask).logits
                predicted_label = torch.max(logits, dim=1).indices
            profiler.step()

    for sort_key in ("cuda_time_total", "self_cuda_memory_usage", "cpu_time_total"):
        print(profiler.key_averages().table(sort_by=sort_key, row_limit=20))

    label_ids = predicted_label.cpu().numpy()
    predicted_category = encoder.inverse_transform(label_ids)[0]
    print(f"✅ Warmup profiling done. Use TensorBoard:\n\n  %tensorboard --logdir=./log_predict_base_warmup")
    return predicted_category

In [28]:
from torch.cuda.amp import autocast
from torch.profiler import profile, record_function, ProfilerActivity

def predict_news_category_amp(text, model, tokenizer, device, encoder, max_length=128):
    """
    Predict news category using AMP with torch.profiler to measure performance.

    The tokenized input is run through ``model`` seven times under the profiler
    (1 skipped step, 1 warm-up step, 5 recorded steps), each pass inside an
    autocast region. Three profiler summary tables are printed, and the class
    predicted by the last pass is returned.

    Args:
        text: Input text to classify.
        model: Model whose forward call accepts ``input_ids``/``attention_mask``
            and returns an object exposing ``.logits``.
        tokenizer: Callable tokenizer returning 'input_ids' and 'attention_mask'.
        device (torch.device): Device the inputs are moved to. Autocast is only
            enabled when this is a CUDA device.
        encoder: Object whose ``inverse_transform`` maps label ids to names.
        max_length (int): Length the tokenizer pads/truncates to.

    Returns:
        The category name for the predicted label.
    """

    model.eval()

    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=max_length
    )
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    use_cuda = device.type == "cuda"
    activities = [ProfilerActivity.CPU]
    if use_cuda:
        activities.append(ProfilerActivity.CUDA)

    with profile(
        activities=activities,
        # Referenced through torch.profiler so this function does not depend on a
        # bare `schedule` name imported by a different notebook cell.
        schedule=torch.profiler.schedule(wait=1, warmup=1, active=5, repeat=1),
        record_shapes=True,
        profile_memory=True,
        with_stack=True,
        on_trace_ready=torch.profiler.tensorboard_trace_handler("./log_amp_predict")
    ) as profiler:
        for i in range(7):  # 1 wait + 1 warmup + 5 active steps
            with torch.no_grad():
                with record_function("inference"):
                    # Autocast follows the device actually in use: enabled on
                    # CUDA, an explicit no-op elsewhere.
                    with torch.amp.autocast("cuda" if use_cuda else "cpu", enabled=use_cuda):
                        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                        logits = outputs.logits
                        _, predicted_label = torch.max(logits, dim=1)
            profiler.step()

    for sort_key in ("cuda_time_total", "self_cuda_memory_usage", "cpu_time_total"):
        print(profiler.key_averages().table(sort_by=sort_key, row_limit=20))

    predicted_category = encoder.inverse_transform(predicted_label.cpu().numpy())[0]
    print("✅ Profiling complete. Run: tensorboard --logdir=./log_amp_predict")
    return predicted_category

In [14]:
# Hold out 20% of the samples for validation; the fixed random_state keeps the split reproducible.
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

In [15]:
# Load pre-trained tokenizer for BERT
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

# Create the training and validation datasets (samples are tokenized on access)
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)

# Wrap both datasets in DataLoaders for batching; only the training split is shuffled
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [16]:
# Check the available device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pre-trained model from Hugging Face with a classification head sized for our categories
base_model = AutoModelForSequenceClassification.from_pretrained(
    bert_model_name,
    num_labels=num_classes
)

# LoRA configuration: only a small subset of the parameters is trained
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=8,                          # rank of the low-rank update matrices (not a percentage)
    lora_alpha=32,                # LoRA scaling numerator; the update is scaled by lora_alpha / r
    lora_dropout=0.0,             # dropout on the LoRA branch (disabled here)
    bias="lora_only"              # also train the bias terms of the layers that receive LoRA adapters
)

# Wrap the base model with LoRA adapters and move it to the selected device.
# NOTE(review): PEFT documents that get_peft_model modifies the base model in place, so
# `base_model` is presumably not an adapter-free copy afterwards — confirm before using it as a baseline.
model = get_peft_model(base_model, lora_config).to(device)

print(f"Using device: {device}")

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda


In [17]:
# AdamW over the model's parameters, with a linear learning-rate schedule (no warmup)
# sized for the whole run.
optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_dataloader) * num_epochs  # the training loop steps the scheduler once per batch
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [18]:
# Sequence length of the dummy batch used only for the summary below.
# NOTE(review): training pads to max_length (256), so the sizes reported here
# are for 128-token inputs, not for the actual training batches.
seq_len = 128

# Dummy batch: all-zero token ids with a full attention mask, shaped (batch_size, seq_len)
input_data = {
    "input_ids": torch.zeros((batch_size, seq_len), dtype=torch.long).to(device),
    "attention_mask": torch.ones((batch_size, seq_len), dtype=torch.long).to(device)
}

# Print the layer/parameter summary for one forward pass over the dummy batch
summary(model, input_data=input_data)

Layer (type:depth-idx)                                                      Output Shape              Param #
PeftModelForSequenceClassification                                          [32, 40]                  --
├─LoraModel: 1-1                                                            [32, 40]                  --
│    └─BertForSequenceClassification: 2-1                                   --                        --
│    │    └─BertModel: 3-1                                                  [32, 768]                 109,777,152
│    │    └─Dropout: 3-2                                                    [32, 768]                 --
│    │    └─ModulesToSaveWrapper: 3-3                                       [32, 40]                  61,520
Total params: 109,838,672
Trainable params: 344,104
Non-trainable params: 109,494,568
Total mult-adds (Units.GIGABYTES): 3.50
Input size (MB): 0.07
Forward/backward pass size (MB): 4008.65
Params size (MB): 439.23
Estimated Total Size (MB): 4447.9

In [19]:
# Pull one training batch to sanity-check the keys the DataLoader yields
batch = next(iter(train_dataloader))
print(batch.keys())

dict_keys(['input_ids', 'attention_mask', 'label'])


In [20]:
# List every bias parameter and whether it will receive gradient updates
for name, param in model.named_parameters():
    if "bias" in name:
        print(f"{name}: requires_grad={param.requires_grad}")

base_model.model.bert.embeddings.LayerNorm.bias: requires_grad=False
base_model.model.bert.encoder.layer.0.attention.self.query.base_layer.bias: requires_grad=True
base_model.model.bert.encoder.layer.0.attention.self.key.bias: requires_grad=False
base_model.model.bert.encoder.layer.0.attention.self.value.base_layer.bias: requires_grad=True
base_model.model.bert.encoder.layer.0.attention.output.dense.bias: requires_grad=False
base_model.model.bert.encoder.layer.0.attention.output.LayerNorm.bias: requires_grad=False
base_model.model.bert.encoder.layer.0.intermediate.dense.bias: requires_grad=False
base_model.model.bert.encoder.layer.0.output.dense.bias: requires_grad=False
base_model.model.bert.encoder.layer.0.output.LayerNorm.bias: requires_grad=False
base_model.model.bert.encoder.layer.1.attention.self.query.base_layer.bias: requires_grad=True
base_model.model.bert.encoder.layer.1.attention.self.key.bias: requires_grad=False
base_model.model.bert.encoder.layer.1.attention.self.value.ba

In [21]:
# LoRA and mixed precision; the profiler tables are printed once training finishes.
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    # Pass the 1-based epoch number so the progress bar is labelled with it
    # (it previously rendered as "Training Epoch :").
    train(model, train_dataloader, optimizer, scheduler, device, epoch=epoch + 1)
    accuracy, macro_f1, weighted_f1 = evaluate(model, val_dataloader, device, epoch=epoch + 1)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(f"Macro F1: {macro_f1:.4f}")
    print(f"Weighted F1: {weighted_f1:.4f}")

Epoch 1/1


Training Epoch :   8%|▊         | 386/5022 [00:38<07:41, 10.05it/s]


KeyboardInterrupt: 

In [22]:
# LoRA only (no mixed precision); the profiler tables are printed once training finishes.
# The scheduler built earlier was sized for exactly `total_steps` steps and the previous
# training cell has already consumed (part of) them, so start this run from a fresh
# optimizer/scheduler pair instead of continuing the used-up schedule.
# NOTE(review): `model` still carries whatever weights the previous cell trained;
# rebuild it as well if the two runs are meant to be compared on accuracy.
optimizer = AdamW(model.parameters(), lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    # Pass the 1-based epoch number so the progress bar is labelled with it.
    train_original(model, train_dataloader, optimizer, scheduler, device, epoch=epoch + 1)
    accuracy, macro_f1, weighted_f1 = evaluate(model, val_dataloader, device, epoch=epoch + 1)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(f"Macro F1: {macro_f1:.4f}")
    print(f"Weighted F1: {weighted_f1:.4f}")

Epoch 1/1


Training Epoch :   1%|          | 41/5022 [00:13<26:49,  3.10it/s]


KeyboardInterrupt: 

In [None]:
# Save the wrapped inner model and the PEFT model into separate directories so their
# on-disk sizes can be compared afterwards.
# NOTE(review): presumably the second call stores only the adapter weights — confirm.
model.base_model.model.save_pretrained("bert-base-checkpoint")
model.save_pretrained("lora-adapter-checkpoint")

In [None]:
# Show the benefit of LoRA by comparing the on-disk size of the saved checkpoints:
def get_dir_size(path):
    """Return the combined size in bytes of every file found under ``path``.

    The directory tree is walked recursively; a path that yields no files
    results in 0.
    """
    total_bytes = 0
    for folder, _subdirs, filenames in os.walk(path):
        for filename in filenames:
            total_bytes += os.path.getsize(os.path.join(folder, filename))
    return total_bytes

# Report both checkpoint directory sizes in MB (bytes / 1024**2)
print(f"Base size: {get_dir_size('bert-base-checkpoint') / 1024**2:.2f} MB")
print(f"LoRA size: {get_dir_size('lora-adapter-checkpoint') / 1024**2:.2f} MB")

In [29]:
# Inference timing for the LoRA-only (full precision) path.
# The category names are already available in `label_names` from the initial load,
# so reuse them instead of re-reading the whole dataset (which also overwrote
# `texts` and `labels`).
label_classes = label_names

# Rebuild an encoder that maps label ids back to category names
encoder = LabelEncoder()
encoder.classes_ = np.array(label_classes)

test_text = "NASA launches new space telescope to explore exoplanets."
predicted_category = predict_news_category(test_text, base_model, tokenizer, device, encoder)

print(f"Headline: {test_text}")
print(f"Predicted Category: {predicted_category}")

The dataset contains 40 unique categories.
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      model_inference_5         0.00%       0.000us         0.00%       0.000us       0.000us      29.234ms       111.79%      29.234ms      29.234ms           0

In [30]:
# Inference timing for the LoRA + mixed precision path.
# The category names are already available in `label_names` from the initial load,
# so reuse them instead of re-reading the whole dataset (which also overwrote
# `texts` and `labels`).
label_classes = label_names

# Rebuild an encoder that maps label ids back to category names
encoder = LabelEncoder()
encoder.classes_ = np.array(label_classes)

test_text = "NASA launches new space telescope to explore exoplanets."
predicted_category = predict_news_category_amp(test_text, model, tokenizer, device, encoder)

print(f"Headline: {test_text}")
print(f"Predicted Category: {predicted_category}")

The dataset contains 40 unique categories.
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                              inference         0.00%       0.000us         0.00%       0.000us       0.000us     205.025ms      1389.35%     205.025ms      41.005ms           0