<a href="https://colab.research.google.com/github/bnbryan/hpml-project/blob/baseline/prune_on_gpt2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load the model

Install the required libraries.

In [None]:
!pip install transformers
!pip install datasets

In [None]:
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

Load the pre-trained GPT-2 model and tokenizer from the `transformers` library.

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Tokenizer first: reuse the end-of-sequence token as the padding token, since the
# tokenization step later in the notebook pads every example to a fixed length.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Pre-trained GPT-2 with its language-modeling head.
model = GPT2LMHeadModel.from_pretrained("gpt2")

## Data pre-processing

In [20]:
from datasets import load_dataset

# Load the WikiText-2 dataset (raw variant); only the "test" split is requested here.
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")

# Tokenization
def tokenize_text(examples):
    """Tokenize a batch of raw text and attach language-modeling labels.

    Args:
        examples: A batch mapping with a "text" entry holding a list of strings.

    Returns:
        The tokenizer output for the batch, truncated/padded to 512 tokens, with
        an extra "labels" entry: a copy of "input_ids" in which every padded
        position is replaced by -100.
    """
    tokenized_inputs = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )

    # The pad token is aliased to the EOS token, so comparing ids against
    # pad_token_id would also hide genuine EOS tokens. The attention mask marks
    # exactly the padded positions, so use it to decide what to mask out.
    tokenized_inputs["labels"] = [
        [token_id if is_real else -100 for token_id, is_real in zip(ids, mask)]
        for ids, mask in zip(
            tokenized_inputs["input_ids"], tokenized_inputs["attention_mask"]
        )
    ]
    return tokenized_inputs

# Apply `tokenize_text` to the loaded split in batched mode (each call receives a batch of examples).
tokenized_dataset = dataset.map(tokenize_text, batched=True)

Map:   0%|          | 0/4358 [00:00<?, ? examples/s]

## Finetune the pre-trained model on WikiText-2

### Prepare the training and validation data

In [None]:
# `tokenized_dataset` holds only the "test" split, so it cannot be indexed with
# "train" / "validation". Load and tokenize the splits needed for fine-tuning here.
train_split = load_dataset(
    "wikitext", "wikitext-2-raw-v1", split="train"
).map(tokenize_text, batched=True)
validation_split = load_dataset(
    "wikitext", "wikitext-2-raw-v1", split="validation"
).map(tokenize_text, batched=True)

# Shuffle only the training batches; keep validation order fixed.
train_dataloader = DataLoader(train_split, batch_size=8, shuffle=True)
eval_dataloader = DataLoader(validation_split, batch_size=8)

### Fine-tuning

In [None]:
from transformers import TrainingArguments, Trainer

# `tokenized_dataset` holds only the "test" split, so it cannot be indexed with
# "train" / "validation". Load and tokenize the splits needed for fine-tuning here.
train_split = load_dataset(
    "wikitext", "wikitext-2-raw-v1", split="train"
).map(tokenize_text, batched=True)
validation_split = load_dataset(
    "wikitext", "wikitext-2-raw-v1", split="validation"
).map(tokenize_text, batched=True)

# Training hyper-parameters.
# NOTE(review): newer transformers releases appear to rename `evaluation_strategy`
# to `eval_strategy` (and Trainer's `tokenizer` to `processing_class`) — confirm
# against the installed version.
training_args = TrainingArguments(
    output_dir="./results",           # where checkpoints are written
    evaluation_strategy="epoch",     # evaluate once per epoch
    learning_rate=5e-5,              # learning rate
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=8,   # batch size per device
    save_total_limit=2,              # keep at most 2 checkpoints
    logging_dir="./logs",            # log directory
    logging_steps=100,               # log every 100 steps
    save_steps=500,                  # save a checkpoint every 500 steps
    report_to="none"                 # disable logging integrations such as wandb
)

trainer = Trainer(
    model=model,                     # model to fine-tune
    args=training_args,              # training arguments
    train_dataset=train_split,       # training split
    eval_dataset=validation_split,   # validation split
    tokenizer=tokenizer              # tokenizer
)

trainer.train()

## Get the baseline perplexity

In [None]:
# Ask PyTorch to release cached, currently unused GPU memory before the evaluation below.
torch.cuda.empty_cache()

In [None]:
torch.cuda.empty_cache()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

model.eval()

# Running sums used to compute a token-weighted average loss afterwards.
total_loss = 0
total_tokens = 0


def _batch_first(column):
    """Return one collated dataset column as a (batch, seq_len) tensor.

    The default collate function turns a column of Python lists into a list
    with one tensor per token position, each of shape (batch,). Stacking along
    dim=1 restores the (batch, seq_len) layout the model expects; a plain
    `torch.stack(column)` would yield the transposed (seq_len, batch) tensor.
    """
    if torch.is_tensor(column):
        return column
    return torch.stack(column, dim=1)


# Build the evaluation DataLoader (fixed order, no shuffling).
dataloader = DataLoader(tokenized_dataset, batch_size=8, shuffle=False)

# Iterate over the dataset.
with tqdm(dataloader, desc="Evaluating", unit="batch") as pbar:
    for batch in pbar:
        tokens = _batch_first(batch["input_ids"]).to(device)
        attention_mask = _batch_first(batch["attention_mask"]).to(device)
        labels = _batch_first(batch["labels"]).to(device)

        # The model shifts the labels by one position internally, so only
        # labels[:, 1:] are ever scored; count exactly those targets.
        num_active_tokens = (labels[:, 1:] != -100).sum().item()
        if num_active_tokens == 0:
            # Nothing to score (e.g. a batch of empty lines); the mean loss
            # over zero targets would be NaN and poison the running sum.
            continue

        # Gradients are not needed for evaluation.
        with torch.no_grad():
            # Pass the masked labels (not the raw tokens) so padding is ignored.
            outputs = model(
                input_ids=tokens,
                attention_mask=attention_mask,
                labels=labels,
            )

        # outputs.loss is the mean over scored targets; re-weight it by the
        # number of targets so batches contribute proportionally.
        total_loss += outputs.loss.item() * num_active_tokens
        total_tokens += num_active_tokens

Evaluating:  72%|███████▏  | 391/545 [02:18<00:55,  2.78batch/s]

In [None]:
# Report the accumulated loss, the per-token average, and the resulting perplexity.
import math

print(f"Total Loss: {total_loss}")
print(f"Total Tokens: {total_tokens}")

average_loss = total_loss / total_tokens
print(f"Average Loss: {average_loss}")

# Perplexity is the exponential of the average per-token loss.
perplexity = math.exp(average_loss)
print(f"Perplexity: {perplexity:.4f}")

# Pruning