# IMDB数据集分类-----ALBERT

In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AlbertTokenizer, AlbertForSequenceClassification
import os
import glob

# Select the compute device: the CUDA GPU when one is available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Dataset class that holds the raw samples and tokenizes them on lookup
class IMDBDataset(Dataset):
    """Dataset that tokenizes one text sample each time it is indexed.

    Args:
        texts: Indexable collection of raw text samples.
        labels: Indexable collection of class ids, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, padding='max_length',
            truncation=True, max_length=..., return_tensors="pt")``; its
            result is indexed by ``'input_ids'`` and ``'attention_mask'``.
        max_length: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict with ``'input_ids'``,
        ``'attention_mask'`` and ``'label'`` tensors."""
        sample_text, sample_label = self.texts[idx], self.labels[idx]

        tokenizer_kwargs = dict(
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        encoded = self.tokenizer(sample_text, **tokenizer_kwargs)

        # squeeze(0) removes a size-1 leading dimension from each encoded tensor
        # (presumably the batch axis added by return_tensors="pt" -- confirm).
        sample = {
            key: encoded[key].squeeze(0)
            for key in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(sample_label, dtype=torch.long)
        return sample

# Data loading helper: reads the raw review files and their labels (no tokenization here)
def preprocess_data(data_dir):
    """Read every review under ``data_dir/pos`` and ``data_dir/neg``.

    Files are visited in sorted path order, all ``pos`` files first and then
    all ``neg`` files, so the returned lists are the same on every run and
    platform. ``glob.glob`` on its own returns matches in arbitrary order.

    Args:
        data_dir: Directory containing ``pos`` and ``neg`` sub-directories of
            UTF-8 encoded ``*.txt`` files.

    Returns:
        A ``(texts, labels)`` pair of equal-length lists. Each text is a
        file's content with surrounding whitespace stripped. Labels are ``0``
        for ``pos`` and ``1`` for ``neg``. Both lists are empty when no
        matching files are found.
    """
    label_ids = {"pos": 0, "neg": 1}  # label convention: positive is 0, negative is 1
    texts = []
    labels = []
    for label, label_id in label_ids.items():
        pattern = os.path.join(data_dir, label, "*.txt")
        # Sort so sample order does not depend on filesystem enumeration order.
        for file_path in sorted(glob.glob(pattern)):
            with open(file_path, 'r', encoding='utf-8') as f:
                texts.append(f.read().strip())
            labels.append(label_id)
    return texts, labels

# Load the IMDB dataset: read the raw texts and labels of the train and test splits
train_data_dir = "./aclImdb/train"
test_data_dir = "./aclImdb/test"

train_texts, train_labels = preprocess_data(train_data_dir)
test_texts, test_labels = preprocess_data(test_data_dir)

# Load the ALBERT tokenizer and model from a local directory
model_dir = "./albert_base_v2/"  # local model path
tokenizer = AlbertTokenizer.from_pretrained(model_dir)
# The model is created with num_labels=2 and moved to the selected device.
model = AlbertForSequenceClassification.from_pretrained(model_dir, num_labels=2).to(device)

# Build the datasets and DataLoaders.
# max_length is not passed, so IMDBDataset uses its default value.
train_dataset = IMDBDataset(train_texts, train_labels, tokenizer)
test_dataset = IMDBDataset(test_texts, test_labels, tokenizer)

batch_size = 32
# Only the training loader shuffles; the test loader is created with shuffle=False.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Define the optimizer and the loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
# NOTE(review): `criterion` is not referenced anywhere in the visible code;
# the training loop below reads the loss from `outputs.loss` instead.
criterion = torch.nn.CrossEntropyLoss()

# Train the model
num_epochs = 8
for epoch in range(num_epochs):
    model.train()
    # Running totals for this epoch: summed batch losses and correct predictions.
    total_loss, total_correct = 0, 0

    for batch in train_dataloader:
        # Move the batch tensors to the selected device.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Clear gradients left over from the previous step before the forward pass.
        optimizer.zero_grad()
        # Labels are passed to the model so that `outputs.loss` can be read below.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        # A prediction is the argmax over dim 1 of the logits; count matches with the labels.
        total_correct += (logits.argmax(1) == labels).sum().item()

    # Loss is averaged over the number of batches, accuracy over the number of
    # training samples. The logits used here were computed before each
    # optimizer step, with the model in train mode.
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(train_dataloader):.4f}, "
          f"Accuracy: {total_correct / len(train_dataset):.4f}")

# Evaluate the model on the test set
model.eval()
total_correct = 0
# Gradient tracking is disabled for the whole evaluation pass.
with torch.no_grad():
    for batch in test_dataloader:
        # Move the batch tensors to the selected device.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # No labels are passed to the model here; only the logits are used.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # A prediction is the argmax over dim 1 of the logits; count matches with the labels.
        total_correct += (logits.argmax(1) == labels).sum().item()

# Accuracy is the fraction of test samples predicted correctly.
test_accuracy = total_correct / len(test_dataset)
print(f"Test Accuracy: {test_accuracy:.4f}")


  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ./albert_base_v2/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  attention_output = torch.nn.functional.scaled_dot_product_attention(


Epoch 1/8, Loss: 0.3352, Accuracy: 0.8510
Epoch 2/8, Loss: 0.2309, Accuracy: 0.9055
Epoch 3/8, Loss: 0.1687, Accuracy: 0.9337
Epoch 4/8, Loss: 0.1243, Accuracy: 0.9544
Epoch 5/8, Loss: 0.0932, Accuracy: 0.9682
Epoch 6/8, Loss: 0.0498, Accuracy: 0.9832
Epoch 7/8, Loss: 0.0438, Accuracy: 0.9854
Epoch 8/8, Loss: 0.0393, Accuracy: 0.9866
Test Accuracy: 0.8828
