# 中文文本分类 - BERT 模型

这个笔记本实现了一个基于 BERT 的中文文本分类模型：使用 transformers 库加载 `bert-base-chinese` 预训练模型，并在其上添加 Dropout 和线性分类层，对评论文本做正面/负面二分类。

安装依赖的代码单元格：

In [None]:
# 安装必要的库
!pip install transformers datasets torch scikit-learn pandas tqdm

导入库的代码单元格：

In [None]:
# 导入必要的库
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, AdamW
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import matplotlib.pyplot as plt
from google.colab import drive

挂载 Drive 的代码单元格：

In [None]:
# Mount Google Drive at /content/drive
drive.mount('/content/drive')

# Create the directory that later cells save results into
# (-p: create missing parents, no error if it already exists)
!mkdir -p "/content/drive/MyDrive/text_classification_results"

准备数据的代码单元格：

In [None]:
# Build a tiny in-memory sample corpus of Chinese review sentences
texts = [
    "这部电影很精彩，演员的表演非常出色",
    "画面很差，故事情节也很烂",
    "音乐很动听，节奏感很强",
    "服务态度恶劣，等了很久才上菜",
    "风景优美，空气清新，是个度假的好地方",
    "价格太贵了，性价比很低",
    "质量不错，用着很舒服",
    "外观设计很差，做工粗糙"
]

# One label per sentence, in the same order: 1 = positive review, 0 = negative review
labels = [1, 0, 1, 0, 1, 0, 1, 0]

# Assemble the samples into a DataFrame with 'text' and 'label' columns
df = pd.DataFrame({
    'text': texts,
    'label': labels
})

# Hold out 20% of the rows for testing. Stratifying on the label keeps both
# classes represented in each split; with only 8 samples an unstratified draw
# can put two same-class rows in the test set, which makes the evaluation
# metrics and the confusion matrix degenerate.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

print(f"训练集大小：{len(train_df)}")
print(f"测试集大小：{len(test_df)}")

## 3. 定义数据集类

In [None]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text sample per lookup.

    Args:
        texts: Indexable collection of raw texts.
        labels: Indexable collection of class labels, aligned with ``texts``.
        tokenizer: Object exposing ``encode_plus`` (e.g. a BERT tokenizer).
        max_len: Length every sample is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=64):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict of tensors.

        The dict has the keys ``'input_ids'``, ``'attention_mask'`` and
        ``'labels'``; the label is converted to a ``torch.long`` tensor.
        """
        raw_text = str(self.texts[idx])
        target = self.labels[idx]

        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(raw_text, **tokenizer_options)

        # Flatten each tensor to 1-D (presumably dropping the leading batch
        # axis that return_tensors='pt' adds for a single text).
        sample = {
            key: encoded[key].reshape(-1)
            for key in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

## 4. 定义模型

In [None]:
class TextClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification layer.

    Args:
        n_classes: Number of output classes (size of each logits vector).
        pretrained_model_name: Name or path handed to
            ``BertModel.from_pretrained``. Defaults to ``'bert-base-chinese'``.
        dropout: Dropout probability applied to the pooled BERT output.
            Defaults to ``0.3``.
    """

    def __init__(self, n_classes, pretrained_model_name='bert-base-chinese', dropout=0.3):
        super().__init__()
        # Attribute names (bert / drop / fc) are part of the saved state_dict
        # keys, so they must stay stable for checkpoints to remain loadable.
        self.bert = BertModel.from_pretrained(pretrained_model_name)
        self.drop = nn.Dropout(p=dropout)
        self.fc = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return unnormalized class scores (logits) for a batch.

        Args:
            input_ids: Token id tensor passed to BERT as ``input_ids``.
            attention_mask: Mask tensor passed to BERT as ``attention_mask``.

        Returns:
            The output of the linear layer applied to the dropped-out pooled
            BERT output; its last dimension has size ``n_classes``.
        """
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # Element 1 of the BERT output is the pooled representation.
        pooled_output = outputs[1]
        return self.fc(self.drop(pooled_output))

## 5. 训练模型

In [None]:
# Hyperparameters
MAX_LEN = 64
BATCH_SIZE = 4
EPOCHS = 3
LEARNING_RATE = 2e-5

# Initialize the tokenizer and the model, then move the model to GPU if one is available
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = TextClassifier(n_classes=2)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Wrap the training split in a dataset and a shuffling data loader
train_dataset = TextDataset(
    texts=train_df.text.values,
    labels=train_df.label.values,
    tokenizer=tokenizer,
    max_len=MAX_LEN
)

train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True
)

# Optimizer and loss function.
# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are pinned to the values the transformers implementation used
# by default (1e-6 and 0.0) so the update rule stays the same; PyTorch's own
# defaults are 1e-8 and 0.01.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=LEARNING_RATE,
    eps=1e-6,
    weight_decay=0.0,
)
criterion = nn.CrossEntropyLoss()

# Training loop; per-epoch metrics are kept for plotting later
train_losses = []
train_accs = []

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{EPOCHS}')
    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        # detach() instead of the legacy .data attribute: the predictions are
        # only used for bookkeeping and must not take part in autograd.
        _, predicted = torch.max(outputs.detach(), 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

        progress_bar.set_postfix({'loss': loss.item()})

    # Loss is averaged over batches; accuracy is a percentage over samples
    epoch_loss = total_loss / len(train_loader)
    epoch_acc = 100 * correct / total
    train_losses.append(epoch_loss)
    train_accs.append(epoch_acc)

    print(f'Epoch {epoch + 1}/{EPOCHS}:')
    print(f'Average Loss: {epoch_loss:.4f}')
    print(f'Accuracy: {epoch_acc:.2f}%\n')

## 6. 评估模型

In [None]:
# Wrap the test split in a dataset and a (non-shuffling) data loader
test_dataset = TextDataset(
    texts=test_df.text.values,
    labels=test_df.label.values,
    tokenizer=tokenizer,
    max_len=MAX_LEN
)

test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE
)

# Evaluate the model: eval mode, no gradient tracking
model.eval()
test_correct = 0
test_total = 0
all_predictions = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask)
        # detach() instead of the legacy .data attribute
        _, predicted = torch.max(outputs.detach(), 1)

        test_total += labels.size(0)
        test_correct += (predicted == labels).sum().item()

        # Collect predictions and ground truth on the CPU for the confusion matrix
        all_predictions.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

test_accuracy = 100 * test_correct / test_total
print(f'测试集准确率: {test_accuracy:.2f}%')

# Plot the confusion matrix
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Pass the full label set explicitly: the test split is tiny, and without
# labels= the matrix shrinks to the classes that happen to occur, which can
# produce a 1x1 matrix instead of the expected 2x2.
cm = confusion_matrix(all_labels, all_predictions, labels=[0, 1])
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
# NOTE(review): the default matplotlib fonts may lack CJK glyphs — confirm
# that the Chinese title and axis labels render in the target environment.
plt.title('混淆矩阵')
plt.xlabel('预测标签')
plt.ylabel('真实标签')
plt.show()

## 7. 保存模型

In [None]:
# Save the trained weights (state_dict only) into the Drive results folder
save_path = '/content/drive/MyDrive/text_classification_results/bert_classifier.pth'
torch.save(model.state_dict(), save_path)

# Plot the per-epoch training curves side by side:
# loss in the left panel, accuracy in the right panel
plt.figure(figsize=(12, 4))

panels = (
    (train_losses, '训练损失', 'Loss'),
    (train_accs, '训练准确率', 'Accuracy (%)'),
)
for position, (history, panel_title, y_label) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(history)
    plt.title(panel_title)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)

# Write the figure to Drive before displaying it
plt.tight_layout()
plt.savefig('/content/drive/MyDrive/text_classification_results/training_metrics.png')
plt.show()

## 8. 测试自定义文本

In [None]:
def predict_sentiment(text):
    """Classify one text as positive or negative with the trained model.

    Relies on the module-level ``tokenizer``, ``model``, ``device`` and
    ``MAX_LEN`` defined by the training cell.

    Args:
        text: The raw text to classify.

    Returns:
        A ``(sentiment, confidence)`` tuple. ``sentiment`` is ``'正面'`` when
        the predicted class is 1 and ``'负面'`` otherwise; ``confidence`` is
        the softmax probability of the predicted class as a float.
    """
    # Encode the text exactly like the training samples were encoded
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Force eval mode for the prediction so dropout is disabled even when the
    # function is called while the model is still in training mode, then
    # restore whatever mode the model was in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(input_ids, attention_mask)
            prob = torch.softmax(outputs, dim=1)
            # The most probable class and its probability in one call
            confidence, predicted = torch.max(prob, dim=1)
    finally:
        model.train(was_training)

    sentiment = '正面' if predicted.item() == 1 else '负面'
    return sentiment, confidence.item()

# Try the classifier on a few reviews that are not in the sample data
test_texts = [
    "这家餐厅的菜品非常美味，服务也很周到",
    "产品质量太差了，一点都不耐用",
    "这部电影剧情紧凑，演技在线"
]

for text in test_texts:
    sentiment, confidence = predict_sentiment(text)
    # One print call, one line per field; the leading newline separates entries
    report_lines = (
        f'\n文本: {text}',
        f'情感: {sentiment}',
        f'置信度: {confidence:.2%}',
    )
    print(*report_lines, sep='\n')