# 第5课：NLP 自然语言处理入门

## 学习目标
- 理解文本预处理流程
- 掌握词嵌入技术
- 学会构建文本分类模型
- 了解常见 NLP 任务

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
import re

# Select the compute device: use CUDA when torch reports it as available,
# otherwise fall back to the CPU. Later cells move models and batches here.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"使用设备: {device}")

## 1. 文本预处理

In [None]:
# Sample texts: a tiny hand-written sentiment corpus used by the
# preprocessing and embedding demos below.
texts = [
    "I love this movie! It's amazing.",
    "This film is terrible, I hated it.",
    "Great acting and wonderful story.",
    "Boring and waste of time.",
    "Best movie I've ever seen!",
    "Don't watch this, it's awful."
]
labels = [1, 0, 1, 0, 1, 0]  # 1 = positive, 0 = negative; aligned with `texts` by position

# Text-cleaning helper
def clean_text(text):
    """Normalize a raw string into a list of lowercase word tokens.

    The text is lowercased, every character that is neither a word
    character nor whitespace is deleted (so "it's" becomes "its"), and
    the remainder is split on runs of whitespace.

    Args:
        text: The raw input string.

    Returns:
        A list of token strings; empty when nothing but punctuation or
        whitespace was present.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text.lower())
    return without_punctuation.split()

# Clean every sample text, then print each original next to its token list.
cleaned_texts = list(map(clean_text, texts))
print("清洗后的文本:")
for original, cleaned in zip(texts, cleaned_texts):
    print(f"  原始: {original}")
    print(f"  清洗: {cleaned}\n")

In [None]:
# Vocabulary: two-way mapping between tokens and integer ids
class Vocabulary:
    """Map tokens to integer ids and back.

    Ids 0 and 1 are reserved for the ``<PAD>`` and ``<UNK>`` special tokens.
    Other tokens receive consecutive ids, starting at 2, in the order in
    which they are first counted by ``build_vocab``.
    """

    def __init__(self, min_freq=1):
        """Create a vocabulary holding only the two special tokens.

        Args:
            min_freq: Minimum count a token needs to be given an id.
        """
        specials = ('<PAD>', '<UNK>')
        self.word2idx = {token: position for position, token in enumerate(specials)}
        self.idx2word = dict(enumerate(specials))
        self.word_freq = Counter()
        self.min_freq = min_freq

    def build_vocab(self, texts):
        """Count tokens in ``texts`` and register the frequent ones.

        Args:
            texts: Iterable of token lists. Counts are added to
                ``self.word_freq``, so they accumulate across calls.
        """
        # Pass 1: accumulate token frequencies.
        for sentence in texts:
            self.word_freq.update(sentence)

        # Pass 2: hand out the next free id to every qualifying new token.
        for token, count in self.word_freq.items():
            if count < self.min_freq or token in self.word2idx:
                continue
            next_id = len(self.word2idx)
            self.word2idx[token] = next_id
            self.idx2word[next_id] = token

    def encode(self, tokens):
        """Return the id of each token; unknown tokens map to 1 (``<UNK>``)."""
        unknown_id = 1
        lookup = self.word2idx.get
        return [lookup(token, unknown_id) for token in tokens]

    def decode(self, indices):
        """Return the token of each id; unknown ids map to ``'<UNK>'``."""
        lookup = self.idx2word.get
        return [lookup(index, '<UNK>') for index in indices]

    def __len__(self):
        """Return the number of entries, special tokens included."""
        return len(self.word2idx)

# Build the vocabulary from the cleaned sample texts.
vocab = Vocabulary()
vocab.build_vocab(cleaned_texts)

print(f"词汇表大小: {len(vocab)}")
print(f"词汇表: {vocab.word2idx}")

# Encoding example: map the first cleaned sentence to its token ids.
encoded = vocab.encode(cleaned_texts[0])
print(f"\n原文: {cleaned_texts[0]}")
print(f"编码: {encoded}")

In [None]:
# Sequence padding
def pad_sequences(sequences, max_len, pad_value=0):
    """Truncate or right-pad every sequence to exactly ``max_len`` items.

    Each input sequence may be any iterable (list, tuple, range, ...); it
    is copied into a new list, so the inputs are never modified and the
    result type does not depend on the input type.

    Args:
        sequences: Iterable of iterables of items (e.g. token ids).
        max_len: Target length of every returned list.
        pad_value: Item appended to sequences shorter than ``max_len``.

    Returns:
        A list of new lists, one per input sequence, in input order.
    """
    padded = []
    for seq in sequences:
        # Copy first so tuples/ranges work and the caller's data stays intact.
        items = list(seq)[:max_len]
        # A non-positive repeat count yields an empty list, so sequences that
        # were truncated (or already fit) receive no padding.
        items.extend([pad_value] * (max_len - len(items)))
        padded.append(items)
    return padded

# Encode every cleaned text, then pad all sequences to the length of the
# longest one so they can be stacked into a single tensor.
encoded_texts = [vocab.encode(tokens) for tokens in cleaned_texts]
max_len = max(len(seq) for seq in encoded_texts)
padded_texts = pad_sequences(encoded_texts, max_len)

print(f"最大长度: {max_len}")
print(f"填充后的序列:")
for seq in padded_texts:
    print(f"  {seq}")

## 2. 词嵌入（Word Embeddings）

In [None]:
# Embedding layer: one embed_dim-sized vector per vocabulary entry.
vocab_size = len(vocab)
embed_dim = 64

# padding_idx=0 is the id the Vocabulary class reserves for <PAD>.
embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

# Embedding example: wrap the first padded sequence in a list so the tensor
# has a leading batch dimension of size 1.
sample_input = torch.LongTensor([padded_texts[0]])
embedded = embedding(sample_input)

print(f"输入形状: {sample_input.shape}")
print(f"嵌入后形状: {embedded.shape}")
print(f"\n输入: {sample_input[0]}")
print(f"嵌入向量 (前3个词的前5维):")
print(embedded[0, :3, :5])

In [None]:
# Visualize the word embeddings in 2-D.
# NOTE: `embedding` has not been trained at this point, so the plot shows
# its initial weights.
from sklearn.decomposition import PCA

# Look up the embedding vector of every vocabulary entry.
all_words = list(vocab.word2idx.keys())
all_indices = torch.LongTensor([vocab.word2idx[w] for w in all_words])
# detach() drops the autograd graph so the result can be converted to NumPy.
all_embeddings = embedding(all_indices).detach().numpy()

# Reduce the embedding vectors to two principal components with PCA.
pca = PCA(n_components=2)
embeddings_2d = pca.fit_transform(all_embeddings)

# Scatter plot with one labelled point per word.
plt.figure(figsize=(10, 8))
plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], alpha=0.7)

for i, word in enumerate(all_words):
    plt.annotate(word, (embeddings_2d[i, 0], embeddings_2d[i, 1]))

plt.title('词嵌入可视化 (PCA)')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()

## 3. 文本分类模型

In [None]:
# Dataset for text classification
class TextDataset(Dataset):
    """Serve (token-id tensor, label tensor) pairs built from raw strings.

    Cleaning, encoding and padding happen lazily, each time a sample is
    requested.
    """

    def __init__(self, texts, labels, vocab, max_len):
        """Store the raw data and the preprocessing settings.

        Args:
            texts: Sequence of raw text strings.
            labels: Sequence of integer class labels, aligned with ``texts``.
            vocab: Object whose ``encode(tokens)`` returns a list of ids.
            max_len: Fixed length of every returned id sequence.
        """
        self.texts, self.labels = texts, labels
        self.vocab, self.max_len = vocab, max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return sample ``idx`` as ``(ids, label)`` LongTensors.

        ``ids`` holds ``max_len`` entries (truncated or right-padded with
        0); ``label`` is a one-element tensor.
        """
        token_ids = self.vocab.encode(clean_text(self.texts[idx]))

        # Truncate first; the repeat count below is <= 0 (no padding) for
        # sequences that were already long enough.
        token_ids = token_ids[:self.max_len]
        token_ids = token_ids + [0] * (self.max_len - len(token_ids))

        label = torch.LongTensor([self.labels[idx]])
        return torch.LongTensor(token_ids), label

# Create the dataset over the sample texts and a shuffling loader that
# yields batches of two samples.
dataset = TextDataset(texts, labels, vocab, max_len)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

In [None]:
# LSTM text classifier
class TextClassifier(nn.Module):
    """Bidirectional, optionally stacked LSTM classifier over token ids.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding.
        hidden_dim: Hidden size of each LSTM direction.
        num_classes: Number of output logits.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, num_layers=2):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        # nn.LSTM applies dropout only between stacked layers. With a single
        # layer a non-zero value has no effect and makes PyTorch emit a
        # UserWarning, so it is switched off in that case.
        lstm_dropout = 0.3 if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers,
                            batch_first=True, bidirectional=True,
                            dropout=lstm_dropout)
        # Forward and backward states are concatenated, hence hidden_dim * 2.
        self.fc = nn.Linear(hidden_dim * 2, num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Return class logits for a batch of token-id sequences.

        Args:
            x: Integer tensor of shape (batch, seq_len).

        Returns:
            Tensor of shape (batch, num_classes) holding unnormalized logits.
        """
        embedded = self.embedding(x)  # (batch, seq_len, embed_dim)

        # Only the final hidden states are needed; the per-step outputs and
        # the cell state are discarded.
        _, (h_n, _) = self.lstm(embedded)
        # h_n: (num_layers * 2, batch, hidden_dim)

        # The last two entries of h_n are the forward and backward final
        # states of the top layer; join them along the feature dimension.
        hidden = torch.cat((h_n[-2], h_n[-1]), dim=1)

        return self.fc(self.dropout(hidden))

# Instantiate the LSTM classifier (default two layers) and move it to the
# selected device.
model = TextClassifier(
    vocab_size=len(vocab),
    embed_dim=64,
    hidden_dim=128,
    num_classes=2
).to(device)

print(model)

In [None]:
# CNN text classifier
class TextCNN(nn.Module):
    """Convolutional text classifier with three parallel kernel widths.

    Three 1-D convolutions (kernel sizes 2, 3 and 4) slide over the embedded
    sequence; each is followed by ReLU and max pooling over the whole
    sequence, and the pooled features are concatenated and classified.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding.
        num_classes: Number of output logits.
        num_filters: Output channels of each convolution.
    """

    def __init__(self, vocab_size, embed_dim, num_classes, num_filters=100):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

        # Convolutions with different kernel widths.
        self.conv1 = nn.Conv1d(embed_dim, num_filters, kernel_size=2)
        self.conv2 = nn.Conv1d(embed_dim, num_filters, kernel_size=3)
        self.conv3 = nn.Conv1d(embed_dim, num_filters, kernel_size=4)

        self.fc = nn.Linear(num_filters * 3, num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Return class logits for a batch of token-id sequences.

        Sequences shorter than the widest kernel are right-padded with the
        padding index first, because a convolution cannot be applied to an
        input shorter than its kernel.

        Args:
            x: Integer tensor of shape (batch, seq_len).

        Returns:
            Tensor of shape (batch, num_classes) holding unnormalized logits.
        """
        convs = (self.conv1, self.conv2, self.conv3)

        widest_kernel = max(conv.kernel_size[0] for conv in convs)
        shortfall = widest_kernel - x.size(1)
        if shortfall > 0:
            # new_full keeps the dtype and device of x.
            filler = x.new_full((x.size(0), shortfall), self.embedding.padding_idx)
            x = torch.cat((x, filler), dim=1)

        embedded = self.embedding(x)          # (batch, seq_len, embed_dim)
        embedded = embedded.permute(0, 2, 1)  # (batch, embed_dim, seq_len)

        # Convolution + ReLU + max pooling over the full length, per kernel.
        pooled = []
        for conv in convs:
            activated = F.relu(conv(embedded))
            pooled.append(F.max_pool1d(activated, activated.size(2)).squeeze(2))

        concat = torch.cat(pooled, dim=1)  # (batch, num_filters * 3)
        return self.fc(self.dropout(concat))

# Instantiate the TextCNN model (default num_filters) and move it to the
# selected device.
cnn_model = TextCNN(
    vocab_size=len(vocab),
    embed_dim=64,
    num_classes=2
).to(device)

print(cnn_model)

## 4. 更大的数据集示例

In [None]:
# Synthesize a larger training set by filling sentiment templates with items.
positive_templates = [
    "I love this {}",
    "This {} is amazing",
    "Great {} highly recommend",
    "Best {} ever",
    "Wonderful {} experience",
    "Fantastic {} will buy again",
]

negative_templates = [
    "I hate this {}",
    "This {} is terrible",
    "Awful {} waste of money",
    "Worst {} ever",
    "Horrible {} experience",
    "Bad {} never again",
]

items = ['product', 'movie', 'book', 'service', 'food', 'app']

# Build texts and labels together: every positive sample (label 1) first,
# then every negative sample (label 0), template by template.
train_texts = []
train_labels = []

for sentiment_label, template_group in ((1, positive_templates), (0, negative_templates)):
    for template in template_group:
        train_texts.extend(template.format(item) for item in items)
        train_labels.extend([sentiment_label] * len(items))

# Labels are 0/1, so their sum is the number of positive samples.
positive_count = sum(train_labels)
print(f"训练数据量: {len(train_texts)}")
print(f"正样本: {positive_count}, 负样本: {len(train_labels) - positive_count}")

In [None]:
# Rebuild the vocabulary from the generated training texts; this rebinds
# `vocab`, replacing the one built from the six sample sentences.
cleaned_train = [clean_text(t) for t in train_texts]
vocab = Vocabulary()
vocab.build_vocab(cleaned_train)

print(f"词汇表大小: {len(vocab)}")

# Create the training dataset (sequences fixed to 10 ids) and a shuffling
# loader that yields batches of eight samples.
train_dataset = TextDataset(train_texts, train_labels, vocab, max_len=10)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

In [None]:
# Train the model: a fresh LSTM classifier sized for the rebuilt vocabulary.
model = TextClassifier(
    vocab_size=len(vocab),
    embed_dim=64,
    hidden_dim=64,
    num_classes=2
).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    for batch_x, batch_y in train_loader:
        batch_x = batch_x.to(device)
        # Labels arrive as (batch, 1). Squeeze only dim 1: a bare squeeze()
        # would also remove the batch dimension when a batch holds a single
        # sample, which breaks the loss and the size(0) call below.
        batch_y = batch_y.squeeze(1).to(device)

        optimizer.zero_grad()
        output = model(batch_x)
        loss = criterion(output, batch_y)
        loss.backward()
        optimizer.step()

        # Accumulate per-epoch statistics.
        total_loss += loss.item()
        pred = output.argmax(dim=1)
        correct += (pred == batch_y).sum().item()
        total += batch_y.size(0)

    # Report mean batch loss and training accuracy every fifth epoch.
    if (epoch + 1) % 5 == 0:
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader):.4f}, Acc: {correct/total:.4f}')

In [None]:
# Inference helper
def predict(model, text, vocab, max_len):
    """Classify a single raw text with a trained model.

    The model is switched to evaluation mode, and the text is cleaned,
    encoded, and truncated or right-padded with 0 to ``max_len`` ids before
    being run as a batch of one without gradient tracking.

    Args:
        model: Module mapping a (1, max_len) id tensor to class logits.
        text: Raw input string.
        vocab: Object whose ``encode(tokens)`` returns a list of ids.
        max_len: Fixed length of the id sequence fed to the model.

    Returns:
        A tuple ``(label, probabilities)``: the arg-max class index as an
        int and a NumPy array of softmax probabilities, one per class.
    """
    model.eval()

    token_ids = vocab.encode(clean_text(text))[:max_len]
    # The repeat count is <= 0 (no padding) when the text was long enough.
    token_ids = token_ids + [0] * (max_len - len(token_ids))

    batch = torch.LongTensor([token_ids]).to(device)

    with torch.no_grad():
        logits = model(batch)
        probabilities = F.softmax(logits, dim=1)
        label = logits.argmax(dim=1).item()

    return label, probabilities[0].cpu().numpy()

# Try the trained model on a few hand-written sentences.
test_texts = [
    "I love this product",
    "This service is terrible",
    "Amazing experience",
    "Horrible waste of time"
]

print("预测结果:")
for text in test_texts:
    # max_len=10 matches the length used when building train_dataset.
    pred, prob = predict(model, text, vocab, max_len=10)
    sentiment = "正面" if pred == 1 else "负面"
    print(f"  '{text}'")
    print(f"    预测: {sentiment}, 概率: 负面={prob[0]:.3f}, 正面={prob[1]:.3f}\n")

## 5. 常见 NLP 任务概览

In [None]:
# Print a reference overview of common NLP task families.
print("""
常见 NLP 任务:

1. 文本分类
   - 情感分析
   - 垃圾邮件检测
   - 主题分类

2. 序列标注
   - 命名实体识别 (NER)
   - 词性标注 (POS)
   - 分词

3. 文本生成
   - 机器翻译
   - 文本摘要
   - 对话系统

4. 语义理解
   - 问答系统
   - 阅读理解
   - 文本蕴含

5. 信息抽取
   - 关系抽取
   - 事件抽取
   - 知识图谱构建
""")

## 6. 练习题

### 练习：改进文本分类模型

In [None]:
# 在这里编写代码
# 1. 添加更多训练数据
# 2. 尝试使用 TextCNN 模型
# 3. 添加注意力机制
# 4. 比较不同模型的性能


## 7. 本课小结

1. **文本预处理**：清洗、分词、编码、填充
2. **词汇表**：word2idx、idx2word 映射
3. **词嵌入**：将离散词转为连续向量
4. **模型**：LSTM、TextCNN 用于分类
5. **常见任务**：分类、标注、生成、理解、信息抽取