In [1]:
import torch
import random

def generate_sample_data(num_samples=100, num_features=10, num_classes=3):
    """
    生成用于分类任务的随机样例数据集。

    Args:
        num_samples (int): 生成的样本数量。
        num_features (int): 每个样本的特征数量。
        num_classes (int): 类别数量。

    Returns:
        tuple: (features_tensor, labels_tensor)
               features_tensor: torch.Tensor, 形状 (num_samples, num_features)
               labels_tensor: torch.Tensor, 形状 (num_samples,)
    """
    # 随机生成特征数据 (X)
    features = torch.randn(num_samples, num_features)

    # 随机生成标签 (y)，标签是 0 到 num_classes-1 之间的整数
    labels = torch.randint(0, num_classes, (num_samples,))

    print(f"生成的样本数量: {num_samples}")
    print(f"每个样本特征数: {num_features}")
    print(f"类别数量: {num_classes}")
    print(f"特征张量形状: {features.shape}")
    print(f"标签张量形状: {labels.shape}")
    print(f"部分特征示例:\n{features[:3]}")
    print(f"部分标签示例:\n{labels[:3]}")

    return features, labels

if __name__ == '__main__':
    # 演示数据生成
    X_data, y_data = generate_sample_data(num_samples=20, num_features=5, num_classes=4)

    # 模拟包含填充 token 的序列标签数据
    # 假设我们有一个批次，其中一些序列被填充，填充 token 的 ID 是 0
    # 在交叉熵损失中，我们希望忽略这些填充 token 的损失
    print("\n--- 模拟带有 ignore_index 的序列标签数据 ---")
    # 这是一个批次，包含 2 个序列，每个序列最大长度为 5
    # 类别有 3 个 (0, 1, 2)
    # 假设 0 是填充 token 的 ID (我们想忽略它)
    mock_logits_sequence = torch.randn(2, 5, 3) # batch_size=2, seq_len=5, num_classes=3
    mock_labels_sequence = torch.tensor([
        [0, 1, 2, 0, 0], # 第一个序列，标签 0, 1, 2，后面两个是填充
        [1, 0, 1, 2, 0]  # 第二个序列，标签 1, 0, 1, 2，最后一个是填充
    ])
    print(f"模拟的序列 logits 形状: {mock_logits_sequence.shape}")
    print(f"模拟的序列 labels 形状: {mock_labels_sequence.shape}")
    print(f"模拟的序列 labels:\n{mock_labels_sequence}")

生成的样本数量: 20
每个样本特征数: 5
类别数量: 4
特征张量形状: torch.Size([20, 5])
标签张量形状: torch.Size([20])
部分特征示例:
tensor([[-0.8310,  0.1339,  0.6688, -0.1853, -0.6301],
        [ 0.9359, -0.4704, -1.3103,  0.4475, -0.8904],
        [-1.0550,  1.7600, -0.3360,  0.3945,  1.3438]])
部分标签示例:
tensor([0, 3, 2])

--- 模拟带有 ignore_index 的序列标签数据 ---
模拟的序列 logits 形状: torch.Size([2, 5, 3])
模拟的序列 labels 形状: torch.Size([2, 5])
模拟的序列 labels:
tensor([[0, 1, 2, 0, 0],
        [1, 0, 1, 2, 0]])


In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# --- 1. 定义一个简单的分类模型 ---
class SimpleClassifier(nn.Module):
    def __init__(self, num_features, num_classes):
        super(SimpleClassifier, self).__init__()
        self.fc1 = nn.Linear(num_features, 64)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(64, num_classes)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        # 注意：这里不需要 sigmoid 或 softmax，CrossEntropyLoss 会自动处理
        x = self.fc2(x)
        return x

# --- 2. 生成样例数据集 ---
num_samples = 1000
num_features = 20
num_classes = 5
X_data, y_data = generate_sample_data(num_samples, num_features, num_classes)

# --- 3. 准备 DataLoader ---
dataset = TensorDataset(X_data, y_data)
batch_size = 32
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# --- 4. 实例化模型、损失函数和优化器 ---
model = SimpleClassifier(num_features, num_classes)

# CrossEntropyLoss:
# - input (logits): (N, C)
# - target (labels): (N)
# 内部自动进行 LogSoftmax + NLLLoss
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# --- 5. 训练循环 ---
num_epochs = 10
print("\n--- 开始训练 ---")
for epoch in range(num_epochs):
    model.train() # 设置模型为训练模式
    total_loss = 0
    correct_predictions = 0
    total_samples = 0

    for batch_features, batch_labels in dataloader:
        # 1. 前向传播
        outputs = model(batch_features) # outputs 是 logits (N, num_classes)

        # 2. 计算损失
        # outputs: (batch_size, num_classes)
        # batch_labels: (batch_size)
        loss = loss_fn(outputs, batch_labels)

        # 3. 反向传播和优化
        optimizer.zero_grad() # 清除之前的梯度
        loss.backward()       # 计算梯度
        optimizer.step()      # 更新模型参数

        total_loss += loss.item()

        # 计算准确率 (用于演示)
        _, predicted = torch.max(outputs.data, 1) # 获取预测的类别索引
        total_samples += batch_labels.size(0)
        correct_predictions += (predicted == batch_labels).sum().item()

    avg_loss = total_loss / len(dataloader)
    accuracy = 100 * correct_predictions / total_samples
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, Accuracy: {accuracy:.2f}%")

print("\n--- 训练完成 ---")

# --- 6. 演示带有 `ignore_index` 的 CrossEntropyLoss ---
print("\n--- 演示带有 `ignore_index` 的 CrossEntropyLoss ---")
# 模拟序列任务的 logits 和 labels
# 假设有 2 个序列，每个序列长度为 5，3 个类别 (0, 1, 2)
# 假设 0 是填充 token 的 ID，我们希望忽略它的损失贡献
mock_logits_sequence = torch.randn(2, 5, 3) # batch_size=2, seq_len=5, num_classes=3
mock_labels_sequence = torch.tensor([
    [0, 1, 2, 0, 0], # 第一个序列，标签 0, 1, 2，后面两个 0 是填充
    [1, 0, 1, 2, 0]  # 第二个序列，标签 1, 0, 1, 2，最后一个 0 是填充
])

# 重新形状 logits 以适应 CrossEntropyLoss 的输入要求
# (N, C) N=batch_size * seq_len
# 将 mock_logits_sequence 形状从 (2, 5, 3) 变为 (2*5, 3) = (10, 3)
reshaped_logits = mock_logits_sequence.view(-1, num_classes)
# 将 mock_labels_sequence 形状从 (2, 5) 变为 (2*5) = (10,)
reshaped_labels = mock_labels_sequence.view(-1)

print(f"重塑后的 logits 形状: {reshaped_logits.shape}")
print(f"重塑后的 labels 形状: {reshaped_labels.shape}")
print(f"重塑后的 labels (含填充): {reshaped_labels}")

# 实例化带有 ignore_index 的损失函数
# 假设我们的填充 token 的 ID 是 0
loss_fn_ignore = nn.CrossEntropyLoss(ignore_index=0) # 忽略标签为 0 的损失

loss_with_ignore = loss_fn_ignore(reshaped_logits, reshaped_labels)
print(f"使用 ignore_index=0 计算的损失: {loss_with_ignore.item():.4f}")

# 比较：如果不忽略填充
loss_without_ignore = nn.CrossEntropyLoss()(reshaped_logits, reshaped_labels)
print(f"不使用 ignore_index 计算的损失: {loss_without_ignore.item():.4f}")
print("可以看出，当指定 ignore_index 后，损失值会发生变化，因为它忽略了特定标签的贡献。")

生成的样本数量: 1000
每个样本特征数: 20
类别数量: 5
特征张量形状: torch.Size([1000, 20])
标签张量形状: torch.Size([1000])
部分特征示例:
tensor([[ 0.3187,  0.1825,  0.7392, -2.5795, -0.7172,  0.8008,  1.6403,  1.4680,
          1.6394, -0.1548, -0.4266,  0.5731, -1.1909, -1.0562,  0.1423, -0.3859,
          1.4876,  0.9411,  0.3705,  1.4085],
        [ 0.4400,  0.2762,  0.0961, -1.0619, -1.6829,  1.0217, -1.4161, -0.8497,
         -1.1269,  1.8879,  0.2182, -0.6327,  2.1705,  0.3682, -1.2846, -0.6902,
         -0.1661,  1.5184, -0.1362, -0.4505],
        [-0.2543, -1.0076, -0.3582,  0.0697, -0.2602,  0.4723,  2.1217, -0.5414,
         -1.1001, -1.0351,  1.2462, -2.0808, -0.0427,  0.3726, -0.3560,  1.2486,
         -0.6592,  0.6755,  0.0840,  0.9176]])
部分标签示例:
tensor([3, 1, 1])

--- 开始训练 ---
Epoch [1/10], Loss: 1.6529, Accuracy: 17.10%
Epoch [2/10], Loss: 1.5700, Accuracy: 27.90%
Epoch [3/10], Loss: 1.5480, Accuracy: 29.60%
Epoch [4/10], Loss: 1.5036, Accuracy: 33.20%
Epoch [5/10], Loss: 1.4602, Accuracy: 36.00%
Epoch [6/1

ValueError: Expected input batch_size (6) to match target batch_size (10).