相似度数据集 AFQMC 作为语料

In [1]:
from torch.utils.data import Dataset
import json

class AFQMC(Dataset):
    """Map-style dataset over an AFQMC JSON-lines file.

    Every non-blank line of the file is parsed as one JSON object and all
    parsed samples are kept in memory, so they can be fetched by position.
    """

    def __init__(self, data_file):
        """Eagerly load every sample from ``data_file``.

        Args:
            data_file: Path to a text file holding one JSON object per line.
        """
        self.data = self.load_data(data_file)

    def load_data(self, data_file):
        """Parse ``data_file`` into a dict keyed by 0-based sample position.

        Args:
            data_file: Path to a text file holding one JSON object per line.

        Returns:
            dict mapping consecutive integers (starting at 0) to the decoded
            JSON object of each non-blank line, in file order.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If a non-blank line is not valid JSON.
        """
        samples = {}
        # The corpus contains Chinese text; pin UTF-8 instead of relying on
        # the platform's default encoding.
        with open(data_file, 'rt', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                # Key by the running count so indices stay contiguous even
                # when blank lines were skipped.
                samples[len(samples)] = json.loads(line)
        return samples

    def __len__(self):
        """Return the number of loaded samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample stored at position ``idx``."""
        return self.data[idx]

# Build in-memory datasets for the train and dev files.
train_data = AFQMC('data/afqmc_public/train.json')
valid_data = AFQMC('data/afqmc_public/dev.json')

# Print the first training sample to check the parsed record structure.
print(train_data[0])

{'sentence1': '蚂蚁借呗等额还款可以换成先息后本吗', 'sentence2': '借呗有先息到期还本吗', 'label': '0'}


如果数据集非常巨大，难以一次性加载到内存中，我们也可以继承 IterableDataset 类构建迭代型数据集：

In [2]:
from torch.utils.data import IterableDataset
import json

class IterableAFQMC(IterableDataset):
    """Iterable-style dataset that streams an AFQMC JSON-lines file.

    Only the file path is stored; the file is opened and read line by line
    each time the dataset is iterated, so the samples are never all held in
    memory at once.
    """

    def __init__(self, data_file):
        """Remember the path of the file to stream.

        Args:
            data_file: Path to a text file holding one JSON object per line.
        """
        self.data_file = data_file

    def __iter__(self):
        """Yield the decoded JSON object of each non-blank line, in file order.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If a non-blank line is not valid JSON.
        """
        # The corpus contains Chinese text; pin UTF-8 instead of relying on
        # the platform's default encoding.
        with open(self.data_file, 'rt', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                yield json.loads(line)

# Rebind train_data to the streaming dataset; only the path is stored here,
# the file is not read until iteration starts.
train_data = IterableAFQMC('data/afqmc_public/train.json')

# Pull just the first sample from a fresh iterator.
print(next(iter(train_data)))

{'sentence1': '蚂蚁借呗等额还款可以换成先息后本吗', 'sentence2': '借呗有先息到期还本吗', 'label': '0'}


DataLoader
接下来就需要通过 DataLoader 类按批 (batch) 加载数据，并且将样本转换成模型可以接受的输入格式。对于 NLP 任务，这个环节就是将每个 batch 中的文本按照预训练模型的格式进行编码（包括 Padding、截断等操作）。

我们通过手工编写 DataLoader 的批处理函数 collate_fn 来实现。首先加载分词器，然后对每个 batch 中的所有句子对进行编码，同时把标签转换为张量格式：

In [6]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

# Name of the pretrained checkpoint whose tokenizer is loaded below.
checkpoint = "bert-base-chinese"
# Module-level tokenizer; collote_fn reads this global when encoding batches.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def collote_fn(batch_samples):
    """Collate a batch of sentence-pair samples into model inputs and labels.

    Args:
        batch_samples: Iterable of dicts that each provide 'sentence1',
            'sentence2' and 'label' (the label must be convertible by int()).

    Returns:
        A tuple ``(X, y)``: ``X`` is the output of the module-level
        ``tokenizer`` applied to the sentence pairs with padding and
        truncation enabled and PyTorch tensors requested; ``y`` is a tensor
        holding the integer labels in batch order.
    """
    # Read each sample exactly once, in the order sentence1, sentence2, label.
    rows = [
        (item['sentence1'], item['sentence2'], int(item['label']))
        for item in batch_samples
    ]
    first_sentences = [row[0] for row in rows]
    second_sentences = [row[1] for row in rows]
    labels = [row[2] for row in rows]

    encoded = tokenizer(
        first_sentences,
        second_sentences,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    return encoded, torch.tensor(labels)

# Batch four samples at a time, in dataset order, encoding each batch with collote_fn.
train_dataloader = DataLoader(train_data, batch_size=4, shuffle=False, collate_fn=collote_fn)
# NOTE(review): the shuffle=True variant below is left disabled; presumably it is
# rejected because train_data is an IterableDataset at this point — confirm.
#train_dataloader = DataLoader(train_data, batch_size=4, shuffle=True, collate_fn=collote_fn)

# Fetch a single batch and print the tensor shapes followed by the raw contents.
batch_X, batch_y = next(iter(train_dataloader))
print('batch_X shape:', {k: v.shape for k, v in batch_X.items()})
print('batch_y shape:', batch_y.shape)
print(batch_X)
print(batch_y)

batch_X shape: {'input_ids': torch.Size([4, 30]), 'token_type_ids': torch.Size([4, 30]), 'attention_mask': torch.Size([4, 30])}
batch_y shape: torch.Size([4])
{'input_ids': tensor([[ 101, 6010, 6009,  955, 1446, 5023, 7583, 6820, 3621, 1377,  809, 2940,
         2768, 1044, 2622, 1400, 3315, 1408,  102,  955, 1446, 3300, 1044, 2622,
         1168, 3309, 6820, 3315, 1408,  102],
        [ 101, 6010, 6009, 5709, 1446, 6432, 2769, 6824, 5276,  671, 3613,  102,
         6010, 6009, 5709, 1446, 6824, 5276, 6121,  711, 3221,  784,  720,  102,
            0,    0,    0,    0,    0,    0],
        [ 101, 2376, 2769, 4692,  671,  678, 3315, 3299, 5709, 1446, 6572, 1296,
         3300, 3766, 3300, 5310, 3926,  102,  678, 3299, 5709, 1446, 6572, 1296,
          102,    0,    0,    0,    0,    0],
        [ 101, 6010, 6009,  955, 1446, 1914, 7270, 3198, 7313, 5341, 1394, 6397,
          844,  671, 3613,  102,  955, 1446, 2533, 6397,  844, 1914,  719,  102,
            0,    0,    0,    0,    0,   

这个错误是因为您的 train_data 是一个 IterableDataset，而 DataLoader 不允许对 IterableDataset 使用 shuffle=True 参数（构造 DataLoader 时会抛出 ValueError）。我来帮您修改代码并提供解决方案。

In [8]:
import torch
import random
from torch.utils.data import DataLoader, IterableDataset
from transformers import AutoTokenizer

# Name of the pretrained checkpoint whose tokenizer is loaded below.
checkpoint = "bert-base-chinese"
# Module-level tokenizer; collote_fn reads this global when encoding batches.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def collote_fn(batch_samples):
    """Collate a batch of sentence-pair samples into model inputs and labels.

    Args:
        batch_samples: Iterable of dicts that each provide 'sentence1',
            'sentence2' and 'label' (the label must be convertible by int()).

    Returns:
        A tuple ``(X, y)``: ``X`` is the output of the module-level
        ``tokenizer`` applied to the sentence pairs with padding and
        truncation enabled and PyTorch tensors requested; ``y`` is a tensor
        holding the integer labels in batch order.
    """
    # Read each sample exactly once, in the order sentence1, sentence2, label.
    rows = [
        (item['sentence1'], item['sentence2'], int(item['label']))
        for item in batch_samples
    ]
    first_sentences = [row[0] for row in rows]
    second_sentences = [row[1] for row in rows]
    labels = [row[2] for row in rows]

    encoded = tokenizer(
        first_sentences,
        second_sentences,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    return encoded, torch.tensor(labels)

# Approach 1: approximate shuffling for an IterableDataset via a sample buffer.
class ShuffledIterableDataset(IterableDataset):
    """Wrap an iterable dataset and emit its samples in buffer-shuffled order.

    Samples from the wrapped dataset are collected in a buffer. Once the
    buffer holds ``buffer_size`` samples, each newly read sample causes one
    uniformly chosen buffered sample to be emitted; when the source is
    exhausted, the remaining buffered samples are emitted in random order.
    Every source sample is emitted exactly once.

    NOTE(review): no per-worker sharding is done here; with a multi-worker
    DataLoader each worker would presumably replay the whole wrapped
    dataset — confirm before setting num_workers > 0.
    """

    def __init__(self, dataset, buffer_size=1000):
        """Store the wrapped dataset and the buffer capacity.

        Args:
            dataset: Any iterable of samples; it is re-iterated on every pass.
            buffer_size: Number of samples buffered before emission starts.
                A value of 1 (or less) emits samples in their source order.
        """
        self.dataset = dataset
        self.buffer_size = buffer_size

    def __iter__(self):
        """Yield every sample of the wrapped dataset once, in shuffled order."""
        buffer = []
        for sample in self.dataset:
            buffer.append(sample)
            if len(buffer) >= self.buffer_size:
                # Buffer is full: emit one randomly chosen buffered sample.
                yield self._pop_random(buffer)

        # Source exhausted: drain whatever is still buffered, in random order.
        while buffer:
            yield self._pop_random(buffer)

    @staticmethod
    def _pop_random(buffer):
        """Remove and return a uniformly chosen element of ``buffer`` in O(1)."""
        idx = random.randrange(len(buffer))
        # Swap the pick into the last slot so pop() does not have to shift
        # the tail of the list; order inside the buffer is irrelevant.
        buffer[idx], buffer[-1] = buffer[-1], buffer[idx]
        return buffer.pop()

# Wrap train_data so its samples come out in buffer-shuffled order.
shuffled_train_data = ShuffledIterableDataset(train_data, buffer_size=1000)

# Keep shuffle=False: the shuffling already happens inside ShuffledIterableDataset.
train_dataloader = DataLoader(shuffled_train_data, batch_size=4, shuffle=False, collate_fn=collote_fn)

# Fetch a single batch and print the tensor shapes followed by the raw contents.
batch_X, batch_y = next(iter(train_dataloader))
print('batch_X shape:', {k: v.shape for k, v in batch_X.items()})
print('batch_y shape:', batch_y.shape)
print(batch_X)
print(batch_y)

batch_X shape: {'input_ids': torch.Size([4, 67]), 'token_type_ids': torch.Size([4, 67]), 'attention_mask': torch.Size([4, 67])}
batch_y shape: torch.Size([4])
{'input_ids': tensor([[ 101, 2166,  802, 3221, 4500, 5709, 1446, 6820, 3221,  865, 7583,  102,
         5709, 1446,  679, 1377,  809, 4500, 2166,  802,  802, 3621, 1408,  102,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0],
        [ 101, 5709, 1446, 6818, 3309, 3833, 1220,  102, 5709, 1446, 3300, 3173,
         3833, 1220, 1408,  102,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,  

替代方案（更简单的方法）
如果您的数据集不是特别大，建议直接转换为Map-style数据集：

In [9]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

# Name of the pretrained checkpoint whose tokenizer is loaded below.
checkpoint = "bert-base-chinese"
# Module-level tokenizer; collote_fn reads this global when encoding batches.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def collote_fn(batch_samples):
    """Collate a batch of sentence-pair samples into model inputs and labels.

    Args:
        batch_samples: Iterable of dicts that each provide 'sentence1',
            'sentence2' and 'label' (the label must be convertible by int()).

    Returns:
        A tuple ``(X, y)``: ``X`` is the output of the module-level
        ``tokenizer`` applied to the sentence pairs with padding and
        truncation enabled and PyTorch tensors requested; ``y`` is a tensor
        holding the integer labels in batch order.
    """
    # Read each sample exactly once, in the order sentence1, sentence2, label.
    rows = [
        (item['sentence1'], item['sentence2'], int(item['label']))
        for item in batch_samples
    ]
    first_sentences = [row[0] for row in rows]
    second_sentences = [row[1] for row in rows]
    labels = [row[2] for row in rows]

    encoded = tokenizer(
        first_sentences,
        second_sentences,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    return encoded, torch.tensor(labels)

# Materialize the IterableDataset into a plain list (used as a map-style dataset);
# this reads every sample into memory.
train_data_list = list(train_data)

# With a list as the dataset, shuffle=True can now be used.
train_dataloader = DataLoader(train_data_list, batch_size=4, shuffle=True, collate_fn=collote_fn)

# Fetch a single batch and print the tensor shapes followed by the raw contents.
batch_X, batch_y = next(iter(train_dataloader))
print('batch_X shape:', {k: v.shape for k, v in batch_X.items()})
print('batch_y shape:', batch_y.shape)
print(batch_X)
print(batch_y)

batch_X shape: {'input_ids': torch.Size([4, 39]), 'token_type_ids': torch.Size([4, 39]), 'attention_mask': torch.Size([4, 39])}
batch_y shape: torch.Size([4])
{'input_ids': tensor([[ 101, 5709, 1446, 4385, 1762, 1108, 5310, 5543,  886, 4500, 1408,  102,
         2769, 4385, 5709, 1446, 6820, 5543, 4500,  102,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0],
        [ 101,  955, 1446, 1377,  809,  679, 4500, 1146, 3309, 6820, 3621, 1408,
          102,  955, 1446, 1146, 3309, 2797, 3322, 1377,  809,  671, 3613, 2595,
          802, 1408,  102,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0],
        [ 101, 1963, 3362, 6010, 6009,  955, 1446, 1728,  711, 2990, 1184, 6820,
         3621, 6158, 4881, 3632,  886, 4500, 8024,  809, 1400, 6820, 5543,  886,
         4500, 1408,  102, 6010, 6009,  955, 1446, 6874, 3309,  749, 6820, 5543,
         4500, 1408,  102],
        [ 101,

主要修改内容
修改1：添加了ShuffledIterableDataset类
功能：实现了缓冲池shuffle算法，专门为IterableDataset设计

原理：维护一个固定大小的缓冲区，从中随机选择样本输出

参数：buffer_size控制shuffle的效果（越大越随机）

修改2：包装原始数据集
将原始的 train_data 包装在 ShuffledIterableDataset 中

在 DataLoader 中使用 shuffle=False（因为shuffle已经在包装器中实现）

修改3：添加了必要的import
添加了 import random

添加了 from torch.utils.data import IterableDataset

💡 选择建议
如果数据集很大（无法全部加载到内存）：使用第一种方法（ShuffledIterableDataset）

如果数据集适中：使用第二种方法（转换为list），这样更简单直接

如果使用Hugging Face datasets：检查是否可以使用 .to_list() 方法转换