# 基于Transformer的多项选择

## 1.导包

In [88]:
import evaluate
# from accelerate.test_utils.scripts.external_deps.test_ds_alst_ulysses_sp import tokenizer
from datasets import Dataset,DatasetDict,load_dataset
from nltk import accuracy
from nltk.parse.corenlp import transform
from pandas.core.computation.parsing import tokenize_backtick_quoted_string
from transformers import AutoTokenizer,AutoModelForMultipleChoice,TrainingArguments,Trainer

## 2.加载数据集

In [89]:
c3 = load_dataset("dataset-org/c3", "mixed")
c3

DatasetDict({
    train: Dataset({
        features: ['documents', 'document_id', 'questions'],
        num_rows: 3138
    })
    test: Dataset({
        features: ['documents', 'document_id', 'questions'],
        num_rows: 1045
    })
    validation: Dataset({
        features: ['documents', 'document_id', 'questions'],
        num_rows: 1046
    })
})

In [90]:
def c3_function(examples):
    examples["context"] = examples["documents"]
    examples["question"] = [q["question"] for q in examples["questions"]]
    examples["answer"] = [q["answer"] for q in examples["questions"]]
    examples["choice"] = [q["choice"] for q in examples["questions"]]

    del examples["questions"]
    del examples["document_id"]
    del examples["documents"]
    return examples
c3 = c3.map(c3_function,batched=True)

In [91]:
c3

DatasetDict({
    train: Dataset({
        features: ['context', 'question', 'answer', 'choice'],
        num_rows: 3138
    })
    test: Dataset({
        features: ['context', 'question', 'answer', 'choice'],
        num_rows: 1045
    })
    validation: Dataset({
        features: ['context', 'question', 'answer', 'choice'],
        num_rows: 1046
    })
})

In [92]:
c3["test"][0]

{'context': ['一个孕妇在路上行走。一个小女孩走过来问她：“阿姨，您的肚子为什么这么大?”孕妇说：“因为肚子里有孩子啊!”小女孩说：“阿姨，您是怕麻烦吧?”孕妇说：“啊?为什么?”小女孩说：“您嫌孩子抱着不方便，就把他放进肚子里了嘛。”'],
 'question': ['请选出与试题内容一致的一项。'],
 'answer': ['小女孩认为孕妇怕麻烦'],
 'choice': [['小女孩怕麻烦', '小女孩认为孕妇怕麻烦', '孕妇怕麻烦', '孕妇很聪明']]}

In [93]:
c3.pop("test")

Dataset({
    features: ['context', 'question', 'answer', 'choice'],
    num_rows: 1045
})

In [94]:
c3["train"]["question"][0]

['动物的器官感觉与人的相比有什么不同?',
 '录音中提到能预报风暴的动物是什么?',
 '低频声波至少要达到每秒多少次才能被人感觉到?',
 '动物感觉到低频声波时会有怎样的表现?']

## 3.数据预处理

In [95]:
tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-base", clean_up_tokenization_spaces=True)

In [96]:
# 打印1条样本，检查context/question/choice/answer的类型
sample = c3["train"][0]
print("context类型:", type(sample["context"]), "值:", sample["context"])
print("question类型:", type(sample["question"]), "值:", sample["question"])
print("choice类型:", type(sample["choice"]), "值:", sample["choice"])
print("answer类型:", type(sample["answer"]), "值:", sample["answer"])

context类型: <class 'list'> 值: ['许多动物的某些器官感觉特别灵敏，它们能比人类提前知道一些灾害事件的发生，例如，海洋中的水母能预报风暴，老鼠能事先躲避矿井崩塌或有害气体，等等。地震往往能使一些动物的某些感觉器官受到刺激而发生异常反应。如一个地区的重力发生变异，某些动物可能通过它们的平衡器官感觉到；一种振动异常，某些动物的听觉器官也许能够察觉出来。地震前地下岩层早已在逐日缓慢活动，而断层面之间又具有强大的摩擦力。这种摩擦力会产生一种低于人的听觉所能感觉到的低频声波。人对每秒20次以上的声波才能感觉到，而动物则不然。那些感觉十分灵敏的动物，在感触到这种低声波时，便会惊恐万状，以至出现冬蛇出洞、鱼跃水面等异常现象。']
question类型: <class 'list'> 值: ['动物的器官感觉与人的相比有什么不同?', '录音中提到能预报风暴的动物是什么?', '低频声波至少要达到每秒多少次才能被人感觉到?', '动物感觉到低频声波时会有怎样的表现?']
choice类型: <class 'list'> 值: [['没有人的灵敏', '和人的差不多', '和人的一样好', '比人的灵敏'], ['蛇', '老鼠', '水母', '鱼'], ['20次', '20次以上', '20次以下', '以上都对'], ['兴奋', '逃跑', '跳跃', '害怕']]
answer类型: <class 'list'> 值: ['比人的灵敏', '水母', '20次以上', '害怕']


In [97]:
def process_function(examples):
    batch_size = len(examples["context"])
    all_input_ids = []
    all_attention_mask = []
    all_labels = []

    for i in range(batch_size):
        ctx = "\n".join(examples["context"][i])
        question = "".join(examples["question"][i])
        choices_data = examples["choice"][i]

        # 处理选项
        processed_choices = []
        for choice in choices_data:
            if isinstance(choice, list):
                choice_str = "".join(choice)
            else:
                choice_str = str(choice)
            processed_choices.append(choice_str.strip())

        # 处理答案
        answer_data = examples["answer"][i]
        if isinstance(answer_data, list):
            answer_str = "".join(answer_data).strip()
        else:
            answer_str = str(answer_data).strip()

        # 查找答案索引
        try:
            label = processed_choices.index(answer_str)
        except ValueError:
            label = 0

        # 为当前样本准备输入
        contexts = []
        questions = []
        for choice in processed_choices:
            contexts.append(ctx)
            questions.append(question + " " + choice)

        # 确保有4个选项
        while len(contexts) < 4:
            contexts.append(ctx)
            questions.append(question + " " + "不知道")

        # 只取前4个选项
        contexts = contexts[:4]
        questions = questions[:4]

        # Tokenize
        tokenized = tokenizer(
            contexts,
            questions,
            truncation="only_first",
            max_length=256,
            padding="max_length",
            return_tensors=None
        )

        # 添加到批次
        all_input_ids.append(tokenized["input_ids"])
        all_attention_mask.append(tokenized["attention_mask"])
        all_labels.append(label)

    return {
        "input_ids": all_input_ids,
        "attention_mask": all_attention_mask,
        "labels": all_labels
    }

# 重新处理数据


In [98]:
res = c3["train"].select(range(10)).map(process_function,batched=True)
print(res)

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Dataset({
    features: ['context', 'question', 'answer', 'choice', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 10
})


In [99]:
import numpy as np
np.array(res["input_ids"]).shape

(10, 4, 256)

In [100]:
tokenized_c3 = c3.map(process_function, batched=True, batch_size=1000)
tokenized_c3

DatasetDict({
    train: Dataset({
        features: ['context', 'question', 'answer', 'choice', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3138
    })
    validation: Dataset({
        features: ['context', 'question', 'answer', 'choice', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1046
    })
})

## 4.创建模型

In [101]:
model = AutoModelForMultipleChoice.from_pretrained("hfl/chinese-macbert-base")

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at hfl/chinese-macbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 5.评估函数

In [102]:
import numpy as np
accuracy = evaluate.load("accuracy")

def compute_metric(pred):
    predictions, labels = pred
    predictions = np.argmax(predictions,axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)

## 6.配置训练参数

In [103]:
args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    eval_strategy="epoch",
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=500,
    remove_unused_columns=False,  # 重要：不要自动删除未使用的列
)

## 7.创建训练器

In [104]:
from transformers import DataCollatorForMultipleChoice, TrainingArguments, Trainer

# 使用 DataCollatorForMultipleChoice
data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)

# 更新训练器初始化，使用 processing_class 替代已弃用的 tokenizer 参数
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_c3["train"],
    eval_dataset=tokenized_c3["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
)

## 8.训练

In [105]:
# trainer.train()

## 9.预测