In [1]:
from EduNLP.Pretrain import PretrainedEduTokenizer, EduDataset
import os
import json


# Base directory, two levels above the working directory; the paths below are built from it.
BASE_DIR = "../.."
# Directory that stem_data() reads "standard_luna_data.json" from.
data_dir = f"{BASE_DIR}/static/test_data"
# Directory the tokenizer is saved to (reused below as pretrained_tokenizer_dir).
output_dir = f"{BASE_DIR}/data/pretrain_test_models/pretrain/"


def stem_data(data_path=None):
    """Load a JSON Lines corpus (one JSON document per line).

    Args:
        data_path: Path of the file to read. When omitted, falls back to
            ``standard_luna_data.json`` inside the notebook-level ``data_dir``.

    Returns:
        list: The decoded documents, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if data_path is None:
        data_path = os.path.join(data_dir, "standard_luna_data.json")
    with open(data_path, encoding="utf-8") as f:
        # Iterate the handle directly rather than f.readlines() so the file is
        # streamed line by line. Blank lines (e.g. a stray empty line at the end
        # of the file) are skipped because json.loads would raise on them.
        return [json.loads(line) for line in f if line.strip()]

# Training corpus: every document loaded from standard_luna_data.json.
train_items = stem_data()

# Two hand-written items, each a dict with a single 'ques_content' string.
# The first one additionally carries \FormFigureID / \FormFigureBase64 markers
# (ids "wrong1?" / "wrong2?" — presumably deliberately invalid; confirm intent).
# NOTE: the trailing backslash-newline sits inside the string literal, so the
# leading spaces of each continuation line become part of the string.
test_items = [
    {'ques_content': '有公式$\\FormFigureID{wrong1?}$和公式$\\FormFigureBase64{wrong2?}$，\
            如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,\
            若$x,y$满足约束条件$\\SIFSep$，则$z=x+7 y$的最大值为$\\SIFBlank$'},
    {'ques_content': '如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$, \
            若$x,y$满足约束条件$\\SIFSep$，则$z=x+7 y$的最大值为$\\SIFBlank$'}
]

  import cryptography.exceptions


# 1. PretrainedEduTokenizer

该类主要用于处理预训练模型的输入语料，主要成分包括词表（vocab）和基础令牌化容器，负责将输入语料处理为适合模型的输入格式。

## 1.1 构造令牌化容器

In [2]:
# Corpus handed to set_vocab below: the loaded training items plus the two test items.
corpus_items = train_items + test_items

# Tokenizer construction parameters
tokenizer_params = {
    "add_specials": True,
    "tokenize_method": "pure_text",
}

tokenizer = PretrainedEduTokenizer(**tokenizer_params)
# len(tokenizer) before any corpus is supplied (14 in the recorded output).
print(len(tokenizer))


# Set the pretraining corpus to train the tokenizer; `key` pulls the text out of each item dict
tokenizer.set_vocab(corpus_items, key=lambda x: x['ques_content'])
# len(tokenizer) after set_vocab (306 in the recorded output).
print(len(tokenizer))

# Save the tokenizer
pretrained_tokenizer_dir = output_dir
tokenizer.save_pretrained(pretrained_tokenizer_dir)

14
306


## 1.2 使用令牌化容器

In [4]:
# Load the tokenizer saved by the previous cell
tokenizer = PretrainedEduTokenizer.from_pretrained(pretrained_tokenizer_dir)

# Pad per batch (recorded output: seq_idx has shape [2, 17])
encodes = tokenizer(test_items, key=lambda x: x['ques_content'])
print(list(encodes.keys()))
print(encodes["seq_idx"].shape)
print()

# Pad to max_length (recorded output: seq_idx has shape [2, 100])
encodes = tokenizer(test_items, key=lambda x: x['ques_content'], padding="max_length", max_length=100)
print(list(encodes.keys()))
print(encodes["seq_idx"].shape)
print()

# Do not return tensors (recorded output: seq_idx prints as nested Python lists)
encodes = tokenizer(test_items, key=lambda x: x['ques_content'], padding="max_length", max_length=100, return_tensors=False)
print(encodes["seq_idx"])
print()

# Keep the tokens (recorded output: the keys additionally include 'seq_token')
encodes = tokenizer(test_items, key=lambda x: x['ques_content'], padding="max_length", max_length=100, return_text=True)
print(list(encodes.keys()))
print()

['seq_idx', 'seq_len']
torch.Size([2, 17])

['seq_idx', 'seq_len']
torch.Size([2, 100])

[[305, 305, 238, 6, 20, 33, 86, 166, 9, 40, 17, 20, 41, 140, 86, 175, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [238, 6, 20, 33, 86, 166, 9, 40, 17, 20, 41, 140, 86, 175, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

['seq_idx', 'seq_len', 'seq_token']



## 1.3 其他操作

扩充词表

编码/解码 句子

修改基础令牌化容器

# EduDataset

# BaseModel