In [1]:
from EduNLP.Pretrain import PretrainedEduTokenizer, EduDataset
import os
import json


# Root that every other path in this notebook is derived from.
BASE_DIR = "../.."
# Directory the sample data file is read from.
data_dir = BASE_DIR + "/static/test_data"
# Directory the tokenizer and the processed dataset are saved into.
output_dir = BASE_DIR + "/data/pretrain_test_models/pretrain/"


def stem_data():
    """Load the sample items from ``standard_luna_data.json`` under ``data_dir``.

    The file is read as JSON Lines: each non-blank line is parsed as one JSON
    document. Blank or whitespace-only lines (for example a trailing empty
    line) are skipped instead of being handed to ``json.loads``, which would
    raise on them.

    Returns
    -------
    list
        The parsed JSON values, in file order.

    Raises
    ------
    OSError
        If the data file cannot be opened.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    data_path = os.path.join(data_dir, "standard_luna_data.json")
    with open(data_path, encoding="utf-8") as f:
        # Iterate the handle directly so lines are read lazily rather than
        # being materialised all at once by readlines().
        return [json.loads(line) for line in f if line.strip()]

# Training corpus: every record that stem_data() parses from the sample data file.
train_items = stem_data()

# Two hand-written items, each a dict with a single 'ques_content' string.
# NOTE: the trailing backslashes sit inside the string literals, so each one
# joins the next physical line into the same string, and that line's leading
# spaces become part of the text.
test_items = [
    {'ques_content': '有公式$\\FormFigureID{wrong1?}$和公式$\\FormFigureBase64{wrong2?}$，\
            如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,\
            若$x,y$满足约束条件$\\SIFSep$，则$z=x+7 y$的最大值为$\\SIFBlank$'},
    {'ques_content': '如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$, \
            若$x,y$满足约束条件$\\SIFSep$，则$z=x+7 y$的最大值为$\\SIFBlank$'}
]



# 1. PretrainedEduTokenizer

该类用于处理预训练模型的输入语料，主要由词表（vocab）和基础令牌化容器组成，负责将输入语料转换为适合模型的输入格式。

## 1.1 构造令牌化容器

In [2]:
# The corpus is the training items followed by the hand-written test items.
corpus_items = [*train_items, *test_items]

# Options for the tokenizer itself.
tokenizer_params = dict(add_specials=True, tokenize_method="pure_text")
# Options for pure_text tokenization; see Tokenizer/PureTextTokenizer.
text_params = dict(granularity="char", stopwords=None)

tokenizer = PretrainedEduTokenizer(text_params=text_params, **tokenizer_params)
print(len(tokenizer))  # length before set_vocab is called

# Set the pretraining corpus; the text of each item is read from 'ques_content'.
tokenizer.set_vocab(corpus_items, key=lambda item: item['ques_content'])
print(len(tokenizer))  # length after set_vocab

# Save the tokenizer.
pretrained_tokenizer_dir = output_dir
tokenizer.save_pretrained(pretrained_tokenizer_dir)

14


Dump cache file failed.
Traceback (most recent call last):
  File "/data/qlh/anaconda3/envs/py36/lib/python3.6/site-packages/jieba/__init__.py", line 154, in initialize
    _replace_file(fpath, cache_file)
PermissionError: [Errno 1] Operation not permitted: '/tmp/tmpk245c2ok' -> '/tmp/jieba.cache'


379


## 1.2 使用令牌化容器

In [3]:
# Load the tokenizer back from the directory it was saved to.
tokenizer = PretrainedEduTokenizer.from_pretrained(pretrained_tokenizer_dir)


def _ques_content(item):
    """Return the question text stored under an item's 'ques_content' key."""
    return item['ques_content']


# Keyword arguments shared by the fixed-length calls below.
fixed_length = dict(padding="max_length", max_length=100)

# Pad per batch.
encodes = tokenizer(test_items, key=_ques_content)
print(list(encodes.keys()))
print(encodes["seq_idx"].shape, end="\n\n")

# Pad to max_length.
encodes = tokenizer(test_items, key=_ques_content, **fixed_length)
print(list(encodes.keys()))
print(encodes["seq_idx"].shape, end="\n\n")

# Do not return tensors.
encodes = tokenizer(test_items, key=_ques_content, return_tensors=False, **fixed_length)
print(encodes["seq_idx"], end="\n\n")

# Keep the tokens in the result.
encodes = tokenizer(test_items, key=_ques_content, return_text=True, **fixed_length)
print(list(encodes.keys()), end="\n\n")

['seq_idx', 'seq_len']
torch.Size([2, 17])

['seq_idx', 'seq_len']
torch.Size([2, 100])

[[1, 1, 1, 6, 22, 35, 130, 1, 9, 45, 19, 22, 46, 211, 130, 1, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 6, 22, 35, 130, 1, 9, 45, 19, 22, 46, 211, 130, 1, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

['seq_idx', 'seq_len', 'seq_token']



## 1.3 其他操作

扩充词表

In [4]:
# Special tokens held by the vocabulary before anything is added.
print(tokenizer.vocab._special_tokens, end="\n\n")

# Add a special token.
new_special = "[special]"
tokenizer.add_specials([new_special])
print(tokenizer.tokenize(new_special))
print(tokenizer.vocab._special_tokens, end="\n\n")

# Add an ordinary token.
new_token = "[token]"
tokenizer.add_tokens([new_token])
print(tokenizer.tokenize(new_token))

['[PAD]', '[UNK]', '[BOS]', '[EOS]', '[TEXT]', '[FORMULA]', '[FIGURE]', '[MARK]', '[TAG]', '[SEP]', '[TEXT_BEGIN]', '[TEXT_END]', '[FORMULA_BEGIN]', '[FORMULA_END]', '[TEXT]', '[FORMULA]', '[FIGURE]', '[MARK]', '[TAG]', '[SEP]', '[TEXT_BEGIN]', '[TEXT_END]', '[FORMULA_BEGIN]', '[FORMULA_END]']

['special']
['[PAD]', '[UNK]', '[BOS]', '[EOS]', '[TEXT]', '[FORMULA]', '[FIGURE]', '[MARK]', '[TAG]', '[SEP]', '[TEXT_BEGIN]', '[TEXT_END]', '[FORMULA_BEGIN]', '[FORMULA_END]', '[TEXT]', '[FORMULA]', '[FIGURE]', '[MARK]', '[TAG]', '[SEP]', '[TEXT_BEGIN]', '[TEXT_END]', '[FORMULA_BEGIN]', '[FORMULA_END]', '[special]']

['token']


编码/解码句子

In [5]:
# Encode a sentence to indices, then decode those indices back to tokens.
sentence = '公式 公 式'

encode_idxs = tokenizer.encode(sentence)
print(encode_idxs)

encode_tokens = tokenizer.decode(encode_idxs)
print(encode_tokens)

[1, 370, 371]
['[UNK]', '公', '式']


修改基础令牌化容器

In [6]:
# Customisable options passed along as formula_params.
formula_params = dict(skip_figure_formula=True, symbolize_figure_formula=False)

# Switch the basic tokenizer to "ast_formula" with the options above.
tokenizer._set_basic_tokenizer("ast_formula", formula_params=formula_params)

保存与加载

In [7]:
save_dir = "./tmp"

# Save the tokenizer, then load it back from the same directory.
tokenizer.save_pretrained(save_dir)
tokenizer = PretrainedEduTokenizer.from_pretrained(save_dir)

# EduDataset

## 直接使用

In [8]:
# Build an EduDataset from the training items, giving only the stem key.
dataset = EduDataset(
    tokenizer,
    items=train_items,
    stem_key="ques_content",
)
first_sample = dataset[0]
print(first_sample.keys())

  0%|          | 0/1 [00:00<?, ?ba/s]

dict_keys(['seq_idx', 'seq_len'])


In [9]:
# Same construction, additionally naming "difficulty" as the label key.
dataset = EduDataset(
    tokenizer,
    items=train_items,
    stem_key="ques_content",
    label_key="difficulty",
)
first_sample = dataset[0]
print(first_sample.keys())

  0%|          | 0/1 [00:00<?, ?ba/s]

dict_keys(['labels', 'seq_idx', 'seq_len'])


In [10]:
# Same construction, naming both a label key and an extra feature key.
dataset = EduDataset(
    tokenizer,
    items=train_items,
    stem_key="ques_content",
    label_key="difficulty",
    feature_keys=["know_list"],
)
first_sample = dataset[0]
print(first_sample.keys())

  0%|          | 0/1 [00:00<?, ?ba/s]

dict_keys(['know_list', 'labels', 'seq_idx', 'seq_len'])


## 保存与加载

考虑到预处理耗时久，若希望下次能直接使用处理后的数据，可将预处理后的数据保存在本地。

In [11]:
# Save the processed dataset under output_dir.
# NOTE(review): the first statement of the next cell repeats this exact call.
dataset.to_disk(output_dir)

In [12]:
# Save
dataset.to_disk(output_dir)

# Load from disk without naming any label or feature keys.
dataset1 = EduDataset(tokenizer, ds_disk_path=output_dir)
print(dataset1[0].keys())

# Load from disk again, this time naming the label key and the feature keys.
reload_options = dict(label_key="difficulty", feature_keys=["know_list"])
dataset2 = EduDataset(tokenizer, ds_disk_path=output_dir, **reload_options)
print(dataset2[0].keys())

dict_keys(['seq_idx', 'seq_len'])
dict_keys(['know_list', 'labels', 'seq_idx', 'seq_len'])


## 并行预处理
在题目数据量过大时，令牌化等预处理操作耗时较长，可通过并行处理加速。

In [13]:
import time


def _timed_build(**dataset_kwargs):
    """Build an EduDataset over ``train_items * 100`` and print the elapsed time.

    Extra keyword arguments are forwarded to ``EduDataset``. The timed region
    covers constructing the dataset and printing the keys of its first sample.

    Returns
    -------
    EduDataset
        The dataset that was built.
    """
    # perf_counter() is the clock intended for measuring intervals;
    # time.time() follows the wall clock and can jump if that is adjusted.
    start = time.perf_counter()
    built = EduDataset(tokenizer, items=train_items*100,
                       stem_key="ques_content", **dataset_kwargs)
    print(built[0].keys())
    elapsed = time.perf_counter() - start
    print(f"spand time: {elapsed:.4}s")
    return built


# With parallel preprocessing.
dataset = _timed_build(num_processor=4)

# Without parallel preprocessing.
dataset = _timed_build()

       

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

dict_keys(['seq_idx', 'seq_len'])
spand time: 1.641s


  0%|          | 0/3 [00:00<?, ?ba/s]

dict_keys(['seq_idx', 'seq_len'])
spand time: 4.484s
