In [1]:
import torch
import numpy as np
import os
import json
from EduNLP.Pretrain import BertTokenizer, finetune_bert
from EduNLP.Vector import T2V
from EduNLP.I2V import Bert, get_pretrained_i2v

import os
os.environ["WANDB_DISABLED"] = "true"



# 训练自己的Bert模型
## 1. 数据

In [2]:
# 设置你的数据路径和输出路径
BASE_DIR = "/your/own/base/path"

data_dir = f"{BASE_DIR}/static/test_data/OpenLUNA"
output_dir = f"{BASE_DIR}/examples/test_model/data/bert"

In [3]:
def raw_data():
    _data = []
    data_path = os.path.join(data_dir, "OpenLUNA.json")
    with open(data_path, encoding="utf-8") as f:
        for line in f.readlines():
            _data.append(json.loads(line))
    return _data

def stem_data(data):
    _data = []
    tokenizer = BertTokenizer()
    for e in data:
        d = tokenizer(e["stem"])
        if d is not None:
            _data.append(d)
    assert _data
    return _data

raw_data = raw_data()
train_items = stem_data(raw_data)

## 2. 训练和评估

In [4]:
# 自定义训练参数
train_params = {
  'epochs': 1,
  'save_steps': 5,
  'batch_size': 4,
  'logging_steps': 3
}

finetune_bert(
  train_items[:50],
  output_dir,
  train_params=train_params
)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
***** Running training *****
  Num examples = 50
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumula

  0%|          | 0/13 [00:00<?, ?it/s]

{'loss': 1.307, 'learning_rate': 3.846153846153846e-05, 'epoch': 0.23}


Saving model checkpoint to /Users/yutingning/Desktop/Project/EduNLP//examples/test_model/data/bert/checkpoint-5
Configuration saved in /Users/yutingning/Desktop/Project/EduNLP//examples/test_model/data/bert/checkpoint-5/config.json
Model weights saved in /Users/yutingning/Desktop/Project/EduNLP//examples/test_model/data/bert/checkpoint-5/pytorch_model.bin
tokenizer config file saved in /Users/yutingning/Desktop/Project/EduNLP//examples/test_model/data/bert/checkpoint-5/tokenizer_config.json
Special tokens file saved in /Users/yutingning/Desktop/Project/EduNLP//examples/test_model/data/bert/checkpoint-5/special_tokens_map.json


{'loss': 1.9422, 'learning_rate': 2.6923076923076923e-05, 'epoch': 0.46}
{'loss': 1.5072, 'learning_rate': 1.5384615384615387e-05, 'epoch': 0.69}


Saving model checkpoint to /Users/yutingning/Desktop/Project/EduNLP//examples/test_model/data/bert/checkpoint-10
Configuration saved in /Users/yutingning/Desktop/Project/EduNLP//examples/test_model/data/bert/checkpoint-10/config.json
Model weights saved in /Users/yutingning/Desktop/Project/EduNLP//examples/test_model/data/bert/checkpoint-10/pytorch_model.bin
tokenizer config file saved in /Users/yutingning/Desktop/Project/EduNLP//examples/test_model/data/bert/checkpoint-10/tokenizer_config.json
Special tokens file saved in /Users/yutingning/Desktop/Project/EduNLP//examples/test_model/data/bert/checkpoint-10/special_tokens_map.json


{'loss': 1.8462, 'learning_rate': 3.846153846153847e-06, 'epoch': 0.92}




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to /Users/yutingning/Desktop/Project/EduNLP//examples/test_model/data/bert
Configuration saved in /Users/yutingning/Desktop/Project/EduNLP//examples/test_model/data/bert/config.json


{'train_runtime': 87.1213, 'train_samples_per_second': 0.574, 'train_steps_per_second': 0.149, 'train_loss': 1.5740307661203237, 'epoch': 1.0}


Model weights saved in /Users/yutingning/Desktop/Project/EduNLP//examples/test_model/data/bert/pytorch_model.bin
tokenizer config file saved in /Users/yutingning/Desktop/Project/EduNLP//examples/test_model/data/bert/tokenizer_config.json
Special tokens file saved in /Users/yutingning/Desktop/Project/EduNLP//examples/test_model/data/bert/special_tokens_map.json
tokenizer config file saved in /Users/yutingning/Desktop/Project/EduNLP//examples/test_model/data/bert/tokenizer_config.json
Special tokens file saved in /Users/yutingning/Desktop/Project/EduNLP//examples/test_model/data/bert/special_tokens_map.json


## 3.使用模型

### 3.1 使用训练好的Bert模型

In [5]:
item = [
        {'stem': '如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$, \
        若$x,y$满足约束条件$\\SIFSep$，则$z=x+7 y$的最大值为$\\SIFBlank$'},
        {'stem': '已知圆$x^{2}+y^{2}-6 x=0$，过点(1,2)的直线被该圆所截得的弦的长度的最小值为'}
]

tokenizer_kwargs = {"tokenizer_config_dir": output_dir}
i2v = Bert('bert', 'bert', output_dir, tokenizer_kwargs=tokenizer_kwargs)

# 可以对单个题目进行表征
i_vec, t_vec = i2v(item[0]['stem'])
print(i_vec.shape) # == torch.Size([x, x])
print(t_vec.shape) # == torch.Size([x, x, x])
print()

# 也可以对题目列表进行表征
i_vec, t_vec = i2v([ item[0]['stem'], item[1]['stem'] ])
print(i_vec.shape) # == torch.Size([x, x])
print(t_vec.shape) # == torch.Size([x, x, x])

loading configuration file /Users/yutingning/Desktop/Project/EduNLP//examples/test_model/data/bert/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-chinese",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.10.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21132
}

loading w

torch.Size([1, 768])
torch.Size([1, 21, 768])

torch.Size([2, 768])
torch.Size([2, 32, 768])


### 3.2 使用BertTokenizer

In [6]:
# 在Bert-base-chinese的基础上初始化tokenizer
tokenizer = BertTokenizer(add_special_tokens=True, pretrain_model="bert-base-chinese")

# 对题目文本进行令牌化
items = [
    "有公式$\\FormFigureID{wrong1?}$，如图$\\FigureID{088f15ea-xxx}$,\
    若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$，则$z=x+7 y$的最大值为$\\SIFBlank$",
    "已知圆$x^{2}+y^{2}-6 x=0$，过点(1,2)的直线被该圆所截得的弦的长度的最小值为"
]

# 可以对单个题目进行令牌化
print(tokenizer(items[0]))
print()

# 也可以对题目列表进行令牌化
token_items = tokenizer(items)
print(token_items)
print()

# 可以使用return_tensors参数指定返回张量的类型
print(tokenizer(items[0], return_tensors='pt'))

loading configuration file https://huggingface.co/bert-base-chinese/resolve/main/config.json from cache at /Users/yutingning/.cache/huggingface/transformers/6cc404ca8136bc87bae0fb24f2259904943d776a6c5ddc26598bbdc319476f42.0f9bcd8314d841c06633e7b92b04509f1802c16796ee67b0f1177065739e24ae
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "tran

{'input_ids': [101, 1062, 2466, 1963, 1745, 21129, 166, 117, 167, 5276, 3338, 3340, 816, 1062, 2466, 102, 168, 134, 166, 116, 128, 8179, 3297, 1920, 966, 21130, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

{'input_ids': [[101, 1062, 2466, 1963, 1745, 21129, 166, 117, 167, 5276, 3338, 3340, 816, 1062, 2466, 102, 168, 134, 166, 116, 128, 8179, 3297, 1920, 966, 21130, 102, 0, 0, 0, 0, 0, 0, 0], [101, 2347, 4761, 1749, 166, 141, 169, 123, 171, 116, 167, 141, 169, 123, 171, 118, 127, 8206, 134, 121, 6814, 4157, 4684, 5296, 1749, 2779, 2533, 2478, 7270, 2428, 3297, 2207, 966, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1

In [7]:
# 可以使用tokenize方法查看令牌化后的文本
print(tokenizer.tokenize(items[0]))
print(tokenizer.tokenize(items))

['公', '式', '如', '图', '[FIGURE]', 'x', ',', 'y', '约', '束', '条', '件', '公', '式', '[SEP]', 'z', '=', 'x', '+', '7', '##y', '最', '大', '值', '[MARK]']
[['公', '式', '如', '图', '[FIGURE]', 'x', ',', 'y', '约', '束', '条', '件', '公', '式', '[SEP]', 'z', '=', 'x', '+', '7', '##y', '最', '大', '值', '[MARK]'], ['已', '知', '圆', 'x', '^', '{', '2', '}', '+', 'y', '^', '{', '2', '}', '-', '6', '##x', '=', '0', '过', '点', '直', '线', '圆', '截', '得', '弦', '长', '度', '最', '小', '值']]


## 3.3 使用EduNLP中公开的预训练模型

In [8]:
# 获取公开的预训练模型
pretrained_dir = f"{BASE_DIR}/examples/test_model/data/bert"
i2v = get_pretrained_i2v("luna_pub_bert_math_base", model_dir=pretrained_dir)

EduNLP, INFO model_path: /Users/yutingning/Desktop/Project/EduNLP/examples/test_model/data/bert/luna_pub_bert_math_base
EduNLP, INFO Use pretrained t2v model luna_pub_bert_math_base
downloader, INFO http://base.ustc.edu.cn/data/model_zoo/modelhub/bert_pub/1/luna_pub_bert_math_base.zip is saved as /Users/yutingning/Desktop/Project/EduNLP/examples/test_model/data/bert/luna_pub_bert_math_base.zip
downloader, INFO file existed, skipped
loading configuration file /Users/yutingning/Desktop/Project/EduNLP/examples/test_model/data/bert/luna_pub_bert_math_base/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-chinese",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddi

In [11]:
items = [
    "有公式$\\FormFigureID{wrong1?}$，如图$\\FigureID{088f15ea-xxx}$,\
    若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$，则$z=x+7 y$的最大值为$\\SIFBlank$",
    "已知圆$x^{2}+y^{2}-6 x=0$，过点(1,2)的直线被该圆所截得的弦的长度的最小值为"
]
i_vec, t_vec = i2v(items)
print(i_vec.shape)
print(t_vec.shape)
print()

# 也可以单独获取题目表征和各个token的表征
i_vec = i2v.infer_item_vector(items)
print(i_vec.shape)
t_vec = i2v.infer_token_vector(items)
print(t_vec.shape)
print()

# 同样，可以获取单个题目的表征
i_vec, t_vec = i2v(item[0])
print(i_vec.shape)
print(t_vec.shape)

torch.Size([2, 768])
torch.Size([2, 32, 768])

torch.Size([2, 768])
torch.Size([2, 32, 768])

torch.Size([1, 768])
torch.Size([1, 2, 768])
