In [1]:
import torch
import numpy as np
import os
from EduNLP.Pretrain import DisenQTokenizer, train_disenqnet
from EduNLP.Vector import DisenQModel, T2V
from EduNLP.I2V import DisenQ, get_pretrained_i2v
from EduNLP.ModelZoo import load_items

os.environ["WANDB_DISABLED"] = "true"
os.environ["CUDA_VISIBLE_DEVICES"]= "0,1"
if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")



# 训练自己的disenQNet模型
## 1. 数据

In [2]:
BASE_DIR = "../.."

data_dir = f"{BASE_DIR}/static/test_data"
output_dir = f"{BASE_DIR}/examples/test_model/data/disenq"

disen_data_train = load_items(f"{data_dir}/disenq_train.json")
disen_data_test = load_items(f"{data_dir}/disenq_test.json")

## 2. 训练和评估

In [3]:
tokenizer = DisenQTokenizer(max_length=250, tokenize_method="space")

train_params = {
    # data params
    "trim_min": 2,
    "w2v_workers": 1,
    # model params
    "hidden": 128,
    "dropout": 0.2,
    "pos_weight": 1,
    "cp": 1.5,
    "mi": 1.0,
    "dis": 2.0,
    # training params
    "epoch": 5,
    "batch": 64,
    "lr": 1e-3,
    "step": 20,
    "gamma": 0.5,
    "warm_up": 1,
    "adv": 10,
    "device": "cuda"
}
data_formation = {
    "content": "content",
    "knowledge": "knowledge"
}
train_disenqnet(
    disen_data_train,
    tokenizer,
    output_dir,
    output_dir,
    train_params=train_params,
    test_items=disen_data_test,
)

load vocab from ../../examples/test_model/data/disenq\vocab.list
load concept from ../../examples/test_model/data/disenq\concept.list
load word2vec from ../../examples/test_model/data/disenq\wv.th
processing raw data for QuestionDataset...
vocab size: 6827
concept size: 5
load vocab from ../../examples/test_model/data/disenq\vocab.list
load concept from ../../examples/test_model/data/disenq\concept.list
load word2vec from ../../examples/test_model/data/disenq\wv.th
processing raw data for QuestionDataset...
Start training the disenQNet...
[Epoch  1] train loss: 1.5524
[Epoch  2] train loss: 1.5753, eval loss: 1.5866
[Epoch  3] train loss: 1.4864, eval loss: 1.4997
[Epoch  4] train loss: 1.4471, eval loss: 1.4667
[Epoch  5] train loss: 1.3798, eval loss: 1.4123
[Epoch  6] train loss: 1.3078, eval loss: 1.3586


## 3.使用模型

### 3.1 使用I2V将题目转为向量

In [4]:
tokenizer_kwargs = {
    "tokenizer_config_dir": output_dir,
}
i2v = DisenQ('disenq', 'disenq', output_dir, tokenizer_kwargs=tokenizer_kwargs, device="cuda")

test_items = [
    {"content": "10 米 的 (2/5) = 多少 米 的 (1/2),有 公 式"},
    {"content": "10 米 的 (2/5) = 多少 米 的 (1/2),有 公 式 , 如 图 , 若 $x,y$ 满 足 约 束 条 件 公 式"},
]

t_vec = i2v.infer_token_vector(test_items, key=lambda x: x["content"])
i_vec_k = i2v.infer_item_vector(test_items, key=lambda x: x["content"], vector_type="k")
i_vec_i = i2v.infer_item_vector(test_items, key=lambda x: x["content"], vector_type="i")

print(t_vec.shape) # == torch.Size([2, 23, 128])
print(i_vec_k.shape) # == torch.Size([2, 128])
print(i_vec_i.shape) # == torch.Size([2, 128])

t_vec = i2v.infer_token_vector(test_items[0], key=lambda x: x["content"])
i_vec_k = i2v.infer_item_vector(test_items[0], key=lambda x: x["content"], vector_type="k")
i_vec_i = i2v.infer_item_vector(test_items, key=lambda x: x["content"], vector_type="i")

print(t_vec.shape) # == torch.Size([1, 11, 128])
print(i_vec_k.shape) # == torch.Size([1, 128])
print(i_vec_i.shape) # == torch.Size([2, 128])

torch.Size([2, 23, 128])
torch.Size([2, 128])
torch.Size([2, 128])
torch.Size([1, 11, 128])
torch.Size([1, 128])
torch.Size([2, 128])


### 3.2 使用DisenQTokenizer先分词，再用T2V向量化
#### 使用DisenQTokenizer

In [5]:
# 在Bert-base-chinese的基础上初始化tokenizer
tokenizer = DisenQTokenizer(max_length=250, tokenize_method="space")

# 对题目文本进行令牌化
items = [
    "有 公 式 $\\FormFigureID{wrong1?}$ ，如 图 $\\FigureID{088f15ea-xxx}$",
    "已知 圆 $x^{2}+y^{2}-6 x=0$ ，过 点 (1,2) 的 直 线 被 该 圆 所 截 得 的 弦 的 长度 的 最小 值 为"
]
tokenizer.set_vocab(items, silent=False)

# 可以对单个题目进行令牌化
print(tokenizer(items[0]))
print()

# 也可以对题目列表进行令牌化
token_items = tokenizer(items)
print(token_items)
print()

save words(trim_min_count=1): 27/27 = 1.0000 with frequency 31/31=1.0000
{'content_idx': tensor([[20, 10, 14,  4, 28, 11,  3]]), 'content_len': tensor([7])}

{'content_idx': tensor([[20, 10, 14,  4, 28, 11,  3,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
          2,  2,  2,  2,  2,  2],
        [13, 12,  5,  7, 29, 21,  6, 22, 23, 24, 25, 26, 12, 18, 17, 16, 22, 15,
         22, 27, 22, 19,  9,  8]]), 'content_len': tensor([ 7, 24])}



In [6]:
# 可以使用tokenize方法查看令牌化后的文本
print(tokenizer.tokenize(items[0]))
print(tokenizer.tokenize(items))

['有', '公', '式', '$\\FormFigureID{wrong1?}$', '，如', '图', '$\\FigureID{088f15ea-xxx}$']
[['有', '公', '式', '$\\FormFigureID{wrong1?}$', '，如', '图', '$\\FigureID{088f15ea-xxx}$'], ['已知', '圆', '$x^{2}+y^{2}-6', 'x=0$', '，过', '点', '(1,2)', '的', '直', '线', '被', '该', '圆', '所', '截', '得', '的', '弦', '的', '长度', '的', '最小', '值', '为']]


#### 使用T2V加载模型

In [7]:

pretrained_dir = f"{BASE_DIR}/examples/test_model/data/disenq"
t2v = DisenQModel(pretrained_dir)

token_items = tokenizer(items)

# 获得句表征和词表征
t_vec, i_vec_k, i_vec_i = t2v(token_items)
print(i_vec_k.shape, i_vec_i.shape)
print(t_vec.shape)
print()

# 获得指定表征
i_vec_k, i_vec_i = t2v.infer_vector(token_items, vector_type="k")
t_vec = t2v.infer_tokens(token_items)

i_vec_k = t2v.infer_vector(token_items, vector_type="k")
i_vec_i = t2v.infer_vector(token_items, vector_type="i")

torch.Size([2, 128]) torch.Size([2, 128])
torch.Size([2, 24, 128])



### 3.3 使用EduNLP中公开的预训练模型

In [8]:
# 获取公开的预训练模型
pretrained_dir = f"{BASE_DIR}/examples/test_model/data/disenq_pub"
i2v = get_pretrained_i2v("disenq_pub_128", model_dir=pretrained_dir)

EduNLP, INFO model_dir: ..\..\examples\test_model\data\disenq_pub\disenq_pub_128
EduNLP, INFO Use pretrained t2v model disenq_pub_128
downloader, INFO http://base.ustc.edu.cn/data/model_zoo/modelhub/disenq_public/1/disenq_pub_128.zip is saved as ..\..\examples\test_model\data\disenq_pub\disenq_pub_128.zip
downloader, INFO file existed, skipped


In [9]:
test_items = [
    "有 公 式 $\\FormFigureID{wrong1?}$ ，如 图 $\\FigureID{088f15ea-xxx}$",
    "已知 圆 $x^{2}+y^{2}-6 x=0$ ，过 点 (1,2) 的 直 线 被 该 圆 所 截 得 的 弦 的 长度 的 最小 值 为"
]

# 获得句表征和词表征
i_vec, t_vec = i2v(test_items)
print(i_vec[0].shape, i_vec[1].shape)
print(t_vec.shape)
print()

i_vec_k, t_vec = i2v(test_items, vector_type="k")
print(i_vec_k.shape)
print(t_vec.shape)
print()

# 获得指定表征
i_vec_k = i2v.infer_item_vector(test_items, vector_type="k")
i_vec_i = i2v.infer_item_vector(test_items, vector_type="i")
t_vec = i2v.infer_token_vector(test_items)

print(i_vec_k.shape)
print(i_vec_i.shape)
print(t_vec.shape)

torch.Size([2, 128]) torch.Size([2, 128])
torch.Size([2, 24, 128])

torch.Size([2, 128])
torch.Size([2, 24, 128])

torch.Size([2, 128])
torch.Size([2, 128])
torch.Size([2, 24, 128])


In [10]:
test_items2 = [
    "有公式$\\FormFigureID{wrong1?}$，如图$\\FigureID{088f15ea-xxx}$,\
    若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$，则$z=x+7 y$的最大值为$\\SIFBlank$",
    "已知圆$x^{2}+y^{2}-6 x=0$，过点(1,2)的直线被该圆所截得的弦的长度的最小值为"
]

print("The text tokenization method of pretrained i2v: ",i2v.tokenizer.tokenize_method)

# if the test data is note the same formation as train data, you can change tokenzer_method! But it's not recommended.
i2v.tokenizer.set_text_tokenizer("pure_text")
print("Reset the text tokenization method of pretrained i2v: ",i2v.tokenizer.tokenize_method)

i_vec, t_vec = i2v(test_items2)
print(i_vec[0].shape, i_vec[1].shape)
print(t_vec.shape)
print()

The text tokenization method of pretrained i2v:  space
Reset the text tokenization method of pretrained i2v:  pure_text
torch.Size([2, 128]) torch.Size([2, 128])
torch.Size([2, 25, 128])

