In [1]:
import torch
import numpy as np
import os
import json
import codecs
from EduNLP.Pretrain import QuesNetTokenizer, pretrain_QuesNet
from EduNLP.Vector import T2V
from EduNLP.I2V import QuesNet, get_pretrained_i2v



# 训练自己的QuesNet模型
## 1. 数据

In [2]:
# Set your data path and output path.
# BASE_DIR is presumably the root of a local EduNLP checkout (it is only used
# below as the prefix of tests/... and examples/... paths).
# Export EDUNLP_BASE_DIR to run the notebook on another machine; when the
# variable is unset the original author's path is used, so behaviour is unchanged.
BASE_DIR = os.environ.get(
    "EDUNLP_BASE_DIR",
    "/Users/yutingning/Desktop/Project/LUNA/EduNLP",
)

# Sample data read by the later cells, and the directory the tokenizer/model
# artifacts are written to.
data_dir = f"{BASE_DIR}/tests/test_vec/test_data"
output_dir = f"{BASE_DIR}/examples/test_model/data/quesnet"

In [3]:
def load_raw_data(data_path=None):
    """Load the QuesNet sample items from a JSON-lines file.

    Parameters
    ----------
    data_path : str, optional
        Path to a UTF-8 file holding one JSON document per line. When omitted,
        ``quesnet_data.json`` inside ``data_dir`` is read.

    Returns
    -------
    list
        The decoded JSON value of every non-blank line, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    if data_path is None:
        data_path = os.path.join(data_dir, "quesnet_data.json")
    with open(data_path, encoding="utf-8") as f:
        # Iterate lazily and skip blank lines (e.g. a trailing empty line),
        # which json.loads would otherwise reject.
        return [json.loads(line) for line in f if line.strip()]


# The loader has its own name so that this cell can be re-run: binding the
# loaded list to the function's own name would leave nothing callable behind.
raw_data = load_raw_data()

## 2. 训练Tokenizer

In [4]:
# Create the tokenizer, pointing it at the sample image directory.
quesnet_img_dir = os.path.join(data_dir, "quesnet_img")
tokenizer = QuesNetTokenizer(
    meta=['know_name'],
    max_length=50,
    img_dir=quesnet_img_dir,
)

# Build the vocabulary from the 'ques_content' field of every loaded item.
tokenizer.set_vocab(
    raw_data,
    key=lambda item: item['ques_content'],
    trim_min_count=3,
    silent=False,
)

print("vocab_size: ", tokenizer.vocab_size)
print()


save words(3): 79/79 = 1.0000                  with frequency 10416/10416=1.0000
save meta information know_name: 12
vocab_size:  82



In [5]:
# Save the tokenizer into output_dir so that it can be reloaded later
# with QuesNetTokenizer.from_pretrained(output_dir, ...).
tokenizer.save_pretrained(output_dir)

## 3. 训练QuesNet

In [6]:
# Custom training parameters
train_params = dict(
    feat_size=256,
    save_every=10,
    emb_size=256,
    batch_size=16,
)

# Pretrain on the same JSON-lines file the raw items were loaded from; the
# model artifacts go to output_dir.
# NOTE(review): the meaning of the positional True is not visible here —
# confirm it against the pretrain_QuesNet signature.
quesnet_data_path = os.path.join(data_dir, 'quesnet_data.json')
pretrain_QuesNet(quesnet_data_path, output_dir, tokenizer, True, train_params)

335it [00:01, 170.71it/s]                         
EduNLP, INFO QuesNet Word Embedding loaded
0it [00:00, ?it/s]EduNLP, INFO [Epoch0][Batch0]Training image Embedding layer, loss:0.33119991421699524
6it [00:00, 58.39it/s]EduNLP, INFO [Epoch0][Batch10]Training image Embedding layer, loss:0.2898443043231964
11it [00:00, 64.49it/s]
EduNLP, INFO QuesNet Image Embedding loaded
  return torch.tensor(self.data[idx])
EduNLP, INFO [Epoch0][Batch0]Training meta Embedding layer, loss:0.6941184401512146
EduNLP, INFO [Epoch0][Batch10]Training meta Embedding layer, loss:0.46297648549079895
EduNLP, INFO [Epoch0][Batch20]Training meta Embedding layer, loss:0.3208733797073364
21it [00:00, 450.85it/s]
EduNLP, INFO QuesNet Meta Embedding loaded
EduNLP, INFO QuesNet Word, Image and Meta Embeddings training is done
  0%|          | 0/21 [00:00<?, ?it/s]EduNLP, INFO 0.0---loss: 12.606732368469238
 48%|████▊     | 10/21 [00:14<00:15,  1.40s/it]EduNLP, INFO 0.10---loss: 9.26207160949707
 95%|█████████▌| 20/21 

## 4. 使用模型

### 4.1 使用训练好的QuesNet Tokenizer

In [7]:
# Reload the tokenizer that was saved to output_dir
saved_img_dir = os.path.join(data_dir, "quesnet_img")
tokenizer = QuesNetTokenizer.from_pretrained(
    output_dir,
    img_dir=saved_img_dir,
)

In [8]:
# Tokenize
# A single item can be processed ...
single_item_tokens = tokenizer.tokenize(
    raw_data[0], key=lambda item: item['ques_content']
)
print(single_item_tokens)
print()
# ... and so can a list of items
batch_tokens = tokenizer.tokenize(
    raw_data[:5], key=lambda item: item['ques_content']
)
print(batch_tokens)

print()

# Convert tokens to indices by calling the tokenizer itself
single_item_encoded = tokenizer(
    raw_data[0],
    key=lambda item: item['ques_content'],
    return_text=True,
    padding=True,
)
print(single_item_encoded)
print()
batch_encoded = tokenizer(
    raw_data[:3],
    key=lambda item: item['ques_content'],
    padding=True,
)
print(batch_encoded)

['埃及', '胡夫', '金字塔', '古代', '世界', '建筑', '奇迹', '形状', '视为', '正四', '棱锥', '以该', '四', '棱锥', '高为', '边长', '正方形', '面积', '等于', '四', '棱锥', '侧面', '三角形', '面积', '侧面', '三角形', '底边', '高', '底面', '正方形', '边长', '比值', \FigureID{73d66b18-33a9-11ec-a11a-98fa9b625adb}]

[['埃及', '胡夫', '金字塔', '古代', '世界', '建筑', '奇迹', '形状', '视为', '正四', '棱锥', '以该', '四', '棱锥', '高为', '边长', '正方形', '面积', '等于', '四', '棱锥', '侧面', '三角形', '面积', '侧面', '三角形', '底边', '高', '底面', '正方形', '边长', '比值', \FigureID{73d66b18-33a9-11ec-a11a-98fa9b625adb}], ['某校', '课外', '学习', '小组', '研究', '作物', '发芽率', 'y', '温度', 'x', '单位', '^', '{', '\\circ', '}', '\\mathrm', '{', 'C', '}', '关系', '20', '温度', '条件', '种子', '发芽', '实验', '实验', '数据', '\\left', '(', 'x', '_', '{', 'i', '}', ',', 'y', '_', '{', 'i', '}', '\\right', ')', '(', 'i', '=', '1', ',', '2', ','], ['设', '函数', 'f', '(', 'x', ')', '=', '\\cos', '\\left', '(', '\\omega', 'x', '+', '\\frac', '{', '\\pi', '}', '{', '6', '}', '\\right', ')', '[', '-', '\\pi', ',', '\\pi', ']', '图像', '图', \FigureID{000004d6-0479-11e

### 4.2 使用训练好的QuesNet模型

In [9]:
# Build the QuesNet I2V wrapper from the artifacts in output_dir; the tokenizer
# configuration is read from the same directory. Inference runs on the CPU.
tokenizer_kwargs = dict(tokenizer_config_dir=output_dir)
i2v = QuesNet(
    'quesnet',
    'quesnet',
    output_dir,
    tokenizer_kwargs=tokenizer_kwargs,
    device="cpu",
)

In [11]:
# Representation of a single item: calling i2v yields the item vector and the
# token vectors together.
i_vec, t_vec = i2v(raw_data[0], key=lambda item: item["ques_content"])
for vec in (i_vec, t_vec):
    print(vec.shape)
print()

# The item representation and the per-token representations can also be
# requested separately.
t_vec = i2v.infer_token_vector(raw_data[0], key=lambda item: item["ques_content"])
i_vec = i2v.infer_item_vector(raw_data[0], key=lambda item: item["ques_content"])
for vec in (t_vec, i_vec):
    print(vec.shape)
print()

# Representations for a list of items
first_two_items = raw_data[:2]
t_vec = i2v.infer_token_vector(first_two_items, key=lambda item: item["ques_content"])
i_vec = i2v.infer_item_vector(first_two_items, key=lambda item: item["ques_content"])
for vec in (t_vec, i_vec):
    print(vec.shape)

torch.Size([1, 256])
torch.Size([1, 34, 256])

torch.Size([1, 34, 256])
torch.Size([1, 256])

torch.Size([2, 51, 256])
torch.Size([2, 256])


### 4.3 使用EduNLP中公开的预训练模型

In [17]:
# Fetch a publicly released pretrained model.
# The recorded output of this cell shows the "quesnet_test" archive being
# downloaded into pretrained_dir and unzipped there.
pretrained_dir = f"{BASE_DIR}/examples/test_model/data/quesnet/quesnet_test"
i2v = get_pretrained_i2v("quesnet_test", model_dir=pretrained_dir)

EduNLP, INFO model_path: /Users/yutingning/Desktop/Project/LUNA/EduNLP/examples/test_model/data/quesnet/quesnet_test/quesnet_test
EduNLP, INFO Use pretrained t2v model quesnet_test
downloader, INFO http://base.ustc.edu.cn/data/model_zoo/modelhub/quesnet_pub_256/1/quesnet_test.zip is saved as /Users/yutingning/Desktop/Project/LUNA/EduNLP/examples/test_model/data/quesnet/quesnet_test/quesnet_test.zip


Downloading /Users/yutingning/Desktop/Project/LUNA/EduNLP/examples/test_model/data/quesnet/quesnet_test/quesnet_test.zip 100.00%: 13.9MB | 13.9MB


downloader, INFO /Users/yutingning/Desktop/Project/LUNA/EduNLP/examples/test_model/data/quesnet/quesnet_test/quesnet_test.zip is unzip to /Users/yutingning/Desktop/Project/LUNA/EduNLP/examples/test_model/data/quesnet/quesnet_test/quesnet_test


In [18]:
# Usage is the same as in section 4.2

# Representation of a single item: calling i2v yields the item vector and the
# token vectors together.
first_item = raw_data[0]
i_vec, t_vec = i2v(first_item, key=lambda item: item["ques_content"])
for vec in (i_vec, t_vec):
    print(vec.shape)
print()

# The item representation and the per-token representations can also be
# requested separately.
t_vec = i2v.infer_token_vector(first_item, key=lambda item: item["ques_content"])
i_vec = i2v.infer_item_vector(first_item, key=lambda item: item["ques_content"])
for vec in (t_vec, i_vec):
    print(vec.shape)
print()

# Representations for a list of items
t_vec = i2v.infer_token_vector(raw_data[:2], key=lambda item: item["ques_content"])
i_vec = i2v.infer_item_vector(raw_data[:2], key=lambda item: item["ques_content"])
for vec in (t_vec, i_vec):
    print(vec.shape)

torch.Size([1, 256])
torch.Size([1, 34, 256])

torch.Size([1, 34, 256])
torch.Size([1, 256])

torch.Size([2, 51, 256])
torch.Size([2, 256])
