In [1]:
import torch
import numpy as np
import os
from EduNLP.Pretrain import DisenQTokenizer, train_disenqnet
from EduNLP.Vector import DisenQModel, T2V
from EduNLP.I2V import DisenQ
from EduNLP.ModelZoo import load_items

# Restrict the CUDA devices visible to this process to indices 0 and 1.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

# Query the visible GPU count once and report it only in the multi-GPU case.
gpu_count = torch.cuda.device_count()
if gpu_count > 1:
    print("Let's use", gpu_count, "GPUs!")



# 训练自己的disenQNet模型
## 1. 数据

In [2]:
# Relative path; it is resolved against the kernel's working directory.
BASE_DIR = "../.."

# Directory handed to training/inference below, and the directory the
# sample train/test JSON files are read from.
output_dir = f"{BASE_DIR}/examples/test_model/data/disenq"
data_dir = f"{BASE_DIR}/tests/test_vec/test_data"

# Load both splits with EduNLP's `load_items`.
disen_data_test = load_items(f"{data_dir}/disenq_test.json")
disen_data_train = load_items(f"{data_dir}/disenq_train.json")

## 2. 训练和评估

In [3]:
# Tokenizer for the DisenQNet pipeline: items are split on spaces and
# configured with max_length=250.
tokenizer = DisenQTokenizer(max_length=250, tokenize_method="space")

# Hyper-parameters forwarded verbatim to `train_disenqnet`.
# NOTE(review): the exact meaning of each key ("cp", "mi", "dis", ...) is
# defined by EduNLP's trainer, not by this notebook — confirm against its docs.
train_params = {
    "epoch": 5,
    "batch": 64,
    "lr": 1e-3,
    "step": 20,
    "trim_min": 2,

    "gamma": 0.5,
    "warm_up": 1,
    "adv": 10,
    "hidden": 128,
    "dropout": 0.2,
    "pos_weight": 1,
    "cp": 1.5,
    "mi": 1.0,
    "dis": 2.0,

    "w2v_workers": 1,
    # Fall back to CPU so the notebook still runs on machines without CUDA.
    "device": "cuda" if torch.cuda.is_available() else "cpu",
}

# NOTE(review): this mapping is defined but never passed to `train_disenqnet`
# below; it is kept so the kernel namespace is unchanged. Confirm whether the
# trainer should receive it.
data_formation = {
    "content": "content",
    "knowledge": "knowledge"
}

# `output_dir` is intentionally passed twice, as the third and fourth
# positional arguments.
# NOTE(review): presumably the model output dir and the preprocessed-data dir —
# verify against the `train_disenqnet` signature.
train_disenqnet(
    disen_data_train,
    tokenizer,
    output_dir,
    output_dir,
    train_params=train_params,
    test_items=disen_data_test,
)

load vocab from ../../examples/test_model/data/disenq\vocab.list
load concept from ../../examples/test_model/data/disenq\concept.list
load word2vec from ../../examples/test_model/data/disenq\wv.th
processing raw data for QuestionDataset...
vocab size: 6827
concept size: 5
load vocab from ../../examples/test_model/data/disenq\vocab.list
load concept from ../../examples/test_model/data/disenq\concept.list
load word2vec from ../../examples/test_model/data/disenq\wv.th
processing raw data for QuestionDataset...
Start training the disenQNet...
[Epoch  1] train loss: 1.5506
[Epoch  2] train loss: 1.5952, eval loss: 1.6069
[Epoch  3] train loss: 1.4762, eval loss: 1.4880
[Epoch  4] train loss: 1.4437, eval loss: 1.4640
[Epoch  5] train loss: 1.3889, eval loss: 1.4287
[Epoch  6] train loss: 1.3368, eval loss: 1.3825


## 3. 使用模型

In [5]:
# Point the tokenizer at the directory passed to training as `output_dir`.
tokenizer_kwargs = {
    "tokenizer_config_dir": output_dir,
}

# Fall back to CPU so inference still works on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
i2v = DisenQ('disenq', 'disenq', output_dir, tokenizer_kwargs=tokenizer_kwargs, device=device)

test_items = [
    {"content": "10 米 的 (2/5) = 多少 米 的 (1/2),有 公 式"},
    {"content": "10 米 的 (2/5) = 多少 米 的 (1/2),有 公 式 , 如 图 , 若 $x,y$ 满 足 约 束 条 件 公 式"},
]


def content_of(item):
    """Return the text stored under an item's "content" key."""
    return item["content"]


# Batch inference: token-level and item-level vectors for both items.
t_vec = i2v.infer_token_vector(test_items, key=content_of)
i_vec = i2v.infer_item_vector(test_items, key=content_of, vector_type="k")

print(i_vec.shape) # == torch.Size([2, 128])
print(t_vec.shape) # == torch.Size([2, 23, 128])

# Single-item inference: a lone dict is accepted and comes back with a
# leading dimension of 1.
t_vec = i2v.infer_token_vector(test_items[0], key=content_of)
i_vec = i2v.infer_item_vector(test_items[0], key=content_of, vector_type="k")

print(i_vec.shape) # == torch.Size([1, 128])
print(t_vec.shape) # == torch.Size([1, 11, 128])

torch.Size([2, 128])
torch.Size([2, 23, 128])
torch.Size([1, 128])
torch.Size([1, 11, 128])
