In [14]:
import torch
import numpy as np
import os
import json
from EduNLP.Pretrain import train_elmo, ElmoTokenizer
from EduNLP.Vector import ElmoModel, T2V
from EduNLP.I2V import Elmo, get_pretrained_i2v

# 训练自己的Elmo模型
## 1. 数据

In [3]:
# Base directory for the input data and the training output.
# Replace the placeholder with your own path before running.
BASE_DIR = "/your/own/base/path"

# data_dir   : folder that holds OpenLUNA.json
#              (dataset page: http://base.ustc.edu.cn/data/OpenLUNA/)
# output_dir : folder handed to train_elmo and reused later to load the model
data_dir = BASE_DIR + "/static/test_data/OpenLUNA"
output_dir = BASE_DIR + "/examples/test_model/data/elmo"

In [3]:
def raw_data(data_path=None):
    """Load the raw items from a JSON-lines file.

    Args:
        data_path: Path of the file to read, one JSON document per line.
            Defaults to ``OpenLUNA.json`` inside the notebook-level
            ``data_dir``.

    Returns:
        list: The decoded JSON value of every non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if data_path is None:
        data_path = os.path.join(data_dir, "OpenLUNA.json")
    _data = []
    with open(data_path, encoding="utf-8") as f:
        # Iterate the file lazily instead of materialising it with readlines().
        for line in f:
            # A blank line (for example a trailing empty line at the end of the
            # file) is not a record; json.loads would raise on it, so skip it.
            if line.strip():
                _data.append(json.loads(line))
    return _data

def stem_data(data):
    """Tokenize the ``'stem'`` text of every raw item.

    Args:
        data: Iterable of mappings that each provide a ``'stem'`` entry.

    Returns:
        list: The tokenizer result for each item, in input order, leaving out
        items for which the tokenizer returned ``None``.

    Raises:
        AssertionError: If no item produced a result.
    """
    tokenizer = ElmoTokenizer()
    # Lazy generator: items are still tokenized one at a time, in input order.
    tokenized = (
        tokenizer.tokenize(item=entry['stem'], freeze_vocab=False)
        for entry in data
    )
    kept = [tokens for tokens in tokenized if tokens is not None]
    # Training on nothing is pointless, so fail early on an empty result.
    assert kept
    return kept

# Bind the loaded items to their own name. Assigning the result back to
# `raw_data` would replace the loader function with a list, so any later
# call to raw_data() would fail with "'list' object is not callable".
raw_items = raw_data()
train_items = stem_data(raw_items)

## 2. 训练和评估

In [4]:
# Custom training parameters, forwarded to train_elmo as keyword arguments.
train_params = dict(
    emb_dim=128,
    hid_dim=256,
    batch_size=4,
    epochs=1,
    lr=5e-3,
    device=None,
)

# Kept as the last expression of the cell so that its return value is shown as
# the cell output (the recorded run displays a directory path).
train_elmo(train_items, output_dir, **train_params)

[Global step 10, epoch 0, batch 9] Loss: 0.0060805706
[Global step 20, epoch 0, batch 19] Loss: 0.0055406204
[Global step 30, epoch 0, batch 29] Loss: 0.0047021039
[Global step 40, epoch 0, batch 39] Loss: 0.0059773619
[Global step 50, epoch 0, batch 49] Loss: 0.0064559857
[Global step 60, epoch 0, batch 59] Loss: 0.0052063332
[Global step 70, epoch 0, batch 69] Loss: 0.0043544602
[Global step 80, epoch 0, batch 79] Loss: 0.0039286360
[Global step 90, epoch 0, batch 89] Loss: 0.0038324460
[Global step 100, epoch 0, batch 99] Loss: 0.0038334341
[Global step 110, epoch 0, batch 109] Loss: 0.0034901381
[Global step 120, epoch 0, batch 119] Loss: 0.0048992285
[Global step 130, epoch 0, batch 129] Loss: 0.0036445243
[Global step 140, epoch 0, batch 139] Loss: 0.0032575284
[Global step 150, epoch 0, batch 149] Loss: 0.0035707197
[Global step 160, epoch 0, batch 159] Loss: 0.0036381901
[Global step 170, epoch 0, batch 169] Loss: 0.0034310359
[Global step 180, epoch 0, batch 179] Loss: 0.00400

'/Users/lipingzhi/Desktop/EduNLP/examples/test_model/data/elmo'


## 3. 使用模型

### 3.1 使用训练好的Elmo模型

In [6]:
# Two sample questions; only their 'stem' text is used below.
item = [
        {'stem': '如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$, \
        若$x,y$满足约束条件$\\SIFSep$，则$z=x+7 y$的最大值为$\\SIFBlank$'},
        {'stem': '已知圆$x^{2}+y^{2}-6 x=0$，过点(1,2)的直线被该圆所截得的弦的长度的最小值为'}
]

# Build the I2V wrapper from output_dir and point its tokenizer at the
# vocab.json file stored in the same directory.
vocab_path = os.path.join(output_dir, "vocab.json")
tokenizer_kwargs = {"path": vocab_path}
i2v = Elmo('elmo', 'elmo', output_dir, tokenizer_kwargs=tokenizer_kwargs)

first_stem, second_stem = (entry['stem'] for entry in item)

# A single question can be vectorized on its own.
# Recorded run: torch.Size([512]) for the item, torch.Size([15, 512]) for its tokens.
i_vec, t_vec = i2v(first_stem)
for vec in (i_vec, t_vec):
    print(vec.shape)

# A list of questions is vectorized as a batch.
# Recorded run: torch.Size([2, 512]) and torch.Size([2, 25, 512]).
i_vec, t_vec = i2v([first_stem, second_stem])
for vec in (i_vec, t_vec):
    print(vec.shape)


torch.Size([512])
torch.Size([15, 512])
torch.Size([2, 512])
torch.Size([2, 25, 512])


### 3.2 使用Elmo Tokenizer

In [8]:
# Load the tokenizer of the previously trained model from its vocab.json file.
tokenizer = ElmoTokenizer()
vocab_path = os.path.join(output_dir, "vocab.json")
tokenizer.load_vocab(vocab_path)

# Question texts to tokenize.
items = [
    "有公式$\\FormFigureID{wrong1?}$，如图$\\FigureID{088f15ea-xxx}$,\
    若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$，则$z=x+7 y$的最大值为$\\SIFBlank$",
    "已知圆$x^{2}+y^{2}-6 x=0$，过点(1,2)的直线被该圆所截得的弦的长度的最小值为"
]

# Tokenize a single question first, then the whole list. Each result is
# followed by one empty line, exactly like print(result) + print().
for target in (items[0], items):
    print(tokenizer(target, freeze_vocab=True), end="\n\n")



([527, 231, 3, 13, 26, 79, 159, 527, 6, 33, 10, 13, 34, 133, 79, 168, 4], 17)

([[527, 231, 3, 13, 26, 79, 159, 527, 6, 33, 10, 13, 34, 133, 79, 168, 4], [7, 104, 13, 15, 16, 17, 18, 34, 79, 15, 16, 17, 18, 19, 105, 13, 10, 23, 106, 107, 104, 108, 109, 110, 111]], [17, 25])



In [9]:
# tokenize() shows the tokenized text itself (the recorded output lists token
# strings): once for a single question, once for the whole list.
for target in (items[0], items):
    print(tokenizer.tokenize(target, freeze_vocab=True))

['公式', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[SEP]', 'z', '=', 'x', '+', '7', 'y', '最大值', '[MARK]']
[['公式', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[SEP]', 'z', '=', 'x', '+', '7', 'y', '最大值', '[MARK]'], ['已知', '圆', 'x', '^', '{', '2', '}', '+', 'y', '^', '{', '2', '}', '-', '6', 'x', '=', '0', '过点', '直线', '圆', '截得', '弦', '长度', '最小值']]


### 3.3 使用EduNLP中公开的预训练模型

In [10]:
# Fetch a publicly released pretrained model.
# model_dir is where the model files are stored: in the recorded run the archive
# elmo_pub_math.zip was downloaded into this directory and unzipped there.
# NOTE: this is the same path as output_dir, so the pretrained model ends up
# next to the model trained earlier in this notebook.
pretrained_dir = f"{BASE_DIR}/examples/test_model/data/elmo"
i2v = get_pretrained_i2v("elmo_pub_math", model_dir=pretrained_dir)

EduNLP, INFO model_path: /Users/lipingzhi/Desktop/EduNLP/examples/test_model/data/elmo/elmo_pub_math
EduNLP, INFO Use pretrained t2v model elmo_pub_math
downloader, INFO http://base.ustc.edu.cn/data/model_zoo/modelhub/elmo_pub/1/elmo_pub_math.zip is saved as /Users/lipingzhi/Desktop/EduNLP/examples/test_model/data/elmo/elmo_pub_math.zip


Downloading /Users/lipingzhi/Desktop/EduNLP/examples/test_model/data/elmo/elmo_pub_math.zip 100.00%: 791MB | 791MB


downloader, INFO /Users/lipingzhi/Desktop/EduNLP/examples/test_model/data/elmo/elmo_pub_math.zip is unzip to /Users/lipingzhi/Desktop/EduNLP/examples/test_model/data/elmo/elmo_pub_math


In [13]:
# Question texts to vectorize with the pretrained model.
items = [
    "有公式$\\FormFigureID{wrong1?}$，如图$\\FigureID{088f15ea-xxx}$,\
    若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$，则$z=x+7 y$的最大值为$\\SIFBlank$",
    "已知圆$x^{2}+y^{2}-6 x=0$，过点(1,2)的直线被该圆所截得的弦的长度的最小值为"
]

# Vectorize the whole list in one call: item-level and token-level vectors.
i_vec, t_vec = i2v(items)
print(i_vec.shape, t_vec.shape, sep="\n", end="\n\n")

# The item vectors and the per-token vectors can also be requested separately.
i_vec = i2v.infer_item_vector(items)
print(i_vec.shape)
t_vec = i2v.infer_token_vector(items)
print(t_vec.shape, end="\n\n")

# Likewise, a single question can be vectorized on its own.
i_vec, t_vec = i2v(items[0])
print(i_vec.shape, t_vec.shape, sep="\n")

torch.Size([2, 2048])
torch.Size([2, 25, 2048])

torch.Size([2, 2048])
torch.Size([2, 25, 2048])

torch.Size([2048])
torch.Size([17, 2048])
