In [5]:
import torch
import numpy as np
import os
import json
from EduNLP.ModelZoo.rnn import ElmoLM
from EduNLP.Pretrain import train_elmo, ElmoTokenizer
from EduNLP.Vector import ElmoModel, T2V
from EduNLP.I2V import Elmo, get_pretrained_i2v

# Presumably switches off the Weights & Biases logging integration for the training run below.
# NOTE(review): the recorded trainer log reports this environment variable as deprecated
# and recommends the `report_to` training argument instead — confirm before upgrading transformers.
os.environ["WANDB_DISABLED"] = "true"

# 训练自己的Elmo模型
## 1. 数据

In [6]:
# Set your data path and output path.
# Both directories are derived from BASE_DIR, so only BASE_DIR needs editing
# when the notebook is run from a different location.
BASE_DIR = "../.."

data_dir = BASE_DIR + "/static/test_data"
output_dir = BASE_DIR + "/data/pretrain_test_models/elmo/"

In [7]:
def stem_data(data_path=None):
    """Load the training items from a JSON-lines file.

    Args:
        data_path: Path of the file to read, one JSON document per line. When
            omitted, ``standard_luna_data.json`` inside the notebook-level
            ``data_dir`` is used, which is what the no-argument call has
            always read.

    Returns:
        list: The decoded JSON value of every non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if data_path is None:
        data_path = os.path.join(data_dir, "standard_luna_data.json")
    with open(data_path, encoding="utf-8") as f:
        # Iterate over the file object instead of materialising every line with
        # readlines(), and skip blank lines (e.g. an empty line at the end of
        # the file), which json.loads would otherwise reject.
        return [json.loads(line) for line in f if line.strip()]

# Load the training items once; they are handed to train_elmo in the training cell.
train_items = stem_data()

## 2. 训练和评估

In [8]:
# Custom training parameters.
# NOTE(review): an earlier parameter set (emb_dim=128, hid_dim=256, batch_size=4,
# epochs=1, lr=5e-3, device=None) was left commented out in this cell; confirm
# whether train_elmo still accepts those names before reviving them.
train_params = dict(
    num_train_epochs=1,
    per_device_train_batch_size=8,
    save_steps=50,
    save_total_limit=2,
    logging_steps=5,
    gradient_accumulation_steps=1,
    learning_rate=5e-4,
)

# Train on train_items using train_params; the recorded log shows the checkpoint
# and config being saved under output_dir.
# The call is the cell's last expression, so its return value is displayed.
train_elmo(train_items, output_dir, train_params=train_params)

  0%|          | 0/1 [00:00<?, ?ba/s]

Model config PretrainedConfig {
  "architecture": "ElmoLM",
  "batch_first": true,
  "dropout_rate": 0.5,
  "embedding_dim": 300,
  "hidden_size": 300,
  "num_layers": 2,
  "transformers_version": "4.18.0",
  "use_pack_pad": false,
  "vocab_size": 305
}

Model config PretrainedConfig {
  "architecture": "ElmoLMForPreTraining",
  "batch_first": true,
  "dropout_rate": 0.5,
  "embedding_dim": 300,
  "hidden_size": 300,
  "transformers_version": "4.18.0",
  "use_pack_pad": false,
  "vocab_size": 305
}

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
*

Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to ../../data/pretrain_test_models/elmo/
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Configuration saved in ../../data/pretrain_test_models/elmo/config.json


'../../data/pretrain_test_models/elmo/'


## 3. 使用模型

In [9]:
# Two sample items used by the inference examples below; the question text of
# each item is stored under the 'ques_content' key.
# NOTE: the backslash-newline continuations sit inside the string literals, so
# the leading spaces of each continued line are part of the string's value.
test_items = [
    {'ques_content': '有公式$\\FormFigureID{wrong1?}$和公式$\\FormFigureBase64{wrong2?}$，\
            如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,\
            若$x,y$满足约束条件$\\SIFSep$，则$z=x+7 y$的最大值为$\\SIFBlank$'},
    {'ques_content': '如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$, \
            若$x,y$满足约束条件$\\SIFSep$，则$z=x+7 y$的最大值为$\\SIFBlank$'}
]

### 3.1 直接加载令牌容器和模型

In [10]:
# The model trained above was saved to output_dir; later cells refer to that
# location as pretrained_model_dir.
pretrained_model_dir = output_dir

# Load the model and the tokenizer from the saved directory.
model = ElmoLM.from_pretrained(pretrained_model_dir)
tokenizer = ElmoTokenizer.from_pretrained(pretrained_model_dir)

# Pass the text extractor by keyword, matching the other tokenizer call in this
# notebook, so the call does not depend on the positional parameter order.
encodes = tokenizer(test_items, key=lambda x: x['ques_content'])

# Inference only: eval() disables dropout (the saved config uses
# dropout_rate 0.5) and no_grad() skips autograd bookkeeping.
model.eval()
with torch.no_grad():
    outputs = model(**encodes)

# Last expression of the cell, so the model output is displayed.
outputs

Model config PretrainedConfig {
  "architecture": "ElmoLM",
  "batch_first": true,
  "dropout_rate": 0.5,
  "embedding_dim": 300,
  "hidden_size": 300,
  "num_layers": 2,
  "transformers_version": "4.18.0",
  "use_pack_pad": false,
  "vocab_size": 305
}

[EduNLP, INFO] All the weights of ElmoLM were initialized from the model checkpoint at ../../data/pretrain_test_models/elmo/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use ElmoLM for predictions without further training.


ElmoLMOutput([('pred_forward',
               tensor([[[0.0033, 0.0032, 0.0034,  ..., 0.0033, 0.0031, 0.0035],
                        [0.0032, 0.0032, 0.0035,  ..., 0.0033, 0.0031, 0.0033],
                        [0.0035, 0.0031, 0.0034,  ..., 0.0032, 0.0033, 0.0033],
                        ...,
                        [0.0032, 0.0032, 0.0036,  ..., 0.0030, 0.0031, 0.0033],
                        [0.0034, 0.0032, 0.0035,  ..., 0.0032, 0.0031, 0.0035],
                        [0.0034, 0.0031, 0.0032,  ..., 0.0033, 0.0031, 0.0033]],
               
                       [[0.0034, 0.0030, 0.0034,  ..., 0.0033, 0.0032, 0.0034],
                        [0.0035, 0.0031, 0.0037,  ..., 0.0031, 0.0031, 0.0035],
                        [0.0035, 0.0030, 0.0034,  ..., 0.0031, 0.0033, 0.0035],
                        ...,
                        [0.0032, 0.0032, 0.0032,  ..., 0.0032, 0.0032, 0.0034],
                        [0.0034, 0.0030, 0.0033,  ..., 0.0033, 0.0030, 0.0033],
              

### 3.2 使用I2V向量化

In [12]:
# Build the Elmo I2V object; the tokenizer configuration directory is set to
# pretrained_model_dir, which is the same location as output_dir.
tokenizer_kwargs = {"tokenizer_config_dir": pretrained_model_dir}
i2v = Elmo('elmo', 'elmo', output_dir, tokenizer_kwargs=tokenizer_kwargs)

# A single item can be vectorized (the result keeps a leading batch axis of 1)
i_vec, t_vec = i2v(test_items[0], key=lambda x: x["ques_content"])
print(i_vec.shape) # item vector: torch.Size([1, 600]) in the recorded run
print(t_vec.shape) # token vectors: torch.Size([1, 15, 600]) in the recorded run

# A list of items can be vectorized as well
i_vec, t_vec = i2v(test_items, key=lambda x: x["ques_content"])
print(i_vec.shape) # item vectors: torch.Size([2, 600]) in the recorded run
print(t_vec.shape) # token vectors: torch.Size([2, 25, 600]) in the recorded run


Model config PretrainedConfig {
  "architecture": "ElmoLM",
  "batch_first": true,
  "dropout_rate": 0.5,
  "embedding_dim": 300,
  "hidden_size": 300,
  "num_layers": 2,
  "transformers_version": "4.18.0",
  "use_pack_pad": false,
  "vocab_size": 305
}

[EduNLP, INFO] All the weights of ElmoLM were initialized from the model checkpoint at ../../data/pretrain_test_models/elmo/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use ElmoLM for predictions without further training.
  (outputs.forward_output[torch.arange(len(items["seq_len"])), torch.tensor(items["seq_len"]) - 1],


torch.Size([1, 600])
torch.Size([1, 15, 600])
torch.Size([2, 600])
torch.Size([2, 25, 600])


### 3.3 使用Tokenizer和T2V向量化

In [14]:
# Load the tokenizer of the previously trained model and encode the test items
tokenizer = ElmoTokenizer.from_pretrained(pretrained_model_dir)
encodes = tokenizer(test_items, key=lambda x: x['ques_content'])

# Build the T2V model from the same saved directory
t2v = ElmoModel(pretrained_model_dir)

# Calling t2v directly printed the same shape as infer_vector in the recorded run
i_vec = t2v(encodes)
print(i_vec.shape) # torch.Size([2, 600]) in the recorded run
print()

# Item-level and token-level vectors can be requested separately
i_vec = t2v.infer_vector(encodes)
t_vec = t2v.infer_tokens(encodes)
print(i_vec.shape) # torch.Size([2, 600]) in the recorded run
print(t_vec.shape) # torch.Size([2, 25, 600]) in the recorded run
print()

Model config PretrainedConfig {
  "architecture": "ElmoLM",
  "batch_first": true,
  "dropout_rate": 0.5,
  "embedding_dim": 300,
  "hidden_size": 300,
  "num_layers": 2,
  "transformers_version": "4.18.0",
  "use_pack_pad": false,
  "vocab_size": 305
}

[EduNLP, INFO] All the weights of ElmoLM were initialized from the model checkpoint at ../../data/pretrain_test_models/elmo/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use ElmoLM for predictions without further training.
  (outputs.forward_output[torch.arange(len(items["seq_len"])), torch.tensor(items["seq_len"]) - 1],


torch.Size([2, 600])

torch.Size([2, 600])
torch.Size([2, 25, 600])



### 3.4 使用EduNLP中公开的预训练模型

In [None]:
# Get a publicly released pretrained model.
# This rebinds i2v, replacing the object built from the locally trained model.
# NOTE(review): this cell has no recorded execution (In [None]); confirm that
# "elmo_test" is still a valid model name and that pretrained_dir is the intended location.
pretrained_dir = f"{BASE_DIR}/examples/test_model/elmo"
i2v = get_pretrained_i2v("elmo_test", model_dir=pretrained_dir)

In [None]:
# Vectorize the whole list. Each test item is a dict with its text under
# 'ques_content', so the key extractor is passed here just as in every other
# i2v call on these items.
i_vec, t_vec = i2v(test_items, key=lambda x: x['ques_content'])
print(i_vec.shape)
print(t_vec.shape)
print()

# Item vectors and per-token vectors can also be requested separately
i_vec = i2v.infer_item_vector(test_items, key=lambda x: x['ques_content'])
print(i_vec.shape)
t_vec = i2v.infer_token_vector(test_items, key=lambda x: x['ques_content'])
print(t_vec.shape)
print()

# Likewise, a single item can be vectorized
i_vec, t_vec = i2v(test_items[0], key=lambda x: x['ques_content'])
print(i_vec.shape)
print(t_vec.shape)