2 changes: 1 addition & 1 deletion EduNLP/I2V/__init__.py
@@ -2,4 +2,4 @@
# 2021/8/1 @ tongshiwei

from .i2v import I2V, get_pretrained_i2v
from .i2v import D2V, W2V
from .i2v import D2V, W2V, Bert
76 changes: 74 additions & 2 deletions EduNLP/I2V/i2v.py
@@ -4,10 +4,13 @@
import json
from EduNLP.constant import MODEL_DIR
from ..Vector import T2V, get_pretrained_t2v as get_t2v_pretrained_model
from ..Vector import PRETRAINED_MODELS
from longling import path_append
from ..Tokenizer import Tokenizer, get_tokenizer
from EduNLP.Pretrain import BertTokenizer
from EduNLP import logger

__all__ = ["I2V", "D2V", "W2V", "get_pretrained_i2v"]
__all__ = ["I2V", "D2V", "W2V", "Bert", "get_pretrained_i2v"]


class I2V(object):
@@ -34,6 +37,7 @@ class I2V(object):
kwargs:
the parameters passed to t2v

def __init__(self, tokenizer, t2v, *args, tokenizer_kwargs: dict = None, pretrained_t2v=False, **kwargs):
Examples
--------
>>> item = {"如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, \
@@ -49,12 +53,16 @@ class I2V(object):
i2v model: I2V
"""
def __init__(self, tokenizer, t2v, *args, tokenizer_kwargs: dict = None, pretrained_t2v=False, **kwargs):
self.tokenizer: Tokenizer = get_tokenizer(tokenizer, **tokenizer_kwargs if tokenizer_kwargs is not None else {})
if pretrained_t2v:
logger.info("Use pretrained t2v model %s" % t2v)
self.t2v = get_t2v_pretrained_model(t2v, kwargs.get("model_dir", MODEL_DIR))
else:
self.t2v = T2V(t2v, *args, **kwargs)
if tokenizer == 'bert':
self.tokenizer = BertTokenizer(**tokenizer_kwargs if tokenizer_kwargs is not None else {})
else:
self.tokenizer: Tokenizer = get_tokenizer(tokenizer, **tokenizer_kwargs
if tokenizer_kwargs is not None else {})
self.params = {
"tokenizer": tokenizer,
"tokenizer_kwargs": tokenizer_kwargs,
@@ -246,6 +254,69 @@ def from_pretrained(cls, name, model_dir=MODEL_DIR, *args, **kwargs):
return cls("pure_text", name, pretrained_t2v=True, model_dir=model_dir)


class Bert(I2V):
"""
The model transforms items and tokens into vectors with Bert.

Bases
-------
I2V

Parameters
-----------
tokenizer: str
the tokenizer name
t2v: str
the name of token2vector model
args:
the parameters passed to t2v
tokenizer_kwargs: dict
the parameters passed to tokenizer
pretrained_t2v: bool
True: use pretrained t2v model
False: use your own t2v model
kwargs:
the parameters passed to t2v

Returns
-------
i2v model: Bert
"""
def infer_vector(self, items, tokenize=True, return_tensors='pt', *args, **kwargs) -> tuple:
'''
Convert items to vectors. Before using this function, the model must be loaded.

Parameters
-----------
items: str or list
the text of the question(s)
tokenize: bool
True: tokenize the item
return_tensors: str
tensor type used in the tokenizer
args:
the parameters passed to t2v
kwargs:
the parameters passed to t2v

Returns
--------
vector: tuple
'''
inputs = self.tokenize(items, return_tensors=return_tensors) if tokenize is True else items
return self.t2v(inputs, *args, **kwargs), self.t2v.infer_tokens(inputs, *args, **kwargs)

@classmethod
def from_pretrained(cls, name, model_dir=MODEL_DIR, *args, **kwargs):
model_path = path_append(model_dir, PRETRAINED_MODELS[name][0].split('/')[-1], to_str=True)
for i in [".tar.gz", ".tar.bz2", ".tar.bz", ".tar.tgz", ".tar", ".tgz", ".zip", ".rar"]:
model_path = model_path.replace(i, "")
logger.info("model_path: %s" % model_path)
tokenizer_kwargs = {"pretrain_model": model_path}
return cls("bert", name, pretrained_t2v=True, model_dir=model_dir,
tokenizer_kwargs=tokenizer_kwargs)


MODELS = {
"d2v_all_256": [D2V, "d2v_all_256"],
"d2v_sci_256": [D2V, "d2v_sci_256"],
@@ -255,6 +326,7 @@ def from_pretrained(cls, name, model_dir=MODEL_DIR, *args, **kwargs):
"w2v_lit_300": [W2V, "w2v_lit_300"],
"test_w2v": [W2V, "test_w2v"],
"test_d2v": [D2V, "test_d2v"],
'luna_bert': [Bert, 'luna_bert'],
}


1 change: 1 addition & 0 deletions EduNLP/Pretrain/__init__.py
@@ -2,3 +2,4 @@
# 2021/5/29 @ tongshiwei

from .gensim_vec import train_vector, GensimWordTokenizer, GensimSegTokenizer
from .bert_vec import BertTokenizer, finetune_bert
163 changes: 163 additions & 0 deletions EduNLP/Pretrain/bert_vec.py
@@ -0,0 +1,163 @@
from EduNLP import logger
import multiprocessing
import transformers
from EduNLP.Tokenizer import PureTextTokenizer
from copy import deepcopy
from typing import Optional, Union
import itertools as it
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers.file_utils import TensorType
from torch.utils.data import Dataset
from EduNLP.SIF import Symbol, FORMULA_SYMBOL, FIGURE_SYMBOL, QUES_MARK_SYMBOL, TAG_SYMBOL, SEP_SYMBOL


__all__ = ["BertTokenizer", "finetune_bert"]


class BertTokenizer(object):
"""

Parameters
----------
pretrain_model: str
the pretrained model to use (name or path)

Returns
----------

Examples
----------
>>> tokenizer = BertTokenizer()
>>> item = "有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\
... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$"
>>> token_item = tokenizer(item)
>>> print(token_item.input_ids[:10])
[101, 1062, 2466, 1963, 1745, 21129, 166, 117, 167, 5276]
>>> print(tokenizer.tokenize(item)[:10])
['公', '式', '如', '图', '[FIGURE]', 'x', ',', 'y', '约', '束']
>>> items = [item, item]
>>> token_items = tokenizer(items, return_tensors='pt')
>>> print(token_items.input_ids.shape)
torch.Size([2, 27])
>>> print(len(tokenizer.tokenize(items)))
2
"""
def __init__(self, pretrain_model="bert-base-chinese"):
self.tokenizer = AutoTokenizer.from_pretrained(pretrain_model)
customize_tokens = []
for i in [FORMULA_SYMBOL, FIGURE_SYMBOL, QUES_MARK_SYMBOL, TAG_SYMBOL, SEP_SYMBOL]:
if i not in self.tokenizer.additional_special_tokens:
customize_tokens.append(Symbol(i))
if customize_tokens:
self.tokenizer.add_special_tokens({'additional_special_tokens': customize_tokens})
self.pure_text_tokenizer = PureTextTokenizer()

def __call__(self, item: (list, str), return_tensors: Optional[Union[str, TensorType]] = None, *args, **kwargs):
if isinstance(item, str):
item = ''.join(next(self.pure_text_tokenizer([item])))
else:
token_generation = self.pure_text_tokenizer(item)
item = [''.join(next(token_generation)) for i in range(len(item))]
return self.tokenizer(item, truncation=True, padding=True, return_tensors=return_tensors)

def tokenize(self, item: (list, str), *args, **kwargs):
if isinstance(item, str):
item = ''.join(next(self.pure_text_tokenizer([item])))
return self.tokenizer.tokenize(item)
else:
token_generation = self.pure_text_tokenizer(item)
item = [''.join(next(token_generation)) for i in range(len(item))]
item = [self.tokenizer.tokenize(i) for i in item]
return item


class FinetuneDataset(Dataset):
def __init__(self, items):
self.items = items
self.len = len(items)

def __getitem__(self, index):
return self.items[index]

def __len__(self):
return self.len


def finetune_bert(items, output_dir, pretrain_model="bert-base-chinese", train_params=None):
Collaborator @KenelmQLH commented on Oct 19, 2021:

Please complete the code comments of the functions.

"""

Parameters
----------
items: list
the tokenization results of the questions
output_dir: str
the path to save the model
pretrain_model: str
the name or path of pre-trained model
train_params: dict
the training parameters passed to Trainer

Examples
----------
>>> tokenizer = BertTokenizer()
>>> stems = ["有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$",
... "有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$"]
>>> token_item = [tokenizer(i) for i in stems]
>>> print(token_item[0].keys())
dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
>>> finetune_bert(token_item, "examples/test_model/data/bert") #doctest: +ELLIPSIS
{'train_runtime': ..., ..., 'epoch': 1.0}
"""
model = AutoModelForMaskedLM.from_pretrained(pretrain_model)
tokenizer = BertTokenizer(pretrain_model)
# resize embeddings for additional special tokens
model.resize_token_embeddings(len(tokenizer.tokenizer))

# training parameters
if train_params:
mlm_probability = train_params['mlm_probability'] if 'mlm_probability' in train_params else 0.15
epochs = train_params['epochs'] if 'epochs' in train_params else 1
batch_size = train_params['batch_size'] if 'batch_size' in train_params else 64
save_steps = train_params['save_steps'] if 'save_steps' in train_params else 100
save_total_limit = train_params['save_total_limit'] if 'save_total_limit' in train_params else 2
logging_steps = train_params['logging_steps'] if 'logging_steps' in train_params else 5
gradient_accumulation_steps = train_params['gradient_accumulation_steps'] \
if 'gradient_accumulation_steps' in train_params else 1
else:
# default
mlm_probability = 0.15
epochs = 1
batch_size = 64
save_steps = 1000
save_total_limit = 2
logging_steps = 5
gradient_accumulation_steps = 1

data_collator = DataCollatorForLanguageModeling(
tokenizer=tokenizer.tokenizer, mlm=True, mlm_probability=mlm_probability
)

dataset = FinetuneDataset(items)

training_args = TrainingArguments(
output_dir=output_dir,
overwrite_output_dir=True,
num_train_epochs=epochs,
per_device_train_batch_size=batch_size,
save_steps=save_steps,
save_total_limit=save_total_limit,
logging_steps=logging_steps,
gradient_accumulation_steps=gradient_accumulation_steps,
)

trainer = Trainer(
Collaborator @KenelmQLH commented on Oct 19, 2021:

It seems that the Trainer trains on raw items, which only uses the original AutoTokenizer inside BertTokenizer. In this case, the special tokens of EduNLP in the items are not parsed by the PureTextTokenizer in BertTokenizer. Should a data-preprocessing step be provided before training?

Collaborator (author) replied:

The input items of finetune_bert need to be tokenized by BertTokenizer first, which means the special tokens have already been mapped to token ids. In this function, the tokenizer is not actually used for tokenization; it is only used to get some attributes (e.g. the size of the vocabulary). An example of this function can be found in tests/test_vec/test_bert.py.
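
A minimal sketch of that flow, based on the docstring example of finetune_bert (the output directory and training parameters below are illustrative):

from EduNLP.Pretrain import BertTokenizer, finetune_bert

# Tokenize first: BertTokenizer maps the EduNLP special tokens (e.g. [FIGURE])
# to token ids before fine-tuning.
tokenizer = BertTokenizer(pretrain_model="bert-base-chinese")
stems = ["有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$",
         "若$x,y$满足约束条件,则$z=x+7y$的最大值为$\\SIFBlank$"]
items = [tokenizer(i) for i in stems]

# The already-tokenized items go straight to the Trainer; the tokenizer inside
# finetune_bert is only used for attributes such as the vocabulary size.
finetune_bert(items, "examples/test_model/data/bert",
              train_params={"epochs": 1, "batch_size": 8})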

Collaborator (author) replied:

I will complete the code comments for better understanding.

Collaborator replied:

Good, I get it.

model=model,
args=training_args,
data_collator=data_collator,
tokenizer=tokenizer.tokenizer,
train_dataset=dataset,
)
trainer.train()
trainer.save_model(output_dir)
1 change: 1 addition & 0 deletions EduNLP/SIF/__init__.py
@@ -3,3 +3,4 @@

from .sif import is_sif, to_sif, sif4sci
from .tokenization import link_formulas
from .constants import *
3 changes: 2 additions & 1 deletion EduNLP/Vector/__init__.py
@@ -4,5 +4,6 @@
from .gensim_vec import W2V, D2V, BowLoader, TfidfLoader
from .const import *
from .rnn import RNNModel
from .t2v import T2V, get_pretrained_t2v
from .t2v import T2V, get_pretrained_t2v, PRETRAINED_MODELS
from .embedding import Embedding
from .bert_vec import BertModel
57 changes: 57 additions & 0 deletions EduNLP/Vector/bert_vec.py
@@ -0,0 +1,57 @@
import numpy as np
from pathlib import PurePath
from transformers import AutoModel
from .const import UNK, PAD
from .meta import Vector
import torch


class BertModel(Vector):
"""
Examples
--------
>>> from EduNLP.Pretrain import BertTokenizer
>>> tokenizer = BertTokenizer("bert-base-chinese")
>>> model = BertModel("bert-base-chinese", tokenizer=tokenizer)
>>> item = ["有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,若$x,y$满足约束",
... "有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,若$x,y$满足约束"]
>>> inputs = tokenizer(item, return_tensors='pt')
>>> output = model(inputs)
>>> output.shape
torch.Size([2, 12, 768])
>>> tokens = model.infer_tokens(inputs)
>>> tokens.shape
torch.Size([2, 10, 768])
>>> tokens = model.infer_tokens(inputs, return_special_tokens=True)
>>> tokens.shape
torch.Size([2, 12, 768])
>>> item = model.infer_vector(inputs)
>>> item.shape
torch.Size([2, 768])
"""
def __init__(self, pretrained_model, tokenizer=None):
self.model = AutoModel.from_pretrained(pretrained_model)
if tokenizer:
self.model.resize_token_embeddings(len(tokenizer.tokenizer))

def __call__(self, items: dict):
# 1, sent_len, embedding_size
tokens = self.model(**items).last_hidden_state
return tokens

def infer_vector(self, items: dict) -> torch.Tensor:
vector = self(items)
return vector[:, 0, :]

def infer_tokens(self, items: dict, return_special_tokens=False) -> torch.Tensor:
tokens = self(items)
if return_special_tokens:
# include embedding of [CLS] and [SEP]
return tokens
else:
# ignore embedding of [CLS] and [SEP]
return tokens[:, 1:-1, :]

@property
def vector_size(self):
return self.model.config.hidden_size
8 changes: 6 additions & 2 deletions EduNLP/Vector/t2v.py
@@ -6,16 +6,19 @@
from EduData import get_data
from .rnn import RNNModel
from .gensim_vec import W2V, D2V
from .bert_vec import BertModel
from .meta import Vector
from EduNLP.constant import MODEL_DIR


MODELS = {
"w2v": W2V,
"d2v": D2V,
"rnn": RNNModel,
"lstm": RNNModel,
"gru": RNNModel,
"elmo": RNNModel
"elmo": RNNModel,
'bert': BertModel
}


@@ -69,7 +72,8 @@ def vector_size(self) -> int:
"w2v_eng_300": ["http://base.ustc.edu.cn/data/model_zoo/EduNLP/w2v/general_english_300.zip", "w2v"],
"w2v_lit_300": ["http://base.ustc.edu.cn/data/model_zoo/EduNLP/w2v/general_literal_300.zip", "w2v"],
"test_w2v": ["http://base.ustc.edu.cn/data/model_zoo/EduNLP/w2v/test_w2v_256.zip", "w2v"],
"test_d2v": ["http://base.ustc.edu.cn/data/model_zoo/EduNLP/d2v/test_256.zip", "d2v"]
"test_d2v": ["http://base.ustc.edu.cn/data/model_zoo/EduNLP/d2v/test_256.zip", "d2v"],
"luna_bert": ["http://base.ustc.edu.cn/data/model_zoo/EduNLP/LUNABert.zip", "bert"]
}

