Merged
Changes from all 48 commits
f8d3115
add text format segmentation
pingzhili Aug 12, 2021
5d0e931
Update AUTHORS.md
pingzhili Aug 12, 2021
fe104a4
remove class TextFSegment
pingzhili Aug 12, 2021
5ed4fc1
remove class TextFSegment
pingzhili Aug 12, 2021
51bbf2e
[feat] add key parameter
tswsxk Aug 13, 2021
5674f91
Add text format examples and fix type bug
pingzhili Aug 15, 2021
26c3a5f
Add text format examples
pingzhili Aug 15, 2021
d0024dc
[FEATURE] add W2V in I2V for get_pretrained_i2v
KenelmQLH Aug 15, 2021
b90c1ed
[FEATURE] add GeneralTokenizer in Tokenizer
KenelmQLH Aug 15, 2021
6f0403d
Update sif.py
pingzhili Aug 15, 2021
3e4d6fa
Update segment.py
pingzhili Aug 15, 2021
77a6f15
Rollback Parser process in sif.py
pingzhili Aug 16, 2021
0d110dc
Update parser.py
pingzhili Aug 16, 2021
3e49d88
Update parser.py
pingzhili Aug 17, 2021
45eaa71
Update parser.py
pingzhili Aug 17, 2021
83ba24b
Update sif.py
pingzhili Aug 17, 2021
74b9443
Update parser.py
pingzhili Aug 17, 2021
93408bf
add test for w2v in i2v
KenelmQLH Aug 18, 2021
f2582dc
[FEATURE] rename two Tokenizer
KenelmQLH Aug 18, 2021
dc8969a
[DOC] add two tokenizer examples
KenelmQLH Aug 18, 2021
1aaadb9
move examples dir
KenelmQLH Aug 18, 2021
24b6e97
tokenizer examples
KenelmQLH Aug 18, 2021
77e8108
Merge pull request #36 from KenelmQLH/i2v
tswsxk Aug 18, 2021
c3159c0
Update segment.py
pingzhili Aug 19, 2021
61ef101
Merge branch 'master' into i2v
KenelmQLH Aug 20, 2021
85621a3
Merge pull request #43 from bigdata-ustc/i2v
tswsxk Aug 20, 2021
b023a34
[feature] rename variable and pythonoicing code
tswsxk Aug 20, 2021
4ebaa35
[fix] flake8
tswsxk Aug 20, 2021
c078ce3
[fix] FLAKE8
tswsxk Aug 20, 2021
cc357e6
Merge pull request #30 from pingzhiLi/parser
tswsxk Aug 20, 2021
af4189e
[version] 0.0.6
tswsxk Aug 20, 2021
bc92aab
[docs] changes
tswsxk Aug 20, 2021
d14c317
add a warning of line breaks in formula
fannazya Aug 21, 2021
df8d9cf
fix an error
fannazya Aug 21, 2021
ef55974
Merge pull request #45 from fannazya/dev
tswsxk Aug 21, 2021
4c297a4
Revert "[FEATURE] Add a warning of line breaks in formula"
tswsxk Aug 21, 2021
7bcfec9
Merge pull request #46 from bigdata-ustc/revert-45-dev
tswsxk Aug 22, 2021
19a319a
Create AUTHORS.md
BAOOOOOM Aug 22, 2021
87c56b5
Create CHANGE.txt
BAOOOOOM Aug 22, 2021
f53a89b
Create AUTHORS.md
BAOOOOOM Aug 22, 2021
0d123b0
Create README.md
BAOOOOOM Aug 22, 2021
9511f8e
Create AUTHORS.md
BAOOOOOM Aug 22, 2021
5fe80e7
Create AUTHORS.md
BAOOOOOM Aug 22, 2021
e7f84e0
Merge pull request #6 from bigdata-ustc/dev
BAOOOOOM Aug 22, 2021
8d5262a
Create AUTHORS.md
BAOOOOOM Aug 22, 2021
6cde627
Create CHANGE.txt
BAOOOOOM Aug 22, 2021
0c4a373
Create CHANGE.txt
BAOOOOOM Aug 22, 2021
eb642f6
Create CHANGE.txt
BAOOOOOM Aug 22, 2021
3 changes: 3 additions & 0 deletions AUTHORS.md
@@ -12,6 +12,9 @@

[Longhu Qin](https://github.com/KenelmQLH)

[Pingzhi Li](https://github.com/pingzhiLi)

[Meikai Bao](https://github.com/BAOOOOOM)


The starred contributors are the corresponding authors.
7 changes: 6 additions & 1 deletion CHANGE.txt
@@ -1,3 +1,8 @@
v0.0.6:
1. dev: add half-pretrained rnn model
2. important!!!: rename TextTokenizer to PureTextTokenizer, and add a new tokenizer named TextTokenizer (the two have similar but not identical behaviours).
3. sif: add $\textf{}$ syntax

v0.0.5:
1. fix the missing stopwords.txt when installing via pip

@@ -18,4 +23,4 @@ v0.0.2:
v0.0.1:
1. Add Formula class to parse latex formula, which will generate the abstract syntax tree.
2. Add SIF v0.0.2.
3. Add sif4sci function which serves as a preprocess function for downstream tasks.
3. Add sif4sci function which serves as a preprocess function for downstream tasks.
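Since the rename is a breaking change, here is a minimal sketch of how the two tokenizers now differ, based on the docstrings added in this PR (the item text is illustrative):

```python
from EduNLP.Tokenizer import PureTextTokenizer, TextTokenizer

items = [r"有公式$\FormFigureID{1}$,如图$\FigureID{088f15ea-xxx}$,则$z=x+7y$的最大值为$\SIFBlank$"]

# PureTextTokenizer (the old TextTokenizer) skips figure-coded formulas entirely,
# so no [FORMULA] token appears in its output.
pure_tokens = next(PureTextTokenizer()(items))

# The new TextTokenizer symbolizes them as [FORMULA] instead of dropping them.
text_tokens = next(TextTokenizer()(items))
```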
2 changes: 1 addition & 1 deletion EduNLP/I2V/__init__.py
@@ -2,4 +2,4 @@
# 2021/8/1 @ tongshiwei

from .i2v import I2V, get_pretrained_i2v
from .i2v import D2V
from .i2v import D2V, W2V
32 changes: 25 additions & 7 deletions EduNLP/I2V/i2v.py
@@ -7,7 +7,7 @@
from ..Tokenizer import Tokenizer, get_tokenizer
from EduNLP import logger

__all__ = ["I2V", "D2V", "get_pretrained_i2v"]
__all__ = ["I2V", "D2V", "W2V", "get_pretrained_i2v"]


class I2V(object):
@@ -27,6 +27,7 @@ class I2V(object):
kwargs:
the parameters passed to t2v
"""

def __init__(self, tokenizer, t2v, *args, tokenizer_kwargs: dict = None, pretrained_t2v=False, **kwargs):

self.tokenizer: Tokenizer = get_tokenizer(tokenizer, **tokenizer_kwargs if tokenizer_kwargs is not None else {})
@@ -47,10 +48,11 @@ def __init__(self, tokenizer, t2v, *args, tokenizer_kwargs: dict = None, pretrai
def __call__(self, items, *args, **kwargs):
return self.infer_vector(items, *args, **kwargs)

def tokenize(self, items, indexing=True, padding=False, *args, **kwargs) -> list:
return self.tokenizer(items, *args, **kwargs)
def tokenize(self, items, indexing=True, padding=False, key=lambda x: x, *args, **kwargs) -> list:
return self.tokenizer(items, key=key, *args, **kwargs)

def infer_vector(self, items, tokenize=True, indexing=False, padding=False, *args, **kwargs) -> tuple:
def infer_vector(self, items, tokenize=True, indexing=False, padding=False, key=lambda x: x, *args,
**kwargs) -> tuple:
raise NotImplementedError

def infer_item_vector(self, tokens, *args, **kwargs) -> ...:
@@ -84,20 +86,36 @@ def vector_size(self):


class D2V(I2V):
def infer_vector(self, items, tokenize=True, indexing=False, padding=False, *args, **kwargs) -> tuple:
tokens = self.tokenize(items, return_token=True) if tokenize is True else items
def infer_vector(self, items, tokenize=True, indexing=False, padding=False, key=lambda x: x, *args,
**kwargs) -> tuple:
tokens = self.tokenize(items, return_token=True, key=key) if tokenize is True else items
tokens = [token for token in tokens]
return self.t2v(tokens, *args, **kwargs), None

@classmethod
def from_pretrained(cls, name, model_dir=MODEL_DIR, *args, **kwargs):
return cls("text", name, pretrained_t2v=True, model_dir=model_dir)
return cls("pure_text", name, pretrained_t2v=True, model_dir=model_dir)


class W2V(I2V):
def infer_vector(self, items, tokenize=True, indexing=False, padding=False, key=lambda x: x, *args,
**kwargs) -> tuple:
tokens = self.tokenize(items, return_token=True) if tokenize is True else items
tokens = [token for token in tokens]
return self.t2v(tokens, *args, **kwargs), self.t2v.infer_tokens(tokens, *args, **kwargs)

@classmethod
def from_pretrained(cls, name, model_dir=MODEL_DIR, *args, **kwargs):
return cls("pure_text", name, pretrained_t2v=True, model_dir=model_dir)


MODELS = {
"d2v_all_256": [D2V, "d2v_all_256"],
"d2v_sci_256": [D2V, "d2v_sci_256"],
"d2v_eng_256": [D2V, "d2v_eng_256"],
"d2v_lit_256": [D2V, "d2v_lit_256"],
"w2v_sci_300": [W2V, "w2v_sci_300"],
"w2v_lit_300": [W2V, "w2v_lit_300"],
}


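A usage sketch for the new W2V wrapper; "w2v_lit_300" is taken from the MODELS registry above, while the cache directory and the exact shape of the returned vectors are assumptions:

```python
from EduNLP.I2V import get_pretrained_i2v

# Downloads and unpacks the pretrained word2vec model on first use.
i2v = get_pretrained_i2v("w2v_lit_300", model_dir="./model")

# Unlike D2V, W2V also returns per-token vectors via t2v.infer_tokens,
# so the second element of the tuple is no longer None.
item_vectors, token_vectors = i2v(["已知$y=x$,则$A \\cap B=$"])
```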
2 changes: 1 addition & 1 deletion EduNLP/SIF/parser/parser.py
@@ -76,7 +76,7 @@ def _is_formula_legal(self, formula_str):

"""
legal_tags = ['FormFigureID', 'FormFigureBase64', 'FigureID', 'FigureBase64',
'SIFBlank', 'SIFChoice', 'SIFTag', 'SIFSep', 'SIFUnderline']
'SIFBlank', 'SIFChoice', 'SIFTag', 'SIFSep', 'SIFUnderline', 'textf']
for tag in legal_tags:
if tag in formula_str:
return True
10 changes: 9 additions & 1 deletion EduNLP/SIF/segment/segment.py
@@ -84,7 +84,11 @@ def __init__(self, item, figures: dict = None):
self._ques_mark_segments = []
self._tag_segments = []
self._sep_segments = []
segments = re.split(r"(\$.+?\$)", item)

# remove $\textf{*}$ from the item
item_no_textf = "".join(re.split(r"\$\\textf\{([^,]+?),b?d?i?t?u?w?}\$", item))

segments = re.split(r"(\$.+?\$)", item_no_textf)
for segment in segments:
if not segment:
continue
@@ -294,6 +298,10 @@ def seg(item, figures=None, symbol=None):
>>> s2 = seg(test_item_1_str_2, symbol="fgm")
>>> s2.tag_segments
['\\SIFTag{stem}', '\\SIFTag{options}']
>>> test_item_2 = r"已知$y=x$,则以下说法中$\textf{正确,b}$的是"
>>> s2 = seg(test_item_2)
>>> s2.text_segments
['已知', ',则以下说法中正确的是']
"""
segments = SegmentList(item, figures)
if symbol is not None:
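The one-line regex above is dense; here is a standalone sketch of just that step (the trailing b?d?i?t?u?w? letters appear to be the permitted style flags after the comma, e.g. b for bold):

```python
import re

item = r"已知$y=x$,则以下说法中$\textf{正确,b}$的是"

# re.split with a capture group keeps the captured text ("正确") in the result
# list, so joining drops only the $\textf{...,flag}$ wrapper around it.
pattern = r"\$\\textf\{([^,]+?),b?d?i?t?u?w?}\$"
item_no_textf = "".join(re.split(pattern, item))
print(item_no_textf)  # 已知$y=x$,则以下说法中正确的是
```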
5 changes: 5 additions & 0 deletions EduNLP/SIF/sif.py
@@ -182,6 +182,11 @@ def sif4sci(item: str, figures: (dict, bool) = None, safe=True, symbol: str = No
[]
>>> tl3.ques_mark_segments
[['\\SIFChoice']]
>>> test_item_3 = r"已知$y=x$,则以下说法中$\textf{正确,b}$的是"
>>> tl4 = sif4sci(test_item_3)
Warning: there is some chinese characters in formula!
>>> tl4.text_segments
[['已知'], ['说法', '中', '正确']]
"""
try:
if safe is True and is_sif(item) is not True:
15 changes: 11 additions & 4 deletions EduNLP/SIF/tokenization/tokenization.py
@@ -38,10 +38,14 @@ def __init__(self, segment_list: SegmentList, text_params=None, formula_params=N
"s": []
}
self.text_params = text_params if text_params is not None else {}
if formula_params is not None and "symbolize_figure_formula" in formula_params:
self.symbolize_figure_formula = formula_params.pop("symbolize_figure_formula")
else:
self.symbolize_figure_formula = False
self.symbolize_figure_formula = False
self.skip_figure_formula = False
if formula_params is not None:
if "symbolize_figure_formula" in formula_params:
self.symbolize_figure_formula = formula_params.pop("symbolize_figure_formula")
if "skip_figure_formula" in formula_params:
self.skip_figure_formula = formula_params.pop("skip_figure_formula")

self.formula_params = formula_params if formula_params is not None else {"method": "linear"}
self.formula_tokenize_method = self.formula_params.get("method")
self.figure_params = figure_params if figure_params is not None else {}
@@ -175,6 +179,9 @@ def append_formula(self, segment, symbol=False, init=True):
if symbol is True:
self._formula_tokens.append(len(self._tokens))
self._tokens.append(segment)
elif self.skip_figure_formula and isinstance(segment, FigureFormulaSegment):
# skip the FigureFormulaSegment
pass
elif self.symbolize_figure_formula and isinstance(segment, FigureFormulaSegment):
self._formula_tokens.append(len(self._tokens))
self._tokens.append(Symbol(FORMULA_SYMBOL))
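A sketch of the new skip_figure_formula switch, wired up the same way PureTextTokenizer does later in this PR; the item text is illustrative and the tokenize signature is assumed to forward formula_params as shown:

```python
from EduNLP.SIF.segment import seg
from EduNLP.SIF.tokenization import tokenize

item = r"有公式$\FormFigureID{1}$,如图$\FigureID{088f15ea-xxx}$,则$z=x+7y$"

# With skip_figure_formula=True, figure-coded formulas are dropped from the
# token stream; symbolize_figure_formula=True would emit [FORMULA] instead.
tokens = tokenize(seg(item, symbol="gmas"),
                  formula_params={"method": "linear", "skip_figure_formula": True}).tokens
```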
52 changes: 47 additions & 5 deletions EduNLP/Tokenizer/tokenizer.py
@@ -5,41 +5,83 @@
from ..SIF.segment import seg
from ..SIF.tokenization import tokenize

__all__ = ["TOKENIZER", "Tokenizer", "TextTokenizer", "get_tokenizer"]
__all__ = ["TOKENIZER", "Tokenizer", "PureTextTokenizer", "TextTokenizer", "get_tokenizer"]


class Tokenizer(object):
def __call__(self, *args, **kwargs):
raise NotImplementedError


class TextTokenizer(Tokenizer):
class PureTextTokenizer(Tokenizer):
r"""

Examples
--------
>>> tokenizer = PureTextTokenizer()
>>> items = ["有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\
... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$"]
>>> tokens = tokenizer(items)
>>> next(tokens)[:10]
['公式', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[SEP]', 'z']
>>> items = ["已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$"]
>>> tokenizer = PureTextTokenizer()
>>> tokens = tokenizer(items)
>>> next(tokens) # doctest: +NORMALIZE_WHITESPACE
['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<',
'0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',',
'\\quad', 'A', '\\cap', 'B', '=']
>>> items = [{
... "stem": "已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$",
... "options": ["1", "2"]
... }]
>>> tokens = tokenizer(items, key=lambda x: x["stem"])
>>> next(tokens) # doctest: +NORMALIZE_WHITESPACE
['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<',
'0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',',
'\\quad', 'A', '\\cap', 'B', '=']
"""

def __init__(self, *args, **kwargs):
self.tokenization_params = {
"formula_params": {
"method": "linear",
"skip_figure_formula": True
}
}

def __call__(self, items: Iterable, key=lambda x: x, *args, **kwargs):
for item in items:
yield tokenize(seg(key(item), symbol="gmas"), **self.tokenization_params).tokens


class TextTokenizer(Tokenizer):
r"""

Examples
----------
>>> tokenizer = TextTokenizer()
>>> items = ["有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\
... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$"]
>>> tokens = tokenizer(items)
>>> next(tokens)[:10]
['公式', '[FORMULA]', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[FORMULA]']
"""

def __init__(self, *args, **kwargs):
self.tokenization_params = {
"formula_params": {
"method": "linear",
"symbolize_figure_formula": True
}
}

def __call__(self, items: Iterable, *args, **kwargs):
def __call__(self, items: Iterable, key=lambda x: x, *args, **kwargs):
for item in items:
yield tokenize(seg(item, symbol="gmas"), **self.tokenization_params).tokens
yield tokenize(seg(key(item), symbol="gmas"), **self.tokenization_params).tokens


TOKENIZER = {
"pure_text": PureTextTokenizer,
"text": TextTokenizer
}

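With the registry above, both tokenizers are reachable through get_tokenizer, and the new key parameter makes dict-shaped items work end to end; a minimal sketch (the item content is illustrative):

```python
from EduNLP.Tokenizer import get_tokenizer

items = [{
    "stem": "已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}$,则$A$中的整数个数为",
    "options": ["1", "2"],
}]

# "pure_text" resolves to PureTextTokenizer; key extracts the text to tokenize.
tokenizer = get_tokenizer("pure_text")
tokens = next(tokenizer(items, key=lambda x: x["stem"]))
```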
13 changes: 11 additions & 2 deletions EduNLP/Vector/t2v.py
@@ -31,6 +31,12 @@ def __init__(self, model: str, *args, **kwargs):
def __call__(self, items, *args, **kwargs):
return self.i2v.infer_vector(items, *args, **kwargs)

def infer_vector(self, items, *args, **kwargs):
return self.i2v.infer_vector(items, *args, **kwargs)

def infer_tokens(self, items, *args, **kwargs):
return self.i2v.infer_tokens(items, *args, **kwargs)

@property
def vector_size(self) -> int:
return self.i2v.vector_size
@@ -41,6 +47,8 @@ def vector_size(self) -> int:
"d2v_sci_256": ["http://base.ustc.edu.cn/data/model_zoo/EduNLP/d2v/general_science_256.zip", "d2v"],
"d2v_eng_256": ["http://base.ustc.edu.cn/data/model_zoo/EduNLP/d2v/general_english_256.zip", "d2v"],
"d2v_lit_256": ["http://base.ustc.edu.cn/data/model_zoo/EduNLP/d2v/general_literal_256.zip", "d2v"],
"w2v_eng_300": ["http://base.ustc.edu.cn/data/model_zoo/EduNLP/w2v/general_english_300.zip", "w2v"],
"w2v_lit_300": ["http://base.ustc.edu.cn/data/model_zoo/EduNLP/w2v/general_literal_300.zip", "w2v"],
}


Expand All @@ -52,6 +60,7 @@ def get_pretrained_t2v(name, model_dir=MODEL_DIR):
)
url, model_name, *args = PRETRAINED_MODELS[name]
model_path = get_data(url, model_dir)
if model_name in ["d2v"]:
model_path = path_append(model_path, os.path.basename(model_path) + ".bin", to_str=True)
if model_name in ["d2v", "w2v"]:
postfix = ".bin" if model_name == "d2v" else ".kv"
model_path = path_append(model_path, os.path.basename(model_path) + postfix, to_str=True)
return T2V(model_name, model_path, *args)
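A standalone sketch of what the new branch resolves to for each model family (the .bin/.kv file layout is inferred from the postfix convention above; paths are illustrative):

```python
import os

def resolve_model_file(model_name: str, model_path: str) -> str:
    # d2v archives unpack to a ".bin" model file; w2v archives to a ".kv"
    # keyed-vectors file, each named after the unpacked directory.
    if model_name in ("d2v", "w2v"):
        postfix = ".bin" if model_name == "d2v" else ".kv"
        return os.path.join(model_path, os.path.basename(model_path) + postfix)
    return model_path

print(resolve_model_file("w2v", "./model/general_literal_300"))
# ./model/general_literal_300/general_literal_300.kv
```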