8 changes: 2 additions & 6 deletions EduNLP/Formula/Formula.py
@@ -15,6 +15,7 @@

class Formula(object):
"""
Transform a formula into a parsed abstract syntax tree.

Parameters
----------
@@ -41,9 +42,6 @@ class Formula(object):
>>> f.elements
[{'id': 0, 'type': 'mathord', 'text': 'x', 'role': None, 'var': 0}]

Returns
--------
the parsed abstracted syntax tree
"""
def __init__(self, formula: (str, List[Dict]), variable_standardization=False, const_mathord=None,
init=True, *args, **kwargs):
@@ -129,6 +127,7 @@ def resetable(self):

class FormulaGroup(object):
"""
Transform a group of formulas into a parsed abstract syntax forest.

Parameters
----------
@@ -157,9 +156,6 @@ class FormulaGroup(object):
{'id': 1, 'type': 'mathord', 'text': 'y', 'role': None, 'var': 1}, \
{'id': 2, 'type': 'mathord', 'text': 'x', 'role': None, 'var': 0}]

Returns
--------
the parsed abstracted syntax forest
"""
def __init__(self,
formula_list: (list, List[str], List[Formula]),
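To ground the two summaries above, a minimal usage sketch following the doctests; it assumes both classes are exported from `EduNLP.Formula`.

```python
# A minimal sketch, assuming Formula and FormulaGroup are importable
# from EduNLP.Formula as the doctests suggest.
from EduNLP.Formula import Formula, FormulaGroup

# Parse one formula into an abstract syntax tree; with
# variable_standardization=True, identical variables share a `var` id.
f = Formula("x", variable_standardization=True)
print(f.elements)
# [{'id': 0, 'type': 'mathord', 'text': 'x', 'role': None, 'var': 0}]

# Parse several formulas into one forest, so a variable shared across
# formulas (here x) keeps the same `var` id everywhere.
fg = FormulaGroup(["x + y", "x"], variable_standardization=True)
print(fg.elements)
```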
12 changes: 10 additions & 2 deletions EduNLP/I2V/i2v.py
@@ -108,8 +108,11 @@ def vector_size(self):

class D2V(I2V):
"""
This model transfers an item directly to a vector.

Bases: I2V
Bases
-------
I2V

Parameters
-----------
@@ -144,6 +147,7 @@ class D2V(I2V):
def infer_vector(self, items, tokenize=True, indexing=False, padding=False, key=lambda x: x, *args,
**kwargs) -> tuple:
'''
Convert an item to a vector. The model must be loaded before this function is used.

Parameters
-----------
@@ -175,8 +179,11 @@ def from_pretrained(cls, name, model_dir=MODEL_DIR, *args, **kwargs):

class W2V(I2V):
"""
This model transfers tokens to vectors.

Bases: I2V
Bases
--------
I2V

Parameters
-----------
@@ -209,6 +216,7 @@ class W2V(I2V):
def infer_vector(self, items, tokenize=True, indexing=False, padding=False, key=lambda x: x, *args,
**kwargs) -> tuple:
'''
Convert an item to a vector. The model must be loaded before this function is used.

Parameters
-----------
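A hedged sketch of how these two classes are used; `from_pretrained` and `infer_vector` follow the signatures shown in this diff, while the model name and directory below are illustrative stand-ins, not real assets.

```python
from EduNLP.I2V import D2V  # assumed export path

# "d2v_test_256" and the model_dir are made-up example values.
i2v = D2V.from_pretrained("d2v_test_256", model_dir="path/to/models")

items = ["如图，若 $x,y$ 满足约束条件，则 $z=x+7y$ 的最大值为 $\\SIFBlank$"]
# infer_vector returns a tuple of vectors from the loaded model.
item_vector, token_vector = i2v.infer_vector(items, tokenize=True)
```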
3 changes: 2 additions & 1 deletion EduNLP/ModelZoo/rnn/rnn.py
@@ -91,7 +91,8 @@ def forward(self, seq_idx, seq_len):

Returns
--------
a PackedSequence object
sequence
a PackedSequence object
"""
seq = self.embedding(seq_idx)
pack = pack_padded_sequence(seq, seq_len, batch_first=True)
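Since the Returns entry now names a `PackedSequence`, a small standalone illustration of that object in plain PyTorch (independent of the model code above):

```python
import torch
from torch.nn.utils.rnn import pack_padded_sequence

# Two padded sequences with true lengths 5 and 3, embedding dim 8.
seq = torch.randn(2, 5, 8)
pack = pack_padded_sequence(seq, torch.tensor([5, 3]), batch_first=True)
print(type(pack).__name__)  # PackedSequence
```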
5 changes: 3 additions & 2 deletions EduNLP/ModelZoo/utils/masker.py
@@ -15,7 +15,7 @@ class Masker(object):
seed

Examples
-------
---------
>>> masker = Masker(per=0.5, seed=10)
>>> items = [[1, 1, 3, 4, 6], [2], [5, 9, 1, 4]]
>>> masked_seq, mask_label = masker(items)
@@ -39,7 +39,8 @@ class Masker(object):

Returns
----------
list:list of masked_seq and list of masked_list
list
list of masked sequences and list of mask labels
"""
def __init__(self, mask: (int, str, ...) = 0, per=0.2, seed=None):
self.seed = np.random.default_rng(seed)
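A hedged usage sketch matching the doctest above; the import path is an assumption based on the file location.

```python
from EduNLP.ModelZoo.utils import Masker  # assumed import path

# With per=0.5, roughly half the positions in each sequence are replaced
# by the mask value (0 by default); mask_label records which positions
# were masked -- see the doctest for the exact convention.
masker = Masker(per=0.5, seed=10)
masked_seq, mask_label = masker([[1, 1, 3, 4, 6], [2], [5, 9, 1, 4]])
```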
6 changes: 4 additions & 2 deletions EduNLP/ModelZoo/utils/padder.py
@@ -21,7 +21,8 @@ class PadSequence(object):

Returns
-------
ret : list of number
ret
list of number
"""
def __init__(self, length, pad_val=0, clip=True):
self._length = length
@@ -53,7 +54,8 @@ def pad_sequence(sequence: list, max_length=None, pad_val=0, clip=True):

Returns
-------
Modified list:padding the sequence in the same size.
list
the sequence padded to the same size

Examples
--------
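A hedged sketch of the two padding utilities; the import path is assumed from the file location, and the expected output follows the documented behavior rather than a verified run.

```python
from EduNLP.ModelZoo.utils import PadSequence, pad_sequence  # assumed path

# Pad a single sequence to max_length; with clip=True, longer
# sequences would be truncated to the same length.
print(pad_sequence([1, 2, 3], max_length=5, pad_val=0))
# expected: [1, 2, 3, 0, 0]

# A reusable padder configured once and applied to many sequences.
pad = PadSequence(length=5, pad_val=0, clip=True)
```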
7 changes: 2 additions & 5 deletions EduNLP/Pretrain/gensim_vec.py
@@ -194,11 +194,8 @@ def train_vector(items, w2v_prefix, embedding_dim=None, method="sg", binary=None
... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$")
>>> print(token_item[:10])
[['公式'], [\\FormFigureID{wrong1?}], ['如图'], ['[FIGURE]'],...['最大值'], ['[MARK]']]
>>> tokenizer = GensimSegTokenizer(symbol="fgm", depth=None)
>>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\
... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$")
>>> print(token_item[:10])
[['公式'], ['[FORMULA]'], ['如图'], ['[FIGURE]'], ['[FORMULA]'],...['最大值'], ['[MARK]']]
>>> train_vector(token_item[:10], "examples/test_model/data/gensim_luna_stem_t_", 100) #doctest: +ELLIPSIS
'examples/test_model/data/gensim_luna_stem_t_sg_100.kv'
"""
monitor = MonitorCallback(["word", "I", "less"])
_train_params = dict(
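The path returned by `train_vector` above is a gensim KeyedVectors file; a hedged sketch of loading it back, assuming it was saved with `KeyedVectors.save`:

```python
from gensim.models import KeyedVectors

# Path taken from the doctest output above.
w2v = KeyedVectors.load("examples/test_model/data/gensim_luna_stem_t_sg_100.kv")
```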
2 changes: 1 addition & 1 deletion EduNLP/SIF/parser/parser.py
@@ -3,8 +3,8 @@


class Parser:
"""initial data and special variable"""
def __init__(self, data):
"""initial data and special variable"""
self.lookahead = 0
self.head = 0
self.text = data
17 changes: 15 additions & 2 deletions EduNLP/SIF/segment/segment.py
@@ -89,7 +89,8 @@ class SegmentList(object):

Returns
----------
list:tokenizated item
list
tokenized item

Examples
--------
@@ -159,36 +160,45 @@ def append(self, segment) -> None:

@property
def segments(self):
"""return segments"""
if self._seg_idx is None:
return self._segments
else:
return [s for i, s in enumerate(self._segments) if i in self._seg_idx]

@property
def text_segments(self):
"""return text segments"""
return [self._segments[i] for i in self._text_segments]

@property
def formula_segments(self):
"""return formula segments"""
return [self._segments[i] for i in self._formula_segments]

@property
def figure_segments(self):
"""return figure segments"""
return [self._segments[i] for i in self._figure_segments]

@property
def ques_mark_segments(self):
"""return question mark segments"""
return [self._segments[i] for i in self._ques_mark_segments]

@property
def tag_segments(self):
"""return tag segments"""
return [self._segments[i] for i in self._tag_segments]

def to_symbol(self, idx, symbol):
"""switch element to its symbol"""
self._segments[idx] = symbol

def symbolize(self, to_symbolize="fgm"):
"""
Replace the designated elements with their symbols. \
This is a good way to protect or preserve elements that we do not want to tokenize.

Parameters
----------
@@ -226,6 +236,7 @@ def symbolize(self, to_symbolize="fgm"):
@contextmanager
def filter(self, drop: (set, str) = "", keep: (set, str) = "*"):
"""
Selectively output the element list. Drop means the element is hidden; keep means it is shown.

Parameters
----------
@@ -267,6 +278,7 @@ def describe(self):

def seg(item, figures=None, symbol=None):
r"""
An interface for SegmentList that presents the segmented item in an appropriate way.

Parameters
----------
@@ -276,7 +288,8 @@

Returns
-------
list:segmented item
list
segmented item

Examples
--------
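A hedged sketch tying `seg()` to the symbolize machinery and the segment properties documented above; the item is illustrative and the import path assumed.

```python
from EduNLP.SIF.segment import seg  # assumed import path

item = r"如图$\FigureID{088f15ea-xxx}$,若$x,y$满足约束条件$\SIFSep$"
s = seg(item, symbol="fgm")   # symbolize figures, formulas and marks
print(s.segments)             # all (possibly symbolized) segments
print(s.text_segments)        # only the text parts
print(s.formula_segments)     # only the formula parts
```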
24 changes: 15 additions & 9 deletions EduNLP/SIF/sif.py
@@ -12,16 +12,19 @@

def is_sif(item):
r"""
Check whether the input item is in SIF format.

Parameters
----------
item:str
the text of question
a raw item representing the question stem

Returns
-------
when item can not be parsed correctly, raise Error;
when item doesn't need to be modified, return Ture;
when item needs to be modified, return False;
bool
when the item cannot be parsed correctly, an error is raised;
when the item does not need to be modified, return True;
when the item needs to be modified, return False

Examples
--------
@@ -45,15 +48,17 @@

def to_sif(item):
r"""
Convert the item to SIF format.

Parameters
----------
items:str
the text of question
a raw item representing the question stem

Returns
-------
item:str
the question's text after sif correction
the item that conforms to SIF format

Examples
--------
Expand All @@ -77,7 +82,7 @@ def sif4sci(item: str, figures: (dict, bool) = None, safe=True, symbol: str = No
Parameters
----------
item:str
the text of question
a raw item representing the question stem
figures:dict
{"FigureID": Base64 encoding of the figure}

@@ -113,8 +118,9 @@ def sif4sci(item: str, figures: (dict, bool) = None, safe=True, symbol: str = No

Returns
-------
When tokenization is False, return SegmentList;
When tokenization is True, return TokenList
SegmentList or TokenList
when tokenization is False, return a SegmentList;
when tokenization is True, return a TokenList

Examples
--------
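An end-to-end sketch of the three helpers documented above; parameter names follow the docstrings, while the item and the symbol value are illustrative.

```python
from EduNLP.SIF import is_sif, to_sif, sif4sci  # assumed import path

item = "若x,y满足约束条件,则z=x+7y的最大值为"
if not is_sif(item):        # False here: the bare math is not $...$-wrapped
    item = to_sif(item)     # returns the item rewritten in SIF format

segments = sif4sci(item, symbol="gm", tokenization=False)  # SegmentList
tokens = sif4sci(item, symbol="gm", tokenization=True)     # TokenList
```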
5 changes: 5 additions & 0 deletions EduNLP/SIF/tokenization/formula/ast_token.py
@@ -35,6 +35,10 @@
# return nodes

def traversal_formula(ast, ord2token=False, var_numbering=False, strategy="post", *args, **kwargs):
"""
Traverse the formula AST and collect its tokens; this path runs only when the return type is list.
Two traversal strategies are provided: post and linear. Each node is appended to the token list according to its type.
"""
tokens = []
if strategy == "post":
order = nx.dfs_postorder_nodes(ast)
@@ -58,6 +62,7 @@ def traversal_formula(ast, ord2token=False, var_numbering=False, strategy="post"

def ast_tokenize(formula, ord2token=False, var_numbering=False, return_type="formula", *args, **kwargs):
"""
Tokenize the formula with different methods according to the return type.

Parameters
----------
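A hedged sketch of `ast_tokenize`; per the docstring above, `return_type="list"` is the path that goes through `traversal_formula`.

```python
from EduNLP.SIF.tokenization.formula.ast_token import ast_tokenize

# Tokenize via the formula's AST with the default post-order strategy;
# ord2token=True presumably abstracts ordinary symbols (an assumption).
tokens = ast_tokenize("x^2 + x", ord2token=True, return_type="list")
```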
1 change: 1 addition & 0 deletions EduNLP/SIF/tokenization/formula/formula.py
@@ -9,6 +9,7 @@

def tokenize(formula, method="linear", errors="raise", **kwargs):
"""
The top-level function for tokenizing a formula, using either the linear or the ast method.

Parameters
----------
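A hedged sketch of the dispatcher; the `method` values follow the docstring's "linear or ast" wording and are assumptions, as is the import path.

```python
from EduNLP.SIF.tokenization.formula import tokenize  # assumed path

linear_tokens = tokenize(r"\frac{y}{x}", method="linear")
ast_tokens = tokenize(r"\frac{y}{x}", method="ast")
```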
4 changes: 4 additions & 0 deletions EduNLP/SIF/tokenization/formula/linear_token.py
@@ -7,6 +7,7 @@
def cut(formula, preserve_braces=True, with_dollar=False,
preserve_dollar=False, number_as_tag=False, preserve_src=True): # pragma: no cover
"""
Cut the formula thoroughly into tokens.

Parameters
----------
Expand All @@ -24,6 +25,7 @@ def cut(formula, preserve_braces=True, with_dollar=False,
Returns
--------
list
a preliminary list of fully cut tokens

Examples
----------
@@ -232,6 +234,8 @@ def latex_parse(formula, preserve_braces=True, with_dollar=True,

def linear_tokenize(formula, preserve_braces=True, number_as_tag=False, *args, **kwargs):
"""
Tokenize the formula linearly.
This includes three processes: cut, reduce, and connect_char.

Parameters
----------
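A hedged sketch of `linear_tokenize` chaining the three processes named above; the import path is assumed.

```python
from EduNLP.SIF.tokenization.formula.linear_token import linear_tokenize

# Internally: cut -> reduce -> connect_char, as described above.
tokens = linear_tokenize(r"\frac{a}{b} + ab")
```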
1 change: 1 addition & 0 deletions EduNLP/SIF/tokenization/text/tokenization.py
@@ -9,6 +9,7 @@

def tokenize(text, granularity="word", stopwords="default"):
"""
Use the jieba library to tokenize the item by word or by char.

Parameters
----------
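A hedged sketch of the text tokenizer; the import path is assumed, and the default stopword list is used.

```python
from EduNLP.SIF.tokenization.text import tokenize  # assumed import path

print(tokenize("三角形的面积为1", granularity="word"))  # word-level (jieba)
print(tokenize("三角形", granularity="char"))           # char-level
```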