diff --git a/EduNLP/Formula/Formula.py b/EduNLP/Formula/Formula.py index 6b0cc38d..71038480 100644 --- a/EduNLP/Formula/Formula.py +++ b/EduNLP/Formula/Formula.py @@ -15,6 +15,7 @@ class Formula(object): """ + The part transforms a formula to the parsed abstracted syntax tree. Parameters ---------- @@ -41,9 +42,6 @@ class Formula(object): >>> f.elements [{'id': 0, 'type': 'mathord', 'text': 'x', 'role': None, 'var': 0}] - Returns - -------- - the parsed abstracted syntax tree """ def __init__(self, formula: (str, List[Dict]), variable_standardization=False, const_mathord=None, init=True, *args, **kwargs): @@ -129,6 +127,7 @@ def resetable(self): class FormulaGroup(object): """ + The part transforms a group of formula to the parsed abstracted syntax forest. Parameters ---------- @@ -157,9 +156,6 @@ class FormulaGroup(object): {'id': 1, 'type': 'mathord', 'text': 'y', 'role': None, 'var': 1}, \ {'id': 2, 'type': 'mathord', 'text': 'x', 'role': None, 'var': 0}] - Returns - -------- - the parsed abstracted syntax forest """ def __init__(self, formula_list: (list, List[str], List[Formula]), diff --git a/EduNLP/I2V/i2v.py b/EduNLP/I2V/i2v.py index 3b58401f..48e64b15 100644 --- a/EduNLP/I2V/i2v.py +++ b/EduNLP/I2V/i2v.py @@ -108,8 +108,11 @@ def vector_size(self): class D2V(I2V): """ + The model aims to transfer item to vector directly. - Bases: I2V + Bases + ------- + I2V Parameters ----------- @@ -144,6 +147,7 @@ class D2V(I2V): def infer_vector(self, items, tokenize=True, indexing=False, padding=False, key=lambda x: x, *args, **kwargs) -> tuple: ''' + It is a function to switch item to vector. And before using the function, it is necessary to load model. Parameters ----------- @@ -175,8 +179,11 @@ def from_pretrained(cls, name, model_dir=MODEL_DIR, *args, **kwargs): class W2V(I2V): """ + The model aims to transfer tokens to vector. 
- Bases: I2V + Bases + -------- + I2V Parameters ----------- @@ -209,6 +216,7 @@ class W2V(I2V): def infer_vector(self, items, tokenize=True, indexing=False, padding=False, key=lambda x: x, *args, **kwargs) -> tuple: ''' + It is a function to switch item to vector. And before using the function, it is necessary to load model. Parameters ----------- diff --git a/EduNLP/ModelZoo/rnn/rnn.py b/EduNLP/ModelZoo/rnn/rnn.py index 97585170..ce3a2d34 100644 --- a/EduNLP/ModelZoo/rnn/rnn.py +++ b/EduNLP/ModelZoo/rnn/rnn.py @@ -91,7 +91,8 @@ def forward(self, seq_idx, seq_len): Returns -------- - a PackedSequence object + sequence + a PackedSequence object """ seq = self.embedding(seq_idx) pack = pack_padded_sequence(seq, seq_len, batch_first=True) diff --git a/EduNLP/ModelZoo/utils/masker.py b/EduNLP/ModelZoo/utils/masker.py index 18e5369b..00ba5df9 100644 --- a/EduNLP/ModelZoo/utils/masker.py +++ b/EduNLP/ModelZoo/utils/masker.py @@ -15,7 +15,7 @@ class Masker(object): seed Examples - ------- + --------- >>> masker = Masker(per=0.5, seed=10) >>> items = [[1, 1, 3, 4, 6], [2], [5, 9, 1, 4]] >>> masked_seq, mask_label = masker(items) @@ -39,7 +39,8 @@ class Masker(object): Returns ---------- - list:list of masked_seq and list of masked_list + list + list of masked_seq and list of masked_list """ def __init__(self, mask: (int, str, ...) = 0, per=0.2, seed=None): self.seed = np.random.default_rng(seed) diff --git a/EduNLP/ModelZoo/utils/padder.py b/EduNLP/ModelZoo/utils/padder.py index 443eb374..57f6219b 100644 --- a/EduNLP/ModelZoo/utils/padder.py +++ b/EduNLP/ModelZoo/utils/padder.py @@ -21,7 +21,8 @@ class PadSequence(object): Returns ------- - ret : list of number + ret + list of number """ def __init__(self, length, pad_val=0, clip=True): self._length = length @@ -53,7 +54,8 @@ def pad_sequence(sequence: list, max_length=None, pad_val=0, clip=True): Returns ------- - Modified list:padding the sequence in the same size. 
+ Modified list:list + padding the sequence in the same size. Examples -------- diff --git a/EduNLP/Pretrain/gensim_vec.py b/EduNLP/Pretrain/gensim_vec.py index cd0823b7..51d408ae 100644 --- a/EduNLP/Pretrain/gensim_vec.py +++ b/EduNLP/Pretrain/gensim_vec.py @@ -194,11 +194,8 @@ def train_vector(items, w2v_prefix, embedding_dim=None, method="sg", binary=None ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$") >>> print(token_item[:10]) [['公式'], [\\FormFigureID{wrong1?}], ['如图'], ['[FIGURE]'],...['最大值'], ['[MARK]']] - >>> tokenizer = GensimSegTokenizer(symbol="fgm", depth=None) - >>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\ - ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$") - >>> print(token_item[:10]) - [['公式'], ['[FORMULA]'], ['如图'], ['[FIGURE]'], ['[FORMULA]'],...['最大值'], ['[MARK]']] + >>> train_vector(token_item[:10], "examples/test_model/data/gensim_luna_stem_t_", 100) #doctest: +ELLIPSIS + 'examples/test_model/data/gensim_luna_stem_t_sg_100.kv' """ monitor = MonitorCallback(["word", "I", "less"]) _train_params = dict( diff --git a/EduNLP/SIF/parser/parser.py b/EduNLP/SIF/parser/parser.py index cf7cbc30..cd2bf65e 100644 --- a/EduNLP/SIF/parser/parser.py +++ b/EduNLP/SIF/parser/parser.py @@ -3,8 +3,8 @@ class Parser: + """initial data and special variable""" def __init__(self, data): - """initial data and special variable""" self.lookahead = 0 self.head = 0 self.text = data diff --git a/EduNLP/SIF/segment/segment.py b/EduNLP/SIF/segment/segment.py index f93efbf0..2bbefdca 100644 --- a/EduNLP/SIF/segment/segment.py +++ b/EduNLP/SIF/segment/segment.py @@ -89,7 +89,8 @@ class SegmentList(object): Returns ---------- - list:tokenizated item + list + tokenizated item Examples -------- @@ -159,6 +160,7 @@ def append(self, segment) -> None: @property def segments(self): + """return segments""" if self._seg_idx is None: return self._segments else: @@ 
-166,29 +168,37 @@ def segments(self): @property def text_segments(self): + """return text segments""" return [self._segments[i] for i in self._text_segments] @property def formula_segments(self): + """return formula segments""" return [self._segments[i] for i in self._formula_segments] @property def figure_segments(self): + """return figure segments""" return [self._segments[i] for i in self._figure_segments] @property def ques_mark_segments(self): + """return question mark segments""" return [self._segments[i] for i in self._ques_mark_segments] @property def tag_segments(self): + """return tag segments""" return [self._segments[i] for i in self._tag_segments] def to_symbol(self, idx, symbol): + """switch element to its symbol""" self._segments[idx] = symbol def symbolize(self, to_symbolize="fgm"): """ + Switch designated elements to symbol. \ + It is a good way to protect or preserve the elements which we don't want to tokenize. Parameters ---------- @@ -226,6 +236,7 @@ def symbolize(self, to_symbolize="fgm"): @contextmanager def filter(self, drop: (set, str) = "", keep: (set, str) = "*"): """ + Output special element list selectively. Drop means not show. Keep means show. Parameters ---------- @@ -267,6 +278,7 @@ def describe(self): def seg(item, figures=None, symbol=None): r""" + It is an interface for SegmentList. And show it in an appropriate way. 
Parameters ---------- @@ -276,7 +288,8 @@ def seg(item, figures=None, symbol=None): Returns ------- - list:segmented item + list + segmented item Examples -------- diff --git a/EduNLP/SIF/sif.py b/EduNLP/SIF/sif.py index 013ce87b..68787131 100644 --- a/EduNLP/SIF/sif.py +++ b/EduNLP/SIF/sif.py @@ -12,16 +12,19 @@ def is_sif(item): r""" + the part aims to check whether the input is sif format + Parameters ---------- item:str - the text of question + a raw item which respects stem Returns ------- - when item can not be parsed correctly, raise Error; - when item doesn't need to be modified, return Ture; - when item needs to be modified, return False; + bool + when item can not be parsed correctly, raise Error; + when item doesn't need to be modified, return True; + when item needs to be modified, return False; Examples -------- @@ -45,15 +48,17 @@ def is_sif(item): def to_sif(item): r""" + the part aims to switch item to sif format + Parameters ---------- items:str - the text of question + a raw item which respects stem Returns ------- item:str - the question's text after sif correction + the item which accords with sif format Examples -------- @@ -77,7 +82,7 @@ def sif4sci(item: str, figures: (dict, bool) = None, safe=True, symbol: str = No Parameters ---------- item:str - the text of question + a raw item which respects stem figures:dict {"FigureID": Base64 encoding of the figure} @@ -113,8 +118,9 @@ def sif4sci(item: str, figures: (dict, bool) = None, safe=True, symbol: str = No Returns ------- - When tokenization is False, return SegmentList; - When tokenization is True, return TokenList + list + When tokenization is False, return SegmentList; + When tokenization is True, return TokenList Examples -------- diff --git a/EduNLP/SIF/tokenization/formula/ast_token.py b/EduNLP/SIF/tokenization/formula/ast_token.py index 67ca8ffd..22a2af23 100644 --- a/EduNLP/SIF/tokenization/formula/ast_token.py +++ b/EduNLP/SIF/tokenization/formula/ast_token.py @@ -35,6 +35,10 @@ # 
return nodes def traversal_formula(ast, ord2token=False, var_numbering=False, strategy="post", *args, **kwargs): + """ + The part will run only when the return type is list. And it provides two strategy: post and linear. + Besides, tokens list will append node follow its type. + """ tokens = [] if strategy == "post": order = nx.dfs_postorder_nodes(ast) @@ -58,6 +62,7 @@ def traversal_formula(ast, ord2token=False, var_numbering=False, strategy="post" def ast_tokenize(formula, ord2token=False, var_numbering=False, return_type="formula", *args, **kwargs): """ + According to return type, tokenizing formula by different methods. Parameters ---------- diff --git a/EduNLP/SIF/tokenization/formula/formula.py b/EduNLP/SIF/tokenization/formula/formula.py index 8afbe2da..eb08f418 100644 --- a/EduNLP/SIF/tokenization/formula/formula.py +++ b/EduNLP/SIF/tokenization/formula/formula.py @@ -9,6 +9,7 @@ def tokenize(formula, method="linear", errors="raise", **kwargs): """ + The total function to tokenize formula by linear or ast. Parameters ---------- diff --git a/EduNLP/SIF/tokenization/formula/linear_token.py b/EduNLP/SIF/tokenization/formula/linear_token.py index 6775da9f..1e3236bc 100644 --- a/EduNLP/SIF/tokenization/formula/linear_token.py +++ b/EduNLP/SIF/tokenization/formula/linear_token.py @@ -7,6 +7,7 @@ def cut(formula, preserve_braces=True, with_dollar=False, preserve_dollar=False, number_as_tag=False, preserve_src=True): # pragma: no cover """ + cut formula thoroughly Parameters ---------- @@ -24,6 +25,7 @@ def cut(formula, preserve_braces=True, with_dollar=False, Returns -------- list + return a preliminary list which cut fully Examples ---------- @@ -232,6 +234,8 @@ def latex_parse(formula, preserve_braces=True, with_dollar=True, def linear_tokenize(formula, preserve_braces=True, number_as_tag=False, *args, **kwargs): """ + linear tokenize formula. + It includes three processes:cut, reduce and connect_char. 
Parameters ---------- diff --git a/EduNLP/SIF/tokenization/text/tokenization.py b/EduNLP/SIF/tokenization/text/tokenization.py index cee23a50..2e063f85 100644 --- a/EduNLP/SIF/tokenization/text/tokenization.py +++ b/EduNLP/SIF/tokenization/text/tokenization.py @@ -9,6 +9,7 @@ def tokenize(text, granularity="word", stopwords="default"): """ + Using jieba library to tokenize item by word or char. Parameters ---------- diff --git a/EduNLP/SIF/tokenization/tokenization.py b/EduNLP/SIF/tokenization/tokenization.py index 49f31416..53f6fe75 100644 --- a/EduNLP/SIF/tokenization/tokenization.py +++ b/EduNLP/SIF/tokenization/tokenization.py @@ -16,22 +16,19 @@ class TokenList(object): """ + Parameters + ---------- + segment_list:list + segmented item + text_params:dict + formula_params:dict + figure_params:dict + Attributes ------------- """ def __init__(self, segment_list: SegmentList, text_params=None, formula_params=None, figure_params=None): - """ - - Parameters - ---------- - segment_list:list - segmented item - text_params:dict - formula_params:dict - figure_params:dict - - """ self._tokens = [] self._text_tokens = [] self._formula_tokens = [] @@ -64,6 +61,7 @@ def __init__(self, segment_list: SegmentList, text_params=None, formula_params=N self._token_idx = None def _variable_standardization(self): + """It makes same parameters have the same number.""" if self.formula_tokenize_method == "ast": ast_formulas = [self._tokens[i] for i in self._formula_tokens if isinstance(self._tokens[i], Formula)] if ast_formulas: @@ -72,7 +70,7 @@ def _variable_standardization(self): @contextmanager def add_seg_type(self, seg_type, tar: list, add_seg_type=True, mode="delimiter"): """ - add seg tag in different position + Add seg tag in different position Parameters ---------- @@ -106,6 +104,7 @@ def add_seg_type(self, seg_type, tar: list, add_seg_type=True, mode="delimiter") def get_segments(self, add_seg_type=True, add_seg_mode="delimiter", keep="*", drop="", depth=None): # pragma: no 
cover r""" + call segment function. Parameters ---------- @@ -124,6 +123,8 @@ def get_segments(self, add_seg_type=True, add_seg_mode="delimiter", keep="*", dr Returns ------- + list + segmented item """ keep = set("tfgmas" if keep == "*" else keep) - set(drop) @@ -152,6 +153,7 @@ def get_segments(self, add_seg_type=True, add_seg_mode="delimiter", keep="*", dr return _segments def __get_segments(self, seg_type): + """It aims to understand letters' meaning.""" _segments = [] for i in self._seg_types[seg_type]: _segment = [] @@ -164,22 +166,27 @@ def __get_segments(self, seg_type): @property def text_segments(self): + """get text segment""" return self.__get_segments("t") @property def formula_segments(self): + """get formula segment""" return self.__get_segments("f") @property def figure_segments(self): + """get figure segment""" return self.__get_segments("g") @property def ques_mark_segments(self): + """get question mark segment""" return self.__get_segments("m") @property def tokens(self): + """add token to a list""" tokens = [] if self._token_idx is not None: for i, token in enumerate(self._tokens): @@ -191,6 +198,7 @@ def tokens(self): return tokens def append_text(self, segment, symbol=False): + """append text""" with self._append("t"): if symbol is False: tokens = text.tokenize(segment, **self.text_params) @@ -202,6 +210,7 @@ def append_text(self, segment, symbol=False): self._tokens.append(segment) def append_formula(self, segment, symbol=False, init=True): + """append formula by different methods""" with self._append("f"): if symbol is True: self._formula_tokens.append(len(self._tokens)) @@ -225,27 +234,32 @@ def append_formula(self, segment, symbol=False, init=True): self._tokens.append(token) def append_figure(self, segment, **kwargs): + """append figure""" with self._append("g"): self._figure_tokens.append(len(self._tokens)) self._tokens.append(segment) def append_ques_mark(self, segment, **kwargs): + """append question mark""" with self._append("m"): 
self._ques_mark_tokens.append(len(self._tokens)) self._tokens.append(segment) def append_tag(self, segment, **kwargs): + """append tag""" with self._append("a"): self._tag_tokens.append(len(self._tokens)) self._tokens.append(segment) def append_sep(self, segment, **kwargs): + """append sep""" with self._append("s"): self._sep_tokens.append(len(self._tokens)) self._tokens.append(segment) @contextmanager def _append(self, seg_type): + """It aims to understand letters' meaning.""" start = len(self._tokens) yield end = len(self._tokens) @@ -254,6 +268,7 @@ def _append(self, seg_type): def append(self, segment, lazy=False): """ + the total api for appending elements Parameters ---------- @@ -295,12 +310,14 @@ def append(self, segment, lazy=False): raise TypeError("Unknown segment type: %s" % type(segment)) def extend(self, segments): + """append every segment in turn""" for segment in segments: self.append(segment, True) self._variable_standardization() @property def text_tokens(self): + """return text tokens""" return [self._tokens[i] for i in self._text_tokens] def __add_token(self, token, tokens): @@ -322,6 +339,7 @@ def __add_token(self, token, tokens): @property def formula_tokens(self): + """return formula tokens""" tokens = [] for i in self._formula_tokens: self.__add_token(self._tokens[i], tokens) @@ -329,6 +347,7 @@ def formula_tokens(self): @property def figure_tokens(self): + """return figure tokens""" tokens = [] for i in self._figure_tokens: self.__add_token(self._tokens[i], tokens) @@ -336,6 +355,7 @@ def figure_tokens(self): @property def ques_mark_tokens(self): + """return question mark tokens""" return [self._tokens[i] for i in self._ques_mark_tokens] def __repr__(self): @@ -343,11 +363,13 @@ def __repr__(self): @property def inner_formula_tokens(self): + """return inner formula tokens""" return [self._tokens[i] for i in self._formula_tokens] @contextmanager def filter(self, drop: (set, str) = "", keep: (set, str) = "*"): """ + Output special element 
list selectively. Drop means not show. Keep means show. Parameters ---------- @@ -358,7 +380,8 @@ def filter(self, drop: (set, str) = "", keep: (set, str) = "*"): Returns -------- - filted list + list + filtered list """ _drop = {c for c in drop} if isinstance(drop, str) else drop if keep == "*": @@ -382,6 +405,7 @@ def filter(self, drop: (set, str) = "", keep: (set, str) = "*"): self._token_idx = None def describe(self): + """show the total number of each element""" return { "t": len(self._text_tokens), "f": len(self._formula_tokens), @@ -407,7 +431,8 @@ def tokenize(segment_list: SegmentList, text_params=None, formula_params=None, f Returns ---------- - tokenized item + list + tokenized item Examples -------- @@ -421,6 +446,7 @@ def tokenize(segment_list: SegmentList, text_params=None, formula_params=None, f def link_formulas(*token_list: TokenList, link_vars=True): + """call formula function""" ast_formulas = [] for tl in token_list: if tl.formula_tokenize_method == "ast": diff --git a/EduNLP/Vector/gensim_vec.py b/EduNLP/Vector/gensim_vec.py index ece3a077..b34ac3a0 100644 --- a/EduNLP/Vector/gensim_vec.py +++ b/EduNLP/Vector/gensim_vec.py @@ -148,7 +148,7 @@ class D2V(Vector): Returns --------- - D2V + d2v model:D2V """ def __init__(self, filepath, method="d2v"): self._method = method diff --git "a/docs/source/_static/\346\226\260\346\265\201\347\250\213\345\233\276.png" "b/docs/source/_static/\346\226\260\346\265\201\347\250\213\345\233\276.png" new file mode 100644 index 00000000..bbfacbd3 Binary files /dev/null and "b/docs/source/_static/\346\226\260\346\265\201\347\250\213\345\233\276.png" differ diff --git "a/docs/source/_static/\346\265\201\347\250\213\345\233\276.jpg" "b/docs/source/_static/\346\265\201\347\250\213\345\233\276.jpg" index 7563e254..dfdb8737 100644 Binary files "a/docs/source/_static/\346\265\201\347\250\213\345\233\276.jpg" and "b/docs/source/_static/\346\265\201\347\250\213\345\233\276.jpg" differ diff --git a/docs/source/api/sif.rst 
b/docs/source/api/sif.rst index 95f355fb..7467b7cb 100644 --- a/docs/source/api/sif.rst +++ b/docs/source/api/sif.rst @@ -8,16 +8,16 @@ SIF :imported-members: -Segment ----------- -.. automodule:: EduNLP.SIF.segment.segment +Parser +-------- +.. automodule:: EduNLP.SIF.parser.parser.Parser :members: :imported-members: -Parser --------- -.. automodule:: EduNLP.SIF.parser.parser +Segment +---------- +.. automodule:: EduNLP.SIF.segment.segment :members: :imported-members: diff --git a/docs/source/index.rst b/docs/source/index.rst index 337dce47..27e9ed8e 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -164,13 +164,12 @@ If this repository is helpful for you, please cite our work :hidden: :glob: - api/index - api/i2v api/sif - api/tokenizer + api/utils api/formula + api/tokenizer api/pretrain api/ModelZoo + api/i2v api/vector - api/utils diff --git a/docs/source/tutorial/zh/index.rst b/docs/source/tutorial/zh/index.rst index ef452a79..572d0459 100644 --- a/docs/source/tutorial/zh/index.rst +++ b/docs/source/tutorial/zh/index.rst @@ -16,7 +16,19 @@ 主要流程 ---------- -.. figure:: ../../_static/流程图.jpg +.. figure:: ../../_static/新流程图.png + +* **语法解析**:其作用是将传入的item转换为标准sif格式(即把字母、数字用 ``$...$`` 包裹起来,把选择填空的括号、下划线转换为特殊符号等)。 + +* **成分分解**:其作用是将传入的符合sif标准的item根据元素种类进行分割开来,从而服务于后面的令牌化环节(即可以将不同类型元素使用各自的方法令牌化)。 + +* **令牌化**:其作用是将传入的经过分词后的item元素列表进行令牌化分解,从而服务于后面的向量化模块。 + 其中通常情况下直接使用文本形式的令牌化方法即可,对于公式而言还可使用ast方法进行解析(调用formula模块); + +* **向量化**:此部分主要调用的是I2V类及其子类,其作用是将传入的令牌化后的item元素列表进行向量化操作,最终即可得到相应的静态向量。 + 对于向量化模块来说,可以调用自己训练好的模型,也可直接调用提供的预训练模型(调用get_pretrained_i2v模块即可)。 + +* **下游模型**:将得到的向量进一步处理,从而得到所需的结果。 示例 --------