diff --git a/EduNLP/Formula/Formula.py b/EduNLP/Formula/Formula.py index f4c868be..6eb80049 100644 --- a/EduNLP/Formula/Formula.py +++ b/EduNLP/Formula/Formula.py @@ -15,6 +15,18 @@ class Formula(object): """ + The part transform a formula to the parsed abstracted syntax tree. + + Parameters + ---------- + formula: str or List[Dict] + latex formula string or the parsed abstracted syntax tree + variable_standardization + const_mathord + init + args + kwargs + Examples -------- >>> f = Formula("x") @@ -29,22 +41,21 @@ class Formula(object): >>> f.elements [{'id': 0, 'type': 'mathord', 'text': 'x', 'role': None, 'var': 0}] - """ + Attributes + ------------ + ast + show all ast details + elements + just show elements' id, type, text and role + ast_graph + draw a ast graph + to_str + resetable + return bool + """ def __init__(self, formula: (str, List[Dict]), variable_standardization=False, const_mathord=None, init=True, *args, **kwargs): - """ - - Parameters - ---------- - formula: str or List[Dict] - latex formula string or the parsed abstracted syntax tree - variable_standardization - const_mathord - init - args - kwargs - """ self._formula = formula self._ast = None if init is True: @@ -55,6 +66,15 @@ def __init__(self, formula: (str, List[Dict]), variable_standardization=False, c ) def variable_standardization(self, inplace=False, const_mathord=None, variable_connect_dict=None): + """ + It makes same parmeters have the same number. + + Parameters + ---------- + inplace + const_mathord + variable_connect_dict + """ const_mathord = const_mathord if const_mathord is not None else CONST_MATHORD ast_tree = self._ast if inplace else deepcopy(self._ast) var_code = variable_connect_dict["var_code"] if variable_connect_dict is not None else {} @@ -118,6 +138,26 @@ def resetable(self): class FormulaGroup(object): """ + The part transform a group of formula to the parsed abstracted syntax forest. 
+ + Attributes + ------------ + to_str + ast + show all ast details + elements + just show elements' id, type, text and role + ast_graph + draw a ast graph + + Parameters + ---------- + formula: str or List[Dict] or List[Formula] + latex formula string or the parsed abstracted syntax tree or a group of parsed abstracted syntax tree + variable_standardization + const_mathord + detach + Examples --------- >>> fg = FormulaGroup(["x + y", "y + x", "z + x"]) @@ -128,15 +168,16 @@ class FormulaGroup(object): ;;> >>> fg = FormulaGroup(["x", Formula("y"), "x"]) >>> fg.elements - [{'id': 0, 'type': 'mathord', 'text': 'x', 'role': None}, {'id': 1, 'type': 'mathord', 'text': 'y', 'role': None},\ - {'id': 2, 'type': 'mathord', 'text': 'x', 'role': None}] + [{'id': 0, 'type': 'mathord', 'text': 'x', 'role': None}, \ +{'id': 1, 'type': 'mathord', 'text': 'y', 'role': None}, \ +{'id': 2, 'type': 'mathord', 'text': 'x', 'role': None}] >>> fg = FormulaGroup(["x", Formula("y"), "x"], variable_standardization=True) >>> fg.elements [{'id': 0, 'type': 'mathord', 'text': 'x', 'role': None, 'var': 0}, \ {'id': 1, 'type': 'mathord', 'text': 'y', 'role': None, 'var': 1}, \ {'id': 2, 'type': 'mathord', 'text': 'x', 'role': None, 'var': 0}] - """ + """ def __init__(self, formula_list: (list, List[str], List[Formula]), variable_standardization=False, @@ -186,6 +227,15 @@ def __contains__(self, item) -> bool: return item in self._formulas def variable_standardization(self, inplace=False, const_mathord=None, variable_connect_dict=None): + """ + It makes same parmeters have the same number. 
+ + Parameters + ---------- + inplace + const_mathord + variable_connect_dict + """ ret = [] for formula in self._formulas: ret.append(formula.variable_standardization(inplace=inplace, const_mathord=const_mathord, @@ -220,6 +270,15 @@ def ast_graph(self) -> (nx.Graph, nx.DiGraph): def link_formulas(*formula: Formula, link_vars=True, **kwargs): + """ + + Parameters + ---------- + formula + the parsed abstracted syntax tree + link_vars + kwargs + """ forest = [] for form in formula: forest += form.reset_ast( diff --git a/EduNLP/Formula/ast/ast.py b/EduNLP/Formula/ast/ast.py index 52e44ac5..8b3af216 100644 --- a/EduNLP/Formula/ast/ast.py +++ b/EduNLP/Formula/ast/ast.py @@ -8,10 +8,12 @@ def katex_parse(formula): + """将公式传入katex进行语法解析""" return katex.katex.__parse(formula,{'displayMode':True,'trust': True}).to_list() def str2ast(formula: str, *args, **kwargs): + """给字符串的接口""" return ast(formula, is_str=True, *args, **kwargs) diff --git a/EduNLP/Formula/viz/__init__.py b/EduNLP/Formula/viz/__init__.py index 2d0ba898..a461dca7 100644 --- a/EduNLP/Formula/viz/__init__.py +++ b/EduNLP/Formula/viz/__init__.py @@ -2,5 +2,5 @@ # 2021/3/8 @ tongshiwei import warnings -warnings.warn("Do not use this package") +# warnings.warn("Do not use this package") from .tree_viz import TreePlotter, ForestPlotter diff --git a/EduNLP/I2V/i2v.py b/EduNLP/I2V/i2v.py index 4254fe1e..40c0248c 100644 --- a/EduNLP/I2V/i2v.py +++ b/EduNLP/I2V/i2v.py @@ -12,9 +12,11 @@ class I2V(object): """ + It just a api, so you shouldn't use it directly. \ + If you want to get vector from item, you can use other model like D2V and W2V. 
Parameters - ---------- + ----------- tokenizer: str the tokenizer name t2v: str @@ -24,12 +26,29 @@ class I2V(object): tokenizer_kwargs: dict the parameters passed to tokenizer pretrained_t2v: bool + + True: use pretrained t2v model + + False: use your own t2v model + kwargs: the parameters passed to t2v - """ - def __init__(self, tokenizer, t2v, *args, tokenizer_kwargs: dict = None, pretrained_t2v=False, **kwargs): + Examples + -------- + >>> item = {"如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, \ + ... 直角边$AB$, $AC$.$\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,\ + ... 此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\SIFChoice$$\\FigureID{1}$"} + >>> model_path = "examples/test_model/test_gensim_luna_stem_tf_d2v_256.bin" # doctest: +ELLIPSIS + >>> i2v = D2V("text","d2v",filepath=model_path, pretrained_t2v = False) # doctest: +ELLIPSIS + >>> i2v(item) # doctest: +ELLIPSIS + ([array([...dtype=float32)], None) + Returns + ------- + i2v model: I2V + """ + def __init__(self, tokenizer, t2v, *args, tokenizer_kwargs: dict = None, pretrained_t2v=False, **kwargs): self.tokenizer: Tokenizer = get_tokenizer(tokenizer, **tokenizer_kwargs if tokenizer_kwargs is not None else {}) if pretrained_t2v: logger.info("Use pretrained t2v model %s" % t2v) @@ -46,9 +65,11 @@ def __init__(self, tokenizer, t2v, *args, tokenizer_kwargs: dict = None, pretrai } def __call__(self, items, *args, **kwargs): + """transfer item to vector""" return self.infer_vector(items, *args, **kwargs) def tokenize(self, items, indexing=True, padding=False, key=lambda x: x, *args, **kwargs) -> list: + # """tokenize item""" return self.tokenizer(items, key=key, *args, **kwargs) def infer_vector(self, items, tokenize=True, indexing=False, padding=False, key=lambda x: x, *args, @@ -86,8 +107,67 @@ def vector_size(self): class D2V(I2V): + """ + The model aims to transfer item to vector directly. 
+ + Bases + ------- + I2V + + Parameters + ----------- + tokenizer: str + the tokenizer name + t2v: str + the name of token2vector model + args: + the parameters passed to t2v + tokenizer_kwargs: dict + the parameters passed to tokenizer + pretrained_t2v: bool + True: use pretrained t2v model + False: use your own t2v model + kwargs: + the parameters passed to t2v + + Examples + --------- + >>> item = {"如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, \ + ... 直角边$AB$, $AC$.$\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,\ + ... 此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\SIFChoice$$\\FigureID{1}$"} + >>> model_path = "examples/test_model/test_gensim_luna_stem_tf_d2v_256.bin" + >>> i2v = D2V("text","d2v",filepath=model_path, pretrained_t2v = False) + >>> i2v(item) + ([array([ ...dtype=float32)], None) + + Returns + ------- + i2v model: I2V + """ def infer_vector(self, items, tokenize=True, indexing=False, padding=False, key=lambda x: x, *args, **kwargs) -> tuple: + ''' + It is a function to switch item to vector. And before using the function, it is nesseary to load model. + + Parameters + ----------- + items:str + the text of question + tokenize:bool + True: tokenize the item + indexing:bool + padding:bool + key: lambda function + the parameter passed to tokenizer, select the text to be processed + args: + the parameters passed to t2v + kwargs: + the parameters passed to t2v + + Returns + -------- + vector:list + ''' tokens = self.tokenize(items, return_token=True, key=key) if tokenize is True else items tokens = [token for token in tokens] return self.t2v(tokens, *args, **kwargs), None @@ -98,8 +178,65 @@ def from_pretrained(cls, name, model_dir=MODEL_DIR, *args, **kwargs): class W2V(I2V): + """ + The model aims to transfer tokens to vector. 
+ + Bases + -------- + I2V + + Parameters + ----------- + tokenizer: str + the tokenizer name + t2v: str + the name of token2vector model + args: + the parameters passed to t2v + tokenizer_kwargs: dict + the parameters passed to tokenizer + pretrained_t2v: bool + True: use pretrained t2v model + False: use your own t2v model + kwargs: + the parameters passed to t2v + + Examples + --------- + >>> i2v = get_pretrained_i2v("test_w2v", "examples/test_model/data/w2v") + >>> item_vector, token_vector = i2v(["有学者认为:‘学习’,必须适应实际"]) + >>> item_vector # doctest: +ELLIPSIS + [array([...], dtype=float32)] + + Returns + -------- + i2v model: W2V + + """ def infer_vector(self, items, tokenize=True, indexing=False, padding=False, key=lambda x: x, *args, **kwargs) -> tuple: + ''' + It is a function to switch item to vector. And before using the function, it is nesseary to load model. + + Parameters + ----------- + items:str + the text of question + tokenize:bool + True: tokenize the item + indexing:bool + padding:bool + key: lambda function + the parameter passed to tokenizer, select the text to be processed + args: + the parameters passed to t2v + kwargs: + the parameters passed to t2v + + Returns + -------- + vector:list + ''' tokens = self.tokenize(items, return_token=True) if tokenize is True else items tokens = [token for token in tokens] return self.t2v(tokens, *args, **kwargs), self.t2v.infer_tokens(tokens, *args, **kwargs) @@ -116,21 +253,41 @@ def from_pretrained(cls, name, model_dir=MODEL_DIR, *args, **kwargs): "d2v_lit_256": [D2V, "d2v_lit_256"], "w2v_sci_300": [W2V, "w2v_sci_300"], "w2v_lit_300": [W2V, "w2v_lit_300"], + "test_w2v": [W2V, "test_w2v"], + "test_d2v": [D2V, "test_d2v"], } def get_pretrained_i2v(name, model_dir=MODEL_DIR): """ + It is a good idea if you want to switch item to vector earily. 
Parameters - ---------- - name - model_dir + ----------- + name: str + the name of item2vector model + e.g.: + d2v_all_256 + d2v_sci_256 + d2v_eng_256 + d2v_lit_256 + w2v_sci_300 + w2v_lit_300 + model_dir:str + the path of model, default: MODEL_DIR = '~/.EduNLP/model' Returns - ------- + -------- i2v model: I2V + Examples + --------- + >>> item = {"如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, \ + ... 直角边$AB$, $AC$.$\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,\ + ... 此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\SIFChoice$$\\FigureID{1}$"} + >>> i2v = get_pretrained_i2v("test_d2v", "examples/test_model/data/d2v") + >>> print(i2v(item)) + ([array([ ...dtype=float32)], None) """ if name not in MODELS: raise KeyError( diff --git a/EduNLP/ModelZoo/rnn/rnn.py b/EduNLP/ModelZoo/rnn/rnn.py index c1594ac8..ce3a2d34 100644 --- a/EduNLP/ModelZoo/rnn/rnn.py +++ b/EduNLP/ModelZoo/rnn/rnn.py @@ -9,6 +9,20 @@ class LM(nn.Module): """ + + Parameters + ---------- + rnn_type:str + Legal types including RNN, LSTM, GRU,ELMO + vocab_size: int + embedding_dim: int + hidden_size: int + num_layers + bidirectional + embedding + model_params + kwargs + Examples -------- >>> import torch @@ -66,6 +80,20 @@ def __init__(self, rnn_type: str, vocab_size: int, embedding_dim: int, hidden_si load_net(model_params, self, allow_missing=True) def forward(self, seq_idx, seq_len): + """ + + Parameters + ---------- + seq_idx:Tensor + a list of indices + seq_len:Tensor + length + + Returns + -------- + sequence + a PackedSequence object + """ seq = self.embedding(seq_idx) pack = pack_padded_sequence(seq, seq_len, batch_first=True) h0 = torch.zeros(self.num_layers, seq.shape[0], self.hidden_size) diff --git a/EduNLP/ModelZoo/utils/masker.py b/EduNLP/ModelZoo/utils/masker.py index 401c16e7..00ba5df9 100644 --- a/EduNLP/ModelZoo/utils/masker.py +++ b/EduNLP/ModelZoo/utils/masker.py @@ -7,8 +7,15 @@ class Masker(object): """ + + Parameters + ---------- + mask: int, str + 
per + seed + Examples - ------- + --------- >>> masker = Masker(per=0.5, seed=10) >>> items = [[1, 1, 3, 4, 6], [2], [5, 9, 1, 4]] >>> masked_seq, mask_label = masker(items) @@ -29,17 +36,13 @@ class Masker(object): [['a', '[MASK]', 'c'], ['d', '[PAD]', '[PAD]'], ['hello', '[MASK]', '[PAD]']] >>> mask_label [[0, 1, 0], [0, 0, 0], [0, 1, 0]] - """ + Returns + ---------- + list + list of masked_seq and list of masked_list + """ def __init__(self, mask: (int, str, ...) = 0, per=0.2, seed=None): - """ - - Parameters - ---------- - mask: int, str - per - seed - """ self.seed = np.random.default_rng(seed) self.per = per self.mask = mask diff --git a/EduNLP/ModelZoo/utils/padder.py b/EduNLP/ModelZoo/utils/padder.py index ed86cfef..57f6219b 100644 --- a/EduNLP/ModelZoo/utils/padder.py +++ b/EduNLP/ModelZoo/utils/padder.py @@ -5,7 +5,8 @@ class PadSequence(object): - """Pad the sequence. + """ + Pad the sequence. Pad the sequence to the given `length` by inserting `pad_val`. If `clip` is set, sequence that has length larger than `length` will be clipped. @@ -17,24 +18,18 @@ class PadSequence(object): pad_val : number The pad value. Default 0 clip : bool - """ + Returns + ------- + ret + list of number + """ def __init__(self, length, pad_val=0, clip=True): self._length = length self._pad_val = pad_val self._clip = clip def __call__(self, sample: list): - """ - - Parameters - ---------- - sample : list of number - - Returns - ------- - ret : list of number - """ sample_length = len(sample) if sample_length >= self._length: if self._clip and sample_length > self._length: @@ -59,6 +54,8 @@ def pad_sequence(sequence: list, max_length=None, pad_val=0, clip=True): Returns ------- + Modified list:list + padding the sequence in the same size. 
Examples -------- diff --git a/EduNLP/Pretrain/gensim_vec.py b/EduNLP/Pretrain/gensim_vec.py index 17482b23..51d408ae 100644 --- a/EduNLP/Pretrain/gensim_vec.py +++ b/EduNLP/Pretrain/gensim_vec.py @@ -19,17 +19,24 @@ class GensimWordTokenizer(object): Parameters ---------- - symbol: - gm - fgm - gmas - fgmas - general: - True when item isn't in standard format, and want to tokenize formulas(except formulas in figure) linearly. - False when use 'ast' mothed to tokenize formulas instead of 'linear'. + symbol: str + select the methods to symbolize: + "t": text, + "f": formula, + "g": figure, + "m": question mark, + "a": tag, + "s": sep, + e.g.: gm, fgm, gmas, fgmas + general: bool + + True: when item isn't in standard format, and want to tokenize formulas(except formulas in figure) linearly. + + False: when use 'ast' mothed to tokenize formulas instead of 'linear'. Returns ---------- + tokenizer: Tokenizer Examples ---------- @@ -76,9 +83,39 @@ class GensimSegTokenizer(object): # pragma: no cover Parameters ---------- - symbol: - gms - fgm + symbol:str + select the methods to symbolize: + "t": text, + "f": formula, + "g": figure, + "m": question mark, + "a": tag, + "s": sep, + e.g. gms, fgm + + depth: int or None + + 0: only separate at \\SIFSep ; + 1: only separate at \\SIFTag ; + 2: separate at \\SIFTag and \\SIFSep ; + otherwise, separate all segments ; + + Returns + ---------- + tokenizer: Tokenizer + + Examples + ---------- + >>> tokenizer = GensimSegTokenizer(symbol="gms", depth=None) + >>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\ + ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$") + >>> print(token_item[:10]) + [['公式'], [\\FormFigureID{wrong1?}], ['如图'], ['[FIGURE]'],...['最大值'], ['[MARK]']] + >>> tokenizer = GensimSegTokenizer(symbol="fgm", depth=None) + >>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\ + ... 
若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$") + >>> print(token_item[:10]) + [['公式'], ['[FORMULA]'], ['如图'], ['[FIGURE]'], ['[FORMULA]'],...['[FORMULA]'], ['最大值'], ['[MARK]']] """ def __init__(self, symbol="gms", depth=None, flatten=False, **kwargs): self.symbol = symbol @@ -117,6 +154,7 @@ def __call__(self, item, flatten=None, **kwargs): class MonitorCallback(CallbackAny2Vec): + """record the loss in each epoch""" def __init__(self, test_words): self.epoch = 0 self._test_words = test_words @@ -127,6 +165,38 @@ def on_epoch_end(self, model): def train_vector(items, w2v_prefix, embedding_dim=None, method="sg", binary=None, train_params=None): + """ + + Parameters + ---------- + items:str + the text of question + w2v_prefix + embedding_dim:int + vector_size + method:str + the method of training, + e.g.: sg, cbow, fasttext, d2v, bow, tfidf + binary: model format + True:bin; + False:kv + train_params: dict + the training parameters passed to model + + Returns + ---------- + tokenizer: Tokenizer + + Examples + ---------- + >>> tokenizer = GensimSegTokenizer(symbol="gms", depth=None) + >>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\ + ... 
若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$") + >>> print(token_item[:10]) + [['公式'], [\\FormFigureID{wrong1?}], ['如图'], ['[FIGURE]'],...['最大值'], ['[MARK]']] + >>> train_vector(token_item[:10], "examples/test_model/data/gensim_luna_stem_t_", 100) #doctest: +ELLIPSIS + 'examples/test_model/data/gensim_luna_stem_t_sg_100.kv' + """ monitor = MonitorCallback(["word", "I", "less"]) _train_params = dict( min_count=0, diff --git a/EduNLP/SIF/parser/parser.py b/EduNLP/SIF/parser/parser.py index db290946..471bb450 100644 --- a/EduNLP/SIF/parser/parser.py +++ b/EduNLP/SIF/parser/parser.py @@ -1,7 +1,20 @@ from EduNLP.Formula.ast import str2ast, katex_parse +import re class Parser: + """ + initial data and special variable + + Attributes + ---------- + get_token + Get different elements in the item. + txt_list + show txt list + description_list + use Parser to process and describe the txt + """ def __init__(self, data): self.lookahead = 0 self.head = 0 @@ -98,6 +111,17 @@ def call_error(self): self.error_flag = 1 def get_token(self): + r""" + Get different elements in the item. 
+ + Parameters + ---------- + + Returns + ------- + elements:chinese,alphabet,number,ch_pun_list,en_pun_list,latex formula + + """ if self.head >= len(self.text): return self.empty ch = self.text[self.head] @@ -222,7 +246,7 @@ def get_token(self): while self.head < len(self.text) and self.text[self.head] != '$': ch_informula = self.text[self.head] if flag and self.is_chinese(ch_informula): - # latex 中出现中文字符,打印且只打印一次 warning + # latex 中出现非法中文字符,打印且只打印一次 warning print("Warning: there is some chinese characters in formula!") self.warnning = 1 flag = 0 @@ -230,7 +254,7 @@ def get_token(self): if self.head >= len(self.text): self.call_error() return self.error - # 检查latex公式的完整性和可解析性 + # 检查 latex 公式的完整性和可解析性 if not self._is_formula_legal(self.text[formula_start:self.head]): self.call_error() return self.error diff --git a/EduNLP/SIF/segment/segment.py b/EduNLP/SIF/segment/segment.py index 93c5713f..39e2f869 100644 --- a/EduNLP/SIF/segment/segment.py +++ b/EduNLP/SIF/segment/segment.py @@ -16,16 +16,19 @@ class LatexFormulaSegment(str): class Figure(object): + """decode figure which has been encode by base64""" def __init__(self, is_base64=False): self.base64 = is_base64 self.figure = None @classmethod def base64_to_numpy(cls, figure: str): + """Creat a arrary in a designated buffer""" return np.frombuffer(base64.b64decode(figure), dtype=np.uint8) class FigureFormulaSegment(Figure): + """Duel with figureformula, especially coding in base64""" def __init__(self, src, is_base64=False, figure_instance: (dict, bool) = None): super(FigureFormulaSegment, self).__init__(is_base64) self.src = src @@ -45,6 +48,7 @@ def __repr__(self): class FigureSegment(Figure): + """Duel with figure, especially coding in base64""" def __init__(self, src, is_base64=False, figure_instance: (dict, bool) = None): super(FigureSegment, self).__init__(is_base64) self.src = src @@ -76,6 +80,41 @@ class SepSegment(str): class SegmentList(object): + """ + + Parameters + ---------- + item + figures:dict + 
+ Returns + ---------- + list + tokenizated item + + Examples + -------- + >>> test_item = "如图所示,则三角形$ABC$的面积是$\\SIFBlank$。$\\FigureID{1}$" + >>> SegmentList(test_item) + ['如图所示,则三角形', 'ABC', '的面积是', '\\\\SIFBlank', '。', \\FigureID{1}] + + Attributes + ---------- + segments + show all segments + text_segments + show text segments + formula_segments + show formula segments + figure_segments + show figure sements + ques_mark_segments + show question mark segments + tag_segments + show tag segments + describe + show number of each elements + """ def __init__(self, item, figures: dict = None): self._segments = [] self._text_segments = [] @@ -119,6 +158,7 @@ def __len__(self): return len(self._segments) def append(self, segment) -> None: + """add segment to corresponding segments""" if isinstance(segment, TextSegment): self._text_segments.append(len(self)) elif isinstance(segment, (LatexFormulaSegment, FigureFormulaSegment)): @@ -137,6 +177,7 @@ def append(self, segment) -> None: @property def segments(self): + """return segments""" if self._seg_idx is None: return self._segments else: @@ -144,29 +185,37 @@ def segments(self): @property def text_segments(self): + """return text segments""" return [self._segments[i] for i in self._text_segments] @property def formula_segments(self): + """return formula segments""" return [self._segments[i] for i in self._formula_segments] @property def figure_segments(self): + """return figure segments""" return [self._segments[i] for i in self._figure_segments] @property def ques_mark_segments(self): + """return question mark segments""" return [self._segments[i] for i in self._ques_mark_segments] @property def tag_segments(self): + """return tag segments""" return [self._segments[i] for i in self._tag_segments] def to_symbol(self, idx, symbol): + """switch element to its symbol""" self._segments[idx] = symbol def symbolize(self, to_symbolize="fgm"): """ + Switch designated elements to symbol. 
\ + It is a good way to protect or preserve the elements which we don't want to tokenize. Parameters ---------- @@ -175,6 +224,8 @@ def symbolize(self, to_symbolize="fgm"): "f": formula "g": figure "m": question mark + "a": tag + "s": sep Returns ------- @@ -201,6 +252,16 @@ def symbolize(self, to_symbolize="fgm"): @contextmanager def filter(self, drop: (set, str) = "", keep: (set, str) = "*"): + """ + Output special element list selective.Drop means not show.Keep means show. + + Parameters + ---------- + drop: set or str + The alphabet should be included in "tfgmas", which means drop selected segments out of return value. + keep: set or str + The alphabet should be included in "tfgmas", which means only keep selected segments in return value. + """ _drop = {c for c in drop} if isinstance(drop, str) else drop if keep == "*": _keep = {c for c in "tfgmas" if c not in _drop} @@ -223,6 +284,7 @@ def filter(self, drop: (set, str) = "", keep: (set, str) = "*"): self._seg_idx = None def describe(self): + """show the length of different segments""" return { "t": len(self._text_segments), "f": len(self._formula_segments), @@ -233,6 +295,7 @@ def describe(self): def seg(item, figures=None, symbol=None): r""" + It is a interface for SegmentList. And show it in an appropriate way. Parameters ---------- @@ -242,6 +305,8 @@ def seg(item, figures=None, symbol=None): Returns ------- + list + segmented item Examples -------- @@ -282,18 +347,18 @@ def seg(item, figures=None, symbol=None): ... } >>> from EduNLP.utils import dict2str4sif >>> test_item_1_str = dict2str4sif(test_item_1) - >>> test_item_1_str # doctest: +ELLIPSIS + >>> test_item_1_str '$\\SIFTag{stem_begin}$...$\\SIFTag{stem_end}$$\\SIFTag{options_begin}$$\\SIFTag{list_0}$0...$\\SIFTag{options_end}$' >>> s1 = seg(test_item_1_str, symbol="tfgm") - >>> s1 # doctest: +ELLIPSIS + >>> s1 ['\\SIFTag{stem_begin}'...'\\SIFTag{stem_end}', '\\SIFTag{options_begin}', '\\SIFTag{list_0}', ...] >>> with s1.filter(keep="a"): - ... 
s1 # doctest: +ELLIPSIS + ... s1 [...'\\SIFTag{list_0}', '\\SIFTag{list_1}', '\\SIFTag{list_2}', '\\SIFTag{list_3}', '\\SIFTag{options_end}'] - >>> s1.tag_segments # doctest: +ELLIPSIS + >>> s1.tag_segments ['\\SIFTag{stem_begin}', '\\SIFTag{stem_end}', '\\SIFTag{options_begin}', ... '\\SIFTag{options_end}'] >>> test_item_1_str_2 = dict2str4sif(test_item_1, tag_mode="head", add_list_no_tag=False) - >>> seg(test_item_1_str_2, symbol="tfgmas") # doctest: +ELLIPSIS + >>> seg(test_item_1_str_2, symbol="tfgmas") ['[TAG]', ... '[TAG]', '[TEXT]', '[SEP]', '[TEXT]', '[SEP]', '[FORMULA]', '[SEP]', '[TEXT]'] >>> s2 = seg(test_item_1_str_2, symbol="fgm") >>> s2.tag_segments diff --git a/EduNLP/SIF/sif.py b/EduNLP/SIF/sif.py index af4fa63a..68787131 100644 --- a/EduNLP/SIF/sif.py +++ b/EduNLP/SIF/sif.py @@ -12,15 +12,19 @@ def is_sif(item): r""" + the part aims to check whether the input is sif format + Parameters ---------- - item + item:str + a raw item which respects stem Returns ------- - when item can not be parsed correctly, raise Error; - when item doesn't need to be modified, return Ture; - when item needs to be modified, return False; + bool + when item can not be parsed correctly, raise Error; + when item doesn't need to be modified, return Ture; + when item needs to be modified, return False; Examples -------- @@ -44,13 +48,17 @@ def is_sif(item): def to_sif(item): r""" + the part aims to switch item to sif formate + Parameters ---------- - item + items:str + a raw item which respects stem Returns ------- - item + item:str + the item which accords with sif format Examples -------- @@ -73,29 +81,46 @@ def sif4sci(item: str, figures: (dict, bool) = None, safe=True, symbol: str = No Parameters ---------- - item - figures - safe - symbol - tokenization + item:str + a raw item which respects stem + figures:dict + {"FigureID": Base64 encoding of the figure} + + safe:bool + Check whether the text conforms to the sif format + + symbol:str + select the methods to symbolize: 
+ "t": text + "f": formula + "g": figure + "m": question mark + "a": tag + "s": sep + + tokenization:bool + True: tokenize the item + tokenization_params: method: which tokenizer to be used, "linear" or "ast" - The parameters only useful for "linear": + + The parameters only useful for "linear": None The parameters only useful for "ast": ord2token: whether to transfer the variables (mathord) and constants (textord) to special tokens. var_numbering: whether to use number suffix to denote different variables errors: - warn - raise - coerce - strict + warn, + raise, + coerce, + strict, ignore Returns ------- - When tokenization is False, return SegmentList; - When tokenization is True, return TokenList + list + When tokenization is False, return SegmentList; + When tokenization is True, return TokenList Examples -------- diff --git a/EduNLP/SIF/tokenization/formula/ast_token.py b/EduNLP/SIF/tokenization/formula/ast_token.py index 67ca8ffd..22a2af23 100644 --- a/EduNLP/SIF/tokenization/formula/ast_token.py +++ b/EduNLP/SIF/tokenization/formula/ast_token.py @@ -35,6 +35,10 @@ # return nodes def traversal_formula(ast, ord2token=False, var_numbering=False, strategy="post", *args, **kwargs): + """ + The part will run only when the return type is list. And it provides two strategy: post and linear. + Besides, tokens list will append node follow its type. + """ tokens = [] if strategy == "post": order = nx.dfs_postorder_nodes(ast) @@ -58,6 +62,7 @@ def traversal_formula(ast, ord2token=False, var_numbering=False, strategy="post" def ast_tokenize(formula, ord2token=False, var_numbering=False, return_type="formula", *args, **kwargs): """ + According to return type, tokenizing formula by different methods. 
Parameters ---------- diff --git a/EduNLP/SIF/tokenization/formula/formula.py b/EduNLP/SIF/tokenization/formula/formula.py index 8afbe2da..eb08f418 100644 --- a/EduNLP/SIF/tokenization/formula/formula.py +++ b/EduNLP/SIF/tokenization/formula/formula.py @@ -9,6 +9,7 @@ def tokenize(formula, method="linear", errors="raise", **kwargs): """ + The total function to tokenize formula by linear or ast. Parameters ---------- diff --git a/EduNLP/SIF/tokenization/formula/linear_token.py b/EduNLP/SIF/tokenization/formula/linear_token.py index 7b5d1212..1e3236bc 100644 --- a/EduNLP/SIF/tokenization/formula/linear_token.py +++ b/EduNLP/SIF/tokenization/formula/linear_token.py @@ -6,6 +6,37 @@ def cut(formula, preserve_braces=True, with_dollar=False, preserve_dollar=False, number_as_tag=False, preserve_src=True): # pragma: no cover + """ + cut formula thoroughly + + Parameters + ---------- + formula:str + preserve_braces: + when it is False "{" and "}" will be filted + with_dollar: + have dollar or not + preserve_dollar: + keep "$" + number_as_tag: + whether switch number to tag, it just can idenify the number which is more than one bit. 
+ preserve_src + + Returns + -------- + list + return a preliminary list which cut fully + + Examples + ---------- + >>> cut(r"${x + y}^\\frac{1}{2} + 12.1 = 0$") + ['{x + y}', '^', '\\\\f', 'r', 'a', 'c', '{1}', '{2}', '+', '12.1', '=', '0'] + >>> cut(r"${x + y}^\\frac{1}{2} + 12.1 = 0$",preserve_dollar=False) + ['{x + y}', '^', '\\\\f', 'r', 'a', 'c', '{1}', '{2}', '+', '12.1', '=', '0'] + >>> cut(r"${x + y}^\\frac{1}{2} + 12.1 = 0$",number_as_tag=True) + ['{x + y}', '^', '\\\\f', 'r', 'a', 'c', '{1}', '{2}', '+', '{decimal}', '=', '0'] + + """ class States(IntFlag): CHAR = 0 MATH = 1 @@ -135,6 +166,7 @@ class States(IntFlag): def reduce(fea): # pragma: no cover + """restore some formula""" rules = [ ('a r c s i n', 'arcsin'), ('a r c c o s', 'arccos'), @@ -165,6 +197,7 @@ def reduce(fea): # pragma: no cover def connect_char(words): # pragma: no cover + """connect and switch to list type""" result = [] buffer = "" for w in words: @@ -201,6 +234,8 @@ def latex_parse(formula, preserve_braces=True, with_dollar=True, def linear_tokenize(formula, preserve_braces=True, number_as_tag=False, *args, **kwargs): """ + linear tokenize formula. + It includes three processes:cut, reduce and connect_char. Parameters ---------- diff --git a/EduNLP/SIF/tokenization/text/tokenization.py b/EduNLP/SIF/tokenization/text/tokenization.py index cee23a50..2e063f85 100644 --- a/EduNLP/SIF/tokenization/text/tokenization.py +++ b/EduNLP/SIF/tokenization/text/tokenization.py @@ -9,6 +9,7 @@ def tokenize(text, granularity="word", stopwords="default"): """ + Using jieba library to tokenize item by word or char. 
Parameters ---------- diff --git a/EduNLP/SIF/tokenization/tokenization.py b/EduNLP/SIF/tokenization/tokenization.py index 299eaf62..93eb61a1 100644 --- a/EduNLP/SIF/tokenization/tokenization.py +++ b/EduNLP/SIF/tokenization/tokenization.py @@ -16,9 +16,30 @@ class TokenList(object): """ + Parameters + ---------- + segment_list:list + segmented item + text_params:dict + formula_params:dict + figure_params:dict + Attributes ------------- - + tokens + show all tokens + text_tokens + show text tokens + formula_tokens + show formula tokens + figure_tokens + show figure tokens + ques_mark_tokens + show question mark tokens + tag_tokens + show tag tokens + describe + show number of each elements """ def __init__(self, segment_list: SegmentList, text_params=None, formula_params=None, figure_params=None): self._tokens = [] @@ -53,6 +74,7 @@ def __init__(self, segment_list: SegmentList, text_params=None, formula_params=N self._token_idx = None def _variable_standardization(self): + """It makes same parmeters have the same number.""" if self.formula_tokenize_method == "ast": ast_formulas = [self._tokens[i] for i in self._formula_tokens if isinstance(self._tokens[i], Formula)] if ast_formulas: @@ -60,6 +82,22 @@ def _variable_standardization(self): @contextmanager def add_seg_type(self, seg_type, tar: list, add_seg_type=True, mode="delimiter"): + """ + Add seg tag in different position + + Parameters + ---------- + seg_type:str + t: text + f:formula + tar:list + add_seg_type + if the value==False, the function will not be executed. 
+ mode:str + delimiter: both in the head and at the tail + head: only in the head + tail: only at the tail + """ if add_seg_type is True and mode in {"delimiter", "head"}: if seg_type == "t": tar.append(TEXT_BEGIN) @@ -79,6 +117,7 @@ def add_seg_type(self, seg_type, tar: list, add_seg_type=True, mode="delimiter") def get_segments(self, add_seg_type=True, add_seg_mode="delimiter", keep="*", drop="", depth=None): # pragma: no cover r""" + call segment function. Parameters ---------- @@ -97,6 +136,8 @@ def get_segments(self, add_seg_type=True, add_seg_mode="delimiter", keep="*", dr Returns ------- + list + segmented item """ keep = set("tfgmas" if keep == "*" else keep) - set(drop) @@ -125,6 +166,7 @@ def get_segments(self, add_seg_type=True, add_seg_mode="delimiter", keep="*", dr return _segments def __get_segments(self, seg_type): + """It aims to understand letters' meaning.""" _segments = [] for i in self._seg_types[seg_type]: _segment = [] @@ -137,22 +179,27 @@ def __get_segments(self, seg_type): @property def text_segments(self): + """get text segment""" return self.__get_segments("t") @property def formula_segments(self): + """get formula segment""" return self.__get_segments("f") @property def figure_segments(self): + """get figure segment""" return self.__get_segments("g") @property def ques_mark_segments(self): + """get question mark segment""" return self.__get_segments("m") @property def tokens(self): + """add token to a list""" tokens = [] if self._token_idx is not None: for i, token in enumerate(self._tokens): @@ -164,6 +211,7 @@ def tokens(self): return tokens def append_text(self, segment, symbol=False): + """append text""" with self._append("t"): if symbol is False: tokens = text.tokenize(segment, **self.text_params) @@ -175,6 +223,7 @@ def append_text(self, segment, symbol=False): self._tokens.append(segment) def append_formula(self, segment, symbol=False, init=True): + """append formula by different methods""" with self._append("f"): if symbol is 
True: self._formula_tokens.append(len(self._tokens)) @@ -198,27 +247,32 @@ def append_formula(self, segment, symbol=False, init=True): self._tokens.append(token) def append_figure(self, segment, **kwargs): + """append figure""" with self._append("g"): self._figure_tokens.append(len(self._tokens)) self._tokens.append(segment) def append_ques_mark(self, segment, **kwargs): + """append question mark""" with self._append("m"): self._ques_mark_tokens.append(len(self._tokens)) self._tokens.append(segment) def append_tag(self, segment, **kwargs): + """append tag""" with self._append("a"): self._tag_tokens.append(len(self._tokens)) self._tokens.append(segment) def append_sep(self, segment, **kwargs): + """append sep""" with self._append("s"): self._sep_tokens.append(len(self._tokens)) self._tokens.append(segment) @contextmanager def _append(self, seg_type): + """record the (start, end, seg_type) span of tokens appended within this context""" start = len(self._tokens) yield end = len(self._tokens) @@ -226,6 +280,16 @@ def _append(self, seg_type): self._segments.append((start, end, seg_type)) def append(self, segment, lazy=False): + """ + the total api for appending elements + + Parameters + ---------- + segment + lazy + True: Doesn't distinguish parameters. + False: It makes same parameters have the same number. 
+ """ if isinstance(segment, TextSegment): self.append_text(segment) elif isinstance(segment, (LatexFormulaSegment, FigureFormulaSegment)): @@ -259,15 +323,18 @@ def append(self, segment, lazy=False): raise TypeError("Unknown segment type: %s" % type(segment)) def extend(self, segments): + """append every segment in turn""" for segment in segments: self.append(segment, True) self._variable_standardization() @property def text_tokens(self): + """return text tokens""" return [self._tokens[i] for i in self._text_tokens] def __add_token(self, token, tokens): + """classify token to tokens""" if isinstance(token, Formula): if self.formula_params.get("return_type") == "list": tokens.extend(formula.traversal_formula(token.ast_graph, **self.formula_params)) @@ -285,6 +352,7 @@ def __add_token(self, token, tokens): @property def formula_tokens(self): + """return formula tokens""" tokens = [] for i in self._formula_tokens: self.__add_token(self._tokens[i], tokens) @@ -292,6 +360,7 @@ def formula_tokens(self): @property def figure_tokens(self): + """return figure tokens""" tokens = [] for i in self._figure_tokens: self.__add_token(self._tokens[i], tokens) @@ -299,6 +368,7 @@ def figure_tokens(self): @property def ques_mark_tokens(self): + """return question mark tokens""" return [self._tokens[i] for i in self._ques_mark_tokens] def __repr__(self): @@ -306,10 +376,26 @@ def __repr__(self): @property def inner_formula_tokens(self): + """return inner formula tokens""" return [self._tokens[i] for i in self._formula_tokens] @contextmanager def filter(self, drop: (set, str) = "", keep: (set, str) = "*"): + """ + Output special element list selective.Drop means not show.Keep means show. + + Parameters + ---------- + drop: set or str + The alphabet should be included in "tfgmas", which means drop selected segments out of return value. + keep: set or str + The alphabet should be included in "tfgmas", which means only keep selected segments in return value. 
+ + Returns + -------- + list + filtered list + """ _drop = {c for c in drop} if isinstance(drop, str) else drop if keep == "*": _keep = {c for c in "tfgmas" if c not in _drop} else: @@ -332,6 +418,7 @@ def filter(self, drop: (set, str) = "", keep: (set, str) = "*"): self._token_idx = None def describe(self): + """show the total number of each elements""" return { "t": len(self._text_tokens), "f": len(self._formula_tokens), @@ -341,10 +428,38 @@ def tokenize(segment_list: SegmentList, text_params=None, formula_params=None, figure_params=None): + """ + an actual api to tokenize item + + Parameters + ---------- + segment_list:list + segmented item + text_params:dict + the method to deal with text + formula_params:dict + the method to deal with formula + figure_params:dict + the method to deal with figure + + Returns + ---------- + list + tokenized item + + Examples + -------- + >>> items = "如图所示,则三角形$ABC$的面积是$\\SIFBlank$。$\\FigureID{1}$" + >>> tokenize(SegmentList(items)) + ['如图所示', '三角形', 'ABC', '面积', '\\\\SIFBlank', \\FigureID{1}] + >>> tokenize(SegmentList(items),formula_params={"method": "ast"}) + ['如图所示', '三角形', , '面积', '\\\\SIFBlank', \\FigureID{1}] + """ return TokenList(segment_list, text_params, formula_params, figure_params) def link_formulas(*token_list: TokenList, link_vars=True): + """call formula function""" ast_formulas = [] for tl in token_list: if tl.formula_tokenize_method == "ast": diff --git a/EduNLP/Tokenizer/tokenizer.py b/EduNLP/Tokenizer/tokenizer.py index 08b09e26..64e40970 100644 --- a/EduNLP/Tokenizer/tokenizer.py +++ b/EduNLP/Tokenizer/tokenizer.py @@ -15,6 +15,19 @@ def __call__(self, *args, **kwargs): class PureTextTokenizer(Tokenizer): r""" + Deal with text and plain text formula. + And filtering special formula like $\\FormFigureID{…}$ and $\\FormFigureBase64{…}. 
+ + Parameters + ---------- + items: str + key + args + kwargs + + Returns + ------- + token Examples -------- @@ -40,7 +53,6 @@ class PureTextTokenizer(Tokenizer): '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', '\\quad', 'A', '\\cap', 'B', '='] """ - def __init__(self, *args, **kwargs): self.tokenization_params = { "formula_params": { @@ -56,6 +68,18 @@ def __call__(self, items: Iterable, key=lambda x: x, *args, **kwargs): class TextTokenizer(Tokenizer): r""" + Duel with text and formula including special formula. + + Parameters + ---------- + items: str + key + args + kwargs + + Returns + ------- + token Examples ---------- @@ -72,7 +96,6 @@ class TextTokenizer(Tokenizer): >>> next(tokens)[:10] ['[TAG]', '复数', 'z', '=', '1', '+', '2', 'i', '+', 'i'] """ - def __init__(self, *args, **kwargs): self.tokenization_params = { "formula_params": { @@ -94,12 +117,16 @@ def __call__(self, items: Iterable, key=lambda x: x, *args, **kwargs): def get_tokenizer(name, *args, **kwargs): r""" + It is a total interface to use difference tokenizer. Parameters ---------- name: str - args - kwargs + the name of tokenizer, e.g. text, pure_text. + args: + the parameters passed to tokenizer + kwargs: + the parameters passed to tokenizer Returns ------- diff --git a/EduNLP/Vector/gensim_vec.py b/EduNLP/Vector/gensim_vec.py index 23a623b8..3ade74a5 100644 --- a/EduNLP/Vector/gensim_vec.py +++ b/EduNLP/Vector/gensim_vec.py @@ -11,16 +11,19 @@ class W2V(Vector): + """ + The part uses gensim library providing FastText, Word2Vec and KeyedVectors method to transfer word to vector. 
+ + Parameters + ---------- + filepath: + path to the pretrained model file + method: str + fasttext + other(Word2Vec) + binary + """ def __init__(self, filepath, method=None, binary=None): - """ - - Parameters - ---------- - filepath: - path to the pretrained model file - method - binary - """ fp = PurePath(filepath) self.binary = binary if binary is not None else (True if fp.suffix == ".bin" else False) if self.binary is True: @@ -70,6 +73,22 @@ def infer_tokens(self, items, *args, **kwargs) -> list: class BowLoader(object): + """ + Using doc2bow model, which has a lot of effects. + + Convert document (a list of words) into the bag-of-words format = list of \ + (token_id, token_count) 2-tuples. Each word is assumed to be a \ + tokenized and normalized string (either unicode or utf8-encoded). \ + No further preprocessing is done on the words in document;\ + apply tokenization, stemming etc. before calling this method. + + If allow_update is set, then also update dictionary in the process: \ + create ids for new words. At the same time, update document frequencies – \ + for each word appearing in this document, increase its document frequency (self.dfs) by one. + + If allow_update is not set, this function is const, \ + aka read-only. + """ def __init__(self, filepath): self.dictionary = corpora.Dictionary.load(filepath) @@ -88,6 +107,11 @@ def vector_size(self): class TfidfLoader(object): + """ + This module implements functionality related to the Term Frequency - \ + Inverse Document Frequency \ + vector space bag-of-words models. + """ def __init__(self, filepath): self.tfidf_model = TfidfModel.load(filepath) # 'tfidf' model shold be used based on 'bow' model @@ -111,6 +135,22 @@ def vector_size(self): class D2V(Vector): + """ + It is a collection which include d2v, bow, tfidf method. 
+ + Parameters + ----------- + filepath + method: str + d2v + bow + tfidf + item + + Returns + --------- + d2v model:D2V + """ def __init__(self, filepath, method="d2v"): self._method = method self._filepath = filepath diff --git a/EduNLP/Vector/t2v.py b/EduNLP/Vector/t2v.py index ec0887ef..dbd68855 100644 --- a/EduNLP/Vector/t2v.py +++ b/EduNLP/Vector/t2v.py @@ -20,6 +20,25 @@ class T2V(object): + """ + The function aims to transfer token list to vector. If you have a certain model, you can use T2V directly. \ + Otherwise, calling get_pretrained_t2v function is a better way to get vector which can switch it without your model. + + Parameters + ---------- + model: str + select the model type + e.g.: d2v, rnn, lstm, gru, elmo, etc. + + Examples + -------- + >>> item = [{'ques_content':'有公式$\\FormFigureID{wrong1?}$和公式$\\FormFigureBase64{wrong2?}$,\ + ... 如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,若$x,y$满足约束条件$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$'}] + >>> path = "examples/test_model/test_gensim_luna_stem_tf_d2v_256.bin" + >>> t2v = T2V('d2v',filepath=path) + >>> print(t2v(item)) # doctest: +ELLIPSIS + [array([...dtype=float32)] + """ def __init__(self, model: str, *args, **kwargs): model = model.lower() self.model_type = model @@ -49,10 +68,41 @@ def vector_size(self) -> int: "d2v_lit_256": ["http://base.ustc.edu.cn/data/model_zoo/EduNLP/d2v/general_literal_256.zip", "d2v"], "w2v_eng_300": ["http://base.ustc.edu.cn/data/model_zoo/EduNLP/w2v/general_english_300.zip", "w2v"], "w2v_lit_300": ["http://base.ustc.edu.cn/data/model_zoo/EduNLP/w2v/general_literal_300.zip", "w2v"], + "test_w2v": ["http://base.ustc.edu.cn/data/model_zoo/EduNLP/w2v/test_w2v_256.zip", "w2v"], + "test_d2v": ["http://base.ustc.edu.cn/data/model_zoo/EduNLP/d2v/test_256.zip", "d2v"] } def get_pretrained_t2v(name, model_dir=MODEL_DIR): + """ + It is a good idea if you want to switch token list to vector earily. 
+ + Parameters + ---------- + name:str + select the pretrained model + e.g.: + d2v_all_256, + d2v_sci_256, + d2v_eng_256, + d2v_lit_256, + w2v_eng_300, + w2v_lit_300. + model_dir:str + the path of model, default: MODEL_DIR = '~/.EduNLP/model' + + Returns + ------- + t2v model: T2V + + Examples + -------- + >>> item = [{'ques_content':'有公式$\\FormFigureID{wrong1?}$和公式$\\FormFigureBase64{wrong2?}$,\ + ... 如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,若$x,y$满足约束条件$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$'}] + >>> i2v = get_pretrained_t2v("test_d2v", "examples/test_model/data/d2v") # doctest: +ELLIPSIS + >>> print(i2v(item)) # doctest: +ELLIPSIS + [array([...dtype=float32)] + """ if name not in PRETRAINED_MODELS: raise KeyError( "Unknown pretrained model %s, use one of the provided pretrained models: %s" % ( diff --git a/EduNLP/utils/data.py b/EduNLP/utils/data.py index dec5d773..e901696c 100644 --- a/EduNLP/utils/data.py +++ b/EduNLP/utils/data.py @@ -11,6 +11,7 @@ @contextmanager def add_annotation(key, tag_mode, tar: list, key_as_tag=True): + """add tag""" if key_as_tag is True: if tag_mode == "delimiter": tar.append(ann_begin_format.format(key)) @@ -26,6 +27,7 @@ def add_annotation(key, tag_mode, tar: list, key_as_tag=True): def dict2str4sif(obj: dict, key_as_tag=True, tag_mode="delimiter", add_list_no_tag=True, keys=None) -> str: r""" + The function aims to transfer dictionary format item to string format item. 
Parameters ---------- diff --git a/docs/source/_static/formula.png b/docs/source/_static/formula.png new file mode 100644 index 00000000..10fecbd3 Binary files /dev/null and b/docs/source/_static/formula.png differ diff --git a/docs/source/_static/formulagroup.png b/docs/source/_static/formulagroup.png new file mode 100644 index 00000000..4b48f46c Binary files /dev/null and b/docs/source/_static/formulagroup.png differ diff --git "a/docs/source/_static/\346\226\260\346\265\201\347\250\213\345\233\276.png" "b/docs/source/_static/\346\226\260\346\265\201\347\250\213\345\233\276.png" new file mode 100644 index 00000000..bbfacbd3 Binary files /dev/null and "b/docs/source/_static/\346\226\260\346\265\201\347\250\213\345\233\276.png" differ diff --git "a/docs/source/_static/\346\265\201\347\250\213\345\233\276.jpg" "b/docs/source/_static/\346\265\201\347\250\213\345\233\276.jpg" new file mode 100644 index 00000000..dfdb8737 Binary files /dev/null and "b/docs/source/_static/\346\265\201\347\250\213\345\233\276.jpg" differ diff --git a/docs/source/api/ModelZoo.rst b/docs/source/api/ModelZoo.rst index ffdc764d..4a624cb9 100644 --- a/docs/source/api/ModelZoo.rst +++ b/docs/source/api/ModelZoo.rst @@ -1,5 +1,5 @@ EduNLP.ModelZoo -============== +================== rnn ----------- diff --git a/docs/source/api/formula.rst b/docs/source/api/formula.rst index a584d003..d34311c3 100644 --- a/docs/source/api/formula.rst +++ b/docs/source/api/formula.rst @@ -1,6 +1,10 @@ EduNLP.Formula ======================= +.. automodule:: EduNLP.Formula.Formula + :members: + :imported-members: + .. automodule:: EduNLP.Formula.ast :members: :imported-members: diff --git a/docs/source/api/sif.rst b/docs/source/api/sif.rst index a49f7f15..7467b7cb 100644 --- a/docs/source/api/sif.rst +++ b/docs/source/api/sif.rst @@ -8,16 +8,16 @@ SIF :imported-members: -Segment ----------- -.. automodule:: EduNLP.SIF.segment +Parser +-------- +.. 
automodule:: EduNLP.SIF.parser.parser.Parser :members: :imported-members: -Parser --------- -.. automodule:: EduNLP.SIF.parser +Segment +---------- +.. automodule:: EduNLP.SIF.segment.segment :members: :imported-members: @@ -40,6 +40,14 @@ text formula ^^^^^^^^^ -.. automodule:: EduNLP.SIF.tokenization.formula +.. automodule:: EduNLP.SIF.tokenization.formula.formula + :members: + :imported-members: + +.. automodule:: EduNLP.SIF.tokenization.formula.ast_token :members: :imported-members: + +.. automodule:: EduNLP.SIF.tokenization.formula.linear_token + :members: + :imported-members: \ No newline at end of file diff --git a/docs/source/api/vector.rst b/docs/source/api/vector.rst index 9081dfea..1c73d4bc 100644 --- a/docs/source/api/vector.rst +++ b/docs/source/api/vector.rst @@ -1,10 +1,16 @@ EduNLP.Vector ========================== -Vector ---------------- +EduNLP.Vector.rnn +-------------------- -.. automodule:: EduNLP.Vector +.. automodule:: EduNLP.Vector.rnn :members: :imported-members: +EduNLP.Vector +------------------------- + +.. automodule:: EduNLP.Vector + :members: + :imported-members: diff --git a/docs/source/conf.py b/docs/source/conf.py index 9d6a118b..7787a16d 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -74,6 +74,11 @@ def copy_tree(src, tar): 'build/blitz/pretrain/seg_token/d2v': '_static/d2v.png', 'build/blitz/pretrain/seg_token/d2v_d1': '_static/d2v_d1.png', 'build/blitz/pretrain/seg_token/d2v_d2': '_static/d2v_d2.png', + 'build/blitz/tokenizer/tokenizier': '_static/tokenizer.png', + 'build/blitz/sif/sif4sci': '_static/tokenizer.png', + 'build/blitz/vectorization/get_pretrained_i2v': '_static/i2v.png', + 'build/blitz/tokenizer/total_tokenize': '_static/tokenizer.png', + 'build/blitz/vectorization/total_vector': '_static/i2v.png', } # Add any paths that contain templates here, relative to this directory. 
diff --git a/docs/source/index.rst b/docs/source/index.rst index 16107eae..27e9ed8e 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -151,11 +151,11 @@ If this repository is helpful for you, please cite our work tutorial/zh/index tutorial/zh/sif - tutorial/zh/seg tutorial/zh/parse + tutorial/zh/seg tutorial/zh/tokenize - tutorial/zh/vectorization tutorial/zh/pretrain + tutorial/zh/vectorization .. toctree:: @@ -164,13 +164,12 @@ If this repository is helpful for you, please cite our work :hidden: :glob: - api/index - api/i2v api/sif - api/tokenizer + api/utils api/formula + api/tokenizer api/pretrain api/ModelZoo + api/i2v api/vector - api/utils diff --git a/docs/source/tutorial/zh/index.rst b/docs/source/tutorial/zh/index.rst index 546065b0..5dafba2b 100644 --- a/docs/source/tutorial/zh/index.rst +++ b/docs/source/tutorial/zh/index.rst @@ -3,148 +3,51 @@ * `标准项目格式 `_ -* `语法解析 `_ +* `语法解析 `_ -* `成分分解 `_ +* `成分分解 `_ * `令牌化 `_ -* `向量化 `_ - * `预训练 `_ -示例 --------- - -标准项目格式 -^^^^^^^^ - -.. nbgallery:: - :caption: This is a thumbnail gallery: - :name: sif_gallery - :glob: - - Code for beginner to learn how to use SIF4Sci <../../build/blitz/sif/sif> - Code for beginner to learn how to use sif_additon <../../build/blitz/sif/sif_addition> - - -成分分解 -^^^^^^^^^^^ - -语义成分分解 -#################### - -.. nbgallery:: - :caption: This is a thumbnail gallery: - :name: dict2str4sif_gallery - :glob: - - Code for beginner to learn how to use dict2str4sif <../../build/blitz/utils/data.ipynb> - - -结构成分分解 -#################### +* `向量化 `_ -.. nbgallery:: - :caption: This is a thumbnail gallery: - :name: seg_gallery - :glob: - - Code for beginner to learn how to use seg <../../build/blitz/seg/seg.ipynb> +主要流程 +---------- +.. 
figure:: ../../_static/新流程图.png -语法解析 -^^^^^^^^^^^ +* `语法解析 `_ :其作用是将传入的item转换为标准sif格式(即把字母、数字用 ``$...$`` 包裹起来,把选择填空的括号、下划线转换为特殊符号等)。 -文本语法结构解析 -#################### +* `成分分解 `_ :其作用是将传入的符合sif标准的item根据元素种类进行分割开来,从而服务于后面的令牌化环节(即可以将不同类型元素使用各自的方法令牌化)。 -.. nbgallery:: - :caption: This is a thumbnail gallery: - :name: parse_gallery - :glob: - - Code for beginner to learn how to use parse <../../build/blitz/parse/parse.ipynb> +* `令牌化 `_:其作用是将传入的经过分词后的item元素列表进行令牌化分解,从而服务于后面的向量化模块。 + 其中通常情况下直接使用文本形式的令牌化方法即可,对于公式而言还可使用ast方法进行解析(调用formula模块); +* `向量化 `_:此部分主要调用的是I2V类及其子类,其作用是将传入的令牌化后的item元素列表进行向量化操作,最终即可得到相应的静态向量。 + 对于向量化模块来说,可以调用自己训练好的模型,也可直接调用提供的预训练模型(调用get_pretrained_i2v模块即可)。 -公式语法结构解析 -#################### +* **下游模型**:将得到的向量进一步处理,从而得到所需的结果。 -.. nbgallery:: - :caption: This is a thumbnail gallery: - :name: formula_gallery - :glob: - - Code for beginner to learn how to use Formula <../../build/blitz/formula/formula.ipynb> +示例 +-------- +为使您快速了解此项目的功能,此部分仅展示常用的函数接口使用方法(如得到令牌化序列、试题对应的向量等),对于其中间函数模块(如parse、formula、segment等)以及更细分的接口方法不做展示,如需深入学习,请查看相关部分的文档。 -令牌化 -^^^^^^^^^^^ .. nbgallery:: :caption: This is a thumbnail gallery: - :name: tokenizer_gallery + :name: tokenize_gallery :glob: - Code for beginner to learn how to use Tokenizer <../../build/blitz/tokenizer/tokenizer.ipynb> + 令牌化 <../../build/blitz/tokenizer/tokenizer.ipynb> -向量化 -^^^^^^^^^^^ .. nbgallery:: :caption: This is a thumbnail gallery: :name: vectorization_gallery :glob: - Code for beginner to learn how to use i2v <../../build/blitz/vectorization/i2v.ipynb> - - -预训练 -^^^^^^^^^^^ - -获得数据集 -#################### - -.. nbgallery:: - :caption: This is a thumbnail gallery: - :name: rst1-gallery - :glob: - - prepare_dataset <../../build/blitz/pretrain/prepare_dataset.ipynb> - - -gensim模型d2v例子 -#################### - -.. 
nbgallery:: - :caption: This is a thumbnail gallery: - :name: rst2-gallery - :glob: - - d2v_general <../../build/blitz/pretrain/gensim/d2v_general.ipynb> - d2v_bow_tfidf <../../build/blitz/pretrain/gensim/d2v_bow_tfidf.ipynb> - d2v_stem_tf <../../build/blitz/pretrain/gensim/d2v_stem_tf.ipynb> - - -gensim模型w2v例子 -#################### - -.. nbgallery:: - :caption: This is a thumbnail gallery: - :name: rst3-gallery - :glob: - - w2v_stem_text <../../build/blitz/pretrain/gensim/w2v_stem_text.ipynb> - w2v_stem_tf <../../build/blitz/pretrain/gensim/w2v_stem_tf.ipynb> - - -seg_token例子 -#################### - -.. nbgallery:: - :caption: This is a thumbnail gallery: - :name: rst4-gallery - :glob: - - d2v.ipynb <../../build/blitz/pretrain/seg_token/d2v.ipynb> + 向量化 <../../build/blitz/vectorization/total_vector.ipynb> diff --git a/docs/source/tutorial/zh/parse.rst b/docs/source/tutorial/zh/parse.rst index 9d6ea22e..03721ba0 100644 --- a/docs/source/tutorial/zh/parse.rst +++ b/docs/source/tutorial/zh/parse.rst @@ -4,6 +4,7 @@ 在教育资源中,文本、公式都具有内在的隐式或显式的语法结构,提取这种结构对后续进一步的处理是大有裨益的: * 文本语法结构解析 + * 公式语法结构解析 其目的是: @@ -18,19 +19,274 @@ 1.匹配公式之外的英文字母、数字,只对两个汉字之间的字母、数字做修正,其余匹配到的情况视为不合 latex 语法录入的公式 -2.匹配“( )”型括号(包含英文格式和中文格式),即括号内无内容或为空格的括号,将括号替换$\\SIFChoice$ +2.匹配“( )”型括号(包含英文格式和中文格式),即括号内无内容或为空格的括号,将括号替换 ``$\\SIFChoice$`` -3.匹配下划线,替换连续的下划线或下划线中夹杂空格的情况,将其替换为$\\SIFBlank$ +3.匹配下划线,替换连续的下划线或下划线中夹杂空格的情况,将其替换为 ``$\\SIFBlank$`` 4.匹配latex公式,主要检查latex公式的完整性和可解析性,对latex 中出现中文字符发出警告 -学习路线图 +公式语法结构解析 +-------------------- + +本功能主要由EduNLP.Formula模块实现,具有检查传入的公式是否合法,并将合法的公式转换为art树的形式。从实际使用的角度,本模块常作为中间处理过程,调用相应的模型即可自动选择本模块的相关参数,故一般不需要特别关注。 + +主要内容介绍 ++++++++++++++++ + +1.Formula:对传入的单个公式进行判断,判断传入的公式是否为str形式,如果是则使用ast的方法进行处理,否则进行报错。此外,提供了variable_standardization参数,当此参数为True时,使用变量标准化方法,即同一变量拥有相同的变量编号。 + +2.FormulaGroup:如果需要传入公式集则可调用此接口,最终将形成ast森林,森林中树的结构同Formula。 + +Formula +>>>>>>>>>>>> + +Formula 首先在分词功能中对原始文本的公式做切分处理,另外提供 ``公式解析树`` 功能,可以将数学公式的抽象语法分析树用文本或图片的形式表示出来。 + 
+本模块另提供公式变量标准化的功能,如判断几个子公式内的‘x’为同一变量。 + +调用库 ++++++++++ + +:: + + import matplotlib.pyplot as plt + from EduNLP.Formula import Formula + from EduNLP.Formula.viz import ForestPlotter + +初始化 ++++++++++ + +传入参数:item + +item为str 或 List[Dict]类型,具体内容为latex 公式 或 公式经解析后产生的抽象语法分析树。 + +:: + + >>> f=Formula("x^2 + x+1 = y") + >>> f + + +查看公式切分后的具体内容 +++++++++++++++++++++++++++++ + +- 查看公式切分后的结点元素 + +:: + + >>> f.elements + [{'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None}, + {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base'}, + {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, + {'id': 3, 'type': 'bin', 'text': '+', 'role': None}, + {'id': 4, 'type': 'mathord', 'text': 'x', 'role': None}, + {'id': 5, 'type': 'bin', 'text': '+', 'role': None}, + {'id': 6, 'type': 'textord', 'text': '1', 'role': None}, + {'id': 7, 'type': 'rel', 'text': '=', 'role': None}, + {'id': 8, 'type': 'mathord', 'text': 'y', 'role': None}] + +- 查看公式的抽象语法分析树 + +:: + + >>> f.ast + [{'val': {'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None}, + 'structure': {'bro': [None, 3],'child': [1, 2],'father': None,'forest': None}}, + {'val': {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base'}, + 'structure': {'bro': [None, 2], 'child': None, 'father': 0, 'forest': None}}, + {'val': {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, + 'structure': {'bro': [1, None], 'child': None, 'father': 0, 'forest': None}}, + {'val': {'id': 3, 'type': 'bin', 'text': '+', 'role': None}, + 'structure': {'bro': [0, 4], 'child': None, 'father': None, 'forest': None}}, + {'val': {'id': 4, 'type': 'mathord', 'text': 'x', 'role': None}, + 'structure': {'bro': [3, 5], 'child': None, 'father': None, 'forest': None}}, + {'val': {'id': 5, 'type': 'bin', 'text': '+', 'role': None}, + 'structure': {'bro': [4, 6], 'child': None, 'father': None, 'forest': None}}, + {'val': {'id': 6, 'type': 'textord', 'text': '1', 'role': None}, + 'structure': {'bro': [5, 7], 'child': None, 'father': None, 
'forest': None}}, + {'val': {'id': 7, 'type': 'rel', 'text': '=', 'role': None}, + 'structure': {'bro': [6, 8], 'child': None, 'father': None, 'forest': None}}, + {'val': {'id': 8, 'type': 'mathord', 'text': 'y', 'role': None}, + 'structure': {'bro': [7, None],'child': None,'father': None,'forest': None}}] + + >>> print('nodes: ',f.ast_graph.nodes) + nodes: [0, 1, 2, 3, 4, 5, 6, 7, 8] + >>> print('edges: ' ,f.ast_graph.edges) + edges: [(0, 1), (0, 2)] + +- 将抽象语法分析树用图片表示 + +:: + + >>> ForestPlotter().export(f.ast_graph, root_list=[node["val"]["id"] for node in f.ast if node["structure"]["father"] is None],) + >>> plt.show() + + +.. figure:: ../../_static/formula.png + + +变量标准化 ++++++++++++ + +此参数使得同一变量拥有相同的变量编号。 + +如:``x`` 变量的编号为 ``0``, ``y`` 变量的编号为 ``1``。 + +:: + + >>> f.variable_standardization().elements + [{'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None}, + {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base', 'var': 0}, + {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, + {'id': 3, 'type': 'bin', 'text': '+', 'role': None}, + {'id': 4, 'type': 'mathord', 'text': 'x', 'role': None, 'var': 0}, + {'id': 5, 'type': 'bin', 'text': '+', 'role': None}, + {'id': 6, 'type': 'textord', 'text': '1', 'role': None}, + {'id': 7, 'type': 'rel', 'text': '=', 'role': None}, + {'id': 8, 'type': 'mathord', 'text': 'y', 'role': None, 'var': 1}] + +FormulaGroup +>>>>>>>>>>>>>>> + +调用 ``FormulaGroup`` 类解析公式方程组,相关的属性和函数方法同上。 + +:: + + import matplotlib.pyplot as plt + from EduNLP.Formula import Formula + from EduNLP.Formula import FormulaGroup + from EduNLP.Formula.viz import ForestPlotter + >>> fs = FormulaGroup(["x^2 = y", "x^3 = y^2", "x + y = \pi"]) + >>> fs + ;;> + >>> fs.elements + [{'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None}, + {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base'}, + {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, + {'id': 3, 'type': 'rel', 'text': '=', 'role': None}, + {'id': 4, 'type': 'mathord', 
'text': 'y', 'role': None}, + {'id': 5, 'type': 'supsub', 'text': '\\supsub', 'role': None}, + {'id': 6, 'type': 'mathord', 'text': 'x', 'role': 'base'}, + {'id': 7, 'type': 'textord', 'text': '3', 'role': 'sup'}, + {'id': 8, 'type': 'rel', 'text': '=', 'role': None}, + {'id': 9, 'type': 'supsub', 'text': '\\supsub', 'role': None}, + {'id': 10, 'type': 'mathord', 'text': 'y', 'role': 'base'}, + {'id': 11, 'type': 'textord', 'text': '2', 'role': 'sup'}, + {'id': 12, 'type': 'mathord', 'text': 'x', 'role': None}, + {'id': 13, 'type': 'bin', 'text': '+', 'role': None}, + {'id': 14, 'type': 'mathord', 'text': 'y', 'role': None}, + {'id': 15, 'type': 'rel', 'text': '=', 'role': None}, + {'id': 16, 'type': 'mathord', 'text': '\\pi', 'role': None}] + >>> fs.ast + [{'val': {'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None}, + 'structure': {'bro': [None, 3], + 'child': [1, 2], + 'father': None, + 'forest': None}}, + {'val': {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base'}, + 'structure': {'bro': [None, 2], + 'child': None, + 'father': 0, + 'forest': [6, 12]}}, + {'val': {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, + 'structure': {'bro': [1, None], 'child': None, 'father': 0, 'forest': None}}, + {'val': {'id': 3, 'type': 'rel', 'text': '=', 'role': None}, + 'structure': {'bro': [0, 4], 'child': None, 'father': None, 'forest': None}}, + {'val': {'id': 4, 'type': 'mathord', 'text': 'y', 'role': None}, + 'structure': {'bro': [3, None], + 'child': None, + 'father': None, + 'forest': [10, 14]}}, + {'val': {'id': 5, 'type': 'supsub', 'text': '\\supsub', 'role': None}, + 'structure': {'bro': [None, 8], + 'child': [6, 7], + 'father': None, + 'forest': None}}, + {'val': {'id': 6, 'type': 'mathord', 'text': 'x', 'role': 'base'}, + show more (open the raw output data in a text editor) ... 
+ >>> fs.variable_standardization()[0] + [{'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None}, {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base', 'var': 0}, {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, {'id': 3, 'type': 'rel', 'text': '=', 'role': None}, {'id': 4, 'type': 'mathord', 'text': 'y', 'role': None, 'var': 1}] + >>> ForestPlotter().export(fs.ast_graph, root_list=[node["val"]["id"] for node in fs.ast if node["structure"]["father"] is None],) + +.. figure:: ../../_static/formulagroup.png + + +文本语法结构解析 -------------------- -.. toctree:: - :maxdepth: 1 - :titlesonly: - - 文本语法结构解析 - 公式语法结构解析 +本部分主要由EduNLP.SIF.Parse模块实现,主要功能为将文本中的字母、数字等进行提取,将其转换为标准格式。 + +此模块主要作为 *中间模块* 来对输入的生文本进行解析处理,用户一般不直接调用此模块。 + +主要流程介绍 ++++++++++++++++ + +1.按照以下顺序,先后对传入的文本进行判断类型 + +* is_chinese:用于匹配中文字符 [\u4e00-\u9fa5] + +* is_alphabet:匹配公式之外的英文字母,将匹配到的只对两个汉字之间的字母做修正(使用$$包裹起来),其余匹配到的情况视为不合 latex 语法录入的公式 + +* is_number:匹配公式之外的数字,只对两个汉字之间的数字做修正(使用$$包裹起来),其余匹配到的情况视为不合 latex 语法录入的公式 + +2.匹配 latex 公式 + +* latex 中出现中文字符,打印且只打印一次 warning + +* 使用_is_formula_legal函数,检查latex公式的完整性和可解析性,对于不合法公式报错 + +调用库 +>>>>>>>>>>>> + +:: + + from EduNLP.SIF.Parser import Parser + +输入 +>>>>>>> + +类型:str + +内容:题目文本 (text) + +:: + + >>> text1 = '生产某种零件的A工厂25名工人的日加工零件数_ _' + >>> text2 = 'X的分布列为( )' + >>> text3 = '① AB是⊙O的直径,AC是⊙O的切线,BC交⊙O于点E.AC的中点为D' + >>> text4 = '支持公式如$\\frac{y}{x}$,$\\SIFBlank$,$\\FigureID{1}$,不支持公式如$\\frac{ \\dddot y}{x}$' + +进行解析 +>>>>>>>>>>>>>>>>>>>> + +:: + + >>> text_parser1 = Parser(text1) + >>> text_parser2 = Parser(text2) + >>> text_parser3 = Parser(text3) + >>> text_parser4 = Parser(text4) + +相关描述参数 +>>>>>>>>>>>> + +- 尝试转换为标准形式 + +:: + + >>> text_parser1.description_list() + >>> print('text_parser1.text:',text_parser1.text) + text_parser1.text: 生产某种零件的$A$工厂$25$名工人的日加工零件数$\SIFBlank$ + >>> text_parser2.description_list() + >>> print('text_parser2.text:',text_parser2.text) + text_parser2.text: $X$的分布列为$\SIFChoice$ + +- 判断是否有语法问题 + +:: + + >>> 
text_parser3.description_list() + >>> print('text_parser3.error_flag: ',text_parser3.error_flag) + text_parser3.error_flag: 1 + >>> text_parser4.description_list() + >>> print('text_parser4.fomula_illegal_flag: ',text_parser4.fomula_illegal_flag) + text_parser4.fomula_illegal_flag: 1 diff --git "a/docs/source/tutorial/zh/parse/\345\205\254\345\274\217\350\257\255\346\263\225\347\273\223\346\236\204\350\247\243\346\236\220.rst" "b/docs/source/tutorial/zh/parse/\345\205\254\345\274\217\350\257\255\346\263\225\347\273\223\346\236\204\350\247\243\346\236\220.rst" index 1a7717fb..94d8517c 100644 --- "a/docs/source/tutorial/zh/parse/\345\205\254\345\274\217\350\257\255\346\263\225\347\273\223\346\236\204\350\247\243\346\236\220.rst" +++ "b/docs/source/tutorial/zh/parse/\345\205\254\345\274\217\350\257\255\346\263\225\347\273\223\346\236\204\350\247\243\346\236\220.rst" @@ -10,52 +10,159 @@ 2.FormulaGroup:如果需要传入公式集则可调用此接口,最终将形成ast森林,森林中树的结构同Formula。 +Formula +>>>>>>>>>>>> -Examples: +Formula 首先在分词功能中对原始文本的公式做切分处理,另外提供 ``公式解析树`` 功能,可以将数学公式的抽象语法分析树用文本或图片的形式表示出来。 + +本模块另提供公式变量标准化的功能,如判断几个子公式内的‘x’为同一变量。 + +初始化 ++++++++++ + +传入参数:item + +item为str 或 List[Dict]类型,具体内容为latex 公式 或 公式经解析后产生的抽象语法分析树。 :: - >>> text = '支持公式如$\\frac{y}{x}$,$\\SIFBlank$,$\\FigureID{1}$,不支持公式如$\\frac{ \\dddot y}{x}$' - >>> text_parser = Parser(text) - >>> text_parser.description_list() - >>> text_parser.fomula_illegal_flag - >>> 1 + >>> f=Formula("x^2 + x+1 = y") + >>> f + + +查看公式切分后的具体内容 +++++++++++++++++++++++++++++ + +- 查看公式切分后的结点元素 :: - >>> f = Formula("x") - >>> f - - >>> f.ast - [{'val': {'id': 0, 'type': 'mathord', 'text': 'x', 'role': None}, 'structure': {'bro': [None, None], 'child': None, 'father': None, 'forest': None}}] - >>> f.elements - [{'id': 0, 'type': 'mathord', 'text': 'x', 'role': None}] - >>> f.variable_standardization(inplace=True) - - >>> f.elements - [{'id': 0, 'type': 'mathord', 'text': 'x', 'role': None, 'var': 0}] + >>> f.elements + [{'id': 0, 'type': 'supsub', 'text': 
'\\supsub', 'role': None}, + {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base'}, + {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, + {'id': 3, 'type': 'bin', 'text': '+', 'role': None}, + {'id': 4, 'type': 'mathord', 'text': 'x', 'role': None}, + {'id': 5, 'type': 'bin', 'text': '+', 'role': None}, + {'id': 6, 'type': 'textord', 'text': '1', 'role': None}, + {'id': 7, 'type': 'rel', 'text': '=', 'role': None}, + {'id': 8, 'type': 'mathord', 'text': 'y', 'role': None}] + +- 查看公式的抽象语法分析树 :: - >>> fg = FormulaGroup(["x + y", "y + x", "z + x"]) - >>> fg - ;;> - >>> fg = FormulaGroup(["x + y", Formula("y + x"), "z + x"]) - >>> fg - ;;> - >>> fg = FormulaGroup(["x", Formula("y"), "x"]) - >>> fg.elements - [{'id': 0, 'type': 'mathord', 'text': 'x', 'role': None}, {'id': 1, 'type': 'mathord', 'text': 'y', 'role': None},\ - {'id': 2, 'type': 'mathord', 'text': 'x', 'role': None}] - >>> fg = FormulaGroup(["x", Formula("y"), "x"], variable_standardization=True) - >>> fg.elements - [{'id': 0, 'type': 'mathord', 'text': 'x', 'role': None, 'var': 0}, {'id': 1, 'type': 'mathord', 'text': 'y', 'role': None, 'var': 1}, {'id': 2, 'type': 'mathord', 'text': 'x', 'role': None, 'var': 0}] - -详细示范 -+++++++++++++++ + >>> f.ast + [{'val': {'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None}, + 'structure': {'bro': [None, 3],'child': [1, 2],'father': None,'forest': None}}, + {'val': {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base'}, + 'structure': {'bro': [None, 2], 'child': None, 'father': 0, 'forest': None}}, + {'val': {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, + 'structure': {'bro': [1, None], 'child': None, 'father': 0, 'forest': None}}, + {'val': {'id': 3, 'type': 'bin', 'text': '+', 'role': None}, + 'structure': {'bro': [0, 4], 'child': None, 'father': None, 'forest': None}}, + {'val': {'id': 4, 'type': 'mathord', 'text': 'x', 'role': None}, + 'structure': {'bro': [3, 5], 'child': None, 'father': None, 'forest': None}}, + {'val': 
{'id': 5, 'type': 'bin', 'text': '+', 'role': None}, + 'structure': {'bro': [4, 6], 'child': None, 'father': None, 'forest': None}}, + {'val': {'id': 6, 'type': 'textord', 'text': '1', 'role': None}, + 'structure': {'bro': [5, 7], 'child': None, 'father': None, 'forest': None}}, + {'val': {'id': 7, 'type': 'rel', 'text': '=', 'role': None}, + 'structure': {'bro': [6, 8], 'child': None, 'father': None, 'forest': None}}, + {'val': {'id': 8, 'type': 'mathord', 'text': 'y', 'role': None}, + 'structure': {'bro': [7, None],'child': None,'father': None,'forest': None}}] + + >>> print('nodes: ',f.ast_graph.nodes) + nodes: [0, 1, 2, 3, 4, 5, 6, 7, 8] + >>> print('edges: ' ,f.ast_graph.edges) + edges: [(0, 1), (0, 2)] + +- 将抽象语法分析树用图片表示 + +:: + + >>> ForestPlotter().export(f.ast_graph, root_list=[node["val"]["id"] for node in f.ast if node["structure"]["father"] is None],) + >>> plt.show() + +.. figure:: ../../../_static/formula.png + +变量标准化 ++++++++++++ + +此参数使得同一变量拥有相同的变量编号。 + +如:``x`` 变量的编号为 ``0``, ``y`` 变量的编号为 ``1``。 + +:: + + >>> f.variable_standardization().elements + [{'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None}, + {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base', 'var': 0}, + {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, + {'id': 3, 'type': 'bin', 'text': '+', 'role': None}, + {'id': 4, 'type': 'mathord', 'text': 'x', 'role': None, 'var': 0}, + {'id': 5, 'type': 'bin', 'text': '+', 'role': None}, + {'id': 6, 'type': 'textord', 'text': '1', 'role': None}, + {'id': 7, 'type': 'rel', 'text': '=', 'role': None}, + {'id': 8, 'type': 'mathord', 'text': 'y', 'role': None, 'var': 1}] + +FormulaGroup +>>>>>>>>>>>>>>> + +调用 ``FormulaGroup`` 类解析公式方程组,相关的属性和函数方法同上。 + +:: -.. 
toctree:: - :titlesonly: + >>> fs = FormulaGroup(["x^2 = y", "x^3 = y^2", "x + y = \pi"]) + >>> fs + ;;> + >>> fs.elements + [{'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None}, + {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base'}, + {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, + {'id': 3, 'type': 'rel', 'text': '=', 'role': None}, + {'id': 4, 'type': 'mathord', 'text': 'y', 'role': None}, + {'id': 5, 'type': 'supsub', 'text': '\\supsub', 'role': None}, + {'id': 6, 'type': 'mathord', 'text': 'x', 'role': 'base'}, + {'id': 7, 'type': 'textord', 'text': '3', 'role': 'sup'}, + {'id': 8, 'type': 'rel', 'text': '=', 'role': None}, + {'id': 9, 'type': 'supsub', 'text': '\\supsub', 'role': None}, + {'id': 10, 'type': 'mathord', 'text': 'y', 'role': 'base'}, + {'id': 11, 'type': 'textord', 'text': '2', 'role': 'sup'}, + {'id': 12, 'type': 'mathord', 'text': 'x', 'role': None}, + {'id': 13, 'type': 'bin', 'text': '+', 'role': None}, + {'id': 14, 'type': 'mathord', 'text': 'y', 'role': None}, + {'id': 15, 'type': 'rel', 'text': '=', 'role': None}, + {'id': 16, 'type': 'mathord', 'text': '\\pi', 'role': None}] + >>> fs.ast + [{'val': {'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None}, + 'structure': {'bro': [None, 3], + 'child': [1, 2], + 'father': None, + 'forest': None}}, + {'val': {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base'}, + 'structure': {'bro': [None, 2], + 'child': None, + 'father': 0, + 'forest': [6, 12]}}, + {'val': {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, + 'structure': {'bro': [1, None], 'child': None, 'father': 0, 'forest': None}}, + {'val': {'id': 3, 'type': 'rel', 'text': '=', 'role': None}, + 'structure': {'bro': [0, 4], 'child': None, 'father': None, 'forest': None}}, + {'val': {'id': 4, 'type': 'mathord', 'text': 'y', 'role': None}, + 'structure': {'bro': [3, None], + 'child': None, + 'father': None, + 'forest': [10, 14]}}, + {'val': {'id': 5, 'type': 'supsub', 'text': '\\supsub', 
'role': None}, + 'structure': {'bro': [None, 8], + 'child': [6, 7], + 'father': None, + 'forest': None}}, + {'val': {'id': 6, 'type': 'mathord', 'text': 'x', 'role': 'base'}, + show more (open the raw output data in a text editor) ... + >>> fs.variable_standardization()[0] + [{'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None}, {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base', 'var': 0}, {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, {'id': 3, 'type': 'rel', 'text': '=', 'role': None}, {'id': 4, 'type': 'mathord', 'text': 'y', 'role': None, 'var': 1}] + >>> ForestPlotter().export(fs.ast_graph, root_list=[node["val"]["id"] for node in fs.ast if node["structure"]["father"] is None],) - 树型处理效果 <../../../build/blitz/formula/tree.ipynb> - 公式解析效果案例 <../../../build/blitz/formula/formula.ipynb> +.. figure:: ../../../_static/formulagroup.png \ No newline at end of file diff --git "a/docs/source/tutorial/zh/parse/\346\226\207\346\234\254\350\257\255\346\263\225\347\273\223\346\236\204\350\247\243\346\236\220.rst" "b/docs/source/tutorial/zh/parse/\346\226\207\346\234\254\350\257\255\346\263\225\347\273\223\346\236\204\350\247\243\346\236\220.rst" index f2f442a0..aaa54b64 100644 --- "a/docs/source/tutorial/zh/parse/\346\226\207\346\234\254\350\257\255\346\263\225\347\273\223\346\236\204\350\247\243\346\236\220.rst" +++ "b/docs/source/tutorial/zh/parse/\346\226\207\346\234\254\350\257\255\346\263\225\347\273\223\346\236\204\350\247\243\346\236\220.rst" @@ -2,7 +2,9 @@ -------------------- 本部分主要由EduNLP.SIF.Parse模块实现,主要功能为将文本中的字母、数字等进行提取,将其转换为标准格式。 - + +此模块主要作为 *中间模块* 来对输入的生文本进行解析处理,用户一般不直接调用此模块。 + 主要流程介绍 +++++++++++++++ @@ -20,20 +22,51 @@ * 使用_is_formula_legal函数,检查latex公式的完整性和可解析性,对于不合法公式报错 -Examples: +输入 +>>>>>>> + +类型:str + +内容:题目文本 (text) :: - >>> text = '生产某种零件的A工厂25名工人的日加工零件数_ _' - >>> text_parser = Parser(text) - >>> text_parser.description_list() - >>> text_parser.text - >>> '生产某种零件的$A$工厂$25$名工人的日加工零件数$\\SIFBlank$' + >>> text1 = 
'生产某种零件的A工厂25名工人的日加工零件数_ _' + >>> text2 = 'X的分布列为( )' + >>> text3 = '① AB是⊙O的直径,AC是⊙O的切线,BC交⊙O于点E.AC的中点为D' + >>> text4 = '支持公式如$\\frac{y}{x}$,$\\SIFBlank$,$\\FigureID{1}$,不支持公式如$\\frac{ \\dddot y}{x}$' -详细示范 -+++++++++++++++ +进行解析 +>>>>>>>>>>>>>>>>>>>> + +:: + + >>> text_parser1 = Parser(text1) + >>> text_parser2 = Parser(text2) + >>> text_parser3 = Parser(text3) + >>> text_parser4 = Parser(text4) + +相关描述参数 +>>>>>>>>>>>> + +- 尝试转换为标准形式 + +:: + + >>> text_parser1.description_list() + >>> print('text_parser1.text:',text_parser1.text) + text_parser1.text: 生产某种零件的$A$工厂$25$名工人的日加工零件数$\SIFBlank$ + >>> text_parser2.description_list() + >>> print('text_parser2.text:',text_parser2.text) + text_parser2.text: $X$的分布列为$\SIFChoice$ + +- 判断是否有语法问题 + +:: -.. toctree:: - :titlesonly: - - 文本语法结构解析的案例 <../../../build/blitz/parse/parse.ipynb> + >>> text_parser3.description_list() + >>> print('text_parser3.error_flag: ',text_parser3.error_flag) + text_parser3.error_flag: 1 + >>> text_parser4.description_list() + >>> print('text_parser4.fomula_illegal_flag: ',text_parser4.fomula_illegal_flag) + text_parser4.fomula_illegal_flag: 1 diff --git a/docs/source/tutorial/zh/pretrain.rst b/docs/source/tutorial/zh/pretrain.rst index 477717a4..ff4d4fed 100644 --- a/docs/source/tutorial/zh/pretrain.rst +++ b/docs/source/tutorial/zh/pretrain.rst @@ -8,13 +8,123 @@ * 如何加载预训练模型 * 公开的预训练模型 -学习路线图 ------------------- +导入模块 +---------- + +:: + + from EduNLP.I2V import get_pretrained_i2v + from EduNLP.Vector import get_pretrained_t2v + +训练模型 +------------ + +如需训练模型则可直接train_vector函数接口,来使使训练模型更加方便。模块调用gensim库中的相关训练模型,目前提供了"sg"、 "cbow"、 "fastext"、 "d2v"、 "bow"、 "tfidf"的训练方法,并提供了embedding_dim参数,使之可以按照需求确定向量的维度。 + +基本步骤 +################## + +1.确定模型的类型,选择适合的Tokenizer(GensimWordTokenizer、 GensimSegTokenizer),使之令牌化; + +2.调用train_vector函数,即可得到所需的预训练模型。 + +Examples: + +:: + + >>> tokenizer = GensimWordTokenizer(symbol="gmas", general=True) + >>> token_item = 
tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\ + ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$") + >>> print(token_item.tokens[:10]) + ['公式', '[FORMULA]', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[FORMULA]'] + + # 10 dimension with fasstext method + train_vector(sif_items, "../../../data/w2v/gensim_luna_stem_tf_", 10, method="d2v") + + +装载模型 +-------- + +将所得到的模型传入I2V模块即可装载模型 + +Examples: + +:: + + >>> model_path = "../test_model/test_gensim_luna_stem_tf_d2v_256.bin" + >>> i2v = D2V("text","d2v",filepath=model_path, pretrained_t2v = False) + +公开模型一览 +------------ + +版本说明 +################## + +一级版本 + +* 公开版本1(luna_pub):高考 +* 公开版本2( luna_pub_large):高考 + 地区试题 + +二级版本: + +* 小科(Chinese,Math,English,History,Geography,Politics,Biology,Physics,Chemistry) +* 大科(理科science、文科literal、全科all) + +三级版本:【待完成】 + +* 不使用第三方初始化词表 +* 使用第三方初始化词表 + +模型训练数据说明 +################## + +* 当前【词向量w2v】【句向量d2v】模型所用的数据均为 【高中学段】 的题目 +* 测试数据:`[OpenLUNA.json] `_ + +当前提供以下模型,更多分学科、分题型模型正在训练中,敬请期待 + "d2v_all_256"(全科),"d2v_sci_256"(理科),"d2v_eng_256"(英语),"d2v_lit_256"(文科) + + +模型训练案例 +------------ + +获得数据集 +#################### + +.. toctree:: + :maxdepth: 1 + :titlesonly: + + prepare_dataset <../../build/blitz/pretrain/prepare_dataset.ipynb> + +gensim模型d2v例子 +#################### + +.. toctree:: + :maxdepth: 1 + :titlesonly: + + d2v_bow_tfidf <../../build/blitz/pretrain/gensim/d2v_bow_tfidf.ipynb> + d2v_general <../../build/blitz/pretrain/gensim/d2v_general.ipynb> + d2v_stem_tf <../../build/blitz/pretrain/gensim/d2v_stem_tf.ipynb> + +gensim模型w2v例子 +#################### + +.. toctree:: + :maxdepth: 1 + :titlesonly: + + w2v_stem_text <../../build/blitz/pretrain/gensim/w2v_stem_text.ipynb> + w2v_stem_tf <../../build/blitz/pretrain/gensim/w2v_stem_tf.ipynb> + +seg_token例子 +#################### .. 
toctree:: :maxdepth: 1 :titlesonly: - 训练模型 - 装载模型 - 公开模型一览 + d2v.ipynb <../../build/blitz/pretrain/seg_token/d2v.ipynb> + d2v_d1 <../../build/blitz/pretrain/seg_token/d2v_d1.ipynb> + d2v_d2 <../../build/blitz/pretrain/seg_token/d2v_d2.ipynb> \ No newline at end of file diff --git a/docs/source/tutorial/zh/seg.rst b/docs/source/tutorial/zh/seg.rst index e1e1c0db..f65a2b41 100644 --- a/docs/source/tutorial/zh/seg.rst +++ b/docs/source/tutorial/zh/seg.rst @@ -14,13 +14,173 @@ 2.将输入的item按照元素类型进行切分、分组。 -学习路线图 --------------------- +语义成分分解 +------------ + +由于选择题是以字典的形式给出,故需要将其在保留数据类型关系的情况下转换为文本格式。dict2str4sif函数就是实现此功能的一个模块,该模块可以将选择题形式的item转换为字符格式,并将题干和选项、各选项之间分割开来。 + +导入库 ++++++++++ + +:: + + from EduNLP.utils import dict2str4sif + +基础使用方法 +++++++++++++++++++ + +:: + + >>> item = { + ... "stem": r"若复数$z=1+2 i+i^{3}$,则$|z|=$", + ... "options": ['0', '1', r'$\sqrt{2}$', '2'], + ... } + >>> dict2str4sif(item) # doctest: +ELLIPSIS + '$\\SIFTag{stem_begin}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem_end}$$\\SIFTag{options_begin}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$$\\SIFTag{list_3}$2$\\SIFTag{options_end}$' + +可选的的额外参数/接口 +++++++++++++++++++++++ + +1.add_list_no_tag:当此参数为True较False时区别在于是否需要将选项部分的标签计数 + +:: + + >>> dict2str4sif(item, add_list_no_tag=True) # doctest: +ELLIPSIS + '$\\SIFTag{stem_begin}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem_end}$$\\SIFTag{options_begin}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$$\\SIFTag{list_3}$2$\\SIFTag{options_end}$' + + >>> dict2str4sif(item, add_list_no_tag=False) # doctest: +ELLIPSIS + '$\\SIFTag{stem_begin}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem_end}$$\\SIFTag{options_begin}$0$\\SIFSep$1$\\SIFSep$$\\sqrt{2}$$\\SIFSep$2$\\SIFTag{options_end}$' + +2.tag_mode:此参数为选择标签所在位置,delimiter为头尾都加标签,head为仅头部加标签,tail为仅尾部加标签 + +:: + + >>> dict2str4sif(item, tag_mode="head") # doctest: +ELLIPSIS + '$\\SIFTag{stem}$若复数$z=1+2 
i+i^{3}$,则$|z|=$$\\SIFTag{options}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$$\\SIFTag{list_3}$2' + + >>> dict2str4sif(item, tag_mode="tail") # doctest: +ELLIPSIS + '若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$$\\SIFTag{list_3}$2$\\SIFTag{options}$' + +3.key_as_tag:当其为False时则不区分切分标签的类型,而是仅在选项之间加入$\SIFSep$ + +:: + + >>> dict2str4sif(item, key_as_tag=False) + '若复数$z=1+2 i+i^{3}$,则$|z|=$0$\\SIFSep$1$\\SIFSep$$\\sqrt{2}$$\\SIFSep$2' + +结构成分分解 +------------ + +对切片后的item中的各个元素进行分词,提供深度选项,可以按照需求选择所有地方切分或者在部分标签处切分(比如\SIFSep、\SIFTag处);对标签添加的位置也可以进行选择,可以在头尾处添加或仅在头或尾处添加。 + +具有两种模式: + +* linear模式,用于对文本进行处理(使用jieba库进行分词); + +* ast模式,用于对公式进行解析。 + +基础分解流程: + +- 使用正则匹配方法匹配出各个组成成分 + +- 对特殊结构的成分进行处理,如将base64编码的图片转为numpy形式 + +- 将当前元素分类放入各个元素组中 + +- 按照需求输入相应的参数得到筛选后的结果 + +导入库 ++++++++++ + +:: + + from EduNLP.SIF.segment import seg + from EduNLP.SIF import sif4sci + +基础使用方法 +++++++++++++++++++ + +:: + + >>> test_item = r"如图所示,则$\bigtriangleup ABC$的面积是$\SIFBlank$。$\FigureID{1}$" + >>> seg(test_item) + >>> ['如图所示,则', '\\bigtriangleup ABC', '的面积是', '\\SIFBlank', '。', \FigureID{1}] + +可选的的额外参数/接口 +++++++++++++++++++++++ + +1.describe:可以统计出各种类型元素的数量 + +:: + + >>> s.describe() + {'t': 3, 'f': 1, 'g': 1, 'm': 1} + +2.filter:可以选择性的筛除某种或几种类型的元素 + +此接口可传入keep参数来选择需要保留的元素类型,也可直接传入特殊字符来筛除特定元素类型 + +各字母所代表的元素类型: + +- "t": text +- "f": formula +- "g": figure +- "m": question mark +- "a": tag +- "s": sep tag + +:: + + >>> with s.filter("f"): + ... s + ['如图所示,则', '的面积是', '\\SIFBlank', '。', \FigureID{1}] + >>> with s.filter(keep="t"): + ... 
s + ['如图所示,则', '的面积是', '。'] + +3.symbol:选择性的将部分类型的数据转换为特殊符号遮掩起来 + +symbol所代表的元素类型: + +- "t": text +- "f": formula +- "g": figure +- "m": question mark + +:: + + >>> seg(test_item, symbol="fgm") + ['如图所示,则', '[FORMULA]', '的面积是', '[MARK]', '。', '[FIGURE]'] + >>> seg(test_item, symbol="tfgm") + ['[TEXT]', '[FORMULA]', '[TEXT]', '[MARK]', '[TEXT]', '[FIGURE]'] + +此外,当前还提供了sif4sci函数,其可以很方便的将item转换为结构成分分解后的结果 + +:: + + >>> segments = sif4sci(item["stem"], figures=figures, tokenization=False) + >>> segments + ['如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形', 'ABC', '的斜边', 'BC', ', 直角边', 'AB', ', ', 'AC', '.', '\\bigtriangleup ABC', '的三边所围成的区域记为', 'I', ',黑色部分记为', 'II', ', 其余部分记为', 'III', '.在整个图形中随机取一点,此点取自', 'I,II,III', '的概率分别记为', 'p_1,p_2,p_3', ',则', '\\SIFChoice', \FigureID{1}] + +- 调用此函数时,可以按照需求选择性的输出某一类型的数据 + +:: + + >>> segments.formula_segments + ['ABC', + 'BC', + 'AB', + 'AC', + '\\bigtriangleup ABC', + 'I', + 'II', + 'III', + 'I,II,III', + 'p_1,p_2,p_3'] -.. toctree:: - :maxdepth: 1 - :titlesonly: +- 与seg函数类似,sif4sci也提供了标记化切分选项通过修改 ``symbol`` 参数来将不同的成分转化成特定标记,方便您的研究 - 语义成分分解 - 结构成分分解 +:: + >>> sif4sci(item["stem"], figures=figures, tokenization=False, symbol="tfgm") + ['[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[MARK]', '[FIGURE]'] diff --git "a/docs/source/tutorial/zh/seg/\347\273\223\346\236\204\346\210\220\345\210\206\345\210\206\350\247\243.rst" "b/docs/source/tutorial/zh/seg/\347\273\223\346\236\204\346\210\220\345\210\206\345\210\206\350\247\243.rst" index 13ae96ca..bffe1b64 100644 --- "a/docs/source/tutorial/zh/seg/\347\273\223\346\236\204\346\210\220\345\210\206\345\210\206\350\247\243.rst" +++ "b/docs/source/tutorial/zh/seg/\347\273\223\346\236\204\346\210\220\345\210\206\345\210\206\350\247\243.rst" @@ -26,6 +26,16 @@ 2.filter:可以选择性的筛除某种或几种类型的元素 
+此接口可传入keep参数来选择需要保留的元素类型,也可直接传入特殊字符来筛除特定元素类型 + +各字母所代表的元素类型: + "t": text + "f": formula + "g": figure + "m": question mark + "a": tag + "s": sep tag + :: >>> with s.filter("f"): @@ -37,17 +47,15 @@ 3.symbol:选择性的将部分类型的数据转换为特殊符号遮掩起来 +symbol所代表的元素类型: + "t": text + "f": formula + "g": figure + "m": question mark + :: >>> seg(test_item, symbol="fgm") ['如图所示,则', '[FORMULA]', '的面积是', '[MARK]', '。', '[FIGURE]'] >>> seg(test_item, symbol="tfgm") ['[TEXT]', '[FORMULA]', '[TEXT]', '[MARK]', '[TEXT]', '[FIGURE]'] - -详细示范 -+++++++++++ - -.. toctree:: - :titlesonly: - - 结构成分分解的案例 <../../../build/blitz/seg/seg.ipynb> diff --git "a/docs/source/tutorial/zh/seg/\350\257\255\344\271\211\346\210\220\345\210\206\345\210\206\350\247\243.rst" "b/docs/source/tutorial/zh/seg/\350\257\255\344\271\211\346\210\220\345\210\206\345\210\206\350\247\243.rst" index 0950dd87..8c709a89 100644 --- "a/docs/source/tutorial/zh/seg/\350\257\255\344\271\211\346\210\220\345\210\206\345\210\206\350\247\243.rst" +++ "b/docs/source/tutorial/zh/seg/\350\257\255\344\271\211\346\210\220\345\210\206\345\210\206\350\247\243.rst" @@ -44,12 +44,4 @@ :: >>> dict2str4sif(item, key_as_tag=False) - '若复数$z=1+2 i+i^{3}$,则$|z|=$0$\\SIFSep$1$\\SIFSep$$\\sqrt{2}$$\\SIFSep$2' - -详细示范 -++++++++++++++++++++++ - -.. 
toctree:: - :titlesonly: - - 语义成分分解的案例 <../../../build/blitz/utils/data.ipynb> + '若复数$z=1+2 i+i^{3}$,则$|z|=$0$\\SIFSep$1$\\SIFSep$$\\sqrt{2}$$\\SIFSep$2' \ No newline at end of file diff --git a/docs/source/tutorial/zh/sif.rst b/docs/source/tutorial/zh/sif.rst index 0bb9f2ae..0d34eb91 100644 --- a/docs/source/tutorial/zh/sif.rst +++ b/docs/source/tutorial/zh/sif.rst @@ -87,6 +87,42 @@ version: 0.2 例如:``则$a$的取值范围是(\u3000\u3000)`` +判断是否为sif格式和转换为sif格式的函数 +-------------------------------------------- + +调用库 +++++++++ +:: + + from EduNLP.SIF import is_sif, to_sif + +is_sif ++++++++++++ + +:: + + >>> text1 = '若$x,y$满足约束条件' + >>> text2 = '$\\left\\{\\begin{array}{c}2 x+y-2 \\leq 0 \\\\ x-y-1 \\geq 0 \\\\ y+1 \\geq 0\\end{array}\\right.$,' + >>> text3 = '则$z=x+7 y$的最大值$\\SIFUnderline$' + >>> text4 = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...' + >>> is_sif(text1) + True + >>> is_sif(text2) + True + >>> is_sif(text3) + True + >>> is_sif(text4) + False + +to_sif ++++++++++++ + +:: + + >>> text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...' + >>> to_sif(text) + '某校一个课外学习小组为研究某作物的发芽率$y$和温度$x$(单位...' + Change Log ---------------- @@ -106,4 +142,3 @@ Change Log 1. 注明 ``$$`` 之中不能出现换行符。 2. 添加文本标注格式说明。 - diff --git a/docs/source/tutorial/zh/tokenization/PureTextTokenizer.ipynb b/docs/source/tutorial/zh/tokenization/PureTextTokenizer.rst similarity index 100% rename from docs/source/tutorial/zh/tokenization/PureTextTokenizer.ipynb rename to docs/source/tutorial/zh/tokenization/PureTextTokenizer.rst diff --git a/docs/source/tutorial/zh/tokenize.rst b/docs/source/tutorial/zh/tokenize.rst index ce719757..4982f031 100644 --- a/docs/source/tutorial/zh/tokenize.rst +++ b/docs/source/tutorial/zh/tokenize.rst @@ -16,13 +16,157 @@ 具有两种模式,一种是linear模式,用于对文本进行处理(使用jieba库进行分词);一种是ast模式,用于对公式进行解析。 -学习路线图 --------------------- +分词 +------- -.. 
toctree:: - :maxdepth: 1 - :titlesonly: +词解析(text-tokenization):一个句子(不含公式)是由若干“词”按顺序构成的,将一个句子切分为若干词的过程称为“词解析”。根据词的粒度大小,又可细分为“词组解析”和"单字解析"。 + +:: + + - 词组解析 (word-tokenization):每一个词组为一个“令牌”(token)。 + + - 单字解析 (char-tokenization):单个字符即为一个“令牌”(token)。 + + +词解析分为两个主要步骤: + +1. 分词: + + - 词组解析:使用分词工具切分并提取题目文本中的词。本项目目前支持的分词工具有:`jieba` + + - 单字解析:按字符划分。 + +2. 筛选:过滤指定的停用词。 + + 本项目默认使用的停用词表:`[stopwords] `_ + 你也可以使用自己的停用词表,具体使用方法见下面的示例。 + +Examples: + +:: + + from EduNLP.SIF.tokenization.text import tokenize + >>> text = "三角函数是基本初等函数之一" + >>> tokenize(text, granularity="word") + ['三角函数', '初等', '函数'] - 分词 - 分句 - 令牌化 + >>> tokenize(text, granularity="char") + ['三', '角', '函', '数', '基', '初', '函', '数'] + +分句 +------- + +将较长的文档切分成若干句子的过程称为“分句”。每个句子为一个“令牌”(token)(待实现)。 + +令牌化 +------- +即综合解析,将带公式的句子切分为若干标记的过程。每个标记为一个“令牌”(token)。 + +此功能对应的实现函数为tokenize,将已经经过结构成分分解后的item传入其中即可得到所需结果。 + +:: + + from EduNLP.Tokenizer import get_tokenizer + >>> items = "如图所示,则三角形$ABC$的面积是$\\SIFBlank$。$\\FigureID{1}$" + >>> tokenize(SegmentList(items)) + ['如图所示', '三角形', 'ABC', '面积', '\\\\SIFBlank', \\FigureID{1}] + >>> tokenize(SegmentList(items),formula_params={"method": "ast"}) + ['如图所示', '三角形', , '面积', '\\\\SIFBlank', \\FigureID{1}] + + + +我们提供了多种已经封装好的令牌化器供用户便捷调用,通过查看 ``./EduNLP/Tokenizer/tokenizer.py`` 及 ``./EduNLP/Pretrain/gensim_vec.py`` 可以查看更多令牌化器,下面是一个完整的令牌化器列表: + +- TextTokenizer + +- PureTextTokenizer + +- GensimSegTokenizer + +- GensimWordTokenizer + + +TextTokenizer ++++++++++++++++++++++ + +即文本令牌解析器,在默认情况下对传入的item中的图片、标签、分隔符、题目空缺符等部分则转换成特殊字符进行保护,从而对文本、公式进行令牌化操作。此外,此令牌解析器对文本、公式均采用线性的分析方法,并提供的key参数用于对传入的item进行预处理,待未来根据需求进行开发。 + +:: + + >>> items = ["已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$"] + >>> tokenizer = TextTokenizer() + >>> tokens = tokenizer(items) + >>> next(tokens) # doctest: +NORMALIZE_WHITESPACE + ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', + '0', '\\right', 
'\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', + '\\quad', 'A', '\\cap', 'B', '='] + >>> items = [{ + ... "stem": "已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$", + ... "options": ["1", "2"] + ... }] + >>> tokens = tokenizer(items, key=lambda x: x["stem"]) + >>> next(tokens) # doctest: +NORMALIZE_WHITESPACE + ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', + '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', + '\\quad', 'A', '\\cap', 'B', '='] + +PureTextTokenizer ++++++++++++++++++++++ + +即纯净型文本令牌解析器,在默认情况下对传入的item中的图片、标签、分隔符、题目空缺符等部分则转换成特殊字符进行保护,并对特殊公式(例如:$\\FormFigureID{...}$, $\\FormFigureBase64{...}$)进行筛除,从而对文本、纯文本公式进行令牌化操作。此外,此令牌解析器对文本、公式均采用线性的分析方法,并提供的key参数用于对传入的item进行预处理,待未来根据需求进行开发。 + + +:: + + >>> tokenizer = PureTextTokenizer() + >>> items = ["有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\ + ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$"] + >>> tokens = tokenizer(items) + >>> next(tokens)[:10] + ['公式', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[SEP]', 'z'] + >>> items = ["已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$"] + >>> tokens = tokenizer(items) + >>> next(tokens) # doctest: +NORMALIZE_WHITESPACE + ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', + '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', + '\\quad', 'A', '\\cap', 'B', '='] + >>> items = [{ + ... "stem": "已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$", + ... "options": ["1", "2"] + ... 
}] + >>> tokens = tokenizer(items, key=lambda x: x["stem"]) + >>> next(tokens) # doctest: +NORMALIZE_WHITESPACE + ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', + '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', + '\\quad', 'A', '\\cap', 'B', '='] + +GensimWordTokenizer ++++++++++++++++++++++++ + +此令牌解析器在默认情况下对传入的item中的图片、题目空缺符等部分转换成特殊字符进行保护,从而对文本、公式、标签、分隔符进行令牌化操作。此外,从令牌化方法而言,此令牌解析器对文本均采用线性的分析方法,而对公式采用抽象语法树的分析方法,提供了general参数可供使用者选择:当general为true的时候则代表着传入的item并非标准格式,此时对公式也使用线性的分析方法;当general为false时则代表使用抽象语法树的方法对公式进行解析。 + +GensimSegTokenizer +++++++++++++++++++++ + +此令牌解析器在默认情况下对传入的item中的图片、分隔符、题目空缺符等部分则转换成特殊字符进行保护,从而对文本、公式、标签进行令牌化操作。此外,从令牌化方法而言,此令牌解析器对文本均采用线性的分析方法,而对公式采用抽象语法树的分析方法。 + +与GensimWordTokenizer相比,GensimSegTokenizer解析器主要区别是: + +* 提供了切分深度的选项,即可以在sep标签或者tag标签处进行切割 +* 默认在item组分(如text、formula)的头部插入开始标签 + +Examples +---------- + +:: + + >>> tokenizer = GensimWordTokenizer(symbol="gmas", general=True) + >>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\ + ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$") + >>> print(token_item.tokens[:10]) + ['公式', '[FORMULA]', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[FORMULA]'] + >>> tokenizer = GensimWordTokenizer(symbol="fgmas", general=False) + >>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\ + ... 
若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$") + >>> print(token_item.tokens[:10]) + ['公式', '[FORMULA]', '如图', '[FIGURE]', '[FORMULA]', '约束条件', '公式', '[FORMULA]', '[SEP]', '[FORMULA]'] diff --git a/docs/source/tutorial/zh/vectorization.rst b/docs/source/tutorial/zh/vectorization.rst index 89175ba6..8c57cac7 100644 --- a/docs/source/tutorial/zh/vectorization.rst +++ b/docs/source/tutorial/zh/vectorization.rst @@ -3,6 +3,10 @@ 此部分提供了简便的接口,可以直接将传入的items经过转化得到向量。当前提供了是否使用预训练模型的选项,可根据需要进行选择,如不使用预训练模型则可直接调用D2V函数,使用预训练模型则调用get_pretrained_i2v函数。 +- 不使用预训练模型 + +- 使用预训练模型 + 总体流程 --------------------------- @@ -14,13 +18,138 @@ 4.使用已有或者使用提供的预训练模型,将令牌化后的item转换为向量。 -学习路线图 ---------------------------- -.. toctree:: - :maxdepth: 1 - :titlesonly: +不使用预训练模型:直接调用已有模型 +------------------------------------ + +使用自己提供的任一预训练模型(给出模型存放路径即可)将给定的题目文本转成向量。 + +* 优点:可以使用自己的模型,另可调整训练参数,灵活性强。 + +导入模块 +++++++++++ + +:: + + from EduNLP.I2V import D2V,W2V,get_pretrained_i2v + from EduNLP.Vector import T2V,get_pretrained_t2v + +提供的模型类型 +++++++++++++++++++++ + +- W2V + +- D2V + +- T2V + +W2V +<<<<<<<<< + +此模型方法直接使用gensim库中的相关模型方法,将传入的word转换为vector,当前提供一下四种方法: + + - FastText + + - Word2Vec + + - KeyedVectors + +:: + + >>> i2v = get_pretrained_i2v("test_w2v", "examples/test_model/data/w2v") # doctest: +ELLIPSIS + >>> item_vector, token_vector = i2v(["有学者认为:‘学习’,必须适应实际"]) + >>> item_vector # doctest: +ELLIPSIS + array([[...]], dtype=float32) + +D2V +<<<<<<<<<<<< + +此模型方法可以将item转换为vector,是一个综合性的处理方法,当前提供以下方法: + +- d2v:调用gensim库中的Doc2Vec,来使item转换为vector + +- BowLoader:调用gensim库中的corpora模块将doc转化为bow + +- TfidfLoader:调用gensim库中的TfidfModel模块将doc转化为bow + +:: + + >>> item = {"如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\SIFChoice$$\FigureID{1}$"} + >>> model_path = "../test_model/test_gensim_luna_stem_tf_d2v_256.bin" + >>> i2v = 
D2V("text","d2v",filepath=model_path, pretrained_t2v = False) + >>> i2v(item) + ([array([ 4.76559885e-02, -1.60574958e-01, 1.94614579e-03, 2.40295693e-01, + 2.24517003e-01, -3.24351490e-02, 4.35789041e-02, -1.65670961e-02,... + +T2V +<<<<<<<<<< + +使用自己提供的任一预训练模型(给出模型存放路径即可)将一组题目的切分序列表征为向量。 + +- 优点:模型及其参数可自主调整,灵活性强。 + +输入 +^^^^^^^^^^ + +类型:list +内容:一个题组中每个题目切分序列的组合。 +> 使用 ``GensimWordTokenizer`` 模型即可将题目文本(`str` 类型)转换成 tokens。 + +:: + + >>> token_items=['公式','[FORMULA]','公式','[FORMULA]','如图','[FIGURE]','x',',','y','约束条件','[SEP]','z','=','x','+','7','y','最大值','[MARK]'] + >>> path = "../test_model/test_gensim_luna_stem_tf_d2v_256.bin" + >>> t2v = T2V('d2v',filepath=path) + >>> t2v(token_items) + [array([ 0.0256574 , 0.06061139, -0.00121044, -0.0167674 , -0.0111706 , + 0.05325712, -0.02097339, -0.01613594, 0.02904145, 0.0185046 ,... + +处理的具体流程 +++++++++++++++++++++ + +1.调用get_tokenizer函数,得到经过分词后的结果; + +2.根据使用的模型,选择提供的模型类型,进行向量化处理。 + + +使用预训练模型:直接调用get_pretrained_i2v +--------------------------------------------- + +使用 EduNLP 项目组给定的预训练模型将给定的题目文本转成向量。 + +* 优点:简单方便。 + +* 缺点:只能使用项目中给定的模型,局限性较大。 + +* 调用此函数即可获得相应的预训练模型,目前提供以下的预训练模型:d2v_all_256、d2v_sci_256、d2v_eng_256、d2v_lit_256 + +模型选择与使用 +################## + +根据题目所属学科选择预训练模型: + ++--------------------+------------------------+ +| 预训练模型名称 | 模型训练数据的所属学科 | ++====================+========================+ +| d2v_all_256 | 全学科 | ++--------------------+------------------------+ +| d2v_sci_256 | 理科 | ++--------------------+------------------------+ +| d2v_lit_256 | 文科 | ++--------------------+------------------------+ +| d2v_eng_256 | 英语 | ++--------------------+------------------------+ + +处理的具体流程 +################## + +1.下载相应的预处理模型 + +2.将所得到的模型传入D2V,使用D2V进行处理 + +Examples: - 不使用预训练模型 - 使用预训练模型 +:: + >>> i2v = get_pretrained_i2v("d2v_sci_256") + >>> i2v(item) diff --git "a/docs/source/tutorial/zh/vectorization/\344\270\215\344\275\277\347\224\250\351\242\204\350\256\255\347\273\203\346\250\241\345\236\213.rst" 
"b/docs/source/tutorial/zh/vectorization/\344\270\215\344\275\277\347\224\250\351\242\204\350\256\255\347\273\203\346\250\241\345\236\213.rst" index 5a26588f..04f21712 100644 --- "a/docs/source/tutorial/zh/vectorization/\344\270\215\344\275\277\347\224\250\351\242\204\350\256\255\347\273\203\346\250\241\345\236\213.rst" +++ "b/docs/source/tutorial/zh/vectorization/\344\270\215\344\275\277\347\224\250\351\242\204\350\256\255\347\273\203\346\250\241\345\236\213.rst" @@ -11,7 +11,7 @@ 1.调用get_tokenizer函数,得到经过分词后的结果; -2.调用T2V模块,根据需要选择是否使用预训练的t2v模型 +2.调用D2V或W2V等模块,根据需要选择是否使用预训练的t2v模型 Examples: diff --git a/examples/formula/formula.ipynb b/examples/formula/formula.ipynb index f748a90a..a9626563 100644 --- a/examples/formula/formula.ipynb +++ b/examples/formula/formula.ipynb @@ -3,13 +3,13 @@ { "cell_type": "markdown", "source": [ - "# Formula\n", - "\n", - "## 概述\n", - "\n", - "Formula 首先在分词功能中对原始文本的公式做切分处理,另外提供 [公式解析树] 功能,可以将数学公式的抽象语法分析树用文本或图片的形式表示出来。 \n", - "\n", - "本模块另提供公式变量标准化的功能,如判断几个子公式内的‘x’为同一变量。" + "# Formula\r\n", + "\r\n", + "## 概述\r\n", + "\r\n", + "Formula 首先在分词功能中对原始文本的公式做切分处理,另外提供多种功能使之能够适应多种用户需求,例如 [公式解析树] 功能,可以将数学公式的抽象语法分析树用文本或图片的形式表示出来;又如[公式变量标准化]的功能,能判断几个子公式内的‘x’为同一变量。\r\n", + "\r\n", + "由于本部分常作为中间模块,故仅展示基本调用方法,如需更进一步学习模块相关参数请参见对应文档。" ], "metadata": {} }, @@ -17,9 +17,9 @@ "cell_type": "code", "execution_count": 1, "source": [ - "import matplotlib.pyplot as plt\n", - "from EduNLP.Formula import Formula\n", - "from EduNLP.Formula import FormulaGroup\n", + "import matplotlib.pyplot as plt\r\n", + "from EduNLP.Formula import Formula\r\n", + "from EduNLP.Formula import FormulaGroup\r\n", "from EduNLP.Formula.viz import ForestPlotter" ], "outputs": [], @@ -41,7 +41,7 @@ "cell_type": "code", "execution_count": 2, "source": [ - "f = Formula(\"x^2 + x+1 = y\")\n", + "f = Formula(\"x^2 + x+1 = y\")\r\n", "f " ], "outputs": [ @@ -60,188 +60,6 @@ "collapsed": true } }, - { - "cell_type": "markdown", - "source": [ - "- 查看公式切分后的结点元素:" - ], - "metadata": {} - }, - { 
- "cell_type": "code", - "execution_count": 3, - "source": [ - "f.elements" - ], - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "[{'id': 0, 'type': 'supsub', 'text': '\\\\supsub', 'role': None},\n", - " {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base'},\n", - " {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'},\n", - " {'id': 3, 'type': 'bin', 'text': '+', 'role': None},\n", - " {'id': 4, 'type': 'mathord', 'text': 'x', 'role': None},\n", - " {'id': 5, 'type': 'bin', 'text': '+', 'role': None},\n", - " {'id': 6, 'type': 'textord', 'text': '1', 'role': None},\n", - " {'id': 7, 'type': 'rel', 'text': '=', 'role': None},\n", - " {'id': 8, 'type': 'mathord', 'text': 'y', 'role': None}]" - ] - }, - "metadata": {}, - "execution_count": 3 - } - ], - "metadata": {} - }, - { - "cell_type": "markdown", - "source": [ - "- 查看公式的抽象语法分析树:" - ], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 4, - "source": [ - "f.ast " - ], - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "[{'val': {'id': 0, 'type': 'supsub', 'text': '\\\\supsub', 'role': None},\n", - " 'structure': {'bro': [None, 3],\n", - " 'child': [1, 2],\n", - " 'father': None,\n", - " 'forest': None}},\n", - " {'val': {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base'},\n", - " 'structure': {'bro': [None, 2], 'child': None, 'father': 0, 'forest': None}},\n", - " {'val': {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'},\n", - " 'structure': {'bro': [1, None], 'child': None, 'father': 0, 'forest': None}},\n", - " {'val': {'id': 3, 'type': 'bin', 'text': '+', 'role': None},\n", - " 'structure': {'bro': [0, 4], 'child': None, 'father': None, 'forest': None}},\n", - " {'val': {'id': 4, 'type': 'mathord', 'text': 'x', 'role': None},\n", - " 'structure': {'bro': [3, 5], 'child': None, 'father': None, 'forest': None}},\n", - " {'val': {'id': 5, 'type': 'bin', 'text': '+', 'role': None},\n", - " 
'structure': {'bro': [4, 6], 'child': None, 'father': None, 'forest': None}},\n", - " {'val': {'id': 6, 'type': 'textord', 'text': '1', 'role': None},\n", - " 'structure': {'bro': [5, 7], 'child': None, 'father': None, 'forest': None}},\n", - " {'val': {'id': 7, 'type': 'rel', 'text': '=', 'role': None},\n", - " 'structure': {'bro': [6, 8], 'child': None, 'father': None, 'forest': None}},\n", - " {'val': {'id': 8, 'type': 'mathord', 'text': 'y', 'role': None},\n", - " 'structure': {'bro': [7, None],\n", - " 'child': None,\n", - " 'father': None,\n", - " 'forest': None}}]" - ] - }, - "metadata": {}, - "execution_count": 4 - } - ], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 13, - "source": [ - "print('nodes: ',f.ast_graph.nodes)\n", - "print('edges: ' ,f.ast_graph.edges)\n" - ], - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "nodes: [0, 1, 2, 3, 4, 5, 6, 7, 8]\n", - "edges: [(0, 1), (0, 2)]\n" - ] - } - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } - }, - { - "cell_type": "code", - "execution_count": 17, - "source": [ - "ForestPlotter().export(\n", - " f.ast_graph, root_list=[node[\"val\"][\"id\"] for node in f.ast if node[\"structure\"][\"father\"] is None],\n", - ")\n", - "plt.show()" - ], - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/plain": [ - "
" - ], - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAV0AAADnCAYAAAC9roUQAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAAAWAElEQVR4nO3dW3BT94HH8e9fV0u2ZRswGEPC/ZaEhBAIuW2apM10uk3KtLPdzs4+93ln9mH7ug/70vdt+7Az2Z3pdHZ220mbaTZt2iYQciOhBBICOKHcwTeChWTdjnR0zj7IdoFgY8vS/xzC7/NEIkv/3zmSf4j//38k4/s+IiJiRyToACIidxOVroiIRSpdERGLVLoiIhapdEVELFLpiohYpNIVEbFIpSsiYpFKV0TEIpWuiIhFKl0REYtUuiIiFql0RUQsUumKiFik0hURsUilKyJikUpXRMQila6IiEUqXRERi1S6IiIWqXRFRCxS6YqIWKTSFRGxSKUrImKRSldExCKVroiIRSpdERGLVLoiIhapdEVELIrZHCwST476bnWFzTFnY2KJMa/mDLTr8W0f662OJ0zne9rNOYPIGPZzNdtrM+jXVJjO0fXa/bvcasb3fXuDGeOv+dGr1saby/kfv4Dv+6Zdj2/7WG91PGE639NuzhlExrCfq9lem0G/psJ0jq7X7t/lVgv19EL57Eczf8598PINt/l1l+z+/+Tau/9tO1bLzHl8bpX8oVfI7nspNJkAnNG/cPX1n9iMdNtM4y//G4Xj+2xGAm6fK3/4t+Q/fBnfq4cik1erkD/0CuO//Fc8pxSKTL7vk93/X0z88WfUy3lrmYIU6tKtjp/DzY2TPfBznOGTeLXKzBNYHT9Lav0uYpnld+yTNdfxmViC+LJ7qZcnQ5PJr7vUrpwj1rsyNJkAIh3d+DXHaqbb5fIqBaojn4OJhiZTJN5BZvdeEoNbiCTTochkjMGvlfHrLpGEvUxBCnXpAjiXT9L14PMklq0hEu8gtW5n0JFaaq7jS617mOTgFvy6G4pM1fEz1AsTOJeOUy/lQpEJYNnf/hNepYAXQPHOlsv3PWI9A8SXrqY6djoUmQDc3BixHvvTsrNl8qoVkqvvJ735CWpfXLCeKwihL93kqq2UPnuX2tWLjb8hzxwGILF8HeUzh3Hz40RTmYBTNm+243MLE+Te/9/GL2zE7rul2TIlV26m5/G/J7n6fqLpnlBkqpfz5A7+knphgkg8aTXTXLmiqQwYqJz/mHjfYCgyAZROHSS98VGreebKZCJRKhc/pXzuCLGe5dZzBUELaW0S9KJHEBnmIwyLM2E/V1pIWxgtpImIyKwCKd1brfR+lcZrZYbr73f94lG7x52LmxujOPQOlQuf4IycmvXnpv8VtZAMQZ2nVo6/kNubOUfzHXMuN/8L18ZzNJd2vqbCxurFEdPc3CilUwdxs8N073yRyY9/jzM8RHJgMx1rH6L8lw/xvTrpzY/jZodJb34CgNzBXxFNZ8BESG3YTeHYG0TTGbxKEfw6sd4B6qUcHWsfpvTZeyRXbsTEUziXPiVbyZN5ZC/Rrr4gDrnpY3azw+Q//DXx/jVUx89RL0zgVcv4TomeJ37QtnGv/uGnRFMZTDyJm7/Ckud+yOTR1/DKk3Ss3YFz8RjJex+kevYIxRP7yezaS+Hj1yEaI5bpb2xJ8lxMLIHnFEJ/nhY7fu3KWcrnjuJmh/GqZRIrNuBcPE60q4/U+l1UR0/h5saZPPIaAB3rHsa5cIxY70o8p0g9PzbvjIvNevUPPyW1YTfOpRNgImQeecHauEG8psImkHe6sZ4B0pseI9Y3yORHr5LasJt470q6HnyeyoVjVMfOEO1aSj3/xcwTBhDrHcBzysDU9I3vAZDatAe/7hJfck/j9ql9kb7XuD3Rv47ObU/jjM7+N2i7NXvM0XQPmUe/S/W6v/07t/4NRONtHTeW6af7kReJpDIkl
q/Hr1cBcPNXiGX6Sd6znWiqm/TGPUS7luBcPknn9m+AaTw3nduexq+7dO98AWPm/zIL6jwtdvx4/zpSa3fgjHxONN2DV8qRXH0ftYnLxHqWkxjYRL0wQcfaHcT6VuKVcqQ27KZeuEpm13eIdHQvKOdissZ6lpMc2Ei9kCXa2Ys7edXOuAG9psImkOTRrj6KJ94itX4XzuWTxHsHqGVHyB/6DR33PEBixXp81yHWt5LiZ+/O3M+vlvDKeeL9ayh88kdqE5cAKJ08AL5PvVIgkujAzY3jla5ROXcEgOqVsxRPHCAxsDGIwwWaP+Z6KUf+w1/fmN3Mf82g2XGn95eaSBSMwasU8etuYz9lR/fUtrH81M4KQ3LVNorH/gTT/2w1hmjXUoon9uN789/yFtR5Wuz4kXiS8ulDJAc24TlFYktW4QwPkehfS+3aKPX8OCaeoHLuKG52hEi6B0yE+LJ7KRzf19R+7KafWyJE0r1Eu3oxxhDrWmpn3IBeU2ET6O6F8rmjeE6Rzi1PkvvgZXr2fG/Bj1k49idSG3YveAtTULsXWnHMt3K7Ffl2jbvYnLbP060y3CpHkOdrobsXbL2mwnSO5soZdoHM6U5Lrd0x8+dmn7Cu7d9oURo7WnHMd9K4zQo6b9DjL4ReU3eWUO5euPl2Nz9O+eyRto1nU7Or3LUvLrYjzm3Hne/trfZVOk/FE/vJHvh5uyKF7rmbz5hh+p20LdDdC37NoV6YINq9FDc3TrSrj1jPCqqjp6heOY+bvUx68xPEMsspnz1C5eKnVMfO0HHvdspnj9C59UmckVOUTr0/s3JaPHmAxPL1uLlxevZ8r/Hkei75w7/FGEP3zoWt1AZ97NNKpw/Rs+yeUOW69vYvGqvzmx4j1rUk8DwQzvPUed8zuG0smWYyFYfeoT61gJbe+hSx7oXN67Y6k+/Vmfzo/6Yue6/Rcc8DLc0TJoHuXnCGh4ikuvGcEunNj1P6/P3GRPzAJhL9a274ZQIaH7TiufhuY9VzenfC9Sun8d4B/Op1n6Dke/hene4d32psLQtYM8dePPEWzqXj5A+9suCV5nbmmlmdb3HhNpsnrOep3b4KmUwkCvg4l46TXH2/tZxBCHT3QnJwC155kviSQUpD79C1/Xmcy0N4lUmq42duWPmMpntwr41gEmnc3Dh+rULp8/cArls5vR+v5lAvXiO+dBWTR39HffILTCTK5NHfYSx+stJsmjn2zvu+RnL1/WR27235O5Jmc/m+d8PqfNB5IJznCaB85jDOpePUJi6HJlPn1qfI7N7btnPVTKbk6sbvr2li18mdJJDpha77n/3S/+u498GZPydXbQUgsXz9DbfXS7lb/rNjeirh5vuEUTPHDu1fqFhoLmMi9D75D6HJMy1s5wkgtf4RUusfCVWmdmsmk3PhGJ33fa394QIW6O6FhYgk07N+BqhWTkXufJlHvxt0BCtCcVnHQq4DLxz7E7kPXsYZOUXlwiftjtYSYTw+ZbpzM4U1VxgzhVHg73Sz+14iMbARNzuC51bpfuibQGNRZHrVM5JI07HmIUyscUmn5xRmLiHMvfc/M/fL7nuJWM9yMo/9HfkPfz1zXXm0M5jPWwjr8SnTnZsprLnCmCmsAn+nG0ln6Nz2NABd27+OMzwEcMOqZ2rDrpknChrXYJdOHpj57+n7JVZuJLV+F252pOnrylstjMenTHduprDmCmOmsAq8dK+PUDj2BsnBxgT7Dauepw/NbBMDvvQ33l/vF2l8Fo4xTV9X3nphPD5lunMzhTVXGDOFk745ok2C/pT/IDLMRxi+jSDs50rfHLEwd9pnL4Tgna6IyN3D6jvdSDw56rtV+19Fegsmlhjzas5Aux7f9rHe6njCdL6n3ZwziIxhP1ezvTaDfk2F6Rxdr92/y61mtXQXyhgTBa4A9/m+/6XLnowx3wb+2ff9r1sPJyLShLBPL+wARm9VuFPeBvYYYzrsRRIRaV7YS/c54M3ZbvR9Pw98C
jxuLZGIyCKEvXSfZY7SnfLm1M+JiIReaEvXGBMHngLeus2P7qPxjlhEJPRCW7rAbuC07/u3uxTlXWCHMabLQiYRkUUJc+nOZ2oB3/dLwGHgybYnEhFZpDCX7nM0pg7mQ1MMInJHCGXpTm0B2wMcuN3PTnkTla6I3AFCWbrAY8CnU1vC5uMDYKsxprd9kUREFi+spbuQqQV833eAg8DTbUskItICYS7d2y6i3URTDCISeqErXWNMJ43Lf9+9zY/eTBdJiEjoha50aVwQ8dHUVrCFOAysNcb0tyGTiEhLhLF0FzSfO833fZfGB+A80+pAIiKtEsbSfRZ4o8n7aopBREItVKU7teVrG40tYM3QYpqIhFqoSpfGlq+DU1vAmvEJ0G+MGWxhJhGRlglb6c7r8xZm4/u+B+xHUwwiElJhK91m9ufeTFMMIhJaoSndqa1ea2ls/VoMffiNiIRWaEqXxlavt6e2fi3GSSBljFm3+EgiIq0VptJd1HzuNL/x9cbaOiYioRSm0m3qoohZaIpBREIpFKU7tcWrH/i4RQ/5JvCcMca06PFERFoiFKVLYypg/9SWr1Y4A9SAzS16PBGRlghL6bZiq9iM6+Z1NcUgIqESptJt1XzuNM3rikjoBF66U1u7UjS2erXSPuAZY0zgxygiMi0MhfQs8ObUlEDL+L5/EcgCD7TycUVEFiMMpduOqYVpmmIQkVAJtHSntnS15KKIWegiCREJlaDf6W4GXBpbvNphP/C0MSbWpscXEVmQoEv3OWBfq+dzp/m+PwZcBh5ux+OLiCxUGEq3XVML07RfV0RCI7DSndrK9QztW0SbpnldEQmNIN/pPgBcm9ra1U5vAU8aYxJtHkdE5LaCLF0bUwv4vp8FPgcebfdYIiK3E0jpGmOitHer2M3eBJ6dGldEJDBBvdN9j0bpbmr3QMaYFNAH/AD4WbvHExGZS1ClmwfSNMqw3ao09gPfD5yzMJ6IyKyCKt0hGh9w8y/tHsj3/TqwF5gAPm33eCIiczFtui5BRERuIeiLI0RE7ioqXRERi+b8IJhIPDnqu9UVtsLMl4klxgDCkM3EEmNezRkIOoeI3BnmnNM1xvhrfvSqxTjzc/7HLwAQhmznf/wCvu/rW4dFZF40vSAiYlFbSrd89qOZP+c+ePlLt2f3vURx6J12DD2ruTLVSzlyB3/JxBv/YTWTiNx92lK61fFzuLlxsgd+jjN8Eq9WuaH0und+ux3DNp0pmu6h57HvE0l2Ws8lIneXtk0vOJdP0vXg8ySWrSES7yC1bme7hmpJpvLZIyRXtv2qZBG5y7WtdJOrtlL67F1qVy823lWeOTxzW3HoHcqn/4xXq7Rr+AVlqhez5D/4FbWJYXSxiIi0k3YvLJJ2L4jIQmj3goiIRfMq3VvtQFgsNzdGcegdKhc+wRk5NevPTb8Tv1WGZnPd7n6z3T5XFhGR+ZjXV5O7uVFKpw7iZofp3vkikx//Hmd4iOTAZjrWPkT5Lx/ie3XSmx/HzQ6T3vwEAFf/8FOiqQwmnsTNX2HJcz9k8uhreOVJOtbuwLl4jOS9D1I9e4Tiif1kdu2l8PHrEI0Ry/TjOSXwXEwsgecUWparduUs5XNHcbPDeNUyiRUbcC4eJ9rVR2r9Lqqjp3Bz40weeQ2AjnUP41w4Rqx3JZ5TpJ4fa9X5F5G7zLze6cZ6BkhveoxY3yCTH71KasNu4r0r6XrweSoXjlEdO0O0ayn1/BczxQYQy/TT/ciLRFIZEsvX49erALj5K8Qy/STv2U401U164x6iXUtwLp+kc/s3wDSmSDu3PY1fd+ne+QKN77FsTa54/zpSa3fgjHxONN2DV8qRXH0ftYnLxHqWkxjYRL0wQcfaHcT6VuKVcqQ27KZeuEpm13eIdHQv6qSLyN1rXqUb7eqjeOItUut34Vw+Sbx3gFp2hPyh39BxzwMkVqzHdx1ifSspfvbuX+849e04JhIFY/AqRfy6i193iXR041w6Tr2Uh
0gUMCRXbaN47E8wvbhnDNGupRRP7Mf33JblisSTlE8fIjmwCc8pEluyCmd4iET/WmrXRqnnxzHxBJVzR3GzI0TSPWAixJfdS+H4PurlyebPuIjc1Ra0e6F87iieU6Rzy5PkPniZnj3fs5HxS27evRBkLu1eEJGFWNDuhdTaHXRueRLgS8XW7OJUK8yWq5lM+T+/wtXXf0K9eK2lGUVEYJ4LafMxvajl1xzqhQmi3Utxc+NEu/qI9aygOnqK6pXzuNnLM/Or197+RWPxatNjxLqWtCrKojJldu2leOItvEqBaGdvyzOJyN2tZft0pxe1nOEhIqluPKdEevPjlD5/vzG/OrCJRP+aGxa0Zhav2lC4zWZy81eoF7PEl65uSyYRubu1rHSnF7WSg1vwypPElwxSGnqHru3P41wewqtMUh0/M7Og5fveDYtX7bDQTABXX/938H3c/JW2ZBKRu5suA14kLaSJyELoMmAREYsWVbo3v0uea7eA7UtnF5JNRMSWpncvZPe9RGJgI252BM+t0v3QNwEonnhrZqdAJJGmY81DmFgcPJf84d9ijCExuJXK+aPE+1ZRL1wFEyG1fheTR1/DROL0/s0/LuqgFpqt8Okb+DWHxMrNuNdG8WsV4svWUPj490Q7l9Dz+PcxscSiMomIwCLe6UbSGTq3PQ1A1/av4wwPAdywUyC1YVejcAHfq9O941uNq9KqZSIdGapfnCfWN4jnlBr3S3Y1ftatLe6gFpgtvmQ1nlMimspMZfUASA5uITGwkVp2eFF5RESmLWJ64a93LRx7g+TgVoAbdwqcPoTvNj5vwUSiTB79HSaZxs1faRRevY7nlPDrNZKrtjX2xnb1zZShrWxepQAmgucUcIaHcC6fAKBy8QSVi8eI9w0uMo+ISIN2L8zCzY3hjJyic+tTt82i3QsiMl8tuyLtqybWs4JYz4qgY4jIV8yc73Qj8eSo71ZD1zwmlhgDCEM2E0uMeTVnIOgcInJnmLN0RUSktXRxhIiIRSpdERGLVLoiIhapdEVELFLpiohYpNIVEbFIpSsiYpFKV0TEIpWuiIhFKl0REYtUuiIiFql0RUQsUumKiFik0hURsUilKyJikUpXRMQila6IiEUqXRERi1S6IiIWqXRFRCxS6YqIWKTSFRGxSKUrImKRSldExCKVroiIRSpdERGLVLoiIhapdEVELFLpiohYpNIVEbFIpSsiYpFKV0TEIpWuiIhFKl0REYtUuiIiFql0RUQsUumKiFik0hURsUilKyJikUpXRMQila6IiEUqXRERi1S6IiIWqXRFRCxS6YqIWKTSFRGxSKUrImKRSldExCKVroiIRSpdERGLVLoiIhapdEVELFLpiohYpNIVEbFIpSsiYpFKV0TEIpWuiIhFKl0REYtUuiIiFql0RUQsUumKiFik0hURsUilKyJikUpXRMQila6IiEUqXRERi1S6IiIWqXRFRCxS6YqIWKTSFRGxSKUrImKRSldExCKVroiIRSpdERGLVLoiIhapdEVELFLpiohYpNIVEbFIpSsiYpFKV0TEIpWuiIhFKl0REYtUuiIiFql0RUQsUumKiFik0hURsUilKyJikUpXRMQila6IiEUqXRERi1S6IiIWqXRFRCxS6YqIWKTSFRGxSKUrImKRSldExCKVroiIRSpdERGLVLoiIhapdEVELFLpiohY9P8QdEv7a4+pVQAAAABJRU5ErkJggg==" - }, - "metadata": { - "needs_background": "light" - } - } - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } - }, - { - "cell_type": "markdown", - "source": [ - "## 变量标准化\n", - "\n", - "下面这个例子中,`var` 为变量编号。同一变量拥有相同的变量编号。 \n", - 
"如:`x` 变量的编号为 `0`, `y` 变量的编号为 `1`。" - ], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 20, - "source": [ - "f.variable_standardization().elements" - ], - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "[{'id': 0, 'type': 'supsub', 'text': '\\\\supsub', 'role': None},\n", - " {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base', 'var': 0},\n", - " {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'},\n", - " {'id': 3, 'type': 'bin', 'text': '+', 'role': None},\n", - " {'id': 4, 'type': 'mathord', 'text': 'x', 'role': None, 'var': 0},\n", - " {'id': 5, 'type': 'bin', 'text': '+', 'role': None},\n", - " {'id': 6, 'type': 'textord', 'text': '1', 'role': None},\n", - " {'id': 7, 'type': 'rel', 'text': '=', 'role': None},\n", - " {'id': 8, 'type': 'mathord', 'text': 'y', 'role': None, 'var': 1}]" - ] - }, - "metadata": {}, - "execution_count": 20 - } - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } - }, { "cell_type": "markdown", "source": [ @@ -255,11 +73,11 @@ "cell_type": "code", "execution_count": 21, "source": [ - "fs = FormulaGroup([\n", - " \"x^2 = y\",\n", - " \"x^3 = y^2\",\n", - " \"x + y = \\pi\"\n", - "])\n", + "fs = FormulaGroup([\r\n", + " \"x^2 = y\",\r\n", + " \"x^3 = y^2\",\r\n", + " \"x + y = \\pi\"\r\n", + "])\r\n", "fs" ], "outputs": [ @@ -280,209 +98,6 @@ "name": "#%%\n" } } - }, - { - "cell_type": "code", - "execution_count": 22, - "source": [ - "fs.elements" - ], - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "[{'id': 0, 'type': 'supsub', 'text': '\\\\supsub', 'role': None},\n", - " {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base'},\n", - " {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'},\n", - " {'id': 3, 'type': 'rel', 'text': '=', 'role': None},\n", - " {'id': 4, 'type': 'mathord', 'text': 'y', 'role': None},\n", - " {'id': 5, 'type': 'supsub', 'text': '\\\\supsub', 'role': None},\n", - " 
{'id': 6, 'type': 'mathord', 'text': 'x', 'role': 'base'},\n", - " {'id': 7, 'type': 'textord', 'text': '3', 'role': 'sup'},\n", - " {'id': 8, 'type': 'rel', 'text': '=', 'role': None},\n", - " {'id': 9, 'type': 'supsub', 'text': '\\\\supsub', 'role': None},\n", - " {'id': 10, 'type': 'mathord', 'text': 'y', 'role': 'base'},\n", - " {'id': 11, 'type': 'textord', 'text': '2', 'role': 'sup'},\n", - " {'id': 12, 'type': 'mathord', 'text': 'x', 'role': None},\n", - " {'id': 13, 'type': 'bin', 'text': '+', 'role': None},\n", - " {'id': 14, 'type': 'mathord', 'text': 'y', 'role': None},\n", - " {'id': 15, 'type': 'rel', 'text': '=', 'role': None},\n", - " {'id': 16, 'type': 'mathord', 'text': '\\\\pi', 'role': None}]" - ] - }, - "metadata": {}, - "execution_count": 22 - } - ], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 23, - "source": [ - "fs.ast" - ], - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "[{'val': {'id': 0, 'type': 'supsub', 'text': '\\\\supsub', 'role': None},\n", - " 'structure': {'bro': [None, 3],\n", - " 'child': [1, 2],\n", - " 'father': None,\n", - " 'forest': None}},\n", - " {'val': {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base'},\n", - " 'structure': {'bro': [None, 2],\n", - " 'child': None,\n", - " 'father': 0,\n", - " 'forest': [6, 12]}},\n", - " {'val': {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'},\n", - " 'structure': {'bro': [1, None], 'child': None, 'father': 0, 'forest': None}},\n", - " {'val': {'id': 3, 'type': 'rel', 'text': '=', 'role': None},\n", - " 'structure': {'bro': [0, 4], 'child': None, 'father': None, 'forest': None}},\n", - " {'val': {'id': 4, 'type': 'mathord', 'text': 'y', 'role': None},\n", - " 'structure': {'bro': [3, None],\n", - " 'child': None,\n", - " 'father': None,\n", - " 'forest': [10, 14]}},\n", - " {'val': {'id': 5, 'type': 'supsub', 'text': '\\\\supsub', 'role': None},\n", - " 'structure': {'bro': [None, 8],\n", - " 'child': [6, 
7],\n", - " 'father': None,\n", - " 'forest': None}},\n", - " {'val': {'id': 6, 'type': 'mathord', 'text': 'x', 'role': 'base'},\n", - " 'structure': {'bro': [None, 7],\n", - " 'child': None,\n", - " 'father': 5,\n", - " 'forest': [1, 12]}},\n", - " {'val': {'id': 7, 'type': 'textord', 'text': '3', 'role': 'sup'},\n", - " 'structure': {'bro': [6, None], 'child': None, 'father': 5, 'forest': None}},\n", - " {'val': {'id': 8, 'type': 'rel', 'text': '=', 'role': None},\n", - " 'structure': {'bro': [5, 9], 'child': None, 'father': None, 'forest': None}},\n", - " {'val': {'id': 9, 'type': 'supsub', 'text': '\\\\supsub', 'role': None},\n", - " 'structure': {'bro': [8, None],\n", - " 'child': [10, 11],\n", - " 'father': None,\n", - " 'forest': None}},\n", - " {'val': {'id': 10, 'type': 'mathord', 'text': 'y', 'role': 'base'},\n", - " 'structure': {'bro': [None, 11],\n", - " 'child': None,\n", - " 'father': 9,\n", - " 'forest': [4, 14]}},\n", - " {'val': {'id': 11, 'type': 'textord', 'text': '2', 'role': 'sup'},\n", - " 'structure': {'bro': [10, None],\n", - " 'child': None,\n", - " 'father': 9,\n", - " 'forest': None}},\n", - " {'val': {'id': 12, 'type': 'mathord', 'text': 'x', 'role': None},\n", - " 'structure': {'bro': [None, 13],\n", - " 'child': None,\n", - " 'father': None,\n", - " 'forest': [1, 6]}},\n", - " {'val': {'id': 13, 'type': 'bin', 'text': '+', 'role': None},\n", - " 'structure': {'bro': [12, 14],\n", - " 'child': None,\n", - " 'father': None,\n", - " 'forest': None}},\n", - " {'val': {'id': 14, 'type': 'mathord', 'text': 'y', 'role': None},\n", - " 'structure': {'bro': [13, 15],\n", - " 'child': None,\n", - " 'father': None,\n", - " 'forest': [4, 10]}},\n", - " {'val': {'id': 15, 'type': 'rel', 'text': '=', 'role': None},\n", - " 'structure': {'bro': [14, 16],\n", - " 'child': None,\n", - " 'father': None,\n", - " 'forest': None}},\n", - " {'val': {'id': 16, 'type': 'mathord', 'text': '\\\\pi', 'role': None},\n", - " 'structure': {'bro': [15, None],\n", - 
" 'child': None,\n", - " 'father': None,\n", - " 'forest': None}}]" - ] - }, - "metadata": {}, - "execution_count": 23 - } - ], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 25, - "source": [ - "ForestPlotter().export(\n", - " fs.ast_graph, root_list=[node[\"val\"][\"id\"] for node in fs.ast if node[\"structure\"][\"father\"] is None],\n", - ")" - ], - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "[Text(22.32, 181.2, 'id: 0\\ntype: supsub\\ntext: \\\\supsub\\nrole: None'),\n", - " Text(11.16, 108.72, 'id: 1\\ntype: mathord\\ntext: x\\nrole: base'),\n", - " Text(33.480000000000004, 108.72, 'id: 2\\ntype: textord\\ntext: 2\\nrole: sup'),\n", - " Text(55.8, 181.2, 'id: 3\\ntype: rel\\ntext: =\\nrole: None'),\n", - " Text(78.12, 181.2, 'id: 4\\ntype: mathord\\ntext: y\\nrole: None'),\n", - " Text(111.6, 181.2, 'id: 5\\ntype: supsub\\ntext: \\\\supsub\\nrole: None'),\n", - " Text(100.44, 108.72, 'id: 6\\ntype: mathord\\ntext: x\\nrole: base'),\n", - " Text(122.76, 108.72, 'id: 7\\ntype: textord\\ntext: 3\\nrole: sup'),\n", - " Text(145.08, 181.2, 'id: 8\\ntype: rel\\ntext: =\\nrole: None'),\n", - " Text(178.56, 181.2, 'id: 9\\ntype: supsub\\ntext: \\\\supsub\\nrole: None'),\n", - " Text(167.4, 108.72, 'id: 10\\ntype: mathord\\ntext: y\\nrole: base'),\n", - " Text(189.72, 108.72, 'id: 11\\ntype: textord\\ntext: 2\\nrole: sup'),\n", - " Text(212.04, 181.2, 'id: 12\\ntype: mathord\\ntext: x\\nrole: None'),\n", - " Text(234.36, 181.2, 'id: 13\\ntype: bin\\ntext: +\\nrole: None'),\n", - " Text(256.68, 181.2, 'id: 14\\ntype: mathord\\ntext: y\\nrole: None'),\n", - " Text(279.0, 181.2, 'id: 15\\ntype: rel\\ntext: =\\nrole: None'),\n", - " Text(301.32, 181.2, 'id: 16\\ntype: mathord\\ntext: \\\\pi\\nrole: None')]" - ] - }, - "metadata": {}, - "execution_count": 25 - }, - { - "output_type": "display_data", - "data": { - "text/plain": [ - "
" - ], - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAV0AAADnCAYAAAC9roUQAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAAAU1klEQVR4nO3dWW+c133H8d95ZuNwJyVZFLVYtixLju3Ure0UKeJe1AHaAgZStEDQi170BfS2yAvoReEG6HLRl5CroldxgCBwbbR20sULHDu2KTm0ZUkWqZWSOFxmPb2YkTgcDWd7znPmmWe+H8CwuJ3nP3+e+fHZ5oyx1goA4Ecw7AIAYJwQugDgEaELAB4RugDgEaELAB4RugDgEaELAB4RugDgEaELAB4RugDgEaELAB4RugDgEaELAB4RugDgEaELAB4RugDgEaELAB4RugDgEaELAB4RugDgEaELAB4RugDgEaELAB4RugDgEaELAB4RugDgEaELAB4RugDgEaELAB4RugDgEaELAB4RugDgk7XW2X8mnV2XZF3+Z9LZ9WHV07ztKB7bII8xaT0Os/049CKqeeF67vmYy1HVHJfnnrOctNbKFWOMffxHbzgbT5K+fv01WWvNMOpp3nYUj63ddrpJWo/DbD8OvYhqXrieez7mclQ1x+W554q30wuVzVuSpJ1LH0mSSje+0uavf+Fr8wfWUdm8rc2Pfu58XEm69z//FnrcsHXsrL6v4rULQ62h+M1KLH7XO19+oO0Lvxra9quFDRU+fVulm5e0tfKu83HLG9dU+PTtSGq2tqZ7//vvzsfduvBLlW5ecj5u8dqFfc/FOEn72lDp+pfaWX1fQX5GteKWqtv3lJlf8rX5A+tITS/KVkrOx63ubCo1teig4nB1BJOzTh5fmBokK1vc8lpDuzpSM4e1s/p/w9v+9IJSUwvKHjmt8p1vnI+bml6ULRcjqbm0vqrMoZPOxzWpdKj5edC4lbvrCnJTA48bJc8X0vZOZaQmZ1W+u+53823qqG7eVJCbdD5ubee+qlt3ZKsVR2MPVkeQm1J18/ZQa7BWMpncEGrYX0eQyys17fsPYdOc2C2otHZRWyvvyqSzzset3r/laC4/OratFFW+fcX5uEFu0sH8fHTc9OJxVTZvhhw3Gt72dCef+s6+j7OPPansY0/62vyBdQS5KU0//33n4+aOPa3csadDjxu2jiA3pczi8aHWMHHiGU2ceMZrDe3qCHJTmn7u1aFtX5LmvvvDSMfNHDoR2dgTJ5+LZNwwOdBp3NzSUwOPGyXne7ou9l5dn4vpp6ZO2x7ksZXvrrcdc/fyx7K1at/jhamllcs+u+px1Ns+SJiafM2LbtvZu05xq6ft9yuKmpuNynMvLOd7ujur76myeFwygco3vlTQOL8y/dyrSs0e0fbFXymzsKzq9j1V7l2XLW7LZPPKHDrx8FC8ePUzBblJZ3uKXWv6/L+Umj2i0vpvlTl8avBxVt5RtbCh1Mwh1XYLSs0cUrWwIVvalmrV+uGZMcounVXx6ufKnXg2usfUR59dcNXjSLYdcS98zYtu26nt3FfhN28pNb2orU/+QxOPf1u54+2PMOJS8yg+98JyHrqpyXnJBDKpvaHTc489/HdmfknlO9+otluQrRQlWU2cel7Fq59JQSCTytS/0eGtbN1qCibnpFql66FZ13Hyswrys/UPalXVdjYVZHIK5h6TalVJRiadc3A+L359dtXjKLYddS98zYtu22k8CElSdvmcah0uqsWl5lF87oXlPHSnnnlFO6vvK3v8vGy1ovzpF/Z9PXv0jLJHz0hSfa+jVlVmYTnS847damr9OKpx8k+++PDfYc83xa3Prnocxbaj7oWvedFtO1PP/OHI1TyKz72wIrmQlj/zUv3/XRqRnjsaxebb6rUmX+O4ELc+D7M3w+6Fr8fucjujVnOcnnthRH4hrbZbUHVnU5JUvHahp1uoo
ripuVNdcR673+0ltceDbHOYvZCi60e3cWvF7UjGHdQwxm3+3fdyYdGnSC6kFSemFWQnVd26o9zyOVW37ys9v6TMkdMqrl1U+dZlpRsvjKhsXFNq5rDKNy8pu/SUZAJVN29pa+VdTZ3/npe60rNHtHvpIwX5WeWOnXU6tqxV+fYV5ZbOKjW9EPljSWqPB9nmMHvRrbbU5Jw2P/ipZl76gYI+72Hu1ufCr3+u3Mnn++5zVPM4qnnR7XdfunlJxcufyGTzmnnhT/oaO0rO93RTk/MyQUqpmUNSrVb/q9u4QFGfXFa14pZspaTKxrX6D9maFASytarKN75yXVLXuuolVOt1OB47s3BMpbUvnAVut+0ltceDbHOYvehWm62W92pxOK4JUsounxvoD1tU8ziqedHtd//gomncRHIh7YF2J6wnTjyriabbNXYufbT/HM2Zl12X1FNd7W6ydjF2rVxUbtntiyTGsceDbnNYveiltkFfKNHLY3Y9bph5HNW86DZueuawUmcXlFlYHmj8qHh5GXDrObO77/5E26vvSfJ7UrxTHVGNG2Ryykf4xO6ljiT02FUdSehFlD0+aOyw83gYvUjPHY1d4EoRvgx488OfyWQnlMrX11gorV3U5PlXlFk4Vr9B+fInMiZQrbyr6v2byhw+peKVTzX9O3/c5t7D4dSR9Mfnqo441BCXOqKqIcrHNmo1x+H3HEZkoZtePC5b3pVkZCtFpRePS6a+bKVJZ6VUWjJGRkaZI6elWlWpmUN7N6rHvI6kP75RqyEudYzifBu1muPwew4jstBtPox7cH9d89d8HeZFVUfSH9+o1RCXOkZxvo1azXH4PYfBe6QBgE9O3/snBu9Z5bIe3iON90gbRg1RzD3eI8393B/4OWOb7peLmjHmY0l/ba390BhzXtIb1tp4Lno5oowxP5Z0x1r7942Pv5b0R9ba1eFW5p8x5l8lXbTW/kvj45uSvm2tXRtuZRhn3k4vGGNSkp6SdLHxqVVJJ4wxw3pLgaQ6J2ml6eOVxufGEb1A7Pg8p3tK0m1rbUGSrLVlSV+rHsRw57yk5neivND43DiiF4gdn6F7TvufAGp8zJ6HI8aYrOp/3JpPJYxlj40x05IWJV1u+vRY9gLx4jt0V1o+x+GeW09KumKtbV69elx7/LSkL6zd96L+ce0FYsRn6LYe6kkc7rlGj/fQC8QSe7rJ0q7H30iaMsbM+y9nqNr14itJy8aYiSHUA0iKyZ6uMY3X8CGsR3ps6/cEXtT4/XFr14uy6sHrbkFfoE9eQtcYMytpTtLV5s9ba29Jqkga/ioUydBu704azyMKeoFY8rWne071m9TbrVTMeTYHGkcL7Y4mpDHrsTEmUP1C2sU2Xx6rXiB+fIZuuzCQuI3HlcOSjKSbbb42bj0+KWnDWtvujbjGrReIGZ+h2+5QT+Jwz5VzklZs+9d1j1uPmW+ILV+he9Bhr8ThniudevyFpDONl2KPg67zjYu3GBb2dJPjwB5ba7clXZd02mdBQ9SpF3ck7Upa8loR0BB56Db2rs6q/UUNSfpSLHzjQqe9O2m8jijoBWLLx57uKUm3rLVb7b7IwjfOdLpYKY3XBSR6gdjyEbqdTi08wCmGEA5Y6KbVWPTYGDOjRxe6aTUWvUA8+Qjdbod6Eod7YZ3RowvdtBqXHrdb6KbVuPQCMcSebjLQ4z30ArHGnm4y9NLjaxqPhW966cWDhW/yHuoB9onVni73Tg6sa48bL5oYhwtIvfSiovpdM1y8hXeRhm5joZtZ1ZcXPJC19rbqC98cjbKeBOtl704ajyMKeoFYi3pP95y6X9R4YBz2wpxrHB10u0XqgUT3uLHQTad7wpsluheIr6hD97y6n1p4gIsbgznc+H+7hW5arSjZe3cnVX/7+XYL3bRivmEofOzp9rIHJnG4N6jzki4csNBNq6Tv3fV6akFivmFIfIQue7rR6qfHX0h60hiTjrCeYeqnFxfExVsMgY/TC+x5R
KvnHo/Bwjf99OLBwjfHIq0IaBFZ6DYWunlKvV3UkOq38Bxn4Zu+9XMKR0r2EUW/vUj66RbEUJR7uo9LunnQQjetGgvfXBL3Tvarn0NqKdlHFP32Isl/gBBTUYZuv08AKdmB4FyPC920SmTQNC10c6WPH2O+wbsoQ7ef87kPJDIQInRG0mVrbamPn0lq0Dytg9/89CDMN3jHnu5o4w/bnkF6wXyDd3Hb0+XCRn/6vXAkSWuS8saYhQjqGaZBevGVpGMsfAOfot7THWgvjHsne9b30UTjRRQXlbw/boP0goVv4F0koWuMmZM0oy4L3bRi4Zu+DXI0ISXzFMOgveAUA7yKak/3nPq/qPFAEgPBuaaFbvo9by4lLGiaFrrhDxBiL8rQHSQMpIQFQoSONP5/a4CfTVrQnFJ9oZvCAD/LfINXUYXuoId6UvICISrnJK30uNBNq6QFTZg/8sw3eMWe7ugK84ftC0lPJGjhmzC9YOEbeBXHPV1uG+vNIHeHSJKstTuS1iU94bSi4QnTiw2x8A08ch66jYVuzqj3hW5aPVj4ZsJdVYkU5mhCStZhNb3AyIhiT/e0pBuNZQT7xsI3PQtzNCEl6zQOvcDIiCJ0Bz7Ua8KeRweN5S9Pqr+Fblol4jROY6GbefW30E0r5hu8iSp0wxzqSex5dDPIQjetkhI0/bz56UGYb/AmitANe6gnJWQvLEIujiaSEjSuesF8gxdOQ9cYMynpWbl5EnzLGDMdvqpkadza9JLCH02sSZowxnwrfFXD0Xgl2osKP9++krRkjDkduiigC9d7uq9J+gNJPww5zp9L+j1JfxG6ouSZlfS3Ct/j70nKS/rnsAUN0VFJfyPpL0OO86qkjKR/DF0R0IXr0H1L0o6kvws5zo8lbUv6z9AVJYy19p6ka5L+KeRQv1T9Qty7oYsaEmvtmqSbkv4h5FBvqn4h7p3QRQFdmMFeRQoAGETUb8EOAGhC6AKARx0XPAkyuXVbKUWyoLhJZWu2WnJ790Q6e71WLi65HDNqkfY4nb0uSS7Hj7LH9ALjoOM5XWOMffxHb0Sy4a9ff02ux/769ddkrR2p1aKi7rEkp32Ossf0AuMg1J5mZbO+fvbOpY8kSdXChgqfvh26qHZjlzfWVPjNW07GHjWP9OL2VW2thL/poHXcyuZt3X3nJ6HHjcpB82336mfa+nzwGw8OGre8cU3bF/87VM1Aq1DrqZauf6md1fcV5GdUK24pNb2g1JSbN5ltHTuzcEyl62GWGhhdrb0o37kqE4Q/M9M6bnrmkLLHzjqoOBoHzTdbKSmYnHU+bmZhWZV7Nxw+AsDJhbS90xO13YJKa4Ou6Nh57N2rn6pWHGjhsoTY60V6YVmV+7dka1Wn41prFf+1vB+dbyadVW37vvNxK5u3Vb55KeS4wH6h9nQnn/rOI5+b+27YF0q1H3vixLOaOPGsk7FHTWsvsrkpZQ+fcj6uJOXPvBx63KhENd86jTv78p+FHh9oxi1jAOBR19At313ve9Dy3fWHFyWa7V7+eN8hcbex23293efabWvU0OeDt9vLz/TSh17Gj1MvkExdTy/srL6nyuJxyQQq3/hSwdSCKnfXNf3cq0rNHtH2yjuqFjaUmjmk2m5BqZlDqhY2ZEvbUq2q8u0rkjHKLp1V8ernyjWdIug2dvHqZ6oW7qi0/ltlFpYlY1S5u6bSxLRstSKTyqhW2lF6fvRvlaTP0fdh1HqBZOoauqnJeckEMqm9b03PPfbw30F+VkG+ceW4VlVtZ1NBJqdg7jGpVpVkZNI5mXS277FTUwuqFu4oc+iEbKUsEwQKJmYa33dUQX5GxW8+7/MhxxN97q3WMH3oZfw49QLJ1DV0p555RTur7yt7/LxstaL86Rf2fb3141b5J198+O/c0v63Pes69hO/2608ZY+c7vo9o4A+10XZh57Gj1EvkEw9XUjLn3lJQWZi3wRtPc9V2y2ourPZdwGtY3cat1bc1uavf6Facavv7
YyCfnoRZuxO49pqRYVP3lRl8/ZA23EhyvnWOv44zzcMx8C3jO2svqfixLSC7KSqW3eUWz6n6vZ9peeXlJqc0+YHP9XMSz9QkMk5Gzczv6T09IIUpAYte6QMq8cmlXHy4guXOtUsa1W+fUW5pbNKTff34hzmG3wb+JmVmpyXCVJKzRySarX6Cxca6zjYarn+TQO8V2CncWu7BRWvXZBqYd6DcHQMpcflXSlI7Y0fE51qziwcU2nti74Dt9u44zbf4MfAe7pTz7zy8N/tzp0NetN6t3HnX/mrgcYdRcPq8dT57w00bpQ61VwrF5Vbftr5uNJ4zTf4Ea9jSGAAQSYX61fSAc0GDt3Wm8Nd3yzOzed1UfZ51HpML5AEfZ1e2PzwZzLZCaXysyrfXVdp7aImz7+izMIxVTdvaWvlXVXv31B2+bzKty7LZHIKspOaPPv7occvfPKmJk49L1spqbq1ofKtK5p6/lUFmYmBHnicRdnnUesxvUDS9LWnm148riA3JcnIVopKLx6XWlalCiamVdu6q1pxSyaVqV+gcDB+en5JpRtfaffKp43vTu4bakbZ51HrMb1A0vS1p9t832T+zEv7vjb9/PdDF9Np/ImTzz389+7ljzX9wp/KJPRWnij7PGo9phdImo5v18N7pEWP9wXbQy8wDjqGLgDALW4ZAwCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8Oj/AWdihSPDqhKLAAAAAElFTkSuQmCC" - }, - "metadata": { - "needs_background": "light" - } - } - ], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 28, - "source": [ - "for ft in fs.variable_standardization():\n", - " print(ft.elements)" - ], - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "[{'id': 0, 'type': 'supsub', 'text': '\\\\supsub', 'role': None}, {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base', 'var': 0}, {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, {'id': 3, 'type': 'rel', 'text': '=', 'role': None}, {'id': 4, 'type': 'mathord', 'text': 'y', 'role': None, 'var': 1}]\n", - "[{'id': 5, 'type': 'supsub', 'text': 
'\\\\supsub', 'role': None}, {'id': 6, 'type': 'mathord', 'text': 'x', 'role': 'base', 'var': 0}, {'id': 7, 'type': 'textord', 'text': '3', 'role': 'sup'}, {'id': 8, 'type': 'rel', 'text': '=', 'role': None}, {'id': 9, 'type': 'supsub', 'text': '\\\\supsub', 'role': None}, {'id': 10, 'type': 'mathord', 'text': 'y', 'role': 'base', 'var': 1}, {'id': 11, 'type': 'textord', 'text': '2', 'role': 'sup'}]\n", - "[{'id': 12, 'type': 'mathord', 'text': 'x', 'role': None, 'var': 0}, {'id': 13, 'type': 'bin', 'text': '+', 'role': None}, {'id': 14, 'type': 'mathord', 'text': 'y', 'role': None, 'var': 1}, {'id': 15, 'type': 'rel', 'text': '=', 'role': None}, {'id': 16, 'type': 'mathord', 'text': '\\\\pi', 'role': None}]\n" - ] - } - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } } ], "metadata": { diff --git a/examples/i2v/get_pretrained_i2v.ipynb b/examples/i2v/get_pretrained_i2v.ipynb new file mode 100644 index 00000000..9fe707b7 --- /dev/null +++ b/examples/i2v/get_pretrained_i2v.ipynb @@ -0,0 +1,211 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# get_pretrained_i2v\n", + "\n", + "## 概述\n", + "\n", + "使用 EduNLP 项目组给定的预训练模型将给定的题目文本转成向量。\n", + "\n", + "- 优点:简单方便。\n", + "- 缺点:只能使用项目中给定的模型,局限性较大。\n" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 导入功能块" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 1, + "source": [ + "from EduNLP import get_pretrained_i2v" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 输入\n", + "\n", + "类型:str \n", + "内容:题目文本 (text)" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 2, + "source": [ + "item = {\n", + "\"如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\SIFChoice$$\\FigureID{1}$\"\n", + "}\n" + ], + "outputs": [], + 
"metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 模型选择与使用" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "根据题目所属学科选择预训练模型: \n", + "\n", + " 预训练模型名称 | 模型训练数据的所属学科 \n", + " -------------- | ---------------------- \n", + " d2v_all_256 | 全学科 \n", + " d2v_sci_256 | 理科 \n", + " d2v_eng_256 | 英语 \n", + " d2v_lit_256 | 文科 \n", + "\n" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 3, + "source": [ + "i2v = get_pretrained_i2v(\"d2v_sci_256\")" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "EduNLP, INFO Use pretrained t2v model d2v_sci_256\n", + "downloader, INFO http://base.ustc.edu.cn/data/model_zoo/EduNLP/d2v/general_science_256.zip is saved as /home/lvrui/.EduNLP/model/general_science_256.zip\n", + "downloader, INFO file existed, skipped\n" + ] + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "- 注意:\n", + " 默认的 EduNLP 项目存储地址为根目录(`~/.EduNLP`),模型存储地址为项目存储地址下的 `model` 文件夹。您可以通过修改下面的环境变量来修改模型存储地址:\n", + " - EduNLP 项目存储地址:`EDUNLPPATH = xx/xx/xx`\n", + " - 模型存储地址:`EDUNLPMODELPATH = xx/xx/xx`" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 4, + "source": [ + "print(i2v(item))" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "([array([-2.38860980e-01, 7.09681511e-02, -2.71706015e-01, 1.64714813e-01,\n", + " 2.81243492e-02, -1.82386801e-01, 9.22331214e-02, 1.31783364e-02,\n", + " 9.15176645e-02, 3.14464062e-01, 9.37800854e-02, -2.28523940e-01,\n", + " -2.60597020e-01, 6.49375990e-02, 9.75619778e-02, -1.97933778e-01,\n", + " 8.29798505e-02, -2.26491719e-01, -1.77030653e-01, -3.56038064e-02,\n", + " 6.22844934e-01, -2.66110301e-01, 8.00080523e-02, -1.60827965e-01,\n", + " -1.78654417e-01, -1.33000776e-01, 2.76004016e-01, 1.79546073e-01,\n", + " 8.71006995e-02, 2.33958483e-01, 1.76031828e-01, 1.55402005e-01,\n", + " -1.38987333e-01, -1.92975491e-01, -1.09528497e-01, 
1.12305783e-01,\n", + " 2.32549626e-02, 7.75609687e-02, -2.43636876e-01, 6.35311157e-02,\n", + " -4.82399836e-02, -2.24204548e-02, 7.49862418e-02, -1.91449642e-01,\n", + " 9.72701237e-02, 4.00750965e-01, 2.81992704e-01, 3.07581365e-01,\n", + " -4.68867749e-01, -3.03025767e-02, -1.95257351e-01, 1.79073047e-02,\n", + " -2.15334237e-01, 9.98005569e-02, -2.62755096e-01, -2.39337608e-01,\n", + " 3.44270498e-01, 1.50241479e-01, -2.96006531e-01, -3.81666899e-01,\n", + " -1.19041964e-01, 6.18071109e-02, 6.49120063e-02, 9.94637012e-02,\n", + " 1.23297565e-01, 1.29930690e-01, 1.27305657e-01, -1.53804764e-01,\n", + " 7.04720244e-03, -1.33500487e-01, -1.51161134e-01, 1.13862932e-01,\n", + " -2.44814962e-01, -8.95622373e-02, 4.76458520e-02, -5.92206642e-02,\n", + " 2.88407020e-02, -5.88610955e-02, -4.25557904e-02, 3.20446432e-01,\n", + " -2.61463765e-02, 7.19539896e-02, -1.32161498e-01, 1.62227061e-02,\n", + " 1.20197656e-03, -2.03355268e-01, -6.83294982e-03, -2.82588631e-01,\n", + " -1.61395460e-01, -5.05547188e-02, -2.27462381e-01, -1.70932785e-01,\n", + " 1.41351461e-01, -1.30069017e-01, -1.83039993e-01, -6.79691881e-02,\n", + " -2.15642393e-01, -7.84436688e-02, 1.77202985e-01, 4.50607650e-02,\n", + " 7.02605024e-02, 8.01992565e-02, -1.55584306e-01, -2.00563252e-01,\n", + " 1.17082551e-01, 9.73844752e-02, -1.10356934e-01, -1.37866074e-02,\n", + " -8.57235789e-02, -5.56467362e-02, -9.36827138e-02, 6.82030804e-03,\n", + " 6.92379624e-02, -2.28701755e-01, 6.70390204e-02, 1.34586483e-01,\n", + " 2.25231394e-01, 1.33322045e-01, -8.82911906e-02, 1.42205298e-01,\n", + " 2.41012901e-01, 7.94170424e-03, -7.02124536e-02, 2.51370400e-01,\n", + " 1.04983136e-01, -6.39194548e-02, 5.24720028e-02, 7.16757867e-03,\n", + " -1.08169973e-01, -1.08731678e-02, 1.69618204e-02, 7.87692815e-02,\n", + " -2.26539060e-01, 3.29003595e-02, 1.91522852e-01, 2.75921494e-01,\n", + " -1.64055750e-01, 5.83723187e-02, 9.84422341e-02, 3.21688712e-01,\n", + " -2.62310840e-02, -2.08140060e-01, 1.14425711e-01, 
1.23823956e-01,\n", + " -8.62085819e-03, -4.14005108e-02, -3.41566652e-02, 1.34680912e-01,\n", + " 4.27634180e-01, 1.42883554e-01, -1.54787973e-01, 7.96157196e-02,\n", + " 1.40678003e-01, 1.39171826e-02, 1.66003749e-01, -4.85638082e-02,\n", + " 5.88261709e-02, 9.51106697e-02, 1.81014258e-02, 1.44485429e-01,\n", + " 4.01205927e-01, 6.77596256e-02, -5.52676022e-01, -1.87850371e-01,\n", + " 1.12366609e-01, -6.84190989e-02, 9.48949978e-02, 2.23454669e-01,\n", + " -1.69843137e-01, 2.09085494e-01, 4.29946512e-01, -3.36349100e-01,\n", + " 6.12608856e-03, -1.46142125e-01, -5.11092655e-02, 8.06671828e-02,\n", + " 1.81744993e-01, -6.78945482e-02, -5.77093139e-02, 1.52337164e-01,\n", + " 2.21259117e-01, 3.35705757e-01, -2.51778495e-02, 1.03662543e-01,\n", + " -4.21361588e-02, 1.43061429e-01, -3.92947495e-01, -4.89463992e-02,\n", + " -9.15660262e-02, -1.00108273e-01, 3.86523217e-01, -4.25569601e-02,\n", + " 4.10154127e-02, -3.41399819e-01, 2.13903114e-02, 8.09015241e-03,\n", + " 9.56344381e-02, 1.12729572e-01, 7.25207478e-02, -6.64384067e-02,\n", + " -2.73666024e-01, -2.79651750e-02, 1.18422434e-01, -5.22459708e-02,\n", + " -2.47057881e-02, 2.84700710e-02, 2.07451075e-01, -9.74238589e-02,\n", + " 8.08936954e-02, 4.07307222e-02, -1.35277033e-01, 2.18436554e-01,\n", + " 1.28792310e-02, -1.20433331e-01, 2.41929386e-02, 1.28128864e-02,\n", + " -7.39881098e-02, -1.12995692e-01, 7.69245178e-02, -2.87000872e-02,\n", + " 1.64782573e-02, -2.78794408e-01, -2.64403820e-01, -2.43874848e-01,\n", + " 1.77457914e-01, 4.11631197e-01, -6.09753132e-02, 2.84967333e-01,\n", + " 9.81074646e-02, -2.68213183e-01, 1.52153388e-01, 2.42148209e-02,\n", + " 1.24371536e-01, 6.02926640e-03, 8.22689310e-02, 2.82294262e-04,\n", + " -1.40584474e-02, 4.09389734e-02, -2.58334547e-01, -9.83026102e-02,\n", + " -1.91695184e-01, -2.61005852e-02, -2.21736208e-01, -4.36628833e-02,\n", + " 9.49840024e-02, -5.16017936e-02, 2.17577979e-01, 2.58604765e-01,\n", + " 6.33814484e-02, -7.10158283e-03, 9.87893157e-03, 
-2.26405971e-02,\n", + " 1.67435139e-01, 2.90897069e-03, 2.35914681e-02, 5.43428905e-06],\n", + " dtype=float32)], None)\n" + ] + } + ], + "metadata": {} + } + ], + "metadata": { + "orig_nbformat": 4, + "language_info": { + "name": "python", + "version": "3.8.5", + "mimetype": "text/x-python", + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "pygments_lexer": "ipython3", + "nbconvert_exporter": "python", + "file_extension": ".py" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3.8.5 64-bit" + }, + "interpreter": { + "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/examples/i2v/i2v.ipynb b/examples/i2v/i2v.ipynb new file mode 100644 index 00000000..507c994e --- /dev/null +++ b/examples/i2v/i2v.ipynb @@ -0,0 +1,192 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# I2V\n", + "\n", + "## 概述\n", + "\n", + "使用自己提供的任一预训练模型(给出模型存放路径即可)将给定的题目文本转成向量。\n", + "\n", + "- 优点:可以使用自己的模型,另可调整训练参数,灵活性强。" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 导入类" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 1, + "source": [ + "from EduNLP.I2V import D2V" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 输入\n", + "\n", + "类型:str \n", + "内容:题目文本 (text)" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 37, + "source": [ + "item = {\n", + "\"如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\SIFChoice$$\\FigureID{1}$\"\n", + "}" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 输出" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 34, + "source": [ + "model_path = 
\"../test_model/test_gensim_luna_stem_tf_d2v_256.bin\"\n", + "i2v = D2V(\"text\",\"d2v\",filepath=model_path, pretrained_t2v = False)\n", + "i2v " + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ] + }, + "metadata": {}, + "execution_count": 34 + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 35, + "source": [ + "i2v(item)" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "([array([ 4.76559885e-02, -1.60574958e-01, 1.94614579e-03, 2.40295693e-01,\n", + " 2.24517003e-01, -3.24351490e-02, 4.35789041e-02, -1.65670961e-02,\n", + " -7.77302235e-02, 4.23757173e-02, 4.62658405e-02, 7.54115507e-02,\n", + " -4.54682261e-02, -1.82153687e-01, 5.55203669e-02, 4.23391759e-02,\n", + " 8.86691213e-02, 6.97413310e-02, -2.47167766e-01, 2.54209518e-01,\n", + " -3.76413465e-02, 3.58376503e-02, -1.39907554e-01, -8.55517760e-02,\n", + " -1.62535697e-01, -4.44540828e-02, -3.99694731e-03, 1.83905549e-02,\n", + " -8.03738683e-02, -9.05910060e-02, 1.45633578e-01, 9.63102728e-02,\n", + " -7.19666481e-02, -8.49684048e-03, -1.51718438e-01, -1.46381939e-02,\n", + " 8.34727809e-02, -7.11122975e-02, 1.66607365e-01, -1.14558250e-01,\n", + " -1.72963589e-01, 4.86062802e-02, -1.63086802e-02, -3.68945636e-02,\n", + " 2.46143237e-01, 5.40899672e-03, 5.04904091e-02, 1.16586924e-01,\n", + " 7.59096816e-02, 1.20751150e-02, 1.04407202e-02, 3.19544263e-02,\n", + " -6.02783300e-02, 1.18572332e-01, -2.19343737e-01, 2.67594811e-02,\n", + " 1.01860933e-01, -2.87170410e-02, 5.16606905e-02, 1.62313670e-01,\n", + " -5.12879491e-02, -1.62193626e-02, -6.77167401e-02, 1.67254247e-02,\n", + " 1.10977821e-01, 8.02466944e-02, -2.00764649e-02, 1.28788516e-01,\n", + " -7.20706284e-02, -6.22547232e-02, 1.06899485e-01, 4.60059335e-03,\n", + " -1.99650228e-01, -1.38489634e-01, 7.20307231e-02, -4.98757213e-02,\n", + " -1.94095057e-02, -5.85906627e-03, 1.47433639e-01, 4.68258560e-02,\n", + " 9.31144804e-02, 
-4.59938832e-02, 3.38427201e-02, 4.83937971e-02,\n", + " -1.27312467e-01, 2.01561809e-01, 1.10482745e-01, -1.70595810e-01,\n", + " -9.55015421e-02, -7.73611516e-02, 4.43056040e-02, -1.65684260e-02,\n", + " 1.65379923e-02, -1.26138464e-01, 8.31304193e-02, 2.06687212e-01,\n", + " -1.69529378e-01, 3.43789416e-03, 1.19198427e-01, -1.38129979e-01,\n", + " -1.87937781e-01, -8.27087983e-02, -1.76488962e-02, 8.51018950e-02,\n", + " 8.15693215e-02, 2.30262652e-02, 1.05074964e-01, 3.13350782e-02,\n", + " 1.53877333e-01, 1.01772640e-02, 9.17675197e-02, -1.32400826e-01,\n", + " 5.29836975e-02, 2.52282787e-02, -6.19753152e-02, -5.56256585e-02,\n", + " 3.87686864e-02, 4.30755690e-02, 7.57815093e-02, 2.63280701e-02,\n", + " 4.59217802e-02, -1.17288530e-01, 1.76368475e-01, 9.27482091e-04,\n", + " 2.64808517e-02, 9.73805785e-03, 1.90501258e-01, 1.02596413e-02,\n", + " -5.55249080e-02, -1.17555618e-01, -9.98716354e-02, 1.28057361e-01,\n", + " -4.52451073e-02, 7.51599446e-02, -3.01250312e-02, 6.24186322e-02,\n", + " 5.77449016e-02, 2.07213312e-02, -2.53734970e-03, -1.69801563e-01,\n", + " -2.28750743e-02, -2.55512260e-02, 1.70693725e-01, 2.35232189e-01,\n", + " -2.71384805e-01, -1.84327438e-01, 4.16823551e-02, 8.70332569e-02,\n", + " 1.82847306e-01, 2.76729286e-01, -4.31840494e-02, -1.38212308e-01,\n", + " -3.26297544e-02, -4.25132550e-02, -1.62892416e-01, 1.91870285e-03,\n", + " 1.52552709e-01, -1.01523520e-02, -9.16219354e-02, -5.46490997e-02,\n", + " 6.06994517e-02, -6.42470419e-02, 7.96310753e-02, -5.70830703e-02,\n", + " -8.82780831e-03, -3.94574478e-02, 9.63162258e-02, 1.54309124e-01,\n", + " 1.81100428e-01, 8.63620341e-02, 1.56518817e-02, -4.08006124e-02,\n", + " 5.20652272e-02, 8.38029310e-02, -1.55516326e-01, 3.57730500e-03,\n", + " -1.50946556e-02, 2.84812655e-02, 1.37905419e-01, 8.77659023e-02,\n", + " 8.23542774e-02, -1.04377635e-01, 4.80731949e-03, 1.18891411e-02,\n", + " 9.32120830e-02, 7.88019150e-02, -1.44494563e-01, -7.53350407e-02,\n", + " -1.13602541e-01, 
5.43805361e-02, 1.64935380e-01, -2.00515296e-02,\n", + " 1.92917317e-01, -4.35359031e-02, 8.92477036e-02, -4.37481068e-02,\n", + " 4.01461311e-02, -2.59898454e-01, -1.11872263e-01, -1.25746787e-01,\n", + " -2.34577611e-01, -6.69524372e-02, 5.55978045e-02, -1.91931397e-01,\n", + " 5.87355606e-02, 1.01886272e-01, -2.64038593e-01, -2.05450356e-02,\n", + " -1.97510555e-01, 9.13371146e-02, 1.49546817e-01, -3.91026959e-02,\n", + " 5.94646595e-02, 1.29657034e-02, -3.72891256e-04, 5.56622408e-02,\n", + " 1.61776438e-01, 2.29037628e-02, -1.94774106e-01, -5.02247922e-02,\n", + " -5.45939505e-02, 5.31783216e-02, 1.26433298e-01, -1.23263724e-01,\n", + " 8.53074417e-02, -1.41412809e-01, -7.71067888e-02, 1.21865064e-01,\n", + " 4.73318882e-02, 7.20091909e-02, -9.83269960e-02, 1.99413914e-02,\n", + " -1.88907124e-02, -2.14710683e-02, -4.93260436e-02, 1.64937660e-01,\n", + " -1.07827298e-01, -7.75848776e-02, -6.23578345e-03, -1.05760902e-01,\n", + " -4.14819457e-02, 5.95730543e-02, 4.11023498e-02, -2.18305327e-02,\n", + " -2.30057724e-02, -3.34391668e-02, 1.30382255e-01, 5.10290638e-02,\n", + " -1.21569566e-01, -1.23630039e-01, -1.83883369e-01, 1.10945016e-01,\n", + " -1.05633408e-01, -8.24846700e-02, -3.76710802e-01, -4.50239740e-02],\n", + " dtype=float32)],\n", + " None)" + ] + }, + "metadata": {}, + "execution_count": 35 + } + ], + "metadata": {} + } + ], + "metadata": { + "orig_nbformat": 4, + "language_info": { + "name": "python", + "version": "3.8.5", + "mimetype": "text/x-python", + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "pygments_lexer": "ipython3", + "nbconvert_exporter": "python", + "file_extension": ".py" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3.8.5 64-bit" + }, + "interpreter": { + "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/examples/sif/sif.ipynb b/examples/sif/sif.ipynb index 
2076e126..25affe58 100644 --- a/examples/sif/sif.ipynb +++ b/examples/sif/sif.ipynb @@ -7,7 +7,7 @@ "\n", "## 概述\n", "\n", - "SIFSci 是一个提供试题切分和标注的模块。它可定制化的将文本切分为令牌(token)序列,为后续试题的向量化做准备。" + "SIF4Sci 是一个提供试题切分和标注的模块。它可定制化的将文本切分为令牌(token)序列,为后续试题的向量化做准备。" ], "metadata": { "collapsed": true, @@ -19,7 +19,7 @@ { "cell_type": "markdown", "source": [ - "本文将以下面这道题目(来源自 LUNA 题库)为例,展示 SIFSci 的使用方法。 \n", + "本文将以下面这道题目(来源自 LUNA 题库)为例,展示 SIF4Sci 的使用方法。 \n", "\n", "![Figure](../../asset/_static/item.png)" ], @@ -1015,4 +1015,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} \ No newline at end of file +} diff --git a/examples/sif/sif4sci.ipynb b/examples/sif/sif4sci.ipynb new file mode 100644 index 00000000..2076e126 --- /dev/null +++ b/examples/sif/sif4sci.ipynb @@ -0,0 +1,1018 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# SIF4Sci 使用示例\n", + "\n", + "## 概述\n", + "\n", + "SIFSci 是一个提供试题切分和标注的模块。它可定制化的将文本切分为令牌(token)序列,为后续试题的向量化做准备。" + ], + "metadata": { + "collapsed": true, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "本文将以下面这道题目(来源自 LUNA 题库)为例,展示 SIFSci 的使用方法。 \n", + "\n", + "![Figure](../../asset/_static/item.png)" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "- 符合 [SIF 格式](https://edunlp.readthedocs.io/en/docs_dev/tutorial/zh/sif.html) 的题目录入格式为:" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 5, + "source": [ + "item = {\n", + " \"stem\": r\"如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\SIFChoice$$\\FigureID{1}$\",\n", + " \"options\": [\"$p_1=p_2$\", \"$p_1=p_3$\", \"$p_2=p_3$\", \"$p_1=p_2+p_3$\"]\n", + "}\n", + "item[\"stem\"]" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, 
$AC$.$\\\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\\\SIFChoice$$\\\\FigureID{1}$'" + ] + }, + "metadata": {}, + "execution_count": 5 + } + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "- 加载图片:`$\\\\FigureID{1}$`" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 6, + "source": [ + "from PIL import Image\n", + "img = Image.open(\"../../asset/_static/item_figure.png\")\n", + "figures = {\"1\": img}\n", + "img" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ], + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAOYAAACICAYAAADzlJeRAAAe3klEQVR4nO2df1RT9/3/nyE/SWJyBdSUOgs72mKP1XhK24j2CGon4FagbtTMtpZt6rDzR7rjKt+2C+rnHGxdBbr2xB9zgc4j1NMJbpXg2hlsp2DnjrSHVaqcgquWbFqNLUgQkvf3DwgiSSA/7uUm8f04530O3Ny87zu5ed7X+8fr/XoJCCEEFAolrIjhuwEUCsUTKkwKJQyhwqRQwhAR3w2IBJqamuBwOHy+rtFokJKSMo4tokQ7VJgAGhoa0NzcDLvdjn//+99oa2uDw+FAa2srAIBhmFHf73K58O233wIAYmNj8cADDwAAZs+ejblz50Kr1UKn00Emk3H6OSjRg+Bum5Vtbm5Gc3MzrFYrPvnkE7S2toJhGPT09KC3t5f160mlUsTGxqKrqwsMw2D69OlYunQpdDodMjMzWb8eJTqIemHa7XbU19fjT3/6E/72t79BoVDA6XSiq6uL13YxDIOuri48/vjjyM/PR2ZmJpKSknhtEyV8iEph2mw21NbW4tChQzh16hSkUulQVzMckcvliImJgUwmQ3Z2NgwGA7RaLd/NovBIVAmzuroar732Gs6fPw8AuHnzJs8tCg61Wg2FQoFXX30VK1asGHOMS4k+Il6YNpsNf/jDH7Bz504ACGvLGChyuRxOpxOLFy/G+vXr6Zj0LiJihdnU1IQ33ngDdXV1cLlcoy5nRAMMw0AkEmHnzp14/vnn+W4OhWMiTpjNzc0oLCxEa2sr7HY7380Zd5RKJWQyGRVolBMxwrTZbFi/fj0++OAD3Lhxg+/m8A4VaHQT9sK02+347W9/i8rKyqgaP7KFUqmESqXCn//8Z+h0Or6bQ2GJsPaV3bdvH5KTk7Fv3z4qSh90dXXh66+/RlZWFjZs2HBXdu+jEhKGdHZ2Ep1OR9RqNQFAi59FJpORSZMmEYvFwuLdqCNlZRdYrI/iD2FnMfft24dZs2ahqamJjiUDxOFw4MqVK9Dr9XjqqadYsJ5tKJ+fjUNsNI4SGHw/Gdxcv36d5OXlUSvJUhGLxWTSpEmksbEx6HtyoSyNACBr6li80RS/CAuL2dDQgPvvvx/vv/8+tZIs0dfXhytXruBHP/oRSkpKAq+grRw78QrK0thvG2VseJ+VLS0txbZt2+ikBYeoVCrMmzcP1dXVfrr3WbB2/nlsPnk/dgr+Dw9eOImN07lrX0NDA4Db+17/+c9/4uuvv/Z67uzZs5GcnAwASEpKuqNEFXya65/97GeE
YRjeu313QxGLxSQ5OZmcO3duzPtSt2YNqSOEkAtlJA2Df7NAT08PsVqtxGg0Ep1OR2JjYwkAwjAMYRiGiESigD6TUqkkDMMQpVJJABCtVkuys7OJ0WgMqQsfDvAizOvXr5NHH3106MbQMn4lLi6OHDlyxOe9cY8rb5fQhHnu3Dny8ssvk+TkZCISicbtQcwwDImNjSUZGRnEZDKR9vb2ED7F+DPuwmxvbycpKSlEIpHw/iO9W8uUKVPIO++843lzLpSRNcOXRurWEKSVkUAXSxobG0lhYSFJSEggKpUqYEvIdpHL5USpVJLExERiMpnI9evXQ/kJjwvjKsyzZ8+SxMRE3n+YtIBMnDiRvP3227dvzoUykjZChBfK0vwWZk9PDzGbzSQxMTGshydyuZwoFAqSnZ3N8novu4ybMM+ePUs0Gg3vN4aW2yUuLm5InAPH0siAwbxAytKGn+u7O9vZ2Uk2bdpEJkyYMDTWi5TCMAxJSEggZrN5nFTgP+MizPb2dmopw7TExcWRjRs3BnxPOzs7ycqVK4lcLue9qxpqUSqVYSdQzoXZ3t5OkpKSeP/yafFdGIbxW5w9PT3EaDSS+Ph4IhaLeW87m8Ut0JqaGk414Q+cCpOKMnIKwzBk165do97PqqoqotFoiEKh4L29XBa1Wk10Oh2vM7mcCfP69eskJSWF9y+ZFv/LlClTvC6luDcVqFQq3ts4noVhGGI0GklPTw9XMvEJJ54/DocDjzzyCFpaWtiuOuqRy+WQSCQecW6Tk5OhVqvvODcnJwcA8NFHH+G///0vrl27ht7eXvT19QUdnnPKlCloaGgYiixfX1+PZ555Bt98802QnyiyUSgUiI+Px5EjR8Y1ciEnwnzqqafw/vvvo6+vj+2qowaGYYYiuLtFl5OTM5RuQavVshodr6GhAXa7Hc3Nzaivr4fNZsNXX30FAIiJiUF/f//QuSqVCrt27cJf/vIXfPTRR9RdEkB8fDxKSkqwevXqcbke68Kkvq+eiEQiKJVKdHV1ITU1FUuXLkV6enpY5Dxxi7WjowN//etfceLECXz33XdwuVx3iJUyEFZ00aJFOHjwIPfpLtjsFzc2NpKEhATexwZ8F7frmUgkIjqdjpSUlESU7+bhw4eJRqMharWayOVyIpVKSUxMDO/fazgUsVhMZsyY4ZfPcSiwJszOzk4ybdo03r84vopMJiNKpZJotdqIE+JwTCYTSU9PJ+3t7aSqqoqsWLGCnDt3jphMJpKZmUkkEknULZMEU6ZMmULOnj3L2X1gRZg9PT0kNTWV9y+Lj6JWq0liYiLZvn17xDlKD6enp4esWLGC/PKXvyQ9PT3k448/Jh9//DHZtGkTKS0tveM8s9k8tDtEKBTyfg/4KhqNhrMHMCvCNBqNUb+2NVKMKpWKPPfcc5w+NceLc+fOEa1WS6qqqoYE6S49PT0kMzOTWK1Wj/e1t7eT7du3k8TERCKXy3m/L3yUhISEUXfrBEvIwmxvbw9rp2U2i1KpJNnZ2WHhGcIWVVVVRKfTkXPnznmI0l06OzuJVqslnZ2dPus5e/Ysee6554hSqSRSqZT3ezWexdf6byiEPCs7b948NDU1hVJFWCMSiSCXy5Gbm4utW7dGzU55h8OBoqIi2Gw2mEymMdecRSIRioqKYLFYRp2RdDgceOutt1BSUoLu7m5Oco6GI1OmTMHevXvx5JNPslJfSMLct28fNm/eHJVxemQyGWQyGX7yk59g27Zt0Gg0fDeJNWw2G/R6PXJycpCamur3+86cOYOLFy+itLR0zHMdDgcqKirw6quvoru7Gz09PaE0OSKIj4/Hhx9+yIojQtDCtNlsmDt3Lmw2W8iNCCcUCgViYmKwbt06bNmyJepS4NXX18NgMMBsNge1Trl//34sXLgwoLQMFRUV2Lx5M3p6etDd3R3wNSOJpKQkWK3WkHtWQUfJW7duXVS5aYnFYjAMg+LiYvzvf//Djh07ok6UxcXFKC8vR2Nj
Y9DOAyaTCZWVlQENX55//nlcuXIFBw4cQHJyMmJjY4O6diTQ0dGBH/zgByFnnwvKYjY3NyM9PT1qurAMw2DZsmV46623ok6MwIB3T15eHhYuXIglS5aEXN/UqVORl5cHq9Ua1Pf1u9/9Djt27IDdbofT6Qy5PeGGRCKBVqvF6dOng64jKGEuWrQIVqs16IuGCxMmTMC9994Ls9kctQl5mpqaUFBQAJPJBJFIxFq9/f392Lp1a9C/A5vNhnXr1uH48eNR84AfjkqlwpYtW1BUVBTU+wMWZjRYS7FYDIVCgW3btmH9+vV8N4czysrKcOTIEVRVVaGtrY31+v/xj3/gxo0bwQWUHqShoQErV67E9evXo26CKJTJoIDHmC+++GJEi5JhGOTn56O9vT1qRWm326HX63Hx4kVYLBZORAkACxYsQGtrK2pra4OuIz09HZcvX8ZvfvMbxMfHs9e4MOCbb75BTk5OUOPNgCxmJFtLsViMyZMn47333ovabisAtLa2oqCgABs3bsTUqVM5v96sWbOQlZUFs9kc8k4Zm82GvLw8fPbZZ7h58yZLLeQXsViM/Px8HDhwIKD3BWQxI9VaMgyDRYsWoaWlJapFWVFRAb1eD7PZPC6iBICWlhaYzWbo9fqQZyI1Gg0aGxuxefPmqJmE6+vrw9GjR1FfXx/Q+/y2mE1NTcjKyoq4fZZxcXF45ZVXYDAY+G4KZzgcDhgMBjgcDphMJpw5c2bc23D16lVUVlaipqaGlfqamprw5JNP4sqVK6zUxzfJycn48ssv/T7fb4tpMpkiSpRisRjTpk3DyZMno1qUHR0dyMjIwJw5c/Dzn/+cF1ECQEJCAlJSUrBjxw5W6tPpdDh//jx0Oh3kcjkrdfLJ1atX8fvf/97v8/0SpsPhwKFDkZO+dOLEifjhD3+IL774gvcIAVxSW1uLvLw8mEwmzJo1i+/mYNmyZTh27NhQ9q5QYRgmarq23333HYqLi/3u7vslzOrqalbXwLgkISEB5eXlOHz4MPfhH3ikqKgIlZWVsFqtQQfe4oKamhoYDAZ0dHSwVmdxcTEsFgsSExNZq5MPent7/e5R+DXGnDlzJlpbW0NuGNdoNBpUVVUhPT2d76ZwhtsBfenSpViwYAHfzfGKvztRAqWjowNZWVkR8Vv0hUqlwhdffDHmpogxLWZra6vPJKLhRGJiIiwWS1SL0j0BZzQaw1aUwIBX0KpVq1BYWMhqvUlJSWhsbERqamrE9OBG4nQ6/Vo6GVOYe/bsCfs1pXvvvRcnT54c17if482OHTtgMBhgsVgi4kc5ffp0yGQy7N69m9V6GYbBxx9/jLS0tIh0hu/u7kZJScmYY80xhXngwIGwDWMokUiQkpKClpaWqNnAPBK3A/qNGzewc+dOzrx4uKC0tBTvvvsu6xvpZTIZTpw4gSeeeMIjCHYk4HQ6x1zXHHWM2dHRgYceeiisJhfcyOVyzJkzB3V1dRE/Y+eL5uZmFBQUwGg0IiEhge/mBMXUqVOh1+tRU1PDyWbzTZs2oaKiIuIcX8Za1xzVYtbX18PlcrHeqFCRSCSYM2cOjh8/HrWi3L17NwoLC1FTUxOxogSAS5cuoaSkBAUFBSF7BnmjrKwMRqMREydOZL1uLrl27Rqam5t9vj6qMA8dOhSW48v7778fx48fj8rlEIfDgYKCAnz66aewWq24dOkS300KGZFIhKVLlwa9BWosDAYDnnvuuYjq1nZ3d6OystLn6z67sg6HAxMnTuTkKRcKKSkpaGxsjEpL6XZAX7t2LaZPn853c1jn7bffRk5ODlasWMFJ/Xq9HrW1tWH3m/VFQkKCT5dDnxazqakp7CzStGnTgt41H+7U1tYObWiORlECgNlsxmuvvcbZOmRVVRUeffTRiJi1BgaWlXxNjPkU5ocffhhWkz6JiYk4ceJEVEWrc2MwGFBZWQmLxRJW3znbnDlzBjU1NdDr9Zz5XR87dgyPPPIIxGIxJ/WzSVdXF959
912vr/nsyoZT+BCNRgOLxRJ165Tu/YdPP/10QGEkIx22d6KMxG63Y968eRHhIeSrO+vTYv7rX//itEH+kpCQgJqamqgTZUNDAzIyMlBaWnpXiRIYuKdz5sxBcXExJ/UzDAOr1Ypp06ZxUj+bOBwOr37FXoVpt9vDIoK2SqXCiy++GHWbm4uLi7F169aQwkhGOkuWLMHp06cD3kDsLxqNBseOHUNcXBwn9bOJt904XoXZ3NzMu7uTWCyGVqvlbIqdD+x2O7KysgAA27dvHzMtQbRTVVWFoqIiVneiDMe9P1SlUnFSPxt0dXXhk08+8TjuU5h8RyybPHkyjhw5wmsb2KSpqQkZGRnYuHEjK7Fdo4GWlhZUVVUhLy+PsyWO1atXY/HixWE9U+ut1+BVmGfPnuW1K5uQkID33nsvapZFdu/ejaKiItTU1ECpVPLdnLDi6tWreOmll1BQUMDZNQ4ePBjWezm/+uorjweTV2F+9tln49Igb0TTuNLhcECv1+PTTz+FxWKJCi8eLpg6dSo0Gg3Kyso4qV8mk4X1eFOpVHq453kVJl8/ILFYjMWLF0fFuLK1tRXz5s1DTk4OVq5cyVssnkihpKSE1bAkIwnn8abL5fIYZ/ucleWD+Ph4HDx4kJdrs0l1dTX0ej2qqqrGLYxkpHPmzBmYzWYYDAbOMsi5x5tCoZCT+oOlt7fX4zPfIUzi6se3Xd9BoIoF1ApgHD3yGIaB2WwOOzfAQHA4HCgsLMSRI0fQ2NiIq1ev8t2kiKKtrQ0mk4mVGLW++OMf/xh2Xdre3l6cP3/+jmN3CFMQI8K1K/+DTCwGbnYD4+QLLBaL8dhjjyEzM3N8LsgB7ng0c+bMwQsvvEC7rkHS39+PnJwczoYzDMPgjTfeCLsu7UgvJY+ubMfFi4iJCTptZlCoVCpUVFSM6zXZpL6+Hnl5eSgpKQmLMJKRTmpqKux2O2e/iWeffRbf//73Oak7WDo7O+/432Nxx27/dvAvMaCWDPzpugV81wfIZIBUCPQOWlOxGJAPntM7zMIOPw4ncGOY6R3x2gSXGEaj0dM5nThxy+kCAQBBDCRCIQQjj0MAsUiEGBD09/fDCcHA+MHZDycAxIggFbhunx8jgjRGENAXNhbFxcU4ffo0rFbrXe8wwCYmkwlZWVlISUnhZIbebDZj8eLFuHbtGut1B8PIYY+HabTbrw9GLegDbt4C4BwQJQA4nED/rUEBigfGoDe6gRu3AKkMEA8elwsHusI3uoF+IaBwe/p7vjbpnslesm4R9DsJYoRiSEUiCOGCk4w8LoY4hqDPRQAIIBLGQAACp9OJGJEYUmEMBK5+9LoA8dD/TrAVj8FutyMjIwMA8PLLL1NRsox7MqiwsJCTyUitVouVK1eG7S6U0fusfX1AL24LSyYEnH23/46RDEwSqSUAhIBw8LjLCQyehlu3AJHw9nuGvaaWSlFd9a6nWIgTzhghRAJgQHTigb+JE06BAMJBoxcjGCk2AYRC0bAPJYAwZtDSDuIKOE2vJ01NTZg3bx6MRiP14uGQS5cuobS0FHl5eZzU//rrr2PChAmc1B0qvoXpFpbDOSAsMQCh884Jof5bgxZzsLhfixllOnrwNZFIhPzlyzFn9hzv5xF3d3XkceL9+DhRVlYGg8EAq9Ua1m5e0QKXYUlkMhn2798flt5YQ8J0ufrQ6yIDyxXEOeyUQaspV9y2lsCgYCXDllQGu7YOJwDh7eNCIdDvvP2ewddUKhXeLC8FiRF6eToIICDu7isAkAFLJxBCCIL+IbNHfLyffdxhJC9evBhxYSQjHTYS5PoiNzc3LCfshn7TAggAVz/iJ09GTP+ICRKHE8AIa4k+oNcJSBWD3Vnh4OuDY1P3cbET6O67/Z6bt6BQxWHz/9sCgUQGydBkDEF/fx/6BwUoEQrgdPaht78Pvf1OYKhbGwO4+geOuwSD7yfod7pAQOB09sM15v+B0dzcjKysLKxatQrLly8P8N0UNuAy
LInJZAo7q+kRwaChoQF5eXkjBtxiQNbH2rpmXFwcLl++HBHOBBUVFdizZw/MZjN1GOCZhIQE6PV6NDY2sv7b4Ttix8hIBv71AmVCwDn2af6gUChQVFQU9qJ0e/GcOHECVquVijIMuHr1KoxGI/R6Pet179q1i1erOdJ100OYdwhGMdgdheP2ZFCISKVS/OpXv2KnMo4Il2SwFE/YTpDrRqvVYtKkSazWGQgjDZWHMDUaze1wF90jZltDJBKsZbglg6V4wnaCXDe7du3iLXv1mBYzKSmJsxCK4W4twzUZLMUTLhLk5ubmYsqUKazVFwgjrbXXMSYXDr7hbC1tNhvmzZsHtVqNX//619SLJwJoaWmByWRiPScKH1ZTLpdj9uzZdxzzKsz4+HjWLy4UCvHMM8+wXm+oNDQ0ICsrC6WlpWGdDJbiCRcJcnNzczF58mTW6vMHiUTiEZ7VqzC5MOcPP/xw2EVR37FjB7Zu3QqLxXLXhpGMdLhIkPvSSy+Nq1dXT08PUlJS7jjmVZhs77pXq9XYsGEDq3WGwvBksNu3b6dePBEO2wlyc3NzIZFIxj6RJaRSqUfgOa/CfOCBB1h9YgiFwrDZBN3c3IyMjAysWrUKy5Yt47s5FBZgOyyJRqPBY489xkLL/OPBBx/0OOZVmAsWLGBtsVUkEmH58uVhMekTLclgKZ6wnSB3w4YNkEqlLLRsdEQiERYvXuxx3KswdTodawGfFQoF1q1bx0pdwRKNyWApnrC5EyUzM3NcjIlSqfS6ddCrMBmGYW1m9p577uE1IZA7jOTChQtpGMm7gNTUVNhsNlRXV4dUj0wmQ05ODkut8o3D4fAaocGnrywbYpLL5di4cWPI9QRLdXU1CgoKUFVVFbXJYCmesLUTpbCwkHOrqdVqvV7DpzCfeOKJkPvYTqeTs7Teo+FwOGAwGHDkyBFYLBbqgH6XwVaCXJ1Ox2mEA6lUiqefftrraz6FqdPpQs74NXPmzHHPP2Kz2ZCVlYX77rsPL7zwAvXiuUu5dOkSjEZjyDlR1qxZw1KLPJFKpT5XK0btyjqdwe/1EolE474c4U4GW1JSctclg6V4wkaC3CVLlnAWsCs+Pt7DscCNT2HKZDI8/vjjQV/U12wTV9BksBRvhJogV6fTcZJSYazVilE3Sj/77LNBO7Q7nc5xydhFk8FSxiKUBLkymQxz585lv1EAfvGLX/h8bVRhZmZmBt2dTU1N5XxGyx1GkiaDpYxGqAly8/PzIRCwGyh84cKFo86/jCpMhmHw0EMPBXxRqVSKJ598MuD3BUJZWRmKiopgtVrDLpASJfwIJUFueno6q+NMhmG8BDm/kzFj/qxatSrg/WmjzTaFit1uh16vx8WLF2GxWKgDOsVvgk2Qy7aDzMSJE8fUx5jCzM3NDfjCEonE52xTKLS2tiIrKws5OTlYvnw59eKhBEywCXLZGmeq1Wrs2rVrzPPGFGYwnvaPPvpoQOf7gzsZrNlspslgKUET7E6UH//4x6xcPy4uzi9j51f4yg0bNkCtVvt1YZFIFNIyy0hoMlgK2wSTIHfJkiUhp6f011oCXgI+++Lee+/F119/7dfFKyoqguoCj6SjowMFBQV4+umnacQ6CuucOXMGFy9eRGlp6ZjnOhwOyOVy+CkXryQnJ+PLL7/061y/HwEGgwEKhWLM8wghrIwvaTJYCtcEkiBXJpOFtPynVqtx+PBhv8/3W5ijLYYO5+bNmyELs6ioCOXl5bBardSLh8IpJpMJlZWVfoUlCTZmlUgkQn5+fkCzu34Lk2EY/PSnPx0z5Mj3vvc9vy8+EpvNhoyMDKjVapoMljIuBJIgN9iZWZVKhTfffDOg9wQ0ml23bt2Y3dmZM2cG1AA3TU1NyMrKgtFopGEkKeOKvwlytVptwB5AarUa+/fvD7gbHJAwtVot8vPzfVrNYGdk3clgLRYLTQZL4QV/wpI88sgjAdUpFouxaNGioCZCA57/3bZtm09PIIVCEdD4
kiaDpYQTYyXITUlJCchi3nPPPTh48GBQbQlYmBqNBqtWrfJqmgOZkaXJYCnhyGhhSZKSkvxeLomLi8OxY8eCnskNasX09ddf92o1XS6XXw2pqKhAYWEhzGYzDSNJCStaWlpgNpt9Oh/4EwharVZjx44dIa1OBCVMmUyGoqIij4kgh8Mx6pSyO4wkTQZLCWdGS5A7Vhws97hy9erVIbXBb8+fkTgcDsyePRsXLly447iv6lpbW1FQUIC1a9fSiHWUiODo0aNQq9XYsmXL0LH4+Hhcu3bN6/kSiQQPP/wwjh8/HvJe5KCd/2QyGd555507Nnv6inZQW1uLgoICmEwmKkpKxOAtQe5oET1mz57NiiiBEIQJDMRDefbZZ4ca4q3/bTAYUFlZCYvFQpPBUiKOkQlyfW2YTklJwQcffMBa1I7Q3OUxMBHk9vYZvh3LnQz2vvvuo8lgKRHLyAS53nZZpaSkoLGxkdVQrSELUyaT4dChQ3eY+OHJYGkYSUqkMzxB7vAJT3dAALZFCbAgTGDAIygtLQ0zZsygyWApUYk7Qa5UKoVQKERsbCy0Wi0nogQCEKZlrQACwYgyvxxuX50lS5bAarWitbUVf//736kXDyXqKC0txX/+8x/I5XLk5OTg9OnTnGUaCGi5pK18PmZ8/grIniwAFqwVZGPvmjqQPVk4cOAAtmzZgsuXL7PeSK1Wy8oXkJ6ePvR3RUVFUHFGKRSRSIS+vj5OrxGAMNtQPn8GPn+FYM9AfGVY1gqQjbpBoVIo0U9HRwc6OjrueMgDACxrIcjeO/TvmjqCPViLtdiDYOTh/xiz7SgOnVqD3MGLtJXPR/beNJRtpqKk3D0kJSWNEGUbyucLIMgG6ggBGSy5tQIIslvw4IzgruO3MNuOHsIp7EX24PhyFSpByElspP4ClLuWgV7kJpThAtmD4SYqa08d1mAW7g9SH34Ksw1HD53CmrrBJ8KFMmDTDMwvpxM8lLuXtvJV2HQqDWWVG+Gpvxl4sCwXwfYn/RtjtpVj/ozP8cqwp4JlrQDZe9egbsSTgkK5Oxi0lrO4mWPxy2K2HT2EU2uGq78N51sApD2IILvQFEqEcwGfnwLSgh1EjoEfcTwGu7Gv3JalZe0MbDoFrKnzZsIplLuAtvPg0sl0TIspEAyIcG/2bceC7L1pKLtAgpoGplCigun3YxaAU59f8Pqypfy2800wBL0fk0K56xlcu0wru4CTQ8sTnuv9wUCFSaGEQls55s/YhFNDB9JQdiH0ZUQqTAolDGFldwmFQmEXKkwKJQz5/6GFNae7ZI7lAAAAAElFTkSuQmCC" + }, + "metadata": {}, + "execution_count": 6 + } + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "## 导入模块" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 2, + "source": [ + "from EduNLP.SIF import sif4sci, is_sif, to_sif" + ], + "outputs": [], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "## 验证题目格式" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 7, + "source": [ + "is_sif(item['stem'])" + ], + "outputs": [ + { + 
"output_type": "execute_result", + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": {}, + "execution_count": 7 + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "- 若发现题目因为公式没有包含在 `$$` 中而不符合 SIF 格式,则可以使用 `to_sif` 模块转成标准格式。示例如下:" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 8, + "source": [ + "text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...'\n", + "is_sif(text)" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "False" + ] + }, + "metadata": {}, + "execution_count": 8 + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 9, + "source": [ + "text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...'\n", + "to_sif(text)\n" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'某校一个课外学习小组为研究某作物的发芽率$y$和温度$x$(单位...'" + ] + }, + "metadata": {}, + "execution_count": 9 + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 题目切分及令牌化\n", + "\n", + "现在我们得到了符合标准格式的题目文本,接下来可以对题目做进一步的预训练,例如:切分和令牌化。" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "### 题目切分" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "#### 基本切分\n", + "分离文本、公式、图片和特殊符号。" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 12, + "source": [ + "segments = sif4sci(item[\"stem\"], figures=figures, tokenization=False)\n", + "segments" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形', 'ABC', '的斜边', 'BC', ', 直角边', 'AB', ', ', 'AC', '.', '\\\\bigtriangleup ABC', '的三边所围成的区域记为', 'I', ',黑色部分记为', 'II', ', 其余部分记为', 'III', '.在整个图形中随机取一点,此点取自', 'I,II,III', '的概率分别记为', 'p_1,p_2,p_3', ',则', '\\\\SIFChoice', \\FigureID{1}]" + ] + }, + "metadata": {}, + 
"execution_count": 12 + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "- 文本部分" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 13, + "source": [ + "segments.text_segments" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形',\n", + " '的斜边',\n", + " ', 直角边',\n", + " ', ',\n", + " '.',\n", + " '的三边所围成的区域记为',\n", + " ',黑色部分记为',\n", + " ', 其余部分记为',\n", + " '.在整个图形中随机取一点,此点取自',\n", + " '的概率分别记为',\n", + " ',则']" + ] + }, + "metadata": {}, + "execution_count": 13 + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "- 公式部分" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 15, + "source": [ + "segments.formula_segments\n" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['ABC',\n", + " 'BC',\n", + " 'AB',\n", + " 'AC',\n", + " '\\\\bigtriangleup ABC',\n", + " 'I',\n", + " 'II',\n", + " 'III',\n", + " 'I,II,III',\n", + " 'p_1,p_2,p_3']" + ] + }, + "metadata": {}, + "execution_count": 15 + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "- 图片部分" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 16, + "source": [ + "segments.figure_segments" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[\\FigureID{1}]" + ] + }, + "metadata": {}, + "execution_count": 16 + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 17, + "source": [ + "segments.figure_segments[0].figure" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ], + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAOYAAACICAYAAADzlJeRAAAe3klEQVR4nO2df1RT9/3/nyE/SWJyBdSUOgs72mKP1XhK24j2CGon4FagbtTMtpZt6rDzR7rjKt+2C+rnHGxdBbr2xB9zgc4j1NMJbpXg2hlsp2DnjrSHVaqcgquWbFqNLUgQkvf3DwgiSSA/7uUm8f04530O3Ny87zu5ed7X+8fr/XoJCCEEFAolrIjhuwEUCsUTKkwKJQyhwqRQwhAR3w2IBJqamuBwOHy+rtFokJKSMo4tokQ7VJgAGhoa0NzcDLvdjn//+99oa2uDw+FAa2srAIBhmFHf73K58O233wIAYmNj8cADDwAAZs+ejblz50Kr1UKn00Emk3H6OSjRg+Bum5Vtbm5Gc3MzrFYrPvnkE7S2toJhGPT09KC3t5f160mlUsTGxqKrqwsMw2D69OlYunQpdDodMjMzWb8eJTqIemHa7XbU19fjT3/6E/72t79BoVDA6XSiq6uL13YxDIOuri48/vjjyM/PR2ZmJpKSknhtEyV8iEph2mw21NbW4tChQzh16hSkUulQVzMckcvliImJgUwmQ3Z2NgwGA7RaLd/NovBIVAmzuroar732Gs6fPw8AuHnzJs8tCg61Wg2FQoFXX30VK1asGHOMS4k+Il6YNpsNf/jDH7Bz504ACGvLGChyuRxOpxOLFy/G+vXr6Zj0LiJihdnU1IQ33ngDdXV1cLlcoy5nRAMMw0AkEmHnzp14/vnn+W4OhWMiTpjNzc0oLCxEa2sr7HY7380Zd5RKJWQyGRVolBMxwrTZbFi/fj0++OAD3Lhxg+/m8A4VaHQT9sK02+347W9/i8rKyqgaP7KFUqmESqXCn//8Z+h0Or6bQ2GJsPaV3bdvH5KTk7Fv3z4qSh90dXXh66+/RlZWFjZs2HBXdu+jEhKGdHZ2Ep1OR9RqNQFAi59FJpORSZMmEYvFwuLdqCNlZRdYrI/iD2FnMfft24dZs2ahqamJjiUDxOFw4MqVK9Dr9XjqqadYsJ5tKJ+fjUNsNI4SGHw/Gdxcv36d5OXlUSvJUhGLxWTSpEmksbEx6HtyoSyNACBr6li80RS/CAuL2dDQgPvvvx/vv/8+tZIs0dfXhytXruBHP/oRSkpKAq+grRw78QrK0thvG2VseJ+VLS0txbZt2+ikBYeoVCrMmzcP1dXVfrr3WbB2/nlsPnk/dgr+Dw9eOImN07lrX0NDA4Db+17/+c9/4uuvv/Z67uzZs5GcnAwASEpKuqNEFXya65/97GeEYRjeu313QxGLxSQ5OZmcO3duzPtSt2YNqSOEkAtlJA2Df7NAT08PsVqtxGg0Ep1OR2JjYwkAwjAMYRiGiESigD6TUqkkDMMQpVJJABCtVkuys7OJ0WgMqQsfDvAizOvXr5NHH3106MbQMn4lLi6OHDlyxOe9cY8rb5fQhHnu3Dny8ssvk+TkZCISicbtQcwwDImNjSUZGRnEZDKR9vb2ED7F+DPuwmxvbycpKSlEIpHw/iO9W8uUKVPIO++843lzLpSRNcOXRurWEKSVkUAXSxobG0lhYSFJSEggKpUqYEvIdpHL5USpVJLExERiMpnI9evXQ/kJjwvjKsyzZ8+SxMRE3n+YtIBMnDiRvP3227dvzoUykjZChBfK0vwWZk9PDzGbzSQxMTGshydyuZwoFAqSnZ3N8novu4ybMM+ePUs0Gg3vN4aW2yUuLm5InAPH0siAwbxAytKGn+u7O9vZ2Uk2bdpEJkyYMDTWi5TCMAxJSEggZrN5nFTgP+MizPb2dmopw7TExcWRjRs3BnxPOzs7ycqVK4lcLue9qxpqUSqVYSdQzoXZ3t5OkpKSeP/yafFdGIbxW5w9PT3EaDSS+Ph4IhaLeW87m8Ut0JqaGk414Q+cCpOKMnIKwzBk165do97PqqoqotFoiEKh4L29XBa1Wk10Oh2vM7mcCfP69eskJSWF9y+ZFv/LlClTvC6luDc
VqFQq3ts4noVhGGI0GklPTw9XMvEJJ54/DocDjzzyCFpaWtiuOuqRy+WQSCQecW6Tk5OhVqvvODcnJwcA8NFHH+G///0vrl27ht7eXvT19QUdnnPKlCloaGgYiixfX1+PZ555Bt98802QnyiyUSgUiI+Px5EjR8Y1ciEnwnzqqafw/vvvo6+vj+2qowaGYYYiuLtFl5OTM5RuQavVshodr6GhAXa7Hc3Nzaivr4fNZsNXX30FAIiJiUF/f//QuSqVCrt27cJf/vIXfPTRR9RdEkB8fDxKSkqwevXqcbke68Kkvq+eiEQiKJVKdHV1ITU1FUuXLkV6enpY5Dxxi7WjowN//etfceLECXz33XdwuVx3iJUyEFZ00aJFOHjwIPfpLtjsFzc2NpKEhATexwZ8F7frmUgkIjqdjpSUlESU7+bhw4eJRqMharWayOVyIpVKSUxMDO/fazgUsVhMZsyY4ZfPcSiwJszOzk4ybdo03r84vopMJiNKpZJotdqIE+JwTCYTSU9PJ+3t7aSqqoqsWLGCnDt3jphMJpKZmUkkEknULZMEU6ZMmULOnj3L2X1gRZg9PT0kNTWV9y+Lj6JWq0liYiLZvn17xDlKD6enp4esWLGC/PKXvyQ9PT3k448/Jh9//DHZtGkTKS0tveM8s9k8tDtEKBTyfg/4KhqNhrMHMCvCNBqNUb+2NVKMKpWKPPfcc5w+NceLc+fOEa1WS6qqqoYE6S49PT0kMzOTWK1Wj/e1t7eT7du3k8TERCKXy3m/L3yUhISEUXfrBEvIwmxvbw9rp2U2i1KpJNnZ2WHhGcIWVVVVRKfTkXPnznmI0l06OzuJVqslnZ2dPus5e/Ysee6554hSqSRSqZT3ezWexdf6byiEPCs7b948NDU1hVJFWCMSiSCXy5Gbm4utW7dGzU55h8OBoqIi2Gw2mEymMdecRSIRioqKYLFYRp2RdDgceOutt1BSUoLu7m5Oco6GI1OmTMHevXvx5JNPslJfSMLct28fNm/eHJVxemQyGWQyGX7yk59g27Zt0Gg0fDeJNWw2G/R6PXJycpCamur3+86cOYOLFy+itLR0zHMdDgcqKirw6quvoru7Gz09PaE0OSKIj4/Hhx9+yIojQtDCtNlsmDt3Lmw2W8iNCCcUCgViYmKwbt06bNmyJepS4NXX18NgMMBsNge1Trl//34sXLgwoLQMFRUV2Lx5M3p6etDd3R3wNSOJpKQkWK3WkHtWQUfJW7duXVS5aYnFYjAMg+LiYvzvf//Djh07ok6UxcXFKC8vR2NjY9DOAyaTCZWVlQENX55//nlcuXIFBw4cQHJyMmJjY4O6diTQ0dGBH/zgByFnnwvKYjY3NyM9PT1qurAMw2DZsmV46623ok6MwIB3T15eHhYuXIglS5aEXN/UqVORl5cHq9Ua1Pf1u9/9Djt27IDdbofT6Qy5PeGGRCKBVqvF6dOng64jKGEuWrQIVqs16IuGCxMmTMC9994Ls9kctQl5mpqaUFBQAJPJBJFIxFq9/f392Lp1a9C/A5vNhnXr1uH48eNR84AfjkqlwpYtW1BUVBTU+wMWZjRYS7FYDIVCgW3btmH9+vV8N4czysrKcOTIEVRVVaGtrY31+v/xj3/gxo0bwQWUHqShoQErV67E9evXo26CKJTJoIDHmC+++GJEi5JhGOTn56O9vT1qRWm326HX63Hx4kVYLBZORAkACxYsQGtrK2pra4OuIz09HZcvX8ZvfvMbxMfHs9e4MOCbb75BTk5OUOPNgCxmJFtLsViMyZMn47333ovabisAtLa2oqCgABs3bsTUqVM5v96sWbOQlZUFs9kc8k4Zm82GvLw8fPbZZ7h58yZLLeQXsViM/Px8HDhwIKD3BWQxI9VaMgyDRYsWoaWlJapFWVFRAb1eD7PZPC6iBICWlhaYzWbo9fqQZyI1Gg0aGxuxefPmqJmE6+vrw9GjR1FfXx/Q+/y2mE1NTcjKyoq4fZZxcXF45ZV
XYDAY+G4KZzgcDhgMBjgcDphMJpw5c2bc23D16lVUVlaipqaGlfqamprw5JNP4sqVK6zUxzfJycn48ssv/T7fb4tpMpkiSpRisRjTpk3DyZMno1qUHR0dyMjIwJw5c/Dzn/+cF1ECQEJCAlJSUrBjxw5W6tPpdDh//jx0Oh3kcjkrdfLJ1atX8fvf/97v8/0SpsPhwKFDkZO+dOLEifjhD3+IL774gvcIAVxSW1uLvLw8mEwmzJo1i+/mYNmyZTh27NhQ9q5QYRgmarq23333HYqLi/3u7vslzOrqalbXwLgkISEB5eXlOHz4MPfhH3ikqKgIlZWVsFqtQQfe4oKamhoYDAZ0dHSwVmdxcTEsFgsSExNZq5MPent7/e5R+DXGnDlzJlpbW0NuGNdoNBpUVVUhPT2d76ZwhtsBfenSpViwYAHfzfGKvztRAqWjowNZWVkR8Vv0hUqlwhdffDHmpogxLWZra6vPJKLhRGJiIiwWS1SL0j0BZzQaw1aUwIBX0KpVq1BYWMhqvUlJSWhsbERqamrE9OBG4nQ6/Vo6GVOYe/bsCfs1pXvvvRcnT54c17if482OHTtgMBhgsVgi4kc5ffp0yGQy7N69m9V6GYbBxx9/jLS0tIh0hu/u7kZJScmYY80xhXngwIGwDWMokUiQkpKClpaWqNnAPBK3A/qNGzewc+dOzrx4uKC0tBTvvvsu6xvpZTIZTpw4gSeeeMIjCHYk4HQ6x1zXHHWM2dHRgYceeiisJhfcyOVyzJkzB3V1dRE/Y+eL5uZmFBQUwGg0IiEhge/mBMXUqVOh1+tRU1PDyWbzTZs2oaKiIuIcX8Za1xzVYtbX18PlcrHeqFCRSCSYM2cOjh8/HrWi3L17NwoLC1FTUxOxogSAS5cuoaSkBAUFBSF7BnmjrKwMRqMREydOZL1uLrl27Rqam5t9vj6qMA8dOhSW48v7778fx48fj8rlEIfDgYKCAnz66aewWq24dOkS300KGZFIhKVLlwa9BWosDAYDnnvuuYjq1nZ3d6OystLn6z67sg6HAxMnTuTkKRcKKSkpaGxsjEpL6XZAX7t2LaZPn853c1jn7bffRk5ODlasWMFJ/Xq9HrW1tWH3m/VFQkKCT5dDnxazqakp7CzStGnTgt41H+7U1tYObWiORlECgNlsxmuvvcbZOmRVVRUeffTRiJi1BgaWlXxNjPkU5ocffhhWkz6JiYk4ceJEVEWrc2MwGFBZWQmLxRJW3znbnDlzBjU1NdDr9Zz5XR87dgyPPPIIxGIxJ/WzSVdXF959912vr/nsyoZT+BCNRgOLxRJ165Tu/YdPP/10QGEkIx22d6KMxG63Y968eRHhIeSrO+vTYv7rX//itEH+kpCQgJqamqgTZUNDAzIyMlBaWnpXiRIYuKdz5sxBcXExJ/UzDAOr1Ypp06ZxUj+bOBwOr37FXoVpt9vDIoK2SqXCiy++GHWbm4uLi7F169aQwkhGOkuWLMHp06cD3kDsLxqNBseOHUNcXBwn9bOJt904XoXZ3NzMu7uTWCyGVqvlbIqdD+x2O7KysgAA27dvHzMtQbRTVVWFoqIiVneiDMe9P1SlUnFSPxt0dXXhk08+8TjuU5h8RyybPHkyjhw5wmsb2KSpqQkZGRnYuHEjK7Fdo4GWlhZUVVUhLy+PsyWO1atXY/HixWE9U+ut1+BVmGfPnuW1K5uQkID33nsvapZFdu/ejaKiItTU1ECpVPLdnLDi6tWreOmll1BQUMDZNQ4ePBjWezm/+uorjweTV2F+9tln49Igb0TTuNLhcECv1+PTTz+FxWKJCi8eLpg6dSo0Gg3Kyso4qV8mk4X1eFOpVHq453kVJl8/ILFYjMWLF0fFuLK1tRXz5s1DTk4OVq5cyVssnkihpKSE1bAkIwnn8abL5fIYZ/ucleWD+Ph4HDx4kJdrs0l1dTX0ej2qqqrGLYxkpHPmzBmYzWYYDAbOMsi5x5tCoZCT+oOlt7fX4zPfIUz
i6se3Xd9BoIoF1ApgHD3yGIaB2WwOOzfAQHA4HCgsLMSRI0fQ2NiIq1ev8t2kiKKtrQ0mk4mVGLW++OMf/xh2Xdre3l6cP3/+jmN3CFMQI8K1K/+DTCwGbnYD4+QLLBaL8dhjjyEzM3N8LsgB7ng0c+bMwQsvvEC7rkHS39+PnJwczoYzDMPgjTfeCLsu7UgvJY+ubMfFi4iJCTptZlCoVCpUVFSM6zXZpL6+Hnl5eSgpKQmLMJKRTmpqKux2O2e/iWeffRbf//73Oak7WDo7O+/432Nxx27/dvAvMaCWDPzpugV81wfIZIBUCPQOWlOxGJAPntM7zMIOPw4ncGOY6R3x2gSXGEaj0dM5nThxy+kCAQBBDCRCIQQjj0MAsUiEGBD09/fDCcHA+MHZDycAxIggFbhunx8jgjRGENAXNhbFxcU4ffo0rFbrXe8wwCYmkwlZWVlISUnhZIbebDZj8eLFuHbtGut1B8PIYY+HabTbrw9GLegDbt4C4BwQJQA4nED/rUEBigfGoDe6gRu3AKkMEA8elwsHusI3uoF+IaBwe/p7vjbpnslesm4R9DsJYoRiSEUiCOGCk4w8LoY4hqDPRQAIIBLGQAACp9OJGJEYUmEMBK5+9LoA8dD/TrAVj8FutyMjIwMA8PLLL1NRsox7MqiwsJCTyUitVouVK1eG7S6U0fusfX1AL24LSyYEnH23/46RDEwSqSUAhIBw8LjLCQyehlu3AJHw9nuGvaaWSlFd9a6nWIgTzhghRAJgQHTigb+JE06BAMJBoxcjGCk2AYRC0bAPJYAwZtDSDuIKOE2vJ01NTZg3bx6MRiP14uGQS5cuobS0FHl5eZzU//rrr2PChAmc1B0qvoXpFpbDOSAsMQCh884Jof5bgxZzsLhfixllOnrwNZFIhPzlyzFn9hzv5xF3d3XkceL9+DhRVlYGg8EAq9Ua1m5e0QKXYUlkMhn2798flt5YQ8J0ufrQ6yIDyxXEOeyUQaspV9y2lsCgYCXDllQGu7YOJwDh7eNCIdDvvP2ewddUKhXeLC8FiRF6eToIICDu7isAkAFLJxBCCIL+IbNHfLyffdxhJC9evBhxYSQjHTYS5PoiNzc3LCfshn7TAggAVz/iJ09GTP+ICRKHE8AIa4k+oNcJSBWD3Vnh4OuDY1P3cbET6O67/Z6bt6BQxWHz/9sCgUQGydBkDEF/fx/6BwUoEQrgdPaht78Pvf1OYKhbGwO4+geOuwSD7yfod7pAQOB09sM15v+B0dzcjKysLKxatQrLly8P8N0UNuAyLInJZAo7q+kRwaChoQF5eXkjBtxiQNbH2rpmXFwcLl++HBHOBBUVFdizZw/MZjN1GOCZhIQE6PV6NDY2sv7b4Ttix8hIBv71AmVCwDn2af6gUChQVFQU9qJ0e/GcOHECVquVijIMuHr1KoxGI/R6Pet179q1i1erOdJ100OYdwhGMdgdheP2ZFCISKVS/OpXv2KnMo4Il2SwFE/YTpDrRqvVYtKkSazWGQgjDZWHMDUaze1wF90jZltDJBKsZbglg6V4wnaCXDe7du3iLXv1mBYzKSmJsxCK4W4twzUZLMUTLhLk5ubmYsqUKazVFwgjrbXXMSYXDr7hbC1tNhvmzZsHtVqNX//619SLJwJoaWmByWRiPScKH1ZTLpdj9uzZdxzzKsz4+HjWLy4UCvHMM8+wXm+oNDQ0ICsrC6WlpWGdDJbiCRcJcnNzczF58mTW6vMHiUTiEZ7VqzC5MOcPP/xw2EVR37FjB7Zu3QqLxXLXhpGMdLhIkPvSSy+Nq1dXT08PUlJS7jjmVZhs77pXq9XYsGEDq3WGwvBksNu3b6dePBEO2wlyc3NzIZFIxj6RJaRSqUfgOa/CfOCBB1h9YgiFwrDZBN3c3IyMjAysWrUKy5Yt47s5FBZgOyyJRqPBY489xkLL/OPBBx/0OOZVmAsWLGBtsVUkEmH58uVhMekTLclgKZ6wnSB3w4YNkEqlLLR
sdEQiERYvXuxx3KswdTodawGfFQoF1q1bx0pdwRKNyWApnrC5EyUzM3NcjIlSqfS6ddCrMBmGYW1m9p577uE1IZA7jOTChQtpGMm7gNTUVNhsNlRXV4dUj0wmQ05ODkut8o3D4fAaocGnrywbYpLL5di4cWPI9QRLdXU1CgoKUFVVFbXJYCmesLUTpbCwkHOrqdVqvV7DpzCfeOKJkPvYTqeTs7Teo+FwOGAwGHDkyBFYLBbqgH6XwVaCXJ1Ox2mEA6lUiqefftrraz6FqdPpQs74NXPmzHHPP2Kz2ZCVlYX77rsPL7zwAvXiuUu5dOkSjEZjyDlR1qxZw1KLPJFKpT5XK0btyjqdwe/1EolE474c4U4GW1JSctclg6V4wkaC3CVLlnAWsCs+Pt7DscCNT2HKZDI8/vjjQV/U12wTV9BksBRvhJogV6fTcZJSYazVilE3Sj/77LNBO7Q7nc5xydhFk8FSxiKUBLkymQxz585lv1EAfvGLX/h8bVRhZmZmBt2dTU1N5XxGyx1GkiaDpYxGqAly8/PzIRCwGyh84cKFo86/jCpMhmHw0EMPBXxRqVSKJ598MuD3BUJZWRmKiopgtVrDLpASJfwIJUFueno6q+NMhmG8BDm/kzFj/qxatSrg/WmjzTaFit1uh16vx8WLF2GxWKgDOsVvgk2Qy7aDzMSJE8fUx5jCzM3NDfjCEonE52xTKLS2tiIrKws5OTlYvnw59eKhBEywCXLZGmeq1Wrs2rVrzPPGFGYwnvaPPvpoQOf7gzsZrNlspslgKUET7E6UH//4x6xcPy4uzi9j51f4yg0bNkCtVvt1YZFIFNIyy0hoMlgK2wSTIHfJkiUhp6f011oCXgI+++Lee+/F119/7dfFKyoqguoCj6SjowMFBQV4+umnacQ6CuucOXMGFy9eRGlp6ZjnOhwOyOVy+CkXryQnJ+PLL7/061y/HwEGgwEKhWLM8wghrIwvaTJYCtcEkiBXJpOFtPynVqtx+PBhv8/3W5ijLYYO5+bNmyELs6ioCOXl5bBardSLh8IpJpMJlZWVfoUlCTZmlUgkQn5+fkCzu34Lk2EY/PSnPx0z5Mj3vvc9vy8+EpvNhoyMDKjVapoMljIuBJIgN9iZWZVKhTfffDOg9wQ0ml23bt2Y3dmZM2cG1AA3TU1NyMrKgtFopGEkKeOKvwlytVptwB5AarUa+/fvD7gbHJAwtVot8vPzfVrNYGdk3clgLRYLTQZL4QV/wpI88sgjAdUpFouxaNGioCZCA57/3bZtm09PIIVCEdD4kiaDpYQTYyXITUlJCchi3nPPPTh48GBQbQlYmBqNBqtWrfJqmgOZkaXJYCnhyGhhSZKSkvxeLomLi8OxY8eCnskNasX09ddf92o1XS6XXw2pqKhAYWEhzGYzDSNJCStaWlpgNpt9Oh/4EwharVZjx44dIa1OBCVMmUyGoqIij4kgh8Mx6pSyO4wkTQZLCWdGS5A7Vhws97hy9erVIbXBb8+fkTgcDsyePRsXLly447iv6lpbW1FQUIC1a9fSiHWUiODo0aNQq9XYsmXL0LH4+Hhcu3bN6/kSiQQPP/wwjh8/HvJe5KCd/2QyGd555507Nnv6inZQW1uLgoICmEwmKkpKxOAtQe5oET1mz57NiiiBEIQJDMRDefbZZ4ca4q3/bTAYUFlZCYvFQpPBUiKOkQlyfW2YTklJwQcffMBa1I7Q3OUxMBHk9vYZvh3LnQz2vvvuo8lgKRHLyAS53nZZpaSkoLGxkdVQrSELUyaT4dChQ3eY+OHJYGkYSUqkMzxB7vAJT3dAALZFCbAgTGDAIygtLQ0zZsygyWApUYk7Qa5UKoVQKERsbCy0Wi0nogQCEKZlrQACwYgyvxxuX50lS5bAarWitbUVf//736kXDyXqKC0txX/+8x/I5XLk5OTg9OnTnGUaCGi5pK18PmZ8/grIniwAFqwVZGPvmjqQPVk4cOAAtmzZgsuXL7PeSK1Wy8o
XkJ6ePvR3RUVFUHFGKRSRSIS+vj5OrxGAMNtQPn8GPn+FYM9AfGVY1gqQjbpBoVIo0U9HRwc6OjrueMgDACxrIcjeO/TvmjqCPViLtdiDYOTh/xiz7SgOnVqD3MGLtJXPR/beNJRtpqKk3D0kJSWNEGUbyucLIMgG6ggBGSy5tQIIslvw4IzgruO3MNuOHsIp7EX24PhyFSpByElspP4ClLuWgV7kJpThAtmD4SYqa08d1mAW7g9SH34Ksw1HD53CmrrBJ8KFMmDTDMwvpxM8lLuXtvJV2HQqDWWVG+Gpvxl4sCwXwfYn/RtjtpVj/ozP8cqwp4JlrQDZe9egbsSTgkK5Oxi0lrO4mWPxy2K2HT2EU2uGq78N51sApD2IILvQFEqEcwGfnwLSgh1EjoEfcTwGu7Gv3JalZe0MbDoFrKnzZsIplLuAtvPg0sl0TIspEAyIcG/2bceC7L1pKLtAgpoGplCigun3YxaAU59f8Pqypfy2800wBL0fk0K56xlcu0wru4CTQ8sTnuv9wUCFSaGEQls55s/YhFNDB9JQdiH0ZUQqTAolDGFldwmFQmEXKkwKJQz5/6GFNae7ZI7lAAAAAElFTkSuQmCC" + }, + "metadata": {}, + "execution_count": 17 + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "- 特殊符号" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 19, + "source": [ + "segments.ques_mark_segments" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['\\\\SIFChoice']" + ] + }, + "metadata": {}, + "execution_count": 19 + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "#### 标记化切分 \n", + "如果您不注重题目文本和公式的具体内容,仅仅是对题目的整体(或部分)构成感兴趣,那么可以通过修改 `symbol` 参数来将不同的成分转化成特定标记,方便您的研究。" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + " - symbol:\n", + " - \"t\": text\n", + " - \"f\": formula\n", + " - \"g\": figure\n", + " - \"m\": question mark" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 11, + "source": [ + "sif4sci(item[\"stem\"], figures=figures, tokenization=False, symbol=\"tfgm\")" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[MARK]', '[FIGURE]']" + ] + }, + "metadata": {}, + "execution_count": 11 + } + ], + 
"metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "### 令牌化\n", + "\n", + "为了方便后续向量化表征试题,本模块提供题目文本的令牌化解析(Tokenization),即将题目转换成令牌序列。 \n", + "\n", + "根据构成题目的元素类型,解析功能分为 **“文本解析”** 和 **“公式解析”** 两部分。更具体的过程解析参见 [令牌化](../Tokenizer/tokenizer.ipynb)。" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 20, + "source": [ + "tokens = sif4sci(item[\"stem\"], figures=figures, tokenization=True)" + ], + "outputs": [], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "- 文本解析结果" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 12, + "source": [ + "tokens.text_tokens" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['如图',\n", + " '古希腊',\n", + " '数学家',\n", + " '希波',\n", + " '克拉底',\n", + " '研究',\n", + " '几何图形',\n", + " '此图',\n", + " '三个',\n", + " '半圆',\n", + " '三个',\n", + " '半圆',\n", + " '直径',\n", + " '直角三角形',\n", + " '斜边',\n", + " '直角',\n", + " '三边',\n", + " '围成',\n", + " '区域',\n", + " '记',\n", + " '黑色',\n", + " '记',\n", + " '其余部分',\n", + " '记',\n", + " '图形',\n", + " '中',\n", + " '随机',\n", + " '取',\n", + " '一点',\n", + " '此点',\n", + " '取自',\n", + " '概率',\n", + " '记']" + ] + }, + "metadata": {}, + "execution_count": 12 + } + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "#### 公式解析结果" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 13, + "source": [ + "tokens.formula_tokens" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['ABC',\n", + " 'BC',\n", + " 'AB',\n", + " 'AC',\n", + " 
'\\\\bigtriangleup',\n", + " 'ABC',\n", + " 'I',\n", + " 'II',\n", + " 'III',\n", + " 'I',\n", + " ',',\n", + " 'II',\n", + " ',',\n", + " 'III',\n", + " 'p',\n", + " '_',\n", + " '1',\n", + " ',',\n", + " 'p',\n", + " '_',\n", + " '2',\n", + " ',',\n", + " 'p',\n", + " '_',\n", + " '3']" + ] + }, + "metadata": {}, + "execution_count": 13 + } + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "- 自定义参数,得到定制化解析结果" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "(1)如果您想按 latex 语法标记拆分公式的各个部分,并得到顺序序列结果,输出方法(`method`)可以选择:`linear`" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 37, + "source": [ + "sif4sci(\n", + " item[\"stem\"],\n", + " figures=figures,\n", + " tokenization=True,\n", + " tokenization_params={\n", + " \"formula_params\": {\n", + " \"method\": \"linear\",\n", + " }\n", + " }\n", + ").formula_tokens" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['ABC',\n", + " 'BC',\n", + " 'AB',\n", + " 'AC',\n", + " '\\\\bigtriangleup',\n", + " 'ABC',\n", + " 'I',\n", + " 'II',\n", + " 'III',\n", + " 'I',\n", + " ',',\n", + " 'II',\n", + " ',',\n", + " 'III',\n", + " 'p',\n", + " '_',\n", + " '1',\n", + " ',',\n", + " 'p',\n", + " '_',\n", + " '2',\n", + " ',',\n", + " 'p',\n", + " '_',\n", + " '3']" + ] + }, + "metadata": {}, + "execution_count": 37 + } + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "(2) 如果您想得到公式解析出的语法分析树序列,输出方法可以选择:`ast`\n", + "> 抽象语法分析树,简称语法树(Syntax tree),是源代码语法结构的一种抽象表示。它以树状的形式表现编程语言的语法结构,树上的每个节点都表示源代码中的一种结构。 \n", + "> 因此,ast 可以看做是公式的语法结构表征。" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 39, + "source": [ + "sif4sci(\n", + " item[\"stem\"],\n", + " figures=figures,\n", + " tokenization=True,\n", + " tokenization_params={\n", + " 
\"formula_params\":{\n", + " \"method\": \"ast\",\n", + " }\n", + " }\n", + ").formula_tokens\n" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ]" + ] + }, + "metadata": {}, + "execution_count": 39 + } + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "- 语法树展示:" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 109, + "source": [ + "f = sif4sci(\n", + " item[\"stem\"],\n", + " figures=figures,\n", + " tokenization=True,\n", + " tokenization_params={\n", + " \"formula_params\":{\n", + " \"method\": \"ast\",\n", + " \"return_type\": \"ast\",\n", + " \"ord2token\": True,\n", + " \"var_numbering\": True,\n", + " }\n", + " }\n", + ").formula_tokens\n", + "f\n" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ]" + ] + }, + "metadata": {}, + "execution_count": 109 + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 110, + "source": [ + "for i in range(0, len(f)):\n", + " ForestPlotter().export(\n", + " f[i], root_list=[node for node in f[i]],\n", + " )\n", + "# plt.show()\n" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "(3)如果您只是关心公式的结构和类型,并不关心变量具体是什么,比如二元二次方程 `x^2 + y = 1` ,它从公式结构和类型上来说,和 `w^2 + z = 1` 没有区别。 \n", + "此时,您可以设置如下参数:`ord2token = True`,将公式变量名转换成 token" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 40, + "source": [ + "sif4sci(\n", + " item[\"stem\"],\n", + " figures=figures,\n", + " tokenization=True,\n", + " tokenization_params={\n", + " \"formula_params\":{\n", + " \"method\": \"ast\",\n", + " \"return_type\": \"list\",\n", + " \"ord2token\": True,\n", + " }\n", + " }\n", + 
").formula_tokens" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " '\\\\bigtriangleup',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " ',',\n", + " 'mathord',\n", + " 'mathord',\n", + " ',',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'textord',\n", + " '\\\\supsub',\n", + " ',',\n", + " 'mathord',\n", + " 'textord',\n", + " '\\\\supsub',\n", + " ',',\n", + " 'mathord',\n", + " 'textord',\n", + " '\\\\supsub']" + ] + }, + "metadata": {}, + "execution_count": 40 + } + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "(4) 如果您除了 (3) 中提供的功能之外,还需要区分不同的变量。此时可以另外设置参数:`var_numbering=True`" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 44, + "source": [ + "sif4sci(\n", + " item[\"stem\"],\n", + " figures=figures,\n", + " tokenization=True,\n", + " tokenization_params={\n", + " \"formula_params\":{\n", + " \"method\": \"ast\",\n", + " \"ord2token\": True,\n", + " \"return_type\": \"list\",\n", + " \"var_numbering\": True\n", + " }\n", + " }\n", + ").formula_tokens" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['mathord_0',\n", + " 'mathord_1',\n", + " 'mathord_2',\n", + " 'mathord_1',\n", + " 'mathord_2',\n", + " 'mathord_0',\n", + " 'mathord_1',\n", + " 'mathord_0',\n", + " 'mathord_2',\n", + " '\\\\bigtriangleup',\n", + " 'mathord_0',\n", + " 'mathord_1',\n", + " 'mathord_2',\n", + " 'mathord_3',\n", + " 'mathord_3',\n", + " 'mathord_3',\n", + " 'mathord_3',\n", + " 'mathord_3',\n", + " 'mathord_3',\n", + " 
'mathord_3',\n", + " ',',\n", + " 'mathord_3',\n", + " 'mathord_3',\n", + " ',',\n", + " 'mathord_3',\n", + " 'mathord_3',\n", + " 'mathord_3',\n", + " 'mathord_4',\n", + " 'textord',\n", + " '\\\\supsub',\n", + " ',',\n", + " 'mathord_4',\n", + " 'textord',\n", + " '\\\\supsub',\n", + " ',',\n", + " 'mathord_4',\n", + " 'textord',\n", + " '\\\\supsub']" + ] + }, + "metadata": {}, + "execution_count": 44 + } + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "## 综合训练\n", + "\n", + "综合上述方法,将题目转换成令牌序列,为后续向量化做准备。" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 96, + "source": [ + "sif4sci(item[\"stem\"], figures=figures, tokenization=True,\n", + " symbol=\"fgm\")" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['如图', '古希腊', '数学家', '希波', '克拉底', '研究', '几何图形', '此图', '三个', '半圆', '三个', '半圆', '直径', '直角三角形', '[FORMULA]', '斜边', '[FORMULA]', '直角', '[FORMULA]', '[FORMULA]', '[FORMULA]', '三边', '围成', '区域', '记', '[FORMULA]', '黑色', '记', '[FORMULA]', '其余部分', '记', '[FORMULA]', '图形', '中', '随机', '取', '一点', '此点', '取自', '[FORMULA]', '概率', '记', '[FORMULA]', '[MARK]', '[FIGURE]']" + ] + }, + "metadata": {}, + "execution_count": 96 + } + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + } + ], + "metadata": { + "kernelspec": { + "name": "python3", + "display_name": "Python 3.8.5 64-bit" + }, + "language_info": { + "name": "python", + "version": "3.8.5", + "mimetype": "text/x-python", + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "pygments_lexer": "ipython3", + "nbconvert_exporter": "python", + "file_extension": ".py" + }, + "interpreter": { + "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git 
a/examples/t2v/get_pretrained_t2v.ipynb b/examples/t2v/get_pretrained_t2v.ipynb new file mode 100644 index 00000000..801bcf87 --- /dev/null +++ b/examples/t2v/get_pretrained_t2v.ipynb @@ -0,0 +1,161 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# get_pretrained_t2v\n", + "\n", + "## 概述\n", + "\n", + "使用 EduNLP 项目组给定的预训练模型将一组题目的切分序列表征为向量。\n", + "\n", + "- 优点:简单方便。\n", + "- 缺点:只能使用项目中给定的模型,局限性较大。" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 导入功能块" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 1, + "source": [ + "from tqdm import tqdm\n", + "from EduNLP.SIF.segment import seg\n", + "from EduNLP.SIF.tokenization import tokenize\n", + "from EduNLP.Pretrain import GensimWordTokenizer\n", + "from EduNLP.Vector import get_pretrained_t2v" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 输入\n", + "\n", + "类型:list \n", + "内容:一个题组中每个题目切分序列的组合。\n", + "> 这里需要调用 `GensimWordTokenizer` 将题目文本(`str` 类型)转换成 tokens。" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 2, + "source": [ + "def load_items():\n", + " test_items = [\n", + " {'ques_content':'有公式$\\\\FormFigureID{wrong1?}$和公式$\\\\FormFigureBase64{wrong2?}$,如图$\\\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,若$x,y$满足约束条件$\\\\SIFSep$,则$z=x+7 y$的最大值为$\\\\SIFBlank$'},\n", + " {'ques_content':'如图$\\\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,若$x,y$满足约束条件$\\\\SIFSep$,则$z=x+7 y$的最大值为$\\\\SIFBlank$'},\n", + " {'ques_content':'
Below is a discussion on a website.
t2v\n", + "t2v = get_pretrained_t2v(\"d2v_sci_256\")" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "downloader, INFO http://base.ustc.edu.cn/data/model_zoo/EduNLP/d2v/general_science_256.zip is saved as /home/lvrui/.EduNLP/model/general_science_256.zip\n", + "downloader, INFO file existed, skipped\n" + ] + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "- 注意:\n", + " 默认的 EduNLP 项目存储地址为根目录(`~/.EduNLP`),模型存储地址为项目存储地址下的 `model` 文件夹。您可以通过修改下面的环境变量来修改模型存储地址:\n", + " - EduNLP 项目存储地址:`EDUNLPPATH = xx/xx/xx`\n", + " - 模型存储地址:`EDUNLPMODELPATH = xx/xx/xx`" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "t2v(token_items)" + ], + "outputs": [], + "metadata": {} + } + ], + "metadata": { + "orig_nbformat": 4, + "language_info": { + "name": "python", + "version": "3.8.5", + "mimetype": "text/x-python", + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "pygments_lexer": "ipython3", + "nbconvert_exporter": "python", + "file_extension": ".py" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3.8.5 64-bit" + }, + "interpreter": { + "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/examples/t2v/t2v.ipynb b/examples/t2v/t2v.ipynb new file mode 100644 index 00000000..908ff182 --- /dev/null +++ b/examples/t2v/t2v.ipynb @@ -0,0 +1,261 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# T2V\n", + "\n", + "## 概述\n", + "\n", + "使用自己提供的任一预训练模型(给出模型存放路径即可)将一组题目的切分序列表征为向量。\n", + "\n", + "- 优点:模型及其参数可自主调整,灵活性强。\n" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 导入功能块" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 12, + "source": [ + "from tqdm import tqdm\n", + "from EduNLP.SIF.segment import seg\n", + "from EduNLP.SIF.tokenization import tokenize\n", + 
"from EduNLP.Pretrain import GensimWordTokenizer\n", + "from EduNLP.Vector import T2V" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 输入\n", + "\n", + "类型:list \n", + "内容:一个题组中每个题目切分序列的组合。\n", + "> 这里需要调用 `GensimWordTokenizer` 将题目文本(`str` 类型)转换成 tokens。" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 11, + "source": [ + "print(type(token_items))\n", + "print(type(token_items[0]))" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "\n" + ] + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 6, + "source": [ + "token_items[0]" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['公式',\n", + " '[FORMULA]',\n", + " '公式',\n", + " '[FORMULA]',\n", + " '如图',\n", + " '[FIGURE]',\n", + " 'x',\n", + " ',',\n", + " 'y',\n", + " '约束条件',\n", + " '[SEP]',\n", + " 'z',\n", + " '=',\n", + " 'x',\n", + " '+',\n", + " '7',\n", + " 'y',\n", + " '最大值',\n", + " '[MARK]']" + ] + }, + "metadata": {}, + "execution_count": 6 + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 输出" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 4, + "source": [ + "path = \"../test_model/test_gensim_luna_stem_tf_d2v_256.bin\"\n", + "t2v = T2V('d2v',filepath=path)\n", + "t2v(token_items)" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[array([ 0.0256574 , 0.06061139, -0.00121044, -0.0167674 , -0.0111706 ,\n", + " 0.05325712, -0.02097339, -0.01613594, 0.02904145, 0.0185046 ,\n", + " 0.03473525, 0.00628165, 0.03696947, 0.00666153, -0.02352318,\n", + " -0.00458236, 0.02308686, -0.02153478, 0.01579256, -0.01575841,\n", + " -0.02654778, 0.01376328, 0.02539059, -0.01098955, 0.02203193,\n", + " -0.01503642, 0.01310026, -0.03569775, -0.00450978, 0.02522727,\n", + " -0.01547103, -0.00907244, -0.00072009, -0.0021727 , 0.02894731,\n", + 
" 0.01382611, 0.01647377, 0.00452782, -0.02488854, 0.02741116,\n", + " 0.0489724 , -0.04156181, -0.00855933, 0.01783935, 0.00704233,\n", + " 0.01296936, -0.06078439, -0.04922014, -0.0206639 , 0.00820663,\n", + " 0.02565274, 0.0164784 , 0.00996537, -0.02215545, 0.06741589,\n", + " 0.01634789, -0.0094168 , 0.00183323, 0.00853508, -0.0547929 ,\n", + " 0.00405556, 0.01386227, -0.04204945, 0.02175955, -0.01960315,\n", + " -0.05279269, -0.01511251, -0.02905018, -0.00405249, 0.03328003,\n", + " -0.00487469, -0.00338632, 0.01793213, 0.00942458, -0.02468935,\n", + " 0.03548338, -0.00907473, 0.00927462, -0.02545504, 0.02286367,\n", + " -0.01822809, 0.03625014, -0.00976438, -0.00188348, 0.06408882,\n", + " -0.04314236, -0.00193059, 0.02433112, -0.0091018 , 0.0276503 ,\n", + " -0.0036342 , -0.02485391, 0.02309245, 0.01880057, -0.00893952,\n", + " -0.03391525, 0.02678591, -0.00618519, -0.03601262, 0.0327184 ,\n", + " 0.09240578, 0.03631649, -0.00700663, -0.01786321, -0.02987848,\n", + " 0.00315695, -0.02082208, -0.00494443, -0.02717963, -0.00938541,\n", + " -0.0329605 , 0.0069218 , 0.01227082, 0.00856757, -0.0008222 ,\n", + " -0.0067637 , -0.01577486, 0.0628339 , -0.02329138, -0.00475964,\n", + " 0.02197625, 0.03022351, 0.00256966, -0.00247619, -0.01218352,\n", + " 0.01257284, 0.0051926 , -0.05297434, -0.0057066 , 0.01031242,\n", + " 0.02414824, -0.0115857 , 0.01625632, -0.03126714, -0.02389767,\n", + " -0.01417263, 0.02280749, -0.01431546, -0.00771551, 0.0264634 ,\n", + " 0.00115387, -0.01903204, -0.00100629, 0.00608774, 0.03787961,\n", + " 0.05098663, 0.03064756, -0.00654223, -0.01838502, -0.01889201,\n", + " 0.04686983, -0.02295219, -0.00901293, 0.00916024, -0.00013042,\n", + " 0.01236307, -0.00918534, 0.01792936, 0.00862702, -0.00018518,\n", + " -0.00566689, 0.00499178, 0.0246148 , -0.0170825 , 0.01850726,\n", + " 0.00031357, 0.02411471, 0.01080729, -0.01361136, -0.06226439,\n", + " 0.01830878, 0.01209503, -0.00980596, -0.01865078, 0.03692432,\n", + " -0.04503555, 0.0037965 
, -0.04214804, -0.05657932, -0.01566005,\n", + " 0.00271924, -0.00026349, -0.00783886, 0.01218421, -0.03205092,\n", + " -0.02793218, -0.00298462, 0.00380523, 0.04471321, -0.02079478,\n", + " 0.0100926 , 0.00450996, -0.03412817, 0.03027697, 0.00872989,\n", + " 0.01512562, 0.01527565, 0.03683509, 0.05608684, 0.01055199,\n", + " 0.01637757, -0.01995301, -0.01610573, 0.04207385, 0.00058077,\n", + " 0.03846577, 0.04952911, -0.02142448, 0.0049874 , -0.00308159,\n", + " -0.02233348, 0.02013967, -0.01194606, -0.02481469, 0.01824989,\n", + " -0.00939436, -0.00374474, 0.02278485, 0.04107878, 0.01870474,\n", + " -0.00310527, -0.00257802, -0.03689042, -0.0200304 , -0.04838364,\n", + " 0.0035307 , 0.02496746, -0.0385387 , 0.01649689, 0.01429029,\n", + " 0.04338812, -0.05614391, -0.01632982, 0.03378268, 0.01393604,\n", + " -0.03859077, 0.01855484, 0.00241599, -0.00985778, 0.00530987,\n", + " 0.03700508, -0.06107654, -0.00972089, 0.02251891, 0.01154722,\n", + " 0.00913082, -0.0267815 , -0.01723521, 0.0136464 , 0.01965802,\n", + " 0.04769301, -0.02218902, -0.01268643, 0.00650465, 0.00985247,\n", + " 0.0029873 ], dtype=float32),\n", + " array([ 0.00877787, 0.03242666, -0.00026327, -0.01881958, -0.00730135,\n", + " 0.03559063, -0.01825701, -0.01065201, 0.01681685, 0.01074173,\n", + " 0.02253641, 0.0082016 , 0.02200216, 0.00088347, -0.0205142 ,\n", + " -0.01339685, 0.01239092, -0.01781665, 0.01000167, -0.01227449,\n", + " -0.03044926, 0.00296532, 0.01440197, -0.01035894, 0.01061506,\n", + " -0.00530907, 0.00484147, -0.02209524, 0.00735557, 0.01712263,\n", + " -0.00231011, -0.01255511, -0.00114341, -0.01413104, 0.02112199,\n", + " 0.01123461, 0.01380601, -0.00019924, -0.02128731, 0.01526375,\n", + " 0.02988552, -0.02491145, -0.00939747, 0.00798917, 0.0135474 ,\n", + " 0.01258122, -0.03753063, -0.04039029, -0.01517935, 0.00668549,\n", + " 0.02796665, 0.01242495, 0.0059546 , -0.01216253, 0.0372387 ,\n", + " 0.01762399, -0.00170241, 0.0003667 , 0.00895109, -0.03517802,\n", + " 
-0.00762667, 0.01357641, -0.02436312, 0.01829541, -0.01330634,\n", + " -0.02818829, -0.01139517, -0.01664645, 0.00769452, 0.01209339,\n", + " -0.00416979, -0.01296107, -0.0064631 , 0.0050506 , -0.01833598,\n", + " 0.02872021, -0.00062401, 0.0109796 , -0.01280711, 0.01152301,\n", + " -0.01085931, 0.02023655, 0.00272896, -0.00558658, 0.03704501,\n", + " -0.01837787, -0.00414707, 0.00713773, -0.01023714, 0.0090292 ,\n", + " 0.00089387, -0.01082103, 0.02051528, 0.01287969, -0.0074691 ,\n", + " -0.01942614, 0.01223695, -0.0136801 , -0.01567431, 0.01466064,\n", + " 0.04967042, 0.02889016, -0.005946 , -0.00131571, -0.0110809 ,\n", + " 0.00165396, -0.01279759, -0.01407798, -0.01902512, -0.01361593,\n", + " -0.00631681, -0.00142478, 0.01678663, 0.00815052, -0.00193329,\n", + " -0.00845464, -0.00746565, 0.03766166, -0.01099476, 0.00489809,\n", + " 0.01403449, 0.01477709, -0.00150515, 0.00462877, -0.01271886,\n", + " 0.00072193, 0.00815068, -0.04432011, -0.00604029, -0.00264471,\n", + " 0.01325564, -0.01315497, 0.00713541, -0.0137267 , -0.01845939,\n", + " -0.02801731, 0.01673851, -0.00593479, -0.01457028, 0.01636872,\n", + " -0.00751132, -0.01056858, 0.01126528, 0.01645665, 0.02689397,\n", + " 0.01920939, 0.01767929, -0.00843761, -0.01002457, -0.00844629,\n", + " 0.02888541, -0.00503441, -0.00025836, 0.01326172, -0.00968244,\n", + " 0.00430614, -0.00964946, 0.00635843, 0.00445558, -0.00235765,\n", + " 0.00160239, -0.00325711, 0.03206096, -0.00511734, 0.01108837,\n", + " 0.0014369 , 0.02616214, 0.01631057, -0.00778238, -0.04322761,\n", + " -0.00086197, 0.01174034, -0.00230315, -0.01354581, 0.01665967,\n", + " -0.02281472, -0.0123808 , -0.02901287, -0.04143119, -0.00477564,\n", + " 0.00608404, -0.00701787, -0.00686041, 0.01422733, -0.02854553,\n", + " -0.01464688, -0.00404892, 0.00348112, 0.02299088, -0.02302668,\n", + " 0.01208024, 0.01010513, -0.01571813, 0.01446694, -0.00129136,\n", + " -0.00054684, -0.00328883, 0.01649218, 0.03326375, -0.00185443,\n", + " 0.02091988, 
-0.00814938, -0.0088084 , 0.02302703, -0.01156406,\n", + " 0.04080933, 0.02902327, -0.01330268, -0.00385899, -0.00826302,\n", + " -0.02295679, 0.00658087, -0.0056047 , -0.01404469, 0.00368797,\n", + " -0.01484573, 0.00689151, 0.02035506, 0.02181732, 0.02151672,\n", + " 0.0004279 , -0.00763045, -0.01551796, -0.02054572, -0.03275407,\n", + " 0.00623783, 0.007831 , -0.02604559, 0.01956206, 0.0161521 ,\n", + " 0.02634443, -0.03285164, -0.01301691, 0.01066694, 0.01585914,\n", + " -0.0187955 , 0.01046878, -0.00189302, -0.01132144, -0.00140048,\n", + " 0.02645635, -0.04300842, -0.00639437, 0.01285532, -0.00437311,\n", + " 0.01163111, -0.015357 , -0.00531165, 0.01102756, 0.00182517,\n", + " 0.02303016, -0.00949884, -0.02009463, 0.00573564, 0.00076009,\n", + " 0.00078505], dtype=float32)]" + ] + }, + "metadata": {}, + "execution_count": 4 + } + ], + "metadata": {} + } + ], + "metadata": { + "orig_nbformat": 4, + "language_info": { + "name": "python", + "version": "3.8.5", + "mimetype": "text/x-python", + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "pygments_lexer": "ipython3", + "nbconvert_exporter": "python", + "file_extension": ".py" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3.8.5 64-bit" + }, + "interpreter": { + "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/examples/test_model/data/d2v/test_256.zip b/examples/test_model/data/d2v/test_256.zip new file mode 100644 index 00000000..f5ddd67b Binary files /dev/null and b/examples/test_model/data/d2v/test_256.zip differ diff --git a/examples/test_model/data/d2v/test_256/test_256.bin b/examples/test_model/data/d2v/test_256/test_256.bin new file mode 100644 index 00000000..7a56bca4 Binary files /dev/null and b/examples/test_model/data/d2v/test_256/test_256.bin differ diff --git a/examples/test_model/data/w2v/test_w2v_256.zip 
b/examples/test_model/data/w2v/test_w2v_256.zip new file mode 100644 index 00000000..00085d5f Binary files /dev/null and b/examples/test_model/data/w2v/test_w2v_256.zip differ diff --git a/examples/test_model/data/w2v/test_w2v_256/test_w2v_256.kv b/examples/test_model/data/w2v/test_w2v_256/test_w2v_256.kv new file mode 100644 index 00000000..23544927 Binary files /dev/null and b/examples/test_model/data/w2v/test_w2v_256/test_w2v_256.kv differ diff --git a/examples/tokenizer/tokenizer.ipynb b/examples/tokenizer/tokenizer.ipynb index 4819b00d..a928ebba 100644 --- a/examples/tokenizer/tokenizer.ipynb +++ b/examples/tokenizer/tokenizer.ipynb @@ -3,28 +3,39 @@ { "cell_type": "markdown", "source": [ - "# Tokenizer\n", - "\n", - "## 概述\n", - "\n", - "为了方便后续向量化表征试题,本模块提供题目文本的令牌化解析(Tokenization),即将题目转换成令牌序列。 \n", - "\n", - "根据构成题目的元素类型,解析功能分为 **“文本解析”** 和 **“公式解析”** 两部分。\n", - "\n", - "### 文本解析\n", - "\n", - "根据题目文本切分粒度的大小,文本解析又分为 **“句解析”** 和 **“词解析”**。\n", - "\n", - "(1) 句解析(sentence-tokenization):将较长的文档切分成若干句子的过程称为“分句”。每个句子为一个“令牌”(token)。(待实现) \n", - " \n", - "\n", - "(2) 词解析(text-tokenization):一个句子(不含公式)是由若干“词”按顺序构成的,将一个句子切分为若干词的过程称为“词解析”。根据词的粒度大小,又可细分为“词组解析”和\"单字解析\"。\n", - "- 词组解析 (word-tokenization):每一个词组为一个“令牌”(token)。\n", - "- 单字解析 (char-tokenization):单个字符即为一个“令牌”(token)。\n", - "\n", - "### 公式解析\n", - "\n", - "公式解析(formula-tokenization):理科类文本中常常含有公式。将一个符合 latex 语法的公式切分为标记字符列表的过程称为“公式解析”。每个标记字符为一个“令牌”(token)。 \n", + "# 令牌化\r\n", + "\r\n", + "## 概述\r\n", + "\r\n", + "此模块可以定制化的将文本切分为令牌(token)序列,在此,将展示三种方式来进行令牌化:调用tokenize函数、调用sif4sci函数、调用封装好的tokenizer。" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 调用tokenize函数\r\n", + "\r\n", + "### 概述\r\n", + "\r\n", + "为了方便后续向量化表征试题,本模块提供题目文本的令牌化解析(Tokenization),即将题目转换成令牌序列。 \r\n", + "\r\n", + "根据构成题目的元素类型,解析功能分为 **“文本解析”** 和 **“公式解析”** 两部分。\r\n", + "\r\n", + "#### 文本解析\r\n", + "\r\n", + "根据题目文本切分粒度的大小,文本解析又分为 **“句解析”** 和 **“词解析”**。\r\n", + "\r\n", + "(1) 
句解析(sentence-tokenization):将较长的文档切分成若干句子的过程称为“分句”。每个句子为一个“令牌”(token)。(待实现) \r\n", + " \r\n", + "\r\n", + "(2) 词解析(text-tokenization):一个句子(不含公式)是由若干“词”按顺序构成的,将一个句子切分为若干词的过程称为“词解析”。根据词的粒度大小,又可细分为“词组解析”和\"单字解析\"。\r\n", + "- 词组解析 (word-tokenization):每一个词组为一个“令牌”(token)。\r\n", + "- 单字解析 (char-tokenization):单个字符即为一个“令牌”(token)。\r\n", + "\r\n", + "#### 公式解析\r\n", + "\r\n", + "公式解析(formula-tokenization):理科类文本中常常含有公式。将一个符合 latex 语法的公式切分为标记字符列表的过程称为“公式解析”。每个标记字符为一个“令牌”(token)。 \r\n", " " ], "metadata": {} @@ -32,15 +43,15 @@ { "cell_type": "markdown", "source": [ - "## 文本解析" + "### 文本解析" ], "metadata": {} }, { "cell_type": "markdown", "source": [ - "### 句解析\n", - "\n", + "#### 句解析\r\n", + "\r\n", "待实现..." ], "metadata": {} @@ -48,19 +59,19 @@ { "cell_type": "markdown", "source": [ - "### 词解析\n", - "\n", - "词解析分为两个主要步骤: \n", - "\n", - "(1) 分词: \n", - "- 词组解析:使用分词工具切分并提取题目文本中的词。 \n", - " 本项目目前支持的分词工具有:`jieba` \n", - "- 单字解析:按字符划分。\n", - " \n", - " \n", - "(2) 筛选:过滤指定的停用词。 \n", - "- 本项目默认使用的停用词表:[stopwords](https://github.com/bigdata-ustc/EduNLP/blob/master/EduNLP/meta_data/sif_stopwords.txt) \n", - "- 你也可以使用自己的停用词表,具体使用方法见下面的示例。\n" + "#### 词解析\r\n", + "\r\n", + "词解析分为两个主要步骤: \r\n", + "\r\n", + "(1) 分词: \r\n", + "- 词组解析:使用分词工具切分并提取题目文本中的词。 \r\n", + " 本项目目前支持的分词工具有:`jieba` \r\n", + "- 单字解析:按字符划分。\r\n", + " \r\n", + " \r\n", + "(2) 筛选:过滤指定的停用词。 \r\n", + "- 本项目默认使用的停用词表:[stopwords](https://github.com/bigdata-ustc/EduNLP/blob/master/EduNLP/meta_data/sif_stopwords.txt) \r\n", + "- 你也可以使用自己的停用词表,具体使用方法见下面的示例。\r\n" ], "metadata": {} }, @@ -68,7 +79,7 @@ "cell_type": "code", "execution_count": 1, "source": [ - "# 导入模块\n", + "# 导入模块\r\n", "from EduNLP.SIF.tokenization.text import tokenize " ], "outputs": [], @@ -78,7 +89,7 @@ "cell_type": "code", "execution_count": 2, "source": [ - "# 输入\n", + "# 输入\r\n", "text = \"三角函数是基本初等函数之一\"" ], "outputs": [], @@ -87,8 +98,8 @@ { "cell_type": "markdown", "source": [ - "#### 词组解析\n", - "\n", + "##### 词组解析\r\n", + "\r\n", "分词粒度参数选择 word: 
`granularity = \"word\"` " ], "metadata": {} @@ -97,7 +108,7 @@ "cell_type": "code", "execution_count": 3, "source": [ - "# 输出:默认使用 EduNLP 项目提供的停用词表\n", + "# 输出:默认使用 EduNLP 项目提供的停用词表\r\n", "tokenize(text, granularity=\"word\")" ], "outputs": [ @@ -117,8 +128,8 @@ { "cell_type": "markdown", "source": [ - "#### 单字解析\n", - "\n", + "##### 单字解析\r\n", + "\r\n", "分词粒度参数选择 word: `granularity = \"char\"` " ], "metadata": {} @@ -127,7 +138,7 @@ "cell_type": "code", "execution_count": 4, "source": [ - "# 输出:默认使用 EduNLP 项目提供的停用词表\n", + "# 输出:默认使用 EduNLP 项目提供的停用词表\r\n", "tokenize(text, granularity=\"char\")" ], "outputs": [ @@ -147,7 +158,7 @@ { "cell_type": "markdown", "source": [ - "#### 停用词表" + "##### 停用词表" ], "metadata": {} }, @@ -155,10 +166,10 @@ "cell_type": "code", "execution_count": 5, "source": [ - "# 获取自己的停用词表\n", - "spath = \"test_stopwords.txt\"\n", - "from EduNLP.SIF.tokenization.text.stopwords import get_stopwords\n", - "stopwords = get_stopwords(spath)\n", + "# 获取自己的停用词表\r\n", + "spath = \"test_stopwords.txt\"\r\n", + "from EduNLP.SIF.tokenization.text.stopwords import get_stopwords\r\n", + "stopwords = get_stopwords(spath)\r\n", "stopwords" ], "outputs": [ @@ -179,7 +190,7 @@ "cell_type": "code", "execution_count": 6, "source": [ - "# 输出:传入停用词表(stopwords)\n", + "# 输出:传入停用词表(stopwords)\r\n", "tokenize(text,granularity=\"word\",stopwords=stopwords)" ], "outputs": [ @@ -199,7 +210,7 @@ { "cell_type": "markdown", "source": [ - "## 公式解析\n", + "### 公式解析\r\n", "切分出 latex 公式的每个标记符号。针对本模块更加详细的解释参见 [formula](../formula/formula.ipynb)" ], "metadata": {} @@ -208,7 +219,7 @@ "cell_type": "code", "execution_count": 7, "source": [ - "# 导入模块\n", + "# 导入模块\r\n", "from EduNLP.SIF.tokenization.formula import tokenize" ], "outputs": [], @@ -305,7 +316,7 @@ "cell_type": "code", "execution_count": 11, "source": [ - "# 输出形式选择抽象语法分析树(ast)且将公式变量名转换成 token\n", + "# 输出形式选择抽象语法分析树(ast)且将公式变量名转换成 token\r\n", "tokenize(formula, method=\"ast\", return_type=\"list\", ord2token=True)" ], 
"outputs": [ @@ -343,7 +354,7 @@ "cell_type": "code", "execution_count": 12, "source": [ - "# 输出形式选择抽象语法分析树(ast)且将公式变量名转换成带编号的 token\n", + "# 输出形式选择抽象语法分析树(ast)且将公式变量名转换成带编号的 token\r\n", "tokenize(formula, method=\"ast\", return_type=\"list\", ord2token=True, var_numbering=True)" ], "outputs": [ @@ -373,13 +384,13 @@ { "cell_type": "markdown", "source": [ - "## 综合解析\n", - "\n", - "综合解析,即综合以上两种解析方式(标记解析 + 公式解析),提供对题目文本的全解析。另外,如遇到特殊符号将转换成常量,例如:\n", - "```python\n", - "FIGURE_SYMBOL = \"[FIGURE]\" # $\\SIFChoice$\n", - "QUES_MARK_SYMBOL = \"[MARK]\" # $\\FigureID{1}$\n", - "```\n" + "### 综合解析\r\n", + "\r\n", + "综合解析,即综合以上两种解析方式(标记解析 + 公式解析),提供对题目文本的全解析。另外,如遇到特殊符号将转换成常量,例如:\r\n", + "```python\r\n", + "FIGURE_SYMBOL = \"[FIGURE]\" # $\\SIFChoice$\r\n", + "QUES_MARK_SYMBOL = \"[MARK]\" # $\\FigureID{1}$\r\n", + "```\r\n" ], "metadata": {} }, @@ -387,17 +398,17 @@ "cell_type": "code", "execution_count": 39, "source": [ - "# 导入模块\n", - "from EduNLP.Tokenizer import get_tokenizer\n", - "\n", - "# 输入\n", - "item = {\n", - " \"如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\SIFChoice$$\\FigureID{1}$\"\n", - "}\n", - "\n", - "# 输出\n", - "tokenizer = get_tokenizer(\"text\")\n", - "tokens = tokenizer(item)\n", + "# 导入模块\r\n", + "from EduNLP.Tokenizer import get_tokenizer\r\n", + "\r\n", + "# 输入\r\n", + "item = {\r\n", + " \"如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\SIFChoice$$\\FigureID{1}$\"\r\n", + "}\r\n", + "\r\n", + "# 输出\r\n", + "tokenizer = get_tokenizer(\"text\")\r\n", + "tokens = tokenizer(item)\r\n", "next(tokens) " ], "outputs": [ @@ -472,6 +483,1125 @@ } ], "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 调用sif4sci函数来进行令牌化\r\n", + "\r\n", + "### 概述\r\n", + "\r\n", 
+ "SIFSci 是一个提供试题切分和标注的模块。它可定制化的将文本切分为令牌(token)序列,为后续试题的向量化做准备。" + ], + "metadata": { + "collapsed": true, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "本文将以下面这道题目(来源自 LUNA 题库)为例,展示 SIFSci 的使用方法。 \n", + "\n", + "![Figure](../../asset/_static/item.png)" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "- 符合 [SIF 格式](https://edunlp.readthedocs.io/en/docs_dev/tutorial/zh/sif.html) 的题目录入格式为:" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "item = {\r\n", + " \"stem\": r\"如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\SIFChoice$$\\FigureID{1}$\",\r\n", + " \"options\": [\"$p_1=p_2$\", \"$p_1=p_3$\", \"$p_2=p_3$\", \"$p_1=p_2+p_3$\"]\r\n", + "}\r\n", + "item[\"stem\"]" + ], + "outputs": [ + { + "data": { + "text/plain": [ + "'如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\\\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\\\SIFChoice$$\\\\FigureID{1}$'" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "- 加载图片:`$\\\\FigureID{1}$`" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "from PIL import Image\r\n", + "img = Image.open(\"../../asset/_static/item_figure.png\")\r\n", + "figures = {\"1\": img}\r\n", + "img" + ], + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAOYAAACICAYAAADzlJeRAAAe3klEQVR4nO2df1RT9/3/nyE/SWJyBdSUOgs72mKP1XhK24j2CGon4FagbtTMtpZt6rDzR7rjKt+2C+rnHGxdBbr2xB9zgc4j1NMJbpXg2hlsp2DnjrSHVaqcgquWbFqNLUgQkvf3DwgiSSA/7uUm8f04530O3Ny87zu5ed7X+8fr/XoJCCEEFAolrIjhuwEUCsUTKkwKJQyhwqRQwhAR3w2IBJqamuBwOHy+rtFokJKSMo4tokQ7VJgAGhoa0NzcDLvdjn//+99oa2uDw+FAa2srAIBhmFHf73K58O233wIAYmNj8cADDwAAZs+ejblz50Kr1UKn00Emk3H6OSjRg+Bum5Vtbm5Gc3MzrFYrPvnkE7S2toJhGPT09KC3t5f160mlUsTGxqKrqwsMw2D69OlYunQpdDodMjMzWb8eJTqIemHa7XbU19fjT3/6E/72t79BoVDA6XSiq6uL13YxDIOuri48/vjjyM/PR2ZmJpKSknhtEyV8iEph2mw21NbW4tChQzh16hSkUulQVzMckcvliImJgUwmQ3Z2NgwGA7RaLd/NovBIVAmzuroar732Gs6fPw8AuHnzJs8tCg61Wg2FQoFXX30VK1asGHOMS4k+Il6YNpsNf/jDH7Bz504ACGvLGChyuRxOpxOLFy/G+vXr6Zj0LiJihdnU1IQ33ngDdXV1cLlcoy5nRAMMw0AkEmHnzp14/vnn+W4OhWMiTpjNzc0oLCxEa2sr7HY7380Zd5RKJWQyGRVolBMxwrTZbFi/fj0++OAD3Lhxg+/m8A4VaHQT9sK02+347W9/i8rKyqgaP7KFUqmESqXCn//8Z+h0Or6bQ2GJsPaV3bdvH5KTk7Fv3z4qSh90dXXh66+/RlZWFjZs2HBXdu+jEhKGdHZ2Ep1OR9RqNQFAi59FJpORSZMmEYvFwuLdqCNlZRdYrI/iD2FnMfft24dZs2ahqamJjiUDxOFw4MqVK9Dr9XjqqadYsJ5tKJ+fjUNsNI4SGHw/Gdxcv36d5OXlUSvJUhGLxWTSpEmksbEx6HtyoSyNACBr6li80RS/CAuL2dDQgPvvvx/vv/8+tZIs0dfXhytXruBHP/oRSkpKAq+grRw78QrK0thvG2VseJ+VLS0txbZt2+ikBYeoVCrMmzcP1dXVfrr3WbB2/nlsPnk/dgr+Dw9eOImN07lrX0NDA4Db+17/+c9/4uuvv/Z67uzZs5GcnAwASEpKuqNEFXya65/97GeEYRjeu313QxGLxSQ5OZmcO3duzPtSt2YNqSOEkAtlJA2Df7NAT08PsVqtxGg0Ep1OR2JjYwkAwjAMYRiGiESigD6TUqkkDMMQpVJJABCtVkuys7OJ0WgMqQsfDvAizOvXr5NHH3106MbQMn4lLi6OHDlyxOe9cY8rb5fQhHnu3Dny8ssvk+TkZCISicbtQcwwDImNjSUZGRnEZDKR9vb2ED7F+DPuwmxvbycpKSlEIpHw/iO9W8uUKVPIO++843lzLpSRNcOXRurWEKSVkUAXSxobG0lhYSFJSEggKpUqYEvIdpHL5USpVJLExERiMpnI9evXQ/kJjwvjKsyzZ8+SxMRE3n+YtIBMnDiRvP3227dvzoUykjZChBfK0vwWZk9PDzGbzSQxMTGshydyuZwoFAqSnZ3N8novu4ybMM+ePUs0Gg3vN4aW2yUuLm5InAPH0siAwbxAytKGn+u7O9vZ2Uk2bdpEJkyYMDTWi5TCMAxJSEggZrN5nFTgP+MizPb2dmopw7TExcWRjRs3BnxPOzs7ycqVK4lcLue9qxpqUSqVYSdQzoXZ3t5OkpKSeP/yafFdGIbxW5w9PT3EaDSS+Ph4IhaLeW87m8Ut0JqaGk414Q+cCpOKMnIKwzBk165do97PqqoqotFoiEKh4L29XBa1Wk10Oh2vM7mcCfP69eskJSWF9y+ZFv/LlClTvC6luDc
VqFQq3ts4noVhGGI0GklPTw9XMvEJJ54/DocDjzzyCFpaWtiuOuqRy+WQSCQecW6Tk5OhVqvvODcnJwcA8NFHH+G///0vrl27ht7eXvT19QUdnnPKlCloaGgYiixfX1+PZ555Bt98802QnyiyUSgUiI+Px5EjR8Y1ciEnwnzqqafw/vvvo6+vj+2qowaGYYYiuLtFl5OTM5RuQavVshodr6GhAXa7Hc3Nzaivr4fNZsNXX30FAIiJiUF/f//QuSqVCrt27cJf/vIXfPTRR9RdEkB8fDxKSkqwevXqcbke68Kkvq+eiEQiKJVKdHV1ITU1FUuXLkV6enpY5Dxxi7WjowN//etfceLECXz33XdwuVx3iJUyEFZ00aJFOHjwIPfpLtjsFzc2NpKEhATexwZ8F7frmUgkIjqdjpSUlESU7+bhw4eJRqMharWayOVyIpVKSUxMDO/fazgUsVhMZsyY4ZfPcSiwJszOzk4ybdo03r84vopMJiNKpZJotdqIE+JwTCYTSU9PJ+3t7aSqqoqsWLGCnDt3jphMJpKZmUkkEknULZMEU6ZMmULOnj3L2X1gRZg9PT0kNTWV9y+Lj6JWq0liYiLZvn17xDlKD6enp4esWLGC/PKXvyQ9PT3k448/Jh9//DHZtGkTKS0tveM8s9k8tDtEKBTyfg/4KhqNhrMHMCvCNBqNUb+2NVKMKpWKPPfcc5w+NceLc+fOEa1WS6qqqoYE6S49PT0kMzOTWK1Wj/e1t7eT7du3k8TERCKXy3m/L3yUhISEUXfrBEvIwmxvbw9rp2U2i1KpJNnZ2WHhGcIWVVVVRKfTkXPnznmI0l06OzuJVqslnZ2dPus5e/Ysee6554hSqSRSqZT3ezWexdf6byiEPCs7b948NDU1hVJFWCMSiSCXy5Gbm4utW7dGzU55h8OBoqIi2Gw2mEymMdecRSIRioqKYLFYRp2RdDgceOutt1BSUoLu7m5Oco6GI1OmTMHevXvx5JNPslJfSMLct28fNm/eHJVxemQyGWQyGX7yk59g27Zt0Gg0fDeJNWw2G/R6PXJycpCamur3+86cOYOLFy+itLR0zHMdDgcqKirw6quvoru7Gz09PaE0OSKIj4/Hhx9+yIojQtDCtNlsmDt3Lmw2W8iNCCcUCgViYmKwbt06bNmyJepS4NXX18NgMMBsNge1Trl//34sXLgwoLQMFRUV2Lx5M3p6etDd3R3wNSOJpKQkWK3WkHtWQUfJW7duXVS5aYnFYjAMg+LiYvzvf//Djh07ok6UxcXFKC8vR2NjY9DOAyaTCZWVlQENX55//nlcuXIFBw4cQHJyMmJjY4O6diTQ0dGBH/zgByFnnwvKYjY3NyM9PT1qurAMw2DZsmV46623ok6MwIB3T15eHhYuXIglS5aEXN/UqVORl5cHq9Ua1Pf1u9/9Djt27IDdbofT6Qy5PeGGRCKBVqvF6dOng64jKGEuWrQIVqs16IuGCxMmTMC9994Ls9kctQl5mpqaUFBQAJPJBJFIxFq9/f392Lp1a9C/A5vNhnXr1uH48eNR84AfjkqlwpYtW1BUVBTU+wMWZjRYS7FYDIVCgW3btmH9+vV8N4czysrKcOTIEVRVVaGtrY31+v/xj3/gxo0bwQWUHqShoQErV67E9evXo26CKJTJoIDHmC+++GJEi5JhGOTn56O9vT1qRWm326HX63Hx4kVYLBZORAkACxYsQGtrK2pra4OuIz09HZcvX8ZvfvMbxMfHs9e4MOCbb75BTk5OUOPNgCxmJFtLsViMyZMn47333ovabisAtLa2oqCgABs3bsTUqVM5v96sWbOQlZUFs9kc8k4Zm82GvLw8fPbZZ7h58yZLLeQXsViM/Px8HDhwIKD3BWQxI9VaMgyDRYsWoaWlJapFWVFRAb1eD7PZPC6iBICWlhaYzWbo9fqQZyI1Gg0aGxuxefPmqJmE6+vrw9GjR1FfXx/Q+/y2mE1NTcjKyoq4fZZxcXF45ZV
XYDAY+G4KZzgcDhgMBjgcDphMJpw5c2bc23D16lVUVlaipqaGlfqamprw5JNP4sqVK6zUxzfJycn48ssv/T7fb4tpMpkiSpRisRjTpk3DyZMno1qUHR0dyMjIwJw5c/Dzn/+cF1ECQEJCAlJSUrBjxw5W6tPpdDh//jx0Oh3kcjkrdfLJ1atX8fvf/97v8/0SpsPhwKFDkZO+dOLEifjhD3+IL774gvcIAVxSW1uLvLw8mEwmzJo1i+/mYNmyZTh27NhQ9q5QYRgmarq23333HYqLi/3u7vslzOrqalbXwLgkISEB5eXlOHz4MPfhH3ikqKgIlZWVsFqtQQfe4oKamhoYDAZ0dHSwVmdxcTEsFgsSExNZq5MPent7/e5R+DXGnDlzJlpbW0NuGNdoNBpUVVUhPT2d76ZwhtsBfenSpViwYAHfzfGKvztRAqWjowNZWVkR8Vv0hUqlwhdffDHmpogxLWZra6vPJKLhRGJiIiwWS1SL0j0BZzQaw1aUwIBX0KpVq1BYWMhqvUlJSWhsbERqamrE9OBG4nQ6/Vo6GVOYe/bsCfs1pXvvvRcnT54c17if482OHTtgMBhgsVgi4kc5ffp0yGQy7N69m9V6GYbBxx9/jLS0tIh0hu/u7kZJScmYY80xhXngwIGwDWMokUiQkpKClpaWqNnAPBK3A/qNGzewc+dOzrx4uKC0tBTvvvsu6xvpZTIZTpw4gSeeeMIjCHYk4HQ6x1zXHHWM2dHRgYceeiisJhfcyOVyzJkzB3V1dRE/Y+eL5uZmFBQUwGg0IiEhge/mBMXUqVOh1+tRU1PDyWbzTZs2oaKiIuIcX8Za1xzVYtbX18PlcrHeqFCRSCSYM2cOjh8/HrWi3L17NwoLC1FTUxOxogSAS5cuoaSkBAUFBSF7BnmjrKwMRqMREydOZL1uLrl27Rqam5t9vj6qMA8dOhSW48v7778fx48fj8rlEIfDgYKCAnz66aewWq24dOkS300KGZFIhKVLlwa9BWosDAYDnnvuuYjq1nZ3d6OystLn6z67sg6HAxMnTuTkKRcKKSkpaGxsjEpL6XZAX7t2LaZPn853c1jn7bffRk5ODlasWMFJ/Xq9HrW1tWH3m/VFQkKCT5dDnxazqakp7CzStGnTgt41H+7U1tYObWiORlECgNlsxmuvvcbZOmRVVRUeffTRiJi1BgaWlXxNjPkU5ocffhhWkz6JiYk4ceJEVEWrc2MwGFBZWQmLxRJW3znbnDlzBjU1NdDr9Zz5XR87dgyPPPIIxGIxJ/WzSVdXF959912vr/nsyoZT+BCNRgOLxRJ165Tu/YdPP/10QGEkIx22d6KMxG63Y968eRHhIeSrO+vTYv7rX//itEH+kpCQgJqamqgTZUNDAzIyMlBaWnpXiRIYuKdz5sxBcXExJ/UzDAOr1Ypp06ZxUj+bOBwOr37FXoVpt9vDIoK2SqXCiy++GHWbm4uLi7F169aQwkhGOkuWLMHp06cD3kDsLxqNBseOHUNcXBwn9bOJt904XoXZ3NzMu7uTWCyGVqvlbIqdD+x2O7KysgAA27dvHzMtQbRTVVWFoqIiVneiDMe9P1SlUnFSPxt0dXXhk08+8TjuU5h8RyybPHkyjhw5wmsb2KSpqQkZGRnYuHEjK7Fdo4GWlhZUVVUhLy+PsyWO1atXY/HixWE9U+ut1+BVmGfPnuW1K5uQkID33nsvapZFdu/ejaKiItTU1ECpVPLdnLDi6tWreOmll1BQUMDZNQ4ePBjWezm/+uorjweTV2F+9tln49Igb0TTuNLhcECv1+PTTz+FxWKJCi8eLpg6dSo0Gg3Kyso4qV8mk4X1eFOpVHq453kVJl8/ILFYjMWLF0fFuLK1tRXz5s1DTk4OVq5cyVssnkihpKSE1bAkIwnn8abL5fIYZ/ucleWD+Ph4HDx4kJdrs0l1dTX0ej2qqqrGLYxkpHPmzBmYzWYYDAbOMsi5x5tCoZCT+oOlt7fX4zPfIUz
i6se3Xd9BoIoF1ApgHD3yGIaB2WwOOzfAQHA4HCgsLMSRI0fQ2NiIq1ev8t2kiKKtrQ0mk4mVGLW++OMf/xh2Xdre3l6cP3/+jmN3CFMQI8K1K/+DTCwGbnYD4+QLLBaL8dhjjyEzM3N8LsgB7ng0c+bMwQsvvEC7rkHS39+PnJwczoYzDMPgjTfeCLsu7UgvJY+ubMfFi4iJCTptZlCoVCpUVFSM6zXZpL6+Hnl5eSgpKQmLMJKRTmpqKux2O2e/iWeffRbf//73Oak7WDo7O+/432Nxx27/dvAvMaCWDPzpugV81wfIZIBUCPQOWlOxGJAPntM7zMIOPw4ncGOY6R3x2gSXGEaj0dM5nThxy+kCAQBBDCRCIQQjj0MAsUiEGBD09/fDCcHA+MHZDycAxIggFbhunx8jgjRGENAXNhbFxcU4ffo0rFbrXe8wwCYmkwlZWVlISUnhZIbebDZj8eLFuHbtGut1B8PIYY+HabTbrw9GLegDbt4C4BwQJQA4nED/rUEBigfGoDe6gRu3AKkMEA8elwsHusI3uoF+IaBwe/p7vjbpnslesm4R9DsJYoRiSEUiCOGCk4w8LoY4hqDPRQAIIBLGQAACp9OJGJEYUmEMBK5+9LoA8dD/TrAVj8FutyMjIwMA8PLLL1NRsox7MqiwsJCTyUitVouVK1eG7S6U0fusfX1AL24LSyYEnH23/46RDEwSqSUAhIBw8LjLCQyehlu3AJHw9nuGvaaWSlFd9a6nWIgTzhghRAJgQHTigb+JE06BAMJBoxcjGCk2AYRC0bAPJYAwZtDSDuIKOE2vJ01NTZg3bx6MRiP14uGQS5cuobS0FHl5eZzU//rrr2PChAmc1B0qvoXpFpbDOSAsMQCh884Jof5bgxZzsLhfixllOnrwNZFIhPzlyzFn9hzv5xF3d3XkceL9+DhRVlYGg8EAq9Ua1m5e0QKXYUlkMhn2798flt5YQ8J0ufrQ6yIDyxXEOeyUQaspV9y2lsCgYCXDllQGu7YOJwDh7eNCIdDvvP2ewddUKhXeLC8FiRF6eToIICDu7isAkAFLJxBCCIL+IbNHfLyffdxhJC9evBhxYSQjHTYS5PoiNzc3LCfshn7TAggAVz/iJ09GTP+ICRKHE8AIa4k+oNcJSBWD3Vnh4OuDY1P3cbET6O67/Z6bt6BQxWHz/9sCgUQGydBkDEF/fx/6BwUoEQrgdPaht78Pvf1OYKhbGwO4+geOuwSD7yfod7pAQOB09sM15v+B0dzcjKysLKxatQrLly8P8N0UNuAyLInJZAo7q+kRwaChoQF5eXkjBtxiQNbH2rpmXFwcLl++HBHOBBUVFdizZw/MZjN1GOCZhIQE6PV6NDY2sv7b4Ttix8hIBv71AmVCwDn2af6gUChQVFQU9qJ0e/GcOHECVquVijIMuHr1KoxGI/R6Pet179q1i1erOdJ100OYdwhGMdgdheP2ZFCISKVS/OpXv2KnMo4Il2SwFE/YTpDrRqvVYtKkSazWGQgjDZWHMDUaze1wF90jZltDJBKsZbglg6V4wnaCXDe7du3iLXv1mBYzKSmJsxCK4W4twzUZLMUTLhLk5ubmYsqUKazVFwgjrbXXMSYXDr7hbC1tNhvmzZsHtVqNX//619SLJwJoaWmByWRiPScKH1ZTLpdj9uzZdxzzKsz4+HjWLy4UCvHMM8+wXm+oNDQ0ICsrC6WlpWGdDJbiCRcJcnNzczF58mTW6vMHiUTiEZ7VqzC5MOcPP/xw2EVR37FjB7Zu3QqLxXLXhpGMdLhIkPvSSy+Nq1dXT08PUlJS7jjmVZhs77pXq9XYsGEDq3WGwvBksNu3b6dePBEO2wlyc3NzIZFIxj6RJaRSqUfgOa/CfOCBB1h9YgiFwrDZBN3c3IyMjAysWrUKy5Yt47s5FBZgOyyJRqPBY489xkLL/OPBBx/0OOZVmAsWLGBtsVUkEmH58uVhMekTLclgKZ6wnSB3w4YNkEqlLLR
sdEQiERYvXuxx3KswdTodawGfFQoF1q1bx0pdwRKNyWApnrC5EyUzM3NcjIlSqfS6ddCrMBmGYW1m9p577uE1IZA7jOTChQtpGMm7gNTUVNhsNlRXV4dUj0wmQ05ODkut8o3D4fAaocGnrywbYpLL5di4cWPI9QRLdXU1CgoKUFVVFbXJYCmesLUTpbCwkHOrqdVqvV7DpzCfeOKJkPvYTqeTs7Teo+FwOGAwGHDkyBFYLBbqgH6XwVaCXJ1Ox2mEA6lUiqefftrraz6FqdPpQs74NXPmzHHPP2Kz2ZCVlYX77rsPL7zwAvXiuUu5dOkSjEZjyDlR1qxZw1KLPJFKpT5XK0btyjqdwe/1EolE474c4U4GW1JSctclg6V4wkaC3CVLlnAWsCs+Pt7DscCNT2HKZDI8/vjjQV/U12wTV9BksBRvhJogV6fTcZJSYazVilE3Sj/77LNBO7Q7nc5xydhFk8FSxiKUBLkymQxz585lv1EAfvGLX/h8bVRhZmZmBt2dTU1N5XxGyx1GkiaDpYxGqAly8/PzIRCwGyh84cKFo86/jCpMhmHw0EMPBXxRqVSKJ598MuD3BUJZWRmKiopgtVrDLpASJfwIJUFueno6q+NMhmG8BDm/kzFj/qxatSrg/WmjzTaFit1uh16vx8WLF2GxWKgDOsVvgk2Qy7aDzMSJE8fUx5jCzM3NDfjCEonE52xTKLS2tiIrKws5OTlYvnw59eKhBEywCXLZGmeq1Wrs2rVrzPPGFGYwnvaPPvpoQOf7gzsZrNlspslgKUET7E6UH//4x6xcPy4uzi9j51f4yg0bNkCtVvt1YZFIFNIyy0hoMlgK2wSTIHfJkiUhp6f011oCXgI+++Lee+/F119/7dfFKyoqguoCj6SjowMFBQV4+umnacQ6CuucOXMGFy9eRGlp6ZjnOhwOyOVy+CkXryQnJ+PLL7/061y/HwEGgwEKhWLM8wghrIwvaTJYCtcEkiBXJpOFtPynVqtx+PBhv8/3W5ijLYYO5+bNmyELs6ioCOXl5bBardSLh8IpJpMJlZWVfoUlCTZmlUgkQn5+fkCzu34Lk2EY/PSnPx0z5Mj3vvc9vy8+EpvNhoyMDKjVapoMljIuBJIgN9iZWZVKhTfffDOg9wQ0ml23bt2Y3dmZM2cG1AA3TU1NyMrKgtFopGEkKeOKvwlytVptwB5AarUa+/fvD7gbHJAwtVot8vPzfVrNYGdk3clgLRYLTQZL4QV/wpI88sgjAdUpFouxaNGioCZCA57/3bZtm09PIIVCEdD4kiaDpYQTYyXITUlJCchi3nPPPTh48GBQbQlYmBqNBqtWrfJqmgOZkaXJYCnhyGhhSZKSkvxeLomLi8OxY8eCnskNasX09ddf92o1XS6XXw2pqKhAYWEhzGYzDSNJCStaWlpgNpt9Oh/4EwharVZjx44dIa1OBCVMmUyGoqIij4kgh8Mx6pSyO4wkTQZLCWdGS5A7Vhws97hy9erVIbXBb8+fkTgcDsyePRsXLly447iv6lpbW1FQUIC1a9fSiHWUiODo0aNQq9XYsmXL0LH4+Hhcu3bN6/kSiQQPP/wwjh8/HvJe5KCd/2QyGd555507Nnv6inZQW1uLgoICmEwmKkpKxOAtQe5oET1mz57NiiiBEIQJDMRDefbZZ4ca4q3/bTAYUFlZCYvFQpPBUiKOkQlyfW2YTklJwQcffMBa1I7Q3OUxMBHk9vYZvh3LnQz2vvvuo8lgKRHLyAS53nZZpaSkoLGxkdVQrSELUyaT4dChQ3eY+OHJYGkYSUqkMzxB7vAJT3dAALZFCbAgTGDAIygtLQ0zZsygyWApUYk7Qa5UKoVQKERsbCy0Wi0nogQCEKZlrQACwYgyvxxuX50lS5bAarWitbUVf//736kXDyXqKC0txX/+8x/I5XLk5OTg9OnTnGUaCGi5pK18PmZ8/grIniwAFqwVZGPvmjqQPVk4cOAAtmzZgsuXL7PeSK1Wy8o
XkJ6ePvR3RUVFUHFGKRSRSIS+vj5OrxGAMNtQPn8GPn+FYM9AfGVY1gqQjbpBoVIo0U9HRwc6OjrueMgDACxrIcjeO/TvmjqCPViLtdiDYOTh/xiz7SgOnVqD3MGLtJXPR/beNJRtpqKk3D0kJSWNEGUbyucLIMgG6ggBGSy5tQIIslvw4IzgruO3MNuOHsIp7EX24PhyFSpByElspP4ClLuWgV7kJpThAtmD4SYqa08d1mAW7g9SH34Ksw1HD53CmrrBJ8KFMmDTDMwvpxM8lLuXtvJV2HQqDWWVG+Gpvxl4sCwXwfYn/RtjtpVj/ozP8cqwp4JlrQDZe9egbsSTgkK5Oxi0lrO4mWPxy2K2HT2EU2uGq78N51sApD2IILvQFEqEcwGfnwLSgh1EjoEfcTwGu7Gv3JalZe0MbDoFrKnzZsIplLuAtvPg0sl0TIspEAyIcG/2bceC7L1pKLtAgpoGplCigun3YxaAU59f8Pqypfy2800wBL0fk0K56xlcu0wru4CTQ8sTnuv9wUCFSaGEQls55s/YhFNDB9JQdiH0ZUQqTAolDGFldwmFQmEXKkwKJQz5/6GFNae7ZI7lAAAAAElFTkSuQmCC", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "### 导入模块" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "from EduNLP.SIF import sif4sci, is_sif, to_sif" + ], + "outputs": [], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "### 验证题目格式" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "is_sif(item['stem'])" + ], + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "- 若发现题目因为公式没有包含在 `$$` 中而不符合 SIF 格式,则可以使用 `to_sif` 模块转成标准格式。示例如下:" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...'\r\n", + "is_sif(text)" + ], + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "text = 
'某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...'\r\n", + "to_sif(text)\r\n" + ], + "outputs": [ + { + "data": { + "text/plain": [ + "'某校一个课外学习小组为研究某作物的发芽率$y$和温度$x$(单位...'" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "### 题目切分及令牌化\r\n", + "\r\n", + "现在我们得到了符合标准格式的题目文本,接下来可以对题目做进一步的预训练,例如:切分和令牌化。" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "#### 题目切分" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "##### 基本切分\r\n", + "分离文本、公式、图片和特殊符号。" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "segments = sif4sci(item[\"stem\"], figures=figures, tokenization=False)\r\n", + "segments" + ], + "outputs": [ + { + "data": { + "text/plain": [ + "['如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形', 'ABC', '的斜边', 'BC', ', 直角边', 'AB', ', ', 'AC', '.', '\\\\bigtriangleup ABC', '的三边所围成的区域记为', 'I', ',黑色部分记为', 'II', ', 其余部分记为', 'III', '.在整个图形中随机取一点,此点取自', 'I,II,III', '的概率分别记为', 'p_1,p_2,p_3', ',则', '\\\\SIFChoice', \\FigureID{1}]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "- 文本部分" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "segments.text_segments" + ], + "outputs": [ + { + "data": { + "text/plain": [ + "['如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形',\n", + " '的斜边',\n", + " ', 直角边',\n", + " ', ',\n", + " '.',\n", + " '的三边所围成的区域记为',\n", + " ',黑色部分记为',\n", + " ', 其余部分记为',\n", + " '.在整个图形中随机取一点,此点取自',\n", + " '的概率分别记为',\n", + " ',则']" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "- 公式部分" + ], + "metadata": {} + }, + { + "cell_type": "code", + 
"execution_count": null, + "source": [ + "segments.formula_segments\r\n" + ], + "outputs": [ + { + "data": { + "text/plain": [ + "['ABC',\n", + " 'BC',\n", + " 'AB',\n", + " 'AC',\n", + " '\\\\bigtriangleup ABC',\n", + " 'I',\n", + " 'II',\n", + " 'III',\n", + " 'I,II,III',\n", + " 'p_1,p_2,p_3']" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "- 图片部分" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "segments.figure_segments" + ], + "outputs": [ + { + "data": { + "text/plain": [ + "[\\FigureID{1}]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "segments.figure_segments[0].figure" + ], + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAOYAAACICAYAAADzlJeRAAAe3klEQVR4nO2df1RT9/3/nyE/SWJyBdSUOgs72mKP1XhK24j2CGon4FagbtTMtpZt6rDzR7rjKt+2C+rnHGxdBbr2xB9zgc4j1NMJbpXg2hlsp2DnjrSHVaqcgquWbFqNLUgQkvf3DwgiSSA/7uUm8f04530O3Ny87zu5ed7X+8fr/XoJCCEEFAolrIjhuwEUCsUTKkwKJQyhwqRQwhAR3w2IBJqamuBwOHy+rtFokJKSMo4tokQ7VJgAGhoa0NzcDLvdjn//+99oa2uDw+FAa2srAIBhmFHf73K58O233wIAYmNj8cADDwAAZs+ejblz50Kr1UKn00Emk3H6OSjRg+Bum5Vtbm5Gc3MzrFYrPvnkE7S2toJhGPT09KC3t5f160mlUsTGxqKrqwsMw2D69OlYunQpdDodMjMzWb8eJTqIemHa7XbU19fjT3/6E/72t79BoVDA6XSiq6uL13YxDIOuri48/vjjyM/PR2ZmJpKSknhtEyV8iEph2mw21NbW4tChQzh16hSkUulQVzMckcvliImJgUwmQ3Z2NgwGA7RaLd/NovBIVAmzuroar732Gs6fPw8AuHnzJs8tCg61Wg2FQoFXX30VK1asGHOMS4k+Il6YNpsNf/jDH7Bz504ACGvLGChyuRxOpxOLFy/G+vXr6Zj0LiJihdnU1IQ33ngDdXV1cLlcoy5nRAMMw0AkEmHnzp14/vnn+W4OhWMiTpjNzc0oLCxEa2sr7HY7380Zd5RKJWQyGRVolBMxwrTZbFi/fj0++OAD3Lhxg+/m8A4VaHQT9sK02+347W9/i8rKyqgaP7KFUqmESqXCn//8Z+h0Or6bQ2GJsPaV3bdvH5KTk7Fv3z4qSh90dXXh66+/RlZWFjZs2HBXdu+jEhKGdHZ2Ep1OR9RqNQFAi59FJpORSZMmEYvFwuLdqCNlZRdYrI/iD2FnMfft24dZs2ahqamJjiUDxOFw4MqVK9Dr9XjqqadYsJ5tKJ+fjUNsNI4SGHw/Gdxcv36d5OXlUSvJUhGLxWTSpEmksbEx6HtyoSyNACBr6li80RS/
CAuL2dDQgPvvvx/vv/8+tZIs0dfXhytXruBHP/oRSkpKAq+grRw78QrK0thvG2VseJ+VLS0txbZt2+ikBYeoVCrMmzcP1dXVfrr3WbB2/nlsPnk/dgr+Dw9eOImN07lrX0NDA4Db+17/+c9/4uuvv/Z67uzZs5GcnAwASEpKuqNEFXya65/97GeEYRjeu313QxGLxSQ5OZmcO3duzPtSt2YNqSOEkAtlJA2Df7NAT08PsVqtxGg0Ep1OR2JjYwkAwjAMYRiGiESigD6TUqkkDMMQpVJJABCtVkuys7OJ0WgMqQsfDvAizOvXr5NHH3106MbQMn4lLi6OHDlyxOe9cY8rb5fQhHnu3Dny8ssvk+TkZCISicbtQcwwDImNjSUZGRnEZDKR9vb2ED7F+DPuwmxvbycpKSlEIpHw/iO9W8uUKVPIO++843lzLpSRNcOXRurWEKSVkUAXSxobG0lhYSFJSEggKpUqYEvIdpHL5USpVJLExERiMpnI9evXQ/kJjwvjKsyzZ8+SxMRE3n+YtIBMnDiRvP3227dvzoUykjZChBfK0vwWZk9PDzGbzSQxMTGshydyuZwoFAqSnZ3N8novu4ybMM+ePUs0Gg3vN4aW2yUuLm5InAPH0siAwbxAytKGn+u7O9vZ2Uk2bdpEJkyYMDTWi5TCMAxJSEggZrN5nFTgP+MizPb2dmopw7TExcWRjRs3BnxPOzs7ycqVK4lcLue9qxpqUSqVYSdQzoXZ3t5OkpKSeP/yafFdGIbxW5w9PT3EaDSS+Ph4IhaLeW87m8Ut0JqaGk414Q+cCpOKMnIKwzBk165do97PqqoqotFoiEKh4L29XBa1Wk10Oh2vM7mcCfP69eskJSWF9y+ZFv/LlClTvC6luDcVqFQq3ts4noVhGGI0GklPTw9XMvEJJ54/DocDjzzyCFpaWtiuOuqRy+WQSCQecW6Tk5OhVqvvODcnJwcA8NFHH+G///0vrl27ht7eXvT19QUdnnPKlCloaGgYiixfX1+PZ555Bt98802QnyiyUSgUiI+Px5EjR8Y1ciEnwnzqqafw/vvvo6+vj+2qowaGYYYiuLtFl5OTM5RuQavVshodr6GhAXa7Hc3Nzaivr4fNZsNXX30FAIiJiUF/f//QuSqVCrt27cJf/vIXfPTRR9RdEkB8fDxKSkqwevXqcbke68Kkvq+eiEQiKJVKdHV1ITU1FUuXLkV6enpY5Dxxi7WjowN//etfceLECXz33XdwuVx3iJUyEFZ00aJFOHjwIPfpLtjsFzc2NpKEhATexwZ8F7frmUgkIjqdjpSUlESU7+bhw4eJRqMharWayOVyIpVKSUxMDO/fazgUsVhMZsyY4ZfPcSiwJszOzk4ybdo03r84vopMJiNKpZJotdqIE+JwTCYTSU9PJ+3t7aSqqoqsWLGCnDt3jphMJpKZmUkkEknULZMEU6ZMmULOnj3L2X1gRZg9PT0kNTWV9y+Lj6JWq0liYiLZvn17xDlKD6enp4esWLGC/PKXvyQ9PT3k448/Jh9//DHZtGkTKS0tveM8s9k8tDtEKBTyfg/4KhqNhrMHMCvCNBqNUb+2NVKMKpWKPPfcc5w+NceLc+fOEa1WS6qqqoYE6S49PT0kMzOTWK1Wj/e1t7eT7du3k8TERCKXy3m/L3yUhISEUXfrBEvIwmxvbw9rp2U2i1KpJNnZ2WHhGcIWVVVVRKfTkXPnznmI0l06OzuJVqslnZ2dPus5e/Ysee6554hSqSRSqZT3ezWexdf6byiEPCs7b948NDU1hVJFWCMSiSCXy5Gbm4utW7dGzU55h8OBoqIi2Gw2mEymMdecRSIRioqKYLFYRp2RdDgceOutt1BSUoLu7m5Oco6GI1OmTMHevXvx5JNPslJfSMLct28fNm/eHJVxemQyGWQyGX7yk59g27Zt0Gg0fDeJNWw2G/R6PXJycpCamur3+86cOYOLFy+itLR0zHMdDgcqKirw6quvoru7Gz09
PaE0OSKIj4/Hhx9+yIojQtDCtNlsmDt3Lmw2W8iNCCcUCgViYmKwbt06bNmyJepS4NXX18NgMMBsNge1Trl//34sXLgwoLQMFRUV2Lx5M3p6etDd3R3wNSOJpKQkWK3WkHtWQUfJW7duXVS5aYnFYjAMg+LiYvzvf//Djh07ok6UxcXFKC8vR2NjY9DOAyaTCZWVlQENX55//nlcuXIFBw4cQHJyMmJjY4O6diTQ0dGBH/zgByFnnwvKYjY3NyM9PT1qurAMw2DZsmV46623ok6MwIB3T15eHhYuXIglS5aEXN/UqVORl5cHq9Ua1Pf1u9/9Djt27IDdbofT6Qy5PeGGRCKBVqvF6dOng64jKGEuWrQIVqs16IuGCxMmTMC9994Ls9kctQl5mpqaUFBQAJPJBJFIxFq9/f392Lp1a9C/A5vNhnXr1uH48eNR84AfjkqlwpYtW1BUVBTU+wMWZjRYS7FYDIVCgW3btmH9+vV8N4czysrKcOTIEVRVVaGtrY31+v/xj3/gxo0bwQWUHqShoQErV67E9evXo26CKJTJoIDHmC+++GJEi5JhGOTn56O9vT1qRWm326HX63Hx4kVYLBZORAkACxYsQGtrK2pra4OuIz09HZcvX8ZvfvMbxMfHs9e4MOCbb75BTk5OUOPNgCxmJFtLsViMyZMn47333ovabisAtLa2oqCgABs3bsTUqVM5v96sWbOQlZUFs9kc8k4Zm82GvLw8fPbZZ7h58yZLLeQXsViM/Px8HDhwIKD3BWQxI9VaMgyDRYsWoaWlJapFWVFRAb1eD7PZPC6iBICWlhaYzWbo9fqQZyI1Gg0aGxuxefPmqJmE6+vrw9GjR1FfXx/Q+/y2mE1NTcjKyoq4fZZxcXF45ZVXYDAY+G4KZzgcDhgMBjgcDphMJpw5c2bc23D16lVUVlaipqaGlfqamprw5JNP4sqVK6zUxzfJycn48ssv/T7fb4tpMpkiSpRisRjTpk3DyZMno1qUHR0dyMjIwJw5c/Dzn/+cF1ECQEJCAlJSUrBjxw5W6tPpdDh//jx0Oh3kcjkrdfLJ1atX8fvf/97v8/0SpsPhwKFDkZO+dOLEifjhD3+IL774gvcIAVxSW1uLvLw8mEwmzJo1i+/mYNmyZTh27NhQ9q5QYRgmarq23333HYqLi/3u7vslzOrqalbXwLgkISEB5eXlOHz4MPfhH3ikqKgIlZWVsFqtQQfe4oKamhoYDAZ0dHSwVmdxcTEsFgsSExNZq5MPent7/e5R+DXGnDlzJlpbW0NuGNdoNBpUVVUhPT2d76ZwhtsBfenSpViwYAHfzfGKvztRAqWjowNZWVkR8Vv0hUqlwhdffDHmpogxLWZra6vPJKLhRGJiIiwWS1SL0j0BZzQaw1aUwIBX0KpVq1BYWMhqvUlJSWhsbERqamrE9OBG4nQ6/Vo6GVOYe/bsCfs1pXvvvRcnT54c17if482OHTtgMBhgsVgi4kc5ffp0yGQy7N69m9V6GYbBxx9/jLS0tIh0hu/u7kZJScmYY80xhXngwIGwDWMokUiQkpKClpaWqNnAPBK3A/qNGzewc+dOzrx4uKC0tBTvvvsu6xvpZTIZTpw4gSeeeMIjCHYk4HQ6x1zXHHWM2dHRgYceeiisJhfcyOVyzJkzB3V1dRE/Y+eL5uZmFBQUwGg0IiEhge/mBMXUqVOh1+tRU1PDyWbzTZs2oaKiIuIcX8Za1xzVYtbX18PlcrHeqFCRSCSYM2cOjh8/HrWi3L17NwoLC1FTUxOxogSAS5cuoaSkBAUFBSF7BnmjrKwMRqMREydOZL1uLrl27Rqam5t9vj6qMA8dOhSW48v7778fx48fj8rlEIfDgYKCAnz66aewWq24dOkS300KGZFIhKVLlwa9BWosDAYDnnvuuYjq1nZ3d6OystLn6z67sg6HAxMnTuTkKRcKKSkpaGxsjEpL6XZAX7t2LaZPn853c1jn7bffRk5ODlasWMFJ/Xq9
HrW1tWH3m/VFQkKCT5dDnxazqakp7CzStGnTgt41H+7U1tYObWiORlECgNlsxmuvvcbZOmRVVRUeffTRiJi1BgaWlXxNjPkU5ocffhhWkz6JiYk4ceJEVEWrc2MwGFBZWQmLxRJW3znbnDlzBjU1NdDr9Zz5XR87dgyPPPIIxGIxJ/WzSVdXF959912vr/nsyoZT+BCNRgOLxRJ165Tu/YdPP/10QGEkIx22d6KMxG63Y968eRHhIeSrO+vTYv7rX//itEH+kpCQgJqamqgTZUNDAzIyMlBaWnpXiRIYuKdz5sxBcXExJ/UzDAOr1Ypp06ZxUj+bOBwOr37FXoVpt9vDIoK2SqXCiy++GHWbm4uLi7F169aQwkhGOkuWLMHp06cD3kDsLxqNBseOHUNcXBwn9bOJt904XoXZ3NzMu7uTWCyGVqvlbIqdD+x2O7KysgAA27dvHzMtQbRTVVWFoqIiVneiDMe9P1SlUnFSPxt0dXXhk08+8TjuU5h8RyybPHkyjhw5wmsb2KSpqQkZGRnYuHEjK7Fdo4GWlhZUVVUhLy+PsyWO1atXY/HixWE9U+ut1+BVmGfPnuW1K5uQkID33nsvapZFdu/ejaKiItTU1ECpVPLdnLDi6tWreOmll1BQUMDZNQ4ePBjWezm/+uorjweTV2F+9tln49Igb0TTuNLhcECv1+PTTz+FxWKJCi8eLpg6dSo0Gg3Kyso4qV8mk4X1eFOpVHq453kVJl8/ILFYjMWLF0fFuLK1tRXz5s1DTk4OVq5cyVssnkihpKSE1bAkIwnn8abL5fIYZ/ucleWD+Ph4HDx4kJdrs0l1dTX0ej2qqqrGLYxkpHPmzBmYzWYYDAbOMsi5x5tCoZCT+oOlt7fX4zPfIUzi6se3Xd9BoIoF1ApgHD3yGIaB2WwOOzfAQHA4HCgsLMSRI0fQ2NiIq1ev8t2kiKKtrQ0mk4mVGLW++OMf/xh2Xdre3l6cP3/+jmN3CFMQI8K1K/+DTCwGbnYD4+QLLBaL8dhjjyEzM3N8LsgB7ng0c+bMwQsvvEC7rkHS39+PnJwczoYzDMPgjTfeCLsu7UgvJY+ubMfFi4iJCTptZlCoVCpUVFSM6zXZpL6+Hnl5eSgpKQmLMJKRTmpqKux2O2e/iWeffRbf//73Oak7WDo7O+/432Nxx27/dvAvMaCWDPzpugV81wfIZIBUCPQOWlOxGJAPntM7zMIOPw4ncGOY6R3x2gSXGEaj0dM5nThxy+kCAQBBDCRCIQQjj0MAsUiEGBD09/fDCcHA+MHZDycAxIggFbhunx8jgjRGENAXNhbFxcU4ffo0rFbrXe8wwCYmkwlZWVlISUnhZIbebDZj8eLFuHbtGut1B8PIYY+HabTbrw9GLegDbt4C4BwQJQA4nED/rUEBigfGoDe6gRu3AKkMEA8elwsHusI3uoF+IaBwe/p7vjbpnslesm4R9DsJYoRiSEUiCOGCk4w8LoY4hqDPRQAIIBLGQAACp9OJGJEYUmEMBK5+9LoA8dD/TrAVj8FutyMjIwMA8PLLL1NRsox7MqiwsJCTyUitVouVK1eG7S6U0fusfX1AL24LSyYEnH23/46RDEwSqSUAhIBw8LjLCQyehlu3AJHw9nuGvaaWSlFd9a6nWIgTzhghRAJgQHTigb+JE06BAMJBoxcjGCk2AYRC0bAPJYAwZtDSDuIKOE2vJ01NTZg3bx6MRiP14uGQS5cuobS0FHl5eZzU//rrr2PChAmc1B0qvoXpFpbDOSAsMQCh884Jof5bgxZzsLhfixllOnrwNZFIhPzlyzFn9hzv5xF3d3XkceL9+DhRVlYGg8EAq9Ua1m5e0QKXYUlkMhn2798flt5YQ8J0ufrQ6yIDyxXEOeyUQaspV9y2lsCgYCXDllQGu7YOJwDh7eNCIdDvvP2ewddUKhXeLC8FiRF6eToIICDu7isAkAFLJxBCCIL+IbNHfLyffdxhJC9evBhxYSQjHTYS5Poi
Nzc3LCfshn7TAggAVz/iJ09GTP+ICRKHE8AIa4k+oNcJSBWD3Vnh4OuDY1P3cbET6O67/Z6bt6BQxWHz/9sCgUQGydBkDEF/fx/6BwUoEQrgdPaht78Pvf1OYKhbGwO4+geOuwSD7yfod7pAQOB09sM15v+B0dzcjKysLKxatQrLly8P8N0UNuAyLInJZAo7q+kRwaChoQF5eXkjBtxiQNbH2rpmXFwcLl++HBHOBBUVFdizZw/MZjN1GOCZhIQE6PV6NDY2sv7b4Ttix8hIBv71AmVCwDn2af6gUChQVFQU9qJ0e/GcOHECVquVijIMuHr1KoxGI/R6Pet179q1i1erOdJ100OYdwhGMdgdheP2ZFCISKVS/OpXv2KnMo4Il2SwFE/YTpDrRqvVYtKkSazWGQgjDZWHMDUaze1wF90jZltDJBKsZbglg6V4wnaCXDe7du3iLXv1mBYzKSmJsxCK4W4twzUZLMUTLhLk5ubmYsqUKazVFwgjrbXXMSYXDr7hbC1tNhvmzZsHtVqNX//619SLJwJoaWmByWRiPScKH1ZTLpdj9uzZdxzzKsz4+HjWLy4UCvHMM8+wXm+oNDQ0ICsrC6WlpWGdDJbiCRcJcnNzczF58mTW6vMHiUTiEZ7VqzC5MOcPP/xw2EVR37FjB7Zu3QqLxXLXhpGMdLhIkPvSSy+Nq1dXT08PUlJS7jjmVZhs77pXq9XYsGEDq3WGwvBksNu3b6dePBEO2wlyc3NzIZFIxj6RJaRSqUfgOa/CfOCBB1h9YgiFwrDZBN3c3IyMjAysWrUKy5Yt47s5FBZgOyyJRqPBY489xkLL/OPBBx/0OOZVmAsWLGBtsVUkEmH58uVhMekTLclgKZ6wnSB3w4YNkEqlLLRsdEQiERYvXuxx3KswdTodawGfFQoF1q1bx0pdwRKNyWApnrC5EyUzM3NcjIlSqfS6ddCrMBmGYW1m9p577uE1IZA7jOTChQtpGMm7gNTUVNhsNlRXV4dUj0wmQ05ODkut8o3D4fAaocGnrywbYpLL5di4cWPI9QRLdXU1CgoKUFVVFbXJYCmesLUTpbCwkHOrqdVqvV7DpzCfeOKJkPvYTqeTs7Teo+FwOGAwGHDkyBFYLBbqgH6XwVaCXJ1Ox2mEA6lUiqefftrraz6FqdPpQs74NXPmzHHPP2Kz2ZCVlYX77rsPL7zwAvXiuUu5dOkSjEZjyDlR1qxZw1KLPJFKpT5XK0btyjqdwe/1EolE474c4U4GW1JSctclg6V4wkaC3CVLlnAWsCs+Pt7DscCNT2HKZDI8/vjjQV/U12wTV9BksBRvhJogV6fTcZJSYazVilE3Sj/77LNBO7Q7nc5xydhFk8FSxiKUBLkymQxz585lv1EAfvGLX/h8bVRhZmZmBt2dTU1N5XxGyx1GkiaDpYxGqAly8/PzIRCwGyh84cKFo86/jCpMhmHw0EMPBXxRqVSKJ598MuD3BUJZWRmKiopgtVrDLpASJfwIJUFueno6q+NMhmG8BDm/kzFj/qxatSrg/WmjzTaFit1uh16vx8WLF2GxWKgDOsVvgk2Qy7aDzMSJE8fUx5jCzM3NDfjCEonE52xTKLS2tiIrKws5OTlYvnw59eKhBEywCXLZGmeq1Wrs2rVrzPPGFGYwnvaPPvpoQOf7gzsZrNlspslgKUET7E6UH//4x6xcPy4uzi9j51f4yg0bNkCtVvt1YZFIFNIyy0hoMlgK2wSTIHfJkiUhp6f011oCXgI+++Lee+/F119/7dfFKyoqguoCj6SjowMFBQV4+umnacQ6CuucOXMGFy9eRGlp6ZjnOhwOyOVy+CkXryQnJ+PLL7/061y/HwEGgwEKhWLM8wghrIwvaTJYCtcEkiBXJpOFtPynVqtx+PBhv8/3W5ijLYYO5+bNmyELs6ioCOXl5bBardSLh8IpJpMJlZWVfoUlCTZmlUgkQn5+fkCzu34Lk2EY/PSnPx0z5Mj3vvc9vy8+EpvN
hoyMDKjVapoMljIuBJIgN9iZWZVKhTfffDOg9wQ0ml23bt2Y3dmZM2cG1AA3TU1NyMrKgtFopGEkKeOKvwlytVptwB5AarUa+/fvD7gbHJAwtVot8vPzfVrNYGdk3clgLRYLTQZL4QV/wpI88sgjAdUpFouxaNGioCZCA57/3bZtm09PIIVCEdD4kiaDpYQTYyXITUlJCchi3nPPPTh48GBQbQlYmBqNBqtWrfJqmgOZkaXJYCnhyGhhSZKSkvxeLomLi8OxY8eCnskNasX09ddf92o1XS6XXw2pqKhAYWEhzGYzDSNJCStaWlpgNpt9Oh/4EwharVZjx44dIa1OBCVMmUyGoqIij4kgh8Mx6pSyO4wkTQZLCWdGS5A7Vhws97hy9erVIbXBb8+fkTgcDsyePRsXLly447iv6lpbW1FQUIC1a9fSiHWUiODo0aNQq9XYsmXL0LH4+Hhcu3bN6/kSiQQPP/wwjh8/HvJe5KCd/2QyGd555507Nnv6inZQW1uLgoICmEwmKkpKxOAtQe5oET1mz57NiiiBEIQJDMRDefbZZ4ca4q3/bTAYUFlZCYvFQpPBUiKOkQlyfW2YTklJwQcffMBa1I7Q3OUxMBHk9vYZvh3LnQz2vvvuo8lgKRHLyAS53nZZpaSkoLGxkdVQrSELUyaT4dChQ3eY+OHJYGkYSUqkMzxB7vAJT3dAALZFCbAgTGDAIygtLQ0zZsygyWApUYk7Qa5UKoVQKERsbCy0Wi0nogQCEKZlrQACwYgyvxxuX50lS5bAarWitbUVf//736kXDyXqKC0txX/+8x/I5XLk5OTg9OnTnGUaCGi5pK18PmZ8/grIniwAFqwVZGPvmjqQPVk4cOAAtmzZgsuXL7PeSK1Wy8oXkJ6ePvR3RUVFUHFGKRSRSIS+vj5OrxGAMNtQPn8GPn+FYM9AfGVY1gqQjbpBoVIo0U9HRwc6OjrueMgDACxrIcjeO/TvmjqCPViLtdiDYOTh/xiz7SgOnVqD3MGLtJXPR/beNJRtpqKk3D0kJSWNEGUbyucLIMgG6ggBGSy5tQIIslvw4IzgruO3MNuOHsIp7EX24PhyFSpByElspP4ClLuWgV7kJpThAtmD4SYqa08d1mAW7g9SH34Ksw1HD53CmrrBJ8KFMmDTDMwvpxM8lLuXtvJV2HQqDWWVG+Gpvxl4sCwXwfYn/RtjtpVj/ozP8cqwp4JlrQDZe9egbsSTgkK5Oxi0lrO4mWPxy2K2HT2EU2uGq78N51sApD2IILvQFEqEcwGfnwLSgh1EjoEfcTwGu7Gv3JalZe0MbDoFrKnzZsIplLuAtvPg0sl0TIspEAyIcG/2bceC7L1pKLtAgpoGplCigun3YxaAU59f8Pqypfy2800wBL0fk0K56xlcu0wru4CTQ8sTnuv9wUCFSaGEQls55s/YhFNDB9JQdiH0ZUQqTAolDGFldwmFQmEXKkwKJQz5/6GFNae7ZI7lAAAAAElFTkSuQmCC", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "- 特殊符号" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "segments.ques_mark_segments" + ], + "outputs": [ + { + "data": { + "text/plain": [ + "['\\\\SIFChoice']" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", 
+ "source": [ + "##### 标记化切分 \r\n", + "如果您不注重题目文本和公式的具体内容,仅仅是对题目的整体(或部分)构成感兴趣,那么可以通过修改 `symbol` 参数来将不同的成分转化成特定标记,方便您的研究。" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + " - symbol:\n", + " - \"t\": text\n", + " - \"f\": formula\n", + " - \"g\": figure\n", + " - \"m\": question mark" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "sif4sci(item[\"stem\"], figures=figures, tokenization=False, symbol=\"tfgm\")" + ], + "outputs": [ + { + "data": { + "text/plain": [ + "['[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[MARK]', '[FIGURE]']" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "#### 令牌化\r\n", + "\r\n", + "为了方便后续向量化表征试题,本模块提供题目文本的令牌化解析(Tokenization),即将题目转换成令牌序列。 \r\n", + "\r\n", + "根据构成题目的元素类型,解析功能分为 **“文本解析”** 和 **“公式解析”** 两部分。更具体的过程解析参见 [令牌化](../Tokenizer/tokenizer.ipynb)。" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "tokens = sif4sci(item[\"stem\"], figures=figures, tokenization=True)" + ], + "outputs": [], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "- 文本解析结果" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "tokens.text_tokens" + ], + "outputs": [ + { + "data": { + "text/plain": [ + "['如图',\n", + " '古希腊',\n", + " '数学家',\n", + " '希波',\n", + " '克拉底',\n", + " '研究',\n", + " '几何图形',\n", + " '此图',\n", + " '三个',\n", + " '半圆',\n", + " 
'三个',\n", + " '半圆',\n", + " '直径',\n", + " '直角三角形',\n", + " '斜边',\n", + " '直角',\n", + " '三边',\n", + " '围成',\n", + " '区域',\n", + " '记',\n", + " '黑色',\n", + " '记',\n", + " '其余部分',\n", + " '记',\n", + " '图形',\n", + " '中',\n", + " '随机',\n", + " '取',\n", + " '一点',\n", + " '此点',\n", + " '取自',\n", + " '概率',\n", + " '记']" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "##### 公式解析结果" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "tokens.formula_tokens" + ], + "outputs": [ + { + "data": { + "text/plain": [ + "['ABC',\n", + " 'BC',\n", + " 'AB',\n", + " 'AC',\n", + " '\\\\bigtriangleup',\n", + " 'ABC',\n", + " 'I',\n", + " 'II',\n", + " 'III',\n", + " 'I',\n", + " ',',\n", + " 'II',\n", + " ',',\n", + " 'III',\n", + " 'p',\n", + " '_',\n", + " '1',\n", + " ',',\n", + " 'p',\n", + " '_',\n", + " '2',\n", + " ',',\n", + " 'p',\n", + " '_',\n", + " '3']" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "- 自定义参数,得到定制化解析结果" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "(1)如果您想按 latex 语法标记拆分公式的各个部分,并得到顺序序列结果,输出方法(`method`)可以选择:`linear`" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "sif4sci(\r\n", + " item[\"stem\"],\r\n", + " figures=figures,\r\n", + " tokenization=True,\r\n", + " tokenization_params={\r\n", + " \"formula_params\": {\r\n", + " \"method\": \"linear\",\r\n", + " }\r\n", + " }\r\n", + ").formula_tokens" + ], + "outputs": [ + { + "data": { + "text/plain": [ + "['ABC',\n", + " 'BC',\n", + " 'AB',\n", + " 'AC',\n", + " '\\\\bigtriangleup',\n", + " 'ABC',\n", + " 'I',\n", + " 
'II',\n", + " 'III',\n", + " 'I',\n", + " ',',\n", + " 'II',\n", + " ',',\n", + " 'III',\n", + " 'p',\n", + " '_',\n", + " '1',\n", + " ',',\n", + " 'p',\n", + " '_',\n", + " '2',\n", + " ',',\n", + " 'p',\n", + " '_',\n", + " '3']" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "(2) 如果您想得到公式解析出的语法分析树序列,输出方法可以选择:`ast`\n", + "> 抽象语法分析树,简称语法树(Syntax tree),是源代码语法结构的一种抽象表示。它以树状的形式表现编程语言的语法结构,树上的每个节点都表示源代码中的一种结构。 \n", + "> 因此,ast 可以看做是公式的语法结构表征。" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "sif4sci(\r\n", + " item[\"stem\"],\r\n", + " figures=figures,\r\n", + " tokenization=True,\r\n", + " tokenization_params={\r\n", + " \"formula_params\":{\r\n", + " \"method\": \"ast\",\r\n", + " }\r\n", + " }\r\n", + ").formula_tokens\r\n" + ], + "outputs": [ + { + "data": { + "text/plain": [ + "[,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "- 语法树展示:" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "f = sif4sci(\r\n", + " item[\"stem\"],\r\n", + " figures=figures,\r\n", + " tokenization=True,\r\n", + " tokenization_params={\r\n", + " \"formula_params\":{\r\n", + " \"method\": \"ast\",\r\n", + " \"return_type\": \"ast\",\r\n", + " \"ord2token\": True,\r\n", + " \"var_numbering\": True,\r\n", + " }\r\n", + " }\r\n", + ").formula_tokens\r\n", + "f\r\n" + ], + "outputs": [ + { + "data": { + "text/plain": [ + "[,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "metadata": {} + }, + { + 
"cell_type": "code", + "execution_count": null, + "source": [ + "for i in range(0, len(f)):\r\n", + " ForestPlotter().export(\r\n", + " f[i], root_list=[node for node in f[i]],\r\n", + " )\r\n", + "# plt.show()\r\n" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "(3)如果您只是关心公式的结构和类型,并不关心变量具体是什么,比如二元二次方程 `x^2 + y = 1` ,它从公式结构和类型上来说,和 `w^2 + z = 1` 没有区别。 \n", + "此时,您可以设置如下参数:`ord2token = True`,将公式变量名转换成 token" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "sif4sci(\r\n", + " item[\"stem\"],\r\n", + " figures=figures,\r\n", + " tokenization=True,\r\n", + " tokenization_params={\r\n", + " \"formula_params\":{\r\n", + " \"method\": \"ast\",\r\n", + " \"return_type\": \"list\",\r\n", + " \"ord2token\": True,\r\n", + " }\r\n", + " }\r\n", + ").formula_tokens" + ], + "outputs": [ + { + "data": { + "text/plain": [ + "['mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " '\\\\bigtriangleup',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " ',',\n", + " 'mathord',\n", + " 'mathord',\n", + " ',',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'textord',\n", + " '\\\\supsub',\n", + " ',',\n", + " 'mathord',\n", + " 'textord',\n", + " '\\\\supsub',\n", + " ',',\n", + " 'mathord',\n", + " 'textord',\n", + " '\\\\supsub']" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "(4) 如果您除了 (3) 中提供的功能之外,还需要区分不同的变量。此时可以另外设置参数:`var_numbering=True`" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + 
"sif4sci(\r\n", + " item[\"stem\"],\r\n", + " figures=figures,\r\n", + " tokenization=True,\r\n", + " tokenization_params={\r\n", + " \"formula_params\":{\r\n", + " \"method\": \"ast\",\r\n", + " \"ord2token\": True,\r\n", + " \"return_type\": \"list\",\r\n", + " \"var_numbering\": True\r\n", + " }\r\n", + " }\r\n", + ").formula_tokens" + ], + "outputs": [ + { + "data": { + "text/plain": [ + "['mathord_0',\n", + " 'mathord_1',\n", + " 'mathord_2',\n", + " 'mathord_1',\n", + " 'mathord_2',\n", + " 'mathord_0',\n", + " 'mathord_1',\n", + " 'mathord_0',\n", + " 'mathord_2',\n", + " '\\\\bigtriangleup',\n", + " 'mathord_0',\n", + " 'mathord_1',\n", + " 'mathord_2',\n", + " 'mathord_3',\n", + " 'mathord_3',\n", + " 'mathord_3',\n", + " 'mathord_3',\n", + " 'mathord_3',\n", + " 'mathord_3',\n", + " 'mathord_3',\n", + " ',',\n", + " 'mathord_3',\n", + " 'mathord_3',\n", + " ',',\n", + " 'mathord_3',\n", + " 'mathord_3',\n", + " 'mathord_3',\n", + " 'mathord_4',\n", + " 'textord',\n", + " '\\\\supsub',\n", + " ',',\n", + " 'mathord_4',\n", + " 'textord',\n", + " '\\\\supsub',\n", + " ',',\n", + " 'mathord_4',\n", + " 'textord',\n", + " '\\\\supsub']" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "### 综合训练\r\n", + "\r\n", + "综合上述方法,将题目转换成令牌序列,为后续向量化做准备。" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "sif4sci(item[\"stem\"], figures=figures, tokenization=True,\r\n", + " symbol=\"fgm\")" + ], + "outputs": [ + { + "data": { + "text/plain": [ + "['如图', '古希腊', '数学家', '希波', '克拉底', '研究', '几何图形', '此图', '三个', '半圆', '三个', '半圆', '直径', '直角三角形', '[FORMULA]', '斜边', '[FORMULA]', '直角', '[FORMULA]', '[FORMULA]', '[FORMULA]', '三边', '围成', '区域', '记', '[FORMULA]', '黑色', '记', '[FORMULA]', '其余部分', '记', '[FORMULA]', '图形', '中', '随机', 
'取', '一点', '此点', '取自', '[FORMULA]', '概率', '记', '[FORMULA]', '[MARK]', '[FIGURE]']" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "## Tokenizer" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "from EduNLP.Tokenizer import PureTextTokenizer, TextTokenizer, get_tokenizer" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "D:\\MySoftwares\\Anaconda\\envs\\data\\lib\\site-packages\\gensim\\similarities\\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.\n", + " warnings.warn(msg)\n" + ] + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "### TextTokenizer" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "items = [{\r\n", + " \"stem\": \"已知集合$A=\\\\left\\\\{x \\\\mid x^{2}-3 x-4<0\\\\right\\\\}, \\\\quad B=\\\\{-4,1,3,5\\\\}, \\\\quad$ 则 $A \\\\cap B=$\",\r\n", + " \"options\": [\"1\", \"2\"]\r\n", + " }]\r\n", + "tokenizer = get_tokenizer(\"text\") # tokenizer = TextTokenizer()\r\n", + "tokens = tokenizer(items, key=lambda x: x[\"stem\"])\r\n", + "print(next(tokens))" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "['已知', '集合', 'A', '=', '\\\\left', '\\\\{', 'x', '\\\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', '0', '\\\\right', '\\\\}', ',', '\\\\quad', 'B', '=', '\\\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\\\}', ',', '\\\\quad', 'A', '\\\\cap', 'B', '=']\n" + ] + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "items = 
[\"有公式$\\\\FormFigureID{wrong1?}$,如图$\\\\FigureID{088f15ea-xxx}$,若$x,y$满足约束条件公式$\\\\FormFigureBase64{wrong2?}$,$\\\\SIFSep$,则$z=x+7 y$的最大值为$\\\\SIFBlank$\"]" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "\r\n", + "tokenizer = get_tokenizer(\"text\") # tokenizer = TextTokenizer()\r\n", + "tokens = [t for t in tokenizer(items)]\r\n", + "tokens" + ], + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "[['公式',\n", + " '[FORMULA]',\n", + " '如图',\n", + " '[FIGURE]',\n", + " 'x',\n", + " ',',\n", + " 'y',\n", + " '约束条件',\n", + " '公式',\n", + " '[FORMULA]',\n", + " '[SEP]',\n", + " 'z',\n", + " '=',\n", + " 'x',\n", + " '+',\n", + " '7',\n", + " 'y',\n", + " '最大值',\n", + " '[MARK]']]" + ] + }, + "metadata": {} + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "### PureTextTokenizer" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "tokenizer = get_tokenizer(\"pure_text\") # tokenizer = PureTextTokenizer()\r\n", + "tokens = [t for t in tokenizer(items)]\r\n", + "tokens" + ], + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "[['公式',\n", + " '如图',\n", + " '[FIGURE]',\n", + " 'x',\n", + " ',',\n", + " 'y',\n", + " '约束条件',\n", + " '公式',\n", + " '[SEP]',\n", + " 'z',\n", + " '=',\n", + " 'x',\n", + " '+',\n", + " '7',\n", + " 'y',\n", + " '最大值',\n", + " '[MARK]']]" + ] + }, + "metadata": {} + } + ], + "metadata": {} } ], "metadata": { diff --git a/examples/tokenizer/tokenizer2.ipynb b/examples/tokenizer/tokenizer2.ipynb new file mode 100644 index 00000000..4819b00d --- /dev/null +++ b/examples/tokenizer/tokenizer2.ipynb @@ -0,0 +1,501 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Tokenizer\n", + "\n", + "## 概述\n", + "\n", + "为了方便后续向量化表征试题,本模块提供题目文本的令牌化解析(Tokenization),即将题目转换成令牌序列。 \n", + "\n", + "根据构成题目的元素类型,解析功能分为 **“文本解析”** 和 **“公式解析”** 两部分。\n", 
+ "\n", + "### 文本解析\n", + "\n", + "根据题目文本切分粒度的大小,文本解析又分为 **“句解析”** 和 **“词解析”**。\n", + "\n", + "(1) 句解析(sentence-tokenization):将较长的文档切分成若干句子的过程称为“分句”。每个句子为一个“令牌”(token)。(待实现) \n", + " \n", + "\n", + "(2) 词解析(text-tokenization):一个句子(不含公式)是由若干“词”按顺序构成的,将一个句子切分为若干词的过程称为“词解析”。根据词的粒度大小,又可细分为“词组解析”和\"单字解析\"。\n", + "- 词组解析 (word-tokenization):每一个词组为一个“令牌”(token)。\n", + "- 单字解析 (char-tokenization):单个字符即为一个“令牌”(token)。\n", + "\n", + "### 公式解析\n", + "\n", + "公式解析(formula-tokenization):理科类文本中常常含有公式。将一个符合 latex 语法的公式切分为标记字符列表的过程称为“公式解析”。每个标记字符为一个“令牌”(token)。 \n", + " " + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 文本解析" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "### 句解析\n", + "\n", + "待实现..." + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "### 词解析\n", + "\n", + "词解析分为两个主要步骤: \n", + "\n", + "(1) 分词: \n", + "- 词组解析:使用分词工具切分并提取题目文本中的词。 \n", + " 本项目目前支持的分词工具有:`jieba` \n", + "- 单字解析:按字符划分。\n", + " \n", + " \n", + "(2) 筛选:过滤指定的停用词。 \n", + "- 本项目默认使用的停用词表:[stopwords](https://github.com/bigdata-ustc/EduNLP/blob/master/EduNLP/meta_data/sif_stopwords.txt) \n", + "- 你也可以使用自己的停用词表,具体使用方法见下面的示例。\n" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 1, + "source": [ + "# 导入模块\n", + "from EduNLP.SIF.tokenization.text import tokenize " + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 2, + "source": [ + "# 输入\n", + "text = \"三角函数是基本初等函数之一\"" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "#### 词组解析\n", + "\n", + "分词粒度参数选择 word: `granularity = \"word\"` " + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 3, + "source": [ + "# 输出:默认使用 EduNLP 项目提供的停用词表\n", + "tokenize(text, granularity=\"word\")" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['三角函数', '初等', '函数']" + ] + }, + "metadata": {}, + "execution_count": 3 + } + ], + 
"metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "#### 单字解析\n", + "\n", + "分词粒度参数选择 word: `granularity = \"char\"` " + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 4, + "source": [ + "# 输出:默认使用 EduNLP 项目提供的停用词表\n", + "tokenize(text, granularity=\"char\")" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['三', '角', '函', '数', '基', '初', '函', '数']" + ] + }, + "metadata": {}, + "execution_count": 4 + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "#### 停用词表" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 5, + "source": [ + "# 获取自己的停用词表\n", + "spath = \"test_stopwords.txt\"\n", + "from EduNLP.SIF.tokenization.text.stopwords import get_stopwords\n", + "stopwords = get_stopwords(spath)\n", + "stopwords" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "{'一旦', '一时', '一来', '一样', '一次', '一片', '一番', '一直', '一致'}" + ] + }, + "metadata": {}, + "execution_count": 5 + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 6, + "source": [ + "# 输出:传入停用词表(stopwords)\n", + "tokenize(text,granularity=\"word\",stopwords=stopwords)" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['三角函数', '是', '基本', '初等', '函数', '之一']" + ] + }, + "metadata": {}, + "execution_count": 6 + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 公式解析\n", + "切分出 latex 公式的每个标记符号。针对本模块更加详细的解释参见 [formula](../formula/formula.ipynb)" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 7, + "source": [ + "# 导入模块\n", + "from EduNLP.SIF.tokenization.formula import tokenize" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "- 输入" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 8, + "source": [ + "formula = \"\\\\frac{\\\\pi}{x + y} + 1 = x\"" + ], + 
"outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "- 输出" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "(1)如果您想按 latex 语法标记拆分公式的各个部分,并得到顺序序列结果,输出方法可以选择:`linear`" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 9, + "source": [ + "tokenize(formula, method=\"linear\")" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['\\\\frac', '{', '\\\\pi', '}', '{', 'x', '+', 'y', '}', '+', '1', '=', 'x']" + ] + }, + "metadata": {}, + "execution_count": 9 + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "(2) 如果您想得到公式解析出的语法分析树序列,输出方法可以选择:`ast`\n", + "> 抽象语法分析树,简称语法树(Syntax tree),是源代码语法结构的一种抽象表示。它以树状的形式表现编程语言的语法结构,树上的每个节点都表示源代码中的一种结构。 \n", + "> 因此,ast 可以看做是公式的语法结构表征。" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 10, + "source": [ + "tokenize(formula, method=\"ast\", return_type=\"list\", ord2token=False)" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['\\\\pi', '{ }', 'x', '+', 'y', '{ }', '\\\\frac', '+', '1', '=', 'x']" + ] + }, + "metadata": {}, + "execution_count": 10 + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "(3)如果您只是关心公式的结构和类型,并不关心变量具体是什么,比如二元二次方程 `x^2 + y = 1` ,它从公式结构和类型上来说,和 `w^2 + z = 1` 没有区别。 \n", + "此时,您可以设置如下参数:`ord2token = True`,将公式变量名转换成 token" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 11, + "source": [ + "# 输出形式选择抽象语法分析树(ast)且将公式变量名转换成 token\n", + "tokenize(formula, method=\"ast\", return_type=\"list\", ord2token=True)" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['mathord',\n", + " '{ }',\n", + " 'mathord',\n", + " '+',\n", + " 'mathord',\n", + " '{ }',\n", + " '\\\\frac',\n", + " '+',\n", + " 'textord',\n", + " '=',\n", + " 'mathord']" + ] + }, + "metadata": {}, + "execution_count": 11 + } + ], + "metadata": {} + }, 
+ { + "cell_type": "markdown", + "source": [ + "(4) 如果您除了 (3) 中提供的功能之外,还需要区分不同的变量。此时可以另外设置参数:`var_numbering=True`" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 12, + "source": [ + "# 输出形式选择抽象语法分析树(ast)且将公式变量名转换成带编号的 token\n", + "tokenize(formula, method=\"ast\", return_type=\"list\", ord2token=True, var_numbering=True)" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['mathord_con',\n", + " '{ }',\n", + " 'mathord_0',\n", + " '+',\n", + " 'mathord_1',\n", + " '{ }',\n", + " '\\\\frac',\n", + " '+',\n", + " 'textord',\n", + " '=',\n", + " 'mathord_0']" + ] + }, + "metadata": {}, + "execution_count": 12 + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 综合解析\n", + "\n", + "综合解析,即综合以上两种解析方式(标记解析 + 公式解析),提供对题目文本的全解析。另外,如遇到特殊符号将转换成常量,例如:\n", + "```python\n", + "FIGURE_SYMBOL = \"[FIGURE]\" # $\\SIFChoice$\n", + "QUES_MARK_SYMBOL = \"[MARK]\" # $\\FigureID{1}$\n", + "```\n" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 39, + "source": [ + "# 导入模块\n", + "from EduNLP.Tokenizer import get_tokenizer\n", + "\n", + "# 输入\n", + "item = {\n", + " \"如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\SIFChoice$$\\FigureID{1}$\"\n", + "}\n", + "\n", + "# 输出\n", + "tokenizer = get_tokenizer(\"text\")\n", + "tokens = tokenizer(item)\n", + "next(tokens) " + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['如图',\n", + " '古希腊',\n", + " '数学家',\n", + " '希波',\n", + " '克拉底',\n", + " '研究',\n", + " '几何图形',\n", + " '此图',\n", + " '三个',\n", + " '半圆',\n", + " '三个',\n", + " '半圆',\n", + " '直径',\n", + " '直角三角形',\n", + " 'ABC',\n", + " '斜边',\n", + " 'BC',\n", + " '直角',\n", + " 'AB',\n", + " 'AC',\n", + " '\\x08',\n", + " 'igtriangleupABC',\n", + " '三边',\n", + " '围成',\n", + " '区域',\n", + " 
'记',\n", + " 'I',\n", + " '黑色',\n", + " '记',\n", + " 'II',\n", + " '其余部分',\n", + " '记',\n", + " 'III',\n", + " '图形',\n", + " '中',\n", + " '随机',\n", + " '取',\n", + " '一点',\n", + " '此点',\n", + " '取自',\n", + " 'I',\n", + " ',',\n", + " 'II',\n", + " ',',\n", + " 'III',\n", + " '概率',\n", + " '记',\n", + " 'p',\n", + " '_',\n", + " '1',\n", + " ',',\n", + " 'p',\n", + " '_',\n", + " '2',\n", + " ',',\n", + " 'p',\n", + " '_',\n", + " '3',\n", + " '[MARK]',\n", + " '[FIGURE]']" + ] + }, + "metadata": {}, + "execution_count": 39 + } + ], + "metadata": {} + } + ], + "metadata": { + "orig_nbformat": 4, + "language_info": { + "name": "python", + "version": "3.8.5", + "mimetype": "text/x-python", + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "pygments_lexer": "ipython3", + "nbconvert_exporter": "python", + "file_extension": ".py" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3.8.5 64-bit" + }, + "interpreter": { + "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/examples/utils/data.ipynb b/examples/utils/data.ipynb index 8009e8d5..ded161cf 100644 --- a/examples/utils/data.ipynb +++ b/examples/utils/data.ipynb @@ -7,6 +7,13 @@ ], "metadata": {} }, + { + "cell_type": "markdown", + "source": [ + "## 导入模块" + ], + "metadata": {} + }, { "cell_type": "code", "execution_count": 1, @@ -18,21 +25,28 @@ "output_type": "stream", "name": "stderr", "text": [ - "/home/lvrui/.local/lib/python3.8/site-packages/gensim/similarities/__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package is unavailable. Install Levenhstein (e.g. 
`pip install python-Levenshtein`) to suppress this warning.\n", - " warnings.warn(msg)\n" + "/usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (1.26.6) or chardet (3.0.4) doesn't match a supported version!\n", + " warnings.warn(\"urllib3 ({}) or chardet ({}) doesn't match a supported \"\n" ] } ], "metadata": {} }, + { + "cell_type": "markdown", + "source": [ + "## 测试数据" + ], + "metadata": {} + }, { "cell_type": "code", "execution_count": 3, "source": [ - "item = {\r\n", - " \"stem\": r\"若复数$z=1+2 i+i^{3}$,则$|z|=$\",\r\n", - " \"options\": ['0', '1', r'$\\sqrt{2}$', '2'],\r\n", - " }\r\n", + "item = {\n", + " \"stem\": r\"若复数$z=1+2 i+i^{3}$,则$|z|=$\",\n", + " \"options\": ['0', '1', r'$\\sqrt{2}$', '2'],\n", + " }\n", "item" ], "outputs": [ @@ -50,23 +64,40 @@ ], "metadata": {} }, + { + "cell_type": "markdown", + "source": [ + "## 区分题目成分" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "- 通过添加特殊关键词的方式,区分题目的题干和选项,选项之间按顺序做编号标记。" + ], + "metadata": {} + }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 15, "source": [ - "# 给题目各个部分加标签\r\n", - "dict2str4sif(item)" + "dict2str4sif(item,key_as_tag=True,\n", + " add_list_no_tag=False,\n", + " # keys=[\"options\"],\n", + " tag_mode=\"head\"\n", + " )\n" ], "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ - "'$\\\\SIFTag{stem_begin}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\\\SIFTag{stem_end}$$\\\\SIFTag{options_begin}$$\\\\SIFTag{list_0}$0$\\\\SIFTag{list_1}$1$\\\\SIFTag{list_2}$$\\\\sqrt{2}$$\\\\SIFTag{list_3}$2$\\\\SIFTag{options_end}$'" + "'$\\\\SIFTag{stem}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\\\SIFTag{options}$0$\\\\SIFSep$1$\\\\SIFSep$$\\\\sqrt{2}$$\\\\SIFSep$2'" ] }, "metadata": {}, - "execution_count": 4 + "execution_count": 15 } ], "metadata": {} @@ -93,60 +124,60 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "source": [ - "dict2str4sif(item, tag_mode=\"head\")" + "dict2str4sif(item, 
add_list_no_tag=False)" ], "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ - "'$\\\\SIFTag{stem}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\\\SIFTag{options}$$\\\\SIFTag{list_0}$0$\\\\SIFTag{list_1}$1$\\\\SIFTag{list_2}$$\\\\sqrt{2}$$\\\\SIFTag{list_3}$2'" + "'$\\\\SIFTag{stem_begin}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\\\SIFTag{stem_end}$$\\\\SIFTag{options_begin}$0$\\\\SIFSep$1$\\\\SIFSep$$\\\\sqrt{2}$$\\\\SIFSep$2$\\\\SIFTag{options_end}$'" ] }, "metadata": {}, - "execution_count": 7 + "execution_count": 6 } ], "metadata": {} }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 7, "source": [ - "dict2str4sif(item, tag_mode=\"tail\")" + "dict2str4sif(item, tag_mode=\"head\")" ], "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ - "'若复数$z=1+2 i+i^{3}$,则$|z|=$$\\\\SIFTag{stem}$$\\\\SIFTag{list_0}$0$\\\\SIFTag{list_1}$1$\\\\SIFTag{list_2}$$\\\\sqrt{2}$$\\\\SIFTag{list_3}$2$\\\\SIFTag{options}$'" + "'$\\\\SIFTag{stem}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\\\SIFTag{options}$$\\\\SIFTag{list_0}$0$\\\\SIFTag{list_1}$1$\\\\SIFTag{list_2}$$\\\\sqrt{2}$$\\\\SIFTag{list_3}$2'" ] }, "metadata": {}, - "execution_count": 9 + "execution_count": 7 } ], "metadata": {} }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "source": [ - "dict2str4sif(item, add_list_no_tag=False)" + "dict2str4sif(item, tag_mode=\"tail\")" ], "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ - "'$\\\\SIFTag{stem_begin}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\\\SIFTag{stem_end}$$\\\\SIFTag{options_begin}$0$\\\\SIFSep$1$\\\\SIFSep$$\\\\sqrt{2}$$\\\\SIFSep$2$\\\\SIFTag{options_end}$'" + "'若复数$z=1+2 i+i^{3}$,则$|z|=$$\\\\SIFTag{stem}$$\\\\SIFTag{list_0}$0$\\\\SIFTag{list_1}$1$\\\\SIFTag{list_2}$$\\\\sqrt{2}$$\\\\SIFTag{list_3}$2$\\\\SIFTag{options}$'" ] }, "metadata": {}, - "execution_count": 10 + "execution_count": 9 } ], "metadata": {} diff --git a/examples/vectorization/total_vector.ipynb b/examples/vectorization/total_vector.ipynb 
new file mode 100644 index 00000000..4a91634c --- /dev/null +++ b/examples/vectorization/total_vector.ipynb @@ -0,0 +1,557 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# 向量化\r\n", + "\r\n", + "## 简述\r\n", + "\r\n", + "向量化过程是将item转换为向量的过程,其前置步骤为语法解析、成分分解、令牌化,本部分将先后介绍如何获得数据集、如何使用本地的预训练模型、如何直接调用远程提供的预训练模型。" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 获得数据集\r\n", + "\r\n", + "### 概述\r\n", + "\r\n", + "此部分通过调用 [OpenLUNA.json](http://base.ustc.edu.cn/data/OpenLUNA/OpenLUNA.json) 获得。" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## I2V\r\n", + "\r\n", + "### 概述\r\n", + "\r\n", + "使用自己提供的任一预训练模型(给出模型存放路径即可)将给定的题目文本转成向量。\r\n", + "\r\n", + "- 优点:可以使用自己的模型,另可调整训练参数,灵活性强。" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "### D2V" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "#### 导入类" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 1, + "source": [ + "from EduNLP.I2V import D2V" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "D:\\MySoftwares\\Anaconda\\envs\\data\\lib\\site-packages\\gensim\\similarities\\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package is unavailable. Install Levenhstein (e.g. 
`pip install python-Levenshtein`) to suppress this warning.\n", + " warnings.warn(msg)\n" + ] + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "#### 输入\r\n", + "\r\n", + "类型:str \r\n", + "内容:题目文本 (text)" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 2, + "source": [ + "items = [\r\n", + "r\"1如图几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\SIFChoice$$\\FigureID{1}$\",\r\n", + "r\"2如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\SIFChoice$$\\FigureID{1}$\"\r\n", + "]" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "#### 输出" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 3, + "source": [ + "model_path = \"./d2v/test_d2v_256.bin\"\r\n", + "i2v = D2V(\"pure_text\",\"d2v\",filepath=model_path, pretrained_t2v = False)\r\n", + "\r\n", + "item_vectors, token_vectors = i2v(items)\r\n", + "print(item_vectors[0])\r\n", + "print(token_vectors) # For d2v, token_vector is None\r\n", + "print(\"shape of item_vector: \",len(item_vectors), item_vectors[0].shape) " + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[ 0.10603202 -0.10537548 -0.04773913 0.15573525 0.25898772 -0.06423073\n", + " -0.02817309 0.0068187 -0.07323898 0.06517941 0.07943465 0.14800762\n", + " -0.06772996 -0.23892336 0.04638071 0.1539897 0.17565852 0.02895202\n", + " -0.18859927 0.2180874 0.00909669 0.06621908 -0.02090263 -0.13006955\n", + " -0.21020882 0.00618349 0.00531093 -0.04877732 -0.06709669 -0.04705636\n", + " 0.09211092 0.13896106 -0.07455818 0.06019318 -0.09071473 0.12701215\n", + " 0.13018885 -0.02784999 0.10064025 -0.07757548 -0.05522636 -0.02657779\n", + " -0.04159601 
-0.03008493 0.10995369 -0.00587291 0.05902484 0.06532726\n", + " 0.04887666 0.01902074 0.03713945 0.03691795 0.12516327 0.07410683\n", + " -0.14467879 0.05678609 0.02574336 -0.1320522 0.07502684 0.07929367\n", + " -0.06655917 -0.0144536 0.02595847 0.04403471 0.21743318 -0.02525017\n", + " -0.0416184 0.21441495 -0.09308876 -0.09418222 0.08030997 0.00492512\n", + " -0.04921608 -0.07808654 -0.03323801 0.0879296 -0.04668022 -0.0696011\n", + " 0.06708417 0.06555629 -0.07418457 -0.13050951 -0.01802611 0.11730465\n", + " -0.0479078 0.06389603 0.12324224 -0.17746696 -0.09874132 -0.07683054\n", + " 0.06596514 -0.04210603 0.03182372 -0.1455575 0.03900012 0.13290605\n", + " -0.07672353 -0.02826704 -0.00803517 -0.09681892 -0.15212329 -0.10524812\n", + " 0.03367848 0.10413344 -0.0089777 0.0583192 -0.01553376 0.02675472\n", + " 0.12278829 0.01667286 0.01958599 -0.06468913 0.08307286 0.07304061\n", + " -0.10451686 -0.04367925 0.0143903 0.11394493 0.00759796 -0.03158598\n", + " -0.01733392 -0.12918264 0.1761386 -0.02913121 -0.01364522 0.01497996\n", + " 0.09318532 -0.03958051 0.00465893 -0.01766865 -0.03531685 0.01445563\n", + " 0.05919004 -0.10480376 -0.08359206 -0.08283877 -0.04920156 0.0486405\n", + " 0.0059151 -0.03783213 -0.01815955 -0.0157437 0.2334638 0.15233137\n", + " -0.2698607 -0.04492244 0.03728078 0.06730984 0.09165722 0.07212968\n", + " -0.1418279 -0.10517611 -0.0469548 -0.01878718 -0.08850995 0.07481015\n", + " 0.15206474 0.0923347 -0.08849481 0.01736124 0.12647657 -0.03515046\n", + " 0.07980374 -0.06639698 0.00411603 0.0479564 0.04197159 0.0854824\n", + " 0.103918 -0.01195896 0.05059687 -0.03206704 0.0277859 0.05210226\n", + " -0.15160614 -0.01996467 -0.00720571 -0.01154042 0.10944121 -0.00173247\n", + " 0.11439639 -0.04765575 0.05989955 -0.05265343 0.11914644 0.0085329\n", + " -0.13220952 -0.1538407 -0.07261448 0.04143476 0.15447438 0.02005473\n", + " 0.14354227 0.10015973 0.12290012 0.05011315 0.0425972 -0.13731483\n", + " 0.02323116 -0.1031343 -0.17960383 
-0.04875064 0.14352156 0.04516263\n", + " -0.04433561 0.11548021 -0.2057457 -0.02778868 -0.06643672 0.05604808\n", + " 0.04864014 -0.03015646 0.07734285 0.00573904 0.01155302 0.02486293\n", + " 0.16259493 0.05099423 -0.15283771 -0.01909443 -0.12749314 0.06718695\n", + " 0.08334705 -0.05442797 0.03448674 -0.00542413 0.00832719 0.02702984\n", + " -0.02359959 -0.00855793 -0.19381124 -0.13036375 -0.0351354 -0.03983364\n", + " 0.0133928 0.07395492 0.04119737 0.05661048 0.08151852 -0.1529391\n", + " 0.00742581 0.05521343 0.02089992 -0.00824985 -0.00211842 -0.05555268\n", + " 0.05448649 -0.02032894 -0.0760811 -0.01713146 -0.16146915 0.10822926\n", + " -0.1240218 -0.03639562 -0.20028785 -0.02452293]\n", + "None\n", + "shape of item_vector: 2 (256,)\n" + ] + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "### W2V" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 4, + "source": [ + "from EduNLP.I2V import W2V" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 5, + "source": [ + "model_path = \"./w2v/general_literal_300/general_literal_300.kv\"\r\n", + "i2v = W2V(\"pure_text\",\"w2v\",filepath=model_path, pretrained_t2v = False)\r\n", + "\r\n", + "item_vectors, token_vectors = i2v(items)\r\n", + "\r\n", + "print(item_vectors[0])\r\n", + "print(token_vectors[0][0])\r\n", + "print(\"shape of item_vectors: \", len(item_vectors), item_vectors[0].shape)\r\n", + "print(\"shape of token_vectors: \", len(token_vectors), len(token_vectors[0]), len(token_vectors[0][0]))" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[-1.34266680e-03 5.19845746e-02 -1.98070258e-02 -4.17470075e-02\n", + " 4.92814295e-02 -1.70883536e-01 -2.16597781e-01 -3.12069029e-01\n", + " 8.96430463e-02 -1.31331667e-01 9.16494895e-03 -3.22572999e-02\n", + " 3.07940125e-01 1.92060292e-01 1.31043345e-02 6.10962026e-02\n", + " 2.21019030e-01 -3.53541046e-01 1.34150490e-01 
1.14867561e-01\n", + " 1.17448963e-01 2.27990672e-01 -1.65213019e-01 2.78246611e-01\n", + " -4.36594114e-02 -1.37816787e-01 -1.07707813e-01 -1.80805102e-01\n", + " 1.20028563e-01 -1.14409983e-01 6.19181581e-02 -1.79836392e-01\n", + " 7.68677965e-02 2.41688967e-01 6.20721914e-02 -7.59824514e-02\n", + " 1.79465964e-01 1.69306010e-01 -1.99512452e-01 -9.75036696e-02\n", + " 1.02485821e-01 -1.59723386e-01 -1.67252243e-01 1.52240042e-02\n", + " -5.98842278e-03 6.47612512e-02 8.48228261e-02 2.67874986e-01\n", + " -1.73656959e-02 -4.40101810e-02 9.11948457e-02 1.40905827e-01\n", + " 6.33735815e-03 2.03221604e-01 -1.97303146e-01 1.32987842e-01\n", + " -1.80283263e-01 3.64040211e-02 2.49624569e-02 7.49479085e-02\n", + " -2.05568615e-02 -4.02397066e-02 -1.08619891e-01 -1.04757406e-01\n", + " -8.36341307e-02 6.61163032e-02 -1.11632387e-03 3.96131463e-02\n", + " -3.51454802e-02 9.09155831e-02 1.87938929e-01 -2.40521863e-01\n", + " -5.97307160e-02 1.74426511e-01 1.56350788e-02 -4.20243442e-02\n", + " -1.90285146e-02 -3.85696471e-01 -1.01543151e-01 -1.42145246e-01\n", + " 2.33298853e-01 1.85939763e-02 -2.68633634e-01 -3.60178575e-02\n", + " 3.64447385e-02 -1.70443758e-01 1.03326524e-02 -1.47003353e-01\n", + " -9.38110873e-02 1.63190335e-01 1.20674491e-01 -2.36976147e-02\n", + " -6.52602538e-02 1.29773334e-01 -4.23593611e-01 2.43276700e-01\n", + " -8.00978579e-03 -9.39133018e-03 1.17623486e-01 1.16482794e-01\n", + " -9.27330479e-02 6.17316999e-02 1.51295820e-02 2.25901395e-01\n", + " 7.31975585e-02 2.29724105e-02 9.95925292e-02 -1.10697523e-01\n", + " 2.28960160e-02 8.65939483e-02 1.16645083e-01 -7.00058565e-02\n", + " 1.13389529e-01 -5.30471019e-02 1.43660516e-01 1.61379650e-02\n", + " 6.77419230e-02 8.09707418e-02 2.09957212e-01 -6.64654151e-02\n", + " -1.81450248e-01 2.21659631e-01 -4.53737518e-03 4.69567403e-02\n", + " 8.59350115e-02 1.17934339e-01 -1.98988736e-01 -4.13361974e-02\n", + " 1.26167178e-01 3.84825058e-02 1.64396539e-01 -1.63344927e-02\n", + " 9.12889242e-02 
-1.13650873e-01 -1.37156844e-02 2.06742659e-02\n", + " -9.15742964e-02 7.41296187e-02 2.50813574e-01 -1.35987863e-01\n", + " -1.11708120e-01 -1.52451068e-01 1.08608082e-01 4.99855466e-02\n", + " 1.68440521e-01 -2.47063249e-01 -2.21773341e-01 4.81536575e-02\n", + " -7.66365305e-02 2.55189091e-01 -5.60788438e-03 -2.69066542e-02\n", + " 2.07698755e-02 1.36008840e-02 1.33086294e-01 -3.80828045e-02\n", + " -7.03251585e-02 -6.18199483e-02 9.03518647e-02 -1.89310908e-01\n", + " -5.30523732e-02 -2.04426926e-02 -2.27697566e-01 7.68405125e-02\n", + " 1.28568143e-01 1.07449636e-01 -1.98028013e-01 -2.67155319e-01\n", + " -5.17064631e-02 -1.62200809e-01 1.87425911e-01 4.74511087e-02\n", + " -4.24213745e-02 -2.71449953e-01 -2.83543557e-01 -2.36278087e-01\n", + " -4.38764729e-02 1.67618364e-01 -2.51966029e-01 -2.73265123e-01\n", + " -1.68406263e-01 -3.58684808e-01 2.44145632e-01 -2.55741596e-01\n", + " -2.28520826e-01 2.39279866e-01 -1.68833986e-01 1.66422993e-01\n", + " -3.53969544e-01 -1.10907286e-01 -6.29489049e-02 -4.55605611e-02\n", + " 1.46765754e-01 1.95176788e-02 -3.80197394e-04 3.36615089e-03\n", + " -1.42359287e-01 -1.06109239e-01 -3.36164385e-02 3.16832401e-02\n", + " 1.09924652e-01 2.10711379e-02 1.58359021e-01 1.71957895e-01\n", + " 4.08717275e-01 -4.28679548e-02 -6.48310632e-02 1.27063962e-02\n", + " 5.73479272e-02 1.40002951e-01 -3.66613895e-01 8.07148069e-02\n", + " 2.11823225e-01 -1.10516161e-01 -2.01001287e-01 3.22122797e-02\n", + " 5.47345541e-02 2.30176803e-02 -9.94866490e-02 -4.44128877e-03\n", + " 6.64432272e-02 1.28168091e-01 -2.34743133e-01 3.17057431e-01\n", + " -8.75139013e-02 2.66474396e-01 -3.12204093e-01 7.78969377e-03\n", + " 6.21694922e-02 7.64596611e-02 -8.79013091e-02 1.01901866e-01\n", + " 3.23867425e-02 -2.27650225e-01 9.44062844e-02 -5.54776154e-02\n", + " -7.03687780e-03 5.66167049e-02 -1.87480077e-01 9.11692008e-02\n", + " 1.51293352e-01 -1.92774653e-01 2.23165095e-01 2.26982050e-02\n", + " -2.70489484e-01 1.25889871e-02 -2.30410039e-01 
1.40989587e-01\n", + " 2.20341813e-02 2.70313285e-02 6.07572980e-02 8.79322216e-02\n", + " 7.42911696e-02 -2.76499927e-01 2.05189809e-01 -1.84953049e-01\n", + " -1.68468937e-01 1.85525760e-01 -3.32091609e-03 2.29632735e-01\n", + " 7.13749304e-02 -2.75445748e-02 2.74335817e-02 1.65132031e-01\n", + " -1.64373592e-01 -1.14925921e-01 4.98081557e-02 2.10796613e-02\n", + " -2.07561441e-02 5.90056814e-02 -1.25214513e-02 3.78197022e-02\n", + " 3.62618983e-01 1.72744930e-01 -8.75385627e-02 1.52320743e-01\n", + " 1.29331559e-01 -1.34815618e-01 6.12287596e-02 7.30569959e-02\n", + " 5.37401363e-02 -1.46815628e-01 -2.61263877e-01 2.18300954e-01\n", + " 8.95068944e-02 -6.59529120e-02 -8.52308050e-02 2.63195664e-01\n", + " 2.09921718e-01 -1.73417434e-01 2.11869497e-02 7.06950724e-02\n", + " -7.89924189e-02 1.11086138e-01 -1.29149273e-01 1.16233543e-01\n", + " 2.16104537e-01 -3.05427730e-01 -2.46336535e-01 7.59556964e-02]\n", + "[-9.74057533e-04 1.39671087e-03 -2.67836265e-04 3.15364590e-03\n", + " 2.96666636e-04 2.81736051e-04 -2.63743475e-03 1.52303779e-03\n", + " 1.01379235e-03 1.57282199e-03 -1.71113803e-04 8.02559836e-04\n", + " 2.57097790e-03 1.81893981e-03 2.63088616e-03 3.40178027e-04\n", + " -2.11292668e-03 -2.50976160e-03 -1.20709895e-03 1.67239667e-03\n", + " -2.58512655e-03 -1.26207829e-03 1.39700493e-03 1.95603608e-03\n", + " 2.89038429e-03 -2.39552581e-03 -3.91247275e-04 -3.21114226e-03\n", + " 9.58531688e-04 8.29325523e-04 -1.59795280e-03 1.25081465e-03\n", + " 3.81096208e-04 1.59411912e-03 -8.54889571e-04 -3.06331483e-03\n", + " 1.97919217e-04 -9.37395904e-04 2.09570490e-03 1.22304517e-03\n", + " -2.73981970e-03 1.85640890e-03 -8.50516954e-04 2.05107126e-03\n", + " 3.25771095e-03 -2.02651741e-03 -2.99406121e-03 -2.29128683e-03\n", + " 7.10814027e-04 -2.45556026e-03 3.29233892e-03 -1.30950764e-03\n", + " -1.89368729e-03 1.27877074e-03 2.70718103e-03 -2.21936312e-03\n", + " 1.69272022e-03 7.79648602e-04 2.15323060e-03 -1.90569717e-03\n", + " -2.24422058e-03 
-7.12279463e-04 1.38582790e-03 2.27209576e-03\n", + " -2.48074066e-03 -1.57340372e-03 -2.78435787e-03 2.53080134e-03\n", + " 9.29941365e-04 9.57158394e-04 -4.04856197e-04 7.77039502e-04\n", + " 2.93451315e-03 1.50868180e-03 -2.39667180e-03 1.94984837e-03\n", + " -1.30266906e-03 -3.10783624e-03 2.75730062e-03 6.42884581e-04\n", + " -5.66801231e-04 7.95386615e-04 -2.41047610e-03 1.00338063e-03\n", + " 2.82178726e-03 -2.43772753e-03 5.73688361e-04 1.23452744e-03\n", + " -3.01872566e-03 -1.07384368e-03 3.28231254e-03 -5.47548116e-04\n", + " 2.18831864e-03 -3.27980524e-04 1.68147963e-03 2.44990899e-03\n", + " -2.45229807e-03 1.02455064e-03 -2.29938584e-03 2.91304989e-03\n", + " 1.60753564e-03 1.12590473e-03 3.00752558e-03 1.44218525e-03\n", + " 3.16225761e-03 -1.64008932e-04 -5.27421653e-04 1.06547831e-03\n", + " 1.11937604e-03 4.77150286e-04 -2.42268969e-03 -3.12998053e-03\n", + " 4.55578178e-04 2.90129334e-03 -3.05265025e-03 -1.28805637e-03\n", + " -2.08641519e-03 3.26466886e-03 -2.95106089e-03 -2.08173040e-03\n", + " -1.99576933e-03 4.53641405e-04 -2.70907651e-03 -2.34400504e-03\n", + " 1.16086320e-03 -2.44718627e-03 9.25636268e-05 9.73496411e-04\n", + " -1.01899146e-03 -3.02827288e-03 9.58363991e-04 -3.27257067e-03\n", + " 2.40717572e-03 1.20117664e-04 1.10580446e-03 -4.41495577e-05\n", + " -2.85318610e-03 1.54357916e-03 3.11869616e-03 6.44255488e-04\n", + " -1.31027703e-03 -1.52749463e-03 8.79097788e-04 7.01892364e-04\n", + " 1.34046946e-03 -8.91715696e-04 -1.93791394e-03 -4.34041809e-04\n", + " 1.12010317e-03 -2.24535773e-03 -1.76302914e-03 -1.11521804e-03\n", + " -2.68377946e-03 -2.55486579e-03 2.67607206e-03 -2.09635729e-03\n", + " 4.45536774e-04 4.23340796e-04 -2.36946181e-03 1.01201690e-03\n", + " 2.53369007e-03 -8.69231240e-04 -2.23573043e-05 4.58726077e-04\n", + " 9.46683111e-04 -1.58690137e-03 1.31600059e-03 2.19468423e-03\n", + " -1.69886113e-03 1.71214901e-03 -1.43307843e-04 -1.10225752e-03\n", + " 3.13180522e-03 1.78616366e-03 -4.65679186e-05 
-1.40959187e-03\n", + " -6.96717121e-04 -5.70511795e-04 -1.54102559e-03 -2.30318774e-03\n", + " 2.54784338e-03 -1.62216101e-03 3.14650533e-04 -2.94532534e-03\n", + " -1.02099183e-03 2.99499906e-03 -6.38728146e-04 -2.72372481e-03\n", + " 3.22340080e-03 -1.49127806e-03 2.27723271e-03 2.73366761e-03\n", + " 2.62600114e-03 2.68271845e-03 3.20440996e-03 -4.97240224e-04\n", + " -1.02938886e-03 3.26999027e-04 9.46711691e-04 1.76053529e-03\n", + " 1.74157624e-03 1.49760721e-03 -3.09546776e-05 2.48821010e-03\n", + " 2.15774146e-03 2.42709951e-03 -2.46135960e-03 1.82637456e-03\n", + " -3.11999000e-03 -2.49591586e-03 -3.27967643e-03 -1.17016002e-03\n", + " 6.43555308e-04 3.32132494e-03 -2.58475146e-03 -7.75608991e-04\n", + " 3.30572366e-03 6.71840506e-04 -2.23828160e-04 2.99876463e-03\n", + " 3.10293835e-05 -1.25048554e-03 2.48837401e-03 -4.16146126e-04\n", + " -8.01149989e-04 -2.19148802e-04 -2.70171487e-03 1.73141161e-04\n", + " -2.53586681e-03 3.11773620e-03 1.13646187e-04 2.82005151e-03\n", + " -3.23787535e-04 1.52152695e-03 3.21076158e-03 -2.29426223e-04\n", + " -2.22376501e-03 -3.26833175e-03 5.72812569e-04 3.06089874e-03\n", + " 8.33402446e-04 1.29480439e-03 1.32911524e-03 2.61883345e-03\n", + " -2.53178203e-03 6.48000219e-04 2.66361074e-03 -3.05172289e-03\n", + " -9.23413434e-04 -2.13261833e-03 8.54914193e-04 -1.48963137e-03\n", + " -1.95632223e-03 -7.69955339e-04 -3.29735363e-03 1.98830920e-03\n", + " 1.31162966e-03 1.10320176e-03 -3.22533771e-03 2.04978790e-03\n", + " -5.25970478e-04 -1.89223525e-03 2.42309878e-03 8.27315671e-04\n", + " 9.63741913e-04 8.84156208e-04 1.02529768e-03 -1.41616585e-03\n", + " 6.75518531e-04 -6.47147477e-04 2.71809031e-03 2.17319001e-03\n", + " 9.71910951e-04 -2.93364283e-03 2.43404706e-04 1.14709849e-03\n", + " -1.99730392e-04 3.82491737e-04 -3.08531453e-03 -2.20424891e-03\n", + " 2.87708524e-03 1.51069486e-03 9.24036489e-04 -1.09619542e-03\n", + " 1.36686012e-03 -2.61674239e-03 -1.52974128e-04 -2.72300956e-03\n", + " 1.70241436e-03 
-6.61658472e-04 -2.15324806e-03 -2.46914220e-03\n", + " 1.41488796e-03 -3.25874239e-03 -2.29610526e-03 -2.22696620e-03\n", + " -2.09132349e-03 -2.79461709e-03 -3.24834906e-03 -1.12362858e-03]\n", + "shape of item_vectors: 2 (300,)\n", + "shape of token_vectors: 2 55 300\n" + ] + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## get_pretrained_i2v\r\n", + "\r\n", + "### 概述\r\n", + "\r\n", + "使用 EduNLP 项目组给定的预训练模型将给定的题目文本转成向量。\r\n", + "\r\n", + "- 优点:简单方便。\r\n", + "- 缺点:只能使用项目中给定的模型,局限性较大。\r\n" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "### 导入功能块" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 6, + "source": [ + "from EduNLP import get_pretrained_i2v" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "### 输入\r\n", + "\r\n", + "类型:str \r\n", + "内容:题目文本 (text)" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 7, + "source": [ + "items = [\r\n", + "\"如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\SIFChoice$$\\FigureID{1}$\"\r\n", + "]\r\n" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "### 模型选择与使用" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "根据题目所属学科选择预训练模型: \r\n", + "\r\n", + " 预训练模型名称 | 模型训练数据的所属学科 \r\n", + " -------------- | ---------------------- \r\n", + " d2v_all_256 | 全学科 \r\n", + " d2v_sci_256 | 理科 \r\n", + " d2v_eng_256 | 英语 \r\n", + " d2v_lit_256 | 文科 \r\n", + " w2v_eng_300 | 英语 \r\n", + " w2v_lit_300 | 文科 " + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 8, + "source": [ + "i2v = get_pretrained_i2v(\"d2v_sci_256\", model_dir=\"./d2v\")" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "EduNLP, INFO Use pretrained t2v model 
d2v_sci_256\n", + "downloader, INFO http://base.ustc.edu.cn/data/model_zoo/EduNLP/d2v/general_science_256.zip is saved as d2v\\general_science_256.zip\n", + "downloader, INFO file existed, skipped\n" + ] + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "- 注意:\n", + " 默认的 EduNLP 项目存储地址为根目录(`~/.EduNLP`),模型存储地址为项目存储地址下的 `model` 文件夹。您可以通过修改下面的环境变量来修改模型存储地址:\n", + " - EduNLP 项目存储地址:`EDUNLPPATH = xx/xx/xx`\n", + " - 模型存储地址:`EDUNLPMODELPATH = xx/xx/xx`" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 9, + "source": [ + "item_vectors, token_vectors = i2v(items)\r\n", + "print(item_vectors)\r\n", + "print(token_vectors)" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[array([-0.23861311, 0.06892798, -0.27065727, 0.16547263, 0.02818857,\n", + " -0.18185084, 0.09226187, 0.01612627, 0.0921795 , 0.3134312 ,\n", + " 0.09265023, -0.22529641, -0.25788078, 0.06702194, 0.09765045,\n", + " -0.19932257, 0.08527228, -0.22684543, -0.1776405 , -0.03682012,\n", + " 0.6210964 , -0.26637274, 0.08060682, -0.15860714, -0.17825642,\n", + " -0.13271384, 0.27331072, 0.18202724, 0.08430962, 0.23299456,\n", + " 0.179898 , 0.1571772 , -0.1406754 , -0.19508898, -0.11265783,\n", + " 0.11396482, 0.0223774 , 0.07824919, -0.2421433 , 0.06195279,\n", + " -0.04763965, -0.02037446, 0.07481094, -0.1908799 , 0.09688905,\n", + " 0.3995564 , 0.28225863, 0.30547026, -0.46538818, -0.02891348,\n", + " -0.19343005, 0.01966276, -0.21590087, 0.09774096, -0.26137134,\n", + " -0.23963049, 0.34259936, 0.14825426, -0.2987728 , -0.38039675,\n", + " -0.12087625, 0.05897354, 0.06351897, 0.10188989, 0.12092843,\n", + " 0.13229063, 0.12786968, -0.15378596, 0.00724137, -0.13644631,\n", + " -0.15164569, 0.11535735, -0.24394232, -0.08835315, 0.05014084,\n", + " -0.05980775, 0.03040357, -0.05804552, -0.04122322, 0.31905708,\n", + " -0.02468318, 0.06953011, -0.1299219 , 0.01482821, -0.00126122,\n", + " -0.20185567, -0.00784766, 
-0.28023243, -0.16416278, -0.04939609,\n", + " -0.22619021, -0.17099814, 0.1434735 , -0.13193442, -0.18329675,\n", + " -0.06873035, -0.21638975, -0.0767743 , 0.17778671, 0.0459166 ,\n", + " 0.0719557 , 0.0797654 , -0.15445784, -0.20094277, 0.11860424,\n", + " 0.09521067, -0.10993416, -0.01273298, -0.0857757 , -0.05475522,\n", + " -0.09463413, 0.00845256, 0.06638184, -0.22701578, 0.06599791,\n", + " 0.1323833 , 0.2227748 , 0.13431212, -0.08537175, 0.14300612,\n", + " 0.24459998, 0.00735889, -0.07123663, 0.24863936, 0.10320719,\n", + " -0.06399037, 0.0537433 , 0.00862593, -0.10747737, -0.01009098,\n", + " 0.01707896, 0.07951383, -0.2245529 , 0.03152119, 0.19090259,\n", + " 0.27611575, -0.16507478, 0.05977706, 0.09740735, 0.32154247,\n", + " -0.02540598, -0.20875612, 0.11484967, 0.12112009, -0.00937327,\n", + " -0.03855037, -0.03728763, 0.13645649, 0.42706412, 0.14456204,\n", + " -0.1542145 , 0.07858715, 0.14076898, 0.01195827, 0.16896723,\n", + " -0.0516856 , 0.05795754, 0.09602529, 0.02058077, 0.14346235,\n", + " 0.3984762 , 0.06770886, -0.5524451 , -0.18779868, 0.11151859,\n", + " -0.06967582, 0.09465033, 0.2242416 , -0.17179447, 0.20837718,\n", + " 0.43269685, -0.33945957, 0.00746959, -0.14856125, -0.04883511,\n", + " 0.0790235 , 0.18130969, -0.06500382, -0.05761597, 0.15247819,\n", + " 0.22402437, 0.33508143, -0.02544755, 0.10404763, -0.0392291 ,\n", + " 0.14048643, -0.39408255, -0.04759403, -0.09290893, -0.10062248,\n", + " 0.3836949 , -0.04212417, 0.04195033, -0.34143335, 0.02139966,\n", + " 0.00748172, 0.09670173, 0.11287135, 0.07313446, -0.06884305,\n", + " -0.27654266, -0.02745902, 0.11782443, -0.05509072, -0.02731109,\n", + " 0.02932139, 0.20647307, -0.09912065, 0.08175386, 0.04051739,\n", + " -0.13783188, 0.2178767 , 0.01360986, -0.11862064, 0.02632025,\n", + " 0.01305837, -0.07418288, -0.11537156, 0.07784148, -0.02828423,\n", + " 0.0152778 , -0.27535534, -0.26457086, -0.2426946 , 0.17839569,\n", + " 0.41153124, -0.06237097, 0.28373018, 0.09847705, 
-0.2693095 ,\n", + " 0.15109962, 0.02665104, 0.12224031, 0.0053689 , 0.08057593,\n", + " 0.0029663 , -0.01309686, 0.04294159, -0.26014623, -0.09540065,\n", + " -0.19017759, -0.02596658, -0.21918078, -0.04269371, 0.09444954,\n", + " -0.05112423, 0.21732539, 0.2555126 , 0.06598321, -0.00912136,\n", + " 0.01300732, -0.02216252, 0.16752972, 0.00181198, 0.02385568,\n", + " -0.0017939 ], dtype=float32)]\n", + "None\n" + ] + } + ], + "metadata": {} + } + ], + "metadata": { + "orig_nbformat": 4, + "language_info": { + "name": "python", + "version": "3.6.3", + "mimetype": "text/x-python", + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "pygments_lexer": "ipython3", + "nbconvert_exporter": "python", + "file_extension": ".py" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3.6.3 64-bit" + }, + "interpreter": { + "hash": "6f23ddf1f0697a8f0c43dd2435bdb82528077c79e9967f824fba6a3b52b05faf" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file