From 9000793f1ebc28f94499d85110bd7fd81ce29a4b Mon Sep 17 00:00:00 2001 From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com> Date: Tue, 7 Sep 2021 10:28:03 +0800 Subject: [PATCH 1/6] Create parse.rst --- docs/source/tutorial/zh/parse.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/tutorial/zh/parse.rst b/docs/source/tutorial/zh/parse.rst index 3758dcb5..6c1dfe6a 100644 --- a/docs/source/tutorial/zh/parse.rst +++ b/docs/source/tutorial/zh/parse.rst @@ -120,7 +120,7 @@ item为str 或 List[Dict]类型,具体内容为latex 公式 或 公式经解 >>> plt.show() -.. figure:: ../../../_static/formula.png +.. figure:: ../../_static/formula.png 变量标准化 @@ -206,7 +206,7 @@ FormulaGroup [{'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None}, {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base', 'var': 0}, {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, {'id': 3, 'type': 'rel', 'text': '=', 'role': None}, {'id': 4, 'type': 'mathord', 'text': 'y', 'role': None, 'var': 1}] >>> ForestPlotter().export(fs.ast_graph, root_list=[node["val"]["id"] for node in fs.ast if node["structure"]["father"] is None],) -.. figure:: ../../../_static/formulagroup.png +.. figure:: ../../_static/formulagroup.png 文本语法结构解析 From d3e44223596f12c4821a733bfb8d528800856b3f Mon Sep 17 00:00:00 2001 From: fannazya Date: Sat, 23 Oct 2021 20:15:21 +0800 Subject: [PATCH 2/6] Add English version tutorial --- docs/source/tutorial/en/index.rst | 52 +++- docs/source/tutorial/en/parse.rst | 290 ++++++++++++++++++ .../parse/FormulaSyntaxStructureParsing.rst | 168 ++++++++++ .../en/parse/TextSyntaxStructureParsing.rst | 72 +++++ docs/source/tutorial/en/pretrain.rst | 130 ++++++++ docs/source/tutorial/en/pretrain/loading.rst | 11 + docs/source/tutorial/en/pretrain/pub.rst | 74 +++++ docs/source/tutorial/en/pretrain/start.rst | 24 ++ docs/source/tutorial/en/seg.rst | 187 +++++++++++ .../en/seg/SemanticComponentSegmentation.rst | 47 +++ .../seg/StructuralComponentSegmentation.rst | 67 ++++ docs/source/tutorial/en/sif.rst | 145 ++++++++- .../en/tokenization/GensimSegTokenizer.rst | 9 + .../en/tokenization/GensimWordTokenizer.rst | 23 ++ .../en/tokenization/PureTextTokenizer.rst | 31 ++ .../en/tokenization/TextTokenizer.rst | 27 ++ docs/source/tutorial/en/tokenize.rst | 172 +++++++++++ .../en/tokenize/Sentence Segmentation.rst | 3 + .../tutorial/en/tokenize/Tokenization.rst | 29 ++ .../tutorial/en/tokenize/WordSegmentation.rst | 36 +++ docs/source/tutorial/en/vectorization.rst | 157 ++++++++++ .../en/vectorization/WithPre-trainedModel.rst | 42 +++ .../vectorization/WithoutPre-trainedModel.rst | 21 ++ 23 files changed, 1815 insertions(+), 2 deletions(-) create mode 100644 docs/source/tutorial/en/parse.rst create mode 100644 docs/source/tutorial/en/parse/FormulaSyntaxStructureParsing.rst create mode 100644 docs/source/tutorial/en/parse/TextSyntaxStructureParsing.rst create mode 100644 docs/source/tutorial/en/pretrain.rst create mode 100644 docs/source/tutorial/en/pretrain/loading.rst create mode 100644 docs/source/tutorial/en/pretrain/pub.rst create mode 100644 docs/source/tutorial/en/pretrain/start.rst create mode 100644 docs/source/tutorial/en/seg.rst create mode 100644 docs/source/tutorial/en/seg/SemanticComponentSegmentation.rst create mode 100644 docs/source/tutorial/en/seg/StructuralComponentSegmentation.rst create mode 100644 docs/source/tutorial/en/tokenization/GensimSegTokenizer.rst create mode 100644 docs/source/tutorial/en/tokenization/GensimWordTokenizer.rst create mode 100644 
docs/source/tutorial/en/tokenization/PureTextTokenizer.rst
 create mode 100644 docs/source/tutorial/en/tokenization/TextTokenizer.rst
 create mode 100644 docs/source/tutorial/en/tokenize.rst
 create mode 100644 docs/source/tutorial/en/tokenize/Sentence Segmentation.rst
 create mode 100644 docs/source/tutorial/en/tokenize/Tokenization.rst
 create mode 100644 docs/source/tutorial/en/tokenize/WordSegmentation.rst
 create mode 100644 docs/source/tutorial/en/vectorization.rst
 create mode 100644 docs/source/tutorial/en/vectorization/WithPre-trainedModel.rst
 create mode 100644 docs/source/tutorial/en/vectorization/WithoutPre-trainedModel.rst

diff --git a/docs/source/tutorial/en/index.rst b/docs/source/tutorial/en/index.rst
index 108a9487..4c8cc040 100644
--- a/docs/source/tutorial/en/index.rst
+++ b/docs/source/tutorial/en/index.rst
@@ -1,2 +1,52 @@
Get Started
-===========
+===========

* `Standard Item Format `_

* `Syntax Parsing `_

* `Component Segmentation `_

* `Tokenization `_

* `Pre-training `_

* `Vectorization `_

Main process
------------

.. figure:: ../../_static/new_flow.png

* `Syntax Parsing `_ : converts the incoming item into SIF format, i.e. letters and numbers are wrapped in ``$...$``, and the brackets of choice questions and the underlines of blanks are converted to the special symbols defined in SIF.

* `Component Segmentation `_ : segments items in SIF format according to element type, so that the later tokenization module can handle each type with its corresponding method.

* `Tokenization `_ : tokenizes the segmented items, so that the later vectorization module can consume them. In general, text can be tokenized directly; for formulas, the ast method can also be used for parsing (it calls the formula module).

* `Vectorization `_ : mainly calls the I2V class and its subclasses. It vectorizes the list of tokenized items to get the corresponding static vectors. For this module, you can use a model you trained yourself, or directly load one of the provided pre-trained models (via the get_pretrained_i2v module).

* **Downstream Model**: process the obtained vectors to get the desired results.
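Putting these stages together, an end-to-end run looks like the minimal sketch below. The public model name ``d2v_sci_256`` is taken from the pre-training section and the item text from the segmentation section; the exact shape of the returned vectors depends on the chosen model:

::

    from EduNLP.SIF import is_sif, to_sif
    from EduNLP.I2V import get_pretrained_i2v

    item = "若复数$z=1+2 i+i^{3}$,则$|z|=$"
    # syntax parsing: make sure the item is in SIF format
    if not is_sif(item):
        item = to_sif(item)
    # segmentation and tokenization happen inside I2V;
    # the pre-trained model is downloaded on first use
    i2v = get_pretrained_i2v("d2v_sci_256")
    item_vector, token_vector = i2v([item])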
Examples
--------

To help you quickly understand the functions of this project, this section only shows the usage of the common function interfaces. Intermediate modules (such as parse, formula and segment) and more fine-grained interface methods are not shown; for further study, please refer to the relevant documents.

.. nbgallery::
    :caption: This is a thumbnail gallery:
    :name: tokenize_gallery
    :glob:

    Tokenization <../../build/blitz/tokenizer/tokenizer.ipynb>



.. nbgallery::
    :caption: This is a thumbnail gallery:
    :name: vectorization_gallery
    :glob:

    Vectorization <../../build/blitz/vectorization/total_vector.ipynb>

diff --git a/docs/source/tutorial/en/parse.rst b/docs/source/tutorial/en/parse.rst
new file mode 100644
index 00000000..69608c5e
--- /dev/null
+++ b/docs/source/tutorial/en/parse.rst
@@ -0,0 +1,290 @@
Syntax Parsing
==============

In educational resources, texts and formulas have internal implicit or explicit syntax structures. Extracting these structures is of great benefit for further processing:

* Text syntax structure parsing

* Formula syntax structure parsing

The purpose is as follows:

1. Represent the underlines of blanks and the brackets of choices with special identifiers, and wrap letters and formulas with ``$``, so that elements of different types can be cut apart accurately by the ``$`` symbol.
2. Determine whether the current item is legal and report the error type.

Specific processing content
---------------------------

1. Match letters and numbers outside formulas. Only letters and numbers between two Chinese characters are corrected; all other cases are regarded as formulas that do not conform to latex syntax.

2. Match brackets like "( )" (in both English and Chinese format), i.e. brackets that are empty or contain only spaces, and replace them with ``$\\SIFChoice$``.

3. Match consecutive underscores, possibly separated by spaces, and replace them with ``$\\SIFBlank$``.

4. Match latex formulas, check their completeness and analyzability, and report an error for illegal formulas.

Formula syntax structure parsing
--------------------------------

This section is mainly realized by the EduNLP.Formula module, which can determine whether a formula has syntax errors and convert the formula into an abstract syntax tree (ast). In practice, this module is usually part of an intermediate process whose parameters are chosen automatically by the calling model, so it generally does not need special attention.

Introduction of Main Content
++++++++++++++++++++++++++++

1. Formula: checks that the single formula passed in is of str type. If so, it is processed with the ast method; otherwise an error is reported. In addition, the parameter variable_standardization is provided; if it is true, the variable standardization method is applied so that the same variable gets the same variable number.

2. FormulaGroup: if you need to pass in a set of formulas, call this interface to get an ast forest. The trees in the forest have the same structure as those produced by Formula.

Formula
>>>>>>>>>>>>

Formula first segments the formula of the original text as part of the word segmentation function. In addition, a ``Formula parse tree`` function is provided, which can present the abstract syntax tree of a mathematical formula as text or as a picture.

This module also provides formula variable standardization, e.g. determining that the 'x' appearing in several sub-formulas is the same variable.

Call the library
++++++++++++++++

::

    import matplotlib.pyplot as plt
    from EduNLP.Formula import Formula
    from EduNLP.Formula.viz import ForestPlotter

Initialization
++++++++++++++

Incoming parameter: item

The item is either a latex formula (str) or the abstract syntax parse tree generated from a parsed formula (List[Dict]).
+ +:: + + >>> f=Formula("x^2 + x+1 = y") + >>> f + + +View the specific content after formula segmentation +++++++++++++++++++++++++++++ + +- View node elements after formula segmentation + +:: + + >>> f.elements + [{'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None}, + {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base'}, + {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, + {'id': 3, 'type': 'bin', 'text': '+', 'role': None}, + {'id': 4, 'type': 'mathord', 'text': 'x', 'role': None}, + {'id': 5, 'type': 'bin', 'text': '+', 'role': None}, + {'id': 6, 'type': 'textord', 'text': '1', 'role': None}, + {'id': 7, 'type': 'rel', 'text': '=', 'role': None}, + {'id': 8, 'type': 'mathord', 'text': 'y', 'role': None}] + +- View the abstract parse tree of formulas + +:: + + >>> f.ast + [{'val': {'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None}, + 'structure': {'bro': [None, 3],'child': [1, 2],'father': None,'forest': None}}, + {'val': {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base'}, + 'structure': {'bro': [None, 2], 'child': None, 'father': 0, 'forest': None}}, + {'val': {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, + 'structure': {'bro': [1, None], 'child': None, 'father': 0, 'forest': None}}, + {'val': {'id': 3, 'type': 'bin', 'text': '+', 'role': None}, + 'structure': {'bro': [0, 4], 'child': None, 'father': None, 'forest': None}}, + {'val': {'id': 4, 'type': 'mathord', 'text': 'x', 'role': None}, + 'structure': {'bro': [3, 5], 'child': None, 'father': None, 'forest': None}}, + {'val': {'id': 5, 'type': 'bin', 'text': '+', 'role': None}, + 'structure': {'bro': [4, 6], 'child': None, 'father': None, 'forest': None}}, + {'val': {'id': 6, 'type': 'textord', 'text': '1', 'role': None}, + 'structure': {'bro': [5, 7], 'child': None, 'father': None, 'forest': None}}, + {'val': {'id': 7, 'type': 'rel', 'text': '=', 'role': None}, + 'structure': {'bro': [6, 8], 'child': None, 'father': None, 'forest': None}}, + {'val': {'id': 8, 'type': 'mathord', 'text': 'y', 'role': None}, + 'structure': {'bro': [7, None],'child': None,'father': None,'forest': None}}] + + >>> print('nodes: ',f.ast_graph.nodes) + nodes: [0, 1, 2, 3, 4, 5, 6, 7, 8] + >>> print('edges: ' ,f.ast_graph.edges) + edges: [(0, 1), (0, 2)] + +- show the abstract parse tree by a picture + +:: + + >>> ForestPlotter().export(f.ast_graph, root_list=[node["val"]["id"] for node in f.ast if node["structure"]["father"] is None],) + >>> plt.show() + + +.. figure:: ../../_static/formula.png + + +Variable standardization ++++++++++++ + +This parameter makes the same variable have the same variable number. + +For example: the number of variable ``x`` is ``0`` and the number of variable ``y`` is ``1``. + +:: + + >>> f.variable_standardization().elements + [{'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None}, + {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base', 'var': 0}, + {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, + {'id': 3, 'type': 'bin', 'text': '+', 'role': None}, + {'id': 4, 'type': 'mathord', 'text': 'x', 'role': None, 'var': 0}, + {'id': 5, 'type': 'bin', 'text': '+', 'role': None}, + {'id': 6, 'type': 'textord', 'text': '1', 'role': None}, + {'id': 7, 'type': 'rel', 'text': '=', 'role': None}, + {'id': 8, 'type': 'mathord', 'text': 'y', 'role': None, 'var': 1}] + +FormulaGroup +>>>>>>>>>>>>>>> + +Call ``FormulaGroup`` class to parse the equations. The related attributes and functions are the same as those above. 
+ +:: + + import matplotlib.pyplot as plt + from EduNLP.Formula import Formula + from EduNLP.Formula import FormulaGroup + from EduNLP.Formula.viz import ForestPlotter + >>> fs = FormulaGroup(["x^2 = y", "x^3 = y^2", "x + y = \pi"]) + >>> fs + ;;> + >>> fs.elements + [{'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None}, + {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base'}, + {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, + {'id': 3, 'type': 'rel', 'text': '=', 'role': None}, + {'id': 4, 'type': 'mathord', 'text': 'y', 'role': None}, + {'id': 5, 'type': 'supsub', 'text': '\\supsub', 'role': None}, + {'id': 6, 'type': 'mathord', 'text': 'x', 'role': 'base'}, + {'id': 7, 'type': 'textord', 'text': '3', 'role': 'sup'}, + {'id': 8, 'type': 'rel', 'text': '=', 'role': None}, + {'id': 9, 'type': 'supsub', 'text': '\\supsub', 'role': None}, + {'id': 10, 'type': 'mathord', 'text': 'y', 'role': 'base'}, + {'id': 11, 'type': 'textord', 'text': '2', 'role': 'sup'}, + {'id': 12, 'type': 'mathord', 'text': 'x', 'role': None}, + {'id': 13, 'type': 'bin', 'text': '+', 'role': None}, + {'id': 14, 'type': 'mathord', 'text': 'y', 'role': None}, + {'id': 15, 'type': 'rel', 'text': '=', 'role': None}, + {'id': 16, 'type': 'mathord', 'text': '\\pi', 'role': None}] + >>> fs.ast + [{'val': {'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None}, + 'structure': {'bro': [None, 3], + 'child': [1, 2], + 'father': None, + 'forest': None}}, + {'val': {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base'}, + 'structure': {'bro': [None, 2], + 'child': None, + 'father': 0, + 'forest': [6, 12]}}, + {'val': {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, + 'structure': {'bro': [1, None], 'child': None, 'father': 0, 'forest': None}}, + {'val': {'id': 3, 'type': 'rel', 'text': '=', 'role': None}, + 'structure': {'bro': [0, 4], 'child': None, 'father': None, 'forest': None}}, + {'val': {'id': 4, 'type': 'mathord', 'text': 'y', 'role': None}, + 'structure': {'bro': [3, None], + 'child': None, + 'father': None, + 'forest': [10, 14]}}, + {'val': {'id': 5, 'type': 'supsub', 'text': '\\supsub', 'role': None}, + 'structure': {'bro': [None, 8], + 'child': [6, 7], + 'father': None, + 'forest': None}}, + {'val': {'id': 6, 'type': 'mathord', 'text': 'x', 'role': 'base'}, + show more (open the raw output data in a text editor) ... + >>> fs.variable_standardization()[0] + [{'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None}, {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base', 'var': 0}, {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, {'id': 3, 'type': 'rel', 'text': '=', 'role': None}, {'id': 4, 'type': 'mathord', 'text': 'y', 'role': None, 'var': 1}] + >>> ForestPlotter().export(fs.ast_graph, root_list=[node["val"]["id"] for node in fs.ast if node["structure"]["father"] is None],) + +.. figure:: ../../_static/formulagroup.png + + +Text syntax structure parsing +-------------------- + +This section is mainly realized by EduNLP.SIF.Parse module. Its main function is to extract letters and numbers in the text and convert them into standard format. + +This module is mainly used as an *middle module* to parse the input text. Users generally do not call this module directly. + +Introduction of main content ++++++++++++++++ + +1. Judge the type of the incoming text in the following order + +* is_chinese: its function is to match Chinese characters[\u4e00-\u9fa5]. + +* is_alphabet: its function is to match alphabets other than formulas. 
Only letters between two Chinese characters are corrected (wrapped with $$); all other cases are regarded as formulas that do not conform to latex syntax.

* is_number: matches numbers outside formulas. Only numbers between two Chinese characters are corrected; all other cases are regarded as formulas that do not conform to latex syntax.

2. Match latex formulas

* If Chinese characters appear inside latex, a warning is printed only once.

* The _is_formula_legal function checks the completeness and analyzability of each latex formula and reports an error for formulas that do not conform to latex syntax.

Call the library
>>>>>>>>>>>>>>>>

::

    from EduNLP.SIF.Parser import Parser

Input
>>>>>>>

Type: str

Content: question text

::

    >>> text1 = '生产某种零件的A工厂25名工人的日加工零件数_ _'
    >>> text2 = 'X的分布列为( )'
    >>> text3 = '① AB是⊙O的直径,AC是⊙O的切线,BC交⊙O于点E.AC的中点为D'
    >>> text4 = '支持公式如$\\frac{y}{x}$,$\\SIFBlank$,$\\FigureID{1}$,不支持公式如$\\frac{ \\dddot y}{x}$'

Parsing
>>>>>>>>>>>>>>>>>>>>

::

    >>> text_parser1 = Parser(text1)
    >>> text_parser2 = Parser(text2)
    >>> text_parser3 = Parser(text3)
    >>> text_parser4 = Parser(text4)

Related parameter descriptions
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

- Try to convert text to standard format

::

    >>> text_parser1.description_list()
    >>> print('text_parser1.text:',text_parser1.text)
    text_parser1.text: 生产某种零件的$A$工厂$25$名工人的日加工零件数$\SIFBlank$
    >>> text_parser2.description_list()
    >>> print('text_parser2.text:',text_parser2.text)
    text_parser2.text: $X$的分布列为$\SIFChoice$

- Determine if the text has syntax errors

::

    >>> text_parser3.description_list()
    >>> print('text_parser3.error_flag: ',text_parser3.error_flag)
    text_parser3.error_flag: 1
    >>> text_parser4.description_list()
    >>> print('text_parser4.fomula_illegal_flag: ',text_parser4.fomula_illegal_flag)
    text_parser4.fomula_illegal_flag: 1

diff --git a/docs/source/tutorial/en/parse/FormulaSyntaxStructureParsing.rst b/docs/source/tutorial/en/parse/FormulaSyntaxStructureParsing.rst
new file mode 100644
index 00000000..2fc479c5
--- /dev/null
+++ b/docs/source/tutorial/en/parse/FormulaSyntaxStructureParsing.rst
@@ -0,0 +1,168 @@
Formula syntax structure parsing
--------------------------------

This section is mainly realized by the EduNLP.Formula module, which can determine whether a formula has syntax errors and convert the formula into an abstract syntax tree (ast). In practice, this module is usually part of an intermediate process whose parameters are chosen automatically by the calling model, so it generally does not need special attention.

Introduction of Main Content
++++++++++++++++++++++++++++

1. Formula: checks that the single formula passed in is of str type. If so, it is processed with the ast method; otherwise an error is reported. In addition, the parameter variable_standardization is provided; if it is true, the variable standardization method is applied so that the same variable gets the same variable number.

2. FormulaGroup: if you need to pass in a set of formulas, call this interface to get an ast forest. The trees in the forest have the same structure as those produced by Formula.

Formula
>>>>>>>>>>>>

Formula first segments the formula of the original text as part of the word segmentation function. In addition, a ``Formula parse tree`` function is provided, which can present the abstract syntax tree of a mathematical formula as text or as a picture.

This module also provides formula variable standardization, e.g. determining that the 'x' appearing in several sub-formulas is the same variable.
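This page is extracted from the full parsing tutorial, so the examples below assume the imports used there:

::

    import matplotlib.pyplot as plt
    from EduNLP.Formula import Formula, FormulaGroup
    from EduNLP.Formula.viz import ForestPlotter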
Initialization
++++++++++++++

Incoming parameter: item

The item is either a latex formula (str) or the abstract syntax parse tree generated from a parsed formula (List[Dict]).

::

    >>> f = Formula("x^2 + x+1 = y")
    >>> f
    <Formula: x^2 + x+1 = y>


View the specific content after formula segmentation
++++++++++++++++++++++++++++++++++++++++++++++++++++

- View node elements after formula segmentation

::

    >>> f.elements
    [{'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None},
    {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base'},
    {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'},
    {'id': 3, 'type': 'bin', 'text': '+', 'role': None},
    {'id': 4, 'type': 'mathord', 'text': 'x', 'role': None},
    {'id': 5, 'type': 'bin', 'text': '+', 'role': None},
    {'id': 6, 'type': 'textord', 'text': '1', 'role': None},
    {'id': 7, 'type': 'rel', 'text': '=', 'role': None},
    {'id': 8, 'type': 'mathord', 'text': 'y', 'role': None}]

- View the abstract parse tree of formulas

::

    >>> f.ast
    [{'val': {'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None},
    'structure': {'bro': [None, 3],'child': [1, 2],'father': None,'forest': None}},
    {'val': {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base'},
    'structure': {'bro': [None, 2], 'child': None, 'father': 0, 'forest': None}},
    {'val': {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'},
    'structure': {'bro': [1, None], 'child': None, 'father': 0, 'forest': None}},
    {'val': {'id': 3, 'type': 'bin', 'text': '+', 'role': None},
    'structure': {'bro': [0, 4], 'child': None, 'father': None, 'forest': None}},
    {'val': {'id': 4, 'type': 'mathord', 'text': 'x', 'role': None},
    'structure': {'bro': [3, 5], 'child': None, 'father': None, 'forest': None}},
    {'val': {'id': 5, 'type': 'bin', 'text': '+', 'role': None},
    'structure': {'bro': [4, 6], 'child': None, 'father': None, 'forest': None}},
    {'val': {'id': 6, 'type': 'textord', 'text': '1', 'role': None},
    'structure': {'bro': [5, 7], 'child': None, 'father': None, 'forest': None}},
    {'val': {'id': 7, 'type': 'rel', 'text': '=', 'role': None},
    'structure': {'bro': [6, 8], 'child': None, 'father': None, 'forest': None}},
    {'val': {'id': 8, 'type': 'mathord', 'text': 'y', 'role': None},
    'structure': {'bro': [7, None],'child': None,'father': None,'forest': None}}]

    >>> print('nodes: ', f.ast_graph.nodes)
    nodes: [0, 1, 2, 3, 4, 5, 6, 7, 8]
    >>> print('edges: ', f.ast_graph.edges)
    edges: [(0, 1), (0, 2)]

- Show the abstract parse tree as a picture

::

    >>> ForestPlotter().export(f.ast_graph, root_list=[node["val"]["id"] for node in f.ast if node["structure"]["father"] is None],)
    >>> plt.show()

.. figure:: ../../../_static/formula.png

Variable Standardization
++++++++++++++++++++++++

With this option, the same variable always gets the same variable number.

For example: the number of variable ``x`` is ``0`` and the number of variable ``y`` is ``1``.
+ +:: + + >>> f.variable_standardization().elements + [{'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None}, + {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base', 'var': 0}, + {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, + {'id': 3, 'type': 'bin', 'text': '+', 'role': None}, + {'id': 4, 'type': 'mathord', 'text': 'x', 'role': None, 'var': 0}, + {'id': 5, 'type': 'bin', 'text': '+', 'role': None}, + {'id': 6, 'type': 'textord', 'text': '1', 'role': None}, + {'id': 7, 'type': 'rel', 'text': '=', 'role': None}, + {'id': 8, 'type': 'mathord', 'text': 'y', 'role': None, 'var': 1}] + +FormulaGroup +>>>>>>>>>>>>>>> + +Call ``FormulaGroup`` class to parse the equations. The related attributes and functions are the same as those above. + +:: + + >>> fs = FormulaGroup(["x^2 = y", "x^3 = y^2", "x + y = \pi"]) + >>> fs + ;;> + >>> fs.elements + [{'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None}, + {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base'}, + {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, + {'id': 3, 'type': 'rel', 'text': '=', 'role': None}, + {'id': 4, 'type': 'mathord', 'text': 'y', 'role': None}, + {'id': 5, 'type': 'supsub', 'text': '\\supsub', 'role': None}, + {'id': 6, 'type': 'mathord', 'text': 'x', 'role': 'base'}, + {'id': 7, 'type': 'textord', 'text': '3', 'role': 'sup'}, + {'id': 8, 'type': 'rel', 'text': '=', 'role': None}, + {'id': 9, 'type': 'supsub', 'text': '\\supsub', 'role': None}, + {'id': 10, 'type': 'mathord', 'text': 'y', 'role': 'base'}, + {'id': 11, 'type': 'textord', 'text': '2', 'role': 'sup'}, + {'id': 12, 'type': 'mathord', 'text': 'x', 'role': None}, + {'id': 13, 'type': 'bin', 'text': '+', 'role': None}, + {'id': 14, 'type': 'mathord', 'text': 'y', 'role': None}, + {'id': 15, 'type': 'rel', 'text': '=', 'role': None}, + {'id': 16, 'type': 'mathord', 'text': '\\pi', 'role': None}] + >>> fs.ast + [{'val': {'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None}, + 'structure': {'bro': [None, 3], + 'child': [1, 2], + 'father': None, + 'forest': None}}, + {'val': {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base'}, + 'structure': {'bro': [None, 2], + 'child': None, + 'father': 0, + 'forest': [6, 12]}}, + {'val': {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, + 'structure': {'bro': [1, None], 'child': None, 'father': 0, 'forest': None}}, + {'val': {'id': 3, 'type': 'rel', 'text': '=', 'role': None}, + 'structure': {'bro': [0, 4], 'child': None, 'father': None, 'forest': None}}, + {'val': {'id': 4, 'type': 'mathord', 'text': 'y', 'role': None}, + 'structure': {'bro': [3, None], + 'child': None, + 'father': None, + 'forest': [10, 14]}}, + {'val': {'id': 5, 'type': 'supsub', 'text': '\\supsub', 'role': None}, + 'structure': {'bro': [None, 8], + 'child': [6, 7], + 'father': None, + 'forest': None}}, + {'val': {'id': 6, 'type': 'mathord', 'text': 'x', 'role': 'base'}, + show more (open the raw output data in a text editor) ... + >>> fs.variable_standardization()[0] + [{'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None}, {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base', 'var': 0}, {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, {'id': 3, 'type': 'rel', 'text': '=', 'role': None}, {'id': 4, 'type': 'mathord', 'text': 'y', 'role': None, 'var': 1}] + >>> ForestPlotter().export(fs.ast_graph, root_list=[node["val"]["id"] for node in fs.ast if node["structure"]["father"] is None],) + +.. 
.. figure:: ../../../_static/formulagroup.png

diff --git a/docs/source/tutorial/en/parse/TextSyntaxStructureParsing.rst b/docs/source/tutorial/en/parse/TextSyntaxStructureParsing.rst
new file mode 100644
index 00000000..bdfe6848
--- /dev/null
+++ b/docs/source/tutorial/en/parse/TextSyntaxStructureParsing.rst
@@ -0,0 +1,72 @@
Text syntax structure parsing
-----------------------------

This section is mainly realized by the EduNLP.SIF.Parser module. Its main function is to extract the letters and numbers in the text and convert the item into standard format.

This module is mainly used as an *intermediate module* to parse the input text; users generally do not call it directly.

Introduction of Main Content
++++++++++++++++++++++++++++

1. Judge the type of the incoming text in the following order:

* is_chinese: matches Chinese characters [\u4e00-\u9fa5].

* is_alphabet: matches letters outside formulas. Only letters between two Chinese characters are corrected (wrapped with $$); all other cases are regarded as formulas that do not conform to latex syntax.

* is_number: matches numbers outside formulas. Only numbers between two Chinese characters are corrected; all other cases are regarded as formulas that do not conform to latex syntax.

2. Match latex formulas

* If Chinese characters appear inside latex, a warning is printed only once.

* The _is_formula_legal function checks the completeness and analyzability of each latex formula and reports an error for formulas that do not conform to latex syntax.
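This page is extracted from the full parsing tutorial, so the examples below assume the import used there:

::

    from EduNLP.SIF.Parser import Parser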
Input
>>>>>>>

Type: str

Content: question text

::

    >>> text1 = '生产某种零件的A工厂25名工人的日加工零件数_ _'
    >>> text2 = 'X的分布列为( )'
    >>> text3 = '① AB是⊙O的直径,AC是⊙O的切线,BC交⊙O于点E.AC的中点为D'
    >>> text4 = '支持公式如$\\frac{y}{x}$,$\\SIFBlank$,$\\FigureID{1}$,不支持公式如$\\frac{ \\dddot y}{x}$'

Parsing
>>>>>>>>>>>>>>>>>>>>

::

    >>> text_parser1 = Parser(text1)
    >>> text_parser2 = Parser(text2)
    >>> text_parser3 = Parser(text3)
    >>> text_parser4 = Parser(text4)

Related parameter descriptions
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

- Try to convert text to standard format

::

    >>> text_parser1.description_list()
    >>> print('text_parser1.text:',text_parser1.text)
    text_parser1.text: 生产某种零件的$A$工厂$25$名工人的日加工零件数$\SIFBlank$
    >>> text_parser2.description_list()
    >>> print('text_parser2.text:',text_parser2.text)
    text_parser2.text: $X$的分布列为$\SIFChoice$

- Determine if the text has syntax errors

::

    >>> text_parser3.description_list()
    >>> print('text_parser3.error_flag: ',text_parser3.error_flag)
    text_parser3.error_flag: 1
    >>> text_parser4.description_list()
    >>> print('text_parser4.fomula_illegal_flag: ',text_parser4.fomula_illegal_flag)
    text_parser4.fomula_illegal_flag: 1

diff --git a/docs/source/tutorial/en/pretrain.rst b/docs/source/tutorial/en/pretrain.rst
new file mode 100644
index 00000000..9319b87d
--- /dev/null
+++ b/docs/source/tutorial/en/pretrain.rst
@@ -0,0 +1,130 @@
Pre-training
============

In the field of NLP, pre-trained language models have become a very important basic technology.
In this chapter, we will introduce the pre-training tools in EduNLP:

* How to train on a corpus to get a pre-trained model
* How to load a pre-trained model
* Public pre-trained models

Import modules
--------------

::

    from EduNLP.I2V import get_pretrained_i2v
    from EduNLP.Vector import get_pretrained_t2v

Train the Model
---------------

Calling the train_vector function interface directly makes training a model easier. This section relies on the relevant training models in the gensim library. At present, the training methods "sg", "cbow", "fasttext", "d2v", "bow" and "tfidf" are provided. The parameter embedding_dim is also provided so that users can choose the vector dimension according to their needs.

Basic Steps
##################

1. Determine the type of model and select the appropriate tokenizer (GensimWordTokenizer or GensimSegTokenizer) to finish tokenization.

2. Call the train_vector function to get the required pre-trained model.

Examples:

::

    >>> tokenizer = GensimWordTokenizer(symbol="gmas", general=True)
    >>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\
    ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$")
    >>> print(token_item.tokens[:10])
    ['公式', '[FORMULA]', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[FORMULA]']

    # train a 10-dimensional model with the d2v method
    train_vector(sif_items, "../../../data/w2v/gensim_luna_stem_tf_", 10, method="d2v")


Load models
-----------

Pass the obtained model to the I2V module to load it.

Examples:

::

    >>> model_path = "../test_model/test_gensim_luna_stem_tf_d2v_256.bin"
    >>> i2v = D2V("text","d2v",filepath=model_path, pretrained_t2v = False)

Overview of our public models
-----------------------------

Version description
##################

First level version:

* Public version 1 (luna_pub): college entrance examination
* Public version 2 (luna_pub_large): college entrance examination + regional examination

Second level version:

* Minor subjects (Chinese, Math, English, History, Geography, Politics, Biology, Physics, Chemistry)
* Major subjects (science, arts and all subjects)

Third level version [to be finished]:

* Don't use third-party initializers
* Use third-party initializers

Description of training data in models
######################################

* Currently, the data used in the w2v and d2v models comes from senior high school subjects.
* test data: `[OpenLUNA.json] `_

At present, the following models are provided; more models for different subjects and question types are being trained:
    "d2v_all_256" (all subjects), "d2v_sci_256" (science), "d2v_eng_256" (English), "d2v_lit_256" (arts)
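Any of the public models listed above can be loaded by name through get_pretrained_i2v. A minimal sketch (the item text is a placeholder, and the model files are downloaded on first use):

::

    >>> i2v = get_pretrained_i2v("d2v_sci_256")
    >>> item_vector, token_vector = i2v(["若复数$z=1+2 i+i^{3}$,则$|z|=$"])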
Examples of Model Training
--------------------------

Get the dataset
####################

.. toctree::
   :maxdepth: 1
   :titlesonly:

   prepare_dataset <../../build/blitz/pretrain/prepare_dataset.ipynb>

An example of d2v in gensim model
#################################

.. toctree::
   :maxdepth: 1
   :titlesonly:

   d2v_bow_tfidf <../../build/blitz/pretrain/gensim/d2v_bow_tfidf.ipynb>
   d2v_general <../../build/blitz/pretrain/gensim/d2v_general.ipynb>
   d2v_stem_tf <../../build/blitz/pretrain/gensim/d2v_stem_tf.ipynb>

An example of w2v in gensim model
#################################

.. toctree::
   :maxdepth: 1
   :titlesonly:

   w2v_stem_text <../../build/blitz/pretrain/gensim/w2v_stem_text.ipynb>
   w2v_stem_tf <../../build/blitz/pretrain/gensim/w2v_stem_tf.ipynb>

An example of seg_token
#######################

.. toctree::
   :maxdepth: 1
   :titlesonly:

   d2v.ipynb <../../build/blitz/pretrain/seg_token/d2v.ipynb>
   d2v_d1 <../../build/blitz/pretrain/seg_token/d2v_d1.ipynb>
   d2v_d2 <../../build/blitz/pretrain/seg_token/d2v_d2.ipynb>
\ No newline at end of file

diff --git a/docs/source/tutorial/en/pretrain/loading.rst b/docs/source/tutorial/en/pretrain/loading.rst
new file mode 100644
index 00000000..31fa3ea8
--- /dev/null
+++ b/docs/source/tutorial/en/pretrain/loading.rst
@@ -0,0 +1,11 @@
Load models
-----------

Pass the obtained model to the I2V module to load it.

Examples:

::

    >>> model_path = "../test_model/test_gensim_luna_stem_tf_d2v_256.bin"
    >>> i2v = D2V("text","d2v",filepath=model_path, pretrained_t2v = False)

diff --git a/docs/source/tutorial/en/pretrain/pub.rst b/docs/source/tutorial/en/pretrain/pub.rst
new file mode 100644
index 00000000..34407745
--- /dev/null
+++ b/docs/source/tutorial/en/pretrain/pub.rst
@@ -0,0 +1,74 @@
Overview of our public models
-----------------------------


Version Description
##################

First level version:

* Public version 1 (luna_pub): college entrance examination
* Public version 2 (luna_pub_large): college entrance examination + regional examination

Second level version:

* Minor subjects (Chinese, Math, English, History, Geography, Politics, Biology, Physics, Chemistry)
* Major subjects (science, arts and all subjects)

Third level version [to be finished]:

* Don't use third-party initializers
* Use third-party initializers

Description of training data in models
######################################

* Currently, the data used in the w2v and d2v models comes from senior high school subjects.
* test data: `[OpenLUNA.json] `_

At present, the following models are provided; more models for different subjects and question types are being trained:
    "d2v_all_256" (all subjects), "d2v_sci_256" (science), "d2v_eng_256" (English), "d2v_lit_256" (arts)

Examples of model training
--------------------------

Get the dataset
####################

.. toctree::
   :maxdepth: 1
   :titlesonly:

   prepare_dataset <../../../build/blitz/pretrain/prepare_dataset.ipynb>

An example of d2v in gensim model
#################################

.. toctree::
   :maxdepth: 1
   :titlesonly:

   d2v_bow_tfidf <../../../build/blitz/pretrain/gensim/d2v_bow_tfidf.ipynb>
   d2v_general <../../../build/blitz/pretrain/gensim/d2v_general.ipynb>
   d2v_stem_tf <../../../build/blitz/pretrain/gensim/d2v_stem_tf.ipynb>

An example of w2v in gensim model
#################################

.. toctree::
   :maxdepth: 1
   :titlesonly:

   w2v_stem_text <../../../build/blitz/pretrain/gensim/w2v_stem_text.ipynb>
   w2v_stem_tf <../../../build/blitz/pretrain/gensim/w2v_stem_tf.ipynb>

An example of seg_token
#######################

.. toctree::
   :maxdepth: 1
   :titlesonly:

   d2v.ipynb <../../../build/blitz/pretrain/seg_token/d2v.ipynb>
   d2v_d1 <../../../build/blitz/pretrain/seg_token/d2v_d1.ipynb>
   d2v_d2 <../../../build/blitz/pretrain/seg_token/d2v_d2.ipynb>

diff --git a/docs/source/tutorial/en/pretrain/start.rst b/docs/source/tutorial/en/pretrain/start.rst
new file mode 100644
index 00000000..9c5bc241
--- /dev/null
+++ b/docs/source/tutorial/en/pretrain/start.rst
@@ -0,0 +1,24 @@
Train the model
---------------

Calling the train_vector function interface directly makes training a model easier. This section relies on the relevant training models in the gensim library. At present, the training methods "sg", "cbow", "fasttext", "d2v", "bow" and "tfidf" are provided.
The parameter embedding_dim is also provided so that users can choose the vector dimension according to their needs.

Basic Steps
##################

1. Determine the type of model and select the appropriate tokenizer (GensimWordTokenizer or GensimSegTokenizer) to finish tokenization.

2. Call the train_vector function to get the required pre-trained model.

Examples:

::

    >>> tokenizer = GensimWordTokenizer(symbol="gmas", general=True)
    >>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\
    ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$")
    >>> print(token_item.tokens[:10])
    ['公式', '[FORMULA]', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[FORMULA]']

    # train a 10-dimensional model with the d2v method
    train_vector(sif_items, "../../../data/w2v/gensim_luna_stem_tf_", 10, method="d2v")

diff --git a/docs/source/tutorial/en/seg.rst b/docs/source/tutorial/en/seg.rst
new file mode 100644
index 00000000..ad2696a2
--- /dev/null
+++ b/docs/source/tutorial/en/seg.rst
@@ -0,0 +1,187 @@
Component Segmentation
======================

Educational resources are a kind of multimodal data, including text, pictures, formulas and so on.
Semantically, they may also contain different components, such as question stems and options. Therefore, we first need to identify and segment the different components of an educational resource:

* Semantic Component Segmentation
* Structural Component Segmentation

Main Processing Contents
------------------------

1. Convert multiple-choice questions given as a dict into a qualified item by `Syntax parsing `_;

2. Segment and group the input items according to element type.

Semantic Component Segmentation
-------------------------------

Because multiple-choice questions are given in the form of a dict, it is necessary to convert them into text format while retaining their data relationship. This is realized by the dict2str4sif function, which converts multiple-choice question items into string format while marking the question stem and the options.

Import Modules
++++++++++++++

::

    from EduNLP.utils import dict2str4sif

Basic Usage
++++++++++++++++++

::

    >>> item = {
    ...     "stem": r"若复数$z=1+2 i+i^{3}$,则$|z|=$",
    ...     "options": ['0', '1', r'$\sqrt{2}$', '2'],
    ... }
    >>> dict2str4sif(item) # doctest: +ELLIPSIS
    '$\\SIFTag{stem_begin}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem_end}$$\\SIFTag{options_begin}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$$\\SIFTag{list_3}$2$\\SIFTag{options_end}$'

Optional additional parameters / interfaces
+++++++++++++++++++++++++++++++++++++++++++

1. add_list_no_tag: if this parameter is true, the options are numbered with $\SIFTag{list_*}$ labels; if false, the options are only separated by $\SIFSep$.

::

    >>> dict2str4sif(item, add_list_no_tag=True) # doctest: +ELLIPSIS
    '$\\SIFTag{stem_begin}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem_end}$$\\SIFTag{options_begin}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$$\\SIFTag{list_3}$2$\\SIFTag{options_end}$'

    >>> dict2str4sif(item, add_list_no_tag=False) # doctest: +ELLIPSIS
    '$\\SIFTag{stem_begin}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem_end}$$\\SIFTag{options_begin}$0$\\SIFSep$1$\\SIFSep$$\\sqrt{2}$$\\SIFSep$2$\\SIFTag{options_end}$'

2. tag_mode: selects where the tags are placed. 'delimiter' labels both the beginning and the end, 'head' labels only the head, and 'tail' labels only the tail.
+ +:: + + >>> dict2str4sif(item, tag_mode="head") # doctest: +ELLIPSIS + '$\\SIFTag{stem}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{options}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$$\\SIFTag{list_3}$2' + + >>> dict2str4sif(item, tag_mode="tail") # doctest: +ELLIPSIS + '若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$$\\SIFTag{list_3}$2$\\SIFTag{options}$' + +3.key_as_tag: If this parameter is false, this process will only adds $\SIFSep$ between the options without distinguishing the type of segmentation label. + +:: + + >>> dict2str4sif(item, key_as_tag=False) + '若复数$z=1+2 i+i^{3}$,则$|z|=$0$\\SIFSep$1$\\SIFSep$$\\sqrt{2}$$\\SIFSep$2' + +Structural Component Segmentation +------------ + +This step is to segment sliced items. In this step, there is a depth option. You can select all positions or some labels for segmentation according to your needs, such as \SIFSep and \SIFTag. You can also select where to add labels, either at the head and tail or only at the head or tail. + + +There are two modes: + +* linear mode: it is used for text processing (word segmentation using jieba library); + +* ast mode: it is used to parse the formula. + +Basic Segmentation process: + +- Match components with regular expression matching + +- Process the components with special structures, such as converting the base64 encoded picture to numpy form + +- Classify the elements into each element group + +- Enter the corresponding parameters as required to get the filtered results + +Import Modules ++++++++++ + +:: + + from EduNLP.SIF.segment import seg + from EduNLP.SIF import sif4sci + +Basic Usage +++++++++++++++++++ + +:: + + >>> test_item = r"如图所示,则$\bigtriangleup ABC$的面积是$\SIFBlank$。$\FigureID{1}$" + >>> seg(test_item) + >>> ['如图所示,则', '\\bigtriangleup ABC', '的面积是', '\\SIFBlank', '。', \FigureID{1}] + +Optional additional parameters/interfaces +++++++++++++++++++++++ + +1.describe: count the number of elements of different types + +:: + + >>> s.describe() + {'t': 3, 'f': 1, 'g': 1, 'm': 1} + +2.filter: this interface can screen out one or more types of elements. + +Using this interface, you can pass in a "keep" parameter or a special character directly to choose what type of elements to retain. + +Element type represented by symbol: + +- "t": text +- "f": formula +- "g": figure +- "m": question mark +- "a": tag +- "s": sep tag + +:: + + >>> with s.filter("f"): + ... s + ['如图所示,则', '的面积是', '\\SIFBlank', '。', \FigureID{1}] + >>> with s.filter(keep="t"): + ... 
    ['如图所示,则', '的面积是', '。']

3. symbol: this interface converts certain element types into special symbols

Element type represented by symbol:

- "t": text
- "f": formula
- "g": figure
- "m": question mark

::

    >>> seg(test_item, symbol="fgm")
    ['如图所示,则', '[FORMULA]', '的面积是', '[MARK]', '。', '[FIGURE]']
    >>> seg(test_item, symbol="tfgm")
    ['[TEXT]', '[FORMULA]', '[TEXT]', '[MARK]', '[TEXT]', '[FIGURE]']

In addition, the sif4sci function is provided, which conveniently converts an item into the result of structural component segmentation:

::

    >>> segments = sif4sci(item["stem"], figures=figures, tokenization=False)
    >>> segments
    ['如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形', 'ABC', '的斜边', 'BC', ', 直角边', 'AB', ', ', 'AC', '.', '\\bigtriangleup ABC', '的三边所围成的区域记为', 'I', ',黑色部分记为', 'II', ', 其余部分记为', 'III', '.在整个图形中随机取一点,此点取自', 'I,II,III', '的概率分别记为', 'p_1,p_2,p_3', ',则', '\\SIFChoice', \FigureID{1}]

- When calling this function, you can selectively output a certain type of data according to your needs

::

    >>> segments.formula_segments
    ['ABC',
    'BC',
    'AB',
    'AC',
    '\\bigtriangleup ABC',
    'I',
    'II',
    'III',
    'I,II,III',
    'p_1,p_2,p_3']

- Similar to the seg function, sif4sci also provides depth options to help with your research: by modifying the ``symbol`` parameter, different components can be transformed into specific markers.

::

    >>> sif4sci(item["stem"], figures=figures, tokenization=False, symbol="tfgm")
    ['[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[MARK]', '[FIGURE]']

diff --git a/docs/source/tutorial/en/seg/SemanticComponentSegmentation.rst b/docs/source/tutorial/en/seg/SemanticComponentSegmentation.rst
new file mode 100644
index 00000000..3901f4cb
--- /dev/null
+++ b/docs/source/tutorial/en/seg/SemanticComponentSegmentation.rst
@@ -0,0 +1,47 @@
Semantic Component Segmentation
-------------------------------

Because multiple-choice questions are given in the form of a dict, it is necessary to convert them into text format while retaining their data relationship. This is realized by the dict2str4sif function, which converts multiple-choice question items into string format while marking the question stem and the options.
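This page is extracted from the component segmentation tutorial, so the example below assumes the import used there:

::

    from EduNLP.utils import dict2str4sif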
+ +:: + + >>> dict2str4sif(item, add_list_no_tag=True) # doctest: +ELLIPSIS + '$\\SIFTag{stem_begin}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem_end}$$\\SIFTag{options_begin}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$$\\SIFTag{list_3}$2$\\SIFTag{options_end}$' + + >>> dict2str4sif(item, add_list_no_tag=False) # doctest: +ELLIPSIS + '$\\SIFTag{stem_begin}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem_end}$$\\SIFTag{options_begin}$0$\\SIFSep$1$\\SIFSep$$\\sqrt{2}$$\\SIFSep$2$\\SIFTag{options_end}$' + +2.tag_mode: The location for the label can be selected using this parameter. 'delimiter' is to label both the beginning and the end,'head' is to label only the head, and 'tail' is to label only the tail. + +:: + + >>> dict2str4sif(item, tag_mode="head") # doctest: +ELLIPSIS + '$\\SIFTag{stem}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{options}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$$\\SIFTag{list_3}$2' + + >>> dict2str4sif(item, tag_mode="tail") # doctest: +ELLIPSIS + '若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$$\\SIFTag{list_3}$2$\\SIFTag{options}$' + +3.key_as_tag: If this parameter is false, this process will only adds $\SIFSep$ between the options without distinguishing the type of segmentation label. + +:: + + >>> dict2str4sif(item, key_as_tag=False) + '若复数$z=1+2 i+i^{3}$,则$|z|=$0$\\SIFSep$1$\\SIFSep$$\\sqrt{2}$$\\SIFSep$2' \ No newline at end of file diff --git a/docs/source/tutorial/en/seg/StructuralComponentSegmentation.rst b/docs/source/tutorial/en/seg/StructuralComponentSegmentation.rst new file mode 100644 index 00000000..8661c3d6 --- /dev/null +++ b/docs/source/tutorial/en/seg/StructuralComponentSegmentation.rst @@ -0,0 +1,67 @@ +Structural Component Segmentation +------------ + +This step is to segment sliced items. In this step, there is a depth option. You can select all positions or some labels for segmentation according to your needs, such as \SIFSep and \SIFTag. You can also select where to add labels, either at the head and tail or only at the head or tail. + + +There are two modes: + +* linear mode: it is used for text processing (word segmentation using jieba library); + +* ast mode: it is used to parse the formula. + +Basic Usage +++++++++++++++++++ + +:: + + >>> test_item = r"如图所示,则$\bigtriangleup ABC$的面积是$\SIFBlank$。$\FigureID{1}$" + >>> seg(test_item) + >>> ['如图所示,则', '\\bigtriangleup ABC', '的面积是', '\\SIFBlank', '。', \FigureID{1}] + +Optional additional parameters/interfaces +++++++++++++++++++++++ + +1.describe: count the number of elements of different types + +:: + + >>> s.describe() + {'t': 3, 'f': 1, 'g': 1, 'm': 1} + +2.filter: this interface can screen out one or more types of elements. + +Using this interface, you can pass in a "keep" parameter or a special character directly to choose what type of elements to retain. + +Element type represented by symbol: + "t": text + "f": formula + "g": figure + "m": question mark + "a": tag + "s": sep tag + +:: + + >>> with s.filter("f"): + ... s + ['如图所示,则', '的面积是', '\\SIFBlank', '。', \FigureID{1}] + >>> with s.filter(keep="t"): + ... 
Basic Usage
++++++++++++++++++

::

    >>> test_item = r"如图所示,则$\bigtriangleup ABC$的面积是$\SIFBlank$。$\FigureID{1}$"
    >>> s = seg(test_item)
    >>> s
    ['如图所示,则', '\\bigtriangleup ABC', '的面积是', '\\SIFBlank', '。', \FigureID{1}]

Optional additional parameters/interfaces
+++++++++++++++++++++++++++++++++++++++++

1. describe: counts the number of elements of each type

::

    >>> s.describe()
    {'t': 3, 'f': 1, 'g': 1, 'm': 1}

2. filter: this interface filters out one or more types of elements.

You can pass a symbol string directly to remove those element types, or use the "keep" parameter to specify the types to retain.

Element type represented by symbol:

- "t": text
- "f": formula
- "g": figure
- "m": question mark
- "a": tag
- "s": sep tag

::

    >>> with s.filter("f"):
    ...     s
    ['如图所示,则', '的面积是', '\\SIFBlank', '。', \FigureID{1}]
    >>> with s.filter(keep="t"):
    ...     s
    ['如图所示,则', '的面积是', '。']

3. symbol: this interface converts certain element types into special symbols

Element type represented by symbol:

- "t": text
- "f": formula
- "g": figure
- "m": question mark

::

    >>> seg(test_item, symbol="fgm")
    ['如图所示,则', '[FORMULA]', '的面积是', '[MARK]', '。', '[FIGURE]']
    >>> seg(test_item, symbol="tfgm")
    ['[TEXT]', '[FORMULA]', '[TEXT]', '[MARK]', '[TEXT]', '[FIGURE]']

diff --git a/docs/source/tutorial/en/sif.rst b/docs/source/tutorial/en/sif.rst
index 0cbe7cf3..877cb503 100644
--- a/docs/source/tutorial/en/sif.rst
+++ b/docs/source/tutorial/en/sif.rst
@@ -1,2 +1,145 @@
Standard Item Format
====================

version: 0.2

For the convenience of follow-up research and use, we need a unified grammar standard for test questions.

Grammar Rules
-------------

1. Only Chinese characters, Chinese and English punctuation and line breaks are allowed in the question text.

2. Represent the underlines of blanks and the brackets of choices with ``\$\SIFBlank\$`` and ``\$\SIFChoice\$`` respectively.

3. We use ``$\FigureID{ uuid }$`` or Base64 to represent pictures. In particular, ``$\FormFigureID{ uuid }$`` is used to represent formula pictures.

4. Text format description: we represent text in different styles with ``$\textf{item,CHAR_EN}$``. Currently, we have defined some styles: b-bold, i-italic, u-underline, w-wave, d-dotted, t-title. CHAR_EN labels can be mixed and should be sorted alphabetically. For example, $\textf{EduNLP, b}$ renders as **EduNLP**.

5. Other mathematical symbols like English letters, Roman characters and numbers need to be expressed in latex format, that is, embedded in ``$$``.

6. For the entry standard of molecular formulas, please refer to `INCHI `_ for the time being.

7. Currently, there are no requirements for latex internal syntax.

::

    1. Item -> CHARACTER|EN_PUN_LIST|CH_PUN_LIST|FORMULA|QUES_MARK
    2. EN_PUN_LIST -> [',', '.', '?', '!', ':', ';', '\'', '\"', '(', ')', ' ','_','/','|','\\','<','>','[',']','-']
    3. CH_PUN_LIST -> [',', '。', '!', '?', ':',';', '‘', '’', '“', '”', '(', ')', ' ', '、','《','》','—','.']
    4. FORMULA -> $latex formula$ | $\FormFigureID{UUID}$ | $\FormFigureBase64{BASE64}$
    5. FIGURE -> $\FigureID{UUID}$ | $\FigureBase64{BASE64}$
    6. UUID -> [a-zA-Z\-0-9]+
    7. CHARACTER -> CHAR_EN | CHAR_CH
    8. CHAR_EN -> [a-zA-Z]+
    9. CHAR_CH -> [\u4e00-\u9fa5]+
    10. DIGITAL -> [0-9]+
    11. QUES_MARK -> $\SIFBlank$ | $\SIFChoice$


Tips
+++++++++++++++

1. Reserved characters and escape characters.

2. Numbers.

3. Choices and blanks.

4. A single number or letter is also required to be between ``$$`` (automatic verification can already handle this).

5. Try to make sure Chinese is not included in latex formulas, e.g. ``\text{CHAR_CH}``.

6. When importing data using a MySQL database, a single ``\`` is automatically ignored; it needs to be escaped as ``\\``.

Examples
-----------------

Standard Format:

::

    1. 若$x,y$满足约束条件$\\left\\{\\begin{array}{c}2 x+y-2 \\leq 0 \\\\ x-y-1 \\geq 0 \\\\ y+1 \\geq 0\\end{array}\\right.$,则$z=x+7 y$的最大值$\\SIFUnderline$'

    2. 已知函数$f(x)=|3 x+1|-2|x|$画出$y=f(x)$的图像求不等式$f(x)>f(x+1)$的解集$\\PictureID{3bf2ddf4-8af1-11eb-b750-b46bfc50aa29}$$\\PictureID{59b8bd14-8af1-11eb-93a5-b46bfc50aa29}$$\\PictureID{63118b3a-8b75-11eb-a5c0-b46bfc50aa29}$$\\PictureID{6a006179-8b76-11eb-b386-b46bfc50aa29}$$\\PictureID{088f15eb-8b7c-11eb-a86f-b46bfc50aa29}$

Non-standard Format:
1. Letters, numbers and mathematical symbols are mixed:

   For example:

   ``完成下面的2x2列联表,``

   ``(单位:m3)``

   ``则输出的n=``

2. Some special mathematical symbols are not represented by latex formulas:

   For example:

   ``命题中真命题的序号是 ①``

   ``AB是⊙O的直径,AC是⊙O的切线,BC交⊙O于点E.若D为AC的中点``

3. There are unicode-encoded characters in the text:

   For example:

   ``则$a$的取值范围是(\u3000\u3000)``

Functions for judging whether text is in SIF format and converting to SIF format
---------------------------------------------------------------------------------

Call the Library
++++++++++++++++

::

    from EduNLP.SIF import is_sif, to_sif

is_sif
+++++++++++

::

    >>> text1 = '若$x,y$满足约束条件'
    >>> text2 = '$\\left\\{\\begin{array}{c}2 x+y-2 \\leq 0 \\\\ x-y-1 \\geq 0 \\\\ y+1 \\geq 0\\end{array}\\right.$,'
    >>> text3 = '则$z=x+7 y$的最大值$\\SIFUnderline$'
    >>> text4 = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...'
    >>> is_sif(text1)
    True
    >>> is_sif(text2)
    True
    >>> is_sif(text3)
    True
    >>> is_sif(text4)
    False

to_sif
+++++++++++

::

    >>> text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...'
    >>> to_sif(text)
    '某校一个课外学习小组为研究某作物的发芽率$y$和温度$x$(单位...'


Change Log
----------------

2021-05-18

Changed

1. Originally, we used ``\$\SIFUnderline\$`` and ``\$\SIFBracket\$`` to represent the underlines of blanks and the brackets of choices. Now we represent them with ``\$\SIFBlank\$`` and ``\$\SIFChoice\$``.

2. Originally, we used ``$\PictureID{ uuid }$`` to represent pictures, but now we use ``$\FigureID{ uuid }$`` instead. In particular, ``$\FormFigureID{ uuid }$`` is used to represent formula pictures.

2021-06-28

Added:

1. Line breaks are not allowed inside the ``$$`` notation.

2. Added the text format description.

diff --git a/docs/source/tutorial/en/tokenization/GensimSegTokenizer.rst b/docs/source/tutorial/en/tokenization/GensimSegTokenizer.rst
new file mode 100644
index 00000000..eb624e94
--- /dev/null
+++ b/docs/source/tutorial/en/tokenization/GensimSegTokenizer.rst
@@ -0,0 +1,9 @@
GensimSegTokenizer
=====================

By default, the pictures, separators and blanks in the question text, as well as other parts of the incoming item, are converted into special characters, both for data security and for the tokenization of text, formulas and tags. The tokenizer uses the linear analysis method for text and the abstract syntax tree method for formulas.

Compared to GensimWordTokenizer, the main differences are:

* It provides a depth option for the segmentation position, such as \SIFSep and \SIFTag.
* By default, tags are inserted at the head of item components (such as text and formulas).
\ No newline at end of file

diff --git a/docs/source/tutorial/en/tokenization/GensimWordTokenizer.rst b/docs/source/tutorial/en/tokenization/GensimWordTokenizer.rst
new file mode 100644
index 00000000..98d4b10a
--- /dev/null
+++ b/docs/source/tutorial/en/tokenization/GensimWordTokenizer.rst
@@ -0,0 +1,23 @@
GensimWordTokenizer
=====================

By default, the pictures and blanks in the question text, as well as other parts of the incoming item, are converted into special characters, both for data security and for the tokenization of text, formulas, tags and separators. The tokenizer uses the linear analysis method for text and the abstract syntax tree method for formulas; you can choose between them with the ``general`` parameter:

- true: the incoming item conforms to SIF, and the linear analysis method is used.
- false: the incoming item does not conform to SIF, and the abstract syntax tree method is used.
Examples
----------

::

    >>> tokenizer = GensimWordTokenizer(symbol="gmas", general=True)
    >>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\
    ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$")
    >>> print(token_item.tokens[:10])
    ['公式', '[FORMULA]', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[FORMULA]']
    >>> tokenizer = GensimWordTokenizer(symbol="fgmas", general=False)
    >>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\
    ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$")
    >>> print(token_item.tokens[:10])
    ['公式', '[FORMULA]', '如图', '[FIGURE]', '[FORMULA]', '约束条件', '公式', '[FORMULA]', '[SEP]', '[FORMULA]']

diff --git a/docs/source/tutorial/en/tokenization/PureTextTokenizer.rst b/docs/source/tutorial/en/tokenization/PureTextTokenizer.rst
new file mode 100644
index 00000000..8c36e67c
--- /dev/null
+++ b/docs/source/tutorial/en/tokenization/PureTextTokenizer.rst
@@ -0,0 +1,31 @@
PureTextTokenizer
=================

By default, the pictures, tags, separators and blanks in the question text, as well as other parts of the incoming item, are converted into special characters for data security. At the same time, special formulas such as $\\FormFigureID{...}$ and $\\FormFigureBase64{...}$ are filtered out to facilitate the tokenization of text and plain-text formulas. The tokenizer uses the linear analysis method for text and formulas, and the ``key`` parameter is provided for preprocessing the incoming item; it will be improved based on users' requirements in the future.

Examples
----------

::

    >>> tokenizer = PureTextTokenizer()
    >>> items = ["有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\
    ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$"]
    >>> tokens = tokenizer(items)
    >>> next(tokens)[:10]
    ['公式', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[SEP]', 'z']
    >>> items = ["已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$"]
    >>> tokens = tokenizer(items)
    >>> next(tokens)  # doctest: +NORMALIZE_WHITESPACE
    ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<',
    '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',',
    '\\quad', 'A', '\\cap', 'B', '=']
    >>> items = [{
    ...     "stem": "已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$",
    ...     "options": ["1", "2"]
    ... }]
    >>> tokens = tokenizer(items, key=lambda x: x["stem"])
    >>> next(tokens)  # doctest: +NORMALIZE_WHITESPACE
    ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<',
    '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',',
    '\\quad', 'A', '\\cap', 'B', '=']

diff --git a/docs/source/tutorial/en/tokenization/TextTokenizer.rst b/docs/source/tutorial/en/tokenization/TextTokenizer.rst
new file mode 100644
index 00000000..08991be6
--- /dev/null
+++ b/docs/source/tutorial/en/tokenization/TextTokenizer.rst
@@ -0,0 +1,27 @@
TextTokenizer
================

By default, the pictures, tags, separators and blanks in the question text, as well as other parts of the incoming item, are converted into special characters, both for data security and for the tokenization of text and formulas.
Also, the tokenizer applies the linear analysis method to both text and formulas, and the provided ``key`` parameter is used to preprocess the incoming item; it will be improved according to users' requirements in the future.
+
+
+Examples
+----------
+
+::
+
+    >>> items = ["已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$"]
+    >>> tokenizer = TextTokenizer()
+    >>> tokens = tokenizer(items)
+    >>> next(tokens)  # doctest: +NORMALIZE_WHITESPACE
+    ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<',
+    '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',',
+    '\\quad', 'A', '\\cap', 'B', '=']
+    >>> items = [{
+    ...     "stem": "已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$",
+    ...     "options": ["1", "2"]
+    ... }]
+    >>> tokens = tokenizer(items, key=lambda x: x["stem"])
+    >>> next(tokens)  # doctest: +NORMALIZE_WHITESPACE
+    ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<',
+    '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',',
+    '\\quad', 'A', '\\cap', 'B', '=']
diff --git a/docs/source/tutorial/en/tokenize.rst b/docs/source/tutorial/en/tokenize.rst
new file mode 100644
index 00000000..3411b74b
--- /dev/null
+++ b/docs/source/tutorial/en/tokenize.rst
@@ -0,0 +1,172 @@
+Tokenization
+============
+
+Tokenization, which covers both word segmentation and sentence segmentation, is a basic but very important step in NLP.
+In EduNLP, we divide tokenization into several levels according to granularity. To avoid ambiguity, we define them as follows:
+
+* Word/char level: word segmentation
+
+* Sentence level: sentence segmentation
+
+* Resource level: tokenization
+
+This module tokenizes question texts, converting questions into token sequences to facilitate their vectorization. After component segmentation, each element of the sliced item needs word segmentation. In this step there is a depth option: you can segment at all positions or only at certain labels, such as \SIFSep and \SIFTag, and you can choose where labels are added, either at both the head and the tail or only at the head or the tail.
+
+There are two modes: the linear mode, used for text processing (word segmentation with the jieba library), and the ast mode, used to parse formulas.
+
+Word Segmentation
+-----------------
+
+Text-tokenization: a sentence (without formulas) consists of several "words" in order, and the process of dividing it into those words is called text-tokenization. According to the granularity of the "words", it can be subdivided into word-tokenization and char-tokenization.
+
+::
+
+ - Word-tokenization: each phrase is a token.
+
+ - Char-tokenization: each character is a token.
+
+
+Text-tokenization consists of two main steps:
+
+1. Segmentation:
+
+   - Word-tokenization: use a word segmentation tool to segment and extract words from the question text. Our project supports `jieba`.
+
+   - Char-tokenization: process the text character by character.
+
+2. Filtering: filter out the specified stopwords.
+
+   The default stopwords used in this project: `[stopwords] `_
+   You can also use your own stopwords; a small sketch of doing so follows, and the general examples come after it.
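+
+A minimal sketch of applying custom stopwords by filtering the returned tokens (the stopword set here is invented for illustration; the expected output follows from the general example below):
+
+::
+
+    >>> from EduNLP.SIF.tokenization.text import tokenize
+    >>> my_stopwords = {"初等"}   # hypothetical custom stopword list
+    >>> [w for w in tokenize("三角函数是基本初等函数之一", granularity="word") if w not in my_stopwords]
+    ['三角函数', '函数']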
+
+Examples:
+
+::
+
+    >>> from EduNLP.SIF.tokenization.text import tokenize
+    >>> text = "三角函数是基本初等函数之一"
+    >>> tokenize(text, granularity="word")
+    ['三角函数', '初等', '函数']
+
+    >>> tokenize(text, granularity="char")
+    ['三', '角', '函', '数', '基', '初', '函', '数']
+
+Sentence Segmentation
+---------------------
+
+During sentence segmentation, a long document is divided into several sentences. Each sentence is a "token" (to be implemented).
+
+Tokenization
+------------
+Tokenization is the comprehensive analysis: sentences with formulas are segmented into several markers, and each marker is a "token".
+
+This function is implemented by the tokenize function; the required results can be obtained by passing in items after structural component segmentation.
+
+::
+
+    >>> from EduNLP.SIF.segment import seg
+    >>> from EduNLP.SIF.tokenization import tokenize
+    >>> items = "如图所示,则三角形$ABC$的面积是$\\SIFBlank$。$\\FigureID{1}$"
+    >>> tokenize(seg(items))
+    ['如图所示', '三角形', 'ABC', '面积', '\\SIFBlank', \FigureID{1}]
+    >>> tokenize(seg(items), formula_params={"method": "ast"})
+    ['如图所示', '三角形', <Formula: ABC>, '面积', '\\SIFBlank', \FigureID{1}]
+
+
+
+You can view ``./EduNLP/Tokenizer/tokenizer.py`` and ``./EduNLP/Pretrain/gensim_vec.py`` for more tokenizers. We provide several encapsulated tokenizers that are convenient to call. The following is a complete list of tokenizers:
+
+- TextTokenizer
+
+- PureTextTokenizer
+
+- GensimSegTokenizer
+
+- GensimWordTokenizer
+
+
+TextTokenizer
++++++++++++++++++++++
+
+By default, the pictures, labels, separators, and blanks in the question text, as well as other parts of the incoming item, are converted into special characters for data security and for the tokenization of text and formulas. The tokenizer applies the linear analysis method to both text and formulas, and the provided ``key`` parameter is used to preprocess the incoming item; it will be improved according to users' requirements in the future.
+
+::
+
+    >>> items = ["已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$"]
+    >>> tokenizer = TextTokenizer()
+    >>> tokens = tokenizer(items)
+    >>> next(tokens)  # doctest: +NORMALIZE_WHITESPACE
+    ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<',
+    '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',',
+    '\\quad', 'A', '\\cap', 'B', '=']
+    >>> items = [{
+    ...     "stem": "已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$",
+    ...     "options": ["1", "2"]
+    ... }]
+    >>> tokens = tokenizer(items, key=lambda x: x["stem"])
+    >>> next(tokens)  # doctest: +NORMALIZE_WHITESPACE
+    ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<',
+    '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',',
+    '\\quad', 'A', '\\cap', 'B', '=']
+
+PureTextTokenizer
++++++++++++++++++++++
+
+By default, the pictures, labels, separators, and blanks in the question text, as well as other parts of the incoming item, are converted into special characters for data security. At the same time, special formulas such as $\\FormFigureID{...}$ and $\\FormFigureBase64{...}$ are screened out to facilitate the tokenization of text and plain-text formulas. The tokenizer applies the linear analysis method to both text and formulas, and the provided ``key`` parameter is used to preprocess the incoming item; it will be improved according to users' requirements in the future.
+
+::
+
+    >>> tokenizer = PureTextTokenizer()
+    >>> items = ["有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\
+    ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$"]
+    >>> tokens = tokenizer(items)
+    >>> next(tokens)[:10]
+    ['公式', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[SEP]', 'z']
+    >>> items = ["已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$"]
+    >>> tokens = tokenizer(items)
+    >>> next(tokens)  # doctest: +NORMALIZE_WHITESPACE
+    ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<',
+    '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',',
+    '\\quad', 'A', '\\cap', 'B', '=']
+    >>> items = [{
+    ...     "stem": "已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$",
+    ...     "options": ["1", "2"]
+    ... }]
+    >>> tokens = tokenizer(items, key=lambda x: x["stem"])
+    >>> next(tokens)  # doctest: +NORMALIZE_WHITESPACE
+    ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<',
+    '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',',
+    '\\quad', 'A', '\\cap', 'B', '=']
+
+GensimWordTokenizer
++++++++++++++++++++++++
+
+By default, the pictures and blanks in the question text, as well as other parts of the incoming item, are converted into special characters for data security and for the tokenization of text, formulas, labels, and separators. The tokenizer uses the linear analysis method for text and the abstract syntax tree method for formulas. You can choose between the two with the ``general`` parameter:
+
+- true: the incoming item conforms to SIF, and the linear analysis method is used.
+- false: the incoming item does not conform to SIF, and the abstract syntax tree method is used.
+
+GensimSegTokenizer
+++++++++++++++++++++
+
+By default, the pictures, separators, and blanks in the question text, as well as other parts of the incoming item, are converted into special characters for data security and for the tokenization of text, formulas, and labels. The tokenizer uses the linear analysis method for text and the abstract syntax tree method for formulas.
+
+Compared with GensimWordTokenizer, the main differences are:
+
+* It provides a depth option for the segmentation position, such as \SIFSep and \SIFTag.
+* By default, labels are inserted at the head of item components (such as text and formulas).
+
+Examples
+----------
+
+::
+
+    >>> tokenizer = GensimWordTokenizer(symbol="gmas", general=True)
+    >>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\
+    ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$")
+    >>> print(token_item.tokens[:10])
+    ['公式', '[FORMULA]', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[FORMULA]']
+    >>> tokenizer = GensimWordTokenizer(symbol="fgmas", general=False)
+    >>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\
+    ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$")
+    >>> print(token_item.tokens[:10])
+    ['公式', '[FORMULA]', '如图', '[FIGURE]', '[FORMULA]', '约束条件', '公式', '[FORMULA]', '[SEP]', '[FORMULA]']
diff --git a/docs/source/tutorial/en/tokenize/Sentence Segmentation.rst b/docs/source/tutorial/en/tokenize/Sentence Segmentation.rst
new file mode 100644
index 00000000..1a8d4950
--- /dev/null
+++ b/docs/source/tutorial/en/tokenize/Sentence Segmentation.rst
@@ -0,0 +1,3 @@
+Sentence Segmentation
+---------------------
+During sentence segmentation, a long document is divided into several sentences. Each sentence is a "token" (to be implemented).
diff --git a/docs/source/tutorial/en/tokenize/Tokenization.rst b/docs/source/tutorial/en/tokenize/Tokenization.rst
new file mode 100644
index 00000000..fad25912
--- /dev/null
+++ b/docs/source/tutorial/en/tokenize/Tokenization.rst
@@ -0,0 +1,29 @@
+Tokenization
+------------
+Tokenization is the comprehensive analysis: sentences with formulas are segmented into several markers, and each marker is a "token".
+We provide several encapsulated tokenizers that are convenient to call. The following is a complete list of tokenizers.
+
+Examples:
+
+::
+
+    >>> items = ["已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$"]
+    >>> tokenizer = TextTokenizer()
+    >>> tokens = tokenizer(items)
+    >>> next(tokens)  # doctest: +NORMALIZE_WHITESPACE
+    ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<',
+    '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',',
+    '\\quad', 'A', '\\cap', 'B', '=']
+
+
+
+You can view ``./EduNLP/Tokenizer/tokenizer.py`` and ``./EduNLP/Pretrain/gensim_vec.py`` for more tokenizers. The following is a complete list of tokenizers:
+
+.. toctree::
+   :maxdepth: 1
+   :titlesonly:
+
+   ../tokenization/TextTokenizer
+   ../tokenization/PureTextTokenizer
+   ../tokenization/GensimSegTokenizer
+   ../tokenization/GensimWordTokenizer
diff --git a/docs/source/tutorial/en/tokenize/WordSegmentation.rst b/docs/source/tutorial/en/tokenize/WordSegmentation.rst
new file mode 100644
index 00000000..a85f4dae
--- /dev/null
+++ b/docs/source/tutorial/en/tokenize/WordSegmentation.rst
@@ -0,0 +1,36 @@
+Word segmentation
+-----------------
+
+Text-tokenization: a sentence (without formulas) consists of several "words" in order, and the process of dividing it into those words is called text-tokenization. According to the granularity of the "words", it can be subdivided into word-tokenization and char-tokenization.
+
+::
+
+ - Word-tokenization: each phrase is a token.
+
+ - Char-tokenization: each character is a token.
+
+
+Text-tokenization consists of two main steps:
+
+1. Segmentation:
+
+   - Word-tokenization: use a word segmentation tool to segment and extract words from the question text. Our project supports `jieba`.
+
+   - Char-tokenization: process the text character by character.
+
+2. Filtering: filter out the specified stopwords.
+
+   The default stopwords used in this project: `[stopwords] `_
+   You can also use your own stopwords; a small sketch of loading them from a file follows, and the general examples come after it.
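+
+If your stopwords are kept in a file, a minimal sketch (the filename is hypothetical) is to load them into a set and filter the output:
+
+::
+
+    >>> from EduNLP.SIF.tokenization.text import tokenize
+    >>> with open("my_stopwords.txt", encoding="utf-8") as f:   # hypothetical file, one word per line
+    ...     my_stopwords = set(f.read().split())
+    >>> [w for w in tokenize("三角函数是基本初等函数之一", granularity="word") if w not in my_stopwords]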
+
+Examples:
+
+::
+
+    >>> from EduNLP.SIF.tokenization.text import tokenize
+    >>> text = "三角函数是基本初等函数之一"
+    >>> tokenize(text, granularity="word")
+    ['三角函数', '初等', '函数']
+
+    >>> tokenize(text, granularity="char")
+    ['三', '角', '函', '数', '基', '初', '函', '数']
+
diff --git a/docs/source/tutorial/en/vectorization.rst b/docs/source/tutorial/en/vectorization.rst
new file mode 100644
index 00000000..5b744eeb
--- /dev/null
+++ b/docs/source/tutorial/en/vectorization.rst
@@ -0,0 +1,157 @@
+Vectorization
+=============
+
+This section provides a simple interface that converts incoming items directly into vectors. You can choose whether or not to use a pre-trained model according to your needs: if you don't want to use one, call D2V directly; if you do, call the get_pretrained_i2v function.
+
+- Without a pre-trained model
+
+- With a pre-trained model
+
+Overview Flow
+---------------------------
+
+1. Perform `syntax parsing `_ on the incoming items to get items in SIF format;
+
+2. Perform `component segmentation `_ on the SIF items;
+
+3. Perform `tokenization `_ on the segmented items;
+
+4. Use your own trained model or one of the pre-trained models we provide to convert the tokenized items into vectors.
+
+
+Don't use a pre-trained model: call existing models directly
+------------------------------------------------------------
+
+You can use any model trained by yourself (just give the storage path of the model) to convert the given question text into vectors.
+
+* Advantage: it is flexible; you can use your own model and adjust its parameters freely.
+
+Import modules
+++++++++++++++
+
+::
+
+    from EduNLP.I2V import D2V, W2V, get_pretrained_i2v
+    from EduNLP.Vector import T2V, get_pretrained_t2v
+
+Models provided
+++++++++++++++++++++
+
+- W2V
+
+- D2V
+
+- T2V
+
+W2V
+<<<<<<<<<
+
+This model directly uses the relevant methods of the gensim library to convert words into vectors. Currently, the following methods are supported:
+
+ - FastText
+
+ - Word2Vec
+
+ - KeyedVectors
+
+::
+
+    >>> i2v = get_pretrained_i2v("test_w2v", "examples/test_model/data/w2v") # doctest: +ELLIPSIS
+    >>> item_vector, token_vector = i2v(["有学者认为:‘学习’,必须适应实际"])
+    >>> item_vector # doctest: +ELLIPSIS
+    array([[...]], dtype=float32)
+
+D2V
+<<<<<<<<<<<<
+
+This model is a comprehensive processing method that converts whole items into vectors. Currently, the following methods are provided:
+
+- d2v: calls the doc2vec module of the gensim library to convert items into vectors.
+
+- BowLoader: calls the corpora module of the gensim library to convert documents into bag-of-words representations.
+
+- TfidfLoader: calls the TfidfModel module of the gensim library to convert documents into TF-IDF representations.
+
+::
+
+    >>> item = {"如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\SIFChoice$$\FigureID{1}$"}
+    >>> model_path = "../test_model/test_gensim_luna_stem_tf_d2v_256.bin"
+    >>> i2v = D2V("text", "d2v", filepath=model_path, pretrained_t2v=False)
+    >>> i2v(item)
+    ([array([ 4.76559885e-02, -1.60574958e-01,  1.94614579e-03,  2.40295693e-01,
+        2.24517003e-01, -3.24351490e-02,  4.35789041e-02, -1.65670961e-02,...
+
+T2V
+<<<<<<<<<<
+
+You can use any model trained by yourself (just give its storage path) to represent the token sequences of a group of questions as vectors.
+
+- Advantage: the model and its parameters can be adjusted independently, which makes it very flexible.
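+
+Before the input format is described below, here is an end-to-end sketch that produces the token sequence with a tokenizer instead of writing it by hand (the model path is a placeholder, and importing ``GensimWordTokenizer`` from ``EduNLP.Pretrain`` is an assumption):
+
+::
+
+    from EduNLP.Pretrain import GensimWordTokenizer
+    from EduNLP.Vector import T2V
+
+    # tokenize a raw item first, then vectorize the resulting token sequence
+    tokenizer = GensimWordTokenizer(symbol="gmas", general=True)
+    token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,若$x,y$满足约束条件,则$z=x+7 y$的最大值为$\\SIFBlank$")
+    t2v = T2V('d2v', filepath="../test_model/test_gensim_luna_stem_tf_d2v_256.bin")
+    vectors = t2v(token_item.tokens)   # same calling pattern as the example below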
+
+Input
+^^^^^^^^^^
+
+Type: list
+
+Content: the token sequences of the questions in one question group.
+
+You can convert question text (``str`` type) into tokens with the ``GensimWordTokenizer`` model.
+
+::
+
+    >>> token_items = ['公式','[FORMULA]','公式','[FORMULA]','如图','[FIGURE]','x',',','y','约束条件','[SEP]','z','=','x','+','7','y','最大值','[MARK]']
+    >>> path = "../test_model/test_gensim_luna_stem_tf_d2v_256.bin"
+    >>> t2v = T2V('d2v', filepath=path)
+    >>> t2v(token_items)
+    [array([ 0.0256574 ,  0.06061139, -0.00121044, -0.0167674 , -0.0111706 ,
+        0.05325712, -0.02097339, -0.01613594,  0.02904145,  0.0185046 ,...
+
+Processing steps
+++++++++++++++++++++
+
+1. Call the get_tokenizer function to obtain the tokenized result;
+
+2. Vectorize the result with the selected model.
+
+
+Use a pre-trained model: call get_pretrained_i2v directly
+---------------------------------------------------------
+
+Use a pre-trained model provided by EduNLP to convert the given question text into vectors.
+
+* Advantage: simple and convenient.
+
+* Disadvantage: only the models given in the project can be used, which is quite limiting.
+
+* Call this function to obtain the corresponding pre-trained model. At present, the following pre-trained models are provided: d2v_all_256, d2v_sci_256, d2v_eng_256 and d2v_lit_256.
+
+Selection and Use of Models
+###########################
+
+Select the pre-trained model according to the subject:
+
++------------------------+--------------------------------+
+| Pre-trained model name | Subject of model training data |
++========================+================================+
+| d2v_all_256            | All subjects                   |
++------------------------+--------------------------------+
+| d2v_sci_256            | Science                        |
++------------------------+--------------------------------+
+| d2v_lit_256            | Arts                           |
++------------------------+--------------------------------+
+| d2v_eng_256            | English                        |
++------------------------+--------------------------------+
+
+
+Processing steps
+#################
+
+1. Download the corresponding pre-trained model;
+
+2. Pass the obtained model to D2V and process the items with it.
+
+Examples:
+
+::
+
+    >>> i2v = get_pretrained_i2v("d2v_sci_256")
+    >>> i2v(item)
diff --git a/docs/source/tutorial/en/vectorization/WithPre-trainedModel.rst b/docs/source/tutorial/en/vectorization/WithPre-trainedModel.rst
new file mode 100644
index 00000000..41dcab64
--- /dev/null
+++ b/docs/source/tutorial/en/vectorization/WithPre-trainedModel.rst
@@ -0,0 +1,42 @@
+Use a pre-trained model: call get_pretrained_i2v directly
+---------------------------------------------------------
+
+Use a pre-trained model provided by EduNLP to convert the given question text into vectors.
+
+* Advantage: simple and convenient.
+
+* Disadvantage: only the models given in the project can be used, which is quite limiting.
+
+* Call this function to obtain the corresponding pre-trained model. At present, the following pre-trained models are provided: d2v_all_256, d2v_sci_256, d2v_eng_256 and d2v_lit_256.
+
+Selection and Use of Models
+###########################
+
+Select the pre-trained model according to the subject:
+
++------------------------+--------------------------------+
+| Pre-trained model name | Subject of model training data |
++========================+================================+
+| d2v_all_256            | All subjects                   |
++------------------------+--------------------------------+
+| d2v_sci_256            | Science                        |
++------------------------+--------------------------------+
+| d2v_lit_256            | Arts                           |
++------------------------+--------------------------------+
+| d2v_eng_256            | English                        |
++------------------------+--------------------------------+
+
+Processing steps
+#################
+
+1. Download the corresponding pre-trained model;
+
+2. Pass the obtained model to D2V and process the items with it.
+
+Examples:
+
+::
+
+    >>> i2v = get_pretrained_i2v("d2v_sci_256")
+    >>> i2v(item)
diff --git a/docs/source/tutorial/en/vectorization/WithoutPre-trainedModel.rst b/docs/source/tutorial/en/vectorization/WithoutPre-trainedModel.rst
new file mode 100644
index 00000000..2989f8ba
--- /dev/null
+++ b/docs/source/tutorial/en/vectorization/WithoutPre-trainedModel.rst
@@ -0,0 +1,21 @@
+Don't use a pre-trained model: call existing models directly
+------------------------------------------------------------
+
+You can use any model trained by yourself (just give the storage path of the model) to convert the given question text into vectors.
+
+* Advantage: it is flexible; you can use your own model and adjust its parameters freely.
+
+Processing steps
+++++++++++++++++++++
+
+1. Call the get_tokenizer function to obtain the tokenized result;
+
+2. Vectorize the result with the selected model.
+
+Examples:
+
+::
+
+    >>> model_path = "../test_model/test_gensim_luna_stem_tf_d2v_256.bin"
+    >>> i2v = D2V("text", "d2v", filepath=model_path, pretrained_t2v=False)
+    >>> i2v(item)

From c84f5ac96d29b4f2c444336634785f27163d4d6f Mon Sep 17 00:00:00 2001
From: fannazya 
Date: Sat, 23 Oct 2021 20:24:15 +0800
Subject: [PATCH 3/6] Revert "Add English version tutorial"

This reverts commit d3e44223596f12c4821a733bfb8d528800856b3f.
--- docs/source/tutorial/en/index.rst | 52 +--- docs/source/tutorial/en/parse.rst | 290 ------------------ .../parse/FormulaSyntaxStructureParsing.rst | 168 ---------- .../en/parse/TextSyntaxStructureParsing.rst | 72 ----- docs/source/tutorial/en/pretrain.rst | 130 -------- docs/source/tutorial/en/pretrain/loading.rst | 11 - docs/source/tutorial/en/pretrain/pub.rst | 74 ----- docs/source/tutorial/en/pretrain/start.rst | 24 -- docs/source/tutorial/en/seg.rst | 187 ----------- .../en/seg/SemanticComponentSegmentation.rst | 47 --- .../seg/StructuralComponentSegmentation.rst | 67 ---- docs/source/tutorial/en/sif.rst | 145 +-------- .../en/tokenization/GensimSegTokenizer.rst | 9 - .../en/tokenization/GensimWordTokenizer.rst | 23 -- .../en/tokenization/PureTextTokenizer.rst | 31 -- .../en/tokenization/TextTokenizer.rst | 27 -- docs/source/tutorial/en/tokenize.rst | 172 ----------- .../en/tokenize/Sentence Segmentation.rst | 3 - .../tutorial/en/tokenize/Tokenization.rst | 29 -- .../tutorial/en/tokenize/WordSegmentation.rst | 36 --- docs/source/tutorial/en/vectorization.rst | 157 ---------- .../en/vectorization/WithPre-trainedModel.rst | 42 --- .../vectorization/WithoutPre-trainedModel.rst | 21 -- 23 files changed, 2 insertions(+), 1815 deletions(-) delete mode 100644 docs/source/tutorial/en/parse.rst delete mode 100644 docs/source/tutorial/en/parse/FormulaSyntaxStructureParsing.rst delete mode 100644 docs/source/tutorial/en/parse/TextSyntaxStructureParsing.rst delete mode 100644 docs/source/tutorial/en/pretrain.rst delete mode 100644 docs/source/tutorial/en/pretrain/loading.rst delete mode 100644 docs/source/tutorial/en/pretrain/pub.rst delete mode 100644 docs/source/tutorial/en/pretrain/start.rst delete mode 100644 docs/source/tutorial/en/seg.rst delete mode 100644 docs/source/tutorial/en/seg/SemanticComponentSegmentation.rst delete mode 100644 docs/source/tutorial/en/seg/StructuralComponentSegmentation.rst delete mode 100644 docs/source/tutorial/en/tokenization/GensimSegTokenizer.rst delete mode 100644 docs/source/tutorial/en/tokenization/GensimWordTokenizer.rst delete mode 100644 docs/source/tutorial/en/tokenization/PureTextTokenizer.rst delete mode 100644 docs/source/tutorial/en/tokenization/TextTokenizer.rst delete mode 100644 docs/source/tutorial/en/tokenize.rst delete mode 100644 docs/source/tutorial/en/tokenize/Sentence Segmentation.rst delete mode 100644 docs/source/tutorial/en/tokenize/Tokenization.rst delete mode 100644 docs/source/tutorial/en/tokenize/WordSegmentation.rst delete mode 100644 docs/source/tutorial/en/vectorization.rst delete mode 100644 docs/source/tutorial/en/vectorization/WithPre-trainedModel.rst delete mode 100644 docs/source/tutorial/en/vectorization/WithoutPre-trainedModel.rst diff --git a/docs/source/tutorial/en/index.rst b/docs/source/tutorial/en/index.rst index 4c8cc040..108a9487 100644 --- a/docs/source/tutorial/en/index.rst +++ b/docs/source/tutorial/en/index.rst @@ -1,52 +1,2 @@ Get Started -===== - -* `Standard Item Format `_ - -* `Syntax Parsing `_ - -* `Component Segmentation `_ - -* `Tokenization `_ - -* `Pre-training `_ - -* `Vectorization `_ - -Main process ----------- - -.. 
figure:: ../../_static/new_flow.png - -* `Syntax Parsing `_ : Its function is to convert the incoming item into SIF format, which means letters and numbers should be between ``$...$`` and the brackets and underlines of the choice questions should be converted to special symbols we defined in SIF) - -* `Component Segmentation `_ : Its function is to segment items in SIF format according to the types of items, so as to serve the later tokenization module.(that is, elements in different types can be tokenized using their corresponding methods)。 - -* `Tokenization `_: Its function is to tokenize segmented items, so as to serve the later tokenization module. - Generally, the tokenization method in the text form can be used directly. For formulas, the ast method can also be used for parsing(call the formula module); - -* `Vectorization `_: This part mainly calls I2V class and its subclasses. Its function is to vectorize the list of tokenized items, so as to get the corresponding static vectors. - For vectorization module, You can call your own trained model or directly call the provided pre-training model(call get_ pretrained_ I2V module). - -* **Downstream Model**:Process the obtained vectors to get the desired results。 - -Examples --------- - -To help you quickly understand the functions of this project, this section only shows the usages of common function interface. Intermediate function modules (such as parse, formula, segment, etc.) and more subdivided interface methods are not shown. For further study, please refer to relevant documents. - -.. nbgallery:: - :caption: This is a thumbnail gallery: - :name: tokenize_gallery - :glob: - - Tokenization <../../build/blitz/tokenizer/tokenizer.ipynb> - - - -.. nbgallery:: - :caption: This is a thumbnail gallery: - :name: vectorization_gallery - :glob: - - Vectorization <../../build/blitz/vectorization/total_vector.ipynb> +=========== diff --git a/docs/source/tutorial/en/parse.rst b/docs/source/tutorial/en/parse.rst deleted file mode 100644 index 69608c5e..00000000 --- a/docs/source/tutorial/en/parse.rst +++ /dev/null @@ -1,290 +0,0 @@ -Syntax Parsing -========= - -In educational resources, texts and formulas have internal implicit or explicit syntax structures. It is of great benefit for further processing to extract these structures. -* Text syntax structure parsing - -* Formula syntax structure parsing - -The purpose is as follows: - - -1. Represent underlines of blanks and brackets of choices with special identifiers. And the alphabets and formulas should be wrapped with $$, so that items of different types can be cut accurately through the symbol $. -2. Determine whether the current item is legal and report the error type. - -Specific processing content --------------------- - -1.Its function is to match alphabets and numbers other than formulas. Only the alphabets and numbers between two Chinese characters will be corrected, and the rest of the cases are regarded as formulas that do not conform to latex syntax. - -2.Match brackets like "( )" (both English format and Chinese format), that is, brackets with no content or spaces, which should be replaced with ``$\\SIFChoice$`` - -3.Match continuous underscores or underscores with spaces to replace them with ``$\\SIFBlank$``. - -4.Match latex formulas,check the completeness and analyzability of latex formulas, and report an error for illegal formula. 
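-
-As a quick check of the corrections above, ``to_sif`` applies them automatically. A minimal sketch (the expected outputs are inferred from the Parser results shown later in this section, so treat them as illustrative):
-
-::
-
-    >>> from EduNLP.SIF import to_sif
-    >>> to_sif('X的分布列为( )')
-    '$X$的分布列为$\\SIFChoice$'
-    >>> to_sif('生产某种零件的A工厂25名工人的日加工零件数_ _')
-    '生产某种零件的$A$工厂$25$名工人的日加工零件数$\\SIFBlank$'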
-
-Formula syntax structure parsing
---------------------------------
-
-This section is mainly realized by the EduNLP.Formula module, which can determine whether a formula has syntax errors and convert a parsed formula into an abstract syntax tree (AST). In practice, this module is usually used as part of an intermediate process, and its parameters can be chosen automatically by the calling model, so it generally does not need special attention.
-
-Introduction of Main Content
-++++++++++++++++++++++++++++
-
-1. Formula: determines whether the single formula passed in is of str form. If so, the ast method is used for processing; otherwise an error is reported. In addition, the parameter variable_standardization is provided; if it is true, the variable standardization method is used to ensure that the same variable has the same variable number.
-
-2. FormulaGroup: if you need to pass in a set of formulas, call this interface to get an AST forest. The tree structures in the forest are the same as those of Formula.
-
-Formula
->>>>>>>>>>>>
-
-Formula first segments the formula of the original text in the word segmentation function. In addition, the ``Formula parse tree`` function is provided, which can represent the abstract syntax tree of a mathematical formula in the form of text or a picture.
-
-This module also provides formula variable standardization, such as determining that 'x' in several sub-formulas is the same variable.
-
-Call the library
-++++++++++++++++
-
-::
-
-   import matplotlib.pyplot as plt
-   from EduNLP.Formula import Formula
-   from EduNLP.Formula.viz import ForestPlotter
-
-Initialization
-++++++++++++++
-
-Incoming parameter: item
-
-item is a LaTeX formula, or the abstract syntax parse tree generated from a parsed formula; its type is str or List[Dict].
- -:: - - >>> f=Formula("x^2 + x+1 = y") - >>> f - - -View the specific content after formula segmentation -++++++++++++++++++++++++++++ - -- View node elements after formula segmentation - -:: - - >>> f.elements - [{'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None}, - {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base'}, - {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, - {'id': 3, 'type': 'bin', 'text': '+', 'role': None}, - {'id': 4, 'type': 'mathord', 'text': 'x', 'role': None}, - {'id': 5, 'type': 'bin', 'text': '+', 'role': None}, - {'id': 6, 'type': 'textord', 'text': '1', 'role': None}, - {'id': 7, 'type': 'rel', 'text': '=', 'role': None}, - {'id': 8, 'type': 'mathord', 'text': 'y', 'role': None}] - -- View the abstract parse tree of formulas - -:: - - >>> f.ast - [{'val': {'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None}, - 'structure': {'bro': [None, 3],'child': [1, 2],'father': None,'forest': None}}, - {'val': {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base'}, - 'structure': {'bro': [None, 2], 'child': None, 'father': 0, 'forest': None}}, - {'val': {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, - 'structure': {'bro': [1, None], 'child': None, 'father': 0, 'forest': None}}, - {'val': {'id': 3, 'type': 'bin', 'text': '+', 'role': None}, - 'structure': {'bro': [0, 4], 'child': None, 'father': None, 'forest': None}}, - {'val': {'id': 4, 'type': 'mathord', 'text': 'x', 'role': None}, - 'structure': {'bro': [3, 5], 'child': None, 'father': None, 'forest': None}}, - {'val': {'id': 5, 'type': 'bin', 'text': '+', 'role': None}, - 'structure': {'bro': [4, 6], 'child': None, 'father': None, 'forest': None}}, - {'val': {'id': 6, 'type': 'textord', 'text': '1', 'role': None}, - 'structure': {'bro': [5, 7], 'child': None, 'father': None, 'forest': None}}, - {'val': {'id': 7, 'type': 'rel', 'text': '=', 'role': None}, - 'structure': {'bro': [6, 8], 'child': None, 'father': None, 'forest': None}}, - {'val': {'id': 8, 'type': 'mathord', 'text': 'y', 'role': None}, - 'structure': {'bro': [7, None],'child': None,'father': None,'forest': None}}] - - >>> print('nodes: ',f.ast_graph.nodes) - nodes: [0, 1, 2, 3, 4, 5, 6, 7, 8] - >>> print('edges: ' ,f.ast_graph.edges) - edges: [(0, 1), (0, 2)] - -- show the abstract parse tree by a picture - -:: - - >>> ForestPlotter().export(f.ast_graph, root_list=[node["val"]["id"] for node in f.ast if node["structure"]["father"] is None],) - >>> plt.show() - - -.. figure:: ../../_static/formula.png - - -Variable standardization -+++++++++++ - -This parameter makes the same variable have the same variable number. - -For example: the number of variable ``x`` is ``0`` and the number of variable ``y`` is ``1``. - -:: - - >>> f.variable_standardization().elements - [{'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None}, - {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base', 'var': 0}, - {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, - {'id': 3, 'type': 'bin', 'text': '+', 'role': None}, - {'id': 4, 'type': 'mathord', 'text': 'x', 'role': None, 'var': 0}, - {'id': 5, 'type': 'bin', 'text': '+', 'role': None}, - {'id': 6, 'type': 'textord', 'text': '1', 'role': None}, - {'id': 7, 'type': 'rel', 'text': '=', 'role': None}, - {'id': 8, 'type': 'mathord', 'text': 'y', 'role': None, 'var': 1}] - -FormulaGroup ->>>>>>>>>>>>>>> - -Call ``FormulaGroup`` class to parse the equations. The related attributes and functions are the same as those above. 
- -:: - - import matplotlib.pyplot as plt - from EduNLP.Formula import Formula - from EduNLP.Formula import FormulaGroup - from EduNLP.Formula.viz import ForestPlotter - >>> fs = FormulaGroup(["x^2 = y", "x^3 = y^2", "x + y = \pi"]) - >>> fs - ;;> - >>> fs.elements - [{'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None}, - {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base'}, - {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, - {'id': 3, 'type': 'rel', 'text': '=', 'role': None}, - {'id': 4, 'type': 'mathord', 'text': 'y', 'role': None}, - {'id': 5, 'type': 'supsub', 'text': '\\supsub', 'role': None}, - {'id': 6, 'type': 'mathord', 'text': 'x', 'role': 'base'}, - {'id': 7, 'type': 'textord', 'text': '3', 'role': 'sup'}, - {'id': 8, 'type': 'rel', 'text': '=', 'role': None}, - {'id': 9, 'type': 'supsub', 'text': '\\supsub', 'role': None}, - {'id': 10, 'type': 'mathord', 'text': 'y', 'role': 'base'}, - {'id': 11, 'type': 'textord', 'text': '2', 'role': 'sup'}, - {'id': 12, 'type': 'mathord', 'text': 'x', 'role': None}, - {'id': 13, 'type': 'bin', 'text': '+', 'role': None}, - {'id': 14, 'type': 'mathord', 'text': 'y', 'role': None}, - {'id': 15, 'type': 'rel', 'text': '=', 'role': None}, - {'id': 16, 'type': 'mathord', 'text': '\\pi', 'role': None}] - >>> fs.ast - [{'val': {'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None}, - 'structure': {'bro': [None, 3], - 'child': [1, 2], - 'father': None, - 'forest': None}}, - {'val': {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base'}, - 'structure': {'bro': [None, 2], - 'child': None, - 'father': 0, - 'forest': [6, 12]}}, - {'val': {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, - 'structure': {'bro': [1, None], 'child': None, 'father': 0, 'forest': None}}, - {'val': {'id': 3, 'type': 'rel', 'text': '=', 'role': None}, - 'structure': {'bro': [0, 4], 'child': None, 'father': None, 'forest': None}}, - {'val': {'id': 4, 'type': 'mathord', 'text': 'y', 'role': None}, - 'structure': {'bro': [3, None], - 'child': None, - 'father': None, - 'forest': [10, 14]}}, - {'val': {'id': 5, 'type': 'supsub', 'text': '\\supsub', 'role': None}, - 'structure': {'bro': [None, 8], - 'child': [6, 7], - 'father': None, - 'forest': None}}, - {'val': {'id': 6, 'type': 'mathord', 'text': 'x', 'role': 'base'}, - show more (open the raw output data in a text editor) ... - >>> fs.variable_standardization()[0] - [{'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None}, {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base', 'var': 0}, {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, {'id': 3, 'type': 'rel', 'text': '=', 'role': None}, {'id': 4, 'type': 'mathord', 'text': 'y', 'role': None, 'var': 1}] - >>> ForestPlotter().export(fs.ast_graph, root_list=[node["val"]["id"] for node in fs.ast if node["structure"]["father"] is None],) - -.. figure:: ../../_static/formulagroup.png - - -Text syntax structure parsing --------------------- - -This section is mainly realized by EduNLP.SIF.Parse module. Its main function is to extract letters and numbers in the text and convert them into standard format. - -This module is mainly used as an *middle module* to parse the input text. Users generally do not call this module directly. - -Introduction of main content -+++++++++++++++ - -1. Judge the type of the incoming text in the following order - -* is_chinese: its function is to match Chinese characters[\u4e00-\u9fa5]. - -* is_alphabet: its function is to match alphabets other than formulas. 
Only the alphabets between two Chinese characters will be corrected (wrapped with $$), and the rest of the cases are regarded as formulas that do not conform to latex syntax. - -* is_number: its function is to match numbers other than formulas. Only the numbers between two Chinese characters will be corrected, and the rest of the cases are regarded as formulas that do not conform to latex syntax. - -2. Match latex formula - -* If Chinese characters appear in latex, print warning only once. - -* Use _is_formula_legal function, check the completeness and analyzability of latex formula, and report an error for formulas that do not conform to latex syntax. - -Call the library ->>>>>>>>>>>> - -:: - - from EduNLP.SIF.Parser import Parser - -Input ->>>>>>> - -Types: str - -Content: question text - -:: - - >>> text1 = '生产某种零件的A工厂25名工人的日加工零件数_ _' - >>> text2 = 'X的分布列为( )' - >>> text3 = '① AB是⊙O的直径,AC是⊙O的切线,BC交⊙O于点E.AC的中点为D' - >>> text4 = '支持公式如$\\frac{y}{x}$,$\\SIFBlank$,$\\FigureID{1}$,不支持公式如$\\frac{ \\dddot y}{x}$' - -Parsing ->>>>>>>>>>>>>>>>>>>> - -:: - - >>> text_parser1 = Parser(text1) - >>> text_parser2 = Parser(text2) - >>> text_parser3 = Parser(text3) - >>> text_parser4 = Parser(text4) - -Related parameters description(?) ->>>>>>>>>>>> - -- Try to convert text to standard format - -:: - - >>> text_parser1.description_list() - >>> print('text_parser1.text:',text_parser1.text) - text_parser1.text: 生产某种零件的$A$工厂$25$名工人的日加工零件数$\SIFBlank$ - >>> text_parser2.description_list() - >>> print('text_parser2.text:',text_parser2.text) - text_parser2.text: $X$的分布列为$\SIFChoice$ - -- Determine if the text has syntax errors - -:: - - >>> text_parser3.description_list() - >>> print('text_parser3.error_flag: ',text_parser3.error_flag) - text_parser3.error_flag: 1 - >>> text_parser4.description_list() - >>> print('text_parser4.fomula_illegal_flag: ',text_parser4.fomula_illegal_flag) - text_parser4.fomula_illegal_flag: 1 - diff --git a/docs/source/tutorial/en/parse/FormulaSyntaxStructureParsing.rst b/docs/source/tutorial/en/parse/FormulaSyntaxStructureParsing.rst deleted file mode 100644 index 2fc479c5..00000000 --- a/docs/source/tutorial/en/parse/FormulaSyntaxStructureParsing.rst +++ /dev/null @@ -1,168 +0,0 @@ -Formula syntax structure parsing --------------------- - -This section is mainly realized by EduNLP.Formula modules, which can determine if the text has syntax errors and convert the syntax formula into the form of ast tree. In practice, this module is often used as part of an intermediate process, and the relevant parameters of this module can be automatically chosen by calling the corresponding model, so it generally does not need special attention. - -Introduction of Main Content -+++++++++++++++ - -1.Formula: determine whether the single formula passed in is in str form. If so, use the ast method for processing, otherwise an error will be reported. In addition, parameter variable_standardization is given. If this parameter is true, the variable standardization method will be used to make sure the same variable has the same variable number. - -2.FormulaGroup: If you need to pass in a formula set, you can call this interface to get an ast forest. The tree structure in the forest is the same as that of Formula. - -Formula ->>>>>>>>>>>> - -Formula: firstly, in the word segmentation function, the formula of the original text is segmented. In addition, ``Formula parse tree`` function is provided, which can represent the abstract syntax analysis tree of mathematical formula in the form of text or picture. 
- -This module also provides the function of formula variable standardization, such as determining that 'x' in several sub formulas is the same variable. - -Initialization -+++++++++ - -Incoming parameters: item - -Item is the latex formula or the abstract syntax parse tree generated after the formula is parsed and its type is str or List[Dict]. - -:: - - >>> f=Formula("x^2 + x+1 = y") - >>> f - - -View the specific content after formula segmentation -++++++++++++++++++++++++++++ - -- View node elements after formula segmentation - -:: - - >>> f.elements - [{'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None}, - {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base'}, - {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, - {'id': 3, 'type': 'bin', 'text': '+', 'role': None}, - {'id': 4, 'type': 'mathord', 'text': 'x', 'role': None}, - {'id': 5, 'type': 'bin', 'text': '+', 'role': None}, - {'id': 6, 'type': 'textord', 'text': '1', 'role': None}, - {'id': 7, 'type': 'rel', 'text': '=', 'role': None}, - {'id': 8, 'type': 'mathord', 'text': 'y', 'role': None}] - -- View the abstract parsing tree of formulas - -:: - - >>> f.ast - [{'val': {'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None}, - 'structure': {'bro': [None, 3],'child': [1, 2],'father': None,'forest': None}}, - {'val': {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base'}, - 'structure': {'bro': [None, 2], 'child': None, 'father': 0, 'forest': None}}, - {'val': {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, - 'structure': {'bro': [1, None], 'child': None, 'father': 0, 'forest': None}}, - {'val': {'id': 3, 'type': 'bin', 'text': '+', 'role': None}, - 'structure': {'bro': [0, 4], 'child': None, 'father': None, 'forest': None}}, - {'val': {'id': 4, 'type': 'mathord', 'text': 'x', 'role': None}, - 'structure': {'bro': [3, 5], 'child': None, 'father': None, 'forest': None}}, - {'val': {'id': 5, 'type': 'bin', 'text': '+', 'role': None}, - 'structure': {'bro': [4, 6], 'child': None, 'father': None, 'forest': None}}, - {'val': {'id': 6, 'type': 'textord', 'text': '1', 'role': None}, - 'structure': {'bro': [5, 7], 'child': None, 'father': None, 'forest': None}}, - {'val': {'id': 7, 'type': 'rel', 'text': '=', 'role': None}, - 'structure': {'bro': [6, 8], 'child': None, 'father': None, 'forest': None}}, - {'val': {'id': 8, 'type': 'mathord', 'text': 'y', 'role': None}, - 'structure': {'bro': [7, None],'child': None,'father': None,'forest': None}}] - - >>> print('nodes: ',f.ast_graph.nodes) - nodes: [0, 1, 2, 3, 4, 5, 6, 7, 8] - >>> print('edges: ' ,f.ast_graph.edges) - edges: [(0, 1), (0, 2)] - -- show the abstract parse tree by a picture - -:: - - >>> ForestPlotter().export(f.ast_graph, root_list=[node["val"]["id"] for node in f.ast if node["structure"]["father"] is None],) - >>> plt.show() - -.. figure:: ../../../_static/formula.png - -Variable Standardization -+++++++++++ - -This parameter makes the same variable have the same variable number. - -For example: the number of variable ``x`` is ``0`` and the number of variable ``y`` is ``1``. 
- -:: - - >>> f.variable_standardization().elements - [{'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None}, - {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base', 'var': 0}, - {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, - {'id': 3, 'type': 'bin', 'text': '+', 'role': None}, - {'id': 4, 'type': 'mathord', 'text': 'x', 'role': None, 'var': 0}, - {'id': 5, 'type': 'bin', 'text': '+', 'role': None}, - {'id': 6, 'type': 'textord', 'text': '1', 'role': None}, - {'id': 7, 'type': 'rel', 'text': '=', 'role': None}, - {'id': 8, 'type': 'mathord', 'text': 'y', 'role': None, 'var': 1}] - -FormulaGroup ->>>>>>>>>>>>>>> - -Call ``FormulaGroup`` class to parse the equations. The related attributes and functions are the same as those above. - -:: - - >>> fs = FormulaGroup(["x^2 = y", "x^3 = y^2", "x + y = \pi"]) - >>> fs - ;;> - >>> fs.elements - [{'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None}, - {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base'}, - {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, - {'id': 3, 'type': 'rel', 'text': '=', 'role': None}, - {'id': 4, 'type': 'mathord', 'text': 'y', 'role': None}, - {'id': 5, 'type': 'supsub', 'text': '\\supsub', 'role': None}, - {'id': 6, 'type': 'mathord', 'text': 'x', 'role': 'base'}, - {'id': 7, 'type': 'textord', 'text': '3', 'role': 'sup'}, - {'id': 8, 'type': 'rel', 'text': '=', 'role': None}, - {'id': 9, 'type': 'supsub', 'text': '\\supsub', 'role': None}, - {'id': 10, 'type': 'mathord', 'text': 'y', 'role': 'base'}, - {'id': 11, 'type': 'textord', 'text': '2', 'role': 'sup'}, - {'id': 12, 'type': 'mathord', 'text': 'x', 'role': None}, - {'id': 13, 'type': 'bin', 'text': '+', 'role': None}, - {'id': 14, 'type': 'mathord', 'text': 'y', 'role': None}, - {'id': 15, 'type': 'rel', 'text': '=', 'role': None}, - {'id': 16, 'type': 'mathord', 'text': '\\pi', 'role': None}] - >>> fs.ast - [{'val': {'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None}, - 'structure': {'bro': [None, 3], - 'child': [1, 2], - 'father': None, - 'forest': None}}, - {'val': {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base'}, - 'structure': {'bro': [None, 2], - 'child': None, - 'father': 0, - 'forest': [6, 12]}}, - {'val': {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, - 'structure': {'bro': [1, None], 'child': None, 'father': 0, 'forest': None}}, - {'val': {'id': 3, 'type': 'rel', 'text': '=', 'role': None}, - 'structure': {'bro': [0, 4], 'child': None, 'father': None, 'forest': None}}, - {'val': {'id': 4, 'type': 'mathord', 'text': 'y', 'role': None}, - 'structure': {'bro': [3, None], - 'child': None, - 'father': None, - 'forest': [10, 14]}}, - {'val': {'id': 5, 'type': 'supsub', 'text': '\\supsub', 'role': None}, - 'structure': {'bro': [None, 8], - 'child': [6, 7], - 'father': None, - 'forest': None}}, - {'val': {'id': 6, 'type': 'mathord', 'text': 'x', 'role': 'base'}, - show more (open the raw output data in a text editor) ... - >>> fs.variable_standardization()[0] - [{'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None}, {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base', 'var': 0}, {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, {'id': 3, 'type': 'rel', 'text': '=', 'role': None}, {'id': 4, 'type': 'mathord', 'text': 'y', 'role': None, 'var': 1}] - >>> ForestPlotter().export(fs.ast_graph, root_list=[node["val"]["id"] for node in fs.ast if node["structure"]["father"] is None],) - -.. 
figure:: ../../../_static/formulagroup.png diff --git a/docs/source/tutorial/en/parse/TextSyntaxStructureParsing.rst b/docs/source/tutorial/en/parse/TextSyntaxStructureParsing.rst deleted file mode 100644 index bdfe6848..00000000 --- a/docs/source/tutorial/en/parse/TextSyntaxStructureParsing.rst +++ /dev/null @@ -1,72 +0,0 @@ -Text syntax structure parsing --------------------- - -This section is mainly realized by EduNLP.SIF.Parse module. Its main function is to extract letters and numbers in the text and convert them into standard format. - -This module is mainly used as an *middle module* to parse the input text. Users generally do not call this module directly. - -Introduction of Main Content -+++++++++++++++ - -1. Judge the type of the incoming text in the following order - -* is_chinese: its function is to match Chinese characters[\u4e00-\u9fa5]. - -* is_alphabet: its function is to match alphabets other than formulas. Only the alphabets between two Chinese characters will be corrected (wrapped with $$), and the rest of the cases are regarded as formulas that do not conform to latex syntax. - -* is_number: its function is to match numbers other than formulas. Only the numbers between two Chinese characters will be corrected, and the rest of the cases are regarded as formulas that do not conform to latex syntax. - -2. Match latex formula - -* If Chinese characters appear in latex, print warning only once. - -* Use _is_formula_legal function, check the completeness and analyzability of latex formula, and report an error for formulas that do not conform to latex syntax. - -Input ->>>>>>> - -Type: str - -Content:question text - -:: - - >>> text1 = '生产某种零件的A工厂25名工人的日加工零件数_ _' - >>> text2 = 'X的分布列为( )' - >>> text3 = '① AB是⊙O的直径,AC是⊙O的切线,BC交⊙O于点E.AC的中点为D' - >>> text4 = '支持公式如$\\frac{y}{x}$,$\\SIFBlank$,$\\FigureID{1}$,不支持公式如$\\frac{ \\dddot y}{x}$' - -Parsing ->>>>>>>>>>>>>>>>>>>> - -:: - - >>> text_parser1 = Parser(text1) - >>> text_parser2 = Parser(text2) - >>> text_parser3 = Parser(text3) - >>> text_parser4 = Parser(text4) - -Related parameters description(?) ->>>>>>>>>>>> - -- Try to convert text to standard format - -:: - - >>> text_parser1.description_list() - >>> print('text_parser1.text:',text_parser1.text) - text_parser1.text: 生产某种零件的$A$工厂$25$名工人的日加工零件数$\SIFBlank$ - >>> text_parser2.description_list() - >>> print('text_parser2.text:',text_parser2.text) - text_parser2.text: $X$的分布列为$\SIFChoice$ - -- Determine if the text has syntax errors - -:: - - >>> text_parser3.description_list() - >>> print('text_parser3.error_flag: ',text_parser3.error_flag) - text_parser3.error_flag: 1 - >>> text_parser4.description_list() - >>> print('text_parser4.fomula_illegal_flag: ',text_parser4.fomula_illegal_flag) - text_parser4.fomula_illegal_flag: 1 diff --git a/docs/source/tutorial/en/pretrain.rst b/docs/source/tutorial/en/pretrain.rst deleted file mode 100644 index 9319b87d..00000000 --- a/docs/source/tutorial/en/pretrain.rst +++ /dev/null @@ -1,130 +0,0 @@ -Pre-training -======= - -In the field of NLP, Pre-trained Language Models has become a very important basic technology. 
-In this chapter, we will introduce the pre training tools in EduNLP: - -* How to train with a corpus to get a pre-trained model -* How to load the pre-trained model -* Public pre-trained models - -Import modules ----------- - -:: - - from EduNLP.I2V import get_pretrained_i2v - from EduNLP.Vector import get_pretrained_t2v - -Train the Model ------------- - -Call train_Vector function interface directly to make the training model easier. This section calls the relevant training models in the gensim library. At present, the training methods of "sg"、 "cbow"、 "fastext"、 "d2v"、 "bow"、 "tfidf" are provided. Parameter embedding_dim is also provided for users to determine vector dimension according to their needs. - -Basic Steps -################## - -1.Determine the type of model and select the appropriate tokenizer (GensimWordTokenizer、 GensimSegTokenizer) to finish tokenization. - -2.Call train_vector function to get the required pre-trained model。 - -Examples: - -:: - - >>> tokenizer = GensimWordTokenizer(symbol="gmas", general=True) - >>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\ - ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$") - >>> print(token_item.tokens[:10]) - ['公式', '[FORMULA]', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[FORMULA]'] - - # 10 dimension with fasstext method - train_vector(sif_items, "../../../data/w2v/gensim_luna_stem_tf_", 10, method="d2v") - - -Load models --------- - -Transfer the obtained model to the I2V module to load the model. - -Examples: - -:: - - >>> model_path = "../test_model/test_gensim_luna_stem_tf_d2v_256.bin" - >>> i2v = D2V("text","d2v",filepath=model_path, pretrained_t2v = False) - -The overview of our public model ------------- - -Version description -################## - -First level version: - -* Public version 1 (luna_pub): college entrance examination -* Public version 2 (luna_pub_large): college entrance examination + regional examination - -Second level version: - -* Minor subjects(Chinese,Math,English,History,Geography,Politics,Biology,Physics,Chemistry) -* Major subjects(science, arts and all subject) - -Third level version【to be finished】: - -* Don't use third-party initializers -* Use third-party initializers - -Description of train data in models -################## - -* Currently, the data used in w2v and d2v models are the subjects of senior high school. -* test data:`[OpenLUNA.json] `_ - -At present, the following models are provided. More models of different subjects and question types are being trained. Please look forward to it. - "d2v_all_256" (all subject), "d2v_sci_256" (Science), "d2v_eng_256" (English),"d2v_lit_256" (Arts) - - -Examples of Model Training ------------- - -Get the dataset -#################### - -.. toctree:: - :maxdepth: 1 - :titlesonly: - - prepare_dataset <../../build/blitz/pretrain/prepare_dataset.ipynb> - -An example of d2v in gensim model -#################### - -.. toctree:: - :maxdepth: 1 - :titlesonly: - - d2v_bow_tfidf <../../build/blitz/pretrain/gensim/d2v_bow_tfidf.ipynb> - d2v_general <../../build/blitz/pretrain/gensim/d2v_general.ipynb> - d2v_stem_tf <../../build/blitz/pretrain/gensim/d2v_stem_tf.ipynb> - -An example of w2v in gensim model -#################### - -.. toctree:: - :maxdepth: 1 - :titlesonly: - - w2v_stem_text <../../build/blitz/pretrain/gensim/w2v_stem_text.ipynb> - w2v_stem_tf <../../build/blitz/pretrain/gensim/w2v_stem_tf.ipynb> - -An example of seg_token -#################### - -.. 
toctree:: - :maxdepth: 1 - :titlesonly: - - d2v.ipynb <../../build/blitz/pretrain/seg_token/d2v.ipynb> - d2v_d1 <../../build/blitz/pretrain/seg_token/d2v_d1.ipynb> - d2v_d2 <../../build/blitz/pretrain/seg_token/d2v_d2.ipynb> \ No newline at end of file diff --git a/docs/source/tutorial/en/pretrain/loading.rst b/docs/source/tutorial/en/pretrain/loading.rst deleted file mode 100644 index 31fa3ea8..00000000 --- a/docs/source/tutorial/en/pretrain/loading.rst +++ /dev/null @@ -1,11 +0,0 @@ -Load models --------- - -Transfer the obtained model to the I2V module to load the model. - -Examples: - -:: - - >>> model_path = "../test_model/test_gensim_luna_stem_tf_d2v_256.bin" - >>> i2v = D2V("text","d2v",filepath=model_path, pretrained_t2v = False) diff --git a/docs/source/tutorial/en/pretrain/pub.rst b/docs/source/tutorial/en/pretrain/pub.rst deleted file mode 100644 index 34407745..00000000 --- a/docs/source/tutorial/en/pretrain/pub.rst +++ /dev/null @@ -1,74 +0,0 @@ -The overview of our public model ------------- - - -Version Description -################## - -First level version: - -* Public version 1 (luna_pub): college entrance examination -* Public version 2 (luna_pub_large): college entrance examination + regional examination - -Second level version: - -* Minor subjects(Chinese,Math,English,History,Geography,Politics,Biology,Physics,Chemistry) -* Major subjects(science, arts and all subject) - -Third level version【to be finished】: - -* Don't use third-party initializers -* Use third-party initializers - -Description of train data in models -################## - -* Currently, the data used in w2v and d2v models are the subjects of senior high school. -* test data:`[OpenLUNA.json] `_ - -At present, the following models are provided. More models of different subjects and question types are being trained. Please look forward to it. - "d2v_all_256" (all subject), "d2v_sci_256" (Science), "d2v_eng_256" (English),"d2v_lit_256" (Arts) - -Examples of model training ------------- - -Get the dataset -#################### - -.. toctree:: - :maxdepth: 1 - :titlesonly: - - prepare_dataset <../../../build/blitz/pretrain/prepare_dataset.ipynb> - -An example of d2v in gensim model -#################### - -.. toctree:: - :maxdepth: 1 - :titlesonly: - - d2v_bow_tfidf <../../../build/blitz/pretrain/gensim/d2v_bow_tfidf.ipynb> - d2v_general <../../../build/blitz/pretrain/gensim/d2v_general.ipynb> - d2v_stem_tf <../../../build/blitz/pretrain/gensim/d2v_stem_tf.ipynb> - -An example of w2v in gensim model -#################### - -.. toctree:: - :maxdepth: 1 - :titlesonly: - - w2v_stem_text <../../../build/blitz/pretrain/gensim/w2v_stem_text.ipynb> - w2v_stem_tf <../../../build/blitz/pretrain/gensim/w2v_stem_tf.ipynb> - -An example of seg_token -#################### - -.. toctree:: - :maxdepth: 1 - :titlesonly: - - d2v.ipynb <../../../build/blitz/pretrain/seg_token/d2v.ipynb> - d2v_d1 <../../../build/blitz/pretrain/seg_token/d2v_d1.ipynb> - d2v_d2 <../../../build/blitz/pretrain/seg_token/d2v_d2.ipynb> diff --git a/docs/source/tutorial/en/pretrain/start.rst b/docs/source/tutorial/en/pretrain/start.rst deleted file mode 100644 index 9c5bc241..00000000 --- a/docs/source/tutorial/en/pretrain/start.rst +++ /dev/null @@ -1,24 +0,0 @@ -Train the model ------------- - -Call train_Vector function interface directly to make the training model easier. This section calls the relevant training models in the gensim library. At present, the training methods of "sg"、 "cbow"、 "fastext"、 "d2v"、 "bow"、 "tfidf" are provided. 
Parameter embedding_dim is also provided for users to choose the vector dimension according to their needs.
-
-Basic Steps
-##################
-
-1. Determine the type of model and select the appropriate tokenizer (GensimWordTokenizer, GensimSegTokenizer) to finish tokenization.
-
-2. Call the train_vector function to get the required pre-trained model.
-
-Examples:
-
-::
-
-    >>> tokenizer = GensimWordTokenizer(symbol="gmas", general=True)
-    >>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\
-    ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$")
-    >>> print(token_item.tokens[:10])
-    ['公式', '[FORMULA]', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[FORMULA]']
-
-    # train 10-dimensional vectors (here with the d2v method)
-    train_vector(sif_items, "../../../data/w2v/gensim_luna_stem_tf_", 10, method="d2v")
diff --git a/docs/source/tutorial/en/seg.rst b/docs/source/tutorial/en/seg.rst
deleted file mode 100644
index ad2696a2..00000000
--- a/docs/source/tutorial/en/seg.rst
+++ /dev/null
@@ -1,187 +0,0 @@
-Component Segmentation
-======================
-
-Educational resources are a kind of multimodal data, including text, pictures, formulas and so on.
-At the same time, they may also contain semantically different components, such as question stems, options, etc. Therefore, we first need to identify and segment the different components of an educational resource:
-
-* Semantic Component Segmentation
-* Structural Component Segmentation
-
-Main Processing Contents
-------------------------
-
-1. Convert multiple-choice questions in the form of dict to a qualified item by `Syntax parsing `_;
-
-2. The input items are segmented and grouped according to the element type.
-
-Semantic Component Segmentation
--------------------------------
-
-Because multiple-choice questions are given in the form of dict, it is necessary to convert them into text format while retaining their data relationships. This is realized by the dict2str4sif function, which converts multiple-choice question items into character format and identifies the question stem and options.
-
-Import Modules
-++++++++++++++
-
-::
-
-    from EduNLP.utils import dict2str4sif
-
-Basic Usage
-++++++++++++++++++
-
-::
-
-    >>> item = {
-    ...     "stem": r"若复数$z=1+2 i+i^{3}$,则$|z|=$",
-    ...     "options": ['0', '1', r'$\sqrt{2}$', '2'],
-    ...     }
-    >>> dict2str4sif(item) # doctest: +ELLIPSIS
-    '$\\SIFTag{stem_begin}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem_end}$$\\SIFTag{options_begin}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$$\\SIFTag{list_3}$2$\\SIFTag{options_end}$'
-
-Optional additional parameters / interfaces
-+++++++++++++++++++++++++++++++++++++++++++
-
-1. add_list_no_tag: if this parameter is true, the labels in the options section are numbered.
-
-::
-
-    >>> dict2str4sif(item, add_list_no_tag=True) # doctest: +ELLIPSIS
-    '$\\SIFTag{stem_begin}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem_end}$$\\SIFTag{options_begin}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$$\\SIFTag{list_3}$2$\\SIFTag{options_end}$'
-
-    >>> dict2str4sif(item, add_list_no_tag=False) # doctest: +ELLIPSIS
-    '$\\SIFTag{stem_begin}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem_end}$$\\SIFTag{options_begin}$0$\\SIFSep$1$\\SIFSep$$\\sqrt{2}$$\\SIFSep$2$\\SIFTag{options_end}$'
-
-2. tag_mode: this parameter selects where the labels are placed: 'delimiter' labels both the beginning and the end, 'head' labels only the head, and 'tail' labels only the tail.
- -:: - - >>> dict2str4sif(item, tag_mode="head") # doctest: +ELLIPSIS - '$\\SIFTag{stem}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{options}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$$\\SIFTag{list_3}$2' - - >>> dict2str4sif(item, tag_mode="tail") # doctest: +ELLIPSIS - '若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$$\\SIFTag{list_3}$2$\\SIFTag{options}$' - -3.key_as_tag: If this parameter is false, this process will only adds $\SIFSep$ between the options without distinguishing the type of segmentation label. - -:: - - >>> dict2str4sif(item, key_as_tag=False) - '若复数$z=1+2 i+i^{3}$,则$|z|=$0$\\SIFSep$1$\\SIFSep$$\\sqrt{2}$$\\SIFSep$2' - -Structural Component Segmentation ------------- - -This step is to segment sliced items. In this step, there is a depth option. You can select all positions or some labels for segmentation according to your needs, such as \SIFSep and \SIFTag. You can also select where to add labels, either at the head and tail or only at the head or tail. - - -There are two modes: - -* linear mode: it is used for text processing (word segmentation using jieba library); - -* ast mode: it is used to parse the formula. - -Basic Segmentation process: - -- Match components with regular expression matching - -- Process the components with special structures, such as converting the base64 encoded picture to numpy form - -- Classify the elements into each element group - -- Enter the corresponding parameters as required to get the filtered results - -Import Modules -+++++++++ - -:: - - from EduNLP.SIF.segment import seg - from EduNLP.SIF import sif4sci - -Basic Usage -++++++++++++++++++ - -:: - - >>> test_item = r"如图所示,则$\bigtriangleup ABC$的面积是$\SIFBlank$。$\FigureID{1}$" - >>> seg(test_item) - >>> ['如图所示,则', '\\bigtriangleup ABC', '的面积是', '\\SIFBlank', '。', \FigureID{1}] - -Optional additional parameters/interfaces -++++++++++++++++++++++ - -1.describe: count the number of elements of different types - -:: - - >>> s.describe() - {'t': 3, 'f': 1, 'g': 1, 'm': 1} - -2.filter: this interface can screen out one or more types of elements. - -Using this interface, you can pass in a "keep" parameter or a special character directly to choose what type of elements to retain. - -Element type represented by symbol: - -- "t": text -- "f": formula -- "g": figure -- "m": question mark -- "a": tag -- "s": sep tag - -:: - - >>> with s.filter("f"): - ... s - ['如图所示,则', '的面积是', '\\SIFBlank', '。', \FigureID{1}] - >>> with s.filter(keep="t"): - ... 
s - ['如图所示,则', '的面积是', '。'] - -3.symbol: this interface can convert some types of data into special symbols - -Element type represented by symbol: - -- "t": text -- "f": formula -- "g": figure -- "m": question mark - -:: - - >>> seg(test_item, symbol="fgm") - ['如图所示,则', '[FORMULA]', '的面积是', '[MARK]', '。', '[FIGURE]'] - >>> seg(test_item, symbol="tfgm") - ['[TEXT]', '[FORMULA]', '[TEXT]', '[MARK]', '[TEXT]', '[FIGURE]'] - -In addition,sif4sci function is also provided, which can easily convert items into the result processed by Structural Component Segmentation - -:: - - >>> segments = sif4sci(item["stem"], figures=figures, tokenization=False) - >>> segments - ['如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形', 'ABC', '的斜边', 'BC', ', 直角边', 'AB', ', ', 'AC', '.', '\\bigtriangleup ABC', '的三边所围成的区域记为', 'I', ',黑色部分记为', 'II', ', 其余部分记为', 'III', '.在整个图形中随机取一点,此点取自', 'I,II,III', '的概率分别记为', 'p_1,p_2,p_3', ',则', '\\SIFChoice', \FigureID{1}] - -- When calling this function, you can selectively output a certain type of data according to your needs - -:: - - >>> segments.formula_segments - ['ABC', - 'BC', - 'AB', - 'AC', - '\\bigtriangleup ABC', - 'I', - 'II', - 'III', - 'I,II,III', - 'p_1,p_2,p_3'] - -- Similar to seg function, sif4sci function also provides depth options to help with your research ----- By modifying the ``symbol`` parameter, different components can be transformed into specific markers. - -:: - - >>> sif4sci(item["stem"], figures=figures, tokenization=False, symbol="tfgm") - ['[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[MARK]', '[FIGURE]'] diff --git a/docs/source/tutorial/en/seg/SemanticComponentSegmentation.rst b/docs/source/tutorial/en/seg/SemanticComponentSegmentation.rst deleted file mode 100644 index 3901f4cb..00000000 --- a/docs/source/tutorial/en/seg/SemanticComponentSegmentation.rst +++ /dev/null @@ -1,47 +0,0 @@ -Semantic Component Segmentation ------------- - -Because multiple-choice questions are given in the form of dict, it is necessary to convert them into text format while retaining their data relationship. This function can be realized by dict2str4sif function which can convert multiple-choice question items into character format and identify question stem and options。 - - -Basic Usage -++++++++++++++++++ - -:: - - >>> item = { - ... "stem": r"若复数$z=1+2 i+i^{3}$,则$|z|=$", - ... "options": ['0', '1', r'$\sqrt{2}$', '2'], - ... } - >>> dict2str4sif(item) # doctest: +ELLIPSIS - '$\\SIFTag{stem_begin}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem_end}$$\\SIFTag{options_begin}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$$\\SIFTag{list_3}$2$\\SIFTag{options_end}$' - -Optional additional parameters / interfaces -++++++++++++++++++++++ - -1.add_list_no_tag: if this parameter is true, it means that you need to count the labels in the options section. 
- -:: - - >>> dict2str4sif(item, add_list_no_tag=True) # doctest: +ELLIPSIS - '$\\SIFTag{stem_begin}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem_end}$$\\SIFTag{options_begin}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$$\\SIFTag{list_3}$2$\\SIFTag{options_end}$' - - >>> dict2str4sif(item, add_list_no_tag=False) # doctest: +ELLIPSIS - '$\\SIFTag{stem_begin}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem_end}$$\\SIFTag{options_begin}$0$\\SIFSep$1$\\SIFSep$$\\sqrt{2}$$\\SIFSep$2$\\SIFTag{options_end}$' - -2.tag_mode: The location for the label can be selected using this parameter. 'delimiter' is to label both the beginning and the end,'head' is to label only the head, and 'tail' is to label only the tail. - -:: - - >>> dict2str4sif(item, tag_mode="head") # doctest: +ELLIPSIS - '$\\SIFTag{stem}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{options}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$$\\SIFTag{list_3}$2' - - >>> dict2str4sif(item, tag_mode="tail") # doctest: +ELLIPSIS - '若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$$\\SIFTag{list_3}$2$\\SIFTag{options}$' - -3.key_as_tag: If this parameter is false, this process will only adds $\SIFSep$ between the options without distinguishing the type of segmentation label. - -:: - - >>> dict2str4sif(item, key_as_tag=False) - '若复数$z=1+2 i+i^{3}$,则$|z|=$0$\\SIFSep$1$\\SIFSep$$\\sqrt{2}$$\\SIFSep$2' \ No newline at end of file diff --git a/docs/source/tutorial/en/seg/StructuralComponentSegmentation.rst b/docs/source/tutorial/en/seg/StructuralComponentSegmentation.rst deleted file mode 100644 index 8661c3d6..00000000 --- a/docs/source/tutorial/en/seg/StructuralComponentSegmentation.rst +++ /dev/null @@ -1,67 +0,0 @@ -Structural Component Segmentation ------------- - -This step is to segment sliced items. In this step, there is a depth option. You can select all positions or some labels for segmentation according to your needs, such as \SIFSep and \SIFTag. You can also select where to add labels, either at the head and tail or only at the head or tail. - - -There are two modes: - -* linear mode: it is used for text processing (word segmentation using jieba library); - -* ast mode: it is used to parse the formula. - -Basic Usage -++++++++++++++++++ - -:: - - >>> test_item = r"如图所示,则$\bigtriangleup ABC$的面积是$\SIFBlank$。$\FigureID{1}$" - >>> seg(test_item) - >>> ['如图所示,则', '\\bigtriangleup ABC', '的面积是', '\\SIFBlank', '。', \FigureID{1}] - -Optional additional parameters/interfaces -++++++++++++++++++++++ - -1.describe: count the number of elements of different types - -:: - - >>> s.describe() - {'t': 3, 'f': 1, 'g': 1, 'm': 1} - -2.filter: this interface can screen out one or more types of elements. - -Using this interface, you can pass in a "keep" parameter or a special character directly to choose what type of elements to retain. - -Element type represented by symbol: - "t": text - "f": formula - "g": figure - "m": question mark - "a": tag - "s": sep tag - -:: - - >>> with s.filter("f"): - ... s - ['如图所示,则', '的面积是', '\\SIFBlank', '。', \FigureID{1}] - >>> with s.filter(keep="t"): - ... 
s - ['如图所示,则', '的面积是', '。'] - -3.symbol: this interface can convert some types of data into special symbols - -Element type represented by symbol: - -- "t": text -- "f": formula -- "g": figure -- "m": question mark - -:: - - >>> seg(test_item, symbol="fgm") - ['如图所示,则', '[FORMULA]', '的面积是', '[MARK]', '。', '[FIGURE]'] - >>> seg(test_item, symbol="tfgm") - ['[TEXT]', '[FORMULA]', '[TEXT]', '[MARK]', '[TEXT]', '[FIGURE]'] diff --git a/docs/source/tutorial/en/sif.rst b/docs/source/tutorial/en/sif.rst index 877cb503..0cbe7cf3 100644 --- a/docs/source/tutorial/en/sif.rst +++ b/docs/source/tutorial/en/sif.rst @@ -1,145 +1,2 @@ Standard Item Format -=============== - -version: 0.2 - -For the convenience of follow-up research and use, we need a unified test question grammar standard. - -Grammar Rules ------------ - -1. Only Chinese characters, Chinese and English punctuation and line breaks are allowed in the question text. - -2. Represent underlines of blanks and brackets of choices with ``\$\SIFBlank\$`` and ``\$\SIFChoice\$`` respectively. - -3. We use ``$\FigureID{ uuid }$`` or Base64 to represent pictures. Especially, ``$\FormFigureID{ uuid }$`` is used to represent formulas pictures. - -4. Text format description: we represent text in different styles with ``$\textf{item,CHAR_EN}$``. Currently, we have defined some styles: b-bold, i-italic, u-underline, w-wave, d-dotted, t-title. CHAR_EN Labels can be mixed and sorted alphabetically. An example: $\textf{EduNLP, b}$ looks **EduNLP** - -5. Other mathematical symbols like English letters, Roman characters and numbers need to be expressed in latex format, that is, embedded in `$$`. - -6. For the entry standard of molecular formula, please refer to `INCHI `_ for the time being. - -7. Currently, there are no requirements for latex internal syntax. - -:: - - 1. Item -> CHARACTER|EN_PUN_LIST|CH_PUN_LIST|FORMULA|QUES_MARK - 2. EN_PUN_LIST -> [',', '.', '?', '!', ':', ';', '\'', '\"', '(', ')', ' ','_','/','|','\\','<','>','[',']','-'] - 3. CH_PUN_LIST -> [',', '。', '!', '?', ':',';', '‘', '’', '“', '”', '(', ')', ' ', '、','《','》','—','.'] - 4. FORMULA -> $latex formula$ | $\FormFigureID{UUID}$ | $\FormFigureBase64{BASE64}$ - 5. FIGURE -> $\FigureID{UUID}$ | $\FigureBase64{BASE64}$ - 6. UUID -> [a-zA-Z\-0-9]+ - 7. CHARACTER -> CHAR_EN | CHAR_CH - 8. CHAR_EN -> [a-zA-Z]+ - 9. CHAR_CH -> [\u4e00-\u9fa5]+ - 10. DIGITAL -> [0-9]+ - 11. QUES_MARK -> $\SIFBlank$ | $\SIFChoice$ - - -Tips -+++++++++++++++ - -1. Reserved characters and escape characters. - -2. Numbers. - -3. Choices and blanks. - -4. A single number or letter is also required to be between `$$` (automatic verification could already realize it). - -5. Try to make sure Chinese is not included in the latex formula such as ``\text{CHAR_CH}``. - -6. When importing data using MySQL database, an ``\`` is automatically ignored which needs to be further processed as``\\``. - -Examples ------------------ - -Standard Format: - -:: - - 1. 若$x,y$满足约束条件$\\left\\{\\begin{array}{c}2 x+y-2 \\leq 0 \\\\ x-y-1 \\geq 0 \\\\ y+1 \\geq 0\\end{array}\\right.$,则$z=x+7 y$的最大值$\\SIFUnderline$' - - 2. 已知函数$f(x)=|3 x+1|-2|x|$画出$y=f(x)$的图像求不等式$f(x)>f(x+1)$的解集$\\PictureID{3bf2ddf4-8af1-11eb-b750-b46bfc50aa29}$$\\PictureID{59b8bd14-8af1-11eb-93a5-b46bfc50aa29}$$\\PictureID{63118b3a-8b75-11eb-a5c0-b46bfc50aa29}$$\\PictureID{6a006179-8b76-11eb-b386-b46bfc50aa29}$$\\PictureID{088f15eb-8b7c-11eb-a86f-b46bfc50aa29}$ - -Non-standard Format: - -1. 
Letters, numbers and mathematical symbols are mixed: - - For example: - - ``完成下面的2x2列联表,`` - - ``(单位:m3)`` - - ``则输出的n=`` - -2. Some special mathematical symbols are not represented by the latex formula: - - For example: - - ``命题中真命题的序号是 ①`` - - ``AB是⊙O的直径,AC是⊙O的切线,BC交⊙O于点E.若D为AC的中点`` - -3. There are unicode encoded characters in the text: - - For example: - ``则$a$的取值范围是(\u3000\u3000)`` - -Functions for judging whether text is in SIF format and converting to SIF format --------------------------------------------- - -Call the Library -++++++++ -:: - - from EduNLP.SIF import is_sif, to_sif - -is_sif -+++++++++++ - -:: - - >>> text1 = '若$x,y$满足约束条件' - >>> text2 = '$\\left\\{\\begin{array}{c}2 x+y-2 \\leq 0 \\\\ x-y-1 \\geq 0 \\\\ y+1 \\geq 0\\end{array}\\right.$,' - >>> text3 = '则$z=x+7 y$的最大值$\\SIFUnderline$' - >>> text4 = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...' - >>> is_sif(text1) - True - >>> is_sif(text2) - True - >>> is_sif(text3) - True - >>> is_sif(text4) - False - -to_sif -+++++++++++ - -:: - - >>> text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...' - >>> to_sif(text) - '某校一个课外学习小组为研究某作物的发芽率$y$和温度$x$(单位...' - - -Change Log ----------------- - -2021-05-18 - -Changed - -1. Originally, we use ``\$\SIFUnderline\$`` and ``\$\SIFBracket\$`` to represent underlines of blanks and brackets of choices. Now we represent them with ``\$\SIFBlank\$`` and ``\$\SIFChoice\$``. - -2. Originally, we used ``$\PictureID{ uuid }$`` to represent pictures, but now we use ``$\FigureID{ uuid }$`` instead. Especially, ``$\FormFigureID{ uuid }$`` is used to represent formulas pictures. - -2021-06-28 - -Added: - -1. There should not be line breaks between the notation ``$$``. - -2. Add text format description. +==================== diff --git a/docs/source/tutorial/en/tokenization/GensimSegTokenizer.rst b/docs/source/tutorial/en/tokenization/GensimSegTokenizer.rst deleted file mode 100644 index eb624e94..00000000 --- a/docs/source/tutorial/en/tokenization/GensimSegTokenizer.rst +++ /dev/null @@ -1,9 +0,0 @@ -GensimSegTokenizer -===================== - -By default, the pictures, separators, blanks in the question text and other parts of the incoming item are converted into special characters for data security and tokenization of text, formulas and labels. Also, the tokenizer uses linear analysis method for text and abstract analysis method of syntax tree for formulas. - -Compared to GensimWordTokenizer, the main differences are: - -* It provides the depth option for segmentation position, such as \SIFSep and \SIFTag. -* By default, labels are inserted in the header of item components (such as text and formula). \ No newline at end of file diff --git a/docs/source/tutorial/en/tokenization/GensimWordTokenizer.rst b/docs/source/tutorial/en/tokenization/GensimWordTokenizer.rst deleted file mode 100644 index 98d4b10a..00000000 --- a/docs/source/tutorial/en/tokenization/GensimWordTokenizer.rst +++ /dev/null @@ -1,23 +0,0 @@ -GensimWordTokenizer -===================== - -By default, the pictures, blanks in the question text and other parts of the incoming item are converted into special characters for data security and the tokenization of text, formulas, labels and separators. Also, the tokenizer uses linear analysis method for text and abstract syntax tree method for formulas respectively. You can choose each of them by ``general`` parameter: - --true, it means that the incoming item conforms to SIF and the linear analysis method should be used. 
--false, it means that the incoming item does not conform to SIF, so the abstract syntax tree method should be used.
-
-Examples
-----------
-
-::
-
-    >>> tokenizer = GensimWordTokenizer(symbol="gmas", general=True)
-    >>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\
-    ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$")
-    >>> print(token_item.tokens[:10])
-    ['公式', '[FORMULA]', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[FORMULA]']
-    >>> tokenizer = GensimWordTokenizer(symbol="fgmas", general=False)
-    >>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\
-    ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$")
-    >>> print(token_item.tokens[:10])
-    ['公式', '[FORMULA]', '如图', '[FIGURE]', '[FORMULA]', '约束条件', '公式', '[FORMULA]', '[SEP]', '[FORMULA]']
diff --git a/docs/source/tutorial/en/tokenization/PureTextTokenizer.rst b/docs/source/tutorial/en/tokenization/PureTextTokenizer.rst
deleted file mode 100644
index 8c36e67c..00000000
--- a/docs/source/tutorial/en/tokenization/PureTextTokenizer.rst
+++ /dev/null
@@ -1,31 +0,0 @@
-PureTextTokenizer
-=================
-
-By default, the pictures, labels, separators and blanks in the question text of the incoming item are converted into special characters for data security. At the same time, special formulas such as $\\FormFigureID{...}$ and $\\FormFigureBase64{...}$ are screened out, which facilitates the tokenization of text and plain-text formulas. The tokenizer uses the linear analysis method for text and formulas, and the ``key`` parameter is used to preprocess the incoming item; it will be improved based on users' requirements in the future.
-
-Examples
-----------
-
-::
-
-    >>> tokenizer = PureTextTokenizer()
-    >>> items = ["有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\
-    ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$"]
-    >>> tokens = tokenizer(items)
-    >>> next(tokens)[:10]
-    ['公式', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[SEP]', 'z']
-    >>> items = ["已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$"]
-    >>> tokens = tokenizer(items)
-    >>> next(tokens) # doctest: +NORMALIZE_WHITESPACE
-    ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<',
-    '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',',
-    '\\quad', 'A', '\\cap', 'B', '=']
-    >>> items = [{
-    ...     "stem": "已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$",
-    ...     "options": ["1", "2"]
-    ...     
}] - >>> tokens = tokenizer(items, key=lambda x: x["stem"]) - >>> next(tokens) # doctest: +NORMALIZE_WHITESPACE - ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', - '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', - '\\quad', 'A', '\\cap', 'B', '='] diff --git a/docs/source/tutorial/en/tokenization/TextTokenizer.rst b/docs/source/tutorial/en/tokenization/TextTokenizer.rst deleted file mode 100644 index 08991be6..00000000 --- a/docs/source/tutorial/en/tokenization/TextTokenizer.rst +++ /dev/null @@ -1,27 +0,0 @@ -TextTokenizer -================ - -By default, the pictures, labels, separators, blanks in the question text and other parts of the incoming item are converted into special characters for data security and tokenization of text and formulas. Also, the tokenizer uses linear analysis method for text and formulas, and the ``key`` parameter provided is used to preprocess the incoming item, which will be improved based on users' requirements in the future. - - -Examples ----------- - -:: - - >>> items = ["已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$"] - >>> tokenizer = TextTokenizer() - >>> tokens = tokenizer(items) - >>> next(tokens) # doctest: +NORMALIZE_WHITESPACE - ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', - '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', - '\\quad', 'A', '\\cap', 'B', '='] - >>> items = [{ - ... "stem": "已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$", - ... "options": ["1", "2"] - ... }] - >>> tokens = tokenizer(items, key=lambda x: x["stem"]) - >>> next(tokens) # doctest: +NORMALIZE_WHITESPACE - ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', - '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', - '\\quad', 'A', '\\cap', 'B', '='] diff --git a/docs/source/tutorial/en/tokenize.rst b/docs/source/tutorial/en/tokenize.rst deleted file mode 100644 index 3411b74b..00000000 --- a/docs/source/tutorial/en/tokenize.rst +++ /dev/null @@ -1,172 +0,0 @@ -Tokenization -======= - -Tokenization, known as word segmentation and sentence segmentation, is a basic but very important step in the field of NLP. -In EduNLP, we divided Tokenization into different levels according to different granularity. To avoid ambiguity, we define as follows: - -* Word/char level: word segmentation - -* Sentence level: sentence segmentation - -* Resource level: tokenization - -This module provides tokenization function of question text, converting questions into token sequences to facilitate the vectorization of questions. After that, each element in the sliced item needs word segmentation. In this step, there is a depth option. You can select all positions or some labels for segmentation according to your needs, such as \SIFSep and \SIFTag. You can also select where to add labels, either at the head and tail or only at the head or tail. - -There are two modes: one is linear mode, which is used for text processing (word segmentation using jieba library). The other one is ast mode, which is used to parse the formula. - -Word Segmentation -------- - -Text-tokenization: A sentence (without formulas) consists of several "words" in order. 
The process of dividing a sentence into several words is called "Text-tokenization". According to the granularity of "words", it can be subdivided into "Word-tokenization" and "Char-tokenization". - -:: - - - Word-tokenization: each phrase is a token. - - - Char-tokenization: each character is a token. - - -Text-tokenization is divided into two main steps: - -1. Text-tokenization: - - - Word-tokenization: use the word segmentation tool to segment and extract words from the question text. Our project supports `jieba`. - - - Char-tokenization: process text by character. - -2. Filter: filter the specified stopwords. - - The default stopwords used in this project:`[stopwords] `_ - You can also use your own stopwords. The following example demonstrates how to use. - -Examples: - -:: - - from EduNLP.SIF.tokenization.text import tokenize - >>> text = "三角函数是基本初等函数之一" - >>> tokenize(text, granularity="word") - ['三角函数', '初等', '函数'] - - >>> tokenize(text, granularity="char") - ['三', '角', '函', '数', '基', '初', '函', '数'] - -Sentence Segmentation -------- - -During the process of sentence segmentation, a long document is divided into several sentences. Each sentence is a "token" (to be realized). - -Tokenization -------- -Tokenization is comprehensive analysis. In this process, sentences with formulas are segmented into several markers. Each marker is a "token". - -The implementation of this function is tokenize function. The required results can be obtained by passing in items after Structural Component Segmentation. - -:: - - from EduNLP.Tokenizer import get_tokenizer - >>> items = "如图所示,则三角形$ABC$的面积是$\\SIFBlank$。$\\FigureID{1}$" - >>> tokenize(SegmentList(items)) - ['如图所示', '三角形', 'ABC', '面积', '\\\\SIFBlank', \\FigureID{1}] - >>> tokenize(SegmentList(items),formula_params={"method": "ast"}) - ['如图所示', '三角形', , '面积', '\\\\SIFBlank', \\FigureID{1}] - - - -You can view ``./EduNLP/Tokenizer/tokenizer.py`` and ``./EduNLP/Pretrain/gensim_vec.py`` for more tokenizers. We provide some encapsulated tokenizers for users to call them conveniently. Following is a complete list of tokenizers: - -- TextTokenizer - -- PureTextTokenizer - -- GensimSegTokenizer - -- GensimWordTokenizer - - -TextTokenizer -+++++++++++++++++++++ - -By default, the pictures, labels, separators, blanks in the question text and other parts of the incoming item are converted into special characters for data security and tokenization of text and formulas. Also, the tokenizer uses linear analysis method for text and formulas, and the ``key`` parameter provided is used to preprocess the incoming item, which will be improved based on users' requirements in the future. - -:: - - >>> items = ["已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$"] - >>> tokenizer = TextTokenizer() - >>> tokens = tokenizer(items) - >>> next(tokens) # doctest: +NORMALIZE_WHITESPACE - ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', - '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', - '\\quad', 'A', '\\cap', 'B', '='] - >>> items = [{ - ... "stem": "已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$", - ... "options": ["1", "2"] - ... 
}] - >>> tokens = tokenizer(items, key=lambda x: x["stem"]) - >>> next(tokens) # doctest: +NORMALIZE_WHITESPACE - ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', - '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', - '\\quad', 'A', '\\cap', 'B', '='] - -PureTextTokenizer -+++++++++++++++++++++ - -By default, the pictures, labels, separators, blanks in the question text and other parts of the incoming item are converted into special characters for data security. At the same time, special formulas such as $\\FormFigureID{...}$ and $\\FormFigureBase64{...}$ are screened out to facilitate the tokenization of text and plain text formulas. Also, the tokenizer uses linear analysis method for text and formulas, and the ``key`` parameter provided is used to preprocess the incoming item, which will be improved based on users' requirements in the future. - -:: - - >>> tokenizer = PureTextTokenizer() - >>> items = ["有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\ - ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$"] - >>> tokens = tokenizer(items) - >>> next(tokens)[:10] - ['公式', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[SEP]', 'z'] - >>> items = ["已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$"] - >>> tokens = tokenizer(items) - >>> next(tokens) # doctest: +NORMALIZE_WHITESPACE - ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', - '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', - '\\quad', 'A', '\\cap', 'B', '='] - >>> items = [{ - ... "stem": "已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$", - ... "options": ["1", "2"] - ... }] - >>> tokens = tokenizer(items, key=lambda x: x["stem"]) - >>> next(tokens) # doctest: +NORMALIZE_WHITESPACE - ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', - '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', - '\\quad', 'A', '\\cap', 'B', '='] - -GensimWordTokenizer -+++++++++++++++++++++++ - -By default, the pictures, blanks in the question text and other parts of the incoming item are converted into special characters for data security and the tokenization of text, formulas, labels and separators. Also, the tokenizer uses linear analysis method for text and abstract syntax tree method for formulas respectively. You can choose each of them by ``general`` parameter: - --true, it means that the incoming item conforms to SIF and the linear analysis method should be used. --false, it means that the incoming item doesn't conform to SIF and the abstract syntax tree method should be used. - -GensimSegTokenizer -++++++++++++++++++++ - -By default, the pictures, separators, blanks in the question text and other parts of the incoming item are converted into special characters for data security and tokenization of text, formulas and labels. Also, the tokenizer uses linear analysis method for text and abstract analysis method of syntax tree for formulas. - -Compared to GensimWordTokenizer, the main differences are: - -* It provides the depth option for segmentation position, such as \SIFSep and \SIFTag. 
-* By default, labels are inserted in the header of item components (such as text and formulas). - -Examples ----------- - -:: - - >>> tokenizer = GensimWordTokenizer(symbol="gmas", general=True) - >>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\ - ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$") - >>> print(token_item.tokens[:10]) - ['公式', '[FORMULA]', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[FORMULA]'] - >>> tokenizer = GensimWordTokenizer(symbol="fgmas", general=False) - >>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\ - ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$") - >>> print(token_item.tokens[:10]) - ['公式', '[FORMULA]', '如图', '[FIGURE]', '[FORMULA]', '约束条件', '公式', '[FORMULA]', '[SEP]', '[FORMULA]'] diff --git a/docs/source/tutorial/en/tokenize/Sentence Segmentation.rst b/docs/source/tutorial/en/tokenize/Sentence Segmentation.rst deleted file mode 100644 index 1a8d4950..00000000 --- a/docs/source/tutorial/en/tokenize/Sentence Segmentation.rst +++ /dev/null @@ -1,3 +0,0 @@ -Sentence Segmentation -------- -During the process of sentence segmentation, a long document is divided into several sentences. Each sentence is a "token" (to be realized). diff --git a/docs/source/tutorial/en/tokenize/Tokenization.rst b/docs/source/tutorial/en/tokenize/Tokenization.rst deleted file mode 100644 index fad25912..00000000 --- a/docs/source/tutorial/en/tokenize/Tokenization.rst +++ /dev/null @@ -1,29 +0,0 @@ -Tokenization -------- -Tokenization is comprehensive analysis. In this process, sentences with formulas are segmented into several markers. Each marker is a "token". -We provide some encapsulated tokenizers for users to call them conveniently. The following is a complete list of tokenizers. - -Examples - -:: - - >>> items = ["已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$"] - >>> tokenizer = TextTokenizer() - >>> tokens = tokenizer(items) - >>> next(tokens) # doctest: +NORMALIZE_WHITESPACE - ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', - '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', - '\\quad', 'A', '\\cap', 'B', '='] - - - -You can view ``./EduNLP/Tokenizer/tokenizer.py`` and ``./EduNLP/Pretrain/gensim_vec.py`` for more tokenizers. Following is a complete list of tokenizers: - -.. toctree:: - :maxdepth: 1 - :titlesonly: - - ../tokenization/TextTokenizer - ../tokenization/PureTextTokenizer - ../tokenization/GensimSegTokenizer - ../tokenization/GensimWordTokenizer diff --git a/docs/source/tutorial/en/tokenize/WordSegmentation.rst b/docs/source/tutorial/en/tokenize/WordSegmentation.rst deleted file mode 100644 index a85f4dae..00000000 --- a/docs/source/tutorial/en/tokenize/WordSegmentation.rst +++ /dev/null @@ -1,36 +0,0 @@ -Word segmentation -------- - -Text-tokenization: A sentence (without formulas) consists of several "words" in order. The process of dividing a sentence into several words is called "Text-tokenization". According to the granularity of "words", it can be subdivided into "Word-tokenization" and "Char-tokenization". - -:: - - - Word-tokenization: each phrase is a token. - - - Char-tokenization: each character is a token. - - -Text-tokenization is divided into two main steps: - -1. 
Text-tokenization: - - - Word-tokenization: use the word segmentation tool to segment and extract words from the question text. Our project supports `jieba`. - - - Char-tokenization: process text by character. - -2. Filter: filter the specified stopwords. - - The default stopwords used in this project:`[stopwords] `_ - You can also use your own stopwords. The following example demonstrates how to use. - -Examples: - -:: - - >>> text = "三角函数是基本初等函数之一" - >>> tokenize(text, granularity="word") - ['三角函数', '初等', '函数'] - - >>> tokenize(text, granularity="char") - ['三', '角', '函', '数', '基', '初', '函', '数'] - diff --git a/docs/source/tutorial/en/vectorization.rst b/docs/source/tutorial/en/vectorization.rst deleted file mode 100644 index 5b744eeb..00000000 --- a/docs/source/tutorial/en/vectorization.rst +++ /dev/null @@ -1,157 +0,0 @@ -Vectorization -========= - -This section provides a simple interface to convert the incoming items into vectors directly. Currently, the option of whether to use the pre training model is provided. You can choose according to your needs. If you don't want to use the pre-trained model, you can call D2V directly, or call get_pretrained_i2v function if you want to use the pre-trained model. - -- Don't use the pre-trained model - -- Use the pre-trained model - -Overview Flow ---------------------------- - -1.Perform `syntax parsing `_ on incoming items to get items in SIF format; - -2.Perform `component segmentation `_ on sif_items; - -3.Perform `tokenization `_ on segmented items; - -4.Use the existing or pre-trained model we provided to convert the tokenized items into vectors. - - -Don't use the pre-trained model: call existing models directly ------------------------------------- - -You can use any pre-trained model provided by yourself (just give the storage path of the model) to convert the given question text into vectors. - -* Advantages: it is flexible to use your own model and its parameters can be adjusted freely. - -Import modules -++++++++++ - -:: - - from EduNLP.I2V import D2V,W2V,get_pretrained_i2v - from EduNLP.Vector import T2V,get_pretrained_t2v - -Models provided -++++++++++++++++++++ - -- W2V - -- D2V - -- T2V - -W2V -<<<<<<<<< - -This model directly uses the relevant model methods in the gensim library to convert words into vectors. Currently, there are four methods: - - - FastText - - - Word2Vec - - - KeyedVectors - -:: - - >>> i2v = get_pretrained_i2v("test_w2v", "examples/test_model/data/w2v") # doctest: +ELLIPSIS - >>> item_vector, token_vector = i2v(["有学者认为:‘学习’,必须适应实际"]) - >>> item_vector # doctest: +ELLIPSIS - array([[...]], dtype=float32) - -D2V -<<<<<<<<<<<< - -This model is a comprehensive processing method which can convert items into vectors. Currently, the following methods are provided: - -- d2v: call doc2vec module in gensim library to convert items into vectors. - -- BowLoader: call corpora module in gensim library to convert docs into bows. - -- TfidfLoader: call TfidfModel module in gensim library to convert docs into bows. 
- -:: - - >>> item = {"如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\SIFChoice$$\FigureID{1}$"} - >>> model_path = "../test_model/test_gensim_luna_stem_tf_d2v_256.bin" - >>> i2v = D2V("text","d2v",filepath=model_path, pretrained_t2v = False) - >>> i2v(item) - ([array([ 4.76559885e-02, -1.60574958e-01, 1.94614579e-03, 2.40295693e-01, - 2.24517003e-01, -3.24351490e-02, 4.35789041e-02, -1.65670961e-02,... - -T2V -<<<<<<<<<< - -You can use any pre-trained model provided by yourself to represent the segmentation sequences of a group of questions as vectors (just give the storage path of the model). - -- Advantages: the model and its parameters can be adjusted independently and has strong flexibility. - -Input -^^^^^^^^^^ - -Types: list -Contents: the combination of each question segmentation sequences in one question group. ->You can transfer question text (`str` type) to tokens using ``GensimWordTokenizer`` model - -:: - - >>> token_items=['公式','[FORMULA]','公式','[FORMULA]','如图','[FIGURE]','x',',','y','约束条件','[SEP]','z','=','x','+','7','y','最大值','[MARK]'] - >>> path = "../test_model/test_gensim_luna_stem_tf_d2v_256.bin" - >>> t2v = T2V('d2v',filepath=path) - >>> t2v(token_items) - [array([ 0.0256574 , 0.06061139, -0.00121044, -0.0167674 , -0.0111706 , - 0.05325712, -0.02097339, -0.01613594, 0.02904145, 0.0185046 ,... - -Specific process of processing -++++++++++++++++++++ - -1.Call get_tokenizer function to get the result after word segmentation; - -2.Select the model provided for vectorization depending on the model used. - - -Use the pre-training model: call get_pretrained_i2v directly ---------------------------------------------- - -Use the pre-training model provided by EduNLP to convert the given question text into vectors. - -* Advantages: Simple and convenient. - -* Disadvantages: Only the model given in the project can be used, which has great limitations. - -* Call this function to obtain the corresponding pre-training model. At present, the following pre training models are provided: d2v_all_256, d2v_sci_256, d2v_eng_256 and d2v_lit_256. 
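A minimal end-to-end sketch (assuming the published "d2v_sci_256" weights can be downloaded; the call signature follows the W2V example above):
-
-::
-
-    >>> from EduNLP.I2V import get_pretrained_i2v
-    >>> i2v = get_pretrained_i2v("d2v_sci_256")
-    >>> item_vector, token_vector = i2v(["有学者认为:‘学习’,必须适应实际"])
-
-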
Selection and Use of Models
-###########################
-
-Select the pre-training model according to the subject:
-
-+--------------------------+--------------------------------+
-| Pre-training model name  | Subject of model training data |
-+==========================+================================+
-| d2v_all_256              | all subjects                   |
-+--------------------------+--------------------------------+
-| d2v_sci_256              | Science                        |
-+--------------------------+--------------------------------+
-| d2v_lit_256              | Arts                           |
-+--------------------------+--------------------------------+
-| d2v_eng_256              | English                        |
-+--------------------------+--------------------------------+
-
-
-The concrete processing steps
-#############################
-
-1. Download the corresponding pre-trained model.
-
-2. Pass the obtained model to D2V and process the items through it.
-
-Examples:
-
-::
-
-    >>> i2v = get_pretrained_i2v("d2v_sci_256")
-    >>> i2v(item)
diff --git a/docs/source/tutorial/en/vectorization/WithPre-trainedModel.rst b/docs/source/tutorial/en/vectorization/WithPre-trainedModel.rst
deleted file mode 100644
index 41dcab64..00000000
--- a/docs/source/tutorial/en/vectorization/WithPre-trainedModel.rst
+++ /dev/null
@@ -1,42 +0,0 @@
-Use the pre-training model: call get_pretrained_i2v directly
-------------------------------------------------------------
-
-Use the pre-training model provided by EduNLP to convert the given question text into vectors.
-
-* Advantages: simple and convenient.
-
-* Disadvantages: only the models given in the project can be used, which is a significant limitation.
-
-* Call this function to obtain the corresponding pre-training model. At present, the following pre-training models are provided: d2v_all_256, d2v_sci_256, d2v_eng_256 and d2v_lit_256.
-
-Selection and use of models
-###########################
-
-Select the pre-training model according to the subject:
-
-+--------------------------+--------------------------------+
-| Pre-training model name  | Subject of model training data |
-+==========================+================================+
-| d2v_all_256              | all subjects                   |
-+--------------------------+--------------------------------+
-| d2v_sci_256              | Science                        |
-+--------------------------+--------------------------------+
-| d2v_lit_256              | Arts                           |
-+--------------------------+--------------------------------+
-| d2v_eng_256              | English                        |
-+--------------------------+--------------------------------+
-
-The concrete processing steps
-#############################
-
-1. Download the corresponding pre-trained model.
-
-2. Pass the obtained model to D2V and process the items through it.
-
-Examples:
-
-::
-
-    >>> i2v = get_pretrained_i2v("d2v_sci_256")
-    >>> i2v(item)
diff --git a/docs/source/tutorial/en/vectorization/WithoutPre-trainedModel.rst b/docs/source/tutorial/en/vectorization/WithoutPre-trainedModel.rst
deleted file mode 100644
index 2989f8ba..00000000
--- a/docs/source/tutorial/en/vectorization/WithoutPre-trainedModel.rst
+++ /dev/null
@@ -1,21 +0,0 @@
-Don't use the pre-trained model: call existing models directly
---------------------------------------------------------------
-
-You can use any pre-trained model provided by yourself (just give the storage path of the model) to convert the given question text into vectors.
-
-* Advantages: it is flexible to use your own model, and its parameters can be adjusted freely.
-
-Specific processing steps
-+++++++++++++++++++++++++
-
-1. Call the get_tokenizer function to get the tokenized result;
-
-2. Select the provided model for vectorization according to the model used.
-
-
- -Examples: - -:: - - >>> model_path = "../test_model/test_gensim_luna_stem_tf_d2v_256.bin" - >>> i2v = D2V("text","d2v",filepath=model_path, pretrained_t2v = False) - >>> i2v(item) From 66f91c1d322cef9d6580d56f3f2f7f005a48c382 Mon Sep 17 00:00:00 2001 From: fannazya Date: Sat, 23 Oct 2021 20:27:06 +0800 Subject: [PATCH 4/6] Revert "Revert "Add English version tutorial"" This reverts commit c84f5ac96d29b4f2c444336634785f27163d4d6f. --- docs/source/tutorial/en/index.rst | 52 +++- docs/source/tutorial/en/parse.rst | 290 ++++++++++++++++++ .../parse/FormulaSyntaxStructureParsing.rst | 168 ++++++++++ .../en/parse/TextSyntaxStructureParsing.rst | 72 +++++ docs/source/tutorial/en/pretrain.rst | 130 ++++++++ docs/source/tutorial/en/pretrain/loading.rst | 11 + docs/source/tutorial/en/pretrain/pub.rst | 74 +++++ docs/source/tutorial/en/pretrain/start.rst | 24 ++ docs/source/tutorial/en/seg.rst | 187 +++++++++++ .../en/seg/SemanticComponentSegmentation.rst | 47 +++ .../seg/StructuralComponentSegmentation.rst | 67 ++++ docs/source/tutorial/en/sif.rst | 145 ++++++++- .../en/tokenization/GensimSegTokenizer.rst | 9 + .../en/tokenization/GensimWordTokenizer.rst | 23 ++ .../en/tokenization/PureTextTokenizer.rst | 31 ++ .../en/tokenization/TextTokenizer.rst | 27 ++ docs/source/tutorial/en/tokenize.rst | 172 +++++++++++ .../en/tokenize/Sentence Segmentation.rst | 3 + .../tutorial/en/tokenize/Tokenization.rst | 29 ++ .../tutorial/en/tokenize/WordSegmentation.rst | 36 +++ docs/source/tutorial/en/vectorization.rst | 157 ++++++++++ .../en/vectorization/WithPre-trainedModel.rst | 42 +++ .../vectorization/WithoutPre-trainedModel.rst | 21 ++ 23 files changed, 1815 insertions(+), 2 deletions(-) create mode 100644 docs/source/tutorial/en/parse.rst create mode 100644 docs/source/tutorial/en/parse/FormulaSyntaxStructureParsing.rst create mode 100644 docs/source/tutorial/en/parse/TextSyntaxStructureParsing.rst create mode 100644 docs/source/tutorial/en/pretrain.rst create mode 100644 docs/source/tutorial/en/pretrain/loading.rst create mode 100644 docs/source/tutorial/en/pretrain/pub.rst create mode 100644 docs/source/tutorial/en/pretrain/start.rst create mode 100644 docs/source/tutorial/en/seg.rst create mode 100644 docs/source/tutorial/en/seg/SemanticComponentSegmentation.rst create mode 100644 docs/source/tutorial/en/seg/StructuralComponentSegmentation.rst create mode 100644 docs/source/tutorial/en/tokenization/GensimSegTokenizer.rst create mode 100644 docs/source/tutorial/en/tokenization/GensimWordTokenizer.rst create mode 100644 docs/source/tutorial/en/tokenization/PureTextTokenizer.rst create mode 100644 docs/source/tutorial/en/tokenization/TextTokenizer.rst create mode 100644 docs/source/tutorial/en/tokenize.rst create mode 100644 docs/source/tutorial/en/tokenize/Sentence Segmentation.rst create mode 100644 docs/source/tutorial/en/tokenize/Tokenization.rst create mode 100644 docs/source/tutorial/en/tokenize/WordSegmentation.rst create mode 100644 docs/source/tutorial/en/vectorization.rst create mode 100644 docs/source/tutorial/en/vectorization/WithPre-trainedModel.rst create mode 100644 docs/source/tutorial/en/vectorization/WithoutPre-trainedModel.rst diff --git a/docs/source/tutorial/en/index.rst b/docs/source/tutorial/en/index.rst index 108a9487..4c8cc040 100644 --- a/docs/source/tutorial/en/index.rst +++ b/docs/source/tutorial/en/index.rst @@ -1,2 +1,52 @@ Get Started -=========== +===== + +* `Standard Item Format `_ + +* `Syntax Parsing `_ + +* `Component Segmentation `_ + +* `Tokenization `_ + +* 
`Pre-training `_
+
+* `Vectorization `_
+
+Main process
+------------
+
+.. figure:: ../../_static/new_flow.png
+
+* `Syntax Parsing `_ : converts the incoming item into SIF format, which means letters and numbers should be between ``$...$``, and the brackets and underlines of choice questions should be converted to the special symbols defined in SIF.
+
+* `Component Segmentation `_ : segments items in SIF format according to the types of their elements, so as to serve the later tokenization module (that is, elements of different types can be tokenized with their corresponding methods).
+
+* `Tokenization `_: tokenizes the segmented items, so as to serve the later vectorization module.
+  Generally, the tokenization method for text can be used directly. For formulas, the ast method can also be used for parsing (calling the formula module).
+
+* `Vectorization `_: this part mainly calls the I2V class and its subclasses. Its function is to vectorize the list of tokenized items, so as to get the corresponding static vectors.
+  For the vectorization module, you can call your own trained model or directly call the provided pre-training model (via the get_pretrained_i2v module).
+
+* **Downstream Model**: process the obtained vectors to get the desired results.
+
+Examples
+--------
+
+To help you quickly understand the functions of this project, this section only shows the usage of the common function interfaces. Intermediate function modules (such as parse, formula, segment, etc.) and more fine-grained interface methods are not shown. For further study, please refer to the relevant documents.
+
+.. nbgallery::
+    :caption: This is a thumbnail gallery:
+    :name: tokenize_gallery
+    :glob:
+
+    Tokenization <../../build/blitz/tokenizer/tokenizer.ipynb>
+
+
+
+.. nbgallery::
+    :caption: This is a thumbnail gallery:
+    :name: vectorization_gallery
+    :glob:
+
+    Vectorization <../../build/blitz/vectorization/total_vector.ipynb>
diff --git a/docs/source/tutorial/en/parse.rst b/docs/source/tutorial/en/parse.rst
new file mode 100644
index 00000000..69608c5e
--- /dev/null
+++ b/docs/source/tutorial/en/parse.rst
@@ -0,0 +1,290 @@
+Syntax Parsing
+==============
+
+In educational resources, texts and formulas have internal implicit or explicit syntax structures. Extracting these structures is of great benefit for further processing:
+
+* Text syntax structure parsing
+
+* Formula syntax structure parsing
+
+The purpose is as follows:
+
+1. Represent the underlines of blanks and the brackets of choices with special identifiers, and wrap alphabets and formulas in ``$...$``, so that items of different types can be cut accurately by the symbol ``$``.
+2. Determine whether the current item is legal and report the error type.
+
+Specific processing content
+---------------------------
+
+1. Match alphabets and numbers outside formulas. Only the alphabets and numbers between two Chinese characters are corrected; the rest of the cases are regarded as formulas that do not conform to LaTeX syntax.
+
+2. Match brackets like "( )" (both English and Chinese format), that is, brackets with no content or only spaces, and replace them with ``$\\SIFChoice$``.
+
+3. Match consecutive underscores, possibly with spaces, and replace them with ``$\\SIFBlank$``.
+
+4. Match LaTeX formulas, check their completeness and analyzability, and report an error for illegal formulas.
+
+
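For example, the rules above convert raw text like the following into SIF form (these input/output pairs are the same ones demonstrated with Parser at the end of this page):
+
+::
+
+    生产某种零件的A工厂25名工人的日加工零件数_ _   ->   生产某种零件的$A$工厂$25$名工人的日加工零件数$\SIFBlank$
+    X的分布列为( )   ->   $X$的分布列为$\SIFChoice$
+
+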
Formula syntax structure parsing
+--------------------------------
+
+This section is mainly realized by the EduNLP.Formula module, which can determine whether a formula has syntax errors and convert the formula into an abstract syntax tree (AST). In practice, this module is often used as part of an intermediate process: its relevant parameters are chosen automatically when the corresponding model is called, so it generally does not need special attention.
+
+Introduction of Main Content
+++++++++++++++++++++++++++++
+
+1. Formula: determines whether the single formula passed in is of str form; if so, the ast method is used for processing, otherwise an error is reported. In addition, the parameter variable_standardization is provided: if it is true, the variable standardization method is used to make sure the same variable has the same variable number.
+
+2. FormulaGroup: if you need to pass in a set of formulas, you can call this interface to get an ast forest. The tree structure in the forest is the same as that of Formula.
+
+Formula
+>>>>>>>>>>>>
+
+Formula first segments the formula of the original text in the word segmentation function. In addition, a ``Formula parse tree`` function is provided, which can represent the abstract syntax tree of a mathematical formula in the form of text or a picture.
+
+This module also provides the function of formula variable standardization, such as determining that 'x' in several sub-formulas is the same variable.
+
+Call the library
+++++++++++++++++
+
+::
+
+    import matplotlib.pyplot as plt
+    from EduNLP.Formula import Formula
+    from EduNLP.Formula.viz import ForestPlotter
+
+Initialization
+++++++++++++++
+
+Incoming parameter: item
+
+item is the LaTeX formula, or the abstract syntax parse tree generated after the formula is parsed; its type is str or List[Dict].
+
+
+ +:: + + >>> f=Formula("x^2 + x+1 = y") + >>> f + + +View the specific content after formula segmentation +++++++++++++++++++++++++++++ + +- View node elements after formula segmentation + +:: + + >>> f.elements + [{'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None}, + {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base'}, + {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, + {'id': 3, 'type': 'bin', 'text': '+', 'role': None}, + {'id': 4, 'type': 'mathord', 'text': 'x', 'role': None}, + {'id': 5, 'type': 'bin', 'text': '+', 'role': None}, + {'id': 6, 'type': 'textord', 'text': '1', 'role': None}, + {'id': 7, 'type': 'rel', 'text': '=', 'role': None}, + {'id': 8, 'type': 'mathord', 'text': 'y', 'role': None}] + +- View the abstract parse tree of formulas + +:: + + >>> f.ast + [{'val': {'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None}, + 'structure': {'bro': [None, 3],'child': [1, 2],'father': None,'forest': None}}, + {'val': {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base'}, + 'structure': {'bro': [None, 2], 'child': None, 'father': 0, 'forest': None}}, + {'val': {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, + 'structure': {'bro': [1, None], 'child': None, 'father': 0, 'forest': None}}, + {'val': {'id': 3, 'type': 'bin', 'text': '+', 'role': None}, + 'structure': {'bro': [0, 4], 'child': None, 'father': None, 'forest': None}}, + {'val': {'id': 4, 'type': 'mathord', 'text': 'x', 'role': None}, + 'structure': {'bro': [3, 5], 'child': None, 'father': None, 'forest': None}}, + {'val': {'id': 5, 'type': 'bin', 'text': '+', 'role': None}, + 'structure': {'bro': [4, 6], 'child': None, 'father': None, 'forest': None}}, + {'val': {'id': 6, 'type': 'textord', 'text': '1', 'role': None}, + 'structure': {'bro': [5, 7], 'child': None, 'father': None, 'forest': None}}, + {'val': {'id': 7, 'type': 'rel', 'text': '=', 'role': None}, + 'structure': {'bro': [6, 8], 'child': None, 'father': None, 'forest': None}}, + {'val': {'id': 8, 'type': 'mathord', 'text': 'y', 'role': None}, + 'structure': {'bro': [7, None],'child': None,'father': None,'forest': None}}] + + >>> print('nodes: ',f.ast_graph.nodes) + nodes: [0, 1, 2, 3, 4, 5, 6, 7, 8] + >>> print('edges: ' ,f.ast_graph.edges) + edges: [(0, 1), (0, 2)] + +- show the abstract parse tree by a picture + +:: + + >>> ForestPlotter().export(f.ast_graph, root_list=[node["val"]["id"] for node in f.ast if node["structure"]["father"] is None],) + >>> plt.show() + + +.. figure:: ../../_static/formula.png + + +Variable standardization ++++++++++++ + +This parameter makes the same variable have the same variable number. + +For example: the number of variable ``x`` is ``0`` and the number of variable ``y`` is ``1``. + +:: + + >>> f.variable_standardization().elements + [{'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None}, + {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base', 'var': 0}, + {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, + {'id': 3, 'type': 'bin', 'text': '+', 'role': None}, + {'id': 4, 'type': 'mathord', 'text': 'x', 'role': None, 'var': 0}, + {'id': 5, 'type': 'bin', 'text': '+', 'role': None}, + {'id': 6, 'type': 'textord', 'text': '1', 'role': None}, + {'id': 7, 'type': 'rel', 'text': '=', 'role': None}, + {'id': 8, 'type': 'mathord', 'text': 'y', 'role': None, 'var': 1}] + +FormulaGroup +>>>>>>>>>>>>>>> + +Call ``FormulaGroup`` class to parse the equations. The related attributes and functions are the same as those above. 
+ +:: + + import matplotlib.pyplot as plt + from EduNLP.Formula import Formula + from EduNLP.Formula import FormulaGroup + from EduNLP.Formula.viz import ForestPlotter + >>> fs = FormulaGroup(["x^2 = y", "x^3 = y^2", "x + y = \pi"]) + >>> fs + ;;> + >>> fs.elements + [{'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None}, + {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base'}, + {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, + {'id': 3, 'type': 'rel', 'text': '=', 'role': None}, + {'id': 4, 'type': 'mathord', 'text': 'y', 'role': None}, + {'id': 5, 'type': 'supsub', 'text': '\\supsub', 'role': None}, + {'id': 6, 'type': 'mathord', 'text': 'x', 'role': 'base'}, + {'id': 7, 'type': 'textord', 'text': '3', 'role': 'sup'}, + {'id': 8, 'type': 'rel', 'text': '=', 'role': None}, + {'id': 9, 'type': 'supsub', 'text': '\\supsub', 'role': None}, + {'id': 10, 'type': 'mathord', 'text': 'y', 'role': 'base'}, + {'id': 11, 'type': 'textord', 'text': '2', 'role': 'sup'}, + {'id': 12, 'type': 'mathord', 'text': 'x', 'role': None}, + {'id': 13, 'type': 'bin', 'text': '+', 'role': None}, + {'id': 14, 'type': 'mathord', 'text': 'y', 'role': None}, + {'id': 15, 'type': 'rel', 'text': '=', 'role': None}, + {'id': 16, 'type': 'mathord', 'text': '\\pi', 'role': None}] + >>> fs.ast + [{'val': {'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None}, + 'structure': {'bro': [None, 3], + 'child': [1, 2], + 'father': None, + 'forest': None}}, + {'val': {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base'}, + 'structure': {'bro': [None, 2], + 'child': None, + 'father': 0, + 'forest': [6, 12]}}, + {'val': {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, + 'structure': {'bro': [1, None], 'child': None, 'father': 0, 'forest': None}}, + {'val': {'id': 3, 'type': 'rel', 'text': '=', 'role': None}, + 'structure': {'bro': [0, 4], 'child': None, 'father': None, 'forest': None}}, + {'val': {'id': 4, 'type': 'mathord', 'text': 'y', 'role': None}, + 'structure': {'bro': [3, None], + 'child': None, + 'father': None, + 'forest': [10, 14]}}, + {'val': {'id': 5, 'type': 'supsub', 'text': '\\supsub', 'role': None}, + 'structure': {'bro': [None, 8], + 'child': [6, 7], + 'father': None, + 'forest': None}}, + {'val': {'id': 6, 'type': 'mathord', 'text': 'x', 'role': 'base'}, + show more (open the raw output data in a text editor) ... + >>> fs.variable_standardization()[0] + [{'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None}, {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base', 'var': 0}, {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, {'id': 3, 'type': 'rel', 'text': '=', 'role': None}, {'id': 4, 'type': 'mathord', 'text': 'y', 'role': None, 'var': 1}] + >>> ForestPlotter().export(fs.ast_graph, root_list=[node["val"]["id"] for node in fs.ast if node["structure"]["father"] is None],) + +.. figure:: ../../_static/formulagroup.png + + +Text syntax structure parsing +-------------------- + +This section is mainly realized by EduNLP.SIF.Parse module. Its main function is to extract letters and numbers in the text and convert them into standard format. + +This module is mainly used as an *middle module* to parse the input text. Users generally do not call this module directly. + +Introduction of main content ++++++++++++++++ + +1. Judge the type of the incoming text in the following order + +* is_chinese: its function is to match Chinese characters[\u4e00-\u9fa5]. + +* is_alphabet: its function is to match alphabets other than formulas. 
Call the library
>>>>>>>>>>>>>>>>

::

    from EduNLP.SIF.Parser import Parser

Input
>>>>>>>

Type: str

Content: question text

::

    >>> text1 = '生产某种零件的A工厂25名工人的日加工零件数_ _'
    >>> text2 = 'X的分布列为( )'
    >>> text3 = '① AB是⊙O的直径,AC是⊙O的切线,BC交⊙O于点E.AC的中点为D'
    >>> text4 = '支持公式如$\\frac{y}{x}$,$\\SIFBlank$,$\\FigureID{1}$,不支持公式如$\\frac{ \\dddot y}{x}$'

Parsing
>>>>>>>>>>>>>>>>>>>>

::

    >>> text_parser1 = Parser(text1)
    >>> text_parser2 = Parser(text2)
    >>> text_parser3 = Parser(text3)
    >>> text_parser4 = Parser(text4)

Description of related parameters
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

- Try to convert the text to the standard format

::

    >>> text_parser1.description_list()
    >>> print('text_parser1.text:',text_parser1.text)
    text_parser1.text: 生产某种零件的$A$工厂$25$名工人的日加工零件数$\SIFBlank$
    >>> text_parser2.description_list()
    >>> print('text_parser2.text:',text_parser2.text)
    text_parser2.text: $X$的分布列为$\SIFChoice$

- Determine whether the text has syntax errors

::

    >>> text_parser3.description_list()
    >>> print('text_parser3.error_flag: ',text_parser3.error_flag)
    text_parser3.error_flag: 1
    >>> text_parser4.description_list()
    >>> print('text_parser4.fomula_illegal_flag: ',text_parser4.fomula_illegal_flag)
    text_parser4.fomula_illegal_flag: 1

diff --git a/docs/source/tutorial/en/parse/FormulaSyntaxStructureParsing.rst b/docs/source/tutorial/en/parse/FormulaSyntaxStructureParsing.rst
new file mode 100644
index 00000000..2fc479c5
--- /dev/null
+++ b/docs/source/tutorial/en/parse/FormulaSyntaxStructureParsing.rst

Formula syntax structure parsing
--------------------------------

This section is mainly realized by the EduNLP.Formula module. It can check whether a formula has syntax errors and convert a legal formula into an ast (abstract syntax tree). In practice, this module is often used as part of an intermediate process, and its parameters can be chosen automatically by the model that calls it, so it generally does not need special attention.

Introduction of Main Content
++++++++++++++++++++++++++++

1. Formula: determines whether the single formula passed in is of type str. If so, the ast method is used for processing; otherwise an error is reported. In addition, the parameter variable_standardization is provided. If this parameter is true, the variable standardization method is used to make sure the same variable has the same variable number.

2. FormulaGroup: if you need to pass in a set of formulas, you can call this interface to get an ast forest. The structure of each tree in the forest is the same as that of Formula.

Formula
>>>>>>>>>>>>

Formula first segments the formula in the original text during word segmentation. In addition, a ``Formula parse tree`` function is provided, which can represent the abstract syntax tree of a mathematical formula in the form of text or a picture.
+ +This module also provides the function of formula variable standardization, such as determining that 'x' in several sub formulas is the same variable. + +Initialization ++++++++++ + +Incoming parameters: item + +Item is the latex formula or the abstract syntax parse tree generated after the formula is parsed and its type is str or List[Dict]. + +:: + + >>> f=Formula("x^2 + x+1 = y") + >>> f + + +View the specific content after formula segmentation +++++++++++++++++++++++++++++ + +- View node elements after formula segmentation + +:: + + >>> f.elements + [{'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None}, + {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base'}, + {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, + {'id': 3, 'type': 'bin', 'text': '+', 'role': None}, + {'id': 4, 'type': 'mathord', 'text': 'x', 'role': None}, + {'id': 5, 'type': 'bin', 'text': '+', 'role': None}, + {'id': 6, 'type': 'textord', 'text': '1', 'role': None}, + {'id': 7, 'type': 'rel', 'text': '=', 'role': None}, + {'id': 8, 'type': 'mathord', 'text': 'y', 'role': None}] + +- View the abstract parsing tree of formulas + +:: + + >>> f.ast + [{'val': {'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None}, + 'structure': {'bro': [None, 3],'child': [1, 2],'father': None,'forest': None}}, + {'val': {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base'}, + 'structure': {'bro': [None, 2], 'child': None, 'father': 0, 'forest': None}}, + {'val': {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, + 'structure': {'bro': [1, None], 'child': None, 'father': 0, 'forest': None}}, + {'val': {'id': 3, 'type': 'bin', 'text': '+', 'role': None}, + 'structure': {'bro': [0, 4], 'child': None, 'father': None, 'forest': None}}, + {'val': {'id': 4, 'type': 'mathord', 'text': 'x', 'role': None}, + 'structure': {'bro': [3, 5], 'child': None, 'father': None, 'forest': None}}, + {'val': {'id': 5, 'type': 'bin', 'text': '+', 'role': None}, + 'structure': {'bro': [4, 6], 'child': None, 'father': None, 'forest': None}}, + {'val': {'id': 6, 'type': 'textord', 'text': '1', 'role': None}, + 'structure': {'bro': [5, 7], 'child': None, 'father': None, 'forest': None}}, + {'val': {'id': 7, 'type': 'rel', 'text': '=', 'role': None}, + 'structure': {'bro': [6, 8], 'child': None, 'father': None, 'forest': None}}, + {'val': {'id': 8, 'type': 'mathord', 'text': 'y', 'role': None}, + 'structure': {'bro': [7, None],'child': None,'father': None,'forest': None}}] + + >>> print('nodes: ',f.ast_graph.nodes) + nodes: [0, 1, 2, 3, 4, 5, 6, 7, 8] + >>> print('edges: ' ,f.ast_graph.edges) + edges: [(0, 1), (0, 2)] + +- show the abstract parse tree by a picture + +:: + + >>> ForestPlotter().export(f.ast_graph, root_list=[node["val"]["id"] for node in f.ast if node["structure"]["father"] is None],) + >>> plt.show() + +.. figure:: ../../../_static/formula.png + +Variable Standardization ++++++++++++ + +This parameter makes the same variable have the same variable number. + +For example: the number of variable ``x`` is ``0`` and the number of variable ``y`` is ``1``. 

::

    >>> f.variable_standardization().elements
    [{'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None},
     {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base', 'var': 0},
     {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'},
     {'id': 3, 'type': 'bin', 'text': '+', 'role': None},
     {'id': 4, 'type': 'mathord', 'text': 'x', 'role': None, 'var': 0},
     {'id': 5, 'type': 'bin', 'text': '+', 'role': None},
     {'id': 6, 'type': 'textord', 'text': '1', 'role': None},
     {'id': 7, 'type': 'rel', 'text': '=', 'role': None},
     {'id': 8, 'type': 'mathord', 'text': 'y', 'role': None, 'var': 1}]

FormulaGroup
>>>>>>>>>>>>>>>

Call the ``FormulaGroup`` class to parse a set of related formulas, such as an equation system. The related attributes and functions are the same as those above.

::

    >>> from EduNLP.Formula import FormulaGroup
    >>> fs = FormulaGroup(["x^2 = y", "x^3 = y^2", "x + y = \pi"])
    >>> fs
    <FormulaGroup: <Formula: x^2 = y>;<Formula: x^3 = y^2>;<Formula: x + y = \pi>>
    >>> fs.elements
    [{'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None},
     {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base'},
     {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'},
     {'id': 3, 'type': 'rel', 'text': '=', 'role': None},
     {'id': 4, 'type': 'mathord', 'text': 'y', 'role': None},
     {'id': 5, 'type': 'supsub', 'text': '\\supsub', 'role': None},
     {'id': 6, 'type': 'mathord', 'text': 'x', 'role': 'base'},
     {'id': 7, 'type': 'textord', 'text': '3', 'role': 'sup'},
     {'id': 8, 'type': 'rel', 'text': '=', 'role': None},
     {'id': 9, 'type': 'supsub', 'text': '\\supsub', 'role': None},
     {'id': 10, 'type': 'mathord', 'text': 'y', 'role': 'base'},
     {'id': 11, 'type': 'textord', 'text': '2', 'role': 'sup'},
     {'id': 12, 'type': 'mathord', 'text': 'x', 'role': None},
     {'id': 13, 'type': 'bin', 'text': '+', 'role': None},
     {'id': 14, 'type': 'mathord', 'text': 'y', 'role': None},
     {'id': 15, 'type': 'rel', 'text': '=', 'role': None},
     {'id': 16, 'type': 'mathord', 'text': '\\pi', 'role': None}]
    >>> fs.ast
    [{'val': {'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None},
     'structure': {'bro': [None, 3],
     'child': [1, 2],
     'father': None,
     'forest': None}},
     {'val': {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base'},
     'structure': {'bro': [None, 2],
     'child': None,
     'father': 0,
     'forest': [6, 12]}},
     {'val': {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'},
     'structure': {'bro': [1, None], 'child': None, 'father': 0, 'forest': None}},
     {'val': {'id': 3, 'type': 'rel', 'text': '=', 'role': None},
     'structure': {'bro': [0, 4], 'child': None, 'father': None, 'forest': None}},
     {'val': {'id': 4, 'type': 'mathord', 'text': 'y', 'role': None},
     'structure': {'bro': [3, None],
     'child': None,
     'father': None,
     'forest': [10, 14]}},
     {'val': {'id': 5, 'type': 'supsub', 'text': '\\supsub', 'role': None},
     'structure': {'bro': [None, 8],
     'child': [6, 7],
     'father': None,
     'forest': None}},
     {'val': {'id': 6, 'type': 'mathord', 'text': 'x', 'role': 'base'},
     ...]
    >>> fs.variable_standardization()[0]
    [{'id': 0, 'type': 'supsub', 'text': '\\supsub', 'role': None}, {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base', 'var': 0}, {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, {'id': 3, 'type': 'rel', 'text': '=', 'role': None}, {'id': 4, 'type': 'mathord', 'text': 'y', 'role': None, 'var': 1}]
    >>> ForestPlotter().export(fs.ast_graph, root_list=[node["val"]["id"] for node in fs.ast if node["structure"]["father"] is None],)

.. figure:: ../../../_static/formulagroup.png

diff --git a/docs/source/tutorial/en/parse/TextSyntaxStructureParsing.rst b/docs/source/tutorial/en/parse/TextSyntaxStructureParsing.rst
new file mode 100644
index 00000000..bdfe6848
--- /dev/null
+++ b/docs/source/tutorial/en/parse/TextSyntaxStructureParsing.rst

Text syntax structure parsing
-----------------------------

This section is mainly realized by the EduNLP.SIF.Parse module. Its main function is to extract the letters and numbers in the text and convert the text into the standard format.

This module is mainly used as an *intermediate module* to parse the input text. Users generally do not call this module directly.

Introduction of Main Content
++++++++++++++++++++++++++++

1. Judge the type of the incoming text, checking in the following order:

* is_chinese: matches Chinese characters [\u4e00-\u9fa5].

* is_alphabet: matches alphabetic characters outside formulas. Only the letters between two Chinese characters are corrected (wrapped with ``$...$``); all other cases are regarded as formulas that do not conform to latex syntax.

* is_number: matches numbers outside formulas. Only the numbers between two Chinese characters are corrected; all other cases are regarded as formulas that do not conform to latex syntax.

2. Match latex formulas:

* If Chinese characters appear inside a latex formula, a warning is printed (only once).

* The _is_formula_legal function checks the completeness and parsability of each latex formula, and an error is reported for formulas that do not conform to latex syntax.

Input
>>>>>>>

Type: str

Content: question text

::

    >>> text1 = '生产某种零件的A工厂25名工人的日加工零件数_ _'
    >>> text2 = 'X的分布列为( )'
    >>> text3 = '① AB是⊙O的直径,AC是⊙O的切线,BC交⊙O于点E.AC的中点为D'
    >>> text4 = '支持公式如$\\frac{y}{x}$,$\\SIFBlank$,$\\FigureID{1}$,不支持公式如$\\frac{ \\dddot y}{x}$'

Parsing
>>>>>>>>>>>>>>>>>>>>

::

    >>> text_parser1 = Parser(text1)
    >>> text_parser2 = Parser(text2)
    >>> text_parser3 = Parser(text3)
    >>> text_parser4 = Parser(text4)

Description of related parameters
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

- Try to convert the text to the standard format

::

    >>> text_parser1.description_list()
    >>> print('text_parser1.text:',text_parser1.text)
    text_parser1.text: 生产某种零件的$A$工厂$25$名工人的日加工零件数$\SIFBlank$
    >>> text_parser2.description_list()
    >>> print('text_parser2.text:',text_parser2.text)
    text_parser2.text: $X$的分布列为$\SIFChoice$

- Determine whether the text has syntax errors

::

    >>> text_parser3.description_list()
    >>> print('text_parser3.error_flag: ',text_parser3.error_flag)
    text_parser3.error_flag: 1
    >>> text_parser4.description_list()
    >>> print('text_parser4.fomula_illegal_flag: ',text_parser4.fomula_illegal_flag)
    text_parser4.fomula_illegal_flag: 1
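These flags can drive a simple filtering loop before further processing. The sketch below is illustrative only; it assumes nothing beyond the attributes shown above::

    from EduNLP.SIF.Parser import Parser

    def keep_standardizable(texts):
        # Keep only the texts that Parser can convert to the standard format
        results = []
        for t in texts:
            p = Parser(t)
            p.description_list()
            if p.error_flag or p.fomula_illegal_flag:
                continue  # syntax error or illegal latex formula
            results.append(p.text)
        return results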
diff --git a/docs/source/tutorial/en/pretrain.rst b/docs/source/tutorial/en/pretrain.rst
new file mode 100644
index 00000000..9319b87d
--- /dev/null
+++ b/docs/source/tutorial/en/pretrain.rst

Pre-training
============

In the field of NLP, pre-trained language models have become a very important basic technology.
In this chapter, we will introduce the pre-training tools in EduNLP:

* How to train on a corpus to get a pre-trained model
* How to load a pre-trained model
* The public pre-trained models

Import modules
--------------

::

    from EduNLP.I2V import get_pretrained_i2v
    from EduNLP.Vector import get_pretrained_t2v

Train the Model
---------------

Call the train_vector function interface directly to make training a model easier. This section calls the relevant training models in the gensim library. At present, the training methods "sg", "cbow", "fasttext", "d2v", "bow" and "tfidf" are provided. The parameter embedding_dim is also provided so that users can choose the vector dimension according to their needs.

Basic Steps
##################

1. Determine the type of model and select the appropriate tokenizer (GensimWordTokenizer or GensimSegTokenizer) to finish tokenization.

2. Call the train_vector function to get the required pre-trained model.

Examples:

::

    >>> tokenizer = GensimWordTokenizer(symbol="gmas", general=True)
    >>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\
    ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$")
    >>> print(token_item.tokens[:10])
    ['公式', '[FORMULA]', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[FORMULA]']

    # train 10-dimensional vectors with the d2v method
    train_vector(sif_items, "../../../data/w2v/gensim_luna_stem_tf_", 10, method="d2v")


Load models
-----------

Pass the obtained model file to the I2V module to load the model.

Examples:

::

    >>> model_path = "../test_model/test_gensim_luna_stem_tf_d2v_256.bin"
    >>> i2v = D2V("text", "d2v", filepath=model_path, pretrained_t2v=False)

The overview of our public models
---------------------------------

Version description
###################

First-level version:

* Public version 1 (luna_pub): college entrance examination
* Public version 2 (luna_pub_large): college entrance examination + regional examinations

Second-level version:

* Minor subjects (Chinese, Math, English, History, Geography, Politics, Biology, Physics, Chemistry)
* Major subjects (science, arts and all subjects)

Third-level version (to be finished):

* Without third-party initializers
* With third-party initializers

Description of the training data in the models
##############################################

* Currently, the data used in the w2v and d2v models come from senior high school subjects.
* test data: `[OpenLUNA.json] `_

At present, the following models are provided; more models for different subjects and question types are being trained and will be released later:
 "d2v_all_256" (all subjects), "d2v_sci_256" (science), "d2v_eng_256" (English), "d2v_lit_256" (arts)


Examples of Model Training
--------------------------

Get the dataset
####################

.. toctree::
    :maxdepth: 1
    :titlesonly:

    prepare_dataset <../../build/blitz/pretrain/prepare_dataset.ipynb>

An example of d2v in gensim model
#################################

.. toctree::
    :maxdepth: 1
    :titlesonly:

    d2v_bow_tfidf <../../build/blitz/pretrain/gensim/d2v_bow_tfidf.ipynb>
    d2v_general <../../build/blitz/pretrain/gensim/d2v_general.ipynb>
    d2v_stem_tf <../../build/blitz/pretrain/gensim/d2v_stem_tf.ipynb>

An example of w2v in gensim model
#################################

.. toctree::
    :maxdepth: 1
    :titlesonly:

    w2v_stem_text <../../build/blitz/pretrain/gensim/w2v_stem_text.ipynb>
    w2v_stem_tf <../../build/blitz/pretrain/gensim/w2v_stem_tf.ipynb>

An example of seg_token
#######################

..
toctree:: + :maxdepth: 1 + :titlesonly: + + d2v.ipynb <../../build/blitz/pretrain/seg_token/d2v.ipynb> + d2v_d1 <../../build/blitz/pretrain/seg_token/d2v_d1.ipynb> + d2v_d2 <../../build/blitz/pretrain/seg_token/d2v_d2.ipynb> \ No newline at end of file diff --git a/docs/source/tutorial/en/pretrain/loading.rst b/docs/source/tutorial/en/pretrain/loading.rst new file mode 100644 index 00000000..31fa3ea8 --- /dev/null +++ b/docs/source/tutorial/en/pretrain/loading.rst @@ -0,0 +1,11 @@ +Load models +-------- + +Transfer the obtained model to the I2V module to load the model. + +Examples: + +:: + + >>> model_path = "../test_model/test_gensim_luna_stem_tf_d2v_256.bin" + >>> i2v = D2V("text","d2v",filepath=model_path, pretrained_t2v = False) diff --git a/docs/source/tutorial/en/pretrain/pub.rst b/docs/source/tutorial/en/pretrain/pub.rst new file mode 100644 index 00000000..34407745 --- /dev/null +++ b/docs/source/tutorial/en/pretrain/pub.rst @@ -0,0 +1,74 @@ +The overview of our public model +------------ + + +Version Description +################## + +First level version: + +* Public version 1 (luna_pub): college entrance examination +* Public version 2 (luna_pub_large): college entrance examination + regional examination + +Second level version: + +* Minor subjects(Chinese,Math,English,History,Geography,Politics,Biology,Physics,Chemistry) +* Major subjects(science, arts and all subject) + +Third level version【to be finished】: + +* Don't use third-party initializers +* Use third-party initializers + +Description of train data in models +################## + +* Currently, the data used in w2v and d2v models are the subjects of senior high school. +* test data:`[OpenLUNA.json] `_ + +At present, the following models are provided. More models of different subjects and question types are being trained. Please look forward to it. + "d2v_all_256" (all subject), "d2v_sci_256" (Science), "d2v_eng_256" (English),"d2v_lit_256" (Arts) + +Examples of model training +------------ + +Get the dataset +#################### + +.. toctree:: + :maxdepth: 1 + :titlesonly: + + prepare_dataset <../../../build/blitz/pretrain/prepare_dataset.ipynb> + +An example of d2v in gensim model +#################### + +.. toctree:: + :maxdepth: 1 + :titlesonly: + + d2v_bow_tfidf <../../../build/blitz/pretrain/gensim/d2v_bow_tfidf.ipynb> + d2v_general <../../../build/blitz/pretrain/gensim/d2v_general.ipynb> + d2v_stem_tf <../../../build/blitz/pretrain/gensim/d2v_stem_tf.ipynb> + +An example of w2v in gensim model +#################### + +.. toctree:: + :maxdepth: 1 + :titlesonly: + + w2v_stem_text <../../../build/blitz/pretrain/gensim/w2v_stem_text.ipynb> + w2v_stem_tf <../../../build/blitz/pretrain/gensim/w2v_stem_tf.ipynb> + +An example of seg_token +#################### + +.. toctree:: + :maxdepth: 1 + :titlesonly: + + d2v.ipynb <../../../build/blitz/pretrain/seg_token/d2v.ipynb> + d2v_d1 <../../../build/blitz/pretrain/seg_token/d2v_d1.ipynb> + d2v_d2 <../../../build/blitz/pretrain/seg_token/d2v_d2.ipynb> diff --git a/docs/source/tutorial/en/pretrain/start.rst b/docs/source/tutorial/en/pretrain/start.rst new file mode 100644 index 00000000..9c5bc241 --- /dev/null +++ b/docs/source/tutorial/en/pretrain/start.rst @@ -0,0 +1,24 @@ +Train the model +------------ + +Call train_Vector function interface directly to make the training model easier. This section calls the relevant training models in the gensim library. At present, the training methods of "sg"、 "cbow"、 "fastext"、 "d2v"、 "bow"、 "tfidf" are provided. 
Parameter embedding_dim is also provided for users to determine vector dimension according to their needs. + +Basic Steps +################## + +1.Determine the type of model and select the appropriate tokenizer (GensimWordTokenizer、 GensimSegTokenizer) to finish tokenization. + +2.Call train_vector function to get the required pre-trained model。 + +Examples: + +:: + + >>> tokenizer = GensimWordTokenizer(symbol="gmas", general=True) + >>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\ + ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$") + >>> print(token_item.tokens[:10]) + ['公式', '[FORMULA]', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[FORMULA]'] + + # 10 dimension with fasstext method + train_vector(sif_items, "../../../data/w2v/gensim_luna_stem_tf_", 10, method="d2v") diff --git a/docs/source/tutorial/en/seg.rst b/docs/source/tutorial/en/seg.rst new file mode 100644 index 00000000..ad2696a2 --- /dev/null +++ b/docs/source/tutorial/en/seg.rst @@ -0,0 +1,187 @@ +Component Segmentation +========= + +Educational resource is a kind of multimodal data, including data such as text, picture, formula and so on. +At the same time, it may also contain different components semantically, such as question stems, options, etc. Therefore, we first need to identify and segment the different components of educational resources: + +* Semantic Component Segmentation +* Structural Component Segmentation + +Main Processing Contents +-------------------- + +1. Convert multiple-choice questions in the form of dict to qualified item by `Syntax parsing `_; + +2. The input items are segmented and grouped according to the element type. + +Semantic Component Segmentation +------------ + +Because multiple-choice questions are given in the form of dict, it is necessary to convert them into text format while retaining their data relationship. This function can be realized by dict2str4sif function which can convert multiple-choice question items into character format and identify question stem and options。 + +Import Modules ++++++++++ + +:: + + from EduNLP.utils import dict2str4sif + +Basic Usage +++++++++++++++++++ + +:: + + >>> item = { + ... "stem": r"若复数$z=1+2 i+i^{3}$,则$|z|=$", + ... "options": ['0', '1', r'$\sqrt{2}$', '2'], + ... } + >>> dict2str4sif(item) # doctest: +ELLIPSIS + '$\\SIFTag{stem_begin}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem_end}$$\\SIFTag{options_begin}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$$\\SIFTag{list_3}$2$\\SIFTag{options_end}$' + +Optional additional parameters / interfaces +++++++++++++++++++++++ + +1.add_list_no_tag: if this parameter is true, it means that you need to count the labels in the options section. + +:: + + >>> dict2str4sif(item, add_list_no_tag=True) # doctest: +ELLIPSIS + '$\\SIFTag{stem_begin}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem_end}$$\\SIFTag{options_begin}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$$\\SIFTag{list_3}$2$\\SIFTag{options_end}$' + + >>> dict2str4sif(item, add_list_no_tag=False) # doctest: +ELLIPSIS + '$\\SIFTag{stem_begin}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem_end}$$\\SIFTag{options_begin}$0$\\SIFSep$1$\\SIFSep$$\\sqrt{2}$$\\SIFSep$2$\\SIFTag{options_end}$' + +2.tag_mode: The location for the label can be selected using this parameter. 'delimiter' is to label both the beginning and the end,'head' is to label only the head, and 'tail' is to label only the tail. 
+ +:: + + >>> dict2str4sif(item, tag_mode="head") # doctest: +ELLIPSIS + '$\\SIFTag{stem}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{options}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$$\\SIFTag{list_3}$2' + + >>> dict2str4sif(item, tag_mode="tail") # doctest: +ELLIPSIS + '若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$$\\SIFTag{list_3}$2$\\SIFTag{options}$' + +3.key_as_tag: If this parameter is false, this process will only adds $\SIFSep$ between the options without distinguishing the type of segmentation label. + +:: + + >>> dict2str4sif(item, key_as_tag=False) + '若复数$z=1+2 i+i^{3}$,则$|z|=$0$\\SIFSep$1$\\SIFSep$$\\sqrt{2}$$\\SIFSep$2' + +Structural Component Segmentation +------------ + +This step is to segment sliced items. In this step, there is a depth option. You can select all positions or some labels for segmentation according to your needs, such as \SIFSep and \SIFTag. You can also select where to add labels, either at the head and tail or only at the head or tail. + + +There are two modes: + +* linear mode: it is used for text processing (word segmentation using jieba library); + +* ast mode: it is used to parse the formula. + +Basic Segmentation process: + +- Match components with regular expression matching + +- Process the components with special structures, such as converting the base64 encoded picture to numpy form + +- Classify the elements into each element group + +- Enter the corresponding parameters as required to get the filtered results + +Import Modules ++++++++++ + +:: + + from EduNLP.SIF.segment import seg + from EduNLP.SIF import sif4sci + +Basic Usage +++++++++++++++++++ + +:: + + >>> test_item = r"如图所示,则$\bigtriangleup ABC$的面积是$\SIFBlank$。$\FigureID{1}$" + >>> seg(test_item) + >>> ['如图所示,则', '\\bigtriangleup ABC', '的面积是', '\\SIFBlank', '。', \FigureID{1}] + +Optional additional parameters/interfaces +++++++++++++++++++++++ + +1.describe: count the number of elements of different types + +:: + + >>> s.describe() + {'t': 3, 'f': 1, 'g': 1, 'm': 1} + +2.filter: this interface can screen out one or more types of elements. + +Using this interface, you can pass in a "keep" parameter or a special character directly to choose what type of elements to retain. + +Element type represented by symbol: + +- "t": text +- "f": formula +- "g": figure +- "m": question mark +- "a": tag +- "s": sep tag + +:: + + >>> with s.filter("f"): + ... s + ['如图所示,则', '的面积是', '\\SIFBlank', '。', \FigureID{1}] + >>> with s.filter(keep="t"): + ... 
s + ['如图所示,则', '的面积是', '。'] + +3.symbol: this interface can convert some types of data into special symbols + +Element type represented by symbol: + +- "t": text +- "f": formula +- "g": figure +- "m": question mark + +:: + + >>> seg(test_item, symbol="fgm") + ['如图所示,则', '[FORMULA]', '的面积是', '[MARK]', '。', '[FIGURE]'] + >>> seg(test_item, symbol="tfgm") + ['[TEXT]', '[FORMULA]', '[TEXT]', '[MARK]', '[TEXT]', '[FIGURE]'] + +In addition,sif4sci function is also provided, which can easily convert items into the result processed by Structural Component Segmentation + +:: + + >>> segments = sif4sci(item["stem"], figures=figures, tokenization=False) + >>> segments + ['如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形', 'ABC', '的斜边', 'BC', ', 直角边', 'AB', ', ', 'AC', '.', '\\bigtriangleup ABC', '的三边所围成的区域记为', 'I', ',黑色部分记为', 'II', ', 其余部分记为', 'III', '.在整个图形中随机取一点,此点取自', 'I,II,III', '的概率分别记为', 'p_1,p_2,p_3', ',则', '\\SIFChoice', \FigureID{1}] + +- When calling this function, you can selectively output a certain type of data according to your needs + +:: + + >>> segments.formula_segments + ['ABC', + 'BC', + 'AB', + 'AC', + '\\bigtriangleup ABC', + 'I', + 'II', + 'III', + 'I,II,III', + 'p_1,p_2,p_3'] + +- Similar to seg function, sif4sci function also provides depth options to help with your research ----- By modifying the ``symbol`` parameter, different components can be transformed into specific markers. + +:: + + >>> sif4sci(item["stem"], figures=figures, tokenization=False, symbol="tfgm") + ['[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[MARK]', '[FIGURE]'] diff --git a/docs/source/tutorial/en/seg/SemanticComponentSegmentation.rst b/docs/source/tutorial/en/seg/SemanticComponentSegmentation.rst new file mode 100644 index 00000000..3901f4cb --- /dev/null +++ b/docs/source/tutorial/en/seg/SemanticComponentSegmentation.rst @@ -0,0 +1,47 @@ +Semantic Component Segmentation +------------ + +Because multiple-choice questions are given in the form of dict, it is necessary to convert them into text format while retaining their data relationship. This function can be realized by dict2str4sif function which can convert multiple-choice question items into character format and identify question stem and options。 + + +Basic Usage +++++++++++++++++++ + +:: + + >>> item = { + ... "stem": r"若复数$z=1+2 i+i^{3}$,则$|z|=$", + ... "options": ['0', '1', r'$\sqrt{2}$', '2'], + ... } + >>> dict2str4sif(item) # doctest: +ELLIPSIS + '$\\SIFTag{stem_begin}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem_end}$$\\SIFTag{options_begin}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$$\\SIFTag{list_3}$2$\\SIFTag{options_end}$' + +Optional additional parameters / interfaces +++++++++++++++++++++++ + +1.add_list_no_tag: if this parameter is true, it means that you need to count the labels in the options section. 
+ +:: + + >>> dict2str4sif(item, add_list_no_tag=True) # doctest: +ELLIPSIS + '$\\SIFTag{stem_begin}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem_end}$$\\SIFTag{options_begin}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$$\\SIFTag{list_3}$2$\\SIFTag{options_end}$' + + >>> dict2str4sif(item, add_list_no_tag=False) # doctest: +ELLIPSIS + '$\\SIFTag{stem_begin}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem_end}$$\\SIFTag{options_begin}$0$\\SIFSep$1$\\SIFSep$$\\sqrt{2}$$\\SIFSep$2$\\SIFTag{options_end}$' + +2.tag_mode: The location for the label can be selected using this parameter. 'delimiter' is to label both the beginning and the end,'head' is to label only the head, and 'tail' is to label only the tail. + +:: + + >>> dict2str4sif(item, tag_mode="head") # doctest: +ELLIPSIS + '$\\SIFTag{stem}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{options}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$$\\SIFTag{list_3}$2' + + >>> dict2str4sif(item, tag_mode="tail") # doctest: +ELLIPSIS + '若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$$\\SIFTag{list_3}$2$\\SIFTag{options}$' + +3.key_as_tag: If this parameter is false, this process will only adds $\SIFSep$ between the options without distinguishing the type of segmentation label. + +:: + + >>> dict2str4sif(item, key_as_tag=False) + '若复数$z=1+2 i+i^{3}$,则$|z|=$0$\\SIFSep$1$\\SIFSep$$\\sqrt{2}$$\\SIFSep$2' \ No newline at end of file diff --git a/docs/source/tutorial/en/seg/StructuralComponentSegmentation.rst b/docs/source/tutorial/en/seg/StructuralComponentSegmentation.rst new file mode 100644 index 00000000..8661c3d6 --- /dev/null +++ b/docs/source/tutorial/en/seg/StructuralComponentSegmentation.rst @@ -0,0 +1,67 @@ +Structural Component Segmentation +------------ + +This step is to segment sliced items. In this step, there is a depth option. You can select all positions or some labels for segmentation according to your needs, such as \SIFSep and \SIFTag. You can also select where to add labels, either at the head and tail or only at the head or tail. + + +There are two modes: + +* linear mode: it is used for text processing (word segmentation using jieba library); + +* ast mode: it is used to parse the formula. + +Basic Usage +++++++++++++++++++ + +:: + + >>> test_item = r"如图所示,则$\bigtriangleup ABC$的面积是$\SIFBlank$。$\FigureID{1}$" + >>> seg(test_item) + >>> ['如图所示,则', '\\bigtriangleup ABC', '的面积是', '\\SIFBlank', '。', \FigureID{1}] + +Optional additional parameters/interfaces +++++++++++++++++++++++ + +1.describe: count the number of elements of different types + +:: + + >>> s.describe() + {'t': 3, 'f': 1, 'g': 1, 'm': 1} + +2.filter: this interface can screen out one or more types of elements. + +Using this interface, you can pass in a "keep" parameter or a special character directly to choose what type of elements to retain. + +Element type represented by symbol: + "t": text + "f": formula + "g": figure + "m": question mark + "a": tag + "s": sep tag + +:: + + >>> with s.filter("f"): + ... s + ['如图所示,则', '的面积是', '\\SIFBlank', '。', \FigureID{1}] + >>> with s.filter(keep="t"): + ... 
s + ['如图所示,则', '的面积是', '。'] + +3.symbol: this interface can convert some types of data into special symbols + +Element type represented by symbol: + +- "t": text +- "f": formula +- "g": figure +- "m": question mark + +:: + + >>> seg(test_item, symbol="fgm") + ['如图所示,则', '[FORMULA]', '的面积是', '[MARK]', '。', '[FIGURE]'] + >>> seg(test_item, symbol="tfgm") + ['[TEXT]', '[FORMULA]', '[TEXT]', '[MARK]', '[TEXT]', '[FIGURE]'] diff --git a/docs/source/tutorial/en/sif.rst b/docs/source/tutorial/en/sif.rst index 0cbe7cf3..877cb503 100644 --- a/docs/source/tutorial/en/sif.rst +++ b/docs/source/tutorial/en/sif.rst @@ -1,2 +1,145 @@ Standard Item Format -==================== +=============== + +version: 0.2 + +For the convenience of follow-up research and use, we need a unified test question grammar standard. + +Grammar Rules +----------- + +1. Only Chinese characters, Chinese and English punctuation and line breaks are allowed in the question text. + +2. Represent underlines of blanks and brackets of choices with ``\$\SIFBlank\$`` and ``\$\SIFChoice\$`` respectively. + +3. We use ``$\FigureID{ uuid }$`` or Base64 to represent pictures. Especially, ``$\FormFigureID{ uuid }$`` is used to represent formulas pictures. + +4. Text format description: we represent text in different styles with ``$\textf{item,CHAR_EN}$``. Currently, we have defined some styles: b-bold, i-italic, u-underline, w-wave, d-dotted, t-title. CHAR_EN Labels can be mixed and sorted alphabetically. An example: $\textf{EduNLP, b}$ looks **EduNLP** + +5. Other mathematical symbols like English letters, Roman characters and numbers need to be expressed in latex format, that is, embedded in `$$`. + +6. For the entry standard of molecular formula, please refer to `INCHI `_ for the time being. + +7. Currently, there are no requirements for latex internal syntax. + +:: + + 1. Item -> CHARACTER|EN_PUN_LIST|CH_PUN_LIST|FORMULA|QUES_MARK + 2. EN_PUN_LIST -> [',', '.', '?', '!', ':', ';', '\'', '\"', '(', ')', ' ','_','/','|','\\','<','>','[',']','-'] + 3. CH_PUN_LIST -> [',', '。', '!', '?', ':',';', '‘', '’', '“', '”', '(', ')', ' ', '、','《','》','—','.'] + 4. FORMULA -> $latex formula$ | $\FormFigureID{UUID}$ | $\FormFigureBase64{BASE64}$ + 5. FIGURE -> $\FigureID{UUID}$ | $\FigureBase64{BASE64}$ + 6. UUID -> [a-zA-Z\-0-9]+ + 7. CHARACTER -> CHAR_EN | CHAR_CH + 8. CHAR_EN -> [a-zA-Z]+ + 9. CHAR_CH -> [\u4e00-\u9fa5]+ + 10. DIGITAL -> [0-9]+ + 11. QUES_MARK -> $\SIFBlank$ | $\SIFChoice$ + + +Tips ++++++++++++++++ + +1. Reserved characters and escape characters. + +2. Numbers. + +3. Choices and blanks. + +4. A single number or letter is also required to be between `$$` (automatic verification could already realize it). + +5. Try to make sure Chinese is not included in the latex formula such as ``\text{CHAR_CH}``. + +6. When importing data using MySQL database, an ``\`` is automatically ignored which needs to be further processed as``\\``. + +Examples +----------------- + +Standard Format: + +:: + + 1. 若$x,y$满足约束条件$\\left\\{\\begin{array}{c}2 x+y-2 \\leq 0 \\\\ x-y-1 \\geq 0 \\\\ y+1 \\geq 0\\end{array}\\right.$,则$z=x+7 y$的最大值$\\SIFUnderline$' + + 2. 已知函数$f(x)=|3 x+1|-2|x|$画出$y=f(x)$的图像求不等式$f(x)>f(x+1)$的解集$\\PictureID{3bf2ddf4-8af1-11eb-b750-b46bfc50aa29}$$\\PictureID{59b8bd14-8af1-11eb-93a5-b46bfc50aa29}$$\\PictureID{63118b3a-8b75-11eb-a5c0-b46bfc50aa29}$$\\PictureID{6a006179-8b76-11eb-b386-b46bfc50aa29}$$\\PictureID{088f15eb-8b7c-11eb-a86f-b46bfc50aa29}$ + +Non-standard Format: + +1. 
Letters, numbers and mathematical symbols are mixed: + + For example: + + ``完成下面的2x2列联表,`` + + ``(单位:m3)`` + + ``则输出的n=`` + +2. Some special mathematical symbols are not represented by the latex formula: + + For example: + + ``命题中真命题的序号是 ①`` + + ``AB是⊙O的直径,AC是⊙O的切线,BC交⊙O于点E.若D为AC的中点`` + +3. There are unicode encoded characters in the text: + + For example: + ``则$a$的取值范围是(\u3000\u3000)`` + +Functions for judging whether text is in SIF format and converting to SIF format +-------------------------------------------- + +Call the Library +++++++++ +:: + + from EduNLP.SIF import is_sif, to_sif + +is_sif ++++++++++++ + +:: + + >>> text1 = '若$x,y$满足约束条件' + >>> text2 = '$\\left\\{\\begin{array}{c}2 x+y-2 \\leq 0 \\\\ x-y-1 \\geq 0 \\\\ y+1 \\geq 0\\end{array}\\right.$,' + >>> text3 = '则$z=x+7 y$的最大值$\\SIFUnderline$' + >>> text4 = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...' + >>> is_sif(text1) + True + >>> is_sif(text2) + True + >>> is_sif(text3) + True + >>> is_sif(text4) + False + +to_sif ++++++++++++ + +:: + + >>> text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...' + >>> to_sif(text) + '某校一个课外学习小组为研究某作物的发芽率$y$和温度$x$(单位...' + + +Change Log +---------------- + +2021-05-18 + +Changed + +1. Originally, we use ``\$\SIFUnderline\$`` and ``\$\SIFBracket\$`` to represent underlines of blanks and brackets of choices. Now we represent them with ``\$\SIFBlank\$`` and ``\$\SIFChoice\$``. + +2. Originally, we used ``$\PictureID{ uuid }$`` to represent pictures, but now we use ``$\FigureID{ uuid }$`` instead. Especially, ``$\FormFigureID{ uuid }$`` is used to represent formulas pictures. + +2021-06-28 + +Added: + +1. There should not be line breaks between the notation ``$$``. + +2. Add text format description. diff --git a/docs/source/tutorial/en/tokenization/GensimSegTokenizer.rst b/docs/source/tutorial/en/tokenization/GensimSegTokenizer.rst new file mode 100644 index 00000000..eb624e94 --- /dev/null +++ b/docs/source/tutorial/en/tokenization/GensimSegTokenizer.rst @@ -0,0 +1,9 @@ +GensimSegTokenizer +===================== + +By default, the pictures, separators, blanks in the question text and other parts of the incoming item are converted into special characters for data security and tokenization of text, formulas and labels. Also, the tokenizer uses linear analysis method for text and abstract analysis method of syntax tree for formulas. + +Compared to GensimWordTokenizer, the main differences are: + +* It provides the depth option for segmentation position, such as \SIFSep and \SIFTag. +* By default, labels are inserted in the header of item components (such as text and formula). \ No newline at end of file diff --git a/docs/source/tutorial/en/tokenization/GensimWordTokenizer.rst b/docs/source/tutorial/en/tokenization/GensimWordTokenizer.rst new file mode 100644 index 00000000..98d4b10a --- /dev/null +++ b/docs/source/tutorial/en/tokenization/GensimWordTokenizer.rst @@ -0,0 +1,23 @@ +GensimWordTokenizer +===================== + +By default, the pictures, blanks in the question text and other parts of the incoming item are converted into special characters for data security and the tokenization of text, formulas, labels and separators. Also, the tokenizer uses linear analysis method for text and abstract syntax tree method for formulas respectively. You can choose each of them by ``general`` parameter: + +-true, it means that the incoming item conforms to SIF and the linear analysis method should be used. +-false, it means that the incoming item doesn't conform to SIF and the abstract syntax tree method should be used. 
+ +Examples +---------- + +:: + + >>> tokenizer = GensimWordTokenizer(symbol="gmas", general=True) + >>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\ + ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$") + >>> print(token_item.tokens[:10]) + ['公式', '[FORMULA]', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[FORMULA]'] + >>> tokenizer = GensimWordTokenizer(symbol="fgmas", general=False) + >>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\ + ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$") + >>> print(token_item.tokens[:10]) + ['公式', '[FORMULA]', '如图', '[FIGURE]', '[FORMULA]', '约束条件', '公式', '[FORMULA]', '[SEP]', '[FORMULA]'] diff --git a/docs/source/tutorial/en/tokenization/PureTextTokenizer.rst b/docs/source/tutorial/en/tokenization/PureTextTokenizer.rst new file mode 100644 index 00000000..8c36e67c --- /dev/null +++ b/docs/source/tutorial/en/tokenization/PureTextTokenizer.rst @@ -0,0 +1,31 @@ +PureTextTokenizer +================ + +By default, the pictures, labels, separators, blanks in the question text and other parts of the incoming item are converted into special characters for data security. At the same time, special formulas such as $\\FormFigureID{...}$ and $\\FormFigureBase64{...}$ are screened out to facilitate the tokenization of text and plain text formulas. Also, the tokenizer uses linear analysis method for text and formulas, and the ``key`` parameter provided is used to preprocess the incoming item, which will be improved based on users' requirements in the future. + +Examples +---------- + +:: + + >>> tokenizer = PureTextTokenizer() + >>> items = ["有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\ + ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$"] + >>> tokens = tokenizer(items) + >>> next(tokens)[:10] + ['公式', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[SEP]', 'z'] + >>> items = ["已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$"] + >>> tokens = tokenizer(items) + >>> next(tokens) # doctest: +NORMALIZE_WHITESPACE + ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', + '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', + '\\quad', 'A', '\\cap', 'B', '='] + >>> items = [{ + ... "stem": "已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$", + ... "options": ["1", "2"] + ... }] + >>> tokens = tokenizer(items, key=lambda x: x["stem"]) + >>> next(tokens) # doctest: +NORMALIZE_WHITESPACE + ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', + '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', + '\\quad', 'A', '\\cap', 'B', '='] diff --git a/docs/source/tutorial/en/tokenization/TextTokenizer.rst b/docs/source/tutorial/en/tokenization/TextTokenizer.rst new file mode 100644 index 00000000..08991be6 --- /dev/null +++ b/docs/source/tutorial/en/tokenization/TextTokenizer.rst @@ -0,0 +1,27 @@ +TextTokenizer +================ + +By default, the pictures, labels, separators, blanks in the question text and other parts of the incoming item are converted into special characters for data security and tokenization of text and formulas. 
Also, the tokenizer uses linear analysis method for text and formulas, and the ``key`` parameter provided is used to preprocess the incoming item, which will be improved based on users' requirements in the future. + + +Examples +---------- + +:: + + >>> items = ["已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$"] + >>> tokenizer = TextTokenizer() + >>> tokens = tokenizer(items) + >>> next(tokens) # doctest: +NORMALIZE_WHITESPACE + ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', + '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', + '\\quad', 'A', '\\cap', 'B', '='] + >>> items = [{ + ... "stem": "已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$", + ... "options": ["1", "2"] + ... }] + >>> tokens = tokenizer(items, key=lambda x: x["stem"]) + >>> next(tokens) # doctest: +NORMALIZE_WHITESPACE + ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', + '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', + '\\quad', 'A', '\\cap', 'B', '='] diff --git a/docs/source/tutorial/en/tokenize.rst b/docs/source/tutorial/en/tokenize.rst new file mode 100644 index 00000000..3411b74b --- /dev/null +++ b/docs/source/tutorial/en/tokenize.rst @@ -0,0 +1,172 @@ +Tokenization +======= + +Tokenization, known as word segmentation and sentence segmentation, is a basic but very important step in the field of NLP. +In EduNLP, we divided Tokenization into different levels according to different granularity. To avoid ambiguity, we define as follows: + +* Word/char level: word segmentation + +* Sentence level: sentence segmentation + +* Resource level: tokenization + +This module provides tokenization function of question text, converting questions into token sequences to facilitate the vectorization of questions. After that, each element in the sliced item needs word segmentation. In this step, there is a depth option. You can select all positions or some labels for segmentation according to your needs, such as \SIFSep and \SIFTag. You can also select where to add labels, either at the head and tail or only at the head or tail. + +There are two modes: one is linear mode, which is used for text processing (word segmentation using jieba library). The other one is ast mode, which is used to parse the formula. + +Word Segmentation +------- + +Text-tokenization: A sentence (without formulas) consists of several "words" in order. The process of dividing a sentence into several words is called "Text-tokenization". According to the granularity of "words", it can be subdivided into "Word-tokenization" and "Char-tokenization". + +:: + + - Word-tokenization: each phrase is a token. + + - Char-tokenization: each character is a token. + + +Text-tokenization is divided into two main steps: + +1. Text-tokenization: + + - Word-tokenization: use the word segmentation tool to segment and extract words from the question text. Our project supports `jieba`. + + - Char-tokenization: process text by character. + +2. Filter: filter the specified stopwords. + + The default stopwords used in this project:`[stopwords] `_ + You can also use your own stopwords. The following example demonstrates how to use. 
+ +Examples: + +:: + + from EduNLP.SIF.tokenization.text import tokenize + >>> text = "三角函数是基本初等函数之一" + >>> tokenize(text, granularity="word") + ['三角函数', '初等', '函数'] + + >>> tokenize(text, granularity="char") + ['三', '角', '函', '数', '基', '初', '函', '数'] + +Sentence Segmentation +------- + +During the process of sentence segmentation, a long document is divided into several sentences. Each sentence is a "token" (to be realized). + +Tokenization +------- +Tokenization is comprehensive analysis. In this process, sentences with formulas are segmented into several markers. Each marker is a "token". + +The implementation of this function is tokenize function. The required results can be obtained by passing in items after Structural Component Segmentation. + +:: + + from EduNLP.Tokenizer import get_tokenizer + >>> items = "如图所示,则三角形$ABC$的面积是$\\SIFBlank$。$\\FigureID{1}$" + >>> tokenize(SegmentList(items)) + ['如图所示', '三角形', 'ABC', '面积', '\\\\SIFBlank', \\FigureID{1}] + >>> tokenize(SegmentList(items),formula_params={"method": "ast"}) + ['如图所示', '三角形', , '面积', '\\\\SIFBlank', \\FigureID{1}] + + + +You can view ``./EduNLP/Tokenizer/tokenizer.py`` and ``./EduNLP/Pretrain/gensim_vec.py`` for more tokenizers. We provide some encapsulated tokenizers for users to call them conveniently. Following is a complete list of tokenizers: + +- TextTokenizer + +- PureTextTokenizer + +- GensimSegTokenizer + +- GensimWordTokenizer + + +TextTokenizer ++++++++++++++++++++++ + +By default, the pictures, labels, separators, blanks in the question text and other parts of the incoming item are converted into special characters for data security and tokenization of text and formulas. Also, the tokenizer uses linear analysis method for text and formulas, and the ``key`` parameter provided is used to preprocess the incoming item, which will be improved based on users' requirements in the future. + +:: + + >>> items = ["已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$"] + >>> tokenizer = TextTokenizer() + >>> tokens = tokenizer(items) + >>> next(tokens) # doctest: +NORMALIZE_WHITESPACE + ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', + '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', + '\\quad', 'A', '\\cap', 'B', '='] + >>> items = [{ + ... "stem": "已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$", + ... "options": ["1", "2"] + ... }] + >>> tokens = tokenizer(items, key=lambda x: x["stem"]) + >>> next(tokens) # doctest: +NORMALIZE_WHITESPACE + ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', + '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', + '\\quad', 'A', '\\cap', 'B', '='] + +PureTextTokenizer ++++++++++++++++++++++ + +By default, the pictures, labels, separators, blanks in the question text and other parts of the incoming item are converted into special characters for data security. At the same time, special formulas such as $\\FormFigureID{...}$ and $\\FormFigureBase64{...}$ are screened out to facilitate the tokenization of text and plain text formulas. Also, the tokenizer uses linear analysis method for text and formulas, and the ``key`` parameter provided is used to preprocess the incoming item, which will be improved based on users' requirements in the future. 
+ +:: + + >>> tokenizer = PureTextTokenizer() + >>> items = ["有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\ + ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$"] + >>> tokens = tokenizer(items) + >>> next(tokens)[:10] + ['公式', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[SEP]', 'z'] + >>> items = ["已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$"] + >>> tokens = tokenizer(items) + >>> next(tokens) # doctest: +NORMALIZE_WHITESPACE + ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', + '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', + '\\quad', 'A', '\\cap', 'B', '='] + >>> items = [{ + ... "stem": "已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$", + ... "options": ["1", "2"] + ... }] + >>> tokens = tokenizer(items, key=lambda x: x["stem"]) + >>> next(tokens) # doctest: +NORMALIZE_WHITESPACE + ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', + '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', + '\\quad', 'A', '\\cap', 'B', '='] + +GensimWordTokenizer ++++++++++++++++++++++++ + +By default, the pictures, blanks in the question text and other parts of the incoming item are converted into special characters for data security and the tokenization of text, formulas, labels and separators. Also, the tokenizer uses linear analysis method for text and abstract syntax tree method for formulas respectively. You can choose each of them by ``general`` parameter: + +-true, it means that the incoming item conforms to SIF and the linear analysis method should be used. +-false, it means that the incoming item doesn't conform to SIF and the abstract syntax tree method should be used. + +GensimSegTokenizer +++++++++++++++++++++ + +By default, the pictures, separators, blanks in the question text and other parts of the incoming item are converted into special characters for data security and tokenization of text, formulas and labels. Also, the tokenizer uses linear analysis method for text and abstract analysis method of syntax tree for formulas. + +Compared to GensimWordTokenizer, the main differences are: + +* It provides the depth option for segmentation position, such as \SIFSep and \SIFTag. +* By default, labels are inserted in the header of item components (such as text and formulas). + +Examples +---------- + +:: + + >>> tokenizer = GensimWordTokenizer(symbol="gmas", general=True) + >>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\ + ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$") + >>> print(token_item.tokens[:10]) + ['公式', '[FORMULA]', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[FORMULA]'] + >>> tokenizer = GensimWordTokenizer(symbol="fgmas", general=False) + >>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\ + ... 
若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$") + >>> print(token_item.tokens[:10]) + ['公式', '[FORMULA]', '如图', '[FIGURE]', '[FORMULA]', '约束条件', '公式', '[FORMULA]', '[SEP]', '[FORMULA]'] diff --git a/docs/source/tutorial/en/tokenize/Sentence Segmentation.rst b/docs/source/tutorial/en/tokenize/Sentence Segmentation.rst new file mode 100644 index 00000000..1a8d4950 --- /dev/null +++ b/docs/source/tutorial/en/tokenize/Sentence Segmentation.rst @@ -0,0 +1,3 @@ +Sentence Segmentation +------- +During the process of sentence segmentation, a long document is divided into several sentences. Each sentence is a "token" (to be realized). diff --git a/docs/source/tutorial/en/tokenize/Tokenization.rst b/docs/source/tutorial/en/tokenize/Tokenization.rst new file mode 100644 index 00000000..fad25912 --- /dev/null +++ b/docs/source/tutorial/en/tokenize/Tokenization.rst @@ -0,0 +1,29 @@ +Tokenization +------- +Tokenization is comprehensive analysis. In this process, sentences with formulas are segmented into several markers. Each marker is a "token". +We provide some encapsulated tokenizers for users to call them conveniently. The following is a complete list of tokenizers. + +Examples + +:: + + >>> items = ["已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$"] + >>> tokenizer = TextTokenizer() + >>> tokens = tokenizer(items) + >>> next(tokens) # doctest: +NORMALIZE_WHITESPACE + ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', + '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', + '\\quad', 'A', '\\cap', 'B', '='] + + + +You can view ``./EduNLP/Tokenizer/tokenizer.py`` and ``./EduNLP/Pretrain/gensim_vec.py`` for more tokenizers. Following is a complete list of tokenizers: + +.. toctree:: + :maxdepth: 1 + :titlesonly: + + ../tokenization/TextTokenizer + ../tokenization/PureTextTokenizer + ../tokenization/GensimSegTokenizer + ../tokenization/GensimWordTokenizer diff --git a/docs/source/tutorial/en/tokenize/WordSegmentation.rst b/docs/source/tutorial/en/tokenize/WordSegmentation.rst new file mode 100644 index 00000000..a85f4dae --- /dev/null +++ b/docs/source/tutorial/en/tokenize/WordSegmentation.rst @@ -0,0 +1,36 @@ +Word segmentation +------- + +Text-tokenization: A sentence (without formulas) consists of several "words" in order. The process of dividing a sentence into several words is called "Text-tokenization". According to the granularity of "words", it can be subdivided into "Word-tokenization" and "Char-tokenization". + +:: + + - Word-tokenization: each phrase is a token. + + - Char-tokenization: each character is a token. + + +Text-tokenization is divided into two main steps: + +1. Text-tokenization: + + - Word-tokenization: use the word segmentation tool to segment and extract words from the question text. Our project supports `jieba`. + + - Char-tokenization: process text by character. + +2. Filter: filter the specified stopwords. + + The default stopwords used in this project:`[stopwords] `_ + You can also use your own stopwords. The following example demonstrates how to use. 

Examples:

::

    >>> from EduNLP.SIF.tokenization.text import tokenize
    >>> text = "三角函数是基本初等函数之一"
    >>> tokenize(text, granularity="word")
    ['三角函数', '初等', '函数']

    >>> tokenize(text, granularity="char")
    ['三', '角', '函', '数', '基', '初', '函', '数']

diff --git a/docs/source/tutorial/en/vectorization.rst b/docs/source/tutorial/en/vectorization.rst
new file mode 100644
index 00000000..5b744eeb
--- /dev/null
+++ b/docs/source/tutorial/en/vectorization.rst

Vectorization
=============

This section provides a simple interface to convert the incoming items into vectors directly. You can choose whether to use a pre-trained model according to your needs: call D2V directly if you do not want to use a pre-trained model, or call the get_pretrained_i2v function if you do.

- Don't use the pre-trained model

- Use the pre-trained model

Overview Flow
---------------------------

1. Perform `syntax parsing `_ on the incoming items to get items in SIF format;

2. Perform `component segmentation `_ on the sif_items;

3. Perform `tokenization `_ on the segmented items;

4. Use your own model or a pre-trained model we provide to convert the tokenized items into vectors.


Don't use the pre-trained model: call existing models directly
--------------------------------------------------------------

You can use any pre-trained model provided by yourself (just give the storage path of the model) to convert the given question text into vectors.

* Advantages: it is flexible to use your own model, and its parameters can be adjusted freely.

Import modules
++++++++++++++

::

    from EduNLP.I2V import D2V, W2V, get_pretrained_i2v
    from EduNLP.Vector import T2V, get_pretrained_t2v

Models provided
++++++++++++++++++++

- W2V

- D2V

- T2V

W2V
<<<<<<<<<

This model directly uses the relevant model methods in the gensim library to convert words into vectors. Currently, the following methods are provided:

 - FastText

 - Word2Vec

 - KeyedVectors

::

    >>> i2v = get_pretrained_i2v("test_w2v", "examples/test_model/data/w2v") # doctest: +ELLIPSIS
    >>> item_vector, token_vector = i2v(["有学者认为:‘学习’,必须适应实际"])
    >>> item_vector # doctest: +ELLIPSIS
    array([[...]], dtype=float32)

D2V
<<<<<<<<<<<<

This model is a comprehensive processing method which can convert items into vectors. Currently, the following methods are provided:

- d2v: call the doc2vec module in the gensim library to convert items into vectors.

- BowLoader: call the corpora module in the gensim library to convert docs into bag-of-words vectors.

- TfidfLoader: call the TfidfModel module in the gensim library to convert docs into tfidf vectors.

::

    >>> item = r"如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\SIFChoice$$\FigureID{1}$"
    >>> model_path = "../test_model/test_gensim_luna_stem_tf_d2v_256.bin"
    >>> i2v = D2V("text", "d2v", filepath=model_path, pretrained_t2v=False)
    >>> i2v(item)
    ([array([ 4.76559885e-02, -1.60574958e-01,  1.94614579e-03,  2.40295693e-01,
     2.24517003e-01, -3.24351490e-02,  4.35789041e-02, -1.65670961e-02,...

T2V
<<<<<<<<<<

You can use any pre-trained model provided by yourself to represent the token sequences of a group of questions as vectors (just give the storage path of the model).

- Advantages: the model and its parameters can be adjusted independently, which gives strong flexibility.
+ +Input ^^^^^^^^^^ + +Types: list +Contents: the combination of the segmentation sequences of each question in a question group. +You can convert question text (`str` type) to tokens using the ``GensimWordTokenizer`` model. + +:: + + >>> token_items=['公式','[FORMULA]','公式','[FORMULA]','如图','[FIGURE]','x',',','y','约束条件','[SEP]','z','=','x','+','7','y','最大值','[MARK]'] + >>> path = "../test_model/test_gensim_luna_stem_tf_d2v_256.bin" + >>> t2v = T2V('d2v',filepath=path) + >>> t2v(token_items) + [array([ 0.0256574 , 0.06061139, -0.00121044, -0.0167674 , -0.0111706 , + 0.05325712, -0.02097339, -0.01613594, 0.02904145, 0.0185046 ,... + +Specific process of processing +++++++++++++++++++++ + +1.Call the get_tokenizer function to get the result after word segmentation; + +2.Select the corresponding provided model type for vectorization, according to the model used. + + +Use the pre-training model: call get_pretrained_i2v directly +--------------------------------------------- + +Use the pre-training model provided by EduNLP to convert the given question text into vectors. + +* Advantages: Simple and convenient. + +* Disadvantages: Only the models given in the project can be used, which is quite limiting. + +* Call this function to obtain the corresponding pre-training model. At present, the following pre-training models are provided: d2v_all_256, d2v_sci_256, d2v_eng_256 and d2v_lit_256. + +Selection and Use of Models +################## + +Select the pre-training model according to the subject: + ++-------------------------+--------------------------------+ +| Pre-training model name | Subject of model training data | ++=========================+================================+ +| d2v_all_256             | all subjects                   | ++-------------------------+--------------------------------+ +| d2v_sci_256             | Science                        | ++-------------------------+--------------------------------+ +| d2v_lit_256             | Arts                           | ++-------------------------+--------------------------------+ +| d2v_eng_256             | English                        | ++-------------------------+--------------------------------+ + + +The concrete process of processing +################## + +1.Download the corresponding pre-trained model + +2.Transfer the obtained model to D2V and process it with D2V + +Examples: + +:: + + >>> i2v = get_pretrained_i2v("d2v_sci_256") + >>> i2v(item) diff --git a/docs/source/tutorial/en/vectorization/WithPre-trainedModel.rst b/docs/source/tutorial/en/vectorization/WithPre-trainedModel.rst new file mode 100644 index 00000000..41dcab64 --- /dev/null +++ b/docs/source/tutorial/en/vectorization/WithPre-trainedModel.rst @@ -0,0 +1,42 @@ +Use the pre-training model: call get_pretrained_i2v directly +--------------------------------------------- + +Use the pre-training model provided by EduNLP to convert the given question text into vectors. + +* Advantages: Simple and convenient. + +* Disadvantages: Only the models given in the project can be used, which is quite limiting. + +* Call this function to obtain the corresponding pre-training model. At present, the following pre-training models are provided: d2v_all_256, d2v_sci_256, d2v_eng_256 and d2v_lit_256.
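For orientation, a minimal end-to-end sketch of this route (the corresponding model is downloaded first, per the process described below; the tuple return follows the W2V example above, and the item text is illustrative):

::

    >>> from EduNLP.I2V import get_pretrained_i2v
    >>> i2v = get_pretrained_i2v("d2v_sci_256")  # fetches and loads the pre-trained model
    >>> item_vector, token_vector = i2v(["若复数$z=1+2 i+i^{3}$,则$|z|=$"])  # doctest: +SKIP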
+ +Selection and use of models +################## + +Select the pre-training model according to the subject: + ++-------------------------+--------------------------------+ +| Pre-training model name | Subject of model training data | ++=========================+================================+ +| d2v_all_256             | all subjects                   | ++-------------------------+--------------------------------+ +| d2v_sci_256             | Science                        | ++-------------------------+--------------------------------+ +| d2v_lit_256             | Arts                           | ++-------------------------+--------------------------------+ +| d2v_eng_256             | English                        | ++-------------------------+--------------------------------+ + +The concrete process of processing +################## + +1.Download the corresponding pre-trained model + +2.Transfer the obtained model to D2V and process it with D2V + +Examples: + +:: + + >>> i2v = get_pretrained_i2v("d2v_sci_256") + >>> i2v(item) diff --git a/docs/source/tutorial/en/vectorization/WithoutPre-trainedModel.rst b/docs/source/tutorial/en/vectorization/WithoutPre-trainedModel.rst new file mode 100644 index 00000000..2989f8ba --- /dev/null +++ b/docs/source/tutorial/en/vectorization/WithoutPre-trainedModel.rst @@ -0,0 +1,21 @@ +Don't use the pre-trained model: call existing models directly +------------------------------------ + +You can use any model you have trained yourself (just give the storage path of the model) to convert the given question text into vectors. + +* Advantages: it is flexible to use your own model, and its parameters can be adjusted freely. + +Specific process of processing +++++++++++++++++++++ + +1.Call the get_tokenizer function to get the result after word segmentation; + +2.Select the corresponding provided model type for vectorization, according to the model used. + +Examples: + +:: + + >>> model_path = "../test_model/test_gensim_luna_stem_tf_d2v_256.bin" + >>> i2v = D2V("text","d2v",filepath=model_path, pretrained_t2v = False) + >>> i2v(item) From 3b76b5f80d46e2e9b0eb8e5667d800808b901637 Mon Sep 17 00:00:00 2001 From: fannazya Date: Mon, 25 Oct 2021 22:03:35 +0800 Subject: [PATCH 5/6] update English version --- docs/source/_static/new_flow.png | Bin 0 -> 13843 bytes docs/source/index.rst | 5 +++ docs/source/tutorial/en/index.rst | 12 +++---- docs/source/tutorial/en/parse.rst | 33 +++++++++--------- .../parse/FormulaSyntaxStructureParsing.rst | 14 ++++---- .../en/parse/TextSyntaxStructureParsing.rst | 8 ++--- docs/source/tutorial/en/pretrain.rst | 22 ++++++------ docs/source/tutorial/en/pretrain/loading.rst | 2 +- docs/source/tutorial/en/pretrain/pub.rst | 14 ++++---- docs/source/tutorial/en/pretrain/start.rst | 2 +- docs/source/tutorial/en/seg.rst | 14 ++++---- .../en/seg/SemanticComponentSegmentation.rst | 4 +-- .../seg/StructuralComponentSegmentation.rst | 4 +-- docs/source/tutorial/en/sif.rst | 12 +++---- .../en/tokenization/PureTextTokenizer.rst | 2 +- docs/source/tutorial/en/tokenize.rst | 9 ++--- .../en/tokenize/Sentence Segmentation.rst | 2 +- .../tutorial/en/tokenize/Tokenization.rst | 2 +- .../tutorial/en/tokenize/WordSegmentation.rst | 2 +- docs/source/tutorial/en/vectorization.rst | 12 +++---- .../en/vectorization/WithPre-trainedModel.rst | 6 ++-- .../vectorization/WithoutPre-trainedModel.rst | 4 +-- 22 files changed, 96 insertions(+), 89 deletions(-) create mode 100644 docs/source/_static/new_flow.png diff --git a/docs/source/_static/new_flow.png b/docs/source/_static/new_flow.png new file mode 100644 index 0000000000000000000000000000000000000000..f103cc7dcc8238b8e48f016d6f2eef31f067e747 GIT binary patch literal
[... base85-encoded GIT binary patch data for docs/source/_static/new_flow.png (Bin 0 -> 13843 bytes) omitted ...] @@ -14,7 +14,7 @@ Get Started * `Vectorization `_ Main process ----------- +--------------- .. figure:: ../../_static/new_flow.png @@ -23,15 +23,15 @@ Main process * `Component Segmentation `_ : Its function is to segment items in SIF format according to the types of items, so as to serve the later tokenization module (that is, elements of different types can be tokenized using their corresponding methods). * `Tokenization `_: Its function is to tokenize segmented items, so as to serve the later vectorization module. - Generally, the tokenization method in the text form can be used directly. For formulas, the ast method can also be used for parsing(call the formula module); + Generally, the tokenization method in the text form can be used directly. For formulas, the ast method can also be used for parsing (call the formula module). * `Vectorization `_: This part mainly calls I2V class and its subclasses. Its function is to vectorize the list of tokenized items, so as to get the corresponding static vectors. - For vectorization module, You can call your own trained model or directly call the provided pre-training model(call get_ pretrained_ I2V module). + For the vectorization module, you can call your own trained model or directly call the provided pre-training model (call the get_pretrained_i2v module). -* **Downstream Model**:Process the obtained vectors to get the desired results。 +* **Downstream Model**: Process the obtained vectors to get the desired results. Examples --------- +--------- To help you quickly understand the functions of this project, this section only shows the usages of common function interface. Intermediate function modules (such as parse, formula, segment, etc.)
and more subdivided interface methods are not shown. For further study, please refer to relevant documents. diff --git a/docs/source/tutorial/en/parse.rst b/docs/source/tutorial/en/parse.rst index 69608c5e..22542da2 100644 --- a/docs/source/tutorial/en/parse.rst +++ b/docs/source/tutorial/en/parse.rst @@ -1,7 +1,8 @@ Syntax Parsing -========= +================= In educational resources, texts and formulas have internal implicit or explicit syntax structures. It is of great benefit for further processing to extract these structures. + * Text syntax structure parsing * Formula syntax structure parsing @@ -13,23 +14,23 @@ The purpose is as follows: 2. Determine whether the current item is legal and report the error type. Specific processing content --------------------- +-------------------------------- 1.Its function is to match letters and numbers other than formulas. Only the letters and numbers between two Chinese characters will be corrected, and the rest of the cases are regarded as formulas that do not conform to latex syntax. 2.Match brackets like "( )" (both English format and Chinese format), that is, brackets with no content or spaces, which should be replaced with ``$\\SIFChoice$`` -3.Match continuous underscores or underscores with spaces to replace them with ``$\\SIFBlank$``. +3.Match continuous underscores or underscores with spaces and replace them with ``$\\SIFBlank$``. 4.Match latex formulas, check the completeness and analyzability of latex formulas, and report an error for illegal formulas. Formula syntax structure parsing --------------------- +------------------------------------- -This section is mainly realized by EduNLP.Formula modules, which can determine if the text has syntax errors and convert the syntax formula into the form of ast tree. In practice, this module is often used as part of an intermediate process, and the relevant parameters of this module can be automatically chosen by calling the corresponding model, so it generally does not need special attention. +This section is mainly realized by the EduNLP.Formula module, which can determine whether the text has syntax errors and convert the formula into the form of an ast tree. In practice, this module is often used as part of an intermediate process, and the relevant parameters of this module can be automatically chosen by calling the corresponding model, so it generally does not need special attention. Introduction of Main Content -+++++++++++++++ ++++++++++++++++++++++++++++++++++++++++ 1.Formula: determine whether the single formula passed in is in str form. If so, use the ast method for processing, otherwise an error will be reported. In addition, parameter variable_standardization is given. If this parameter is true, the variable standardization method will be used to make sure the same variable has the same variable number. @@ -40,10 +41,10 @@ Formula Formula: firstly, in the word segmentation function, the formula of the original text is segmented. In addition, ``Formula parse tree`` function is provided, which can represent the abstract syntax analysis tree of mathematical formula in the form of text or picture. -This module also provides the function of formula variable standardization, such as determining that 'x' in several sub formulas is the same variable. +This module also provides the function of formula variable standardization, such as determining whether 'x' in several sub formulas is the same variable.
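A small hedged illustration of the variable_standardization parameter described above (``Formula`` is imported as shown in the next section; the ``elements`` attribute is an assumption based on the "node elements" description and may differ in your version):

::

    >>> from EduNLP.Formula import Formula
    >>> f = Formula("x^2 + x", variable_standardization=True)  # the same variable gets the same number
    >>> f.elements  # doctest: +SKIP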
Call the library -+++++++++ ++++++++++++++++++++++ :: @@ -52,7 +53,7 @@ Call the library from EduNLP.Formula import Formula from EduNLP.Formula.viz import ForestPlotter Initialization -+++++++++ ++++++++++++++++ Incoming parameters: item @@ -65,7 +66,7 @@ Item is the latex formula or the abstract syntax parse tree generated after the View the specific content after formula segmentation -++++++++++++++++++++++++++++ ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - View node elements after formula segmentation @@ -123,7 +124,7 @@ View the specific content after formula segmentation Variable standardization -+++++++++++ ++++++++++++++++++++++++++++++ This parameter makes the same variable have the same variable number. @@ -209,14 +210,14 @@ Call ``FormulaGroup`` class to parse the equations. The related attributes and f Text syntax structure parsing --------------------- +------------------------------------ This section is mainly realized by EduNLP.SIF.Parse module. Its main function is to extract letters and numbers in the text and convert them into standard format. -This module is mainly used as an *middle module* to parse the input text. Users generally do not call this module directly. +This module is mainly used as an *intermediate module* to parse the input text. In general, users do not call this module directly. Introduction of main content -+++++++++++++++ ++++++++++++++++++++++++++++++++++++ 1. Judge the type of the incoming text in the following order @@ -233,7 +234,7 @@ Introduction of main content * Use the _is_formula_legal function to check the completeness and analyzability of latex formulas, and report an error for formulas that do not conform to latex syntax. Call the library ->>>>>>>>>>>> +>>>>>>>>>>>>>>>>>>> :: @@ -263,7 +264,7 @@ Parsing >>> text_parser3 = Parser(text3) >>> text_parser4 = Parser(text4) -Related parameters description(?) +Related parameters description >>>>>>>>>>>> - Try to convert text to standard format diff --git a/docs/source/tutorial/en/parse/FormulaSyntaxStructureParsing.rst b/docs/source/tutorial/en/parse/FormulaSyntaxStructureParsing.rst index 2fc479c5..c09da64b 100644 --- a/docs/source/tutorial/en/parse/FormulaSyntaxStructureParsing.rst +++ b/docs/source/tutorial/en/parse/FormulaSyntaxStructureParsing.rst @@ -1,10 +1,10 @@ Formula syntax structure parsing --------------------- +---------------------------------- -This section is mainly realized by EduNLP.Formula modules, which can determine if the text has syntax errors and convert the syntax formula into the form of ast tree. In practice, this module is often used as part of an intermediate process, and the relevant parameters of this module can be automatically chosen by calling the corresponding model, so it generally does not need special attention. +This section is mainly realized by the EduNLP.Formula module, which can determine whether the text has syntax errors and convert the formula into the form of an ast tree. In practice, this module is often used as part of an intermediate process, and the relevant parameters of this module can be automatically chosen by calling the corresponding model, so it generally does not need special attention. Introduction of Main Content -+++++++++++++++ ++++++++++++++++++++++++++++++++++++++ 1.Formula: determine whether the single formula passed in is in str form. If so, use the ast method for processing, otherwise an error will be reported. In addition, parameter variable_standardization is given.
If this parameter is true, the variable standardization method will be used to make sure the same variable has the same variable number. @@ -15,10 +15,10 @@ Formula Formula: firstly, in the word segmentation function, the formula of the original text is segmented. In addition, ``Formula parse tree`` function is provided, which can represent the abstract syntax analysis tree of mathematical formula in the form of text or picture. -This module also provides the function of formula variable standardization, such as determining that 'x' in several sub formulas is the same variable. +This module also provides the function of formula variable standardization, such as determining whether 'x' in several sub formulas is the same variable. Initialization -+++++++++ +++++++++++++++++++++ Incoming parameters: item @@ -31,7 +31,7 @@ Item is the latex formula or the abstract syntax parse tree generated after the View the specific content after formula segmentation -++++++++++++++++++++++++++++ ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - View node elements after formula segmentation @@ -87,7 +87,7 @@ View the specific content after formula segmentation .. figure:: ../../../_static/formula.png Variable Standardization -+++++++++++ +++++++++++++++++++++++++++++++++ This parameter makes the same variable have the same variable number. diff --git a/docs/source/tutorial/en/parse/TextSyntaxStructureParsing.rst b/docs/source/tutorial/en/parse/TextSyntaxStructureParsing.rst index bdfe6848..6822c961 100644 --- a/docs/source/tutorial/en/parse/TextSyntaxStructureParsing.rst +++ b/docs/source/tutorial/en/parse/TextSyntaxStructureParsing.rst @@ -1,12 +1,12 @@ Text syntax structure parsing --------------------- +-------------------------------- This section is mainly realized by EduNLP.SIF.Parse module. Its main function is to extract letters and numbers in the text and convert them into standard format. -This module is mainly used as an *middle module* to parse the input text. Users generally do not call this module directly. +This module is mainly used as an *intermediate module* to parse the input text. In general, users do not call this module directly. Introduction of Main Content -+++++++++++++++ ++++++++++++++++++++++++++++++++++++++ 1. Judge the type of the incoming text in the following order @@ -47,7 +47,7 @@ Parsing >>> text_parser4 = Parser(text4) Related parameters description ->>>>>>>>>>>> +>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - Try to convert text to standard format diff --git a/docs/source/tutorial/en/pretrain.rst b/docs/source/tutorial/en/pretrain.rst index 9319b87d..985cffde 100644 --- a/docs/source/tutorial/en/pretrain.rst +++ b/docs/source/tutorial/en/pretrain.rst @@ -1,5 +1,5 @@ Pre-training -======= +============== In the field of NLP, Pre-trained Language Models have become a very important basic technology. In this chapter, we will introduce the pre-training tools in EduNLP: @@ -9,7 +9,7 @@ In this chapter, we will introduce the pre-training tools in EduNLP: * Public pre-trained models Import modules ----------- +--------------- :: @@ -17,7 +17,7 @@ Import modules from EduNLP.Vector import get_pretrained_t2v Train the Model ------------- +------------------ Call the train_vector function interface directly to make model training easier. This section calls the relevant training models in the gensim library. At present, the training methods of "sg", "cbow", "fasttext", "d2v", "bow", "tfidf" are provided.
Parameter embedding_dim is also provided for users to determine vector dimension according to their needs. @@ -43,7 +43,7 @@ Examples: Load models --------- +---------------- Transfer the obtained model to the I2V module to load the model. @@ -55,10 +55,10 @@ Examples: >>> i2v = D2V("text","d2v",filepath=model_path, pretrained_t2v = False) The overview of our public model ------------- +------------------------------------ Version description -################## +####################### First level version: @@ -76,7 +76,7 @@ Third level version【to be finished】: * Use third-party initializers Description of train data in models -################## +############################################## * Currently, the data used in w2v and d2v models are the subjects of senior high school. * test data:`[OpenLUNA.json] `_ @@ -86,7 +86,7 @@ At present, the following models are provided. More models of different subjects Examples of Model Training ------------- +------------------------------------ Get the dataset #################### @@ -98,7 +98,7 @@ Get the dataset prepare_dataset <../../build/blitz/pretrain/prepare_dataset.ipynb> An example of d2v in gensim model -#################### +################################## .. toctree:: :maxdepth: 1 @@ -109,7 +109,7 @@ An example of d2v in gensim model d2v_stem_tf <../../build/blitz/pretrain/gensim/d2v_stem_tf.ipynb> An example of w2v in gensim model -#################### +################################## .. toctree:: :maxdepth: 1 @@ -119,7 +119,7 @@ An example of w2v in gensim model w2v_stem_tf <../../build/blitz/pretrain/gensim/w2v_stem_tf.ipynb> An example of seg_token -#################### +############################# .. toctree:: :maxdepth: 1 diff --git a/docs/source/tutorial/en/pretrain/loading.rst b/docs/source/tutorial/en/pretrain/loading.rst index 31fa3ea8..83b54c39 100644 --- a/docs/source/tutorial/en/pretrain/loading.rst +++ b/docs/source/tutorial/en/pretrain/loading.rst @@ -1,5 +1,5 @@ Load models --------- +---------------- Transfer the obtained model to the I2V module to load the model. diff --git a/docs/source/tutorial/en/pretrain/pub.rst b/docs/source/tutorial/en/pretrain/pub.rst index 34407745..60077309 100644 --- a/docs/source/tutorial/en/pretrain/pub.rst +++ b/docs/source/tutorial/en/pretrain/pub.rst @@ -1,9 +1,9 @@ The overview of our public model ------------- +------------------------------------ Version Description -################## +######################### First level version: @@ -21,7 +21,7 @@ Third level version【to be finished】: * Use third-party initializers Description of train data in models -################## +####################################### * Currently, the data used in w2v and d2v models are the subjects of senior high school. * test data:`[OpenLUNA.json] `_ @@ -30,7 +30,7 @@ At present, the following models are provided. More models of different subjects "d2v_all_256" (all subject), "d2v_sci_256" (Science), "d2v_eng_256" (English),"d2v_lit_256" (Arts) Examples of model training ------------- +---------------------------- Get the dataset #################### @@ -42,7 +42,7 @@ Get the dataset prepare_dataset <../../../build/blitz/pretrain/prepare_dataset.ipynb> An example of d2v in gensim model -#################### +#################################### .. toctree:: :maxdepth: 1 @@ -53,7 +53,7 @@ An example of d2v in gensim model d2v_stem_tf <../../../build/blitz/pretrain/gensim/d2v_stem_tf.ipynb> An example of w2v in gensim model -#################### +#################################### .. 
toctree:: :maxdepth: 1 @@ -63,7 +63,7 @@ An example of w2v in gensim model w2v_stem_tf <../../../build/blitz/pretrain/gensim/w2v_stem_tf.ipynb> An example of seg_token -#################### +############################ .. toctree:: :maxdepth: 1 diff --git a/docs/source/tutorial/en/pretrain/start.rst b/docs/source/tutorial/en/pretrain/start.rst index 9c5bc241..4aa91619 100644 --- a/docs/source/tutorial/en/pretrain/start.rst +++ b/docs/source/tutorial/en/pretrain/start.rst @@ -1,5 +1,5 @@ Train the model ------------- +------------------ Call the train_vector function interface directly to make model training easier. This section calls the relevant training models in the gensim library. At present, the training methods of "sg", "cbow", "fasttext", "d2v", "bow", "tfidf" are provided. Parameter embedding_dim is also provided for users to determine vector dimension according to their needs. diff --git a/docs/source/tutorial/en/seg.rst b/docs/source/tutorial/en/seg.rst index ad2696a2..4e2f2d39 100644 --- a/docs/source/tutorial/en/seg.rst +++ b/docs/source/tutorial/en/seg.rst @@ -1,26 +1,26 @@ Component Segmentation -========= +========================= -Educational resource is a kind of multimodal data, including data such as text, picture, formula and so on. +Educational resource is a kind of multimodal data, including data such as text, pictures, formulas and so on. At the same time, it may also contain different components semantically, such as question stems, options, etc. Therefore, we first need to identify and segment the different components of educational resources: * Semantic Component Segmentation * Structural Component Segmentation Main Processing Contents --------------------- +--------------------------- 1. Convert multiple-choice questions in the form of dict to a qualified item by `Syntax parsing `_; 2. The input items are segmented and grouped according to the element type. Semantic Component Segmentation ------------- +--------------------------------- Because multiple-choice questions are given in the form of dict, it is necessary to convert them into text format while retaining their data relationship. This function can be realized by the dict2str4sif function, which can convert multiple-choice question items into character format and identify question stem and options. Import Modules -+++++++++ ++++++++++++++++++++++++ :: @@ -39,7 +39,7 @@ Basic Usage '$\\SIFTag{stem_begin}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem_end}$$\\SIFTag{options_begin}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$$\\SIFTag{list_3}$2$\\SIFTag{options_end}$' Optional additional parameters / interfaces -++++++++++++++++++++++ ++++++++++++++++++++++++++++++++++++++++++++++++++ 1.add_list_no_tag: if this parameter is true, it means that you need to count the labels in the options section. @@ -69,7 +69,7 @@ Optional additional parameters / interfaces '若复数$z=1+2 i+i^{3}$,则$|z|=$0$\\SIFSep$1$\\SIFSep$$\\sqrt{2}$$\\SIFSep$2' Structural Component Segmentation ------------- +------------------------------------------ This step is to segment sliced items. In this step, there is a depth option. You can select all positions or some labels for segmentation according to your needs, such as \SIFSep and \SIFTag. You can also select where to add labels, either at the head and tail or only at the head or tail.
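A short hedged sketch of how such a segmentation call typically looks (``seg`` is assumed to live in ``EduNLP.SIF.segment``, matching the module layout referenced elsewhere in these docs; the exact signature is not confirmed here):

::

    >>> from EduNLP.SIF.segment import seg  # assumed import path
    >>> item = "如图所示,则$\\bigtriangleup ABC$的面积是$\\SIFBlank$。$\\FigureID{1}$"
    >>> s = seg(item)  # split the item into text/formula/figure/mark elements
    >>> s.describe()  # counts elements of each type, per the describe interface documented below  # doctest: +SKIP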
diff --git a/docs/source/tutorial/en/seg/SemanticComponentSegmentation.rst b/docs/source/tutorial/en/seg/SemanticComponentSegmentation.rst index 3901f4cb..c6535941 100644 --- a/docs/source/tutorial/en/seg/SemanticComponentSegmentation.rst +++ b/docs/source/tutorial/en/seg/SemanticComponentSegmentation.rst @@ -1,5 +1,5 @@ Semantic Component Segmentation ------------- +------------------------------------ Because multiple-choice questions are given in the form of dict, it is necessary to convert them into text format while retaining their data relationship. This function can be realized by dict2str4sif function which can convert multiple-choice question items into character format and identify question stem and options。 @@ -17,7 +17,7 @@ Basic Usage '$\\SIFTag{stem_begin}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem_end}$$\\SIFTag{options_begin}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$$\\SIFTag{list_3}$2$\\SIFTag{options_end}$' Optional additional parameters / interfaces -++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++++++++++ 1.add_list_no_tag: if this parameter is true, it means that you need to count the labels in the options section. diff --git a/docs/source/tutorial/en/seg/StructuralComponentSegmentation.rst b/docs/source/tutorial/en/seg/StructuralComponentSegmentation.rst index 8661c3d6..f5c44f7e 100644 --- a/docs/source/tutorial/en/seg/StructuralComponentSegmentation.rst +++ b/docs/source/tutorial/en/seg/StructuralComponentSegmentation.rst @@ -1,5 +1,5 @@ Structural Component Segmentation ------------- +------------------------------------ This step is to segment sliced items. In this step, there is a depth option. You can select all positions or some labels for segmentation according to your needs, such as \SIFSep and \SIFTag. You can also select where to add labels, either at the head and tail or only at the head or tail. @@ -20,7 +20,7 @@ Basic Usage >>> ['如图所示,则', '\\bigtriangleup ABC', '的面积是', '\\SIFBlank', '。', \FigureID{1}] Optional additional parameters/interfaces -++++++++++++++++++++++ ++++++++++++++++++++++++++++++++++++++++++++++ 1.describe: count the number of elements of different types diff --git a/docs/source/tutorial/en/sif.rst b/docs/source/tutorial/en/sif.rst index 877cb503..be5c895d 100644 --- a/docs/source/tutorial/en/sif.rst +++ b/docs/source/tutorial/en/sif.rst @@ -1,12 +1,12 @@ Standard Item Format -=============== +======================= version: 0.2 For the convenience of follow-up research and use, we need a unified test question grammar standard. Grammar Rules ------------ +---------------- 1. Only Chinese characters, Chinese and English punctuation and line breaks are allowed in the question text. @@ -16,7 +16,7 @@ Grammar Rules 4. Text format description: we represent text in different styles with ``$\textf{item,CHAR_EN}$``. Currently, we have defined some styles: b-bold, i-italic, u-underline, w-wave, d-dotted, t-title. CHAR_EN Labels can be mixed and sorted alphabetically. An example: $\textf{EduNLP, b}$ looks **EduNLP** -5. Other mathematical symbols like English letters, Roman characters and numbers need to be expressed in latex format, that is, embedded in `$$`. +5. Other mathematical symbols like English letters, Roman characters and numbers need to be expressed in latex format, that is, embedded in ``$$`` . 6. For the entry standard of molecular formula, please refer to `INCHI `_ for the time being. @@ -46,11 +46,11 @@ Tips 3. Choices and blanks. -4. 
A single number or letter is also required to be between `$$` (automatic verification could already realize it). +4. A single number or letter is also required to be between ``$$`` (automatic verification could already realize it). 5. Try to make sure Chinese is not included in the latex formula such as ``\text{CHAR_CH}``. -6. When importing data using MySQL database, an ``\`` is automatically ignored which needs to be further processed as``\\``. +6. When importing data using MySQL database, an ``\`` is automatically ignored which needs to be further processed as ``\\``. Examples ----------------- @@ -89,7 +89,7 @@ Non-standard Format: ``则$a$的取值范围是(\u3000\u3000)`` Functions for judging whether text is in SIF format and converting to SIF format --------------------------------------------- +-------------------------------------------------------------------------------------------------- Call the Library ++++++++ diff --git a/docs/source/tutorial/en/tokenization/PureTextTokenizer.rst b/docs/source/tutorial/en/tokenization/PureTextTokenizer.rst index 8c36e67c..88ec975e 100644 --- a/docs/source/tutorial/en/tokenization/PureTextTokenizer.rst +++ b/docs/source/tutorial/en/tokenization/PureTextTokenizer.rst @@ -1,5 +1,5 @@ PureTextTokenizer -================ +=================== By default, the pictures, labels, separators, blanks in the question text and other parts of the incoming item are converted into special characters for data security. At the same time, special formulas such as $\\FormFigureID{...}$ and $\\FormFigureBase64{...}$ are screened out to facilitate the tokenization of text and plain text formulas. Also, the tokenizer uses linear analysis method for text and formulas, and the ``key`` parameter provided is used to preprocess the incoming item, which will be improved based on users' requirements in the future. diff --git a/docs/source/tutorial/en/tokenize.rst b/docs/source/tutorial/en/tokenize.rst index 3411b74b..f6350614 100644 --- a/docs/source/tutorial/en/tokenize.rst +++ b/docs/source/tutorial/en/tokenize.rst @@ -1,5 +1,5 @@ Tokenization -======= +============== Tokenization, known as word segmentation and sentence segmentation, is a basic but very important step in the field of NLP. In EduNLP, we divided Tokenization into different levels according to different granularity. To avoid ambiguity, we define as follows: @@ -15,7 +15,7 @@ This module provides tokenization function of question text, converting question There are two modes: one is linear mode, which is used for text processing (word segmentation using jieba library). The other one is ast mode, which is used to parse the formula. Word Segmentation -------- +--------------------- Text-tokenization: A sentence (without formulas) consists of several "words" in order. The process of dividing a sentence into several words is called "Text-tokenization". According to the granularity of "words", it can be subdivided into "Word-tokenization" and "Char-tokenization". @@ -52,12 +52,13 @@ Examples: ['三', '角', '函', '数', '基', '初', '函', '数'] Sentence Segmentation -------- +---------------------------- During the process of sentence segmentation, a long document is divided into several sentences. Each sentence is a "token" (to be realized). Tokenization -------- +-------------- + Tokenization is comprehensive analysis. In this process, sentences with formulas are segmented into several markers. Each marker is a "token". The implementation of this function is tokenize function. 
The required results can be obtained by passing in items after Structural Component Segmentation. diff --git a/docs/source/tutorial/en/tokenize/Sentence Segmentation.rst b/docs/source/tutorial/en/tokenize/Sentence Segmentation.rst index 1a8d4950..902b2bb5 100644 --- a/docs/source/tutorial/en/tokenize/Sentence Segmentation.rst +++ b/docs/source/tutorial/en/tokenize/Sentence Segmentation.rst @@ -1,3 +1,3 @@ Sentence Segmentation -------- +------------------------- During the process of sentence segmentation, a long document is divided into several sentences. Each sentence is a "token" (to be realized). diff --git a/docs/source/tutorial/en/tokenize/Tokenization.rst b/docs/source/tutorial/en/tokenize/Tokenization.rst index fad25912..c955602b 100644 --- a/docs/source/tutorial/en/tokenize/Tokenization.rst +++ b/docs/source/tutorial/en/tokenize/Tokenization.rst @@ -1,5 +1,5 @@ Tokenization -------- +-------------- Tokenization is comprehensive analysis. In this process, sentences with formulas are segmented into several markers. Each marker is a "token". We provide some encapsulated tokenizers for users to call them conveniently. The following is a complete list of tokenizers. diff --git a/docs/source/tutorial/en/tokenize/WordSegmentation.rst b/docs/source/tutorial/en/tokenize/WordSegmentation.rst index a85f4dae..181f0b80 100644 --- a/docs/source/tutorial/en/tokenize/WordSegmentation.rst +++ b/docs/source/tutorial/en/tokenize/WordSegmentation.rst @@ -1,5 +1,5 @@ Word segmentation -------- +--------------------- Text-tokenization: A sentence (without formulas) consists of several "words" in order. The process of dividing a sentence into several words is called "Text-tokenization". According to the granularity of "words", it can be subdivided into "Word-tokenization" and "Char-tokenization". diff --git a/docs/source/tutorial/en/vectorization.rst b/docs/source/tutorial/en/vectorization.rst index 5b744eeb..e48fe730 100644 --- a/docs/source/tutorial/en/vectorization.rst +++ b/docs/source/tutorial/en/vectorization.rst @@ -1,5 +1,5 @@ Vectorization -========= +================== This section provides a simple interface to convert the incoming items into vectors directly. Currently, the option of whether to use the pre training model is provided. You can choose according to your needs. If you don't want to use the pre-trained model, you can call D2V directly, or call get_pretrained_i2v function if you want to use the pre-trained model. @@ -20,14 +20,14 @@ Overview Flow Don't use the pre-trained model: call existing models directly ------------------------------------- +-------------------------------------------------------------------------- You can use any pre-trained model provided by yourself (just give the storage path of the model) to convert the given question text into vectors. * Advantages: it is flexible to use your own model and its parameters can be adjusted freely. Import modules -++++++++++ ++++++++++++++++++++++++ :: @@ -105,7 +105,7 @@ Contents: the combination of each question segmentation sequences in one questio 0.05325712, -0.02097339, -0.01613594, 0.02904145, 0.0185046 ,... Specific process of processing -++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ 1.Call get_tokenizer function to get the result after word segmentation; @@ -124,7 +124,7 @@ Use the pre-training model provided by EduNLP to convert the given question text * Call this function to obtain the corresponding pre-training model. 
At present, the following pre training models are provided: d2v_all_256, d2v_sci_256, d2v_eng_256 and d2v_lit_256. Selection and Use of Models -################## +#################################### Select the pre-training model according to the subject: @@ -142,7 +142,7 @@ Select the pre-training model according to the subject: The concrete process of processing -################## +#################################### 1.Download the corresponding preprocessing model diff --git a/docs/source/tutorial/en/vectorization/WithPre-trainedModel.rst b/docs/source/tutorial/en/vectorization/WithPre-trainedModel.rst index 41dcab64..844fdd3b 100644 --- a/docs/source/tutorial/en/vectorization/WithPre-trainedModel.rst +++ b/docs/source/tutorial/en/vectorization/WithPre-trainedModel.rst @@ -1,5 +1,5 @@ Use the pre-training model: call get_pretrained_i2v directly ---------------------------------------------- +-------------------------------------------------------------------- Use the pre-training model provided by EduNLP to convert the given question text into vectors. @@ -10,7 +10,7 @@ Use the pre-training model provided by EduNLP to convert the given question text * Call this function to obtain the corresponding pre-training model. At present, the following pre training models are provided: d2v_all_256, d2v_sci_256, d2v_eng_256 and d2v_lit_256. Selection and use of models -################## +#################################### Select the pre-training model according to the subject: @@ -27,7 +27,7 @@ Select the pre-training model according to the subject: +--------------------+------------------------+ The concrete process of processing -################## +#################################### 1.Download the corresponding preprocessing model diff --git a/docs/source/tutorial/en/vectorization/WithoutPre-trainedModel.rst b/docs/source/tutorial/en/vectorization/WithoutPre-trainedModel.rst index 2989f8ba..62ce6155 100644 --- a/docs/source/tutorial/en/vectorization/WithoutPre-trainedModel.rst +++ b/docs/source/tutorial/en/vectorization/WithoutPre-trainedModel.rst @@ -1,12 +1,12 @@ Don't use the pre-trained model: call existing models directly ------------------------------------- +---------------------------------------------------------------- You can use any pre-trained model provided by yourself (just give the storage path of the model) to convert the given question text into vectors. * Advantages: it is flexible to use your own model and its parameters can be adjusted freely. 
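A compact hedged sketch of this self-trained route end to end (the model path is a placeholder; ``D2V``'s signature and the tuple return follow the examples shown earlier in these docs):

::

    >>> from EduNLP.I2V import D2V
    >>> model_path = "path/to/your_own_d2v_model.bin"  # placeholder: point to your trained model
    >>> i2v = D2V("text", "d2v", filepath=model_path, pretrained_t2v=False)
    >>> item_vector, token_vector = i2v(["已知集合$A=\\{x \\mid x^{2}-3 x-4<0\\}$,则$A$="])  # doctest: +SKIP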
Specific process of processing -++++++++++++++++++++ +++++++++++++++++++++++++++++++++++ 1.Call the get_tokenizer function to get the result after word segmentation; From 3d9e4acf40456e2e04ebcc85acb9bc519571c79c Mon Sep 17 00:00:00 2001 From: fannazya Date: Sun, 7 Nov 2021 20:06:14 +0800 Subject: [PATCH 6/6] update English version tutorial --- docs/source/tutorial/en/parse.rst | 4 +- docs/source/tutorial/en/pretrain.rst | 4 +- docs/source/tutorial/en/sif.rst | 2 +- docs/source/tutorial/en/vectorization.rst | 89 ++++++++++++----------- docs/source/tutorial/zh/vectorization.rst | 86 +++++++++++----------- 5 files changed, 93 insertions(+), 92 deletions(-) diff --git a/docs/source/tutorial/en/parse.rst b/docs/source/tutorial/en/parse.rst index 22542da2..5aba283d 100644 --- a/docs/source/tutorial/en/parse.rst +++ b/docs/source/tutorial/en/parse.rst @@ -43,7 +43,7 @@ Formula: firstly, in the word segmentation function, the formula of the original This module also provides the function of formula variable standardization, such as determining whether 'x' in several sub formulas is the same variable. -Call the library +Import modules +++++++++++++++++++++ :: @@ -233,7 +234,7 @@ Introduction of main content * Use the _is_formula_legal function to check the completeness and analyzability of latex formulas, and report an error for formulas that do not conform to latex syntax. -Call the library +Import modules >>>>>>>>>>>>>>>>>>> :: diff --git a/docs/source/tutorial/en/pretrain.rst b/docs/source/tutorial/en/pretrain.rst index 985cffde..58105f44 100644 --- a/docs/source/tutorial/en/pretrain.rst +++ b/docs/source/tutorial/en/pretrain.rst @@ -67,8 +67,8 @@ First level version: Second level version: -* Minor subjects(Chinese,Math,English,History,Geography,Politics,Biology,Physics,Chemistry) -* Major subjects(science, arts and all subject) +* Single subject(Chinese,Math,English,History,Geography,Politics,Biology,Physics,Chemistry) +* Multiple subjects(science, arts and all subjects) Third level version【to be finished】: diff --git a/docs/source/tutorial/en/sif.rst b/docs/source/tutorial/en/sif.rst index be5c895d..7650aae6 100644 --- a/docs/source/tutorial/en/sif.rst +++ b/docs/source/tutorial/en/sif.rst @@ -91,7 +91,7 @@ Non-standard Format: Functions for judging whether text is in SIF format and converting to SIF format -------------------------------------------------------------------------------------------------- -Call the Library +Import modules ++++++++ :: diff --git a/docs/source/tutorial/en/vectorization.rst b/docs/source/tutorial/en/vectorization.rst index e48fe730..eb59a34c 100644 --- a/docs/source/tutorial/en/vectorization.rst +++ b/docs/source/tutorial/en/vectorization.rst @@ -19,6 +19,51 @@ Overview Flow 4.Use the existing or pre-trained model we provided to convert the tokenized items into vectors. +Use the pre-training model: call get_pretrained_i2v directly +--------------------------------------------- + +Use the pre-training model provided by EduNLP to convert the given question text into vectors. + +* Advantages: Simple and convenient. + +* Disadvantages: Only the models given in the project can be used, which is quite limiting. + +* Call this function to obtain the corresponding pre-training model. At present, the following pre-training models are provided: d2v_all_256, d2v_sci_256, d2v_eng_256 and d2v_lit_256.
+ +Selection and Use of Models +#################################### + +Select the pre-training model according to the subject: + ++-------------------------+--------------------------------+ +| Pre-training model name | Subject of model training data | ++=========================+================================+ +| d2v_all_256             | all subjects                   | ++-------------------------+--------------------------------+ +| d2v_sci_256             | Science                        | ++-------------------------+--------------------------------+ +| d2v_lit_256             | Arts                           | ++-------------------------+--------------------------------+ +| d2v_eng_256             | English                        | ++-------------------------+--------------------------------+ + + +The concrete process of processing +#################################### + +1.Download the corresponding pre-trained model + +2.Transfer the obtained model to D2V and process it with D2V + +Examples: + +:: + + >>> i2v = get_pretrained_i2v("d2v_sci_256") + >>> i2v(item) + + Don't use the pre-trained model: call existing models directly -------------------------------------------------------------------------- @@ -111,47 +156,3 @@ Specific process of processing 2.Select the model provided for vectorization depending on the model used. - - -Use the pre-training model: call get_pretrained_i2v --------------------------------------------- - -Use the pre-training model provided by EduNLP to convert the given question text into vectors. - -* Advantages: Simple and convenient. - -* Disadvantages: Only the model given in the project can be used, which has great limitations. - -* Call this function to obtain the corresponding pre-training model. At present, the following pre training models are provided: d2v_all_256, d2v_sci_256, d2v_eng_256 and d2v_lit_256. - -Selection and Use of Models -#################################### - -Select the pre-training model according to the subject: - -+--------------------+------------------------+ -| Pre-training model name | Subject of model training data | -+====================+========================+ -| d2v_all_256 | all subject | -+--------------------+------------------------+ -| d2v_sci_256 | Science | -+--------------------+------------------------+ -| d2v_lit_256 | Arts | -+--------------------+------------------------+ -| d2v_eng_256 | English | -+--------------------+------------------------+ - - -The concrete process of processing -#################################### - -1.Download the corresponding preprocessing model - -2.Transfer the obtained model to D2V and process it with D2V - Convert the obtained model into D2V and process it through D2V - -Examples: - -:: - - >>> i2v = get_pretrained_i2v("d2v_sci_256") - >>> i2v(item) diff --git a/docs/source/tutorial/zh/vectorization.rst b/docs/source/tutorial/zh/vectorization.rst index 8c57cac7..aff364ff 100644 --- a/docs/source/tutorial/zh/vectorization.rst +++ b/docs/source/tutorial/zh/vectorization.rst @@ -19,6 +19,49 @@ 4.使用已有或者使用提供的预训练模型,将令牌化后的item转换为向量。 +使用预训练模型:直接调用get_pretrained_i2v +--------------------------------------------- + +使用 EduNLP 项目组给定的预训练模型将给定的题目文本转成向量。 + +* 优点:简单方便。 + +* 缺点:只能使用项目中给定的模型,局限性较大。 + +* 调用此函数即可获得相应的预训练模型,目前提供以下的预训练模型:d2v_all_256、d2v_sci_256、d2v_eng_256、d2v_lit_256 + +模型选择与使用 +################## + +根据题目所属学科选择预训练模型: + ++--------------------+------------------------+ +| 预训练模型名称 | 模型训练数据的所属学科 | ++====================+========================+ +| d2v_all_256 | 全学科 | ++--------------------+------------------------+ +| d2v_sci_256 | 理科 | ++--------------------+------------------------+ +| d2v_lit_256 | 文科 |
++--------------------+------------------------+ +| d2v_eng_256 | 英语 | ++--------------------+------------------------+ + +处理的具体流程 +################## + +1.下载相应的预处理模型 + +2.将所得到的模型传入D2V,使用D2V进行处理 + +Examples: + +:: + + >>> i2v = get_pretrained_i2v("d2v_sci_256") + >>> i2v(item) + + 不使用预训练模型:直接调用已有模型 ------------------------------------ @@ -110,46 +153,3 @@ T2V 1.调用get_tokenizer函数,得到经过分词后的结果; 2.根据使用的模型,选择提供的模型类型,进行向量化处理。 - - -使用预训练模型:直接调用get_pretrained_i2v ---------------------------------------------- - -使用 EduNLP 项目组给定的预训练模型将给定的题目文本转成向量。 - -* 优点:简单方便。 - -* 缺点:只能使用项目中给定的模型,局限性较大。 - -* 调用此函数即可获得相应的预训练模型,目前提供以下的预训练模型:d2v_all_256、d2v_sci_256、d2v_eng_256、d2v_lit_256 - -模型选择与使用 -################## - -根据题目所属学科选择预训练模型: - -+--------------------+------------------------+ -| 预训练模型名称 | 模型训练数据的所属学科 | -+====================+========================+ -| d2v_all_256 | 全学科 | -+--------------------+------------------------+ -| d2v_sci_256 | 理科 | -+--------------------+------------------------+ -| d2v_lit_256 | 文科 | -+--------------------+------------------------+ -| d2v_eng_256 | 英语 | -+--------------------+------------------------+ - -处理的具体流程 -################## - -1.下载相应的预处理模型 - -2.将所得到的模型传入D2V,使用D2V进行处理 - -Examples: - -:: - - >>> i2v = get_pretrained_i2v("d2v_sci_256") - >>> i2v(item)