diff --git a/AUTHORS.md b/AUTHORS.md index 4188b5a6..73b40ab2 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -12,5 +12,6 @@ [Longhu Qin](https://github.com/KenelmQLH) +[Meikai Bao](https://github.com/BAOOOOOM) -The stared contributors are the corresponding authors. \ No newline at end of file +The stared contributors are the corresponding authors. diff --git a/README.md b/README.md index ed75a0e5..fab37193 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,14 @@ pip install EduNLP pip install EduNLP[full] ``` +### Usage + +```python +from EduNLP import get_pretrained_i2v +i2v = get_pretrained_i2v("d2v_all_256", "./model") +item_vector, token_vector = i2v(["the content of item 1", "the content of item 2"]) +``` + ### Tutorial For more details, please refer to the full documentation ([latest](https://edunlp.readthedocs.io/en/latest) | [stable](https://edunlp.readthedocs.io/en/stable)). diff --git a/asset/_static/d2v.png b/asset/_static/d2v.png new file mode 100644 index 00000000..71d6994e Binary files /dev/null and b/asset/_static/d2v.png differ diff --git a/asset/_static/d2v_bow_tfidf.png b/asset/_static/d2v_bow_tfidf.png new file mode 100644 index 00000000..c5215160 Binary files /dev/null and b/asset/_static/d2v_bow_tfidf.png differ diff --git a/asset/_static/d2v_general.png b/asset/_static/d2v_general.png new file mode 100644 index 00000000..524bd157 Binary files /dev/null and b/asset/_static/d2v_general.png differ diff --git a/asset/_static/d2v_stem_tf.png b/asset/_static/d2v_stem_tf.png new file mode 100644 index 00000000..4cb22522 Binary files /dev/null and b/asset/_static/d2v_stem_tf.png differ diff --git a/asset/_static/data.png b/asset/_static/data.png new file mode 100644 index 00000000..b6c9daa1 Binary files /dev/null and b/asset/_static/data.png differ diff --git a/asset/_static/formula.png b/asset/_static/formula.png new file mode 100644 index 00000000..3cabf913 Binary files /dev/null and b/asset/_static/formula.png differ diff --git 
a/asset/_static/i2v.png b/asset/_static/i2v.png new file mode 100644 index 00000000..3da11cd0 Binary files /dev/null and b/asset/_static/i2v.png differ diff --git a/asset/_static/parse.png b/asset/_static/parse.png new file mode 100644 index 00000000..fd345f20 Binary files /dev/null and b/asset/_static/parse.png differ diff --git a/asset/_static/prepare_dataset.jpg b/asset/_static/prepare_dataset.jpg new file mode 100644 index 00000000..e82d5c42 Binary files /dev/null and b/asset/_static/prepare_dataset.jpg differ diff --git a/asset/_static/seg.png b/asset/_static/seg.png new file mode 100644 index 00000000..a04de8bc Binary files /dev/null and b/asset/_static/seg.png differ diff --git a/asset/_static/sif.png b/asset/_static/sif.png new file mode 100644 index 00000000..30c7cfef Binary files /dev/null and b/asset/_static/sif.png differ diff --git a/asset/_static/sif_addition.png b/asset/_static/sif_addition.png new file mode 100644 index 00000000..db7ccfdc Binary files /dev/null and b/asset/_static/sif_addition.png differ diff --git a/asset/_static/tokenizer.png b/asset/_static/tokenizer.png new file mode 100644 index 00000000..f074449c Binary files /dev/null and b/asset/_static/tokenizer.png differ diff --git a/asset/_static/w2v_stem_text.png b/asset/_static/w2v_stem_text.png new file mode 100644 index 00000000..069f1468 Binary files /dev/null and b/asset/_static/w2v_stem_text.png differ diff --git a/asset/_static/w2v_stem_tf.png b/asset/_static/w2v_stem_tf.png new file mode 100644 index 00000000..1d628bb4 Binary files /dev/null and b/asset/_static/w2v_stem_tf.png differ diff --git a/docs/requirements.txt b/docs/requirements.txt index 7f8b9b23..5a185c62 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -2,4 +2,5 @@ sphinx sphinx_rtd_theme sphinx_toggleprompt sphinx-gallery>=0.6 -nbsphinx \ No newline at end of file +nbsphinx +m2r2 diff --git a/docs/source/conf.py b/docs/source/conf.py index 1605600f..9d6a118b 100644 --- a/docs/source/conf.py +++ 
b/docs/source/conf.py @@ -46,14 +46,34 @@ def copy_tree(src, tar): 'sphinx.ext.mathjax', 'sphinx_toggleprompt', 'nbsphinx', - 'sphinx_gallery.load_style' + 'sphinx_gallery.load_style', + 'm2r2', + 'IPython.sphinxext.ipython_console_highlighting', + 'IPython.sphinxext.ipython_directive' ] # extension variables setting # npsphinx nbsphinx_thumbnails = { - 'build/blitz/sif/sif': '_static/item_figure.png', + 'build/blitz/sif/sif': '_static/sif.png', + 'build/blitz/sif/sif_addition': '_static/sif_addition.png', + 'build/blitz/utils/data': '_static/data.png', + 'build/blitz/formula/formula': '_static/formula.png', + 'build/blitz/seg/seg': '_static/seg.png', + 'build/blitz/parse/parse': '_static/parse.png', + 'build/blitz/formula/formula': '_static/formula.png', + 'build/blitz/tokenizer/tokenizer': '_static/tokenizer.png', + 'build/blitz/vectorization/i2v': '_static/i2v.png', + 'build/blitz/pretrain/prepare_dataset': '_static/prepare_dataset.jpg', + 'build/blitz/pretrain/gensim/d2v_bow_tfidf': '_static/d2v_bow_tfidf.png', + 'build/blitz/pretrain/gensim/d2v_general': '_static/d2v_general.png', + 'build/blitz/pretrain/gensim/d2v_stem_tf': '_static/d2v_stem_tf.png', + 'build/blitz/pretrain/gensim/w2v_stem_text': '_static/w2v_stem_text.png', + 'build/blitz/pretrain/gensim/w2v_stem_tf': '_static/w2v_stem_tf.png', + 'build/blitz/pretrain/seg_token/d2v': '_static/d2v.png', + 'build/blitz/pretrain/seg_token/d2v_d1': '_static/d2v_d1.png', + 'build/blitz/pretrain/seg_token/d2v_d2': '_static/d2v_d2.png', } # Add any paths that contain templates here, relative to this directory. @@ -62,7 +82,7 @@ def copy_tree(src, tar): # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # -source_suffix = ['.rst', '.md', '.ipynb'] +source_suffix = ['.rst', '.md'] # source_suffix = '.rst' # The language for content autogenerated by Sphinx. 
Refer to documentation @@ -75,7 +95,7 @@ def copy_tree(src, tar): # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns = ['_build'] +exclude_patterns = ['_build','**.ipynb_checkpoints'] # -- Options for HTML output ------------------------------------------------- diff --git a/docs/source/index.rst b/docs/source/index.rst index 9c66ae39..13f8b2c6 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -84,6 +84,16 @@ But you can also install from source: Getting Started ------------------ + +One basic usage of EduNLP is to convert an item into a vector, i.e., + +.. code-block:: python + + from EduNLP import get_pretrained_i2v + i2v = get_pretrained_i2v("d2v_all_256", "./model") + item_vector, token_vector = i2v(["the content of item 1", "the content of item 2"]) + + For absolute beginners, start with the :doc:`Tutorial to EduNLP ` :doc:`(中文版) `. It covers the basic concepts of EduNLP and a step-by-step on training, loading and using the language models. diff --git a/docs/source/tutorial/zh/index.rst b/docs/source/tutorial/zh/index.rst index dce32fc2..18ce7d4e 100644 --- a/docs/source/tutorial/zh/index.rst +++ b/docs/source/tutorial/zh/index.rst @@ -11,13 +11,138 @@ tokenize vectorization - 示例 -------- + +标准项目格式 +^^^^^^^^ + +.. nbgallery:: + :caption: This is a thumbnail gallery: + :name: sif_gallery + :glob: + + Code for beginner to learn how to use SIF4Sci <../../build/blitz/sif/sif> + Code for beginner to learn how to use sif_additon <../../build/blitz/sif/sif_addition> + + +成分分解 +^^^^^^^^^^^ + +语义成分分解 +#################### + +.. nbgallery:: + :caption: This is a thumbnail gallery: + :name: dict2str4sif_gallery + :glob: + + Code for beginner to learn how to use dict2str4sif <../../build/blitz/utils/data.ipynb> + + +结构成分分解 +#################### + +.. 
nbgallery:: + :caption: This is a thumbnail gallery: + :name: seg_gallery + :glob: + + Code for beginner to learn how to use seg <../../build/blitz/seg/seg.ipynb> + + +语法解析 +^^^^^^^^^^^ + +文本语法结构解析 +#################### + +.. nbgallery:: + :caption: This is a thumbnail gallery: + :name: parse_gallery + :glob: + + Code for beginner to learn how to use parse <../../build/blitz/parse/parse.ipynb> + + +公式语法结构解析 +#################### + +.. nbgallery:: + :caption: This is a thumbnail gallery: + :name: formula_gallery + :glob: + + Code for beginner to learn how to use Formula <../../build/blitz/formula/formula.ipynb> + + +令牌化 +^^^^^^^^^^^ + +.. nbgallery:: + :caption: This is a thumbnail gallery: + :name: tokenizer_gallery + :glob: + + Code for beginner to learn how to use Tokenizer <../../build/blitz/tokenizer/tokenizer.ipynb> + + +向量化 +^^^^^^^^^^^ + +.. nbgallery:: + :caption: This is a thumbnail gallery: + :name: vectorization_gallery + :glob: + + Code for beginner to learn how to use i2v <../../build/blitz/vectorization/i2v.ipynb> + + +预训练 +^^^^^^^^^^^ + +获得数据集 +#################### + +.. nbgallery:: + :caption: This is a thumbnail gallery: + :name: rst1-gallery + :glob: + + prepare_dataset <../../build/blitz/pretrain/prepare_dataset.ipynb> + + +gensim模型d2v例子 +#################### + +.. nbgallery:: + :caption: This is a thumbnail gallery: + :name: rst2-gallery + :glob: + + d2v_general <../../build/blitz/pretrain/gensim/d2v_general.ipynb> + d2v_bow_tfidf <../../build/blitz/pretrain/gensim/d2v_bow_tfidf.ipynb> + d2v_stem_tf <../../build/blitz/pretrain/gensim/d2v_stem_tf.ipynb> + + +gensim模型w2v例子 +#################### + +.. nbgallery:: + :caption: This is a thumbnail gallery: + :name: rst3-gallery + :glob: + + w2v_stem_text <../../build/blitz/pretrain/gensim/w2v_stem_text.ipynb> + w2v_stem_tf <../../build/blitz/pretrain/gensim/w2v_stem_tf.ipynb> + + +seg_token例子 +#################### + .. 
nbgallery:: :caption: This is a thumbnail gallery: - :name: gallery + :name: rst4-gallery :glob: - :reversed: - ../../build/blitz/sif/sif \ No newline at end of file + d2v.ipynb <../../build/blitz/pretrain/seg_token/d2v.ipynb> diff --git a/docs/source/tutorial/zh/parse.rst b/docs/source/tutorial/zh/parse.rst index 380d3f4c..9d6ea22e 100644 --- a/docs/source/tutorial/zh/parse.rst +++ b/docs/source/tutorial/zh/parse.rst @@ -6,5 +6,31 @@ * 文本语法结构解析 * 公式语法结构解析 -公式语法结构解析 +其目的是: + + +1、将选择题中的括号,填空题中的下划线用特殊标识替换掉,并将字符、公式用$$包裹起来,使item能通过$符号准确的按照类型切割开; + +2、判断当前item是否合法,并报出错误类型。 + +具体处理内容 +-------------------- + +1.匹配公式之外的英文字母、数字,只对两个汉字之间的字母、数字做修正,其余匹配到的情况视为不合 latex 语法录入的公式 + +2.匹配“( )”型括号(包含英文格式和中文格式),即括号内无内容或为空格的括号,将括号替换$\\SIFChoice$ + +3.匹配下划线,替换连续的下划线或下划线中夹杂空格的情况,将其替换为$\\SIFBlank$ + +4.匹配latex公式,主要检查latex公式的完整性和可解析性,对latex 中出现中文字符发出警告 + +学习路线图 -------------------- + +.. toctree:: + :maxdepth: 1 + :titlesonly: + + 文本语法结构解析 + 公式语法结构解析 + diff --git "a/docs/source/tutorial/zh/parse/\345\205\254\345\274\217\350\257\255\346\263\225\347\273\223\346\236\204\350\247\243\346\236\220.rst" "b/docs/source/tutorial/zh/parse/\345\205\254\345\274\217\350\257\255\346\263\225\347\273\223\346\236\204\350\247\243\346\236\220.rst" new file mode 100644 index 00000000..1a7717fb --- /dev/null +++ "b/docs/source/tutorial/zh/parse/\345\205\254\345\274\217\350\257\255\346\263\225\347\273\223\346\236\204\350\247\243\346\236\220.rst" @@ -0,0 +1,61 @@ +公式语法结构解析 +-------------------- + +本功能主要由EduNLP.Formula模块实现,具有检查传入的公式是否合法,并将合法的公式转换为art树的形式。从实际使用的角度,本模块常作为中间处理过程,调用相应的模型即可自动选择本模块的相关参数,故一般不需要特别关注。 + +主要内容介绍 ++++++++++++++++ + +1.Formula:对传入的单个公式进行判断,判断传入的公式是否为str形式,如果是则使用ast的方法进行处理,否则进行报错。此外,提供了variable_standardization参数,当此参数为True时,使用变量标准化方法,即同一变量拥有相同的变量编号。 + +2.FormulaGroup:如果需要传入公式集则可调用此接口,最终将形成ast森林,森林中树的结构同Formula。 + + +Examples: + +:: + + >>> text = '支持公式如$\\frac{y}{x}$,$\\SIFBlank$,$\\FigureID{1}$,不支持公式如$\\frac{ \\dddot y}{x}$' + >>> text_parser = Parser(text) + >>> 
text_parser.description_list() + >>> text_parser.fomula_illegal_flag + >>> 1 + +:: + + >>> f = Formula("x") + >>> f + + >>> f.ast + [{'val': {'id': 0, 'type': 'mathord', 'text': 'x', 'role': None}, 'structure': {'bro': [None, None], 'child': None, 'father': None, 'forest': None}}] + >>> f.elements + [{'id': 0, 'type': 'mathord', 'text': 'x', 'role': None}] + >>> f.variable_standardization(inplace=True) + + >>> f.elements + [{'id': 0, 'type': 'mathord', 'text': 'x', 'role': None, 'var': 0}] + +:: + + >>> fg = FormulaGroup(["x + y", "y + x", "z + x"]) + >>> fg + ;;> + >>> fg = FormulaGroup(["x + y", Formula("y + x"), "z + x"]) + >>> fg + ;;> + >>> fg = FormulaGroup(["x", Formula("y"), "x"]) + >>> fg.elements + [{'id': 0, 'type': 'mathord', 'text': 'x', 'role': None}, {'id': 1, 'type': 'mathord', 'text': 'y', 'role': None},\ + {'id': 2, 'type': 'mathord', 'text': 'x', 'role': None}] + >>> fg = FormulaGroup(["x", Formula("y"), "x"], variable_standardization=True) + >>> fg.elements + [{'id': 0, 'type': 'mathord', 'text': 'x', 'role': None, 'var': 0}, {'id': 1, 'type': 'mathord', 'text': 'y', 'role': None, 'var': 1}, {'id': 2, 'type': 'mathord', 'text': 'x', 'role': None, 'var': 0}] + +详细示范 ++++++++++++++++ + +.. 
toctree:: + :titlesonly: + + 树型处理效果 <../../../build/blitz/formula/tree.ipynb> + 公式解析效果案例 <../../../build/blitz/formula/formula.ipynb> diff --git "a/docs/source/tutorial/zh/parse/\346\226\207\346\234\254\350\257\255\346\263\225\347\273\223\346\236\204\350\247\243\346\236\220.rst" "b/docs/source/tutorial/zh/parse/\346\226\207\346\234\254\350\257\255\346\263\225\347\273\223\346\236\204\350\247\243\346\236\220.rst" new file mode 100644 index 00000000..f2f442a0 --- /dev/null +++ "b/docs/source/tutorial/zh/parse/\346\226\207\346\234\254\350\257\255\346\263\225\347\273\223\346\236\204\350\247\243\346\236\220.rst" @@ -0,0 +1,39 @@ +文本语法结构解析 +-------------------- + +本部分主要由EduNLP.SIF.Parse模块实现,主要功能为将文本中的字母、数字等进行提取,将其转换为标准格式。 + +主要流程介绍 ++++++++++++++++ + +1.按照以下顺序,先后对传入的文本进行判断类型 + +* is_chinese:用于匹配中文字符 [\u4e00-\u9fa5] + +* is_alphabet:匹配公式之外的英文字母,将匹配到的只对两个汉字之间的字母做修正(使用$$包裹起来),其余匹配到的情况视为不合 latex 语法录入的公式 + +* is_number:匹配公式之外的数字,只对两个汉字之间的数字做修正(使用$$包裹起来),其余匹配到的情况视为不合 latex 语法录入的公式 + +2.匹配 latex 公式 + +* latex 中出现中文字符,打印且只打印一次 warning + +* 使用_is_formula_legal函数,检查latex公式的完整性和可解析性,对于不合法公式报错 + +Examples: + +:: + + >>> text = '生产某种零件的A工厂25名工人的日加工零件数_ _' + >>> text_parser = Parser(text) + >>> text_parser.description_list() + >>> text_parser.text + >>> '生产某种零件的$A$工厂$25$名工人的日加工零件数$\\SIFBlank$' + +详细示范 ++++++++++++++++ + +.. toctree:: + :titlesonly: + + 文本语法结构解析的案例 <../../../build/blitz/parse/parse.ipynb> diff --git a/docs/source/tutorial/zh/pretrain.rst b/docs/source/tutorial/zh/pretrain.rst index 0dbee20a..477717a4 100644 --- a/docs/source/tutorial/zh/pretrain.rst +++ b/docs/source/tutorial/zh/pretrain.rst @@ -8,12 +8,13 @@ * 如何加载预训练模型 * 公开的预训练模型 +学习路线图 +------------------ -训练模型 ---------- +.. 
toctree:: + :maxdepth: 1 + :titlesonly: -装载模型 --------- - -公开模型一览 ------------- + 训练模型 + 装载模型 + 公开模型一览 diff --git a/docs/source/tutorial/zh/pretrain/loading.rst b/docs/source/tutorial/zh/pretrain/loading.rst new file mode 100644 index 00000000..d930674b --- /dev/null +++ b/docs/source/tutorial/zh/pretrain/loading.rst @@ -0,0 +1,11 @@ +装载模型 +-------- + +将所得到的模型传入I2V模块即可装载模型 + +Examples: + +:: + + >>> model_path = "../test_model/test_gensim_luna_stem_tf_d2v_256.bin" + >>> i2v = D2V("text","d2v",filepath=model_path, pretrained_t2v = False) diff --git a/docs/source/tutorial/zh/pretrain/pub.rst b/docs/source/tutorial/zh/pretrain/pub.rst new file mode 100644 index 00000000..3139910f --- /dev/null +++ b/docs/source/tutorial/zh/pretrain/pub.rst @@ -0,0 +1,85 @@ +公开模型一览 +------------ + +版本说明 +################## + +一级版本 + +* 公开版本1(luna_pub):高考 +* 公开版本2(luna_pub_large):高考 + 地区试题 + +二级版本: + +* 小科(Chinese,Math,English,History,Geography,Politics,Biology,Physics,Chemistry) +* 大科(理科science、文科literal、全科all) + +三级版本:【待完成】 + +* 不使用第三方初始化词表 +* 使用第三方初始化词表 + + + +模型命名规则:一级版本 + 二级版本 + gensim_luna_stem + 分词规则 + 模型方法 + 维度 + +Examples: + +:: + + 全量版本-全学科的D2V模型路径: + `/share/qlh/d2v_model/luna_pub/luna_pub_all_gensim_luna_stem_general_d2v_256.bin` + (备注:一个D2V模型含4个bin后缀的文件) + +模型训练数据说明 +################## + +* 当前【词向量w2v】【句向量d2v】模型所用的数据均为 【高中学段】 的题目 +* 测试数据:`[OpenLUNA.json] `_ + +当前提供以下模型,更多分学科、分题型模型正在训练中,敬请期待 + "d2v_all_256"(全科),"d2v_sci_256"(理科),"d2v_eng_256"(英语),"d2v_lit_256"(文科) + +模型训练案例 +------------ + +获得数据集 +#################### + +.. toctree:: + :maxdepth: 1 + :titlesonly: + + prepare_dataset <../../../build/blitz/pretrain/prepare_dataset.ipynb> + +gensim模型d2v例子 +#################### + +.. 
toctree:: + :maxdepth: 1 + :titlesonly: + + d2v_bow_tfidf <../../../build/blitz/pretrain/gensim/d2v_bow_tfidf.ipynb> + d2v_general <../../../build/blitz/pretrain/gensim/d2v_general.ipynb> + d2v_stem_tf <../../../build/blitz/pretrain/gensim/d2v_stem_tf.ipynb> + +gensim模型w2v例子 +#################### + +.. toctree:: + :maxdepth: 1 + :titlesonly: + + w2v_stem_text <../../../build/blitz/pretrain/gensim/w2v_stem_text.ipynb> + w2v_stem_tf <../../../build/blitz/pretrain/gensim/w2v_stem_tf.ipynb> + +seg_token例子 +#################### + +.. toctree:: + :maxdepth: 1 + :titlesonly: + + d2v.ipynb <../../../build/blitz/pretrain/seg_token/d2v.ipynb> + d2v_d1 <../../../build/blitz/pretrain/seg_token/d2v_d1.ipynb> + d2v_d2 <../../../build/blitz/pretrain/seg_token/d2v_d2.ipynb> diff --git a/docs/source/tutorial/zh/pretrain/start.rst b/docs/source/tutorial/zh/pretrain/start.rst new file mode 100644 index 00000000..f87d6afa --- /dev/null +++ b/docs/source/tutorial/zh/pretrain/start.rst @@ -0,0 +1,24 @@ +训练模型 +------------ + +如需训练模型则可直接调用train_vector函数接口,来使训练模型更加方便。模块调用gensim库中的相关训练模型,目前提供了"sg"、 "cbow"、 "fasttext"、 "d2v"、 "bow"、 "tfidf"的训练方法,并提供了embedding_dim参数,使之可以按照需求确定向量的维度。 + +基本步骤 +################## + +1.确定模型的类型,选择适合的Tokenizer(GensimWordTokenizer、 GensimSegTokenizer),使之令牌化; + +2.调用train_vector函数,即可得到所需的预训练模型。 + +Examples: + +:: + + >>> tokenizer = GensimWordTokenizer(symbol="gmas", general=True) + >>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\ + ... 
若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$") + >>> print(token_item.tokens[:10]) + ['公式', '[FORMULA]', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[FORMULA]'] + + # 10 dimension with d2v method + train_vector(sif_items, "../../../data/w2v/gensim_luna_stem_tf_", 10, method="d2v") diff --git a/docs/source/tutorial/zh/seg.rst b/docs/source/tutorial/zh/seg.rst index c3bc9439..e1e1c0db 100644 --- a/docs/source/tutorial/zh/seg.rst +++ b/docs/source/tutorial/zh/seg.rst @@ -7,12 +7,20 @@ * 语义成分分解 * 结构成分分解 -语义成分分解 ------------- +主要处理内容 +-------------------- -结构成分分解 ------------- +1.将字典输入形式的选择题通过 `语法解析 `_ 转换为符合条件的item; +2.将输入的item按照元素类型进行切分、分组。 +学习路线图 +-------------------- +.. toctree:: + :maxdepth: 1 + :titlesonly: + + 语义成分分解 + 结构成分分解 diff --git "a/docs/source/tutorial/zh/seg/\347\273\223\346\236\204\346\210\220\345\210\206\345\210\206\350\247\243.rst" "b/docs/source/tutorial/zh/seg/\347\273\223\346\236\204\346\210\220\345\210\206\345\210\206\350\247\243.rst" new file mode 100644 index 00000000..13ae96ca --- /dev/null +++ "b/docs/source/tutorial/zh/seg/\347\273\223\346\236\204\346\210\220\345\210\206\345\210\206\350\247\243.rst" @@ -0,0 +1,53 @@ +结构成分分解 +------------ + +对切片后的item中的各个元素进行分词,提供深度选项,可以按照需求选择所有地方切分或者在部分标签处切分(比如\SIFSep、\SIFTag处);对标签添加的位置也可以进行选择,可以在头尾处添加或仅在头或尾处添加。 + +具有两种模式,一种是linear模式,用于对文本进行处理(使用jieba库进行分词);一种是ast模式,用于对公式进行解析。 + +基础使用方法 +++++++++++++++++++ + +:: + + >>> test_item = r"如图所示,则$\bigtriangleup ABC$的面积是$\SIFBlank$。$\FigureID{1}$" + >>> seg(test_item) + >>> ['如图所示,则', '\\bigtriangleup ABC', '的面积是', '\\SIFBlank', '。', \FigureID{1}] + +可选的额外参数/接口 +++++++++++++++++++++++ + +1.describe:可以统计出各种类型元素的数量 + +:: + + >>> s.describe() + {'t': 3, 'f': 1, 'g': 1, 'm': 1} + +2.filter:可以选择性的筛除某种或几种类型的元素 + +:: + + >>> with s.filter("f"): + ... s + ['如图所示,则', '的面积是', '\\SIFBlank', '。', \FigureID{1}] + >>> with s.filter(keep="t"): + ... 
s + ['如图所示,则', '的面积是', '。'] + +3.symbol:选择性的将部分类型的数据转换为特殊符号遮掩起来 + +:: + + >>> seg(test_item, symbol="fgm") + ['如图所示,则', '[FORMULA]', '的面积是', '[MARK]', '。', '[FIGURE]'] + >>> seg(test_item, symbol="tfgm") + ['[TEXT]', '[FORMULA]', '[TEXT]', '[MARK]', '[TEXT]', '[FIGURE]'] + +详细示范 ++++++++++++ + +.. toctree:: + :titlesonly: + + 结构成分分解的案例 <../../../build/blitz/seg/seg.ipynb> diff --git "a/docs/source/tutorial/zh/seg/\350\257\255\344\271\211\346\210\220\345\210\206\345\210\206\350\247\243.rst" "b/docs/source/tutorial/zh/seg/\350\257\255\344\271\211\346\210\220\345\210\206\345\210\206\350\247\243.rst" new file mode 100644 index 00000000..0950dd87 --- /dev/null +++ "b/docs/source/tutorial/zh/seg/\350\257\255\344\271\211\346\210\220\345\210\206\345\210\206\350\247\243.rst" @@ -0,0 +1,55 @@ +语义成分分解 +------------ + +由于选择题是以字典的形式给出,故需要将其在保留数据类型关系的情况下转换为文本格式。dict2str4sif函数就是实现此功能的一个模块,该模块可以将选择题形式的item转换为字符格式,并将题干和选项、各选项之间分割开来。 + + +基础使用方法 +++++++++++++++++++ + +:: + + >>> item = { + ... "stem": r"若复数$z=1+2 i+i^{3}$,则$|z|=$", + ... "options": ['0', '1', r'$\sqrt{2}$', '2'], + ... 
} + >>> dict2str4sif(item) # doctest: +ELLIPSIS + '$\\SIFTag{stem_begin}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem_end}$$\\SIFTag{options_begin}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$$\\SIFTag{list_3}$2$\\SIFTag{options_end}$' + +可选的的额外参数/接口 +++++++++++++++++++++++ + +1.add_list_no_tag:当此参数为True较False时区别在于是否需要将选项部分的标签计数 + +:: + + >>> dict2str4sif(item, add_list_no_tag=True) # doctest: +ELLIPSIS + '$\\SIFTag{stem_begin}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem_end}$$\\SIFTag{options_begin}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$$\\SIFTag{list_3}$2$\\SIFTag{options_end}$' + + >>> dict2str4sif(item, add_list_no_tag=False) # doctest: +ELLIPSIS + '$\\SIFTag{stem_begin}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem_end}$$\\SIFTag{options_begin}$0$\\SIFSep$1$\\SIFSep$$\\sqrt{2}$$\\SIFSep$2$\\SIFTag{options_end}$' + +2.tag_mode:此参数为选择标签所在位置,delimiter为头尾都加标签,head为仅头部加标签,tail为仅尾部加标签 + +:: + + >>> dict2str4sif(item, tag_mode="head") # doctest: +ELLIPSIS + '$\\SIFTag{stem}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{options}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$$\\SIFTag{list_3}$2' + + >>> dict2str4sif(item, tag_mode="tail") # doctest: +ELLIPSIS + '若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$$\\SIFTag{list_3}$2$\\SIFTag{options}$' + +3.key_as_tag:当其为False时则不区分切分标签的类型,而是仅在选项之间加入$\SIFSep$ + +:: + + >>> dict2str4sif(item, key_as_tag=False) + '若复数$z=1+2 i+i^{3}$,则$|z|=$0$\\SIFSep$1$\\SIFSep$$\\sqrt{2}$$\\SIFSep$2' + +详细示范 +++++++++++++++++++++++ + +.. 
toctree:: + :titlesonly: + + 语义成分分解的案例 <../../../build/blitz/utils/data.ipynb> diff --git a/docs/source/tutorial/zh/sif.rst b/docs/source/tutorial/zh/sif.rst index d838a59a..0bb9f2ae 100644 --- a/docs/source/tutorial/zh/sif.rst +++ b/docs/source/tutorial/zh/sif.rst @@ -1,2 +1,109 @@ 标准项目格式 -=============== \ No newline at end of file +=============== + +version: 0.2 + +为了后续研究和使用的方便,我们需要一个统一的试题语法标准。 + +语法规则 +----------- + +1. 题目文本中只允许出现中文字符、中英文标点和换行符。 + +2. 使用 \$\SIFBlank\$ 替换横线,对于选择题中的括号使用 \$\SIFChoice\$ 替换。 + +3. 图片 ID 以公式的形式嵌入文本中:``$\FigureID{ uuid }$`` 或用 base64 编码表示,特别的,内容为公式的图片用 ``$\FormFigureID{ uuid }$`` 表示。 + +4. 文本标注格式:统一用 ``$\textf{item,CHAR_EN}$`` 表示,目前定义的有:b-加粗,i-斜体,u-下划线,w-下划波浪线,d-加点,t-标题。标注可以混用,按字母顺序排序,例如:$\textf{EduNLP, b}$ 表示 **EduNLP** + +5. 其余诸如,英文字母、罗马字符、数字等数学符号一律需要使用 latex 格式表示,即嵌在 ``$$`` 之中。 + +6. 分子式的录入标准暂且参考 `INCHI `_ + +7. 目前对 latex 内部语法没有要求。 + +:: + + 1. Item -> CHARACTER|EN_PUN_LIST|CH_PUN_LIST|FORMULA|QUES_MARK + 2. EN_PUN_LIST -> [',', '.', '?', '!', ':', ';', '\'', '\"', '(', ')', ' ','_','/','|','\\','<','>','[',']','-'] + 3. CH_PUN_LIST -> [',', '。', '!', '?', ':',';', '‘', '’', '“', '”', '(', ')', ' ', '、','《','》','—','.'] + 4. FORMULA -> $latex formula$ | $\FormFigureID{UUID}$ | $\FormFigureBase64{BASE64}$ + 5. FIGURE -> $\FigureID{UUID}$ | $\FigureBase64{BASE64}$ + 6. UUID -> [a-zA-Z\-0-9]+ + 7. CHARACTER -> CHAR_EN | CHAR_CH + 8. CHAR_EN -> [a-zA-Z]+ + 9. CHAR_CH -> [\u4e00-\u9fa5]+ + 10. DIGITAL -> [0-9]+ + 11. QUES_MARK -> $\SIFBlank$ | $\SIFChoice$ + + +注意事项 ++++++++++++++++ + +1. 保留字符与转义 + +2. 数字 + +3. 选空与填空 + +4. 对于单个的数字或字符也需要添加 ``$$`` (目前能实现自动校验) + +5. latex 公式中尽量不出现中文:(``\text{这里出现中文}``) + +6. MySql 数据库导入数据时会自动忽略一个 ``\``,所以录入的公式需要进一步处理为 ``\\`` + +示例 +----------------- + +标准形式: + +:: + + 1. 若$x,y$满足约束条件$\\left\\{\\begin{array}{c}2 x+y-2 \\leq 0 \\\\ x-y-1 \\geq 0 \\\\ y+1 \\geq 0\\end{array}\\right.$,则$z=x+7 y$的最大值$\\SIFUnderline$' + + 2. 
已知函数$f(x)=|3 x+1|-2|x|$画出$y=f(x)$的图像求不等式$f(x)>f(x+1)$的解集$\\PictureID{3bf2ddf4-8af1-11eb-b750-b46bfc50aa29}$$\\PictureID{59b8bd14-8af1-11eb-93a5-b46bfc50aa29}$$\\PictureID{63118b3a-8b75-11eb-a5c0-b46bfc50aa29}$$\\PictureID{6a006179-8b76-11eb-b386-b46bfc50aa29}$$\\PictureID{088f15eb-8b7c-11eb-a86f-b46bfc50aa29}$ + +非标准形式: + +1. 字母、数字和数学符号连续混合出现: + + 例如: + + ``完成下面的2x2列联表,`` + + ``(单位:m3)`` + + ``则输出的n=`` + +2. 特殊的数学符号没有用 latex 公式表示: + + 例如: + + ``命题中真命题的序号是 ①`` + + ``AB是⊙O的直径,AC是⊙O的切线,BC交⊙O于点E.若D为AC的中点`` + +3. 出现以 unicode 编码写成的字符 + + 例如:``则$a$的取值范围是(\u3000\u3000)`` + + +Change Log +---------------- + +2021-05-18 + +修改: + +1. 原用 \$\SIFUnderline\$ 和 \$\SIFBracket\$ 来替换填空题中的横线和选择题中的括号,现分别用 \$\SIFBlank\$ 和 \$\SIFChoice\$ 替换。 + +2. 原统一用 ``$\PictureID{ uuid }$`` 表示图片,现使用 ``$\FigureID{ uuid }$`` ,其中对于数据公式,用 ``$\FormFigureID{ uuid }$`` 来表示。 + +2021-06-28 + +添加: + +1. 注明 ``$$`` 之中不能出现换行符。 + +2. 添加文本标注格式说明。 + diff --git a/docs/source/tutorial/zh/tokenization/GensimSegTokenizer.rst b/docs/source/tutorial/zh/tokenization/GensimSegTokenizer.rst new file mode 100644 index 00000000..f1a66d77 --- /dev/null +++ b/docs/source/tutorial/zh/tokenization/GensimSegTokenizer.rst @@ -0,0 +1,9 @@ +GensimSegTokenizer +===================== + +此令牌解析器在默认情况下对传入的item中的图片、分隔符、题目空缺符等部分则转换成特殊字符进行保护,从而对文本、公式、标签进行令牌化操作。此外,从令牌化方法而言,此令牌解析器对文本均采用线性的分析方法,而对公式采用抽象语法树的分析方法。 + +与GensimWordTokenizer相比,GensimSegTokenizer解析器主要区别是: + +* 提供了切分深度的选项,即可以在sep标签或者tag标签处进行切割 +* 默认在item组分(如text、formula)的头部插入开始标签 diff --git a/docs/source/tutorial/zh/tokenization/GensimWordTokenizer.rst b/docs/source/tutorial/zh/tokenization/GensimWordTokenizer.rst new file mode 100644 index 00000000..e8924e21 --- /dev/null +++ b/docs/source/tutorial/zh/tokenization/GensimWordTokenizer.rst @@ -0,0 +1,20 @@ +GensimWordTokenizer +===================== + 
+此令牌解析器在默认情况下对传入的item中的图片、题目空缺符等部分转换成特殊字符进行保护,从而对文本、公式、标签、分隔符进行令牌化操作。此外,从令牌化方法而言,此令牌解析器对文本均采用线性的分析方法,而对公式采用抽象语法树的分析方法,提供了general参数可供使用者选择:当general为true的时候则代表着传入的item并非标准格式,此时对公式也使用线性的分析方法;当general为false时则代表使用抽象语法树的方法对公式进行解析。 + +Examples +---------- + +:: + + >>> tokenizer = GensimWordTokenizer(symbol="gmas", general=True) + >>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\ + ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$") + >>> print(token_item.tokens[:10]) + ['公式', '[FORMULA]', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[FORMULA]'] + >>> tokenizer = GensimWordTokenizer(symbol="fgmas", general=False) + >>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\ + ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$") + >>> print(token_item.tokens[:10]) + ['公式', '[FORMULA]', '如图', '[FIGURE]', '[FORMULA]', '约束条件', '公式', '[FORMULA]', '[SEP]', '[FORMULA]'] diff --git a/docs/source/tutorial/zh/tokenization/TextTokenizer.rst b/docs/source/tutorial/zh/tokenization/TextTokenizer.rst new file mode 100644 index 00000000..a17de29b --- /dev/null +++ b/docs/source/tutorial/zh/tokenization/TextTokenizer.rst @@ -0,0 +1,27 @@ +TextTokenizer +================ + +即文本令牌解析器,在默认情况下对传入的item中的图片、标签、分隔符、题目空缺符等部分则转换成特殊字符进行保护,从而对文本、公式进行令牌化操作。此外,此令牌解析器对文本、公式均采用线性的分析方法,并提供的key参数用于对传入的item进行预处理,待未来根据需求进行开发。 + + +Examples +---------- + +:: + + >>> items = ["已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$"] + >>> tokenizer = TextTokenizer() + >>> tokens = tokenizer(items) + >>> next(tokens) # doctest: +NORMALIZE_WHITESPACE + ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', + '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', + '\\quad', 'A', '\\cap', 'B', '='] + >>> items = [{ + ... 
"stem": "已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$", + ... "options": ["1", "2"] + ... }] + >>> tokens = tokenizer(items, key=lambda x: x["stem"]) + >>> next(tokens) # doctest: +NORMALIZE_WHITESPACE + ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', + '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', + '\\quad', 'A', '\\cap', 'B', '='] diff --git a/docs/source/tutorial/zh/tokenize.rst b/docs/source/tutorial/zh/tokenize.rst index 12855778..ce719757 100644 --- a/docs/source/tutorial/zh/tokenize.rst +++ b/docs/source/tutorial/zh/tokenize.rst @@ -5,19 +5,24 @@ 在EduNLP中我们将令牌化分为不同的粒度,为避免歧义,我们定义如下: * 词/字级别:分词 + * 句级别:分句 -* 资源级别:令牌化 -分词 -------- +* 资源级别:令牌化 -分句 -------- +本模块提供题目文本的令牌化解析(Tokenization),将题目转换成令牌序列,方便后续向量化表征试题。 -令牌化 -------- +在进入此模块前需要先后将item经过 `语法解析 `_ 和 `成分分解 `_ 处理,之后对切片后的item中的各个元素进行分词,提供深度选项,可以按照需求选择所有地方切分或者在部分标签处切分(比如\SIFSep、\SIFTag处);对标签添加的位置也可以进行选择,可以在头尾处添加或仅在头或尾处添加。 -我们提供了多种已经封装好的令牌化器供用户便捷调用,下面是一个示例 +具有两种模式,一种是linear模式,用于对文本进行处理(使用jieba库进行分词);一种是ast模式,用于对公式进行解析。 +学习路线图 +-------------------- -通过 可以查看更多令牌化器,下面是一个完整的令牌化器列表 +.. 
toctree:: + :maxdepth: 1 + :titlesonly: + + 分词 + 分句 + 令牌化 diff --git "a/docs/source/tutorial/zh/tokenize/\344\273\244\347\211\214\345\214\226.rst" "b/docs/source/tutorial/zh/tokenize/\344\273\244\347\211\214\345\214\226.rst" new file mode 100644 index 00000000..9782bece --- /dev/null +++ "b/docs/source/tutorial/zh/tokenize/\344\273\244\347\211\214\345\214\226.rst" @@ -0,0 +1,28 @@ +令牌化 +------- +即综合解析,将带公式的句子切分为若干标记的过程。每个标记为一个“令牌”(token)。 +我们提供了多种已经封装好的令牌化器供用户便捷调用,下面是一个示例: + +Examples + +:: + + >>> items = ["已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$"] + >>> tokenizer = TextTokenizer() + >>> tokens = tokenizer(items) + >>> next(tokens) # doctest: +NORMALIZE_WHITESPACE + ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', + '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', + '\\quad', 'A', '\\cap', 'B', '='] + + + +通过查看"./EduNLP/Tokenizer/tokenizer.py"及"./EduNLP/Pretrain/gensim_vec.py"可以查看更多令牌化器,下面是一个完整的令牌化器列表 + +.. 
toctree:: + :maxdepth: 1 + :titlesonly: + + ../tokenization/TextTokenizer + ../tokenization/GensimSegTokenizer + ../tokenization/GensimWordTokenizer diff --git "a/docs/source/tutorial/zh/tokenize/\345\210\206\345\217\245.rst" "b/docs/source/tutorial/zh/tokenize/\345\210\206\345\217\245.rst" new file mode 100644 index 00000000..67cf5679 --- /dev/null +++ "b/docs/source/tutorial/zh/tokenize/\345\210\206\345\217\245.rst" @@ -0,0 +1,4 @@ +分句 +------- + +将较长的文档切分成若干句子的过程称为“分句”。每个句子为一个“令牌”(token)(待实现)。 diff --git "a/docs/source/tutorial/zh/tokenize/\345\210\206\350\257\215.rst" "b/docs/source/tutorial/zh/tokenize/\345\210\206\350\257\215.rst" new file mode 100644 index 00000000..ec75b0cd --- /dev/null +++ "b/docs/source/tutorial/zh/tokenize/\345\210\206\350\257\215.rst" @@ -0,0 +1,36 @@ +分词 +------- + +词解析(text-tokenization):一个句子(不含公式)是由若干“词”按顺序构成的,将一个句子切分为若干词的过程称为“词解析”。根据词的粒度大小,又可细分为“词组解析”和"单字解析"。 + +:: + + - 词组解析 (word-tokenization):每一个词组为一个“令牌”(token)。 + + - 单字解析 (char-tokenization):单个字符即为一个“令牌”(token)。 + + +词解析分为两个主要步骤: + +1. 分词: + + - 词组解析:使用分词工具切分并提取题目文本中的词。本项目目前支持的分词工具有:`jieba` + + - 单字解析:按字符划分。 + +2. 筛选:过滤指定的停用词。 + + 本项目默认使用的停用词表:`[stopwords] `_ + 你也可以使用自己的停用词表,具体使用方法见下面的示例。 + +Examples: + +:: + + >>> text = "三角函数是基本初等函数之一" + >>> tokenize(text, granularity="word") + ['三角函数', '初等', '函数'] + + >>> tokenize(text, granularity="char") + ['三', '角', '函', '数', '基', '初', '函', '数'] + diff --git a/docs/source/tutorial/zh/vectorization.rst b/docs/source/tutorial/zh/vectorization.rst index c4be7cd1..89175ba6 100644 --- a/docs/source/tutorial/zh/vectorization.rst +++ b/docs/source/tutorial/zh/vectorization.rst @@ -1,2 +1,26 @@ 向量化 -======== +========= + +此部分提供了简便的接口,可以直接将传入的items经过转化得到向量。当前提供了是否使用预训练模型的选项,可根据需要进行选择,如不使用预训练模型则可直接调用D2V函数,使用预训练模型则调用get_pretrained_i2v函数。 + +总体流程 +--------------------------- + +1.对传入的item进行 `语法解析 `_ ,得到SIF格式; + +2.对sif_item进行 `成分分解 `_ ; + +3.对经过成分分解的item进行 `令牌化 `_; + +4.使用已有或者使用提供的预训练模型,将令牌化后的item转换为向量。 + +学习路线图 +--------------------------- + +.. 
toctree:: + :maxdepth: 1 + :titlesonly: + + 不使用预训练模型 + 使用预训练模型 + diff --git "a/docs/source/tutorial/zh/vectorization/\344\270\215\344\275\277\347\224\250\351\242\204\350\256\255\347\273\203\346\250\241\345\236\213.rst" "b/docs/source/tutorial/zh/vectorization/\344\270\215\344\275\277\347\224\250\351\242\204\350\256\255\347\273\203\346\250\241\345\236\213.rst" new file mode 100644 index 00000000..5a26588f --- /dev/null +++ "b/docs/source/tutorial/zh/vectorization/\344\270\215\344\275\277\347\224\250\351\242\204\350\256\255\347\273\203\346\250\241\345\236\213.rst" @@ -0,0 +1,22 @@ +不使用预训练模型:直接调用D2V +------------------------------------ + +使用自己提供的任一预训练模型(给出模型存放路径即可)将给定的题目文本转成向量。 + +* 优点:可以使用自己的模型,另可调整训练参数,灵活性强。 + + +处理的具体流程 +++++++++++++++++++++ + +1.调用get_tokenizer函数,得到经过分词后的结果; + +2.调用T2V模块,根据需要选择是否使用预训练的t2v模型 + +Examples: + +:: + + >>> model_path = "../test_model/test_gensim_luna_stem_tf_d2v_256.bin" + >>> i2v = D2V("text","d2v",filepath=model_path, pretrained_t2v = False) + >>> i2v(item) diff --git "a/docs/source/tutorial/zh/vectorization/\344\275\277\347\224\250\351\242\204\350\256\255\347\273\203\346\250\241\345\236\213.rst" "b/docs/source/tutorial/zh/vectorization/\344\275\277\347\224\250\351\242\204\350\256\255\347\273\203\346\250\241\345\236\213.rst" new file mode 100644 index 00000000..93d7a00b --- /dev/null +++ "b/docs/source/tutorial/zh/vectorization/\344\275\277\347\224\250\351\242\204\350\256\255\347\273\203\346\250\241\345\236\213.rst" @@ -0,0 +1,41 @@ +使用预训练模型:直接调用get_pretrained_i2v +--------------------------------------------- + +使用 EduNLP 项目组给定的预训练模型将给定的题目文本转成向量。 + +* 优点:简单方便。 + +* 缺点:只能使用项目中给定的模型,局限性较大。 + +* 调用此函数即可获得相应的预训练模型,目前提供以下的预训练模型:d2v_all_256、d2v_sci_256、d2v_eng_256、d2v_lit_256 + +模型选择与使用 +################## + +根据题目所属学科选择预训练模型: + ++--------------------+------------------------+ +| 预训练模型名称 | 模型训练数据的所属学科 | ++====================+========================+ +| d2v_all_256 | 全学科 | ++--------------------+------------------------+ +| d2v_sci_256 | 
理科 | ++--------------------+------------------------+ +| d2v_lit_256 | 文科 | ++--------------------+------------------------+ +| d2v_eng_256 | 英语 | ++--------------------+------------------------+ + +处理的具体流程 +################## + +1.下载相应的预处理模型 + +2.将所得到的模型传入D2V,使用D2V进行处理 + +Examples: + +:: + + >>> i2v = get_pretrained_i2v("d2v_sci_256") + >>> i2v(item) diff --git a/examples/formula/formula.ipynb b/examples/formula/formula.ipynb index 2ee49390..f748a90a 100644 --- a/examples/formula/formula.ipynb +++ b/examples/formula/formula.ipynb @@ -1,69 +1,166 @@ { "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Formula\n", + "\n", + "## 概述\n", + "\n", + "Formula 首先在分词功能中对原始文本的公式做切分处理,另外提供 [公式解析树] 功能,可以将数学公式的抽象语法分析树用文本或图片的形式表示出来。 \n", + "\n", + "本模块另提供公式变量标准化的功能,如判断几个子公式内的‘x’为同一变量。" + ], + "metadata": {} + }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": true - }, + "execution_count": 1, + "source": [ + "import matplotlib.pyplot as plt\n", + "from EduNLP.Formula import Formula\n", + "from EduNLP.Formula import FormulaGroup\n", + "from EduNLP.Formula.viz import ForestPlotter" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 公式语法结构分析\n", + "\n", + "### 初始化实例\n", + "\n", + "- item 类型:`str or List[Dict]` \n", + "- item 内容:latex 公式 或 公式经解析后产生的抽象语法分析树(abstracted syntax tree)" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 2, + "source": [ + "f = Formula(\"x^2 + x+1 = y\")\n", + "f " + ], "outputs": [ { + "output_type": "execute_result", "data": { - "text/plain": "[{'structure': {'bro': [None, 3],\n 'child': [1, 2],\n 'father': None,\n 'forest': None},\n 'val': {'id': 0, 'role': None, 'text': '^', 'type': 'supsub'}},\n {'structure': {'bro': [None, 2], 'child': None, 'father': 0, 'forest': None},\n 'val': {'id': 1, 'role': 'base', 'text': 'x', 'type': 'mathord'}},\n {'structure': {'bro': [1, None], 'child': None, 'father': 0, 'forest': None},\n 'val': {'id': 2, 'role': 
'sup', 'text': '2', 'type': 'textord'}},\n {'structure': {'bro': [0, 4], 'child': None, 'father': None, 'forest': None},\n 'val': {'id': 3, 'role': None, 'text': '+', 'type': 'bin'}},\n {'structure': {'bro': [3, 5], 'child': None, 'father': None, 'forest': None},\n 'val': {'id': 4, 'role': None, 'text': '1', 'type': 'textord'}},\n {'structure': {'bro': [4, 6], 'child': None, 'father': None, 'forest': None},\n 'val': {'id': 5, 'role': None, 'text': '=', 'type': 'rel'}},\n {'structure': {'bro': [5, None],\n 'child': None,\n 'father': None,\n 'forest': None},\n 'val': {'id': 6, 'role': None, 'text': 'y', 'type': 'mathord'}}]" + "text/plain": [ + "" + ] }, - "execution_count": 9, "metadata": {}, - "output_type": "execute_result" + "execution_count": 2 } ], + "metadata": { + "collapsed": true + } + }, + { + "cell_type": "markdown", "source": [ - "import matplotlib.pyplot as plt\n", - "from EduNLP.Formula import Formula\n", - "from EduNLP.Formula import FormulaGroup\n", - "from EduNLP.Formula.viz import ForestPlotter\n", - "\n", - "Formula(\"x^2 + 1 = y\")" - ] + "- 查看公式切分后的结点元素:" + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 3, + "source": [ + "f.elements" + ], "outputs": [ { + "output_type": "execute_result", "data": { - "text/plain": "NodeView((0, 1, 2, 3, 4, 5, 6))" + "text/plain": [ + "[{'id': 0, 'type': 'supsub', 'text': '\\\\supsub', 'role': None},\n", + " {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base'},\n", + " {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'},\n", + " {'id': 3, 'type': 'bin', 'text': '+', 'role': None},\n", + " {'id': 4, 'type': 'mathord', 'text': 'x', 'role': None},\n", + " {'id': 5, 'type': 'bin', 'text': '+', 'role': None},\n", + " {'id': 6, 'type': 'textord', 'text': '1', 'role': None},\n", + " {'id': 7, 'type': 'rel', 'text': '=', 'role': None},\n", + " {'id': 8, 'type': 'mathord', 'text': 'y', 'role': None}]" + ] }, - "execution_count": 10, "metadata": {}, - "output_type": 
"execute_result" + "execution_count": 3 } ], + "metadata": {} + }, + { + "cell_type": "markdown", "source": [ - "f = Formula(\"x^2 + 1 = y\", variable_standardization=True)\n", - "f.ast.nodes" + "- 查看公式的抽象语法分析树:" ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } + "metadata": {} }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 4, + "source": [ + "f.ast " + ], "outputs": [ { + "output_type": "execute_result", "data": { - "text/plain": "OutEdgeView([(1, 0), (2, 0)])" + "text/plain": [ + "[{'val': {'id': 0, 'type': 'supsub', 'text': '\\\\supsub', 'role': None},\n", + " 'structure': {'bro': [None, 3],\n", + " 'child': [1, 2],\n", + " 'father': None,\n", + " 'forest': None}},\n", + " {'val': {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base'},\n", + " 'structure': {'bro': [None, 2], 'child': None, 'father': 0, 'forest': None}},\n", + " {'val': {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'},\n", + " 'structure': {'bro': [1, None], 'child': None, 'father': 0, 'forest': None}},\n", + " {'val': {'id': 3, 'type': 'bin', 'text': '+', 'role': None},\n", + " 'structure': {'bro': [0, 4], 'child': None, 'father': None, 'forest': None}},\n", + " {'val': {'id': 4, 'type': 'mathord', 'text': 'x', 'role': None},\n", + " 'structure': {'bro': [3, 5], 'child': None, 'father': None, 'forest': None}},\n", + " {'val': {'id': 5, 'type': 'bin', 'text': '+', 'role': None},\n", + " 'structure': {'bro': [4, 6], 'child': None, 'father': None, 'forest': None}},\n", + " {'val': {'id': 6, 'type': 'textord', 'text': '1', 'role': None},\n", + " 'structure': {'bro': [5, 7], 'child': None, 'father': None, 'forest': None}},\n", + " {'val': {'id': 7, 'type': 'rel', 'text': '=', 'role': None},\n", + " 'structure': {'bro': [6, 8], 'child': None, 'father': None, 'forest': None}},\n", + " {'val': {'id': 8, 'type': 'mathord', 'text': 'y', 'role': None},\n", + " 'structure': {'bro': [7, None],\n", + " 'child': None,\n", + " 'father': 
None,\n", + " 'forest': None}}]" + ] }, - "execution_count": 11, "metadata": {}, - "output_type": "execute_result" + "execution_count": 4 } ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 13, "source": [ - "f.ast.edges" + "print('nodes: ',f.ast_graph.nodes)\n", + "print('edges: ' ,f.ast_graph.edges)\n" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "nodes: [0, 1, 2, 3, 4, 5, 6, 7, 8]\n", + "edges: [(0, 1), (0, 2)]\n" + ] + } ], "metadata": { "collapsed": false, @@ -74,25 +171,27 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 17, + "source": [ + "ForestPlotter().export(\n", + " f.ast_graph, root_list=[node[\"val\"][\"id\"] for node in f.ast if node[\"structure\"][\"father\"] is None],\n", + ")\n", + "plt.show()" + ], "outputs": [ { + "output_type": "display_data", "data": { - "text/plain": "
", - "image/png": "\n" + "text/plain": [ + "
" + ], + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAV0AAADnCAYAAAC9roUQAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAAAWAElEQVR4nO3dW3BT94HH8e9fV0u2ZRswGEPC/ZaEhBAIuW2apM10uk3KtLPdzs4+93ln9mH7ug/70vdt+7Az2Z3pdHZ220mbaTZt2iYQciOhBBICOKHcwTeChWTdjnR0zj7IdoFgY8vS/xzC7/NEIkv/3zmSf4j//38k4/s+IiJiRyToACIidxOVroiIRSpdERGLVLoiIhapdEVELFLpiohYpNIVEbFIpSsiYpFKV0TEIpWuiIhFKl0REYtUuiIiFql0RUQsUumKiFik0hURsUilKyJikUpXRMQila6IiEUqXRERi1S6IiIWqXRFRCxS6YqIWKTSFRGxSKUrImKRSldExCKVroiIRSpdERGLVLoiIhapdEVELIrZHCwST476bnWFzTFnY2KJMa/mDLTr8W0f662OJ0zne9rNOYPIGPZzNdtrM+jXVJjO0fXa/bvcasb3fXuDGeOv+dGr1saby/kfv4Dv+6Zdj2/7WG91PGE639NuzhlExrCfq9lem0G/psJ0jq7X7t/lVgv19EL57Eczf8598PINt/l1l+z+/+Tau/9tO1bLzHl8bpX8oVfI7nspNJkAnNG/cPX1n9iMdNtM4y//G4Xj+2xGAm6fK3/4t+Q/fBnfq4cik1erkD/0CuO//Fc8pxSKTL7vk93/X0z88WfUy3lrmYIU6tKtjp/DzY2TPfBznOGTeLXKzBNYHT9Lav0uYpnld+yTNdfxmViC+LJ7qZcnQ5PJr7vUrpwj1rsyNJkAIh3d+DXHaqbb5fIqBaojn4OJhiZTJN5BZvdeEoNbiCTTochkjMGvlfHrLpGEvUxBCnXpAjiXT9L14PMklq0hEu8gtW5n0JFaaq7jS617mOTgFvy6G4pM1fEz1AsTOJeOUy/lQpEJYNnf/hNepYAXQPHOlsv3PWI9A8SXrqY6djoUmQDc3BixHvvTsrNl8qoVkqvvJ735CWpfXLCeKwihL93kqq2UPnuX2tWLjb8hzxwGILF8HeUzh3Hz40RTmYBTNm+243MLE+Te/9/GL2zE7rul2TIlV26m5/G/J7n6fqLpnlBkqpfz5A7+knphgkg8aTXTXLmiqQwYqJz/mHjfYCgyAZROHSS98VGreebKZCJRKhc/pXzuCLGe5dZzBUELaW0S9KJHEBnmIwyLM2E/V1pIWxgtpImIyKwCKd1brfR+lcZrZYbr73f94lG7x52LmxujOPQOlQuf4IycmvXnpv8VtZAMQZ2nVo6/kNubOUfzHXMuN/8L18ZzNJd2vqbCxurFEdPc3CilUwdxs8N073yRyY9/jzM8RHJgMx1rH6L8lw/xvTrpzY/jZodJb34CgNzBXxFNZ8BESG3YTeHYG0TTGbxKEfw6sd4B6qUcHWsfpvTZeyRXbsTEUziXPiVbyZN5ZC/Rrr4gDrnpY3azw+Q//DXx/jVUx89RL0zgVcv4TomeJ37QtnGv/uGnRFMZTDyJm7/Ckud+yOTR1/DKk3Ss3YFz8RjJex+kevYIxRP7yezaS+Hj1yEaI5bpb2xJ8lxMLIHnFEJ/nhY7fu3KWcrnjuJmh/GqZRIrNuBcPE60q4/U+l1UR0/h5saZPPIaAB3rHsa5cIxY70o8p0g9PzbvjIvNevUPPyW1YTfOpRNgImQeecHauEG8psImkHe6sZ4B0pseI9Y3yORHr5LasJt470q6HnyeyoVjVMfOEO1aSj3/xcwTBhDrHcBzysDU9I3vAZDatAe/7hJfck/j9ql9kb7XuD3Rv47ObU/jjM7+N2i7NXvM0XQPmUe/S/W6v/07t/4NRONtHTeW6af7kReJpDIkl
q/Hr1cBcPNXiGX6Sd6znWiqm/TGPUS7luBcPknn9m+AaTw3nduexq+7dO98AWPm/zIL6jwtdvx4/zpSa3fgjHxONN2DV8qRXH0ftYnLxHqWkxjYRL0wQcfaHcT6VuKVcqQ27KZeuEpm13eIdHQvKOdissZ6lpMc2Ei9kCXa2Ys7edXOuAG9psImkOTRrj6KJ94itX4XzuWTxHsHqGVHyB/6DR33PEBixXp81yHWt5LiZ+/O3M+vlvDKeeL9ayh88kdqE5cAKJ08AL5PvVIgkujAzY3jla5ROXcEgOqVsxRPHCAxsDGIwwWaP+Z6KUf+w1/fmN3Mf82g2XGn95eaSBSMwasU8etuYz9lR/fUtrH81M4KQ3LVNorH/gTT/2w1hmjXUoon9uN789/yFtR5Wuz4kXiS8ulDJAc24TlFYktW4QwPkehfS+3aKPX8OCaeoHLuKG52hEi6B0yE+LJ7KRzf19R+7KafWyJE0r1Eu3oxxhDrWmpn3IBeU2ET6O6F8rmjeE6Rzi1PkvvgZXr2fG/Bj1k49idSG3YveAtTULsXWnHMt3K7Ffl2jbvYnLbP060y3CpHkOdrobsXbL2mwnSO5soZdoHM6U5Lrd0x8+dmn7Cu7d9oURo7WnHMd9K4zQo6b9DjL4ReU3eWUO5euPl2Nz9O+eyRto1nU7Or3LUvLrYjzm3Hne/trfZVOk/FE/vJHvh5uyKF7rmbz5hh+p20LdDdC37NoV6YINq9FDc3TrSrj1jPCqqjp6heOY+bvUx68xPEMsspnz1C5eKnVMfO0HHvdspnj9C59UmckVOUTr0/s3JaPHmAxPL1uLlxevZ8r/Hkei75w7/FGEP3zoWt1AZ97NNKpw/Rs+yeUOW69vYvGqvzmx4j1rUk8DwQzvPUed8zuG0smWYyFYfeoT61gJbe+hSx7oXN67Y6k+/Vmfzo/6Yue6/Rcc8DLc0TJoHuXnCGh4ikuvGcEunNj1P6/P3GRPzAJhL9a274ZQIaH7TiufhuY9VzenfC9Sun8d4B/Op1n6Dke/hene4d32psLQtYM8dePPEWzqXj5A+9suCV5nbmmlmdb3HhNpsnrOep3b4KmUwkCvg4l46TXH2/tZxBCHT3QnJwC155kviSQUpD79C1/Xmcy0N4lUmq42duWPmMpntwr41gEmnc3Dh+rULp8/cArls5vR+v5lAvXiO+dBWTR39HffILTCTK5NHfYSx+stJsmjn2zvu+RnL1/WR27235O5Jmc/m+d8PqfNB5IJznCaB85jDOpePUJi6HJlPn1qfI7N7btnPVTKbk6sbvr2li18mdJJDpha77n/3S/+u498GZPydXbQUgsXz9DbfXS7lb/rNjeirh5vuEUTPHDu1fqFhoLmMi9D75D6HJMy1s5wkgtf4RUusfCVWmdmsmk3PhGJ33fa394QIW6O6FhYgk07N+BqhWTkXufJlHvxt0BCtCcVnHQq4DLxz7E7kPXsYZOUXlwiftjtYSYTw+ZbpzM4U1VxgzhVHg73Sz+14iMbARNzuC51bpfuibQGNRZHrVM5JI07HmIUyscUmn5xRmLiHMvfc/M/fL7nuJWM9yMo/9HfkPfz1zXXm0M5jPWwjr8SnTnZsprLnCmCmsAn+nG0ln6Nz2NABd27+OMzwEcMOqZ2rDrpknChrXYJdOHpj57+n7JVZuJLV+F252pOnrylstjMenTHduprDmCmOmsAq8dK+PUDj2BsnBxgT7Dauepw/NbBMDvvQ33l/vF2l8Fo4xTV9X3nphPD5lunMzhTVXGDOFk745ok2C/pT/IDLMRxi+jSDs50rfHLEwd9pnL4Tgna6IyN3D6jvdSDw56rtV+19Fegsmlhjzas5Aux7f9rHe6njCdL6n3ZwziIxhP1ezvTaDfk2F6Rxdr92/y61mtXQXyhgTBa4A9/m+/6XLnowx3wb+2ff9r1sPJyLShLBPL+wARm9VuFPeBvYYYzrsRRIRaV7YS/c54M3ZbvR9Pw98C
jxuLZGIyCKEvXSfZY7SnfLm1M+JiIReaEvXGBMHngLeus2P7qPxjlhEJPRCW7rAbuC07/u3uxTlXWCHMabLQiYRkUUJc+nOZ2oB3/dLwGHgybYnEhFZpDCX7nM0pg7mQ1MMInJHCGXpTm0B2wMcuN3PTnkTla6I3AFCWbrAY8CnU1vC5uMDYKsxprd9kUREFi+spbuQqQV833eAg8DTbUskItICYS7d2y6i3URTDCISeqErXWNMJ43Lf9+9zY/eTBdJiEjoha50aVwQ8dHUVrCFOAysNcb0tyGTiEhLhLF0FzSfO833fZfGB+A80+pAIiKtEsbSfRZ4o8n7aopBREItVKU7teVrG40tYM3QYpqIhFqoSpfGlq+DU1vAmvEJ0G+MGWxhJhGRlglb6c7r8xZm4/u+B+xHUwwiElJhK91m9ufeTFMMIhJaoSndqa1ea2ls/VoMffiNiIRWaEqXxlavt6e2fi3GSSBljFm3+EgiIq0VptJd1HzuNL/x9cbaOiYioRSm0m3qoohZaIpBREIpFKU7tcWrH/i4RQ/5JvCcMca06PFERFoiFKVLYypg/9SWr1Y4A9SAzS16PBGRlghL6bZiq9iM6+Z1NcUgIqESptJt1XzuNM3rikjoBF66U1u7UjS2erXSPuAZY0zgxygiMi0MhfQs8ObUlEDL+L5/EcgCD7TycUVEFiMMpduOqYVpmmIQkVAJtHSntnS15KKIWegiCREJlaDf6W4GXBpbvNphP/C0MSbWpscXEVmQoEv3OWBfq+dzp/m+PwZcBh5ux+OLiCxUGEq3XVML07RfV0RCI7DSndrK9QztW0SbpnldEQmNIN/pPgBcm9ra1U5vAU8aYxJtHkdE5LaCLF0bUwv4vp8FPgcebfdYIiK3E0jpGmOitHer2M3eBJ6dGldEJDBBvdN9j0bpbmr3QMaYFNAH/AD4WbvHExGZS1ClmwfSNMqw3ao09gPfD5yzMJ6IyKyCKt0hGh9w8y/tHsj3/TqwF5gAPm33eCIiczFtui5BRERuIeiLI0RE7ioqXRERi+b8IJhIPDnqu9UVtsLMl4klxgDCkM3EEmNezRkIOoeI3BnmnNM1xvhrfvSqxTjzc/7HLwAQhmznf/wCvu/rW4dFZF40vSAiYlFbSrd89qOZP+c+ePlLt2f3vURx6J12DD2ruTLVSzlyB3/JxBv/YTWTiNx92lK61fFzuLlxsgd+jjN8Eq9WuaH0und+ux3DNp0pmu6h57HvE0l2Ws8lIneXtk0vOJdP0vXg8ySWrSES7yC1bme7hmpJpvLZIyRXtv2qZBG5y7WtdJOrtlL67F1qVy823lWeOTxzW3HoHcqn/4xXq7Rr+AVlqhez5D/4FbWJYXSxiIi0k3YvLJJ2L4jIQmj3goiIRfMq3VvtQFgsNzdGcegdKhc+wRk5NevPTb8Tv1WGZnPd7n6z3T5XFhGR+ZjXV5O7uVFKpw7iZofp3vkikx//Hmd4iOTAZjrWPkT5Lx/ie3XSmx/HzQ6T3vwEAFf/8FOiqQwmnsTNX2HJcz9k8uhreOVJOtbuwLl4jOS9D1I9e4Tiif1kdu2l8PHrEI0Ry/TjOSXwXEwsgecUWparduUs5XNHcbPDeNUyiRUbcC4eJ9rVR2r9Lqqjp3Bz40weeQ2AjnUP41w4Rqx3JZ5TpJ4fa9X5F5G7zLze6cZ6BkhveoxY3yCTH71KasNu4r0r6XrweSoXjlEdO0O0ayn1/BczxQYQy/TT/ciLRFIZEsvX49erALj5K8Qy/STv2U401U164x6iXUtwLp+kc/s3wDSmSDu3PY1fd+ne+QKN77FsTa54/zpSa3fgjHxONN2DV8qRXH0ftYnLxHqWkxjYRL0wQcfaHcT6VuKVcqQ27KZeuEpm13eIdHQv6qSLyN1rXqUb7eqjeOItUut34Vw+Sbx3gFp2hPyh39BxzwMkVqzHdx1ifSspfvbuX+849e04JhIFY/AqRfy6i193iXR041w6Tr2Uh
0gUMCRXbaN47E8wvbhnDNGupRRP7Mf33JblisSTlE8fIjmwCc8pEluyCmd4iET/WmrXRqnnxzHxBJVzR3GzI0TSPWAixJfdS+H4PurlyebPuIjc1Ra0e6F87iieU6Rzy5PkPniZnj3fs5HxS27evRBkLu1eEJGFWNDuhdTaHXRueRLgS8XW7OJUK8yWq5lM+T+/wtXXf0K9eK2lGUVEYJ4LafMxvajl1xzqhQmi3Utxc+NEu/qI9aygOnqK6pXzuNnLM/Or197+RWPxatNjxLqWtCrKojJldu2leOItvEqBaGdvyzOJyN2tZft0pxe1nOEhIqluPKdEevPjlD5/vzG/OrCJRP+aGxa0Zhav2lC4zWZy81eoF7PEl65uSyYRubu1rHSnF7WSg1vwypPElwxSGnqHru3P41wewqtMUh0/M7Og5fveDYtX7bDQTABXX/938H3c/JW2ZBKRu5suA14kLaSJyELoMmAREYsWVbo3v0uea7eA7UtnF5JNRMSWpncvZPe9RGJgI252BM+t0v3QNwEonnhrZqdAJJGmY81DmFgcPJf84d9ijCExuJXK+aPE+1ZRL1wFEyG1fheTR1/DROL0/s0/LuqgFpqt8Okb+DWHxMrNuNdG8WsV4svWUPj490Q7l9Dz+PcxscSiMomIwCLe6UbSGTq3PQ1A1/av4wwPAdywUyC1YVejcAHfq9O941uNq9KqZSIdGapfnCfWN4jnlBr3S3Y1ftatLe6gFpgtvmQ1nlMimspMZfUASA5uITGwkVp2eFF5RESmLWJ64a93LRx7g+TgVoAbdwqcPoTvNj5vwUSiTB79HSaZxs1faRRevY7nlPDrNZKrtjX2xnb1zZShrWxepQAmgucUcIaHcC6fAKBy8QSVi8eI9w0uMo+ISIN2L8zCzY3hjJyic+tTt82i3QsiMl8tuyLtqybWs4JYz4qgY4jIV8yc73Qj8eSo71ZD1zwmlhgDCEM2E0uMeTVnIOgcInJnmLN0RUSktXRxhIiIRSpdERGLVLoiIhapdEVELFLpiohYpNIVEbFIpSsiYpFKV0TEIpWuiIhFKl0REYtUuiIiFql0RUQsUumKiFik0hURsUilKyJikUpXRMQila6IiEUqXRERi1S6IiIWqXRFRCxS6YqIWKTSFRGxSKUrImKRSldExCKVroiIRSpdERGLVLoiIhapdEVELFLpiohYpNIVEbFIpSsiYpFKV0TEIpWuiIhFKl0REYtUuiIiFql0RUQsUumKiFik0hURsUilKyJikUpXRMQila6IiEUqXRERi1S6IiIWqXRFRCxS6YqIWKTSFRGxSKUrImKRSldExCKVroiIRSpdERGLVLoiIhapdEVELFLpiohYpNIVEbFIpSsiYpFKV0TEIpWuiIhFKl0REYtUuiIiFql0RUQsUumKiFik0hURsUilKyJikUpXRMQila6IiEUqXRERi1S6IiIWqXRFRCxS6YqIWKTSFRGxSKUrImKRSldExCKVroiIRSpdERGLVLoiIhapdEVELFLpiohYpNIVEbFIpSsiYpFKV0TEIpWuiIhFKl0REYtUuiIiFql0RUQsUumKiFik0hURsUilKyJikUpXRMQila6IiEUqXRERi1S6IiIWqXRFRCxS6YqIWKTSFRGxSKUrImKRSldExCKVroiIRSpdERGLVLoiIhapdEVELFLpiohY9P8QdEv7a4+pVQAAAABJRU5ErkJggg==" }, "metadata": { "needs_background": "light" - }, - "output_type": "display_data" + } } ], - "source": [ - "ForestPlotter().export(\n", - " f.ast, root_list=[node[\"val\"][\"id\"] for node in f.element if node[\"structure\"][\"father\"] is None],\n", - ")\n", - 
"plt.show()" - ], "metadata": { "collapsed": false, "pycharm": { @@ -100,22 +199,42 @@ } } }, + { + "cell_type": "markdown", + "source": [ + "## 变量标准化\n", + "\n", + "下面这个例子中,`var` 为变量编号。同一变量拥有相同的变量编号。 \n", + "如:`x` 变量的编号为 `0`, `y` 变量的编号为 `1`。" + ], + "metadata": {} + }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 20, + "source": [ + "f.variable_standardization().elements" + ], "outputs": [ { + "output_type": "execute_result", "data": { - "text/plain": "OutEdgeView([(1, 0), (2, 0)])" + "text/plain": [ + "[{'id': 0, 'type': 'supsub', 'text': '\\\\supsub', 'role': None},\n", + " {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base', 'var': 0},\n", + " {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'},\n", + " {'id': 3, 'type': 'bin', 'text': '+', 'role': None},\n", + " {'id': 4, 'type': 'mathord', 'text': 'x', 'role': None, 'var': 0},\n", + " {'id': 5, 'type': 'bin', 'text': '+', 'role': None},\n", + " {'id': 6, 'type': 'textord', 'text': '1', 'role': None},\n", + " {'id': 7, 'type': 'rel', 'text': '=', 'role': None},\n", + " {'id': 8, 'type': 'mathord', 'text': 'y', 'role': None, 'var': 1}]" + ] }, - "execution_count": 13, "metadata": {}, - "output_type": "execute_result" + "execution_count": 20 } ], - "source": [ - "f.ast.edges" - ], "metadata": { "collapsed": false, "pycharm": { @@ -123,22 +242,38 @@ } } }, + { + "cell_type": "markdown", + "source": [ + "## 方程组结构解析\n", + "\n", + "调用 `FormulaGroup` 类解析公式方程组,相关的属性和函数方法同上。" + ], + "metadata": {} + }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 21, + "source": [ + "fs = FormulaGroup([\n", + " \"x^2 = y\",\n", + " \"x^3 = y^2\",\n", + " \"x + y = \\pi\"\n", + "])\n", + "fs" + ], "outputs": [ { + "output_type": "execute_result", "data": { - "text/plain": "[{'structure': {'bro': [None, 3],\n 'child': [1, 2],\n 'father': None,\n 'forest': None},\n 'val': {'id': 0, 'role': None, 'text': '^', 'type': 'supsub'}},\n {'structure': {'bro': [None, 2], 'child': None, 
'father': 0, 'forest': None},\n 'val': {'id': 1, 'role': 'base', 'text': 'x', 'type': 'mathord', 'var': 0}},\n {'structure': {'bro': [1, None], 'child': None, 'father': 0, 'forest': None},\n 'val': {'id': 2, 'role': 'sup', 'text': '2', 'type': 'textord'}},\n {'structure': {'bro': [0, 4], 'child': None, 'father': None, 'forest': None},\n 'val': {'id': 3, 'role': None, 'text': '+', 'type': 'bin'}},\n {'structure': {'bro': [3, 5], 'child': None, 'father': None, 'forest': None},\n 'val': {'id': 4, 'role': None, 'text': '1', 'type': 'textord'}},\n {'structure': {'bro': [4, 6], 'child': None, 'father': None, 'forest': None},\n 'val': {'id': 5, 'role': None, 'text': '=', 'type': 'rel'}},\n {'structure': {'bro': [5, None],\n 'child': None,\n 'father': None,\n 'forest': None},\n 'val': {'id': 6, 'role': None, 'text': 'y', 'type': 'mathord', 'var': 1}}]" + "text/plain": [ + ";;>" + ] }, - "execution_count": 14, "metadata": {}, - "output_type": "execute_result" + "execution_count": 21 } ], - "source": [ - "f.variable_standardization()" - ], "metadata": { "collapsed": false, "pycharm": { @@ -148,47 +283,199 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 22, + "source": [ + "fs.elements" + ], "outputs": [ { + "output_type": "execute_result", "data": { - "text/plain": "[[{'structure': {'bro': [None, 3],\n 'child': [1, 2],\n 'father': None,\n 'forest': None},\n 'val': {'id': 0, 'role': None, 'text': '^', 'type': 'supsub'}},\n {'structure': {'bro': [None, 2],\n 'child': None,\n 'father': 0,\n 'forest': [6, 12]},\n 'val': {'id': 1, 'role': 'base', 'text': 'x', 'type': 'mathord'}},\n {'structure': {'bro': [1, None], 'child': None, 'father': 0, 'forest': None},\n 'val': {'id': 2, 'role': 'sup', 'text': '2', 'type': 'textord'}},\n {'structure': {'bro': [0, 4], 'child': None, 'father': None, 'forest': None},\n 'val': {'id': 3, 'role': None, 'text': '=', 'type': 'rel'}},\n {'structure': {'bro': [3, None],\n 'child': None,\n 'father': None,\n 'forest': [10, 
14]},\n 'val': {'id': 4, 'role': None, 'text': 'y', 'type': 'mathord'}}],\n [{'structure': {'bro': [None, 8],\n 'child': [6, 7],\n 'father': None,\n 'forest': None},\n 'val': {'id': 5, 'role': None, 'text': '^', 'type': 'supsub'}},\n {'structure': {'bro': [None, 7],\n 'child': None,\n 'father': 5,\n 'forest': [1, 12]},\n 'val': {'id': 6, 'role': 'base', 'text': 'x', 'type': 'mathord'}},\n {'structure': {'bro': [6, None], 'child': None, 'father': 5, 'forest': None},\n 'val': {'id': 7, 'role': 'sup', 'text': '3', 'type': 'textord'}},\n {'structure': {'bro': [5, 9], 'child': None, 'father': None, 'forest': None},\n 'val': {'id': 8, 'role': None, 'text': '=', 'type': 'rel'}},\n {'structure': {'bro': [8, None],\n 'child': [10, 11],\n 'father': None,\n 'forest': None},\n 'val': {'id': 9, 'role': None, 'text': '^', 'type': 'supsub'}},\n {'structure': {'bro': [None, 11],\n 'child': None,\n 'father': 9,\n 'forest': [4, 14]},\n 'val': {'id': 10, 'role': 'base', 'text': 'y', 'type': 'mathord'}},\n {'structure': {'bro': [10, None], 'child': None, 'father': 9, 'forest': None},\n 'val': {'id': 11, 'role': 'sup', 'text': '2', 'type': 'textord'}}],\n [{'structure': {'bro': [None, 13],\n 'child': None,\n 'father': None,\n 'forest': [1, 6]},\n 'val': {'id': 12, 'role': None, 'text': 'x', 'type': 'mathord'}},\n {'structure': {'bro': [12, 14], 'child': None, 'father': None, 'forest': None},\n 'val': {'id': 13, 'role': None, 'text': '+', 'type': 'bin'}},\n {'structure': {'bro': [13, 15],\n 'child': None,\n 'father': None,\n 'forest': [4, 10]},\n 'val': {'id': 14, 'role': None, 'text': 'y', 'type': 'mathord'}},\n {'structure': {'bro': [14, 16], 'child': None, 'father': None, 'forest': None},\n 'val': {'id': 15, 'role': None, 'text': '=', 'type': 'rel'}},\n {'structure': {'bro': [15, None],\n 'child': None,\n 'father': None,\n 'forest': None},\n 'val': {'id': 16, 'role': None, 'text': '\\\\pi', 'type': 'mathord'}}]]" + "text/plain": [ + "[{'id': 0, 'type': 'supsub', 'text': '\\\\supsub', 
'role': None},\n", + " {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base'},\n", + " {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'},\n", + " {'id': 3, 'type': 'rel', 'text': '=', 'role': None},\n", + " {'id': 4, 'type': 'mathord', 'text': 'y', 'role': None},\n", + " {'id': 5, 'type': 'supsub', 'text': '\\\\supsub', 'role': None},\n", + " {'id': 6, 'type': 'mathord', 'text': 'x', 'role': 'base'},\n", + " {'id': 7, 'type': 'textord', 'text': '3', 'role': 'sup'},\n", + " {'id': 8, 'type': 'rel', 'text': '=', 'role': None},\n", + " {'id': 9, 'type': 'supsub', 'text': '\\\\supsub', 'role': None},\n", + " {'id': 10, 'type': 'mathord', 'text': 'y', 'role': 'base'},\n", + " {'id': 11, 'type': 'textord', 'text': '2', 'role': 'sup'},\n", + " {'id': 12, 'type': 'mathord', 'text': 'x', 'role': None},\n", + " {'id': 13, 'type': 'bin', 'text': '+', 'role': None},\n", + " {'id': 14, 'type': 'mathord', 'text': 'y', 'role': None},\n", + " {'id': 15, 'type': 'rel', 'text': '=', 'role': None},\n", + " {'id': 16, 'type': 'mathord', 'text': '\\\\pi', 'role': None}]" + ] }, - "execution_count": 15, "metadata": {}, - "output_type": "execute_result" + "execution_count": 22 } ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 23, "source": [ - "fs = FormulaGroup([\n", - " \"x^2 = y\",\n", - " \"x^3 = y^2\",\n", - " \"x + y = \\pi\"\n", - "])\n", - "fs" + "fs.ast" ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[{'val': {'id': 0, 'type': 'supsub', 'text': '\\\\supsub', 'role': None},\n", + " 'structure': {'bro': [None, 3],\n", + " 'child': [1, 2],\n", + " 'father': None,\n", + " 'forest': None}},\n", + " {'val': {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base'},\n", + " 'structure': {'bro': [None, 2],\n", + " 'child': None,\n", + " 'father': 0,\n", + " 'forest': [6, 12]}},\n", + " {'val': {'id': 2, 'type': 'textord', 'text': '2', 
'role': 'sup'},\n", + " 'structure': {'bro': [1, None], 'child': None, 'father': 0, 'forest': None}},\n", + " {'val': {'id': 3, 'type': 'rel', 'text': '=', 'role': None},\n", + " 'structure': {'bro': [0, 4], 'child': None, 'father': None, 'forest': None}},\n", + " {'val': {'id': 4, 'type': 'mathord', 'text': 'y', 'role': None},\n", + " 'structure': {'bro': [3, None],\n", + " 'child': None,\n", + " 'father': None,\n", + " 'forest': [10, 14]}},\n", + " {'val': {'id': 5, 'type': 'supsub', 'text': '\\\\supsub', 'role': None},\n", + " 'structure': {'bro': [None, 8],\n", + " 'child': [6, 7],\n", + " 'father': None,\n", + " 'forest': None}},\n", + " {'val': {'id': 6, 'type': 'mathord', 'text': 'x', 'role': 'base'},\n", + " 'structure': {'bro': [None, 7],\n", + " 'child': None,\n", + " 'father': 5,\n", + " 'forest': [1, 12]}},\n", + " {'val': {'id': 7, 'type': 'textord', 'text': '3', 'role': 'sup'},\n", + " 'structure': {'bro': [6, None], 'child': None, 'father': 5, 'forest': None}},\n", + " {'val': {'id': 8, 'type': 'rel', 'text': '=', 'role': None},\n", + " 'structure': {'bro': [5, 9], 'child': None, 'father': None, 'forest': None}},\n", + " {'val': {'id': 9, 'type': 'supsub', 'text': '\\\\supsub', 'role': None},\n", + " 'structure': {'bro': [8, None],\n", + " 'child': [10, 11],\n", + " 'father': None,\n", + " 'forest': None}},\n", + " {'val': {'id': 10, 'type': 'mathord', 'text': 'y', 'role': 'base'},\n", + " 'structure': {'bro': [None, 11],\n", + " 'child': None,\n", + " 'father': 9,\n", + " 'forest': [4, 14]}},\n", + " {'val': {'id': 11, 'type': 'textord', 'text': '2', 'role': 'sup'},\n", + " 'structure': {'bro': [10, None],\n", + " 'child': None,\n", + " 'father': 9,\n", + " 'forest': None}},\n", + " {'val': {'id': 12, 'type': 'mathord', 'text': 'x', 'role': None},\n", + " 'structure': {'bro': [None, 13],\n", + " 'child': None,\n", + " 'father': None,\n", + " 'forest': [1, 6]}},\n", + " {'val': {'id': 13, 'type': 'bin', 'text': '+', 'role': None},\n", + " 
'structure': {'bro': [12, 14],\n", + " 'child': None,\n", + " 'father': None,\n", + " 'forest': None}},\n", + " {'val': {'id': 14, 'type': 'mathord', 'text': 'y', 'role': None},\n", + " 'structure': {'bro': [13, 15],\n", + " 'child': None,\n", + " 'father': None,\n", + " 'forest': [4, 10]}},\n", + " {'val': {'id': 15, 'type': 'rel', 'text': '=', 'role': None},\n", + " 'structure': {'bro': [14, 16],\n", + " 'child': None,\n", + " 'father': None,\n", + " 'forest': None}},\n", + " {'val': {'id': 16, 'type': 'mathord', 'text': '\\\\pi', 'role': None},\n", + " 'structure': {'bro': [15, None],\n", + " 'child': None,\n", + " 'father': None,\n", + " 'forest': None}}]" + ] + }, + "metadata": {}, + "execution_count": 23 } - } + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 25, + "source": [ + "ForestPlotter().export(\n", + " fs.ast_graph, root_list=[node[\"val\"][\"id\"] for node in fs.ast if node[\"structure\"][\"father\"] is None],\n", + ")" + ], "outputs": [ { + "output_type": "execute_result", "data": { - "text/plain": "[[{'structure': {'bro': [None, 3],\n 'child': [1, 2],\n 'father': None,\n 'forest': None},\n 'val': {'id': 0, 'role': None, 'text': '^', 'type': 'supsub'}},\n {'structure': {'bro': [None, 2],\n 'child': None,\n 'father': 0,\n 'forest': [6, 12]},\n 'val': {'id': 1, 'role': 'base', 'text': 'x', 'type': 'mathord', 'var': 0}},\n {'structure': {'bro': [1, None], 'child': None, 'father': 0, 'forest': None},\n 'val': {'id': 2, 'role': 'sup', 'text': '2', 'type': 'textord'}},\n {'structure': {'bro': [0, 4], 'child': None, 'father': None, 'forest': None},\n 'val': {'id': 3, 'role': None, 'text': '=', 'type': 'rel'}},\n {'structure': {'bro': [3, None],\n 'child': None,\n 'father': None,\n 'forest': [10, 14]},\n 'val': {'id': 4, 'role': None, 'text': 'y', 'type': 'mathord', 'var': 1}}],\n [{'structure': {'bro': [None, 8],\n 'child': [6, 7],\n 'father': None,\n 'forest': None},\n 'val': {'id': 5, 'role': None, 'text': 
'^', 'type': 'supsub'}},\n {'structure': {'bro': [None, 7],\n 'child': None,\n 'father': 5,\n 'forest': [1, 12]},\n 'val': {'id': 6, 'role': 'base', 'text': 'x', 'type': 'mathord', 'var': 0}},\n {'structure': {'bro': [6, None], 'child': None, 'father': 5, 'forest': None},\n 'val': {'id': 7, 'role': 'sup', 'text': '3', 'type': 'textord'}},\n {'structure': {'bro': [5, 9], 'child': None, 'father': None, 'forest': None},\n 'val': {'id': 8, 'role': None, 'text': '=', 'type': 'rel'}},\n {'structure': {'bro': [8, None],\n 'child': [10, 11],\n 'father': None,\n 'forest': None},\n 'val': {'id': 9, 'role': None, 'text': '^', 'type': 'supsub'}},\n {'structure': {'bro': [None, 11],\n 'child': None,\n 'father': 9,\n 'forest': [4, 14]},\n 'val': {'id': 10, 'role': 'base', 'text': 'y', 'type': 'mathord', 'var': 1}},\n {'structure': {'bro': [10, None], 'child': None, 'father': 9, 'forest': None},\n 'val': {'id': 11, 'role': 'sup', 'text': '2', 'type': 'textord'}}],\n [{'structure': {'bro': [None, 13],\n 'child': None,\n 'father': None,\n 'forest': [1, 6]},\n 'val': {'id': 12, 'role': None, 'text': 'x', 'type': 'mathord', 'var': 0}},\n {'structure': {'bro': [12, 14], 'child': None, 'father': None, 'forest': None},\n 'val': {'id': 13, 'role': None, 'text': '+', 'type': 'bin'}},\n {'structure': {'bro': [13, 15],\n 'child': None,\n 'father': None,\n 'forest': [4, 10]},\n 'val': {'id': 14, 'role': None, 'text': 'y', 'type': 'mathord', 'var': 1}},\n {'structure': {'bro': [14, 16], 'child': None, 'father': None, 'forest': None},\n 'val': {'id': 15, 'role': None, 'text': '=', 'type': 'rel'}},\n {'structure': {'bro': [15, None],\n 'child': None,\n 'father': None,\n 'forest': None},\n 'val': {'id': 16, 'role': None, 'text': '\\\\pi', 'type': 'mathord'}}]]" + "text/plain": [ + "[Text(22.32, 181.2, 'id: 0\\ntype: supsub\\ntext: \\\\supsub\\nrole: None'),\n", + " Text(11.16, 108.72, 'id: 1\\ntype: mathord\\ntext: x\\nrole: base'),\n", + " Text(33.480000000000004, 108.72, 'id: 2\\ntype: 
textord\\ntext: 2\\nrole: sup'),\n", + " Text(55.8, 181.2, 'id: 3\\ntype: rel\\ntext: =\\nrole: None'),\n", + " Text(78.12, 181.2, 'id: 4\\ntype: mathord\\ntext: y\\nrole: None'),\n", + " Text(111.6, 181.2, 'id: 5\\ntype: supsub\\ntext: \\\\supsub\\nrole: None'),\n", + " Text(100.44, 108.72, 'id: 6\\ntype: mathord\\ntext: x\\nrole: base'),\n", + " Text(122.76, 108.72, 'id: 7\\ntype: textord\\ntext: 3\\nrole: sup'),\n", + " Text(145.08, 181.2, 'id: 8\\ntype: rel\\ntext: =\\nrole: None'),\n", + " Text(178.56, 181.2, 'id: 9\\ntype: supsub\\ntext: \\\\supsub\\nrole: None'),\n", + " Text(167.4, 108.72, 'id: 10\\ntype: mathord\\ntext: y\\nrole: base'),\n", + " Text(189.72, 108.72, 'id: 11\\ntype: textord\\ntext: 2\\nrole: sup'),\n", + " Text(212.04, 181.2, 'id: 12\\ntype: mathord\\ntext: x\\nrole: None'),\n", + " Text(234.36, 181.2, 'id: 13\\ntype: bin\\ntext: +\\nrole: None'),\n", + " Text(256.68, 181.2, 'id: 14\\ntype: mathord\\ntext: y\\nrole: None'),\n", + " Text(279.0, 181.2, 'id: 15\\ntype: rel\\ntext: =\\nrole: None'),\n", + " Text(301.32, 181.2, 'id: 16\\ntype: mathord\\ntext: \\\\pi\\nrole: None')]" + ] }, - "execution_count": 16, "metadata": {}, - "output_type": "execute_result" + "execution_count": 25 + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAV0AAADnCAYAAAC9roUQAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAAAU1klEQVR4nO3dWW+c133H8d95ZuNwJyVZFLVYtixLju3Ure0UKeJe1AHaAgZStEDQi170BfS2yAvoReEG6HLRl5CroldxgCBwbbR20sULHDu2KTm0ZUkWqZWSOFxmPb2YkTgcDWd7znPmmWe+H8CwuJ3nP3+e+fHZ5oyx1goA4Ecw7AIAYJwQugDgEaELAB4RugDgEaELAB4RugDgEaELAB4RugDgEaELAB4RugDgEaELAB4RugDgEaELAB4RugDgEaELAB4RugDgEaELAB4RugDgEaELAB4RugDgEaELAB4RugDgEaELAB4RugDgEaELAB4RugDgEaELAB4RugDgEaELAB4RugDgEaELAB4RugDgk7XW2X8mnV2XZF3+Z9LZ9WHV07ztKB7bII8xaT0Os/049CKqeeF67vmYy1HVHJfnnrOctNbKFWOMffxHbzgbT5K+fv01WWvNMOpp3nYUj63ddrpJWo/DbD8OvYhqXrieez7mclQ1x+W554q30wuVzVuSpJ1LH0mSSje+0uavf+Fr8wfWUdm8rc2Pfu58XEm69z//FnrcsHXsrL6v4rULQ62h+M1KLH7XO19+oO0Lvxra9quFDRU+fVulm5e0tfKu83HLG9dU+PTtSGq2tqZ7//vvzsfduvBLlW5ecj5u8dqFfc/FOEn72lDp+pfaWX1fQX5GteKWqtv3lJlf8rX5A+tITS/KVkrOx63ubCo1teig4nB1BJOzTh5fmBokK1vc8lpDuzpSM4e1s/p/w9v+9IJSUwvKHjmt8p1vnI+bml6ULRcjqbm0vqrMoZPOxzWpdKj5edC4lbvrCnJTA48bJc8X0vZOZaQmZ1W+u+53823qqG7eVJCbdD5ubee+qlt3ZKsVR2MPVkeQm1J18/ZQa7BWMpncEGrYX0eQyys17fsPYdOc2C2otHZRWyvvyqSzzset3r/laC4/OratFFW+fcX5uEFu0sH8fHTc9OJxVTZvhhw3Gt72dCef+s6+j7OPPansY0/62vyBdQS5KU0//33n4+aOPa3csadDjxu2jiA3pczi8aHWMHHiGU2ceMZrDe3qCHJTmn7u1aFtX5LmvvvDSMfNHDoR2dgTJ5+LZNwwOdBp3NzSUwOPGyXne7ou9l5dn4vpp6ZO2x7ksZXvrrcdc/fyx7K1at/jhamllcs+u+px1Ns+SJiafM2LbtvZu05xq6ft9yuKmpuNynMvLOd7ujur76myeFwygco3vlTQOL8y/dyrSs0e0fbFXymzsKzq9j1V7l2XLW7LZPPKHDrx8FC8ePUzBblJZ3uKXWv6/L+Umj2i0vpvlTl8avBxVt5RtbCh1Mwh1XYLSs0cUrWwIVvalmrV+uGZMcounVXx6ufKnXg2usfUR59dcNXjSLYdcS98zYtu26nt3FfhN28pNb2orU/+QxOPf1u54+2PMOJS8yg+98JyHrqpyXnJBDKpvaHTc489/HdmfknlO9+otluQrRQlWU2cel7Fq59JQSCTytS/0eGtbN1qCibnpFql66FZ13Hyswrys/UPalXVdjYVZHIK5h6TalVJRiadc3A+L359dtXjKLYddS98zYtu22k8CElSdvmcah0uqsWl5lF87oXlPHSnnnlFO6vvK3v8vGy1ovzpF/Z9PXv0jLJHz0hSfa+jVlVmYTnS847damr9OKpx8k+++PDfYc83xa3Prnocxbaj7oWvedFtO1PP/OHI1TyKz72wIrmQlj/zUv3/XRqRnjsaxebb6rUmX+O4ELc+D7M3w+6Fr8fucjujVnOcnnthRH4hrbZbUHVnU5JUvHahp1uoo
ripuVNdcR673+0ltceDbHOYvZCi60e3cWvF7UjGHdQwxm3+3fdyYdGnSC6kFSemFWQnVd26o9zyOVW37ys9v6TMkdMqrl1U+dZlpRsvjKhsXFNq5rDKNy8pu/SUZAJVN29pa+VdTZ3/npe60rNHtHvpIwX5WeWOnXU6tqxV+fYV5ZbOKjW9EPljSWqPB9nmMHvRrbbU5Jw2P/ipZl76gYI+72Hu1ufCr3+u3Mnn++5zVPM4qnnR7XdfunlJxcufyGTzmnnhT/oaO0rO93RTk/MyQUqpmUNSrVb/q9u4QFGfXFa14pZspaTKxrX6D9maFASytarKN75yXVLXuuolVOt1OB47s3BMpbUvnAVut+0ltceDbHOYvehWm62W92pxOK4JUsounxvoD1tU8ziqedHtd//gomncRHIh7YF2J6wnTjyriabbNXYufbT/HM2Zl12X1FNd7W6ydjF2rVxUbtntiyTGsceDbnNYveiltkFfKNHLY3Y9bph5HNW86DZueuawUmcXlFlYHmj8qHh5GXDrObO77/5E26vvSfJ7UrxTHVGNG2Ryykf4xO6ljiT02FUdSehFlD0+aOyw83gYvUjPHY1d4EoRvgx488OfyWQnlMrX11gorV3U5PlXlFk4Vr9B+fInMiZQrbyr6v2byhw+peKVTzX9O3/c5t7D4dSR9Mfnqo441BCXOqKqIcrHNmo1x+H3HEZkoZtePC5b3pVkZCtFpRePS6a+bKVJZ6VUWjJGRkaZI6elWlWpmUN7N6rHvI6kP75RqyEudYzifBu1muPwew4jstBtPox7cH9d89d8HeZFVUfSH9+o1RCXOkZxvo1azXH4PYfBe6QBgE9O3/snBu9Z5bIe3iON90gbRg1RzD3eI8393B/4OWOb7peLmjHmY0l/ba390BhzXtIb1tp4Lno5oowxP5Z0x1r7942Pv5b0R9ba1eFW5p8x5l8lXbTW/kvj45uSvm2tXRtuZRhn3k4vGGNSkp6SdLHxqVVJJ4wxw3pLgaQ6J2ml6eOVxufGEb1A7Pg8p3tK0m1rbUGSrLVlSV+rHsRw57yk5neivND43DiiF4gdn6F7TvufAGp8zJ6HI8aYrOp/3JpPJYxlj40x05IWJV1u+vRY9gLx4jt0V1o+x+GeW09KumKtbV69elx7/LSkL6zd96L+ce0FYsRn6LYe6kkc7rlGj/fQC8QSe7rJ0q7H30iaMsbM+y9nqNr14itJy8aYiSHUA0iKyZ6uMY3X8CGsR3ps6/cEXtT4/XFr14uy6sHrbkFfoE9eQtcYMytpTtLV5s9ba29Jqkga/ioUydBu704azyMKeoFY8rWne071m9TbrVTMeTYHGkcL7Y4mpDHrsTEmUP1C2sU2Xx6rXiB+fIZuuzCQuI3HlcOSjKSbbb42bj0+KWnDWtvujbjGrReIGZ+h2+5QT+Jwz5VzklZs+9d1j1uPmW+ILV+he9Bhr8ThniudevyFpDONl2KPg67zjYu3GBb2dJPjwB5ba7clXZd02mdBQ9SpF3ck7Upa8loR0BB56Db2rs6q/UUNSfpSLHzjQqe9O2m8jijoBWLLx57uKUm3rLVb7b7IwjfOdLpYKY3XBSR6gdjyEbqdTi08wCmGEA5Y6KbVWPTYGDOjRxe6aTUWvUA8+Qjdbod6Eod7YZ3RowvdtBqXHrdb6KbVuPQCMcSebjLQ4z30ArHGnm4y9NLjaxqPhW966cWDhW/yHuoB9onVni73Tg6sa48bL5oYhwtIvfSiovpdM1y8hXeRhm5joZtZ1ZcXPJC19rbqC98cjbKeBOtl704ajyMKeoFYi3pP95y6X9R4YBz2wpxrHB10u0XqgUT3uLHQTad7wpsluheIr6hD97y6n1p4gIsbgznc+H+7hW5arSjZe3cnVX/7+XYL3bRivmEofOzp9rIHJnG4N6jzki4csNBNq6Tv3fV6akFivmFIfIQue7rR6qfHX0h60hiTjrCeYeqnFxfExVsMgY/TC+x5R
KvnHo/Bwjf99OLBwjfHIq0IaBFZ6DYWunlKvV3UkOq38Bxn4Zu+9XMKR0r2EUW/vUj66RbEUJR7uo9LunnQQjetGgvfXBL3Tvarn0NqKdlHFP32Isl/gBBTUYZuv08AKdmB4FyPC920SmTQNC10c6WPH2O+wbsoQ7ef87kPJDIQInRG0mVrbamPn0lq0Dytg9/89CDMN3jHnu5o4w/bnkF6wXyDd3Hb0+XCRn/6vXAkSWuS8saYhQjqGaZBevGVpGMsfAOfot7THWgvjHsne9b30UTjRRQXlbw/boP0goVv4F0koWuMmZM0oy4L3bRi4Zu+DXI0ISXzFMOgveAUA7yKak/3nPq/qPFAEgPBuaaFbvo9by4lLGiaFrrhDxBiL8rQHSQMpIQFQoSONP5/a4CfTVrQnFJ9oZvCAD/LfINXUYXuoId6UvICISrnJK30uNBNq6QFTZg/8sw3eMWe7ugK84ftC0lPJGjhmzC9YOEbeBXHPV1uG+vNIHeHSJKstTuS1iU94bSi4QnTiw2x8A08ch66jYVuzqj3hW5aPVj4ZsJdVYkU5mhCStZhNb3AyIhiT/e0pBuNZQT7xsI3PQtzNCEl6zQOvcDIiCJ0Bz7Ua8KeRweN5S9Pqr+Fblol4jROY6GbefW30E0r5hu8iSp0wxzqSex5dDPIQjetkhI0/bz56UGYb/AmitANe6gnJWQvLEIujiaSEjSuesF8gxdOQ9cYMynpWbl5EnzLGDMdvqpkadza9JLCH02sSZowxnwrfFXD0Xgl2osKP9++krRkjDkduiigC9d7uq9J+gNJPww5zp9L+j1JfxG6ouSZlfS3Ct/j70nKS/rnsAUN0VFJfyPpL0OO86qkjKR/DF0R0IXr0H1L0o6kvws5zo8lbUv6z9AVJYy19p6ka5L+KeRQv1T9Qty7oYsaEmvtmqSbkv4h5FBvqn4h7p3QRQFdmMFeRQoAGETUb8EOAGhC6AKARx0XPAkyuXVbKUWyoLhJZWu2WnJ790Q6e71WLi65HDNqkfY4nb0uSS7Hj7LH9ALjoOM5XWOMffxHb0Sy4a9ff02ux/769ddkrR2p1aKi7rEkp32Ossf0AuMg1J5mZbO+fvbOpY8kSdXChgqfvh26qHZjlzfWVPjNW07GHjWP9OL2VW2thL/poHXcyuZt3X3nJ6HHjcpB82336mfa+nzwGw8OGre8cU3bF/87VM1Aq1DrqZauf6md1fcV5GdUK24pNb2g1JSbN5ltHTuzcEyl62GWGhhdrb0o37kqE4Q/M9M6bnrmkLLHzjqoOBoHzTdbKSmYnHU+bmZhWZV7Nxw+AsDJhbS90xO13YJKa4Ou6Nh57N2rn6pWHGjhsoTY60V6YVmV+7dka1Wn41prFf+1vB+dbyadVW37vvNxK5u3Vb55KeS4wH6h9nQnn/rOI5+b+27YF0q1H3vixLOaOPGsk7FHTWsvsrkpZQ+fcj6uJOXPvBx63KhENd86jTv78p+FHh9oxi1jAOBR19At313ve9Dy3fWHFyWa7V7+eN8hcbex23293efabWvU0OeDt9vLz/TSh17Gj1MvkExdTy/srL6nyuJxyQQq3/hSwdSCKnfXNf3cq0rNHtH2yjuqFjaUmjmk2m5BqZlDqhY2ZEvbUq2q8u0rkjHKLp1V8ernyjWdIug2dvHqZ6oW7qi0/ltlFpYlY1S5u6bSxLRstSKTyqhW2lF6fvRvlaTP0fdh1HqBZOoauqnJeckEMqm9b03PPfbw30F+VkG+ceW4VlVtZ1NBJqdg7jGpVpVkZNI5mXS277FTUwuqFu4oc+iEbKUsEwQKJmYa33dUQX5GxW8+7/MhxxN97q3WMH3oZfw49QLJ1DV0p555RTur7yt7/LxstaL86Rf2fb3141b5J198+O/c0v63Pes69hO/2608ZY+c7vo9o4A+10XZh57Gj1EvkEw9XUjLn3lJQWZi3wRtPc9V2y2ourPZdwGtY3cat1bc1uavf6Facavv7
YyCfnoRZuxO49pqRYVP3lRl8/ZA23EhyvnWOv44zzcMx8C3jO2svqfixLSC7KSqW3eUWz6n6vZ9peeXlJqc0+YHP9XMSz9QkMk5Gzczv6T09IIUpAYte6QMq8cmlXHy4guXOtUsa1W+fUW5pbNKTff34hzmG3wb+JmVmpyXCVJKzRySarX6Cxca6zjYarn+TQO8V2CncWu7BRWvXZBqYd6DcHQMpcflXSlI7Y0fE51qziwcU2nti74Dt9u44zbf4MfAe7pTz7zy8N/tzp0NetN6t3HnX/mrgcYdRcPq8dT57w00bpQ61VwrF5Vbftr5uNJ4zTf4Ea9jSGAAQSYX61fSAc0GDt3Wm8Nd3yzOzed1UfZ51HpML5AEfZ1e2PzwZzLZCaXysyrfXVdp7aImz7+izMIxVTdvaWvlXVXv31B2+bzKty7LZHIKspOaPPv7occvfPKmJk49L1spqbq1ofKtK5p6/lUFmYmBHnicRdnnUesxvUDS9LWnm148riA3JcnIVopKLx6XWlalCiamVdu6q1pxSyaVqV+gcDB+en5JpRtfaffKp43vTu4bakbZ51HrMb1A0vS1p9t832T+zEv7vjb9/PdDF9Np/ImTzz389+7ljzX9wp/KJPRWnij7PGo9phdImo5v18N7pEWP9wXbQy8wDjqGLgDALW4ZAwCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8IjQBQCPCF0A8Oj/AWdihSPDqhKLAAAAAElFTkSuQmCC" + }, + "metadata": { + "needs_background": "light" + } } ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 28, "source": [ - "fs.variable_standardization()" + "for ft in fs.variable_standardization():\n", + " print(ft.elements)" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[{'id': 0, 'type': 'supsub', 'text': '\\\\supsub', 'role': None}, {'id': 1, 'type': 'mathord', 'text': 'x', 'role': 'base', 'var': 0}, {'id': 2, 'type': 'textord', 'text': '2', 'role': 'sup'}, {'id': 3, 'type': 'rel', 'text': '=', 'role': None}, {'id': 4, 'type': 'mathord', 'text': 'y', 'role': None, 'var': 1}]\n", + "[{'id': 
5, 'type': 'supsub', 'text': '\\\\supsub', 'role': None}, {'id': 6, 'type': 'mathord', 'text': 'x', 'role': 'base', 'var': 0}, {'id': 7, 'type': 'textord', 'text': '3', 'role': 'sup'}, {'id': 8, 'type': 'rel', 'text': '=', 'role': None}, {'id': 9, 'type': 'supsub', 'text': '\\\\supsub', 'role': None}, {'id': 10, 'type': 'mathord', 'text': 'y', 'role': 'base', 'var': 1}, {'id': 11, 'type': 'textord', 'text': '2', 'role': 'sup'}]\n", + "[{'id': 12, 'type': 'mathord', 'text': 'x', 'role': None, 'var': 0}, {'id': 13, 'type': 'bin', 'text': '+', 'role': None}, {'id': 14, 'type': 'mathord', 'text': 'y', 'role': None, 'var': 1}, {'id': 15, 'type': 'rel', 'text': '=', 'role': None}, {'id': 16, 'type': 'mathord', 'text': '\\\\pi', 'role': None}]\n" + ] + } ], "metadata": { "collapsed": false, @@ -200,23 +487,25 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" + "name": "python3", + "display_name": "Python 3.8.5 64-bit" }, "language_info": { + "name": "python", + "version": "3.8.5", + "mimetype": "text/x-python", "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", + "pygments_lexer": "ipython3", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" + "file_extension": ".py" + }, + "interpreter": { + "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 2 } \ No newline at end of file diff --git a/examples/formula/tree.ipynb b/examples/formula/tree.ipynb index b5f0fd10..ce8bb972 100644 --- a/examples/formula/tree.ipynb +++ b/examples/formula/tree.ipynb @@ -2,20 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": true - }, - "outputs": [ - { - "data": { - "text/plain": "{'value': 1, 'id': 0}" - }, - "execution_count": 9, - "metadata": {}, - "output_type": 
"execute_result" - } - ], + "execution_count": 1, "source": [ "import networkx as nx\n", "\n", @@ -26,28 +13,45 @@ "g.add_edge(0, 1)\n", "g.add_edge(0, 2)\n", "g.nodes[0]" - ] + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "{'value': 1, 'id': 0}" + ] + }, + "metadata": {}, + "execution_count": 1 + } + ], + "metadata": { + "collapsed": true + } } ], "metadata": { "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" + "name": "python3", + "display_name": "Python 3.8.5 64-bit" }, "language_info": { + "name": "python", + "version": "3.8.5", + "mimetype": "text/x-python", "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", + "pygments_lexer": "ipython3", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" + "file_extension": ".py" + }, + "interpreter": { + "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 2 } \ No newline at end of file diff --git a/examples/parse/parse.ipynb b/examples/parse/parse.ipynb new file mode 100644 index 00000000..94272b65 --- /dev/null +++ b/examples/parse/parse.ipynb @@ -0,0 +1,145 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# parse" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 导入类" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 2, + "source": [ + "from EduNLP.Formula.ast import str2ast, katex_parse\r\n", + "from EduNLP.SIF.parser import Parser" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 输入\n", + "\n", + "类型:str \n", + "内容:题目文本 (text)" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 3, + "source": [ + "text1 = '生产某种零件的A工厂25名工人的日加工零件数_ _'\r\n", + "text2 = 'X的分布列为( )'\r\n", + "text3 = '① 
AB是⊙O的直径,AC是⊙O的切线,BC交⊙O于点E.AC的中点为D'\r\n", + "text4 = '支持公式如$\\\\frac{y}{x}$,$\\\\SIFBlank$,$\\\\FigureID{1}$,不支持公式如$\\\\frac{ \\\\dddot y}{x}$'" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 输出" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "### 尝试转换为标准形式" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 6, + "source": [ + "text_parser1 = Parser(text1)\r\n", + "text_parser1.description_list()\r\n", + "print('text_parser1.text:',text_parser1.text)\r\n", + "\r\n", + "\r\n", + "text_parser2 = Parser(text2)\r\n", + "text_parser2.description_list()\r\n", + "print('text_parser2.text:',text_parser2.text)\r\n" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "text_parser1.text: 生产某种零件的$A$工厂$25$名工人的日加工零件数$\\SIFBlank$\n", + "text_parser2.text: $X$的分布列为$\\SIFChoice$\n" + ] + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "### 判断是否有语法问题" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 7, + "source": [ + "text_parser3 = Parser(text3)\r\n", + "text_parser3.description_list()\r\n", + "print('text_parser3.error_flag: ',text_parser3.error_flag)\r\n", + "\r\n", + "\r\n", + "text_parser4 = Parser(text4)\r\n", + "text_parser4.description_list()\r\n", + "print('text_parser4.fomula_illegal_flag: ',text_parser4.fomula_illegal_flag)\r\n" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "text_parser3.error_flag: 1\n", + "text_parser4.fomula_illegal_flag: 1\n" + ] + } + ], + "metadata": {} + } + ], + "metadata": { + "orig_nbformat": 4, + "language_info": { + "name": "python", + "version": "3.6.3", + "mimetype": "text/x-python", + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "pygments_lexer": "ipython3", + "nbconvert_exporter": "python", + "file_extension": ".py" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3.6.3 64-bit" 
+ }, + "interpreter": { + "hash": "6f23ddf1f0697a8f0c43dd2435bdb82528077c79e9967f824fba6a3b52b05faf" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/pretrain/gensim/d2v_bow_tfidf.ipynb b/examples/pretrain/gensim/d2v_bow_tfidf.ipynb index 66a77fa6..154279dc 100644 --- a/examples/pretrain/gensim/d2v_bow_tfidf.ipynb +++ b/examples/pretrain/gensim/d2v_bow_tfidf.ipynb @@ -3,7 +3,14 @@ { "cell_type": "markdown", "source": [ - "# 1. load and tokenize test_items" + "# d2v_bow_tfidf" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 1. load and tokenize test_items" ], "metadata": {} }, @@ -158,7 +165,7 @@ { "cell_type": "markdown", "source": [ - "# 2. train and test model by 'bow'" + "## 2. train and test model by 'bow'" ], "metadata": { "pycharm": { @@ -226,7 +233,7 @@ { "cell_type": "markdown", "source": [ - "# 3. train and test model by 'tfidf'" + "## 3. train and test model by 'tfidf'" ], "metadata": {} }, @@ -321,4 +328,4 @@ }, "nbformat": 4, "nbformat_minor": 1 -} \ No newline at end of file +} diff --git a/examples/pretrain/gensim/d2v_general.ipynb b/examples/pretrain/gensim/d2v_general.ipynb index 67ac5a8e..d1d8605a 100644 --- a/examples/pretrain/gensim/d2v_general.ipynb +++ b/examples/pretrain/gensim/d2v_general.ipynb @@ -3,7 +3,14 @@ { "cell_type": "markdown", "source": [ - "# 1. Get token example from item\n", + "# d2v_general" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 1. Get token example from item\r\n", "> Notes: use geneal('linear') tokenizition method, which means do not parse formulas" ], "metadata": {} @@ -109,7 +116,7 @@ { "cell_type": "markdown", "source": [ - "# 2. Load Model and test item" + "## 2. 
Load Model and test item" ], "metadata": {} }, diff --git a/examples/pretrain/gensim/d2v_stem_tf.ipynb b/examples/pretrain/gensim/d2v_stem_tf.ipynb index 1a602795..f9d76a1c 100644 --- a/examples/pretrain/gensim/d2v_stem_tf.ipynb +++ b/examples/pretrain/gensim/d2v_stem_tf.ipynb @@ -1,15 +1,42 @@ { "cells": [ + { + "cell_type": "markdown", + "source": [ + "# d2v_stem_tf" + ], + "metadata": {} + }, { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": true - }, + "source": [ + "import json\r\n", + "from tqdm import tqdm\r\n", + "\r\n", + "def load_items():\r\n", + " with open(\"../../../data/OpenLUNA.json\", encoding=\"utf-8\") as f:\r\n", + " for line in f:\r\n", + " yield json.loads(line)\r\n", + "\r\n", + "\r\n", + "from EduNLP.Pretrain import GensimWordTokenizer\r\n", + "\r\n", + "tokenizer = GensimWordTokenizer(symbol=\"gm\")\r\n", + "sif_items = []\r\n", + "for item in tqdm(load_items(), \"sifing\"):\r\n", + " sif_item = tokenizer(\r\n", + " item[\"stem\"]\r\n", + " )\r\n", + " if sif_item:\r\n", + " sif_items.append(sif_item.tokens)\r\n", + "\r\n", + "sif_items[0]" + ], "outputs": [ { - "name": "stderr", "output_type": "stream", + "name": "stderr", "text": [ "d:\\env\\python3.8\\lib\\site-packages\\gensim\\similarities\\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package is unavailable. Install Levenhstein (e.g. 
`pip install python-Levenshtein`) to suppress this warning.\n", " warnings.warn(msg)\n", @@ -17,8 +44,8 @@ ] }, { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ "'LaTeX-incompatible input and strict mode is set to \\'warn\\': Unrecognized Unicode character \"①\" (9312) [unknownSymbol]'\n", "'LaTeX-incompatible input and strict mode is set to \\'warn\\': Unrecognized Unicode character \"②\" (9313) [unknownSymbol]'\n", @@ -158,6 +185,7 @@ ] }, { + "output_type": "execute_result", "data": { "text/plain": [ "['已知',\n", @@ -198,87 +226,64 @@ " '=']" ] }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 1 } ], - "source": [ - "import json\r\n", - "from tqdm import tqdm\r\n", - "\r\n", - "def load_items():\r\n", - " with open(\"../../../data/OpenLUNA.json\", encoding=\"utf-8\") as f:\r\n", - " for line in f:\r\n", - " yield json.loads(line)\r\n", - "\r\n", - "\r\n", - "from EduNLP.Pretrain import GensimWordTokenizer\r\n", - "\r\n", - "tokenizer = GensimWordTokenizer(symbol=\"gm\")\r\n", - "sif_items = []\r\n", - "for item in tqdm(load_items(), \"sifing\"):\r\n", - " sif_item = tokenizer(\r\n", - " item[\"stem\"]\r\n", - " )\r\n", - " if sif_item:\r\n", - " sif_items.append(sif_item.tokens)\r\n", - "\r\n", - "sif_items[0]" - ] + "metadata": { + "collapsed": true + } }, { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - }, + "source": [ + "len(sif_items)" + ], "outputs": [ { + "output_type": "execute_result", "data": { "text/plain": [ "788" ] }, - "execution_count": 2, "metadata": {}, - "output_type": "execute_result" + "execution_count": 2 } ], - "source": [ - "len(sif_items)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } - }, - "outputs": [], - "source": [ - "from EduNLP.Pretrain import train_vector" - ] + } }, { "cell_type": "code", - 
"execution_count": 4, + "execution_count": 3, + "source": [ + "from EduNLP.Pretrain import train_vector" + ], + "outputs": [], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } - }, + } + }, + { + "cell_type": "code", + "execution_count": 4, + "source": [ + "# 10 dimension with fasstext method\r\n", + "train_vector(sif_items, \"../../../data/w2v/gensim_luna_stem_tf_\", 10, method=\"d2v\")" + ], "outputs": [ { - "name": "stderr", "output_type": "stream", + "name": "stderr", "text": [ "EduNLP, INFO Epoch #0: loss-0.0000 \n", "EduNLP, INFO Epoch #1: loss-0.0000 \n", @@ -294,32 +299,35 @@ ] }, { + "output_type": "execute_result", "data": { "text/plain": [ "'../../../data/w2v/gensim_luna_stem_tf_d2v_10.bin'" ] }, - "execution_count": 4, "metadata": {}, - "output_type": "execute_result" + "execution_count": 4 } ], - "source": [ - "# 10 dimension with fasstext method\n", - "train_vector(sif_items, \"../../../data/w2v/gensim_luna_stem_tf_\", 10, method=\"d2v\")" - ] - }, - { - "cell_type": "code", - "execution_count": 5, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } - }, + } + }, + { + "cell_type": "code", + "execution_count": 5, + "source": [ + "from EduNLP.Vector import D2V\r\n", + "\r\n", + "d2v = D2V(\"../../../data/w2v/gensim_luna_stem_tf_d2v_10.bin\")\r\n", + "d2v(sif_items[0])" + ], "outputs": [ { + "output_type": "execute_result", "data": { "text/plain": [ "array([-0.16680606, -0.04633714, 0.05006265, 0.2665265 , -0.04968905,\n", @@ -327,17 +335,16 @@ " dtype=float32)" ] }, - "execution_count": 5, "metadata": {}, - "output_type": "execute_result" + "execution_count": 5 } ], - "source": [ - "from EduNLP.Vector import D2V\n", - "\n", - "d2v = D2V(\"../../../data/w2v/gensim_luna_stem_tf_d2v_10.bin\")\n", - "d2v(sif_items[0])" - ] + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } } ], "metadata": { @@ -354,5 +361,5 @@ } }, "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file + 
"nbformat_minor": 2 +} diff --git a/examples/pretrain/gensim/w2v_stem_text.ipynb b/examples/pretrain/gensim/w2v_stem_text.ipynb index 01a38b20..3c9b6ca9 100644 --- a/examples/pretrain/gensim/w2v_stem_text.ipynb +++ b/examples/pretrain/gensim/w2v_stem_text.ipynb @@ -1,15 +1,38 @@ { "cells": [ + { + "cell_type": "markdown", + "source": [ + "# w2v_stem_text" + ], + "metadata": {} + }, { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": true - }, + "source": [ + "import json\r\n", + "from tqdm import tqdm\r\n", + "\r\n", + "def load_items():\r\n", + " with open(\"../../../data/OpenLUNA.json\", encoding=\"utf-8\") as f:\r\n", + " for line in f:\r\n", + " yield json.loads(line)\r\n", + "\r\n", + "from EduNLP.Pretrain import train_vector, GensimWordTokenizer\r\n", + "\r\n", + "tokenizer = GensimWordTokenizer(symbol=\"fgm\")\r\n", + "\r\n", + "sif_items = [\r\n", + " tokenizer(item[\"stem\"]).tokens for item in tqdm(load_items(), \"sifing\")\r\n", + "]\r\n", + "\r\n", + "sif_items[0]" + ], "outputs": [ { - "name": "stderr", "output_type": "stream", + "name": "stderr", "text": [ "d:\\env\\python3.8\\lib\\site-packages\\gensim\\similarities\\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package is unavailable. Install Levenhstein (e.g. 
`pip install python-Levenshtein`) to suppress this warning.\n", " warnings.warn(msg)\n", @@ -17,50 +40,38 @@ ] }, { + "output_type": "execute_result", "data": { - "text/plain": "['已知', '集合', '[FORMULA]', '[FORMULA]']" + "text/plain": [ + "['已知', '集合', '[FORMULA]', '[FORMULA]']" + ] }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 1 } ], - "source": [ - "import json\n", - "from tqdm import tqdm\n", - "\n", - "def load_items():\n", - " with open(\"../../../data/OpenLUNA.json\", encoding=\"utf-8\") as f:\n", - " for line in f:\n", - " yield json.loads(line)\n", - "\n", - "from EduNLP.Pretrain import train_vector, GensimWordTokenizer\n", - "\n", - "tokenizer = GensimWordTokenizer(symbol=\"fgm\")\n", - "\n", - "sif_items = [\n", - " tokenizer(item[\"stem\"]).tokens for item in tqdm(load_items(), \"sifing\")\n", - "]\n", - "\n", - "sif_items[0]" - ] + "metadata": { + "collapsed": true + } }, { "cell_type": "code", "execution_count": 2, + "source": [ + "len(sif_items)" + ], "outputs": [ { + "output_type": "execute_result", "data": { - "text/plain": "792" + "text/plain": [ + "792" + ] }, - "execution_count": 2, "metadata": {}, - "output_type": "execute_result" + "execution_count": 2 } ], - "source": [ - "len(sif_items)" - ], "metadata": { "collapsed": false, "pycharm": { @@ -71,10 +82,14 @@ { "cell_type": "code", "execution_count": 3, + "source": [ + "# 100 dimension with skipgram method\r\n", + "train_vector(sif_items, \"../../../data/w2v/gensim_luna_stem_t_\", 100)" + ], "outputs": [ { - "name": "stderr", "output_type": "stream", + "name": "stderr", "text": [ "EduNLP, INFO Epoch #0: loss-0.0000 \n", "EduNLP, INFO Epoch #1: loss-0.0000 \n", @@ -85,18 +100,16 @@ ] }, { + "output_type": "execute_result", "data": { - "text/plain": "'../../../data/w2v/gensim_luna_stem_t_sg_100.kv'" + "text/plain": [ + "'../../../data/w2v/gensim_luna_stem_t_sg_100.kv'" + ] }, - "execution_count": 3, "metadata": {}, - "output_type": 
"execute_result" + "execution_count": 3 } ], - "source": [ - "# 100 dimension with skipgram method\n", - "train_vector(sif_items, \"../../../data/w2v/gensim_luna_stem_t_\", 100)" - ], "metadata": { "collapsed": false, "pycharm": { @@ -107,10 +120,14 @@ { "cell_type": "code", "execution_count": 4, + "source": [ + "# 50 dimension with cbow method\r\n", + "train_vector(sif_items, \"../../../data/w2v/gensim_luna_stem_t_\", 50, method=\"cbow\")" + ], "outputs": [ { - "name": "stderr", "output_type": "stream", + "name": "stderr", "text": [ "EduNLP, INFO Epoch #0: loss-0.0000 \n", "EduNLP, INFO Epoch #1: loss-0.0000 \n", @@ -121,18 +138,16 @@ ] }, { + "output_type": "execute_result", "data": { - "text/plain": "'../../../data/w2v/gensim_luna_stem_t_cbow_50.kv'" + "text/plain": [ + "'../../../data/w2v/gensim_luna_stem_t_cbow_50.kv'" + ] }, - "execution_count": 4, "metadata": {}, - "output_type": "execute_result" + "execution_count": 4 } ], - "source": [ - "# 50 dimension with cbow method\n", - "train_vector(sif_items, \"../../../data/w2v/gensim_luna_stem_t_\", 50, method=\"cbow\")" - ], "metadata": { "collapsed": false, "pycharm": { @@ -143,10 +158,14 @@ { "cell_type": "code", "execution_count": 5, + "source": [ + "# 10 dimension with fasstext method\r\n", + "train_vector(sif_items, \"../../../data/w2v/gensim_luna_stem_t_\", 10, method=\"fasttext\")" + ], "outputs": [ { - "name": "stderr", "output_type": "stream", + "name": "stderr", "text": [ "EduNLP, INFO Epoch #0: loss-0.0000 \n", "EduNLP, INFO Epoch #1: loss-0.0000 \n", @@ -157,18 +176,16 @@ ] }, { + "output_type": "execute_result", "data": { - "text/plain": "'../../../data/w2v/gensim_luna_stem_t_fasttext_10.bin'" + "text/plain": [ + "'../../../data/w2v/gensim_luna_stem_t_fasttext_10.bin'" + ] }, - "execution_count": 5, "metadata": {}, - "output_type": "execute_result" + "execution_count": 5 } ], - "source": [ - "# 10 dimension with fasstext method\n", - "train_vector(sif_items, 
\"../../../data/w2v/gensim_luna_stem_t_\", 10, method=\"fasttext\")" - ], "metadata": { "collapsed": false, "pycharm": { @@ -179,22 +196,44 @@ { "cell_type": "code", "execution_count": 6, + "source": [ + "from EduNLP.Vector import W2V\r\n", + "\r\n", + "w2v = W2V(\"../../../data/w2v/gensim_luna_stem_t_sg_100.kv\")\r\n", + "w2v[\"[FORMULA]\"]" + ], "outputs": [ { + "output_type": "execute_result", "data": { - "text/plain": "array([-0.16754825, 0.2707899 , 0.01005908, -0.03040857, 0.10938002,\n -0.28348687, 0.19054936, 0.41737646, -0.3885515 , -0.14650987,\n 0.1157743 , -0.2406684 , -0.11294927, 0.12082661, 0.1759571 ,\n 0.17807944, 0.07178611, -0.16182491, -0.18266837, -0.52223957,\n -0.05876796, 0.0450548 , 0.26906556, 0.02253102, 0.1025768 ,\n 0.29827935, -0.441235 , -0.06949052, -0.22638813, -0.10846554,\n -0.05917242, 0.12802479, 0.21151058, -0.4611071 , -0.16157094,\n 0.32488874, 0.36630565, -0.36908495, 0.24223483, -0.3510737 ,\n -0.15079798, 0.10832163, 0.00392658, -0.20019084, 0.18827583,\n -0.17247967, -0.27385622, 0.17878376, 0.05156241, 0.30575123,\n -0.16626868, 0.01431947, 0.05540735, 0.03373449, 0.36685058,\n -0.05511234, 0.09583379, -0.09495933, 0.01121055, 0.18113017,\n 0.29060405, 0.06472825, 0.20568778, -0.02780204, -0.17310621,\n 0.23243082, 0.2480153 , 0.07856195, -0.03825858, 0.10257348,\n -0.02105796, 0.4248383 , 0.03114873, -0.09995517, 0.16022007,\n 0.08843125, 0.06128069, -0.03922344, 0.02587396, 0.03067247,\n 0.1209543 , -0.05948736, -0.25567266, 0.53167033, -0.4149 ,\n 0.08551055, 0.42399153, 0.18317291, 0.12455773, -0.10759205,\n 0.17496923, 0.2781072 , 0.25744784, 0.1921185 , 0.43071204,\n 0.09138201, -0.37603223, -0.07436363, 0.2961049 , 0.02517671],\n dtype=float32)" + "text/plain": [ + "array([-0.16754825, 0.2707899 , 0.01005908, -0.03040857, 0.10938002,\n", + " -0.28348687, 0.19054936, 0.41737646, -0.3885515 , -0.14650987,\n", + " 0.1157743 , -0.2406684 , -0.11294927, 0.12082661, 0.1759571 ,\n", + " 0.17807944, 0.07178611, 
-0.16182491, -0.18266837, -0.52223957,\n", + " -0.05876796, 0.0450548 , 0.26906556, 0.02253102, 0.1025768 ,\n", + " 0.29827935, -0.441235 , -0.06949052, -0.22638813, -0.10846554,\n", + " -0.05917242, 0.12802479, 0.21151058, -0.4611071 , -0.16157094,\n", + " 0.32488874, 0.36630565, -0.36908495, 0.24223483, -0.3510737 ,\n", + " -0.15079798, 0.10832163, 0.00392658, -0.20019084, 0.18827583,\n", + " -0.17247967, -0.27385622, 0.17878376, 0.05156241, 0.30575123,\n", + " -0.16626868, 0.01431947, 0.05540735, 0.03373449, 0.36685058,\n", + " -0.05511234, 0.09583379, -0.09495933, 0.01121055, 0.18113017,\n", + " 0.29060405, 0.06472825, 0.20568778, -0.02780204, -0.17310621,\n", + " 0.23243082, 0.2480153 , 0.07856195, -0.03825858, 0.10257348,\n", + " -0.02105796, 0.4248383 , 0.03114873, -0.09995517, 0.16022007,\n", + " 0.08843125, 0.06128069, -0.03922344, 0.02587396, 0.03067247,\n", + " 0.1209543 , -0.05948736, -0.25567266, 0.53167033, -0.4149 ,\n", + " 0.08551055, 0.42399153, 0.18317291, 0.12455773, -0.10759205,\n", + " 0.17496923, 0.2781072 , 0.25744784, 0.1921185 , 0.43071204,\n", + " 0.09138201, -0.37603223, -0.07436363, 0.2961049 , 0.02517671],\n", + " dtype=float32)" + ] }, - "execution_count": 6, "metadata": {}, - "output_type": "execute_result" + "execution_count": 6 } ], - "source": [ - "from EduNLP.Vector import W2V\n", - "\n", - "w2v = W2V(\"../../../data/w2v/gensim_luna_stem_t_sg_100.kv\")\n", - "w2v[\"[FORMULA]\"]" - ], "metadata": { "collapsed": false, "pycharm": { @@ -223,5 +262,5 @@ } }, "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file + "nbformat_minor": 2 +} diff --git a/examples/pretrain/gensim/w2v_stem_tf.ipynb b/examples/pretrain/gensim/w2v_stem_tf.ipynb index 0a549870..4dd90cff 100644 --- a/examples/pretrain/gensim/w2v_stem_tf.ipynb +++ b/examples/pretrain/gensim/w2v_stem_tf.ipynb @@ -1,15 +1,42 @@ { "cells": [ + { + "cell_type": "markdown", + "source": [ + "# w2v_stem_tf" + ], + "metadata": {} + }, { "cell_type": "code", 
"execution_count": 1, - "metadata": { - "collapsed": true - }, + "source": [ + "import json\r\n", + "from tqdm import tqdm\r\n", + "\r\n", + "def load_items():\r\n", + " with open(\"../../../data/OpenLUNA.json\", encoding=\"utf-8\") as f:\r\n", + " for line in f:\r\n", + " yield json.loads(line)\r\n", + "\r\n", + "\r\n", + "from EduNLP.Pretrain import GensimWordTokenizer\r\n", + "\r\n", + "tokenizer = GensimWordTokenizer(symbol=\"gm\")\r\n", + "sif_items = []\r\n", + "for item in tqdm(load_items(), \"sifing\"):\r\n", + " sif_item = tokenizer(\r\n", + " item[\"stem\"]\r\n", + " )\r\n", + " if sif_item:\r\n", + " sif_items.append(sif_item.tokens)\r\n", + "\r\n", + "sif_items[0]" + ], "outputs": [ { - "name": "stderr", "output_type": "stream", + "name": "stderr", "text": [ "d:\\env\\python3.8\\lib\\site-packages\\gensim\\similarities\\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package is unavailable. Install Levenhstein (e.g. 
`pip install python-Levenshtein`) to suppress this warning.\n", " warnings.warn(msg)\n", @@ -17,8 +44,8 @@ ] }, { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ "'LaTeX-incompatible input and strict mode is set to \\'warn\\': Unrecognized Unicode character \"①\" (9312) [unknownSymbol]'\n", "'LaTeX-incompatible input and strict mode is set to \\'warn\\': Unrecognized Unicode character \"②\" (9313) [unknownSymbol]'\n", @@ -158,54 +185,74 @@ ] }, { + "output_type": "execute_result", "data": { - "text/plain": "['埃及',\n '胡夫',\n '金字塔',\n '古代',\n '世界',\n '建筑',\n '奇迹',\n '形状',\n '视为',\n '正四',\n '棱锥',\n '以该',\n '四',\n '棱锥',\n '高为',\n '边长',\n '正方形',\n '面积',\n '等于',\n '四',\n '棱锥',\n '侧面',\n '三角形',\n '面积',\n '侧面',\n '三角形',\n '底边',\n '高',\n '底面',\n '正方形',\n '边长',\n '比值',\n '[FIGURE]',\n '[FIGURE]',\n '[FIGURE]',\n '[FIGURE]',\n '[FIGURE]']" + "text/plain": [ + "['埃及',\n", + " '胡夫',\n", + " '金字塔',\n", + " '古代',\n", + " '世界',\n", + " '建筑',\n", + " '奇迹',\n", + " '形状',\n", + " '视为',\n", + " '正四',\n", + " '棱锥',\n", + " '以该',\n", + " '四',\n", + " '棱锥',\n", + " '高为',\n", + " '边长',\n", + " '正方形',\n", + " '面积',\n", + " '等于',\n", + " '四',\n", + " '棱锥',\n", + " '侧面',\n", + " '三角形',\n", + " '面积',\n", + " '侧面',\n", + " '三角形',\n", + " '底边',\n", + " '高',\n", + " '底面',\n", + " '正方形',\n", + " '边长',\n", + " '比值',\n", + " '[FIGURE]',\n", + " '[FIGURE]',\n", + " '[FIGURE]',\n", + " '[FIGURE]',\n", + " '[FIGURE]']" + ] }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 1 } ], - "source": [ - "import json\n", - "from tqdm import tqdm\n", - "\n", - "def load_items():\n", - " with open(\"../../../data/OpenLUNA.json\", encoding=\"utf-8\") as f:\n", - " for line in f:\n", - " yield json.loads(line)\n", - "\n", - "\n", - "from EduNLP.Pretrain import GensimWordTokenizer\n", - "\n", - "tokenizer = GensimWordTokenizer(symbol=\"gm\")\n", - "sif_items = []\n", - "for item in tqdm(load_items(), \"sifing\"):\n", - " sif_item = 
tokenizer(\n", - " item[\"stem\"]\n", - " )\n", - " if sif_item:\n", - " sif_items.append(sif_item.tokens)\n", - "\n", - "sif_items[0]" - ] + "metadata": { + "collapsed": true + } }, { "cell_type": "code", "execution_count": 3, + "source": [ + "len(sif_items)" + ], "outputs": [ { + "output_type": "execute_result", "data": { - "text/plain": "792" + "text/plain": [ + "792" + ] }, - "execution_count": 3, "metadata": {}, - "output_type": "execute_result" + "execution_count": 3 } ], - "source": [ - "len(sif_items)" - ], "metadata": { "collapsed": false, "pycharm": { @@ -216,10 +263,10 @@ { "cell_type": "code", "execution_count": 4, - "outputs": [], "source": [ "from EduNLP.Pretrain import train_vector" ], + "outputs": [], "metadata": { "collapsed": false, "pycharm": { @@ -230,10 +277,14 @@ { "cell_type": "code", "execution_count": 5, + "source": [ + "# 100 dimension with skipgram method\r\n", + "train_vector(sif_items, \"../../../data/w2v/gensim_luna_stem_tf_\", 100)" + ], "outputs": [ { - "name": "stderr", "output_type": "stream", + "name": "stderr", "text": [ "EduNLP, INFO Epoch #0: loss-0.0000 \n", "EduNLP, INFO Epoch #1: loss-0.0000 \n", @@ -244,18 +295,16 @@ ] }, { + "output_type": "execute_result", "data": { - "text/plain": "'../../../data/w2v/gensim_luna_stem_tf_sg_100.kv'" + "text/plain": [ + "'../../../data/w2v/gensim_luna_stem_tf_sg_100.kv'" + ] }, - "execution_count": 5, "metadata": {}, - "output_type": "execute_result" + "execution_count": 5 } ], - "source": [ - "# 100 dimension with skipgram method\n", - "train_vector(sif_items, \"../../../data/w2v/gensim_luna_stem_tf_\", 100)" - ], "metadata": { "collapsed": false, "pycharm": { @@ -266,10 +315,14 @@ { "cell_type": "code", "execution_count": 6, + "source": [ + "# 50 dimension with cbow method\r\n", + "train_vector(sif_items, \"../../../data/w2v/gensim_luna_stem_tf_\", 50, method=\"cbow\")" + ], "outputs": [ { - "name": "stderr", "output_type": "stream", + "name": "stderr", "text": [ "EduNLP, INFO Epoch #0: 
loss-0.0000 \n", "EduNLP, INFO Epoch #1: loss-0.0000 \n", @@ -280,18 +333,16 @@ ] }, { + "output_type": "execute_result", "data": { - "text/plain": "'../../../data/w2v/gensim_luna_stem_tf_cbow_50.kv'" + "text/plain": [ + "'../../../data/w2v/gensim_luna_stem_tf_cbow_50.kv'" + ] }, - "execution_count": 6, "metadata": {}, - "output_type": "execute_result" + "execution_count": 6 } ], - "source": [ - "# 50 dimension with cbow method\n", - "train_vector(sif_items, \"../../../data/w2v/gensim_luna_stem_tf_\", 50, method=\"cbow\")" - ], "metadata": { "collapsed": false, "pycharm": { @@ -302,10 +353,14 @@ { "cell_type": "code", "execution_count": 7, + "source": [ + "# 10 dimension with fasstext method\r\n", + "train_vector(sif_items, \"../../../data/w2v/gensim_luna_stem_tf_\", 10, method=\"fasttext\")" + ], "outputs": [ { - "name": "stderr", "output_type": "stream", + "name": "stderr", "text": [ "EduNLP, INFO Epoch #0: loss-0.0000 \n", "EduNLP, INFO Epoch #1: loss-0.0000 \n", @@ -316,18 +371,16 @@ ] }, { + "output_type": "execute_result", "data": { - "text/plain": "'../../../data/w2v/gensim_luna_stem_tf_fasttext_10.bin'" + "text/plain": [ + "'../../../data/w2v/gensim_luna_stem_tf_fasttext_10.bin'" + ] }, - "execution_count": 7, "metadata": {}, - "output_type": "execute_result" + "execution_count": 7 } ], - "source": [ - "# 10 dimension with fasstext method\n", - "train_vector(sif_items, \"../../../data/w2v/gensim_luna_stem_tf_\", 10, method=\"fasttext\")" - ], "metadata": { "collapsed": false, "pycharm": { @@ -338,22 +391,26 @@ { "cell_type": "code", "execution_count": 8, + "source": [ + "from EduNLP.Vector import W2V\n", + "\n", + "w2v = W2V(\"../../../data/w2v/gensim_luna_stem_tf_fasttext_10.bin\", method=\"fasttext\")\n", + "w2v[\"[FIGURE]\"]" + ], "outputs": [ { + "output_type": "execute_result", "data": { - "text/plain": "array([ 0.3322667 , -0.701586 , -0.6528301 , -0.02556002, 0.44070247,\n 0.44261315, 0.54466563, 0.8991576 , -1.0600986 , 0.19438864],\n 
dtype=float32)" + "text/plain": [ + "array([ 0.3322667 , -0.701586 , -0.6528301 , -0.02556002, 0.44070247,\n", + " 0.44261315, 0.54466563, 0.8991576 , -1.0600986 , 0.19438864],\n", + " dtype=float32)" + ] }, - "execution_count": 8, "metadata": {}, - "output_type": "execute_result" + "execution_count": 8 } ], - "source": [ - "from EduNLP.Vector import W2V\n", - "\n", - "w2v = W2V(\"../../../data/w2v/gensim_luna_stem_tf_fasttext_10.bin\", method=\"fasttext\")\n", - "w2v[\"[FIGURE]\"]" - ], "metadata": { "collapsed": false, "pycharm": { @@ -382,5 +439,5 @@ } }, "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file + "nbformat_minor": 2 +} diff --git a/examples/pretrain/prepare_dataset.ipynb b/examples/pretrain/prepare_dataset.ipynb index 0f1acb0e..d33f39b3 100644 --- a/examples/pretrain/prepare_dataset.ipynb +++ b/examples/pretrain/prepare_dataset.ipynb @@ -1,40 +1,49 @@ { "cells": [ + { + "cell_type": "markdown", + "source": [ + "# prepare_dataset" + ], + "metadata": {} + }, { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": true - }, + "source": [ + "from EduData import get_data\r\n", + "\r\n", + "get_data(\"open-luna\", \"../../data/\")\r\n" + ], "outputs": [ { - "name": "stderr", "output_type": "stream", + "name": "stderr", "text": [ "downloader, INFO http://base.ustc.edu.cn/data/OpenLUNA/OpenLUNA.json is saved as ..\\..\\data\\OpenLUNA.json\n" ] }, { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ "Downloading ..\\..\\data\\OpenLUNA.json 100.00%: 275142 | 275142\n" ] }, { + "output_type": "execute_result", "data": { - "text/plain": "'../../data/'" + "text/plain": [ + "'../../data/'" + ] }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 1 } ], - "source": [ - "from EduData import get_data\n", - "\n", - "get_data(\"open-luna\", \"../../data/\")\n" - ] + "metadata": { + "collapsed": true + } } ], "metadata": { @@ -57,5 +66,5 @@ } }, "nbformat": 4, - 
"nbformat_minor": 0 -} \ No newline at end of file + "nbformat_minor": 2 +} diff --git a/examples/pretrain/seg_token/d2v.ipynb b/examples/pretrain/seg_token/d2v.ipynb index 12ee5a99..909d7846 100644 --- a/examples/pretrain/seg_token/d2v.ipynb +++ b/examples/pretrain/seg_token/d2v.ipynb @@ -1,15 +1,55 @@ { "cells": [ + { + "cell_type": "markdown", + "source": [ + "# d2v" + ], + "metadata": {} + }, { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": true - }, + "source": [ + "import warnings\r\n", + "from tqdm import tqdm\r\n", + "import json\r\n", + "from EduNLP.utils import dict2str4sif\r\n", + "\r\n", + "def load_items():\r\n", + " with open(\"../../../data/OpenLUNA.json\", encoding=\"utf-8\") as f:\r\n", + " for line in f:\r\n", + " yield json.loads(line)\r\n", + "\r\n", + "from EduNLP.Pretrain import GensimSegTokenizer\r\n", + "\r\n", + "tokenizer = GensimSegTokenizer(depth=None)\r\n", + "sif_items = []\r\n", + "for item in tqdm(load_items(), \"sifing\"):\r\n", + " keys = [\"stem\"]\r\n", + " item[\"options\"] = eval(item[\"options\"])\r\n", + " if item[\"options\"]:\r\n", + " keys.append(\"options\")\r\n", + " try:\r\n", + " item_str = dict2str4sif(\r\n", + " item,\r\n", + " key_as_tag=True,\r\n", + " add_list_no_tag=False,\r\n", + " keys=keys,\r\n", + " tag_mode=\"head\"\r\n", + " )\r\n", + " except TypeError:\r\n", + " continue\r\n", + " sif_item = tokenizer(\r\n", + " item_str\r\n", + " )\r\n", + " if sif_item:\r\n", + " sif_items.append(sif_item)" + ], "outputs": [ { - "name": "stderr", "output_type": "stream", + "name": "stderr", "text": [ "d:\\env\\python3.8\\lib\\site-packages\\gensim\\similarities\\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package is unavailable. Install Levenhstein (e.g. 
`pip install python-Levenshtein`) to suppress this warning.\n", " warnings.warn(msg)\n", @@ -17,8 +57,8 @@ ] }, { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ "'LaTeX-incompatible input and strict mode is set to \\'warn\\': Unrecognized Unicode character \"①\" (9312) [unknownSymbol]'\n", "'LaTeX-incompatible input and strict mode is set to \\'warn\\': Unrecognized Unicode character \"②\" (9313) [unknownSymbol]'\n", @@ -190,59 +230,65 @@ ] } ], - "source": [ - "import warnings\n", - "from tqdm import tqdm\n", - "import json\n", - "from EduNLP.utils import dict2str4sif\n", - "\n", - "def load_items():\n", - " with open(\"../../../data/OpenLUNA.json\", encoding=\"utf-8\") as f:\n", - " for line in f:\n", - " yield json.loads(line)\n", - "\n", - "from EduNLP.Pretrain import GensimSegTokenizer\n", - "\n", - "tokenizer = GensimSegTokenizer(depth=None)\n", - "sif_items = []\n", - "for item in tqdm(load_items(), \"sifing\"):\n", - " keys = [\"stem\"]\n", - " item[\"options\"] = eval(item[\"options\"])\n", - " if item[\"options\"]:\n", - " keys.append(\"options\")\n", - " try:\n", - " item_str = dict2str4sif(\n", - " item,\n", - " key_as_tag=True,\n", - " add_list_no_tag=False,\n", - " keys=keys,\n", - " tag_mode=\"head\"\n", - " )\n", - " except TypeError:\n", - " continue\n", - " sif_item = tokenizer(\n", - " item_str\n", - " )\n", - " if sif_item:\n", - " sif_items.append(sif_item)" - ] + "metadata": { + "collapsed": true + } }, { "cell_type": "code", "execution_count": 2, + "source": [ + "sif_items[0]" + ], "outputs": [ { + "output_type": "execute_result", "data": { - "text/plain": "[['\\\\SIFTag{stem}'],\n ['已知', '集合'],\n ['mathord',\n '=',\n 'mathord',\n '\\\\mid',\n 'mathord',\n 'textord',\n '{ }',\n '^',\n '-',\n 'textord',\n 'mathord',\n '-',\n 'textord',\n '<',\n 'textord',\n '\\\\{',\n ',',\n 'mathord',\n '=',\n '\\\\{',\n '-',\n 'textord',\n ',',\n 'textord',\n ',',\n 'textord',\n ',',\n 'textord',\n '\\\\}',\n ','],\n ['mathord', 
'\\\\cap', 'mathord', '='],\n ['\\\\SIFTag{options}'],\n ['\\\\', '{', '\\\\'],\n ['\\\\', '{', '\\\\'],\n ['\\\\', '{', '\\\\'],\n ['\\\\', '{', '\\\\']]" + "text/plain": [ + "[['\\\\SIFTag{stem}'],\n", + " ['已知', '集合'],\n", + " ['mathord',\n", + " '=',\n", + " 'mathord',\n", + " '\\\\mid',\n", + " 'mathord',\n", + " 'textord',\n", + " '{ }',\n", + " '^',\n", + " '-',\n", + " 'textord',\n", + " 'mathord',\n", + " '-',\n", + " 'textord',\n", + " '<',\n", + " 'textord',\n", + " '\\\\{',\n", + " ',',\n", + " 'mathord',\n", + " '=',\n", + " '\\\\{',\n", + " '-',\n", + " 'textord',\n", + " ',',\n", + " 'textord',\n", + " ',',\n", + " 'textord',\n", + " ',',\n", + " 'textord',\n", + " '\\\\}',\n", + " ','],\n", + " ['mathord', '\\\\cap', 'mathord', '='],\n", + " ['\\\\SIFTag{options}'],\n", + " ['\\\\', '{', '\\\\'],\n", + " ['\\\\', '{', '\\\\'],\n", + " ['\\\\', '{', '\\\\'],\n", + " ['\\\\', '{', '\\\\']]" + ] }, - "execution_count": 2, "metadata": {}, - "output_type": "execute_result" + "execution_count": 2 } ], - "source": [ - "sif_items[0]" - ], "metadata": { "collapsed": false, "pycharm": { @@ -253,19 +299,21 @@ { "cell_type": "code", "execution_count": 3, + "source": [ + "len(sif_items)" + ], "outputs": [ { + "output_type": "execute_result", "data": { - "text/plain": "770" + "text/plain": [ + "770" + ] }, - "execution_count": 3, "metadata": {}, - "output_type": "execute_result" + "execution_count": 3 } ], - "source": [ - "len(sif_items)" - ], "metadata": { "collapsed": false, "pycharm": { @@ -276,17 +324,17 @@ { "cell_type": "code", "execution_count": null, - "outputs": [], "source": [ - "from EduNLP.Pretrain import train_vector\n", - "from gensim.models.doc2vec import TaggedDocument\n", - "\n", - "train_vector(\n", - " sif_items,\n", - " \"../../../data/w2v/gensim_luna_stem_tf_\",\n", - " 10\n", + "from EduNLP.Pretrain import train_vector\r\n", + "from gensim.models.doc2vec import TaggedDocument\r\n", + "\r\n", + "train_vector(\r\n", + " sif_items,\r\n", + " 
\"../../../data/w2v/gensim_luna_stem_tf_\",\r\n", + " 10\r\n", ")" ], + "outputs": [], "metadata": { "collapsed": false, "pycharm": { @@ -315,5 +363,5 @@ } }, "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file + "nbformat_minor": 2 +} diff --git a/examples/pretrain/seg_token/d2v_d1.ipynb b/examples/pretrain/seg_token/d2v_d1.ipynb index 0e7047b2..49711e88 100644 --- a/examples/pretrain/seg_token/d2v_d1.ipynb +++ b/examples/pretrain/seg_token/d2v_d1.ipynb @@ -1,15 +1,54 @@ { "cells": [ + { + "cell_type": "markdown", + "source": [ + "# d2v_d1" + ], + "metadata": {} + }, { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": true - }, + "source": [ + "from tqdm import tqdm\r\n", + "import json\r\n", + "from EduNLP.utils import dict2str4sif\r\n", + "\r\n", + "def load_items():\r\n", + " with open(\"../../../data/OpenLUNA.json\", encoding=\"utf-8\") as f:\r\n", + " for line in f:\r\n", + " yield json.loads(line)\r\n", + "\r\n", + "from EduNLP.Pretrain import GensimSegTokenizer\r\n", + "\r\n", + "tokenizer = GensimSegTokenizer(depth=1)\r\n", + "sif_items = []\r\n", + "for item in tqdm(load_items(), \"sifing\"):\r\n", + " keys = [\"stem\"]\r\n", + " item[\"options\"] = eval(item[\"options\"])\r\n", + " if item[\"options\"]:\r\n", + " keys.append(\"options\")\r\n", + " try:\r\n", + " item_str = dict2str4sif(\r\n", + " item,\r\n", + " key_as_tag=True,\r\n", + " add_list_no_tag=False,\r\n", + " keys=keys,\r\n", + " tag_mode=\"head\"\r\n", + " )\r\n", + " except TypeError:\r\n", + " continue\r\n", + " sif_item = tokenizer(\r\n", + " item_str\r\n", + " )\r\n", + " if sif_item:\r\n", + " sif_items.append(sif_item)" + ], "outputs": [ { - "name": "stderr", "output_type": "stream", + "name": "stderr", "text": [ "d:\\env\\python3.8\\lib\\site-packages\\gensim\\similarities\\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package is unavailable. Install Levenhstein (e.g. 
`pip install python-Levenshtein`) to suppress this warning.\n", " warnings.warn(msg)\n", @@ -17,8 +56,8 @@ ] }, { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ "'LaTeX-incompatible input and strict mode is set to \\'warn\\': Unrecognized Unicode character \"①\" (9312) [unknownSymbol]'\n", "'LaTeX-incompatible input and strict mode is set to \\'warn\\': Unrecognized Unicode character \"②\" (9313) [unknownSymbol]'\n", @@ -190,58 +229,51 @@ ] } ], - "source": [ - "from tqdm import tqdm\n", - "import json\n", - "from EduNLP.utils import dict2str4sif\n", - "\n", - "def load_items():\n", - " with open(\"../../../data/OpenLUNA.json\", encoding=\"utf-8\") as f:\n", - " for line in f:\n", - " yield json.loads(line)\n", - "\n", - "from EduNLP.Pretrain import GensimSegTokenizer\n", - "\n", - "tokenizer = GensimSegTokenizer(depth=1)\n", - "sif_items = []\n", - "for item in tqdm(load_items(), \"sifing\"):\n", - " keys = [\"stem\"]\n", - " item[\"options\"] = eval(item[\"options\"])\n", - " if item[\"options\"]:\n", - " keys.append(\"options\")\n", - " try:\n", - " item_str = dict2str4sif(\n", - " item,\n", - " key_as_tag=True,\n", - " add_list_no_tag=False,\n", - " keys=keys,\n", - " tag_mode=\"head\"\n", - " )\n", - " except TypeError:\n", - " continue\n", - " sif_item = tokenizer(\n", - " item_str\n", - " )\n", - " if sif_item:\n", - " sif_items.append(sif_item)" - ] + "metadata": { + "collapsed": true + } }, { "cell_type": "code", "execution_count": 3, + "source": [ + "sif_items[1]" + ], "outputs": [ { + "output_type": "execute_result", "data": { - "text/plain": "[['\\\\SIFTag{stem}'],\n ['[TEXT_BEGIN]', '复数'],\n ['[FORMULA_BEGIN]',\n 'mathord',\n '=',\n 'textord',\n '+',\n 'textord',\n 'mathord',\n '+',\n 'mathord',\n 'textord',\n '{ }',\n '^'],\n ['[TEXT_BEGIN]'],\n ['[FORMULA_BEGIN]', 'textord', 'mathord', 'textord', '='],\n ['\\\\SIFTag{options}'],\n ['[TEXT_BEGIN]'],\n ['\\\\SIFSep'],\n ['[TEXT_BEGIN]'],\n ['\\\\SIFSep'],\n 
['[FORMULA_BEGIN]', 'textord', '{ }', '\\\\sqrt'],\n ['\\\\SIFSep'],\n ['[TEXT_BEGIN]']]" + "text/plain": [ + "[['\\\\SIFTag{stem}'],\n", + " ['[TEXT_BEGIN]', '复数'],\n", + " ['[FORMULA_BEGIN]',\n", + " 'mathord',\n", + " '=',\n", + " 'textord',\n", + " '+',\n", + " 'textord',\n", + " 'mathord',\n", + " '+',\n", + " 'mathord',\n", + " 'textord',\n", + " '{ }',\n", + " '^'],\n", + " ['[TEXT_BEGIN]'],\n", + " ['[FORMULA_BEGIN]', 'textord', 'mathord', 'textord', '='],\n", + " ['\\\\SIFTag{options}'],\n", + " ['[TEXT_BEGIN]'],\n", + " ['\\\\SIFSep'],\n", + " ['[TEXT_BEGIN]'],\n", + " ['\\\\SIFSep'],\n", + " ['[FORMULA_BEGIN]', 'textord', '{ }', '\\\\sqrt'],\n", + " ['\\\\SIFSep'],\n", + " ['[TEXT_BEGIN]']]" + ] }, - "execution_count": 3, "metadata": {}, - "output_type": "execute_result" + "execution_count": 3 } ], - "source": [ - "sif_items[1]" - ], "metadata": { "collapsed": false, "pycharm": { @@ -252,19 +284,21 @@ { "cell_type": "code", "execution_count": 4, + "source": [ + "len(sif_items)" + ], "outputs": [ { + "output_type": "execute_result", "data": { - "text/plain": "770" + "text/plain": [ + "770" + ] }, - "execution_count": 4, "metadata": {}, - "output_type": "execute_result" + "execution_count": 4 } ], - "source": [ - "len(sif_items)" - ], "metadata": { "collapsed": false, "pycharm": { @@ -293,5 +327,5 @@ } }, "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file + "nbformat_minor": 2 +} diff --git a/examples/pretrain/seg_token/d2v_d2.ipynb b/examples/pretrain/seg_token/d2v_d2.ipynb index 6ecc5216..076c1e95 100644 --- a/examples/pretrain/seg_token/d2v_d2.ipynb +++ b/examples/pretrain/seg_token/d2v_d2.ipynb @@ -1,15 +1,54 @@ { "cells": [ + { + "cell_type": "markdown", + "source": [ + "# d2v_d2" + ], + "metadata": {} + }, { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": true - }, + "source": [ + "from tqdm import tqdm\r\n", + "import json\r\n", + "from EduNLP.utils import dict2str4sif\r\n", + "\r\n", + "def 
load_items():\r\n", + " with open(\"../../../data/OpenLUNA.json\", encoding=\"utf-8\") as f:\r\n", + " for line in f:\r\n", + " yield json.loads(line)\r\n", + "\r\n", + "from EduNLP.Pretrain import GensimSegTokenizer\r\n", + "\r\n", + "tokenizer = GensimSegTokenizer(depth=2)\r\n", + "sif_items = []\r\n", + "for item in tqdm(load_items(), \"sifing\"):\r\n", + " keys = [\"stem\"]\r\n", + " item[\"options\"] = eval(item[\"options\"])\r\n", + " if item[\"options\"]:\r\n", + " keys.append(\"options\")\r\n", + " try:\r\n", + " item_str = dict2str4sif(\r\n", + " item,\r\n", + " key_as_tag=True,\r\n", + " add_list_no_tag=False,\r\n", + " keys=keys,\r\n", + " tag_mode=\"head\"\r\n", + " )\r\n", + " except TypeError:\r\n", + " continue\r\n", + " sif_item = tokenizer(\r\n", + " item_str\r\n", + " )\r\n", + " if sif_item:\r\n", + " sif_items.append(sif_item)" + ], "outputs": [ { - "name": "stderr", "output_type": "stream", + "name": "stderr", "text": [ "d:\\env\\python3.8\\lib\\site-packages\\gensim\\similarities\\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package is unavailable. Install Levenhstein (e.g. 
`pip install python-Levenshtein`) to suppress this warning.\n", " warnings.warn(msg)\n", @@ -17,8 +56,8 @@ ] }, { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ "'LaTeX-incompatible input and strict mode is set to \\'warn\\': Unrecognized Unicode character \"①\" (9312) [unknownSymbol]'\n", "'LaTeX-incompatible input and strict mode is set to \\'warn\\': Unrecognized Unicode character \"②\" (9313) [unknownSymbol]'\n", @@ -190,58 +229,51 @@ ] } ], - "source": [ - "from tqdm import tqdm\n", - "import json\n", - "from EduNLP.utils import dict2str4sif\n", - "\n", - "def load_items():\n", - " with open(\"../../../data/OpenLUNA.json\", encoding=\"utf-8\") as f:\n", - " for line in f:\n", - " yield json.loads(line)\n", - "\n", - "from EduNLP.Pretrain import GensimSegTokenizer\n", - "\n", - "tokenizer = GensimSegTokenizer(depth=2)\n", - "sif_items = []\n", - "for item in tqdm(load_items(), \"sifing\"):\n", - " keys = [\"stem\"]\n", - " item[\"options\"] = eval(item[\"options\"])\n", - " if item[\"options\"]:\n", - " keys.append(\"options\")\n", - " try:\n", - " item_str = dict2str4sif(\n", - " item,\n", - " key_as_tag=True,\n", - " add_list_no_tag=False,\n", - " keys=keys,\n", - " tag_mode=\"head\"\n", - " )\n", - " except TypeError:\n", - " continue\n", - " sif_item = tokenizer(\n", - " item_str\n", - " )\n", - " if sif_item:\n", - " sif_items.append(sif_item)" - ] + "metadata": { + "collapsed": true + } }, { "cell_type": "code", "execution_count": 2, + "source": [ + "sif_items[1]" + ], "outputs": [ { + "output_type": "execute_result", "data": { - "text/plain": "[['\\\\SIFTag{stem}'],\n ['[TEXT_BEGIN]', '复数'],\n ['[FORMULA_BEGIN]',\n 'mathord',\n '=',\n 'textord',\n '+',\n 'textord',\n 'mathord',\n '+',\n 'mathord',\n 'textord',\n '{ }',\n '^'],\n ['[TEXT_BEGIN]'],\n ['[FORMULA_BEGIN]', 'textord', 'mathord', 'textord', '='],\n ['\\\\SIFTag{options}'],\n ['[TEXT_BEGIN]'],\n ['\\\\SIFSep'],\n ['[TEXT_BEGIN]'],\n ['\\\\SIFSep'],\n 
['[FORMULA_BEGIN]', 'textord', '{ }', '\\\\sqrt'],\n ['\\\\SIFSep'],\n ['[TEXT_BEGIN]']]" + "text/plain": [ + "[['\\\\SIFTag{stem}'],\n", + " ['[TEXT_BEGIN]', '复数'],\n", + " ['[FORMULA_BEGIN]',\n", + " 'mathord',\n", + " '=',\n", + " 'textord',\n", + " '+',\n", + " 'textord',\n", + " 'mathord',\n", + " '+',\n", + " 'mathord',\n", + " 'textord',\n", + " '{ }',\n", + " '^'],\n", + " ['[TEXT_BEGIN]'],\n", + " ['[FORMULA_BEGIN]', 'textord', 'mathord', 'textord', '='],\n", + " ['\\\\SIFTag{options}'],\n", + " ['[TEXT_BEGIN]'],\n", + " ['\\\\SIFSep'],\n", + " ['[TEXT_BEGIN]'],\n", + " ['\\\\SIFSep'],\n", + " ['[FORMULA_BEGIN]', 'textord', '{ }', '\\\\sqrt'],\n", + " ['\\\\SIFSep'],\n", + " ['[TEXT_BEGIN]']]" + ] }, - "execution_count": 2, "metadata": {}, - "output_type": "execute_result" + "execution_count": 2 } ], - "source": [ - "sif_items[1]" - ], "metadata": { "collapsed": false, "pycharm": { @@ -270,5 +302,5 @@ } }, "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file + "nbformat_minor": 2 +} diff --git a/examples/seg/seg.ipynb b/examples/seg/seg.ipynb new file mode 100644 index 00000000..751d1439 --- /dev/null +++ b/examples/seg/seg.ipynb @@ -0,0 +1,287 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Code for beginner to learn how to use seg\r\n", + "\r\n", + "In this notebook, we will show you the basic usage to apply SIF to prepare data for conducting scientific experiments.\r\n", + "\r\n", + "We use the demo item (an exercise from LUNA) shown in the following Figure.\r\n", + "![Figure](../../asset/_static/item.png).\r\n", + "The SIF expression of this item can be written as follows:" + ], + "metadata": { + "collapsed": true, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 1, + "source": [ + "item = {\r\n", + " \"stem\": r\"如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 
其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\SIFChoice$$\\FigureID{1}$\",\r\n", + " \"options\": [\"$p_1=p_2$\", \"$p_1=p_3$\", \"$p_2=p_3$\", \"$p_1=p_2+p_3$\"]\r\n", + "}\r\n", + "item[\"stem\"]" + ], + "outputs": [ + { + "data": { + "text/plain": [ + "'如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\\\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\\\SIFChoice$$\\\\FigureID{1}$'" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 2, + "source": [ + "from PIL import Image\r\n", + "img = Image.open(\"../../asset/_static/item_figure.png\")\r\n", + "figures = {\"1\": img}\r\n", + "img" + ], + "outputs": [ + { + "data": { + "text/plain": [ + "" + ], + "image/png": "" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "### Segment" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 4, + "source": [ + "sif4sci(item[\"stem\"], figures=figures, tokenization=False, symbol=\"tfgm\")\r\n" + ], + "outputs": [ + { + "data": { + "text/plain": [ + "['[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[MARK]', '[FIGURE]']" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 5, + "source": [ + "segments = sif4sci(item[\"stem\"], figures=figures, 
tokenization=False)\r\n", + "segments" + ], + "outputs": [ + { + "data": { + "text/plain": [ + "['如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形', 'ABC', '的斜边', 'BC', ', 直角边', 'AB', ', ', 'AC', '.', '\\\\bigtriangleup ABC', '的三边所围成的区域记为', 'I', ',黑色部分记为', 'II', ', 其余部分记为', 'III', '.在整个图形中随机取一点,此点取自', 'I,II,III', '的概率分别记为', 'p_1,p_2,p_3', ',则', '\\\\SIFChoice', \\FigureID{1}]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 6, + "source": [ + "segments.text_segments" + ], + "outputs": [ + { + "data": { + "text/plain": [ + "['如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形',\n", + " '的斜边',\n", + " ', 直角边',\n", + " ', ',\n", + " '.',\n", + " '的三边所围成的区域记为',\n", + " ',黑色部分记为',\n", + " ', 其余部分记为',\n", + " '.在整个图形中随机取一点,此点取自',\n", + " '的概率分别记为',\n", + " ',则']" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 7, + "source": [ + "segments.figure_segments" + ], + "outputs": [ + { + "data": { + "text/plain": [ + "[\\FigureID{1}]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 8, + "source": [ + "segments.figure_segments[0].figure" + ], + "outputs": [ + { + "data": { + "text/plain": [ + "" + ], + "image/png": "" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 9, + "source": [ + "segments.formula_segments" + ], + "outputs": [ + { + "data": { + "text/plain": [ + "['ABC',\n", + " 'BC',\n", + " 'AB',\n", + " 'AC',\n", + " '\\\\bigtriangleup ABC',\n", + " 'I',\n", + " 'II',\n", + " 
'III',\n", + " 'I,II,III',\n", + " 'p_1,p_2,p_3']" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 10, + "source": [ + "segments.ques_mark_segments" + ], + "outputs": [ + { + "data": { + "text/plain": [ + "['\\\\SIFChoice']" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + } + ], + "metadata": { + "orig_nbformat": 4, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/examples/sif/sif.ipynb b/examples/sif/sif.ipynb index 8bd80a7e..3376cd6d 100644 --- a/examples/sif/sif.ipynb +++ b/examples/sif/sif.ipynb @@ -627,4 +627,4 @@ }, "nbformat": 4, "nbformat_minor": 0 -} \ No newline at end of file +} diff --git a/examples/sif/sif_addition.ipynb b/examples/sif/sif_addition.ipynb new file mode 100644 index 00000000..57830c43 --- /dev/null +++ b/examples/sif/sif_addition.ipynb @@ -0,0 +1,740 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# sif_addition" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 3, + "source": [ + "from EduNLP.SIF import is_sif, to_sif,sif4sci" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## is_sif" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 4, + "source": [ + " text = '若$x,y$满足约束条件' \\\r\n", + " '$\\\\left\\\\{\\\\begin{array}{c}2 x+y-2 \\\\leq 0 \\\\\\\\ x-y-1 \\\\geq 0 \\\\\\\\ y+1 \\\\geq 0\\\\end{array}\\\\right.$,' \\\r\n", + " '则$z=x+7 y$的最大值$\\\\SIFUnderline$'\r\n", + " \r\n", + "is_sif(text)\r\n" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": {}, + "execution_count": 4 + } + ], + "metadata": {} + }, + { + "cell_type": 
"code", + "execution_count": 5, + "source": [ + "text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...'\r\n", + "is_sif(text)" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "False" + ] + }, + "metadata": {}, + "execution_count": 5 + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## to_sif" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 6, + "source": [ + "text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...'\r\n", + "to_sif(text)" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'某校一个课外学习小组为研究某作物的发芽率$y$和温度$x$(单位...'" + ] + }, + "metadata": {}, + "execution_count": 6 + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## sif4sci\n", + " to_symbolize:\n", + " - \"t\": text\n", + " - \"f\": formula\n", + " - \"g\": figure\n", + " - \"m\": question mark" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 14, + "source": [ + " test_item = r\"如图所示,则$\\bigtriangleup ABC$的面积是$\\SIFBlank$。$\\FigureID{1}$\"\r\n", + " t1 = sif4sci(test_item)\r\n", + " t1" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['如图所示', '\\\\bigtriangleup', 'ABC', '面积', '\\\\SIFBlank', \\FigureID{1}]" + ] + }, + "metadata": {}, + "execution_count": 14 + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 15, + "source": [ + "t1.describe()" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "{'t': 2, 'f': 2, 'g': 1, 'm': 1}" + ] + }, + "metadata": {}, + "execution_count": 15 + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 17, + "source": [ + "with t1.filter('fgm'):\n", + " print(t1)" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "['如图所示', '面积']\n" + ] + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 18, + "source": [ + "with 
t1.filter(keep='t'):\n", + " print(t1)" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "['如图所示', '面积']\n" + ] + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 19, + "source": [ + "with t1.filter():\n", + " print(t1)" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "['如图所示', '\\\\bigtriangleup', 'ABC', '面积', '\\\\SIFBlank', \\FigureID{1}]\n" + ] + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 20, + "source": [ + "t1.text_tokens" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['如图所示', '面积']" + ] + }, + "metadata": {}, + "execution_count": 20 + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 23, + "source": [ + "t1.formula_tokens" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['\\\\bigtriangleup', 'ABC']" + ] + }, + "metadata": {}, + "execution_count": 23 + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 24, + "source": [ + "t1.figure_tokens" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[\\FigureID{1}]" + ] + }, + "metadata": {}, + "execution_count": 24 + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 25, + "source": [ + "t1.ques_mark_tokens" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['\\\\SIFBlank']" + ] + }, + "metadata": {}, + "execution_count": 25 + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 26, + "source": [ + "sif4sci(test_item, symbol=\"gm\", tokenization_params={\"formula_params\": {\"method\": \"ast\"}})" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['如图所示', , '面积', '[MARK]', '[FIGURE]']" + ] + }, + "metadata": {}, + "execution_count": 26 + } + ], + "metadata": {} + }, + { + 
"cell_type": "code", + "execution_count": 27, + "source": [ + "sif4sci(test_item, symbol=\"tfgm\")" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['[TEXT]', '[FORMULA]', '[TEXT]', '[MARK]', '[TEXT]', '[FIGURE]']" + ] + }, + "metadata": {}, + "execution_count": 27 + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 28, + "source": [ + "sif4sci(test_item, symbol=\"gm\", tokenization_params={\"formula_params\": {\"method\": \"ast\", \"return_type\": \"list\"}})" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['如图所示', '\\\\bigtriangleup', 'A', 'B', 'C', '面积', '[MARK]', '[FIGURE]']" + ] + }, + "metadata": {}, + "execution_count": 28 + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 29, + "source": [ + " test_item_1 = {\n", + " \"stem\": r\"若$x=2$, $y=\\sqrt{x}$,则下列说法正确的是$\\SIFChoice$\",\n", + " \"options\": [r\"$x < y$\", r\"$y = x$\", r\"$y < x$\"]\n", + " }" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 30, + "source": [ + " tls = [\n", + " sif4sci(e, symbol=\"gm\",\n", + " tokenization_params={\n", + " \"formula_params\": {\n", + " \"method\": \"ast\", \"return_type\": \"list\", \"ord2token\": True, \"var_numbering\": True,\n", + " \"link_variable\": False}\n", + " })\n", + " for e in ([test_item_1[\"stem\"]] + test_item_1[\"options\"])\n", + " ]" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 33, + "source": [ + "tls" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[['mathord_0', '=', 'textord', 'mathord_1', '=', 'mathord_0', '{ }', '\\\\sqrt', '说法', '正确', '[MARK]'],\n", + " ['mathord_0', '<', 'mathord_1'],\n", + " ['mathord_0', '=', 'mathord_1'],\n", + " ['mathord_0', '<', 'mathord_1']]" + ] + }, + "metadata": {}, + "execution_count": 33 + } + ], + "metadata": {} + }, + { + "cell_type": 
"code", + "execution_count": 34, + "source": [ + "tls[1:]" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[['mathord_0', '<', 'mathord_1'],\n", + " ['mathord_0', '=', 'mathord_1'],\n", + " ['mathord_0', '<', 'mathord_1']]" + ] + }, + "metadata": {}, + "execution_count": 34 + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 35, + "source": [ + "from EduNLP.utils import dict2str4sif\n", + "\n", + "test_item_1_str = dict2str4sif(test_item_1, tag_mode=\"head\", add_list_no_tag=False)\n", + "test_item_1_str " + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'$\\\\SIFTag{stem}$若$x=2$, $y=\\\\sqrt{x}$,则下列说法正确的是$\\\\SIFChoice$$\\\\SIFTag{options}$$x < y$$\\\\SIFSep$$y = x$$\\\\SIFSep$$y < x$'" + ] + }, + "metadata": {}, + "execution_count": 35 + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 36, + "source": [ + "tl1 = sif4sci(\n", + " test_item_1_str, \n", + " symbol=\"gm\", \n", + " tokenization_params={\n", + " \"formula_params\": {\"method\": \"ast\", \"return_type\": \"list\", \"ord2token\": True}\n", + " })\n", + " " + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 37, + "source": [ + "tl1.get_segments()[0]" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['\\\\SIFTag{stem}']" + ] + }, + "metadata": {}, + "execution_count": 37 + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 38, + "source": [ + "tl1.get_segments()[1:3]" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[['[TEXT_BEGIN]', '[TEXT_END]'],\n", + " ['[FORMULA_BEGIN]', 'mathord', '=', 'textord', '[FORMULA_END]']]" + ] + }, + "metadata": {}, + "execution_count": 38 + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 39, + "source": [ + "tl1.get_segments(add_seg_type=False)[0:3]" + ], 
+ "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[['\\\\SIFTag{stem}'],\n", + " ['mathord', '=', 'textord'],\n", + " ['mathord', '=', 'mathord', '{ }', '\\\\sqrt']]" + ] + }, + "metadata": {}, + "execution_count": 39 + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 41, + "source": [ + "test_item_2 = {\"options\": [r\"$x < y$\", r\"$y = x$\", r\"$y < x$\"]}" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 42, + "source": [ + "test_item_2_str = dict2str4sif(test_item_2, tag_mode=\"head\", add_list_no_tag=False)" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 43, + "source": [ + "test_item_2_str" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'$\\\\SIFTag{options}$$x < y$$\\\\SIFSep$$y = x$$\\\\SIFSep$$y < x$'" + ] + }, + "metadata": {}, + "execution_count": 43 + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 44, + "source": [ + "tl2 = sif4sci(test_item_2_str, symbol=\"gms\",\n", + " tokenization_params={\"formula_params\": {\"method\": \"ast\", \"return_type\": \"list\"}})\n", + "tl2 " + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['\\\\SIFTag{options}', 'x', '<', 'y', '[SEP]', 'y', '=', 'x', '[SEP]', 'y', '<', 'x']" + ] + }, + "metadata": {}, + "execution_count": 44 + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 45, + "source": [ + "tl2.get_segments(add_seg_type=False)" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[['\\\\SIFTag{options}'],\n", + " ['x', '<', 'y'],\n", + " ['[SEP]'],\n", + " ['y', '=', 'x'],\n", + " ['[SEP]'],\n", + " ['y', '<', 'x']]" + ] + }, + "metadata": {}, + "execution_count": 45 + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 46, + "source": [ + 
"tl2.get_segments(add_seg_type=False, drop=\"s\")" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[['\\\\SIFTag{options}'], ['x', '<', 'y'], ['y', '=', 'x'], ['y', '<', 'x']]" + ] + }, + "metadata": {}, + "execution_count": 46 + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 47, + "source": [ + "tl3 = sif4sci(test_item_1[\"stem\"], symbol=\"gs\")\n", + "tl3.text_segments" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[['说法', '正确']]" + ] + }, + "metadata": {}, + "execution_count": 47 + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 48, + "source": [ + "tl3.formula_segments" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[['x', '=', '2'], ['y', '=', '\\\\sqrt', '{', 'x', '}']]" + ] + }, + "metadata": {}, + "execution_count": 48 + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 49, + "source": [ + "tl3.figure_segments" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[]" + ] + }, + "metadata": {}, + "execution_count": 49 + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 50, + "source": [ + "tl3.ques_mark_segments" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[['\\\\SIFChoice']]" + ] + }, + "metadata": {}, + "execution_count": 50 + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [], + "outputs": [], + "metadata": {} + } + ], + "metadata": { + "orig_nbformat": 4, + "language_info": { + "name": "python", + "version": "3.8.5", + "mimetype": "text/x-python", + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "pygments_lexer": "ipython3", + "nbconvert_exporter": "python", + "file_extension": ".py" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3.8.5 
64-bit" + }, + "interpreter": { + "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/examples/test_model/test_gensim_luna_stem_tf_d2v_256.bin b/examples/test_model/test_gensim_luna_stem_tf_d2v_256.bin new file mode 100644 index 00000000..7a56bca4 Binary files /dev/null and b/examples/test_model/test_gensim_luna_stem_tf_d2v_256.bin differ diff --git a/examples/tokenizer/test_stopwords.txt b/examples/tokenizer/test_stopwords.txt new file mode 100644 index 00000000..8183ecf4 --- /dev/null +++ b/examples/tokenizer/test_stopwords.txt @@ -0,0 +1,9 @@ +一旦 +一时 +一来 +一样 +一次 +一片 +一番 +一直 +一致 \ No newline at end of file diff --git a/examples/tokenizer/tokenizer.ipynb b/examples/tokenizer/tokenizer.ipynb new file mode 100644 index 00000000..4819b00d --- /dev/null +++ b/examples/tokenizer/tokenizer.ipynb @@ -0,0 +1,501 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Tokenizer\n", + "\n", + "## 概述\n", + "\n", + "为了方便后续向量化表征试题,本模块提供题目文本的令牌化解析(Tokenization),即将题目转换成令牌序列。 \n", + "\n", + "根据构成题目的元素类型,解析功能分为 **“文本解析”** 和 **“公式解析”** 两部分。\n", + "\n", + "### 文本解析\n", + "\n", + "根据题目文本切分粒度的大小,文本解析又分为 **“句解析”** 和 **“词解析”**。\n", + "\n", + "(1) 句解析(sentence-tokenization):将较长的文档切分成若干句子的过程称为“分句”。每个句子为一个“令牌”(token)。(待实现) \n", + " \n", + "\n", + "(2) 词解析(text-tokenization):一个句子(不含公式)是由若干“词”按顺序构成的,将一个句子切分为若干词的过程称为“词解析”。根据词的粒度大小,又可细分为“词组解析”和\"单字解析\"。\n", + "- 词组解析 (word-tokenization):每一个词组为一个“令牌”(token)。\n", + "- 单字解析 (char-tokenization):单个字符即为一个“令牌”(token)。\n", + "\n", + "### 公式解析\n", + "\n", + "公式解析(formula-tokenization):理科类文本中常常含有公式。将一个符合 latex 语法的公式切分为标记字符列表的过程称为“公式解析”。每个标记字符为一个“令牌”(token)。 \n", + " " + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 文本解析" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "### 句解析\n", + "\n", + "待实现..." 
+ ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "### 词解析\n", + "\n", + "词解析分为两个主要步骤: \n", + "\n", + "(1) 分词: \n", + "- 词组解析:使用分词工具切分并提取题目文本中的词。 \n", + " 本项目目前支持的分词工具有:`jieba` \n", + "- 单字解析:按字符划分。\n", + " \n", + " \n", + "(2) 筛选:过滤指定的停用词。 \n", + "- 本项目默认使用的停用词表:[stopwords](https://github.com/bigdata-ustc/EduNLP/blob/master/EduNLP/meta_data/sif_stopwords.txt) \n", + "- 你也可以使用自己的停用词表,具体使用方法见下面的示例。\n" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 1, + "source": [ + "# 导入模块\n", + "from EduNLP.SIF.tokenization.text import tokenize " + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 2, + "source": [ + "# 输入\n", + "text = \"三角函数是基本初等函数之一\"" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "#### 词组解析\n", + "\n", + "分词粒度参数选择 word: `granularity = \"word\"` " + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 3, + "source": [ + "# 输出:默认使用 EduNLP 项目提供的停用词表\n", + "tokenize(text, granularity=\"word\")" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['三角函数', '初等', '函数']" + ] + }, + "metadata": {}, + "execution_count": 3 + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "#### 单字解析\n", + "\n", + "分词粒度参数选择 char: `granularity = \"char\"` " + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 4, + "source": [ + "# 输出:默认使用 EduNLP 项目提供的停用词表\n", + "tokenize(text, granularity=\"char\")" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['三', '角', '函', '数', '基', '初', '函', '数']" + ] + }, + "metadata": {}, + "execution_count": 4 + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "#### 停用词表" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 5, + "source": [ + "# 获取自己的停用词表\n", + "spath = \"test_stopwords.txt\"\n", + "from
EduNLP.SIF.tokenization.text.stopwords import get_stopwords\n", + "stopwords = get_stopwords(spath)\n", + "stopwords" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "{'一旦', '一时', '一来', '一样', '一次', '一片', '一番', '一直', '一致'}" + ] + }, + "metadata": {}, + "execution_count": 5 + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 6, + "source": [ + "# 输出:传入停用词表(stopwords)\n", + "tokenize(text,granularity=\"word\",stopwords=stopwords)" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['三角函数', '是', '基本', '初等', '函数', '之一']" + ] + }, + "metadata": {}, + "execution_count": 6 + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 公式解析\n", + "切分出 latex 公式的每个标记符号。针对本模块更加详细的解释参见 [formula](../formula/formula.ipynb)" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 7, + "source": [ + "# 导入模块\n", + "from EduNLP.SIF.tokenization.formula import tokenize" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "- 输入" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 8, + "source": [ + "formula = \"\\\\frac{\\\\pi}{x + y} + 1 = x\"" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "- 输出" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "(1)如果您想按 latex 语法标记拆分公式的各个部分,并得到顺序序列结果,输出方法可以选择:`linear`" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 9, + "source": [ + "tokenize(formula, method=\"linear\")" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['\\\\frac', '{', '\\\\pi', '}', '{', 'x', '+', 'y', '}', '+', '1', '=', 'x']" + ] + }, + "metadata": {}, + "execution_count": 9 + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "(2) 如果您想得到公式解析出的语法分析树序列,输出方法可以选择:`ast`\n", + "> 抽象语法分析树,简称语法树(Syntax 
tree),是源代码语法结构的一种抽象表示。它以树状的形式表现编程语言的语法结构,树上的每个节点都表示源代码中的一种结构。 \n", + "> 因此,ast 可以看做是公式的语法结构表征。" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 10, + "source": [ + "tokenize(formula, method=\"ast\", return_type=\"list\", ord2token=False)" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['\\\\pi', '{ }', 'x', '+', 'y', '{ }', '\\\\frac', '+', '1', '=', 'x']" + ] + }, + "metadata": {}, + "execution_count": 10 + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "(3)如果您只是关心公式的结构和类型,并不关心变量具体是什么,比如二元二次方程 `x^2 + y = 1` ,它从公式结构和类型上来说,和 `w^2 + z = 1` 没有区别。 \n", + "此时,您可以设置如下参数:`ord2token = True`,将公式变量名转换成 token" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 11, + "source": [ + "# 输出形式选择抽象语法分析树(ast)且将公式变量名转换成 token\n", + "tokenize(formula, method=\"ast\", return_type=\"list\", ord2token=True)" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['mathord',\n", + " '{ }',\n", + " 'mathord',\n", + " '+',\n", + " 'mathord',\n", + " '{ }',\n", + " '\\\\frac',\n", + " '+',\n", + " 'textord',\n", + " '=',\n", + " 'mathord']" + ] + }, + "metadata": {}, + "execution_count": 11 + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "(4) 如果您除了 (3) 中提供的功能之外,还需要区分不同的变量。此时可以另外设置参数:`var_numbering=True`" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 12, + "source": [ + "# 输出形式选择抽象语法分析树(ast)且将公式变量名转换成带编号的 token\n", + "tokenize(formula, method=\"ast\", return_type=\"list\", ord2token=True, var_numbering=True)" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['mathord_con',\n", + " '{ }',\n", + " 'mathord_0',\n", + " '+',\n", + " 'mathord_1',\n", + " '{ }',\n", + " '\\\\frac',\n", + " '+',\n", + " 'textord',\n", + " '=',\n", + " 'mathord_0']" + ] + }, + "metadata": {}, + "execution_count": 12 + } + ], + "metadata": {} + }, + { + 
"cell_type": "markdown", + "source": [ + "## 综合解析\n", + "\n", + "综合解析,即综合以上两种解析方式(文本解析 + 公式解析),提供对题目文本的全解析。另外,如遇到特殊符号将转换成常量,例如:\n", + "```python\n", + "FIGURE_SYMBOL = \"[FIGURE]\" # $\\FigureID{1}$\n", + "QUES_MARK_SYMBOL = \"[MARK]\" # $\\SIFChoice$\n", + "```\n" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 39, + "source": [ + "# 导入模块\n", + "from EduNLP.Tokenizer import get_tokenizer\n", + "\n", + "# 输入\n", + "item = {\n", + " \"如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\SIFChoice$$\\FigureID{1}$\"\n", + "}\n", + "\n", + "# 输出\n", + "tokenizer = get_tokenizer(\"text\")\n", + "tokens = tokenizer(item)\n", + "next(tokens) " + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['如图',\n", + " '古希腊',\n", + " '数学家',\n", + " '希波',\n", + " '克拉底',\n", + " '研究',\n", + " '几何图形',\n", + " '此图',\n", + " '三个',\n", + " '半圆',\n", + " '三个',\n", + " '半圆',\n", + " '直径',\n", + " '直角三角形',\n", + " 'ABC',\n", + " '斜边',\n", + " 'BC',\n", + " '直角',\n", + " 'AB',\n", + " 'AC',\n", + " '\\x08',\n", + " 'igtriangleupABC',\n", + " '三边',\n", + " '围成',\n", + " '区域',\n", + " '记',\n", + " 'I',\n", + " '黑色',\n", + " '记',\n", + " 'II',\n", + " '其余部分',\n", + " '记',\n", + " 'III',\n", + " '图形',\n", + " '中',\n", + " '随机',\n", + " '取',\n", + " '一点',\n", + " '此点',\n", + " '取自',\n", + " 'I',\n", + " ',',\n", + " 'II',\n", + " ',',\n", + " 'III',\n", + " '概率',\n", + " '记',\n", + " 'p',\n", + " '_',\n", + " '1',\n", + " ',',\n", + " 'p',\n", + " '_',\n", + " '2',\n", + " ',',\n", + " 'p',\n", + " '_',\n", + " '3',\n", + " '[MARK]',\n", + " '[FIGURE]']" + ] + }, + "metadata": {}, + "execution_count": 39 + } + ], + "metadata": {} + } + ], + "metadata": { + "orig_nbformat": 4, + "language_info": { + "name": "python", + "version": "3.8.5", + "mimetype": "text/x-python", + "codemirror_mode": { +
"name": "ipython", + "version": 3 + }, + "pygments_lexer": "ipython3", + "nbconvert_exporter": "python", + "file_extension": ".py" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3.8.5 64-bit" + }, + "interpreter": { + "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/examples/utils/data.ipynb b/examples/utils/data.ipynb new file mode 100644 index 00000000..d1045c66 --- /dev/null +++ b/examples/utils/data.ipynb @@ -0,0 +1,199 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# data" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 1, + "source": [ + "from EduNLP.utils import dict2str4sif" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/home/lvrui/.local/lib/python3.8/site-packages/gensim/similarities/__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package is unavailable. Install Levenhstein (e.g. 
`pip install python-Levenshtein`) to suppress this warning.\n", + " warnings.warn(msg)\n" + ] + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 3, + "source": [ + "item = {\r\n", + " \"stem\": r\"若复数$z=1+2 i+i^{3}$,则$|z|=$\",\r\n", + " \"options\": ['0', '1', r'$\\sqrt{2}$', '2'],\r\n", + " }\r\n", + "item" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "{'stem': '若复数$z=1+2 i+i^{3}$,则$|z|=$',\n", + " 'options': ['0', '1', '$\\\\sqrt{2}$', '2']}" + ] + }, + "metadata": {}, + "execution_count": 3 + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 4, + "source": [ + "# 给题目各个部分加标签\r\n", + "dict2str4sif(item) # doctest: +ELLIPSIS" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'$\\\\SIFTag{stem_begin}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\\\SIFTag{stem_end}$$\\\\SIFTag{options_begin}$$\\\\SIFTag{list_0}$0$\\\\SIFTag{list_1}$1$\\\\SIFTag{list_2}$$\\\\sqrt{2}$$\\\\SIFTag{list_3}$2$\\\\SIFTag{options_end}$'" + ] + }, + "metadata": {}, + "execution_count": 4 + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 5, + "source": [ + "dict2str4sif(item, add_list_no_tag=True) # doctest: +ELLIPSIS" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'$\\\\SIFTag{stem_begin}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\\\SIFTag{stem_end}$$\\\\SIFTag{options_begin}$$\\\\SIFTag{list_0}$0$\\\\SIFTag{list_1}$1$\\\\SIFTag{list_2}$$\\\\sqrt{2}$$\\\\SIFTag{list_3}$2$\\\\SIFTag{options_end}$'" + ] + }, + "metadata": {}, + "execution_count": 5 + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 7, + "source": [ + "dict2str4sif(item, tag_mode=\"head\") # doctest: +ELLIPSIS" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'$\\\\SIFTag{stem}$若复数$z=1+2 
i+i^{3}$,则$|z|=$$\\\\SIFTag{options}$$\\\\SIFTag{list_0}$0$\\\\SIFTag{list_1}$1$\\\\SIFTag{list_2}$$\\\\sqrt{2}$$\\\\SIFTag{list_3}$2'" + ] + }, + "metadata": {}, + "execution_count": 7 + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 9, + "source": [ + "dict2str4sif(item, tag_mode=\"tail\") # doctest: +ELLIPSIS" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'若复数$z=1+2 i+i^{3}$,则$|z|=$$\\\\SIFTag{stem}$$\\\\SIFTag{list_0}$0$\\\\SIFTag{list_1}$1$\\\\SIFTag{list_2}$$\\\\sqrt{2}$$\\\\SIFTag{list_3}$2$\\\\SIFTag{options}$'" + ] + }, + "metadata": {}, + "execution_count": 9 + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 10, + "source": [ + "dict2str4sif(item, add_list_no_tag=False) # doctest: +ELLIPSIS" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'$\\\\SIFTag{stem_begin}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\\\SIFTag{stem_end}$$\\\\SIFTag{options_begin}$0$\\\\SIFSep$1$\\\\SIFSep$$\\\\sqrt{2}$$\\\\SIFSep$2$\\\\SIFTag{options_end}$'" + ] + }, + "metadata": {}, + "execution_count": 10 + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 11, + "source": [ + "dict2str4sif(item, key_as_tag=False)" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'若复数$z=1+2 i+i^{3}$,则$|z|=$0$\\\\SIFSep$1$\\\\SIFSep$$\\\\sqrt{2}$$\\\\SIFSep$2'" + ] + }, + "metadata": {}, + "execution_count": 11 + } + ], + "metadata": {} + } + ], + "metadata": { + "orig_nbformat": 4, + "language_info": { + "name": "python", + "version": "3.8.5", + "mimetype": "text/x-python", + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "pygments_lexer": "ipython3", + "nbconvert_exporter": "python", + "file_extension": ".py" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3.8.5 64-bit" + }, + "interpreter": { + "hash": 
"e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/examples/vectorization/get_pretrained_i2v.ipynb b/examples/vectorization/get_pretrained_i2v.ipynb new file mode 100644 index 00000000..9fe707b7 --- /dev/null +++ b/examples/vectorization/get_pretrained_i2v.ipynb @@ -0,0 +1,211 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# get_pretrained_i2v\n", + "\n", + "## 概述\n", + "\n", + "使用 EduNLP 项目组给定的预训练模型将给定的题目文本转成向量。\n", + "\n", + "- 优点:简单方便。\n", + "- 缺点:只能使用项目中给定的模型,局限性较大。\n" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 导入功能块" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 1, + "source": [ + "from EduNLP import get_pretrained_i2v" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 输入\n", + "\n", + "类型:str \n", + "内容:题目文本 (text)" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 2, + "source": [ + "item = {\n", + "\"如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\SIFChoice$$\\FigureID{1}$\"\n", + "}\n" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 模型选择与使用" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "根据题目所属学科选择预训练模型: \n", + "\n", + " 预训练模型名称 | 模型训练数据的所属学科 \n", + " -------------- | ---------------------- \n", + " d2v_all_256 | 全学科 \n", + " d2v_sci_256 | 理科 \n", + " d2v_eng_256 | 英语 \n", + " d2v_lit_256 | 文科 \n", + "\n" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 3, + "source": [ + "i2v = get_pretrained_i2v(\"d2v_sci_256\")" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "EduNLP, INFO Use pretrained t2v model d2v_sci_256\n", + 
"downloader, INFO http://base.ustc.edu.cn/data/model_zoo/EduNLP/d2v/general_science_256.zip is saved as /home/lvrui/.EduNLP/model/general_science_256.zip\n", + "downloader, INFO file existed, skipped\n" + ] + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "- 注意:\n", + " 默认的 EduNLP 项目存储地址为根目录(`~/.EduNLP`),模型存储地址为项目存储地址下的 `model` 文件夹。您可以通过修改下面的环境变量来修改模型存储地址:\n", + " - EduNLP 项目存储地址:`EDUNLPPATH = xx/xx/xx`\n", + " - 模型存储地址:`EDUNLPMODELPATH = xx/xx/xx`" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 4, + "source": [ + "print(i2v(item))" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "([array([-2.38860980e-01, 7.09681511e-02, -2.71706015e-01, 1.64714813e-01,\n", + " 2.81243492e-02, -1.82386801e-01, 9.22331214e-02, 1.31783364e-02,\n", + " 9.15176645e-02, 3.14464062e-01, 9.37800854e-02, -2.28523940e-01,\n", + " -2.60597020e-01, 6.49375990e-02, 9.75619778e-02, -1.97933778e-01,\n", + " 8.29798505e-02, -2.26491719e-01, -1.77030653e-01, -3.56038064e-02,\n", + " 6.22844934e-01, -2.66110301e-01, 8.00080523e-02, -1.60827965e-01,\n", + " -1.78654417e-01, -1.33000776e-01, 2.76004016e-01, 1.79546073e-01,\n", + " 8.71006995e-02, 2.33958483e-01, 1.76031828e-01, 1.55402005e-01,\n", + " -1.38987333e-01, -1.92975491e-01, -1.09528497e-01, 1.12305783e-01,\n", + " 2.32549626e-02, 7.75609687e-02, -2.43636876e-01, 6.35311157e-02,\n", + " -4.82399836e-02, -2.24204548e-02, 7.49862418e-02, -1.91449642e-01,\n", + " 9.72701237e-02, 4.00750965e-01, 2.81992704e-01, 3.07581365e-01,\n", + " -4.68867749e-01, -3.03025767e-02, -1.95257351e-01, 1.79073047e-02,\n", + " -2.15334237e-01, 9.98005569e-02, -2.62755096e-01, -2.39337608e-01,\n", + " 3.44270498e-01, 1.50241479e-01, -2.96006531e-01, -3.81666899e-01,\n", + " -1.19041964e-01, 6.18071109e-02, 6.49120063e-02, 9.94637012e-02,\n", + " 1.23297565e-01, 1.29930690e-01, 1.27305657e-01, -1.53804764e-01,\n", + " 7.04720244e-03, -1.33500487e-01, -1.51161134e-01, 
1.13862932e-01,\n", + " -2.44814962e-01, -8.95622373e-02, 4.76458520e-02, -5.92206642e-02,\n", + " 2.88407020e-02, -5.88610955e-02, -4.25557904e-02, 3.20446432e-01,\n", + " -2.61463765e-02, 7.19539896e-02, -1.32161498e-01, 1.62227061e-02,\n", + " 1.20197656e-03, -2.03355268e-01, -6.83294982e-03, -2.82588631e-01,\n", + " -1.61395460e-01, -5.05547188e-02, -2.27462381e-01, -1.70932785e-01,\n", + " 1.41351461e-01, -1.30069017e-01, -1.83039993e-01, -6.79691881e-02,\n", + " -2.15642393e-01, -7.84436688e-02, 1.77202985e-01, 4.50607650e-02,\n", + " 7.02605024e-02, 8.01992565e-02, -1.55584306e-01, -2.00563252e-01,\n", + " 1.17082551e-01, 9.73844752e-02, -1.10356934e-01, -1.37866074e-02,\n", + " -8.57235789e-02, -5.56467362e-02, -9.36827138e-02, 6.82030804e-03,\n", + " 6.92379624e-02, -2.28701755e-01, 6.70390204e-02, 1.34586483e-01,\n", + " 2.25231394e-01, 1.33322045e-01, -8.82911906e-02, 1.42205298e-01,\n", + " 2.41012901e-01, 7.94170424e-03, -7.02124536e-02, 2.51370400e-01,\n", + " 1.04983136e-01, -6.39194548e-02, 5.24720028e-02, 7.16757867e-03,\n", + " -1.08169973e-01, -1.08731678e-02, 1.69618204e-02, 7.87692815e-02,\n", + " -2.26539060e-01, 3.29003595e-02, 1.91522852e-01, 2.75921494e-01,\n", + " -1.64055750e-01, 5.83723187e-02, 9.84422341e-02, 3.21688712e-01,\n", + " -2.62310840e-02, -2.08140060e-01, 1.14425711e-01, 1.23823956e-01,\n", + " -8.62085819e-03, -4.14005108e-02, -3.41566652e-02, 1.34680912e-01,\n", + " 4.27634180e-01, 1.42883554e-01, -1.54787973e-01, 7.96157196e-02,\n", + " 1.40678003e-01, 1.39171826e-02, 1.66003749e-01, -4.85638082e-02,\n", + " 5.88261709e-02, 9.51106697e-02, 1.81014258e-02, 1.44485429e-01,\n", + " 4.01205927e-01, 6.77596256e-02, -5.52676022e-01, -1.87850371e-01,\n", + " 1.12366609e-01, -6.84190989e-02, 9.48949978e-02, 2.23454669e-01,\n", + " -1.69843137e-01, 2.09085494e-01, 4.29946512e-01, -3.36349100e-01,\n", + " 6.12608856e-03, -1.46142125e-01, -5.11092655e-02, 8.06671828e-02,\n", + " 1.81744993e-01, -6.78945482e-02, -5.77093139e-02, 
1.52337164e-01,\n", + " 2.21259117e-01, 3.35705757e-01, -2.51778495e-02, 1.03662543e-01,\n", + " -4.21361588e-02, 1.43061429e-01, -3.92947495e-01, -4.89463992e-02,\n", + " -9.15660262e-02, -1.00108273e-01, 3.86523217e-01, -4.25569601e-02,\n", + " 4.10154127e-02, -3.41399819e-01, 2.13903114e-02, 8.09015241e-03,\n", + " 9.56344381e-02, 1.12729572e-01, 7.25207478e-02, -6.64384067e-02,\n", + " -2.73666024e-01, -2.79651750e-02, 1.18422434e-01, -5.22459708e-02,\n", + " -2.47057881e-02, 2.84700710e-02, 2.07451075e-01, -9.74238589e-02,\n", + " 8.08936954e-02, 4.07307222e-02, -1.35277033e-01, 2.18436554e-01,\n", + " 1.28792310e-02, -1.20433331e-01, 2.41929386e-02, 1.28128864e-02,\n", + " -7.39881098e-02, -1.12995692e-01, 7.69245178e-02, -2.87000872e-02,\n", + " 1.64782573e-02, -2.78794408e-01, -2.64403820e-01, -2.43874848e-01,\n", + " 1.77457914e-01, 4.11631197e-01, -6.09753132e-02, 2.84967333e-01,\n", + " 9.81074646e-02, -2.68213183e-01, 1.52153388e-01, 2.42148209e-02,\n", + " 1.24371536e-01, 6.02926640e-03, 8.22689310e-02, 2.82294262e-04,\n", + " -1.40584474e-02, 4.09389734e-02, -2.58334547e-01, -9.83026102e-02,\n", + " -1.91695184e-01, -2.61005852e-02, -2.21736208e-01, -4.36628833e-02,\n", + " 9.49840024e-02, -5.16017936e-02, 2.17577979e-01, 2.58604765e-01,\n", + " 6.33814484e-02, -7.10158283e-03, 9.87893157e-03, -2.26405971e-02,\n", + " 1.67435139e-01, 2.90897069e-03, 2.35914681e-02, 5.43428905e-06],\n", + " dtype=float32)], None)\n" + ] + } + ], + "metadata": {} + } + ], + "metadata": { + "orig_nbformat": 4, + "language_info": { + "name": "python", + "version": "3.8.5", + "mimetype": "text/x-python", + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "pygments_lexer": "ipython3", + "nbconvert_exporter": "python", + "file_extension": ".py" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3.8.5 64-bit" + }, + "interpreter": { + "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a" + } + }, + "nbformat": 4, + 
"nbformat_minor": 2 +} \ No newline at end of file diff --git a/examples/vectorization/get_pretrained_t2v.ipynb b/examples/vectorization/get_pretrained_t2v.ipynb new file mode 100644 index 00000000..c0982e81 --- /dev/null +++ b/examples/vectorization/get_pretrained_t2v.ipynb @@ -0,0 +1,160 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# get_pretrained_t2v\n", + "\n", + "## 概述\n", + "\n", + "使用 EduNLP 项目组给定的预训练模型将一组题目的切分序列表征为向量。\n", + "\n", + "- 优点:简单方便。\n", + "- 缺点:只能使用项目中给定的模型,局限性较大。" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 导入功能块" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 1, + "source": [ + "from tqdm import tqdm\n", + "from EduNLP.SIF.segment import seg\n", + "from EduNLP.SIF.tokenization import tokenize\n", + "from EduNLP.Pretrain import GensimWordTokenizer\n", + "from EduNLP.Vector import get_pretrained_t2v" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 输入\n", + "\n", + "类型:list \n", + "内容:一个题组中每个题目切分序列的组合。\n", + "> 这里需要调用 `GensimWordTokenizer` 将题目文本(`str` 类型)转换成 tokens。" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 2, + "source": [ + "def load_items():\n", + " test_items = [\n", + " {'ques_content':'有公式$\\\\FormFigureID{wrong1?}$和公式$\\\\FormFigureBase64{wrong2?}$,如图$\\\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,若$x,y$满足约束条件$\\\\SIFSep$,则$z=x+7 y$的最大值为$\\\\SIFBlank$'},\n", + " {'ques_content':'如图$\\\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,若$x,y$满足约束条件$\\\\SIFSep$,则$z=x+7 y$的最大值为$\\\\SIFBlank$'},\n", + " {'ques_content':'
Below is a discussion on a website.
t2v\n", + "t2v = get_pretrained_t2v(\"d2v_sci_256\")" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "downloader, INFO http://base.ustc.edu.cn/data/model_zoo/EduNLP/d2v/general_science_256.zip is saved as /home/lvrui/.EduNLP/model/general_science_256.zip\n", + "downloader, INFO file existed, skipped\n" + ] + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "- 注意:\n", + " 默认的 EduNLP 项目存储地址为根目录(`~/.EduNLP`),模型存储地址为项目存储地址下的 `model` 文件夹。您可以通过修改下面的环境变量来修改模型存储地址:\n", + " - EduNLP 项目存储地址:`EDUNLPPATH = xx/xx/xx`\n", + " - 模型存储地址:`EDUNLPMODELPATH = xx/xx/xx`" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "t2v(token_items)" + ], + "outputs": [], + "metadata": {} + } + ], + "metadata": { + "orig_nbformat": 4, + "language_info": { + "name": "python", + "version": "3.8.5", + "mimetype": "text/x-python", + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "pygments_lexer": "ipython3", + "nbconvert_exporter": "python", + "file_extension": ".py" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3.8.5 64-bit" + }, + "interpreter": { + "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/examples/vectorization/i2v.ipynb b/examples/vectorization/i2v.ipynb new file mode 100644 index 00000000..3122fbce --- /dev/null +++ b/examples/vectorization/i2v.ipynb @@ -0,0 +1,192 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# I2V\n", + "\n", + "## 概述\n", + "\n", + "使用自己提供的任一预训练模型(给出模型存放路径即可)将给定的题目文本转成向量。\n", + "\n", + "- 优点:可以使用自己的模型,另可调整训练参数,灵活性强。" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 导入类" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 1, + "source": [ + "from EduNLP.I2V import D2V" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": 
"markdown", + "source": [ + "## 输入\n", + "\n", + "类型:str \n", + "内容:题目文本 (text)" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 37, + "source": [ + "item = {\n", + "\"如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\SIFChoice$$\\FigureID{1}$\"\n", + "}" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 输出" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 34, + "source": [ + "model_path = \"../test_model/test_gensim_luna_stem_tf_d2v_256.bin\"\n", + "i2v = D2V(\"text\",\"d2v\",filepath=model_path, pretrained_t2v = False)\n", + "i2v " + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ] + }, + "metadata": {}, + "execution_count": 34 + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 35, + "source": [ + "i2v(item)" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "([array([ 4.76559885e-02, -1.60574958e-01, 1.94614579e-03, 2.40295693e-01,\n", + " 2.24517003e-01, -3.24351490e-02, 4.35789041e-02, -1.65670961e-02,\n", + " -7.77302235e-02, 4.23757173e-02, 4.62658405e-02, 7.54115507e-02,\n", + " -4.54682261e-02, -1.82153687e-01, 5.55203669e-02, 4.23391759e-02,\n", + " 8.86691213e-02, 6.97413310e-02, -2.47167766e-01, 2.54209518e-01,\n", + " -3.76413465e-02, 3.58376503e-02, -1.39907554e-01, -8.55517760e-02,\n", + " -1.62535697e-01, -4.44540828e-02, -3.99694731e-03, 1.83905549e-02,\n", + " -8.03738683e-02, -9.05910060e-02, 1.45633578e-01, 9.63102728e-02,\n", + " -7.19666481e-02, -8.49684048e-03, -1.51718438e-01, -1.46381939e-02,\n", + " 8.34727809e-02, -7.11122975e-02, 1.66607365e-01, -1.14558250e-01,\n", + " -1.72963589e-01, 4.86062802e-02, -1.63086802e-02, -3.68945636e-02,\n", + " 2.46143237e-01, 5.40899672e-03, 5.04904091e-02, 
1.16586924e-01,\n", + " 7.59096816e-02, 1.20751150e-02, 1.04407202e-02, 3.19544263e-02,\n", + " -6.02783300e-02, 1.18572332e-01, -2.19343737e-01, 2.67594811e-02,\n", + " 1.01860933e-01, -2.87170410e-02, 5.16606905e-02, 1.62313670e-01,\n", + " -5.12879491e-02, -1.62193626e-02, -6.77167401e-02, 1.67254247e-02,\n", + " 1.10977821e-01, 8.02466944e-02, -2.00764649e-02, 1.28788516e-01,\n", + " -7.20706284e-02, -6.22547232e-02, 1.06899485e-01, 4.60059335e-03,\n", + " -1.99650228e-01, -1.38489634e-01, 7.20307231e-02, -4.98757213e-02,\n", + " -1.94095057e-02, -5.85906627e-03, 1.47433639e-01, 4.68258560e-02,\n", + " 9.31144804e-02, -4.59938832e-02, 3.38427201e-02, 4.83937971e-02,\n", + " -1.27312467e-01, 2.01561809e-01, 1.10482745e-01, -1.70595810e-01,\n", + " -9.55015421e-02, -7.73611516e-02, 4.43056040e-02, -1.65684260e-02,\n", + " 1.65379923e-02, -1.26138464e-01, 8.31304193e-02, 2.06687212e-01,\n", + " -1.69529378e-01, 3.43789416e-03, 1.19198427e-01, -1.38129979e-01,\n", + " -1.87937781e-01, -8.27087983e-02, -1.76488962e-02, 8.51018950e-02,\n", + " 8.15693215e-02, 2.30262652e-02, 1.05074964e-01, 3.13350782e-02,\n", + " 1.53877333e-01, 1.01772640e-02, 9.17675197e-02, -1.32400826e-01,\n", + " 5.29836975e-02, 2.52282787e-02, -6.19753152e-02, -5.56256585e-02,\n", + " 3.87686864e-02, 4.30755690e-02, 7.57815093e-02, 2.63280701e-02,\n", + " 4.59217802e-02, -1.17288530e-01, 1.76368475e-01, 9.27482091e-04,\n", + " 2.64808517e-02, 9.73805785e-03, 1.90501258e-01, 1.02596413e-02,\n", + " -5.55249080e-02, -1.17555618e-01, -9.98716354e-02, 1.28057361e-01,\n", + " -4.52451073e-02, 7.51599446e-02, -3.01250312e-02, 6.24186322e-02,\n", + " 5.77449016e-02, 2.07213312e-02, -2.53734970e-03, -1.69801563e-01,\n", + " -2.28750743e-02, -2.55512260e-02, 1.70693725e-01, 2.35232189e-01,\n", + " -2.71384805e-01, -1.84327438e-01, 4.16823551e-02, 8.70332569e-02,\n", + " 1.82847306e-01, 2.76729286e-01, -4.31840494e-02, -1.38212308e-01,\n", + " -3.26297544e-02, -4.25132550e-02, -1.62892416e-01, 
1.91870285e-03,\n", + " 1.52552709e-01, -1.01523520e-02, -9.16219354e-02, -5.46490997e-02,\n", + " 6.06994517e-02, -6.42470419e-02, 7.96310753e-02, -5.70830703e-02,\n", + " -8.82780831e-03, -3.94574478e-02, 9.63162258e-02, 1.54309124e-01,\n", + " 1.81100428e-01, 8.63620341e-02, 1.56518817e-02, -4.08006124e-02,\n", + " 5.20652272e-02, 8.38029310e-02, -1.55516326e-01, 3.57730500e-03,\n", + " -1.50946556e-02, 2.84812655e-02, 1.37905419e-01, 8.77659023e-02,\n", + " 8.23542774e-02, -1.04377635e-01, 4.80731949e-03, 1.18891411e-02,\n", + " 9.32120830e-02, 7.88019150e-02, -1.44494563e-01, -7.53350407e-02,\n", + " -1.13602541e-01, 5.43805361e-02, 1.64935380e-01, -2.00515296e-02,\n", + " 1.92917317e-01, -4.35359031e-02, 8.92477036e-02, -4.37481068e-02,\n", + " 4.01461311e-02, -2.59898454e-01, -1.11872263e-01, -1.25746787e-01,\n", + " -2.34577611e-01, -6.69524372e-02, 5.55978045e-02, -1.91931397e-01,\n", + " 5.87355606e-02, 1.01886272e-01, -2.64038593e-01, -2.05450356e-02,\n", + " -1.97510555e-01, 9.13371146e-02, 1.49546817e-01, -3.91026959e-02,\n", + " 5.94646595e-02, 1.29657034e-02, -3.72891256e-04, 5.56622408e-02,\n", + " 1.61776438e-01, 2.29037628e-02, -1.94774106e-01, -5.02247922e-02,\n", + " -5.45939505e-02, 5.31783216e-02, 1.26433298e-01, -1.23263724e-01,\n", + " 8.53074417e-02, -1.41412809e-01, -7.71067888e-02, 1.21865064e-01,\n", + " 4.73318882e-02, 7.20091909e-02, -9.83269960e-02, 1.99413914e-02,\n", + " -1.88907124e-02, -2.14710683e-02, -4.93260436e-02, 1.64937660e-01,\n", + " -1.07827298e-01, -7.75848776e-02, -6.23578345e-03, -1.05760902e-01,\n", + " -4.14819457e-02, 5.95730543e-02, 4.11023498e-02, -2.18305327e-02,\n", + " -2.30057724e-02, -3.34391668e-02, 1.30382255e-01, 5.10290638e-02,\n", + " -1.21569566e-01, -1.23630039e-01, -1.83883369e-01, 1.10945016e-01,\n", + " -1.05633408e-01, -8.24846700e-02, -3.76710802e-01, -4.50239740e-02],\n", + " dtype=float32)],\n", + " None)" + ] + }, + "metadata": {}, + "execution_count": 35 + } + ], + "metadata": {} + } + ], + 
"metadata": { + "orig_nbformat": 4, + "language_info": { + "name": "python", + "version": "3.8.5", + "mimetype": "text/x-python", + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "pygments_lexer": "ipython3", + "nbconvert_exporter": "python", + "file_extension": ".py" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3.8.5 64-bit" + }, + "interpreter": { + "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/vectorization/t2v.ipynb b/examples/vectorization/t2v.ipynb new file mode 100644 index 00000000..908ff182 --- /dev/null +++ b/examples/vectorization/t2v.ipynb @@ -0,0 +1,261 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# T2V\n", + "\n", + "## 概述\n", + "\n", + "使用自己提供的任一预训练模型(给出模型存放路径即可)将一组题目的切分序列表征为向量。\n", + "\n", + "- 优点:模型及其参数可自主调整,灵活性强。\n" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 导入功能块" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 12, + "source": [ + "from tqdm import tqdm\n", + "from EduNLP.SIF.segment import seg\n", + "from EduNLP.SIF.tokenization import tokenize\n", + "from EduNLP.Pretrain import GensimWordTokenizer\n", + "from EduNLP.Vector import T2V" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 输入\n", + "\n", + "类型:list \n", + "内容:一个题组中每个题目切分序列的组合。\n", + "> 这里需要调用 `GensimWordTokenizer` 将题目文本(`str` 类型)转换成 tokens。" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 11, + "source": [ + "print(type(token_items))\n", + "print(type(token_items[0]))" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "\n" + ] + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 6, + "source": [ + "token_items[0]" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['公式',\n", + " 
'[FORMULA]',\n", + " '公式',\n", + " '[FORMULA]',\n", + " '如图',\n", + " '[FIGURE]',\n", + " 'x',\n", + " ',',\n", + " 'y',\n", + " '约束条件',\n", + " '[SEP]',\n", + " 'z',\n", + " '=',\n", + " 'x',\n", + " '+',\n", + " '7',\n", + " 'y',\n", + " '最大值',\n", + " '[MARK]']" + ] + }, + "metadata": {}, + "execution_count": 6 + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 输出" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 4, + "source": [ + "path = \"../test_model/test_gensim_luna_stem_tf_d2v_256.bin\"\n", + "t2v = T2V('d2v',filepath=path)\n", + "t2v(token_items)" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[array([ 0.0256574 , 0.06061139, -0.00121044, -0.0167674 , -0.0111706 ,\n", + " 0.05325712, -0.02097339, -0.01613594, 0.02904145, 0.0185046 ,\n", + " 0.03473525, 0.00628165, 0.03696947, 0.00666153, -0.02352318,\n", + " -0.00458236, 0.02308686, -0.02153478, 0.01579256, -0.01575841,\n", + " -0.02654778, 0.01376328, 0.02539059, -0.01098955, 0.02203193,\n", + " -0.01503642, 0.01310026, -0.03569775, -0.00450978, 0.02522727,\n", + " -0.01547103, -0.00907244, -0.00072009, -0.0021727 , 0.02894731,\n", + " 0.01382611, 0.01647377, 0.00452782, -0.02488854, 0.02741116,\n", + " 0.0489724 , -0.04156181, -0.00855933, 0.01783935, 0.00704233,\n", + " 0.01296936, -0.06078439, -0.04922014, -0.0206639 , 0.00820663,\n", + " 0.02565274, 0.0164784 , 0.00996537, -0.02215545, 0.06741589,\n", + " 0.01634789, -0.0094168 , 0.00183323, 0.00853508, -0.0547929 ,\n", + " 0.00405556, 0.01386227, -0.04204945, 0.02175955, -0.01960315,\n", + " -0.05279269, -0.01511251, -0.02905018, -0.00405249, 0.03328003,\n", + " -0.00487469, -0.00338632, 0.01793213, 0.00942458, -0.02468935,\n", + " 0.03548338, -0.00907473, 0.00927462, -0.02545504, 0.02286367,\n", + " -0.01822809, 0.03625014, -0.00976438, -0.00188348, 0.06408882,\n", + " -0.04314236, -0.00193059, 0.02433112, -0.0091018 , 0.0276503 ,\n", + " 
-0.0036342 , -0.02485391, 0.02309245, 0.01880057, -0.00893952,\n", + " -0.03391525, 0.02678591, -0.00618519, -0.03601262, 0.0327184 ,\n", + " 0.09240578, 0.03631649, -0.00700663, -0.01786321, -0.02987848,\n", + " 0.00315695, -0.02082208, -0.00494443, -0.02717963, -0.00938541,\n", + " -0.0329605 , 0.0069218 , 0.01227082, 0.00856757, -0.0008222 ,\n", + " -0.0067637 , -0.01577486, 0.0628339 , -0.02329138, -0.00475964,\n", + " 0.02197625, 0.03022351, 0.00256966, -0.00247619, -0.01218352,\n", + " 0.01257284, 0.0051926 , -0.05297434, -0.0057066 , 0.01031242,\n", + " 0.02414824, -0.0115857 , 0.01625632, -0.03126714, -0.02389767,\n", + " -0.01417263, 0.02280749, -0.01431546, -0.00771551, 0.0264634 ,\n", + " 0.00115387, -0.01903204, -0.00100629, 0.00608774, 0.03787961,\n", + " 0.05098663, 0.03064756, -0.00654223, -0.01838502, -0.01889201,\n", + " 0.04686983, -0.02295219, -0.00901293, 0.00916024, -0.00013042,\n", + " 0.01236307, -0.00918534, 0.01792936, 0.00862702, -0.00018518,\n", + " -0.00566689, 0.00499178, 0.0246148 , -0.0170825 , 0.01850726,\n", + " 0.00031357, 0.02411471, 0.01080729, -0.01361136, -0.06226439,\n", + " 0.01830878, 0.01209503, -0.00980596, -0.01865078, 0.03692432,\n", + " -0.04503555, 0.0037965 , -0.04214804, -0.05657932, -0.01566005,\n", + " 0.00271924, -0.00026349, -0.00783886, 0.01218421, -0.03205092,\n", + " -0.02793218, -0.00298462, 0.00380523, 0.04471321, -0.02079478,\n", + " 0.0100926 , 0.00450996, -0.03412817, 0.03027697, 0.00872989,\n", + " 0.01512562, 0.01527565, 0.03683509, 0.05608684, 0.01055199,\n", + " 0.01637757, -0.01995301, -0.01610573, 0.04207385, 0.00058077,\n", + " 0.03846577, 0.04952911, -0.02142448, 0.0049874 , -0.00308159,\n", + " -0.02233348, 0.02013967, -0.01194606, -0.02481469, 0.01824989,\n", + " -0.00939436, -0.00374474, 0.02278485, 0.04107878, 0.01870474,\n", + " -0.00310527, -0.00257802, -0.03689042, -0.0200304 , -0.04838364,\n", + " 0.0035307 , 0.02496746, -0.0385387 , 0.01649689, 0.01429029,\n", + " 0.04338812, -0.05614391, 
-0.01632982, 0.03378268, 0.01393604,\n", + " -0.03859077, 0.01855484, 0.00241599, -0.00985778, 0.00530987,\n", + " 0.03700508, -0.06107654, -0.00972089, 0.02251891, 0.01154722,\n", + " 0.00913082, -0.0267815 , -0.01723521, 0.0136464 , 0.01965802,\n", + " 0.04769301, -0.02218902, -0.01268643, 0.00650465, 0.00985247,\n", + " 0.0029873 ], dtype=float32),\n", + " array([ 0.00877787, 0.03242666, -0.00026327, -0.01881958, -0.00730135,\n", + " 0.03559063, -0.01825701, -0.01065201, 0.01681685, 0.01074173,\n", + " 0.02253641, 0.0082016 , 0.02200216, 0.00088347, -0.0205142 ,\n", + " -0.01339685, 0.01239092, -0.01781665, 0.01000167, -0.01227449,\n", + " -0.03044926, 0.00296532, 0.01440197, -0.01035894, 0.01061506,\n", + " -0.00530907, 0.00484147, -0.02209524, 0.00735557, 0.01712263,\n", + " -0.00231011, -0.01255511, -0.00114341, -0.01413104, 0.02112199,\n", + " 0.01123461, 0.01380601, -0.00019924, -0.02128731, 0.01526375,\n", + " 0.02988552, -0.02491145, -0.00939747, 0.00798917, 0.0135474 ,\n", + " 0.01258122, -0.03753063, -0.04039029, -0.01517935, 0.00668549,\n", + " 0.02796665, 0.01242495, 0.0059546 , -0.01216253, 0.0372387 ,\n", + " 0.01762399, -0.00170241, 0.0003667 , 0.00895109, -0.03517802,\n", + " -0.00762667, 0.01357641, -0.02436312, 0.01829541, -0.01330634,\n", + " -0.02818829, -0.01139517, -0.01664645, 0.00769452, 0.01209339,\n", + " -0.00416979, -0.01296107, -0.0064631 , 0.0050506 , -0.01833598,\n", + " 0.02872021, -0.00062401, 0.0109796 , -0.01280711, 0.01152301,\n", + " -0.01085931, 0.02023655, 0.00272896, -0.00558658, 0.03704501,\n", + " -0.01837787, -0.00414707, 0.00713773, -0.01023714, 0.0090292 ,\n", + " 0.00089387, -0.01082103, 0.02051528, 0.01287969, -0.0074691 ,\n", + " -0.01942614, 0.01223695, -0.0136801 , -0.01567431, 0.01466064,\n", + " 0.04967042, 0.02889016, -0.005946 , -0.00131571, -0.0110809 ,\n", + " 0.00165396, -0.01279759, -0.01407798, -0.01902512, -0.01361593,\n", + " -0.00631681, -0.00142478, 0.01678663, 0.00815052, -0.00193329,\n", + " 
-0.00845464, -0.00746565, 0.03766166, -0.01099476, 0.00489809,\n", + " 0.01403449, 0.01477709, -0.00150515, 0.00462877, -0.01271886,\n", + " 0.00072193, 0.00815068, -0.04432011, -0.00604029, -0.00264471,\n", + " 0.01325564, -0.01315497, 0.00713541, -0.0137267 , -0.01845939,\n", + " -0.02801731, 0.01673851, -0.00593479, -0.01457028, 0.01636872,\n", + " -0.00751132, -0.01056858, 0.01126528, 0.01645665, 0.02689397,\n", + " 0.01920939, 0.01767929, -0.00843761, -0.01002457, -0.00844629,\n", + " 0.02888541, -0.00503441, -0.00025836, 0.01326172, -0.00968244,\n", + " 0.00430614, -0.00964946, 0.00635843, 0.00445558, -0.00235765,\n", + " 0.00160239, -0.00325711, 0.03206096, -0.00511734, 0.01108837,\n", + " 0.0014369 , 0.02616214, 0.01631057, -0.00778238, -0.04322761,\n", + " -0.00086197, 0.01174034, -0.00230315, -0.01354581, 0.01665967,\n", + " -0.02281472, -0.0123808 , -0.02901287, -0.04143119, -0.00477564,\n", + " 0.00608404, -0.00701787, -0.00686041, 0.01422733, -0.02854553,\n", + " -0.01464688, -0.00404892, 0.00348112, 0.02299088, -0.02302668,\n", + " 0.01208024, 0.01010513, -0.01571813, 0.01446694, -0.00129136,\n", + " -0.00054684, -0.00328883, 0.01649218, 0.03326375, -0.00185443,\n", + " 0.02091988, -0.00814938, -0.0088084 , 0.02302703, -0.01156406,\n", + " 0.04080933, 0.02902327, -0.01330268, -0.00385899, -0.00826302,\n", + " -0.02295679, 0.00658087, -0.0056047 , -0.01404469, 0.00368797,\n", + " -0.01484573, 0.00689151, 0.02035506, 0.02181732, 0.02151672,\n", + " 0.0004279 , -0.00763045, -0.01551796, -0.02054572, -0.03275407,\n", + " 0.00623783, 0.007831 , -0.02604559, 0.01956206, 0.0161521 ,\n", + " 0.02634443, -0.03285164, -0.01301691, 0.01066694, 0.01585914,\n", + " -0.0187955 , 0.01046878, -0.00189302, -0.01132144, -0.00140048,\n", + " 0.02645635, -0.04300842, -0.00639437, 0.01285532, -0.00437311,\n", + " 0.01163111, -0.015357 , -0.00531165, 0.01102756, 0.00182517,\n", + " 0.02303016, -0.00949884, -0.02009463, 0.00573564, 0.00076009,\n", + " 0.00078505], 
dtype=float32)]" + ] + }, + "metadata": {}, + "execution_count": 4 + } + ], + "metadata": {} + } + ], + "metadata": { + "orig_nbformat": 4, + "language_info": { + "name": "python", + "version": "3.8.5", + "mimetype": "text/x-python", + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "pygments_lexer": "ipython3", + "nbconvert_exporter": "python", + "file_extension": ".py" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3.8.5 64-bit" + }, + "interpreter": { + "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file