diff --git a/asset/_static/data.png b/asset/_static/data.png new file mode 100644 index 00000000..b6c9daa1 Binary files /dev/null and b/asset/_static/data.png differ diff --git a/asset/_static/formula.png b/asset/_static/formula.png index 10fecbd3..3cabf913 100644 Binary files a/asset/_static/formula.png and b/asset/_static/formula.png differ diff --git a/asset/_static/i2v.png b/asset/_static/i2v.png new file mode 100644 index 00000000..3da11cd0 Binary files /dev/null and b/asset/_static/i2v.png differ diff --git a/asset/_static/parse.png b/asset/_static/parse.png new file mode 100644 index 00000000..fd345f20 Binary files /dev/null and b/asset/_static/parse.png differ diff --git a/asset/_static/prepare_dataset.jpg b/asset/_static/prepare_dataset.jpg new file mode 100644 index 00000000..e82d5c42 Binary files /dev/null and b/asset/_static/prepare_dataset.jpg differ diff --git a/asset/_static/seg.png b/asset/_static/seg.png new file mode 100644 index 00000000..a04de8bc Binary files /dev/null and b/asset/_static/seg.png differ diff --git a/asset/_static/sif_addition.png b/asset/_static/sif_addition.png new file mode 100644 index 00000000..db7ccfdc Binary files /dev/null and b/asset/_static/sif_addition.png differ diff --git a/asset/_static/tokenizer.png b/asset/_static/tokenizer.png new file mode 100644 index 00000000..f074449c Binary files /dev/null and b/asset/_static/tokenizer.png differ diff --git a/docs/source/conf.py b/docs/source/conf.py index 4f1bc27d..d0d6f9a2 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -57,12 +57,15 @@ def copy_tree(src, tar): nbsphinx_thumbnails = { 'build/blitz/sif/sif': '_static/item_figure.png', - 'build/blitz/utils/data.ipynb': '_static/data.png', - 'build/blitz/formula/formula.ipynb': '_static/item_formula.png', - 'build/blitz/sif/sif_addition.ipynb': '_static/sif_addition.png', + 'build/blitz/sif/sif_addition': '_static/sif_addition.png', + 'build/blitz/utils/data': '_static/data.png', + 'build/blitz/formula/formula': 
'_static/formula.png', + 'build/blitz/seg/seg': '_static/seg.png', + 'build/blitz/parse/parse': '_static/parse.png', + 'build/blitz/formula/formula': '_static/formula.png', 'build/blitz/tokenizer/tokenizer': '_static/tokenizer.png', - 'build/blitz/pretrain/prepare_dataset.ipynb': '_static/item_figure.png', - 'build/blitz/pretrain/d2v.ipynb': '_static/item_figure.png', + 'build/blitz/pretrain/prepare_dataset': '_static/prepare_dataset.jpg', + 'build/blitz/vectorization/i2v': '_static/i2v.png', } # Add any paths that contain templates here, relative to this directory. diff --git a/docs/source/tutorial/zh/parse.rst b/docs/source/tutorial/zh/parse.rst index 6c91049e..e73f0075 100644 --- a/docs/source/tutorial/zh/parse.rst +++ b/docs/source/tutorial/zh/parse.rst @@ -24,46 +24,12 @@ 4.匹配latex公式,主要检查latex公式的完整性和可解析性,对latex 中出现中文字符发出警告 -文本语法结构解析 +学习路线图 -------------------- - -将文本中的字母、数字等进行提取,将其转换为标准格式。 - -Examples: -:: - - >>> text = '生产某种零件的A工厂25名工人的日加工零件数_ _' - >>> text_parser = Parser(text) - >>> text_parser.description_list() - >>> text_parser.text - >>> '生产某种零件的$A$工厂$25$名工人的日加工零件数$\\SIFBlank$' - .. toctree:: - :maxdepth: 1 :titlesonly: - parse <../../build/blitz/parse/parse.ipynb> - - -公式语法结构解析 --------------------- - -可以检查公式是否合法。 - -Examples: -:: - - >>> text = '支持公式如$\\frac{y}{x}$,$\\SIFBlank$,$\\FigureID{1}$,不支持公式如$\\frac{ \\dddot y}{x}$' - >>> text_parser = Parser(text) - >>> text_parser.description_list() - >>> text_parser.fomula_illegal_flag - >>> 1 - - -.. 
toctree:: - :maxdepth: 1 - :titlesonly: + 文本语法结构解析 + 公式语法结构解析 - tree <../../build/blitz/formula/tree.ipynb> - formula <../../build/blitz/formula/formula.ipynb> diff --git "a/docs/source/tutorial/zh/parse/\345\205\254\345\274\217\350\257\255\346\263\225\347\273\223\346\236\204\350\247\243\346\236\220.rst" "b/docs/source/tutorial/zh/parse/\345\205\254\345\274\217\350\257\255\346\263\225\347\273\223\346\236\204\350\247\243\346\236\220.rst" new file mode 100644 index 00000000..1a7717fb --- /dev/null +++ "b/docs/source/tutorial/zh/parse/\345\205\254\345\274\217\350\257\255\346\263\225\347\273\223\346\236\204\350\247\243\346\236\220.rst" @@ -0,0 +1,61 @@ +公式语法结构解析 +-------------------- + +本功能主要由EduNLP.Formula模块实现,具有检查传入的公式是否合法,并将合法的公式转换为ast树的形式。从实际使用的角度,本模块常作为中间处理过程,调用相应的模型即可自动选择本模块的相关参数,故一般不需要特别关注。 + +主要内容介绍 ++++++++++++++++ + +1.Formula:对传入的单个公式进行判断,判断传入的公式是否为str形式,如果是则使用ast的方法进行处理,否则进行报错。此外,提供了variable_standardization参数,当此参数为True时,使用变量标准化方法,即同一变量拥有相同的变量编号。 + +2.FormulaGroup:如果需要传入公式集则可调用此接口,最终将形成ast森林,森林中树的结构同Formula。 + + +Examples: + +:: + + >>> text = '支持公式如$\\frac{y}{x}$,$\\SIFBlank$,$\\FigureID{1}$,不支持公式如$\\frac{ \\dddot y}{x}$' + >>> text_parser = Parser(text) + >>> text_parser.description_list() + >>> text_parser.fomula_illegal_flag + >>> 1 + +:: + + >>> f = Formula("x") + >>> f + <Formula: x> + >>> f.ast + [{'val': {'id': 0, 'type': 'mathord', 'text': 'x', 'role': None}, 'structure': {'bro': [None, None], 'child': None, 'father': None, 'forest': None}}] + >>> f.elements + [{'id': 0, 'type': 'mathord', 'text': 'x', 'role': None}] + >>> f.variable_standardization(inplace=True) + <Formula: x> + >>> f.elements + [{'id': 0, 'type': 'mathord', 'text': 'x', 'role': None, 'var': 0}] + +:: + + >>> fg = FormulaGroup(["x + y", "y + x", "z + x"]) + >>> fg + <FormulaGroup: <Formula: x + y>;<Formula: y + x>;<Formula: z + x>> + >>> fg = FormulaGroup(["x + y", Formula("y + x"), "z + x"]) + >>> fg + <FormulaGroup: <Formula: x + y>;<Formula: y + x>;<Formula: z + x>> + >>> fg = FormulaGroup(["x", Formula("y"), "x"]) + >>> fg.elements + [{'id': 0, 'type': 'mathord', 'text': 'x', 'role': None}, {'id': 1, 'type': 'mathord', 'text':
'y', 'role': None},\ + {'id': 2, 'type': 'mathord', 'text': 'x', 'role': None}] + >>> fg = FormulaGroup(["x", Formula("y"), "x"], variable_standardization=True) + >>> fg.elements + [{'id': 0, 'type': 'mathord', 'text': 'x', 'role': None, 'var': 0}, {'id': 1, 'type': 'mathord', 'text': 'y', 'role': None, 'var': 1}, {'id': 2, 'type': 'mathord', 'text': 'x', 'role': None, 'var': 0}] + +详细示范 ++++++++++++++++ + +.. toctree:: + :titlesonly: + + 树型处理效果 <../../../build/blitz/formula/tree.ipynb> + 公式解析效果案例 <../../../build/blitz/formula/formula.ipynb> diff --git "a/docs/source/tutorial/zh/parse/\346\226\207\346\234\254\350\257\255\346\263\225\347\273\223\346\236\204\350\247\243\346\236\220.rst" "b/docs/source/tutorial/zh/parse/\346\226\207\346\234\254\350\257\255\346\263\225\347\273\223\346\236\204\350\247\243\346\236\220.rst" new file mode 100644 index 00000000..f2f442a0 --- /dev/null +++ "b/docs/source/tutorial/zh/parse/\346\226\207\346\234\254\350\257\255\346\263\225\347\273\223\346\236\204\350\247\243\346\236\220.rst" @@ -0,0 +1,39 @@ +文本语法结构解析 +-------------------- + +本部分主要由EduNLP.SIF.Parse模块实现,主要功能为将文本中的字母、数字等进行提取,将其转换为标准格式。 + +主要流程介绍 ++++++++++++++++ + +1.按照以下顺序,先后对传入的文本进行判断类型 + +* is_chinese:用于匹配中文字符 [\u4e00-\u9fa5] + +* is_alphabet:匹配公式之外的英文字母,将匹配到的只对两个汉字之间的字母做修正(使用$$包裹起来),其余匹配到的情况视为不合 latex 语法录入的公式 + +* is_number:匹配公式之外的数字,只对两个汉字之间的数字做修正(使用$$包裹起来),其余匹配到的情况视为不合 latex 语法录入的公式 + +2.匹配 latex 公式 + +* latex 中出现中文字符,打印且只打印一次 warning + +* 使用_is_formula_legal函数,检查latex公式的完整性和可解析性,对于不合法公式报错 + +Examples: + +:: + + >>> text = '生产某种零件的A工厂25名工人的日加工零件数_ _' + >>> text_parser = Parser(text) + >>> text_parser.description_list() + >>> text_parser.text + >>> '生产某种零件的$A$工厂$25$名工人的日加工零件数$\\SIFBlank$' + +详细示范 ++++++++++++++++ + +.. 
toctree:: + :titlesonly: + + 文本语法结构解析的案例 <../../../build/blitz/parse/parse.ipynb> diff --git a/docs/source/tutorial/zh/pretrain.rst b/docs/source/tutorial/zh/pretrain.rst index a9aa9594..bcc737ce 100644 --- a/docs/source/tutorial/zh/pretrain.rst +++ b/docs/source/tutorial/zh/pretrain.rst @@ -75,7 +75,7 @@ Examples: :: 全量版本-全学科的D2V模型路径: - `/share/qlh/d2v_model/luna_private/luna_private_all_gensim_luna_stem_general_d2v_256.bin` + `/share/qlh/d2v_model/luna_pub/luna_pub_all_gensim_luna_stem_general_d2v_256.bin` (备注:一个D2V模型含4个bin后缀的文件) 模型训练数据说明 diff --git a/docs/source/tutorial/zh/seg.rst b/docs/source/tutorial/zh/seg.rst index 35cc1882..8dc91196 100644 --- a/docs/source/tutorial/zh/seg.rst +++ b/docs/source/tutorial/zh/seg.rst @@ -7,55 +7,19 @@ * 语义成分分解 * 结构成分分解 -语义成分分解 ------------- +主要处理内容 +-------------------- -特别的,由于选择题是以字典的形式给出,故需要进行特殊处理,这里可以调用./Utils/data中的dict2str4sif函数,将选择题形式的item转换为字符格式,并将题干和选项、各选项之间分割开来。 +1.将字典输入形式的选择题通过语义成分分解转换为符合条件的item; -Examples: -:: +2.将输入的item按照元素类型进行切分、分组。 - >>> item = { +学习路线图 +-------------------- - ... "stem": r"若复数$z=1+2 i+i^{3}$,则$|z|=$", - - ... "options": ['0', '1', r'$\sqrt{2}$', '2'], - - ... } - - >>> item - - {'stem': '若复数$z=1+2 i+i^{3}$,则$|z|=$', 'options': ['0', '1', '$\\sqrt{2}$', '2']} - - >>> dict2str4sif(item, key_as_tag=False) - - '若复数$z=1+2 i+i^{3}$,则$|z|=$0$\\SIFSep$1$\\SIFSep$$\\sqrt{2}$$\\SIFSep$2' - .. toctree:: - :maxdepth: 1 :titlesonly: - dict2str4sif <../../build/blitz/utils/data.ipynb> - - -结构成分分解 ------------- - -对切片后的item中的各个元素进行分词,提供深度选项,可以按照需求选择所有地方切分或者在部分标签处切分(比如\SIFSep、\SIFTag处);对标签添加的位置也可以进行选择,可以在头尾处添加或仅在头或尾处添加。 - -具有两种模式,一种是linear模式,用于对文本进行处理(使用jieba库进行分词);一种是ast模式,用于对公式进行解析。 - -Examples: -:: - - >>> test_item = r"如图所示,则$\bigtriangleup ABC$的面积是$\SIFBlank$。$\FigureID{1}$" - >>> seg(test_item) - >>> ['如图所示,则', '\\bigtriangleup ABC', '的面积是', '\\SIFBlank', '。', \FigureID{1}] - >>> seg(test_item, symbol="fgm") - >>> ['如图所示,则', '[FORMULA]', '的面积是', '[MARK]', '。', '[FIGURE]'] - -.. 
toctree:: - :maxdepth: 1 - :titlesonly: + 语义成分分解 + 结构成分分解 - seg <../../build/blitz/seg/seg.ipynb> diff --git "a/docs/source/tutorial/zh/seg/\347\273\223\346\236\204\346\210\220\345\210\206\345\210\206\350\247\243.rst" "b/docs/source/tutorial/zh/seg/\347\273\223\346\236\204\346\210\220\345\210\206\345\210\206\350\247\243.rst" new file mode 100644 index 00000000..13ae96ca --- /dev/null +++ "b/docs/source/tutorial/zh/seg/\347\273\223\346\236\204\346\210\220\345\210\206\345\210\206\350\247\243.rst" @@ -0,0 +1,53 @@ +结构成分分解 +------------ + +对切片后的item中的各个元素进行分词,提供深度选项,可以按照需求选择所有地方切分或者在部分标签处切分(比如\SIFSep、\SIFTag处);对标签添加的位置也可以进行选择,可以在头尾处添加或仅在头或尾处添加。 + +具有两种模式,一种是linear模式,用于对文本进行处理(使用jieba库进行分词);一种是ast模式,用于对公式进行解析。 + +基础使用方法 +++++++++++++++++++ + +:: + + >>> test_item = r"如图所示,则$\bigtriangleup ABC$的面积是$\SIFBlank$。$\FigureID{1}$" + >>> seg(test_item) + >>> ['如图所示,则', '\\bigtriangleup ABC', '的面积是', '\\SIFBlank', '。', \FigureID{1}] + +可选的额外参数/接口 +++++++++++++++++++++++ + +1.describe:可以统计出各种类型元素的数量 + +:: + + >>> s.describe() + {'t': 3, 'f': 1, 'g': 1, 'm': 1} + +2.filter:可以选择性的筛除某种或几种类型的元素 + +:: + + >>> with s.filter("f"): + ... s + ['如图所示,则', '的面积是', '\\SIFBlank', '。', \FigureID{1}] + >>> with s.filter(keep="t"): + ... s + ['如图所示,则', '的面积是', '。'] + +3.symbol:选择性的将部分类型的数据转换为特殊符号遮掩起来 + +:: + + >>> seg(test_item, symbol="fgm") + ['如图所示,则', '[FORMULA]', '的面积是', '[MARK]', '。', '[FIGURE]'] + >>> seg(test_item, symbol="tfgm") + ['[TEXT]', '[FORMULA]', '[TEXT]', '[MARK]', '[TEXT]', '[FIGURE]'] + +详细示范 +++++++++++ + +..
toctree:: + :titlesonly: + + 结构成分分解的案例 <../../../build/blitz/seg/seg.ipynb> diff --git "a/docs/source/tutorial/zh/seg/\350\257\255\344\271\211\346\210\220\345\210\206\345\210\206\350\247\243.rst" "b/docs/source/tutorial/zh/seg/\350\257\255\344\271\211\346\210\220\345\210\206\345\210\206\350\247\243.rst" new file mode 100644 index 00000000..0950dd87 --- /dev/null +++ "b/docs/source/tutorial/zh/seg/\350\257\255\344\271\211\346\210\220\345\210\206\345\210\206\350\247\243.rst" @@ -0,0 +1,55 @@ +语义成分分解 +------------ + +由于选择题是以字典的形式给出,故需要将其在保留数据类型关系的情况下转换为文本格式。dict2str4sif函数就是实现此功能的一个模块,该模块可以将选择题形式的item转换为字符格式,并将题干和选项、各选项之间分割开来。 + + +基础使用方法 +++++++++++++++++++ + +:: + + >>> item = { + ... "stem": r"若复数$z=1+2 i+i^{3}$,则$|z|=$", + ... "options": ['0', '1', r'$\sqrt{2}$', '2'], + ... } + >>> dict2str4sif(item) # doctest: +ELLIPSIS + '$\\SIFTag{stem_begin}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem_end}$$\\SIFTag{options_begin}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$$\\SIFTag{list_3}$2$\\SIFTag{options_end}$' + +可选的额外参数/接口 +++++++++++++++++++++++ + +1.add_list_no_tag:当此参数为True较False时区别在于是否需要将选项部分的标签计数 + +:: + + >>> dict2str4sif(item, add_list_no_tag=True) # doctest: +ELLIPSIS + '$\\SIFTag{stem_begin}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem_end}$$\\SIFTag{options_begin}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$$\\SIFTag{list_3}$2$\\SIFTag{options_end}$' + + >>> dict2str4sif(item, add_list_no_tag=False) # doctest: +ELLIPSIS + '$\\SIFTag{stem_begin}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem_end}$$\\SIFTag{options_begin}$0$\\SIFSep$1$\\SIFSep$$\\sqrt{2}$$\\SIFSep$2$\\SIFTag{options_end}$' + +2.tag_mode:此参数为选择标签所在位置,delimiter为头尾都加标签,head为仅头部加标签,tail为仅尾部加标签 + +:: + + >>> dict2str4sif(item, tag_mode="head") # doctest: +ELLIPSIS + '$\\SIFTag{stem}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{options}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$$\\SIFTag{list_3}$2' + + >>> dict2str4sif(item, tag_mode="tail") #
doctest: +ELLIPSIS + '若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$$\\SIFTag{list_3}$2$\\SIFTag{options}$' + +3.key_as_tag:当其为False时则不区分切分标签的类型,而是仅在选项之间加入$\SIFSep$ + +:: + + >>> dict2str4sif(item, key_as_tag=False) + '若复数$z=1+2 i+i^{3}$,则$|z|=$0$\\SIFSep$1$\\SIFSep$$\\sqrt{2}$$\\SIFSep$2' + +详细示范 +++++++++++++++++++++++ + +.. toctree:: + :titlesonly: + + 语义成分分解的案例 <../../../build/blitz/utils/data.ipynb> diff --git a/examples/formula/formula.ipynb b/examples/formula/formula.ipynb index 47b4fdcd..f748a90a 100644 --- a/examples/formula/formula.ipynb +++ b/examples/formula/formula.ipynb @@ -7,7 +7,7 @@ "\n", "## 概述\n", "\n", - "Formula 首先在分词功能中(SIF/segment ==> SIF/tokenization)中对原始文本的公式做切分处理,之后在 Formula/ast 中提供 [公式解析树] 功能,可以将数学公式的抽象语法分析树用文本或图片的形式表示出来。 \n", + "Formula 首先在分词功能中对原始文本的公式做切分处理,另外提供 [公式解析树] 功能,可以将数学公式的抽象语法分析树用文本或图片的形式表示出来。 \n", "\n", "本模块另提供公式变量标准化的功能,如判断几个子公式内的‘x’为同一变量。" ], @@ -508,4 +508,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file diff --git a/examples/formula/tree.ipynb b/examples/formula/tree.ipynb index 87327c30..ce8bb972 100644 --- a/examples/formula/tree.ipynb +++ b/examples/formula/tree.ipynb @@ -1,24 +1,17 @@ { "cells": [ - { - "cell_type": "markdown", - "source": [ - "# tree" - ], - "metadata": {} - }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 1, "source": [ - "import networkx as nx\r\n", - "\r\n", - "g = nx.DiGraph()\r\n", - "g.add_node(0, value=1, id=0)\r\n", - "g.add_node(1, value=2, id=1)\r\n", - "g.add_node(2, id=2)\r\n", - "g.add_edge(0, 1)\r\n", - "g.add_edge(0, 2)\r\n", + "import networkx as nx\n", + "\n", + "g = nx.DiGraph()\n", + "g.add_node(0, value=1, id=0)\n", + "g.add_node(1, value=2, id=1)\n", + "g.add_node(2, id=2)\n", + "g.add_edge(0, 1)\n", + "g.add_edge(0, 2)\n", "g.nodes[0]" ], "outputs": [ @@ -30,7 +23,7 @@ ] }, "metadata": {}, - "execution_count": 9 + "execution_count": 1 } ], "metadata": { @@ -40,23 
+33,25 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" + "name": "python3", + "display_name": "Python 3.8.5 64-bit" }, "language_info": { + "name": "python", + "version": "3.8.5", + "mimetype": "text/x-python", "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", + "pygments_lexer": "ipython3", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" + "file_extension": ".py" + }, + "interpreter": { + "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a" } }, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file diff --git a/examples/parse/parse.ipynb b/examples/parse/parse.ipynb index dc60aaaa..94272b65 100644 --- a/examples/parse/parse.ipynb +++ b/examples/parse/parse.ipynb @@ -56,7 +56,7 @@ { "cell_type": "markdown", "source": [ - "### to_sif" + "### 尝试转换为标准形式" ], "metadata": {} }, @@ -88,7 +88,7 @@ { "cell_type": "markdown", "source": [ - "### is_sif" + "### 判断是否有语法问题" ], "metadata": {} }, @@ -142,4 +142,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} \ No newline at end of file +} diff --git a/examples/test_model/test_gensim_luna_stem_tf_d2v_256.bin b/examples/test_model/test_gensim_luna_stem_tf_d2v_256.bin new file mode 100644 index 00000000..7a56bca4 Binary files /dev/null and b/examples/test_model/test_gensim_luna_stem_tf_d2v_256.bin differ diff --git a/examples/tokenizer/test_stopwords.txt b/examples/tokenizer/test_stopwords.txt new file mode 100644 index 00000000..8183ecf4 --- /dev/null +++ b/examples/tokenizer/test_stopwords.txt @@ -0,0 +1,9 @@ +一旦 +一时 +一来 +一样 +一次 +一片 +一番 +一直 +一致 \ No newline at end of file diff --git a/examples/tokenizer/tokenizer.ipynb b/examples/tokenizer/tokenizer.ipynb index 6a683158..4819b00d 100644 --- a/examples/tokenizer/tokenizer.ipynb +++ b/examples/tokenizer/tokenizer.ipynb @@ -1,357 +1,501 @@ -{ - "cells": 
[ - { - "cell_type": "markdown", - "source": [ - "# Tokenizer\n", - "\n", - "## 概述\n", - "\n", - "为了方便后续向量化表征试题,本模块提供题目文本的令牌化解析(Tokenization),即将题目转换成令牌序列。 \n", - "\n", - "根据构成题目的元素类型,解析功能分为 **“文本解析”** 和 **“公式解析”** 两部分。\n", - "\n", - "### 文本解析\n", - "\n", - "根据题目文本切分粒度的大小,文本解析又分为 **“句解析”** 和 **“词解析”**。\n", - "\n", - "1. 句解析(sentence-tokenization):将较长的文档切分成若干句子的过程称为“分句”。每个句子为一个“令牌”(token)。(待实现) \n", - " \n", - "\n", - "2. 词解析(text-tokenization):一个句子(不含公式)是由若干“词”按顺序构成的,将一个句子切分为若干词的过程称为“词解析”。根据词的粒度大小,又可细分为“词组解析”和\"单字解析\"。\n", - " - 词组解析 (word-tokenization):每一个词组为一个“令牌”(token)。\n", - " - 单字解析 (char-tokenization):单个字符即为一个“令牌”(token)。\n", - "\n", - "### 公式解析\n", - "\n", - "公式解析(formula-tokenization):理科类文本中常常含有公式。将一个符合 latex 语法的公式切分为标记字符列表的过程称为“公式解析”。每个标记字符为一个“令牌”(token)。 \n", - " " - ], - "metadata": {} - }, - { - "cell_type": "markdown", - "source": [ - "## 词解析\n", - "\n", - "词解析分为两个主要步骤:\n", - "1. 分词: \n", - " - 词组解析:使用分词工具切分并提取题目文本中的词。 \n", - " 本项目目前支持的分词工具有:`jieba` \n", - " - 单字解析:按字符划分。\n", - " \n", - " \n", - "2. 
筛选:过滤指定的停用词。 \n", - " 本项目默认使用的停用词表:[stopwords](https://github.com/bigdata-ustc/EduNLP/blob/master/EduNLP/meta_data/sif_stopwords.txt) \n", - " 你也可以使用自己的停用词表,具体使用方法见下面的示例。\n" - ], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 1, - "source": [ - "# 导入模块\n", - "from EduNLP.SIF.tokenization.text import tokenize " - ], - "outputs": [], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 5, - "source": [ - "# 输入\n", - "text = \"三角函数是基本初等函数之一\"" - ], - "outputs": [], - "metadata": {} - }, - { - "cell_type": "markdown", - "source": [ - "### 词组解析\n", - "\n", - "分词粒度参数选择 word: `granularity = \"word\"` " - ], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 10, - "source": [ - "# 输出:默认使用 EduNLP 项目提供的停用词表\n", - "tokenize(text, granularity=\"word\")" - ], - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "['三角函数', '初等', '函数']" - ] - }, - "metadata": {}, - "execution_count": 10 - } - ], - "metadata": {} - }, - { - "cell_type": "markdown", - "source": [ - "### 单字解析\n", - "\n", - "分词粒度参数选择 word: `granularity = \"char\"` " - ], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 14, - "source": [ - "# 输出:默认使用 EduNLP 项目提供的停用词表\n", - "tokenize(text, granularity=\"char\")" - ], - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "['三', '角', '函', '数', '基', '初', '函', '数']" - ] - }, - "metadata": {}, - "execution_count": 14 - } - ], - "metadata": {} - }, - { - "cell_type": "markdown", - "source": [ - "## 停用词表" - ], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 15, - "source": [ - "# 获取自己的停用词表\n", - "spath = \"test_stopwords.txt\"\n", - "from EduNLP.SIF.tokenization.text.stopwords import get_stopwords\n", - "stopwords = get_stopwords(spath)\n", - "stopwords" - ], - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "{'一旦', '一时', '一来', '一样', '一次', '一片', '一番', '一直', '一致'}" - ] - }, - 
"metadata": {}, - "execution_count": 15 - } - ], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 8, - "source": [ - "# 输出:传入停用词表(stopwords)\n", - "tokenize(text,granularity=\"word\",stopwords=stopwords)" - ], - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "['三角函数', '是', '基本', '初等', '函数', '之一']" - ] - }, - "metadata": {}, - "execution_count": 8 - } - ], - "metadata": {} - }, - { - "cell_type": "markdown", - "source": [ - "## 公式解析\n", - "切分出 latex 公式的每个标记符号。针对本模块更加详细的解释参见 [formula](../formula/formula.ipynb)" - ], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 35, - "source": [ - "# 导入模块\n", - "from EduNLP.SIF.tokenization.formula import tokenize\n", - "\n", - "# 输入\n", - "formula = \"\\\\frac{\\\\pi}{x + y} + 1 = x\"\n", - "\n", - "# 输出\n", - "\n", - "# 输出形式选择普通序列(linear)\n", - "print('linear: ',tokenize(formula,method=\"linear\"))\n", - "\n", - "# 输出形式选择抽象语法分析树(ast)\n", - "print('ast : ',tokenize(formula,method=\"ast\",return_type = \"list\", ord2token=False))\n", - "\n", - "# 输出形式选择抽象语法分析树(ast)且将公式变量名转换成 token \n", - "print('ast & ord2token: ',tokenize(formula,method=\"ast\",return_type = \"list\", ord2token=True))\n", - "\n", - "# 输出形式选择抽象语法分析树(ast)且将公式变量名转换成带编号的 token\n", - "print('ast & ord2token & var_numbering: ',tokenize(formula,method=\"ast\",return_type = \"list\", ord2token=True, var_numbering=True))\n", - "\n" - ], - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "linear: ['\\\\frac', '{', '\\\\pi', '}', '{', 'x', '+', 'y', '}', '+', '1', '=', 'x']\n", - "ast : ['\\\\pi', '{ }', 'x', '+', 'y', '{ }', '\\\\frac', '+', '1', '=', 'x']\n", - "ast & ord2token: ['mathord', '{ }', 'mathord', '+', 'mathord', '{ }', '\\\\frac', '+', 'textord', '=', 'mathord']\n", - "ast & ord2token & var_numbering: ['mathord_con', '{ }', 'mathord_0', '+', 'mathord_1', '{ }', '\\\\frac', '+', 'textord', '=', 'mathord_0']\n" - ] - } - ], - "metadata": {} - }, - { - 
"cell_type": "markdown", - "source": [ - "## 综合解析\n", - "\n", - "标记解析 + 公式解析。特殊符号将转换成常量,例如:\n", - "```python\n", - "FIGURE_SYMBOL = \"[FIGURE]\" # $\\SIFChoice$\n", - "QUES_MARK_SYMBOL = \"[MARK]\" # $\\FigureID{1}$\n", - "```\n" - ], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 39, - "source": [ - "# 导入模块\n", - "from EduNLP.Tokenizer import get_tokenizer\n", - "\n", - "# 输入\n", - "item = {\n", - " \"如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\SIFChoice$$\\FigureID{1}$\"\n", - "}\n", - "\n", - "# 输出\n", - "tokenizer = get_tokenizer(\"text\")\n", - "tokens = tokenizer(item)\n", - "next(tokens) " - ], - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "['如图',\n", - " '古希腊',\n", - " '数学家',\n", - " '希波',\n", - " '克拉底',\n", - " '研究',\n", - " '几何图形',\n", - " '此图',\n", - " '三个',\n", - " '半圆',\n", - " '三个',\n", - " '半圆',\n", - " '直径',\n", - " '直角三角形',\n", - " 'ABC',\n", - " '斜边',\n", - " 'BC',\n", - " '直角',\n", - " 'AB',\n", - " 'AC',\n", - " '\\x08',\n", - " 'igtriangleupABC',\n", - " '三边',\n", - " '围成',\n", - " '区域',\n", - " '记',\n", - " 'I',\n", - " '黑色',\n", - " '记',\n", - " 'II',\n", - " '其余部分',\n", - " '记',\n", - " 'III',\n", - " '图形',\n", - " '中',\n", - " '随机',\n", - " '取',\n", - " '一点',\n", - " '此点',\n", - " '取自',\n", - " 'I',\n", - " ',',\n", - " 'II',\n", - " ',',\n", - " 'III',\n", - " '概率',\n", - " '记',\n", - " 'p',\n", - " '_',\n", - " '1',\n", - " ',',\n", - " 'p',\n", - " '_',\n", - " '2',\n", - " ',',\n", - " 'p',\n", - " '_',\n", - " '3',\n", - " '[MARK]',\n", - " '[FIGURE]']" - ] - }, - "metadata": {}, - "execution_count": 39 - } - ], - "metadata": {} - } - ], - "metadata": { - "orig_nbformat": 4, - "language_info": { - "name": "python", - "version": "3.8.5", - "mimetype": "text/x-python", - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - 
"pygments_lexer": "ipython3", - "nbconvert_exporter": "python", - "file_extension": ".py" - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3.8.5 64-bit" - }, - "interpreter": { - "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a" - } - }, - "nbformat": 4, - "nbformat_minor": 2 +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Tokenizer\n", + "\n", + "## 概述\n", + "\n", + "为了方便后续向量化表征试题,本模块提供题目文本的令牌化解析(Tokenization),即将题目转换成令牌序列。 \n", + "\n", + "根据构成题目的元素类型,解析功能分为 **“文本解析”** 和 **“公式解析”** 两部分。\n", + "\n", + "### 文本解析\n", + "\n", + "根据题目文本切分粒度的大小,文本解析又分为 **“句解析”** 和 **“词解析”**。\n", + "\n", + "(1) 句解析(sentence-tokenization):将较长的文档切分成若干句子的过程称为“分句”。每个句子为一个“令牌”(token)。(待实现) \n", + " \n", + "\n", + "(2) 词解析(text-tokenization):一个句子(不含公式)是由若干“词”按顺序构成的,将一个句子切分为若干词的过程称为“词解析”。根据词的粒度大小,又可细分为“词组解析”和\"单字解析\"。\n", + "- 词组解析 (word-tokenization):每一个词组为一个“令牌”(token)。\n", + "- 单字解析 (char-tokenization):单个字符即为一个“令牌”(token)。\n", + "\n", + "### 公式解析\n", + "\n", + "公式解析(formula-tokenization):理科类文本中常常含有公式。将一个符合 latex 语法的公式切分为标记字符列表的过程称为“公式解析”。每个标记字符为一个“令牌”(token)。 \n", + " " + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 文本解析" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "### 句解析\n", + "\n", + "待实现..." 
+ ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "### 词解析\n", + "\n", + "词解析分为两个主要步骤: \n", + "\n", + "(1) 分词: \n", + "- 词组解析:使用分词工具切分并提取题目文本中的词。 \n", + " 本项目目前支持的分词工具有:`jieba` \n", + "- 单字解析:按字符划分。\n", + " \n", + " \n", + "(2) 筛选:过滤指定的停用词。 \n", + "- 本项目默认使用的停用词表:[stopwords](https://github.com/bigdata-ustc/EduNLP/blob/master/EduNLP/meta_data/sif_stopwords.txt) \n", + "- 你也可以使用自己的停用词表,具体使用方法见下面的示例。\n" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 1, + "source": [ + "# 导入模块\n", + "from EduNLP.SIF.tokenization.text import tokenize " + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 2, + "source": [ + "# 输入\n", + "text = \"三角函数是基本初等函数之一\"" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "#### 词组解析\n", + "\n", + "分词粒度参数选择 word: `granularity = \"word\"` " + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 3, + "source": [ + "# 输出:默认使用 EduNLP 项目提供的停用词表\n", + "tokenize(text, granularity=\"word\")" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['三角函数', '初等', '函数']" + ] + }, + "metadata": {}, + "execution_count": 3 + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "#### 单字解析\n", + "\n", + "分词粒度参数选择 char: `granularity = \"char\"` " + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 4, + "source": [ + "# 输出:默认使用 EduNLP 项目提供的停用词表\n", + "tokenize(text, granularity=\"char\")" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['三', '角', '函', '数', '基', '初', '函', '数']" + ] + }, + "metadata": {}, + "execution_count": 4 + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "#### 停用词表" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 5, + "source": [ + "# 获取自己的停用词表\n", + "spath = \"test_stopwords.txt\"\n", + "from
EduNLP.SIF.tokenization.text.stopwords import get_stopwords\n", + "stopwords = get_stopwords(spath)\n", + "stopwords" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "{'一旦', '一时', '一来', '一样', '一次', '一片', '一番', '一直', '一致'}" + ] + }, + "metadata": {}, + "execution_count": 5 + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 6, + "source": [ + "# 输出:传入停用词表(stopwords)\n", + "tokenize(text,granularity=\"word\",stopwords=stopwords)" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['三角函数', '是', '基本', '初等', '函数', '之一']" + ] + }, + "metadata": {}, + "execution_count": 6 + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 公式解析\n", + "切分出 latex 公式的每个标记符号。针对本模块更加详细的解释参见 [formula](../formula/formula.ipynb)" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 7, + "source": [ + "# 导入模块\n", + "from EduNLP.SIF.tokenization.formula import tokenize" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "- 输入" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 8, + "source": [ + "formula = \"\\\\frac{\\\\pi}{x + y} + 1 = x\"" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "- 输出" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "(1)如果您想按 latex 语法标记拆分公式的各个部分,并得到顺序序列结果,输出方法可以选择:`linear`" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 9, + "source": [ + "tokenize(formula, method=\"linear\")" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['\\\\frac', '{', '\\\\pi', '}', '{', 'x', '+', 'y', '}', '+', '1', '=', 'x']" + ] + }, + "metadata": {}, + "execution_count": 9 + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "(2) 如果您想得到公式解析出的语法分析树序列,输出方法可以选择:`ast`\n", + "> 抽象语法分析树,简称语法树(Syntax 
tree),是源代码语法结构的一种抽象表示。它以树状的形式表现编程语言的语法结构,树上的每个节点都表示源代码中的一种结构。 \n", + "> 因此,ast 可以看做是公式的语法结构表征。" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 10, + "source": [ + "tokenize(formula, method=\"ast\", return_type=\"list\", ord2token=False)" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['\\\\pi', '{ }', 'x', '+', 'y', '{ }', '\\\\frac', '+', '1', '=', 'x']" + ] + }, + "metadata": {}, + "execution_count": 10 + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "(3)如果您只是关心公式的结构和类型,并不关心变量具体是什么,比如二元二次方程 `x^2 + y = 1` ,它从公式结构和类型上来说,和 `w^2 + z = 1` 没有区别。 \n", + "此时,您可以设置如下参数:`ord2token = True`,将公式变量名转换成 token" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 11, + "source": [ + "# 输出形式选择抽象语法分析树(ast)且将公式变量名转换成 token\n", + "tokenize(formula, method=\"ast\", return_type=\"list\", ord2token=True)" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['mathord',\n", + " '{ }',\n", + " 'mathord',\n", + " '+',\n", + " 'mathord',\n", + " '{ }',\n", + " '\\\\frac',\n", + " '+',\n", + " 'textord',\n", + " '=',\n", + " 'mathord']" + ] + }, + "metadata": {}, + "execution_count": 11 + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "(4) 如果您除了 (3) 中提供的功能之外,还需要区分不同的变量。此时可以另外设置参数:`var_numbering=True`" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 12, + "source": [ + "# 输出形式选择抽象语法分析树(ast)且将公式变量名转换成带编号的 token\n", + "tokenize(formula, method=\"ast\", return_type=\"list\", ord2token=True, var_numbering=True)" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['mathord_con',\n", + " '{ }',\n", + " 'mathord_0',\n", + " '+',\n", + " 'mathord_1',\n", + " '{ }',\n", + " '\\\\frac',\n", + " '+',\n", + " 'textord',\n", + " '=',\n", + " 'mathord_0']" + ] + }, + "metadata": {}, + "execution_count": 12 + } + ], + "metadata": {} + }, + { +
"cell_type": "markdown", + "source": [ + "## 综合解析\n", + "\n", + "综合解析,即综合以上两种解析方式(标记解析 + 公式解析),提供对题目文本的全解析。另外,如遇到特殊符号将转换成常量,例如:\n", + "```python\n", + "FIGURE_SYMBOL = \"[FIGURE]\" # $\\FigureID{1}$\n", + "QUES_MARK_SYMBOL = \"[MARK]\" # $\\SIFChoice$\n", + "```\n" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 39, + "source": [ + "# 导入模块\n", + "from EduNLP.Tokenizer import get_tokenizer\n", + "\n", + "# 输入\n", + "item = {\n", + " \"如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\SIFChoice$$\\FigureID{1}$\"\n", + "}\n", + "\n", + "# 输出\n", + "tokenizer = get_tokenizer(\"text\")\n", + "tokens = tokenizer(item)\n", + "next(tokens) " + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['如图',\n", + " '古希腊',\n", + " '数学家',\n", + " '希波',\n", + " '克拉底',\n", + " '研究',\n", + " '几何图形',\n", + " '此图',\n", + " '三个',\n", + " '半圆',\n", + " '三个',\n", + " '半圆',\n", + " '直径',\n", + " '直角三角形',\n", + " 'ABC',\n", + " '斜边',\n", + " 'BC',\n", + " '直角',\n", + " 'AB',\n", + " 'AC',\n", + " '\\x08',\n", + " 'igtriangleupABC',\n", + " '三边',\n", + " '围成',\n", + " '区域',\n", + " '记',\n", + " 'I',\n", + " '黑色',\n", + " '记',\n", + " 'II',\n", + " '其余部分',\n", + " '记',\n", + " 'III',\n", + " '图形',\n", + " '中',\n", + " '随机',\n", + " '取',\n", + " '一点',\n", + " '此点',\n", + " '取自',\n", + " 'I',\n", + " ',',\n", + " 'II',\n", + " ',',\n", + " 'III',\n", + " '概率',\n", + " '记',\n", + " 'p',\n", + " '_',\n", + " '1',\n", + " ',',\n", + " 'p',\n", + " '_',\n", + " '2',\n", + " ',',\n", + " 'p',\n", + " '_',\n", + " '3',\n", + " '[MARK]',\n", + " '[FIGURE]']" + ] + }, + "metadata": {}, + "execution_count": 39 + } + ], + "metadata": {} + } + ], + "metadata": { + "orig_nbformat": 4, + "language_info": { + "name": "python", + "version": "3.8.5", + "mimetype": "text/x-python", + "codemirror_mode": { +
"name": "ipython", + "version": 3 + }, + "pygments_lexer": "ipython3", + "nbconvert_exporter": "python", + "file_extension": ".py" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3.8.5 64-bit" + }, + "interpreter": { + "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a" + } + }, + "nbformat": 4, + "nbformat_minor": 2 } \ No newline at end of file diff --git a/examples/vectorization/get_pretrained_i2v.ipynb b/examples/vectorization/get_pretrained_i2v.ipynb new file mode 100644 index 00000000..9fe707b7 --- /dev/null +++ b/examples/vectorization/get_pretrained_i2v.ipynb @@ -0,0 +1,211 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# get_pretrained_i2v\n", + "\n", + "## 概述\n", + "\n", + "使用 EduNLP 项目组给定的预训练模型将给定的题目文本转成向量。\n", + "\n", + "- 优点:简单方便。\n", + "- 缺点:只能使用项目中给定的模型,局限性较大。\n" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 导入功能块" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 1, + "source": [ + "from EduNLP import get_pretrained_i2v" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 输入\n", + "\n", + "类型:str \n", + "内容:题目文本 (text)" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 2, + "source": [ + "item = {\n", + "\"如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\SIFChoice$$\\FigureID{1}$\"\n", + "}\n" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 模型选择与使用" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "根据题目所属学科选择预训练模型: \n", + "\n", + " 预训练模型名称 | 模型训练数据的所属学科 \n", + " -------------- | ---------------------- \n", + " d2v_all_256 | 全学科 \n", + " d2v_sci_256 | 理科 \n", + " d2v_eng_256 | 英语 \n", + " d2v_lit_256 | 文科 \n", + "\n" + ], + "metadata": {} + }, + { + 
"cell_type": "code", + "execution_count": 3, + "source": [ + "i2v = get_pretrained_i2v(\"d2v_sci_256\")" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "EduNLP, INFO Use pretrained t2v model d2v_sci_256\n", + "downloader, INFO http://base.ustc.edu.cn/data/model_zoo/EduNLP/d2v/general_science_256.zip is saved as /home/lvrui/.EduNLP/model/general_science_256.zip\n", + "downloader, INFO file existed, skipped\n" + ] + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "- 注意:\n", + " 默认的 EduNLP 项目存储地址为用户主目录下的 `~/.EduNLP`,模型存储地址为项目存储地址下的 `model` 文件夹。您可以通过设置下面的环境变量来修改这些存储地址:\n", + " - EduNLP 项目存储地址:`EDUNLPPATH = xx/xx/xx`\n", + " - 模型存储地址:`EDUNLPMODELPATH = xx/xx/xx`" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 4, + "source": [ + "print(i2v(item))" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "([array([-2.38860980e-01, 7.09681511e-02, -2.71706015e-01, 1.64714813e-01,\n", + " 2.81243492e-02, -1.82386801e-01, 9.22331214e-02, 1.31783364e-02,\n", + " 9.15176645e-02, 3.14464062e-01, 9.37800854e-02, -2.28523940e-01,\n", + " -2.60597020e-01, 6.49375990e-02, 9.75619778e-02, -1.97933778e-01,\n", + " 8.29798505e-02, -2.26491719e-01, -1.77030653e-01, -3.56038064e-02,\n", + " 6.22844934e-01, -2.66110301e-01, 8.00080523e-02, -1.60827965e-01,\n", + " -1.78654417e-01, -1.33000776e-01, 2.76004016e-01, 1.79546073e-01,\n", + " 8.71006995e-02, 2.33958483e-01, 1.76031828e-01, 1.55402005e-01,\n", + " -1.38987333e-01, -1.92975491e-01, -1.09528497e-01, 1.12305783e-01,\n", + " 2.32549626e-02, 7.75609687e-02, -2.43636876e-01, 6.35311157e-02,\n", + " -4.82399836e-02, -2.24204548e-02, 7.49862418e-02, -1.91449642e-01,\n", + " 9.72701237e-02, 4.00750965e-01, 2.81992704e-01, 3.07581365e-01,\n", + " -4.68867749e-01, -3.03025767e-02, -1.95257351e-01, 1.79073047e-02,\n", + " -2.15334237e-01, 9.98005569e-02, -2.62755096e-01, -2.39337608e-01,\n", + " 3.44270498e-01, 
1.50241479e-01, -2.96006531e-01, -3.81666899e-01,\n", + " -1.19041964e-01, 6.18071109e-02, 6.49120063e-02, 9.94637012e-02,\n", + " 1.23297565e-01, 1.29930690e-01, 1.27305657e-01, -1.53804764e-01,\n", + " 7.04720244e-03, -1.33500487e-01, -1.51161134e-01, 1.13862932e-01,\n", + " -2.44814962e-01, -8.95622373e-02, 4.76458520e-02, -5.92206642e-02,\n", + " 2.88407020e-02, -5.88610955e-02, -4.25557904e-02, 3.20446432e-01,\n", + " -2.61463765e-02, 7.19539896e-02, -1.32161498e-01, 1.62227061e-02,\n", + " 1.20197656e-03, -2.03355268e-01, -6.83294982e-03, -2.82588631e-01,\n", + " -1.61395460e-01, -5.05547188e-02, -2.27462381e-01, -1.70932785e-01,\n", + " 1.41351461e-01, -1.30069017e-01, -1.83039993e-01, -6.79691881e-02,\n", + " -2.15642393e-01, -7.84436688e-02, 1.77202985e-01, 4.50607650e-02,\n", + " 7.02605024e-02, 8.01992565e-02, -1.55584306e-01, -2.00563252e-01,\n", + " 1.17082551e-01, 9.73844752e-02, -1.10356934e-01, -1.37866074e-02,\n", + " -8.57235789e-02, -5.56467362e-02, -9.36827138e-02, 6.82030804e-03,\n", + " 6.92379624e-02, -2.28701755e-01, 6.70390204e-02, 1.34586483e-01,\n", + " 2.25231394e-01, 1.33322045e-01, -8.82911906e-02, 1.42205298e-01,\n", + " 2.41012901e-01, 7.94170424e-03, -7.02124536e-02, 2.51370400e-01,\n", + " 1.04983136e-01, -6.39194548e-02, 5.24720028e-02, 7.16757867e-03,\n", + " -1.08169973e-01, -1.08731678e-02, 1.69618204e-02, 7.87692815e-02,\n", + " -2.26539060e-01, 3.29003595e-02, 1.91522852e-01, 2.75921494e-01,\n", + " -1.64055750e-01, 5.83723187e-02, 9.84422341e-02, 3.21688712e-01,\n", + " -2.62310840e-02, -2.08140060e-01, 1.14425711e-01, 1.23823956e-01,\n", + " -8.62085819e-03, -4.14005108e-02, -3.41566652e-02, 1.34680912e-01,\n", + " 4.27634180e-01, 1.42883554e-01, -1.54787973e-01, 7.96157196e-02,\n", + " 1.40678003e-01, 1.39171826e-02, 1.66003749e-01, -4.85638082e-02,\n", + " 5.88261709e-02, 9.51106697e-02, 1.81014258e-02, 1.44485429e-01,\n", + " 4.01205927e-01, 6.77596256e-02, -5.52676022e-01, -1.87850371e-01,\n", + " 1.12366609e-01, 
-6.84190989e-02, 9.48949978e-02, 2.23454669e-01,\n", + " -1.69843137e-01, 2.09085494e-01, 4.29946512e-01, -3.36349100e-01,\n", + " 6.12608856e-03, -1.46142125e-01, -5.11092655e-02, 8.06671828e-02,\n", + " 1.81744993e-01, -6.78945482e-02, -5.77093139e-02, 1.52337164e-01,\n", + " 2.21259117e-01, 3.35705757e-01, -2.51778495e-02, 1.03662543e-01,\n", + " -4.21361588e-02, 1.43061429e-01, -3.92947495e-01, -4.89463992e-02,\n", + " -9.15660262e-02, -1.00108273e-01, 3.86523217e-01, -4.25569601e-02,\n", + " 4.10154127e-02, -3.41399819e-01, 2.13903114e-02, 8.09015241e-03,\n", + " 9.56344381e-02, 1.12729572e-01, 7.25207478e-02, -6.64384067e-02,\n", + " -2.73666024e-01, -2.79651750e-02, 1.18422434e-01, -5.22459708e-02,\n", + " -2.47057881e-02, 2.84700710e-02, 2.07451075e-01, -9.74238589e-02,\n", + " 8.08936954e-02, 4.07307222e-02, -1.35277033e-01, 2.18436554e-01,\n", + " 1.28792310e-02, -1.20433331e-01, 2.41929386e-02, 1.28128864e-02,\n", + " -7.39881098e-02, -1.12995692e-01, 7.69245178e-02, -2.87000872e-02,\n", + " 1.64782573e-02, -2.78794408e-01, -2.64403820e-01, -2.43874848e-01,\n", + " 1.77457914e-01, 4.11631197e-01, -6.09753132e-02, 2.84967333e-01,\n", + " 9.81074646e-02, -2.68213183e-01, 1.52153388e-01, 2.42148209e-02,\n", + " 1.24371536e-01, 6.02926640e-03, 8.22689310e-02, 2.82294262e-04,\n", + " -1.40584474e-02, 4.09389734e-02, -2.58334547e-01, -9.83026102e-02,\n", + " -1.91695184e-01, -2.61005852e-02, -2.21736208e-01, -4.36628833e-02,\n", + " 9.49840024e-02, -5.16017936e-02, 2.17577979e-01, 2.58604765e-01,\n", + " 6.33814484e-02, -7.10158283e-03, 9.87893157e-03, -2.26405971e-02,\n", + " 1.67435139e-01, 2.90897069e-03, 2.35914681e-02, 5.43428905e-06],\n", + " dtype=float32)], None)\n" + ] + } + ], + "metadata": {} + } + ], + "metadata": { + "orig_nbformat": 4, + "language_info": { + "name": "python", + "version": "3.8.5", + "mimetype": "text/x-python", + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "pygments_lexer": "ipython3", + "nbconvert_exporter": 
"python", + "file_extension": ".py" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3.8.5 64-bit" + }, + "interpreter": { + "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/examples/vectorization/get_pretrained_t2v.ipynb b/examples/vectorization/get_pretrained_t2v.ipynb new file mode 100644 index 00000000..c0982e81 --- /dev/null +++ b/examples/vectorization/get_pretrained_t2v.ipynb @@ -0,0 +1,160 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# get_pretrained_t2v\n", + "\n", + "## 概述\n", + "\n", + "使用 EduNLP 项目组给定的预训练模型将一组题目的切分序列表征为向量。\n", + "\n", + "- 优点:简单方便。\n", + "- 缺点:只能使用项目中给定的模型,局限性较大。" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 导入功能块" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 1, + "source": [ + "from tqdm import tqdm\n", + "from EduNLP.SIF.segment import seg\n", + "from EduNLP.SIF.tokenization import tokenize\n", + "from EduNLP.Pretrain import GensimWordTokenizer\n", + "from EduNLP.Vector import get_pretrained_t2v" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 输入\n", + "\n", + "类型:list \n", + "内容:一个题组中每个题目切分序列的组合。\n", + "> 这里需要调用 `GensimWordTokenizer` 将题目文本(`str` 类型)转换成 tokens。" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 2, + "source": [ + "def load_items():\n", + " test_items = [\n", + " {'ques_content':'有公式$\\\\FormFigureID{wrong1?}$和公式$\\\\FormFigureBase64{wrong2?}$,如图$\\\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,若$x,y$满足约束条件$\\\\SIFSep$,则$z=x+7 y$的最大值为$\\\\SIFBlank$'},\n", + " {'ques_content':'如图$\\\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,若$x,y$满足约束条件$\\\\SIFSep$,则$z=x+7 y$的最大值为$\\\\SIFBlank$'},\n", + " {'ques_content':'
Below is a discussion on a website.
t2v\n", + "t2v = get_pretrained_t2v(\"d2v_sci_256\")" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "downloader, INFO http://base.ustc.edu.cn/data/model_zoo/EduNLP/d2v/general_science_256.zip is saved as /home/lvrui/.EduNLP/model/general_science_256.zip\n", + "downloader, INFO file existed, skipped\n" + ] + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "- 注意:\n", + " 默认的 EduNLP 项目存储地址为用户主目录下的 `~/.EduNLP`,模型存储地址为项目存储地址下的 `model` 文件夹。您可以通过设置下面的环境变量来修改这些存储地址:\n", + " - EduNLP 项目存储地址:`EDUNLPPATH = xx/xx/xx`\n", + " - 模型存储地址:`EDUNLPMODELPATH = xx/xx/xx`" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "t2v(token_items)" + ], + "outputs": [], + "metadata": {} + } + ], + "metadata": { + "orig_nbformat": 4, + "language_info": { + "name": "python", + "version": "3.8.5", + "mimetype": "text/x-python", + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "pygments_lexer": "ipython3", + "nbconvert_exporter": "python", + "file_extension": ".py" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3.8.5 64-bit" + }, + "interpreter": { + "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/examples/vectorization/i2v.ipynb b/examples/vectorization/i2v.ipynb index 4654d6cf..3122fbce 100644 --- a/examples/vectorization/i2v.ipynb +++ b/examples/vectorization/i2v.ipynb @@ -26,16 +26,7 @@ "source": [ "from EduNLP.I2V import D2V" ], - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "/home/lvrui/.local/lib/python3.8/site-packages/gensim/similarities/__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package is unavailable. Install Levenhstein (e.g. 
`pip install python-Levenshtein`) to suppress this warning.\n", - " warnings.warn(msg)\n" - ] - } - ], + "outputs": [], "metadata": {} }, { @@ -198,4 +189,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} \ No newline at end of file +} diff --git a/examples/vectorization/t2v.ipynb b/examples/vectorization/t2v.ipynb new file mode 100644 index 00000000..908ff182 --- /dev/null +++ b/examples/vectorization/t2v.ipynb @@ -0,0 +1,261 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# T2V\n", + "\n", + "## 概述\n", + "\n", + "使用自己提供的任一预训练模型(给出模型存放路径即可)将一组题目的切分序列表征为向量。\n", + "\n", + "- 优点:模型及其参数可自主调整,灵活性强。\n" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 导入功能块" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 12, + "source": [ + "from tqdm import tqdm\n", + "from EduNLP.SIF.segment import seg\n", + "from EduNLP.SIF.tokenization import tokenize\n", + "from EduNLP.Pretrain import GensimWordTokenizer\n", + "from EduNLP.Vector import T2V" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 输入\n", + "\n", + "类型:list \n", + "内容:一个题组中每个题目切分序列的组合。\n", + "> 这里需要调用 `GensimWordTokenizer` 将题目文本(`str` 类型)转换成 tokens。" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 11, + "source": [ + "print(type(token_items))\n", + "print(type(token_items[0]))" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "<class 'list'>\n", + "<class 'list'>\n" + ] + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 6, + "source": [ + "token_items[0]" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['公式',\n", + " '[FORMULA]',\n", + " '公式',\n", + " '[FORMULA]',\n", + " '如图',\n", + " '[FIGURE]',\n", + " 'x',\n", + " ',',\n", + " 'y',\n", + " '约束条件',\n", + " '[SEP]',\n", + " 'z',\n", + " '=',\n", + " 'x',\n", + " '+',\n", + " '7',\n", + " 'y',\n", + " '最大值',\n", + " '[MARK]']" + ] + }, + "metadata": {}, 
+ "execution_count": 6 + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 输出" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 4, + "source": [ + "path = \"../test_model/test_gensim_luna_stem_tf_d2v_256.bin\"\n", + "t2v = T2V('d2v',filepath=path)\n", + "t2v(token_items)" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[array([ 0.0256574 , 0.06061139, -0.00121044, -0.0167674 , -0.0111706 ,\n", + " 0.05325712, -0.02097339, -0.01613594, 0.02904145, 0.0185046 ,\n", + " 0.03473525, 0.00628165, 0.03696947, 0.00666153, -0.02352318,\n", + " -0.00458236, 0.02308686, -0.02153478, 0.01579256, -0.01575841,\n", + " -0.02654778, 0.01376328, 0.02539059, -0.01098955, 0.02203193,\n", + " -0.01503642, 0.01310026, -0.03569775, -0.00450978, 0.02522727,\n", + " -0.01547103, -0.00907244, -0.00072009, -0.0021727 , 0.02894731,\n", + " 0.01382611, 0.01647377, 0.00452782, -0.02488854, 0.02741116,\n", + " 0.0489724 , -0.04156181, -0.00855933, 0.01783935, 0.00704233,\n", + " 0.01296936, -0.06078439, -0.04922014, -0.0206639 , 0.00820663,\n", + " 0.02565274, 0.0164784 , 0.00996537, -0.02215545, 0.06741589,\n", + " 0.01634789, -0.0094168 , 0.00183323, 0.00853508, -0.0547929 ,\n", + " 0.00405556, 0.01386227, -0.04204945, 0.02175955, -0.01960315,\n", + " -0.05279269, -0.01511251, -0.02905018, -0.00405249, 0.03328003,\n", + " -0.00487469, -0.00338632, 0.01793213, 0.00942458, -0.02468935,\n", + " 0.03548338, -0.00907473, 0.00927462, -0.02545504, 0.02286367,\n", + " -0.01822809, 0.03625014, -0.00976438, -0.00188348, 0.06408882,\n", + " -0.04314236, -0.00193059, 0.02433112, -0.0091018 , 0.0276503 ,\n", + " -0.0036342 , -0.02485391, 0.02309245, 0.01880057, -0.00893952,\n", + " -0.03391525, 0.02678591, -0.00618519, -0.03601262, 0.0327184 ,\n", + " 0.09240578, 0.03631649, -0.00700663, -0.01786321, -0.02987848,\n", + " 0.00315695, -0.02082208, -0.00494443, -0.02717963, -0.00938541,\n", + " -0.0329605 , 
0.0069218 , 0.01227082, 0.00856757, -0.0008222 ,\n", + " -0.0067637 , -0.01577486, 0.0628339 , -0.02329138, -0.00475964,\n", + " 0.02197625, 0.03022351, 0.00256966, -0.00247619, -0.01218352,\n", + " 0.01257284, 0.0051926 , -0.05297434, -0.0057066 , 0.01031242,\n", + " 0.02414824, -0.0115857 , 0.01625632, -0.03126714, -0.02389767,\n", + " -0.01417263, 0.02280749, -0.01431546, -0.00771551, 0.0264634 ,\n", + " 0.00115387, -0.01903204, -0.00100629, 0.00608774, 0.03787961,\n", + " 0.05098663, 0.03064756, -0.00654223, -0.01838502, -0.01889201,\n", + " 0.04686983, -0.02295219, -0.00901293, 0.00916024, -0.00013042,\n", + " 0.01236307, -0.00918534, 0.01792936, 0.00862702, -0.00018518,\n", + " -0.00566689, 0.00499178, 0.0246148 , -0.0170825 , 0.01850726,\n", + " 0.00031357, 0.02411471, 0.01080729, -0.01361136, -0.06226439,\n", + " 0.01830878, 0.01209503, -0.00980596, -0.01865078, 0.03692432,\n", + " -0.04503555, 0.0037965 , -0.04214804, -0.05657932, -0.01566005,\n", + " 0.00271924, -0.00026349, -0.00783886, 0.01218421, -0.03205092,\n", + " -0.02793218, -0.00298462, 0.00380523, 0.04471321, -0.02079478,\n", + " 0.0100926 , 0.00450996, -0.03412817, 0.03027697, 0.00872989,\n", + " 0.01512562, 0.01527565, 0.03683509, 0.05608684, 0.01055199,\n", + " 0.01637757, -0.01995301, -0.01610573, 0.04207385, 0.00058077,\n", + " 0.03846577, 0.04952911, -0.02142448, 0.0049874 , -0.00308159,\n", + " -0.02233348, 0.02013967, -0.01194606, -0.02481469, 0.01824989,\n", + " -0.00939436, -0.00374474, 0.02278485, 0.04107878, 0.01870474,\n", + " -0.00310527, -0.00257802, -0.03689042, -0.0200304 , -0.04838364,\n", + " 0.0035307 , 0.02496746, -0.0385387 , 0.01649689, 0.01429029,\n", + " 0.04338812, -0.05614391, -0.01632982, 0.03378268, 0.01393604,\n", + " -0.03859077, 0.01855484, 0.00241599, -0.00985778, 0.00530987,\n", + " 0.03700508, -0.06107654, -0.00972089, 0.02251891, 0.01154722,\n", + " 0.00913082, -0.0267815 , -0.01723521, 0.0136464 , 0.01965802,\n", + " 0.04769301, -0.02218902, -0.01268643, 
0.00650465, 0.00985247,\n", + " 0.0029873 ], dtype=float32),\n", + " array([ 0.00877787, 0.03242666, -0.00026327, -0.01881958, -0.00730135,\n", + " 0.03559063, -0.01825701, -0.01065201, 0.01681685, 0.01074173,\n", + " 0.02253641, 0.0082016 , 0.02200216, 0.00088347, -0.0205142 ,\n", + " -0.01339685, 0.01239092, -0.01781665, 0.01000167, -0.01227449,\n", + " -0.03044926, 0.00296532, 0.01440197, -0.01035894, 0.01061506,\n", + " -0.00530907, 0.00484147, -0.02209524, 0.00735557, 0.01712263,\n", + " -0.00231011, -0.01255511, -0.00114341, -0.01413104, 0.02112199,\n", + " 0.01123461, 0.01380601, -0.00019924, -0.02128731, 0.01526375,\n", + " 0.02988552, -0.02491145, -0.00939747, 0.00798917, 0.0135474 ,\n", + " 0.01258122, -0.03753063, -0.04039029, -0.01517935, 0.00668549,\n", + " 0.02796665, 0.01242495, 0.0059546 , -0.01216253, 0.0372387 ,\n", + " 0.01762399, -0.00170241, 0.0003667 , 0.00895109, -0.03517802,\n", + " -0.00762667, 0.01357641, -0.02436312, 0.01829541, -0.01330634,\n", + " -0.02818829, -0.01139517, -0.01664645, 0.00769452, 0.01209339,\n", + " -0.00416979, -0.01296107, -0.0064631 , 0.0050506 , -0.01833598,\n", + " 0.02872021, -0.00062401, 0.0109796 , -0.01280711, 0.01152301,\n", + " -0.01085931, 0.02023655, 0.00272896, -0.00558658, 0.03704501,\n", + " -0.01837787, -0.00414707, 0.00713773, -0.01023714, 0.0090292 ,\n", + " 0.00089387, -0.01082103, 0.02051528, 0.01287969, -0.0074691 ,\n", + " -0.01942614, 0.01223695, -0.0136801 , -0.01567431, 0.01466064,\n", + " 0.04967042, 0.02889016, -0.005946 , -0.00131571, -0.0110809 ,\n", + " 0.00165396, -0.01279759, -0.01407798, -0.01902512, -0.01361593,\n", + " -0.00631681, -0.00142478, 0.01678663, 0.00815052, -0.00193329,\n", + " -0.00845464, -0.00746565, 0.03766166, -0.01099476, 0.00489809,\n", + " 0.01403449, 0.01477709, -0.00150515, 0.00462877, -0.01271886,\n", + " 0.00072193, 0.00815068, -0.04432011, -0.00604029, -0.00264471,\n", + " 0.01325564, -0.01315497, 0.00713541, -0.0137267 , -0.01845939,\n", + " -0.02801731, 
0.01673851, -0.00593479, -0.01457028, 0.01636872,\n", + " -0.00751132, -0.01056858, 0.01126528, 0.01645665, 0.02689397,\n", + " 0.01920939, 0.01767929, -0.00843761, -0.01002457, -0.00844629,\n", + " 0.02888541, -0.00503441, -0.00025836, 0.01326172, -0.00968244,\n", + " 0.00430614, -0.00964946, 0.00635843, 0.00445558, -0.00235765,\n", + " 0.00160239, -0.00325711, 0.03206096, -0.00511734, 0.01108837,\n", + " 0.0014369 , 0.02616214, 0.01631057, -0.00778238, -0.04322761,\n", + " -0.00086197, 0.01174034, -0.00230315, -0.01354581, 0.01665967,\n", + " -0.02281472, -0.0123808 , -0.02901287, -0.04143119, -0.00477564,\n", + " 0.00608404, -0.00701787, -0.00686041, 0.01422733, -0.02854553,\n", + " -0.01464688, -0.00404892, 0.00348112, 0.02299088, -0.02302668,\n", + " 0.01208024, 0.01010513, -0.01571813, 0.01446694, -0.00129136,\n", + " -0.00054684, -0.00328883, 0.01649218, 0.03326375, -0.00185443,\n", + " 0.02091988, -0.00814938, -0.0088084 , 0.02302703, -0.01156406,\n", + " 0.04080933, 0.02902327, -0.01330268, -0.00385899, -0.00826302,\n", + " -0.02295679, 0.00658087, -0.0056047 , -0.01404469, 0.00368797,\n", + " -0.01484573, 0.00689151, 0.02035506, 0.02181732, 0.02151672,\n", + " 0.0004279 , -0.00763045, -0.01551796, -0.02054572, -0.03275407,\n", + " 0.00623783, 0.007831 , -0.02604559, 0.01956206, 0.0161521 ,\n", + " 0.02634443, -0.03285164, -0.01301691, 0.01066694, 0.01585914,\n", + " -0.0187955 , 0.01046878, -0.00189302, -0.01132144, -0.00140048,\n", + " 0.02645635, -0.04300842, -0.00639437, 0.01285532, -0.00437311,\n", + " 0.01163111, -0.015357 , -0.00531165, 0.01102756, 0.00182517,\n", + " 0.02303016, -0.00949884, -0.02009463, 0.00573564, 0.00076009,\n", + " 0.00078505], dtype=float32)]" + ] + }, + "metadata": {}, + "execution_count": 4 + } + ], + "metadata": {} + } + ], + "metadata": { + "orig_nbformat": 4, + "language_info": { + "name": "python", + "version": "3.8.5", + "mimetype": "text/x-python", + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + 
"pygments_lexer": "ipython3", + "nbconvert_exporter": "python", + "file_extension": ".py" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3.8.5 64-bit" + }, + "interpreter": { + "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file