From 87b2b635a1f26d5b099e8f3cbedd00e03afb2a91 Mon Sep 17 00:00:00 2001 From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com> Date: Mon, 23 Aug 2021 12:26:33 +0800 Subject: [PATCH 01/30] =?UTF-8?q?Create=20=E4=BB=A4=E7=89=8C=E5=8C=96.rst?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../zh/tokenize/\344\273\244\347\211\214\345\214\226.rst" | 1 + 1 file changed, 1 insertion(+) diff --git "a/docs/source/tutorial/zh/tokenize/\344\273\244\347\211\214\345\214\226.rst" "b/docs/source/tutorial/zh/tokenize/\344\273\244\347\211\214\345\214\226.rst" index 9782bece..276b219a 100644 --- "a/docs/source/tutorial/zh/tokenize/\344\273\244\347\211\214\345\214\226.rst" +++ "b/docs/source/tutorial/zh/tokenize/\344\273\244\347\211\214\345\214\226.rst" @@ -24,5 +24,6 @@ Examples :titlesonly: ../tokenization/TextTokenizer + ../tokenization/PureTextTokenizer ../tokenization/GensimSegTokenizer ../tokenization/GensimWordTokenizer From 4816fa43970c8ff62e08befc6509f353363413bf Mon Sep 17 00:00:00 2001 From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com> Date: Mon, 23 Aug 2021 12:35:38 +0800 Subject: [PATCH 02/30] Create PureTextTokenizer.ipynb --- .../zh/tokenization/PureTextTokenizer.ipynb | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 docs/source/tutorial/zh/tokenization/PureTextTokenizer.ipynb diff --git a/docs/source/tutorial/zh/tokenization/PureTextTokenizer.ipynb b/docs/source/tutorial/zh/tokenization/PureTextTokenizer.ipynb new file mode 100644 index 00000000..14f955b4 --- /dev/null +++ b/docs/source/tutorial/zh/tokenization/PureTextTokenizer.ipynb @@ -0,0 +1,32 @@ +PureTextTokenizer +================ + +即纯净型文本令牌解析器,在默认情况下对传入的item中的图片、标签、分隔符、题目空缺符等部分则转换成特殊字符进行保护,并对特殊公式(例如:$\\FormFigureID{...}$, $\\FormFigureBase64{...}$)进行筛除,从而对文本、纯文本公式进行令牌化操作。此外,此令牌解析器对文本、公式均采用线性的分析方法,并提供的key参数用于对传入的item进行预处理,待未来根据需求进行开发。 + + +Examples +---------- + +:: + + >>> tokenizer = PureTextTokenizer() + >>> items = 
["有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\ + ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$"] + >>> tokens = tokenizer(items) + >>> next(tokens)[:10] + ['公式', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[SEP]', 'z'] + >>> items = ["已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$"] + >>> tokens = tokenizer(items) + >>> next(tokens) # doctest: +NORMALIZE_WHITESPACE + ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', + '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', + '\\quad', 'A', '\\cap', 'B', '='] + >>> items = [{ + ... "stem": "已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$", + ... "options": ["1", "2"] + ... }] + >>> tokens = tokenizer(items, key=lambda x: x["stem"]) + >>> next(tokens) # doctest: +NORMALIZE_WHITESPACE + ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', + '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', + '\\quad', 'A', '\\cap', 'B', '='] From d46aa06c7f985ab93f1fb75fbe2d56adbc675b28 Mon Sep 17 00:00:00 2001 From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com> Date: Mon, 23 Aug 2021 12:38:01 +0800 Subject: [PATCH 03/30] =?UTF-8?q?Create=20=E4=BB=A4=E7=89=8C=E5=8C=96.rst?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../zh/tokenize/\344\273\244\347\211\214\345\214\226.rst" | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git "a/docs/source/tutorial/zh/tokenize/\344\273\244\347\211\214\345\214\226.rst" "b/docs/source/tutorial/zh/tokenize/\344\273\244\347\211\214\345\214\226.rst" index 276b219a..230aa200 100644 --- "a/docs/source/tutorial/zh/tokenize/\344\273\244\347\211\214\345\214\226.rst" 
+++ "b/docs/source/tutorial/zh/tokenize/\344\273\244\347\211\214\345\214\226.rst" @@ -17,7 +17,7 @@ Examples -通过查看"./EduNLP/Tokenizer/tokenizer.py"及"./EduNLP/Pretrain/gensim_vec.py"可以查看更多令牌化器,下面是一个完整的令牌化器列表 +通过查看 ``./EduNLP/Tokenizer/tokenizer.py`` 及 ``./EduNLP/Pretrain/gensim_vec.py`` 可以查看更多令牌化器,下面是一个完整的令牌化器列表 .. toctree:: :maxdepth: 1 From bc8db6a143bc1df776d19f92b73fb8cff2663b82 Mon Sep 17 00:00:00 2001 From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com> Date: Mon, 23 Aug 2021 12:39:45 +0800 Subject: [PATCH 04/30] Create PureTextTokenizer.ipynb --- docs/source/tutorial/zh/tokenization/PureTextTokenizer.ipynb | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/source/tutorial/zh/tokenization/PureTextTokenizer.ipynb b/docs/source/tutorial/zh/tokenization/PureTextTokenizer.ipynb index 14f955b4..12181e94 100644 --- a/docs/source/tutorial/zh/tokenization/PureTextTokenizer.ipynb +++ b/docs/source/tutorial/zh/tokenization/PureTextTokenizer.ipynb @@ -3,7 +3,6 @@ PureTextTokenizer 即纯净型文本令牌解析器,在默认情况下对传入的item中的图片、标签、分隔符、题目空缺符等部分则转换成特殊字符进行保护,并对特殊公式(例如:$\\FormFigureID{...}$, $\\FormFigureBase64{...}$)进行筛除,从而对文本、纯文本公式进行令牌化操作。此外,此令牌解析器对文本、公式均采用线性的分析方法,并提供的key参数用于对传入的item进行预处理,待未来根据需求进行开发。 - Examples ---------- From 1d05290d1644e857502e59e15762766feb5a0006 Mon Sep 17 00:00:00 2001 From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com> Date: Mon, 23 Aug 2021 16:39:30 +0800 Subject: [PATCH 05/30] Create pretrain.rst --- docs/source/ap/pretrain.rst | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 docs/source/ap/pretrain.rst diff --git a/docs/source/ap/pretrain.rst b/docs/source/ap/pretrain.rst new file mode 100644 index 00000000..36c631e2 --- /dev/null +++ b/docs/source/ap/pretrain.rst @@ -0,0 +1,6 @@ +EduNLP.Pretrain.gensim_vec +============== + +.. 
automodule:: EduNLP.Pretrain.gensim_vec + :members: + :imported-members: From ac3020de465e6006059c7aa7525fdc4e5a327f89 Mon Sep 17 00:00:00 2001 From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com> Date: Mon, 23 Aug 2021 16:40:04 +0800 Subject: [PATCH 06/30] Delete docs/source/ap directory --- docs/source/ap/pretrain.rst | 6 ------ 1 file changed, 6 deletions(-) delete mode 100644 docs/source/ap/pretrain.rst diff --git a/docs/source/ap/pretrain.rst b/docs/source/ap/pretrain.rst deleted file mode 100644 index 36c631e2..00000000 --- a/docs/source/ap/pretrain.rst +++ /dev/null @@ -1,6 +0,0 @@ -EduNLP.Pretrain.gensim_vec -============== - -.. automodule:: EduNLP.Pretrain.gensim_vec - :members: - :imported-members: From 53c5da642f52ef44fa83fcc17f9a0426c5fc969d Mon Sep 17 00:00:00 2001 From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com> Date: Mon, 23 Aug 2021 16:40:41 +0800 Subject: [PATCH 07/30] Create Pretrain.rst --- docs/source/api/Pretrain.rst | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 docs/source/api/Pretrain.rst diff --git a/docs/source/api/Pretrain.rst b/docs/source/api/Pretrain.rst new file mode 100644 index 00000000..36c631e2 --- /dev/null +++ b/docs/source/api/Pretrain.rst @@ -0,0 +1,6 @@ +EduNLP.Pretrain.gensim_vec +============== + +.. 
automodule:: EduNLP.Pretrain.gensim_vec + :members: + :imported-members: From 13d1aced345327e34f0dc8eb49b375b0aeef6dc9 Mon Sep 17 00:00:00 2001 From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com> Date: Mon, 23 Aug 2021 16:42:14 +0800 Subject: [PATCH 08/30] Create PureTextTokenizer.rst --- .../{PureTextTokenizer.ipynb => PureTextTokenizer.rst} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename docs/source/tutorial/zh/tokenization/{PureTextTokenizer.ipynb => PureTextTokenizer.rst} (100%) diff --git a/docs/source/tutorial/zh/tokenization/PureTextTokenizer.ipynb b/docs/source/tutorial/zh/tokenization/PureTextTokenizer.rst similarity index 100% rename from docs/source/tutorial/zh/tokenization/PureTextTokenizer.ipynb rename to docs/source/tutorial/zh/tokenization/PureTextTokenizer.rst From 9fd12127c71f3e8a92b5408a491508e4470da930 Mon Sep 17 00:00:00 2001 From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com> Date: Mon, 23 Aug 2021 16:47:09 +0800 Subject: [PATCH 09/30] Create pretrain.rst --- docs/source/api/{Pretrain.rst => pretrain.rst} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename docs/source/api/{Pretrain.rst => pretrain.rst} (100%) diff --git a/docs/source/api/Pretrain.rst b/docs/source/api/pretrain.rst similarity index 100% rename from docs/source/api/Pretrain.rst rename to docs/source/api/pretrain.rst From 2798fc5d39aae0d2c4aad42413bc8499d894d813 Mon Sep 17 00:00:00 2001 From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com> Date: Mon, 23 Aug 2021 16:47:27 +0800 Subject: [PATCH 10/30] Create index.rst --- docs/source/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/index.rst b/docs/source/index.rst index 13f8b2c6..96dc50ff 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -168,3 +168,4 @@ If this repository is helpful for you, please cite our work api/i2v api/sif api/formula + api/pretrain From c309caae103dd348932fe3bf29d1656f7c78763d Mon Sep 17 00:00:00 2001 From: BAOOOOOM 
<82091145+BAOOOOOM@users.noreply.github.com> Date: Mon, 23 Aug 2021 17:08:26 +0800 Subject: [PATCH 11/30] Create ModelZoo.rst --- docs/source/api/ModelZoo.rst | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 docs/source/api/ModelZoo.rst diff --git a/docs/source/api/ModelZoo.rst b/docs/source/api/ModelZoo.rst new file mode 100644 index 00000000..8a87d953 --- /dev/null +++ b/docs/source/api/ModelZoo.rst @@ -0,0 +1,16 @@ +ModelZoo +============== + +rnn +----------- + +.. automodule:: EduNLP.ModelZoo.rnn + :members: + :imported-members: + +utils +----------- + +.. automodule:: EduNLP.ModelZoo.utils + :members: + :imported-members: From a5eac35eb7a645a77a53e2e8afa880ce135ae4fb Mon Sep 17 00:00:00 2001 From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com> Date: Mon, 23 Aug 2021 17:11:52 +0800 Subject: [PATCH 12/30] Create index.rst --- docs/source/index.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/source/index.rst b/docs/source/index.rst index 96dc50ff..72e24a7a 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -167,5 +167,8 @@ If this repository is helpful for you, please cite our work api/index api/i2v api/sif + api/tokenizer api/formula api/pretrain + api/ModelZoo + From ff588d88701615b4fd9a738620c7150bf898ee14 Mon Sep 17 00:00:00 2001 From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com> Date: Mon, 23 Aug 2021 17:13:11 +0800 Subject: [PATCH 13/30] Create tokenizer.rst --- docs/source/api/tokenizer.rst | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 docs/source/api/tokenizer.rst diff --git a/docs/source/api/tokenizer.rst b/docs/source/api/tokenizer.rst new file mode 100644 index 00000000..63d27f48 --- /dev/null +++ b/docs/source/api/tokenizer.rst @@ -0,0 +1,6 @@ +EduNLP.Tokenizer +===================================== + +.. 
automodule:: EduNLP.Tokenizer + :members: + :imported-members: From 4d32acd74703eea0fd525ba0d82e66af5829fc82 Mon Sep 17 00:00:00 2001 From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com> Date: Mon, 23 Aug 2021 17:25:43 +0800 Subject: [PATCH 14/30] Create vector.rst --- docs/source/api/vector.rst | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 docs/source/api/vector.rst diff --git a/docs/source/api/vector.rst b/docs/source/api/vector.rst new file mode 100644 index 00000000..b8b43d58 --- /dev/null +++ b/docs/source/api/vector.rst @@ -0,0 +1,16 @@ +EduNLP.Vector +========================== + +Vector +--------------- + +.. automodule:: EduNLP.Vector + :members: + :imported-members: + +rnn +----------- + +.. automodule:: EduNLP.Vector.rnn + :members: + :imported-members: From e9a117edbf23d0fb23795c2c38feadcd9c7ef078 Mon Sep 17 00:00:00 2001 From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com> Date: Mon, 23 Aug 2021 17:28:56 +0800 Subject: [PATCH 15/30] Create utils.rst --- docs/source/api/utils.rst | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 docs/source/api/utils.rst diff --git a/docs/source/api/utils.rst b/docs/source/api/utils.rst new file mode 100644 index 00000000..9ad570bf --- /dev/null +++ b/docs/source/api/utils.rst @@ -0,0 +1,6 @@ +EduNLP.utils +==================== + +.. automodule:: EduNLP.utils + :members: + :imported-members: From 6fd38433b483e55380bec77d22d10646e3b834b9 Mon Sep 17 00:00:00 2001 From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com> Date: Mon, 23 Aug 2021 17:29:54 +0800 Subject: [PATCH 16/30] Create pretrain.rst --- docs/source/api/pretrain.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/api/pretrain.rst b/docs/source/api/pretrain.rst index 36c631e2..474d389d 100644 --- a/docs/source/api/pretrain.rst +++ b/docs/source/api/pretrain.rst @@ -1,6 +1,6 @@ -EduNLP.Pretrain.gensim_vec +EduNLP.Pretrain ============== -.. 
automodule:: EduNLP.Pretrain.gensim_vec +.. automodule:: EduNLP.Pretrain :members: :imported-members: From bd2f7f18ea6cdf3035e7322442196055146e4371 Mon Sep 17 00:00:00 2001 From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com> Date: Mon, 23 Aug 2021 19:11:01 +0800 Subject: [PATCH 17/30] Create index.rst --- docs/source/index.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/index.rst b/docs/source/index.rst index 72e24a7a..16107eae 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -171,4 +171,6 @@ If this repository is helpful for you, please cite our work api/formula api/pretrain api/ModelZoo + api/vector + api/utils From 58423c25782c6615bee05ff401ce23bc5ddf4d85 Mon Sep 17 00:00:00 2001 From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com> Date: Mon, 23 Aug 2021 19:14:02 +0800 Subject: [PATCH 18/30] Create pretrain.rst --- docs/source/api/pretrain.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/api/pretrain.rst b/docs/source/api/pretrain.rst index 474d389d..977a0923 100644 --- a/docs/source/api/pretrain.rst +++ b/docs/source/api/pretrain.rst @@ -1,6 +1,6 @@ EduNLP.Pretrain ============== -.. automodule:: EduNLP.Pretrain +.. automodule:: EduNLP.Pretrain.gensim_vec :members: :imported-members: From c8dfd43cfa5810d0fa950465855e99bda6bf737c Mon Sep 17 00:00:00 2001 From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com> Date: Mon, 23 Aug 2021 19:18:01 +0800 Subject: [PATCH 19/30] Create index.rst --- docs/source/api/index.rst | 41 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/docs/source/api/index.rst b/docs/source/api/index.rst index 044ed6d3..683c993c 100644 --- a/docs/source/api/index.rst +++ b/docs/source/api/index.rst @@ -1,2 +1,43 @@ EduNLP ====== + +SIF +---------------------- +.. automodule:: EduNLP.SIF.sif + :members: + :imported-members: + +EduNLP.Formula +--------------------- + +.. 
automodule:: EduNLP.Formula.ast + :members: + :imported-members: + +EduNLP.I2V +----------------- + +.. automodule:: EduNLP.I2V.i2v + :members: + :imported-members: + +EduNLP.Pretrain +------------------- + +.. automodule:: EduNLP.Pretrain.gensim_vec + :members: + :imported-members: + +EduNLP.Tokenizer +---------------------- + +.. automodule:: EduNLP.Tokenizer + :members: + :imported-members: + +Vector +--------------- + +.. automodule:: EduNLP.Vector + :members: + :imported-members: From 7d6528e2ecdff075ea9be77de0f955f3558fb80a Mon Sep 17 00:00:00 2001 From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com> Date: Mon, 23 Aug 2021 19:35:46 +0800 Subject: [PATCH 20/30] Create pretrain.rst --- docs/source/api/pretrain.rst | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/docs/source/api/pretrain.rst b/docs/source/api/pretrain.rst index 977a0923..e56289ab 100644 --- a/docs/source/api/pretrain.rst +++ b/docs/source/api/pretrain.rst @@ -1,6 +1,15 @@ EduNLP.Pretrain -============== +================== -.. automodule:: EduNLP.Pretrain.gensim_vec +.. automodule:: EduNLP.Pretrain + :members: + :imported-members: + + + +EduNLP.I2V +============ + +.. 
automodule:: EduNLP.I2V.i2v :members: :imported-members: From cca587052274ef111fc9d6b4f92978c86b5ace9e Mon Sep 17 00:00:00 2001 From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com> Date: Mon, 23 Aug 2021 19:40:11 +0800 Subject: [PATCH 21/30] Create ModelZoo.rst --- docs/source/api/ModelZoo.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/api/ModelZoo.rst b/docs/source/api/ModelZoo.rst index 8a87d953..ffdc764d 100644 --- a/docs/source/api/ModelZoo.rst +++ b/docs/source/api/ModelZoo.rst @@ -1,4 +1,4 @@ -ModelZoo +EduNLP.ModelZoo ============== rnn From f8f01661ed27b76a935d884980ed7d0bea30a308 Mon Sep 17 00:00:00 2001 From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com> Date: Mon, 23 Aug 2021 19:44:42 +0800 Subject: [PATCH 22/30] Create pretrain.rst --- docs/source/api/pretrain.rst | 9 --------- 1 file changed, 9 deletions(-) diff --git a/docs/source/api/pretrain.rst b/docs/source/api/pretrain.rst index e56289ab..f418eda0 100644 --- a/docs/source/api/pretrain.rst +++ b/docs/source/api/pretrain.rst @@ -4,12 +4,3 @@ EduNLP.Pretrain .. automodule:: EduNLP.Pretrain :members: :imported-members: - - - -EduNLP.I2V -============ - -.. automodule:: EduNLP.I2V.i2v - :members: - :imported-members: From 9e0d35504aa53789e56e8240818d69a5dc84965c Mon Sep 17 00:00:00 2001 From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com> Date: Mon, 23 Aug 2021 19:45:17 +0800 Subject: [PATCH 23/30] Create index.rst --- docs/source/api/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/api/index.rst b/docs/source/api/index.rst index 683c993c..30a14fc2 100644 --- a/docs/source/api/index.rst +++ b/docs/source/api/index.rst @@ -24,7 +24,7 @@ EduNLP.I2V EduNLP.Pretrain ------------------- -.. automodule:: EduNLP.Pretrain.gensim_vec +.. 
automodule:: EduNLP.Pretrain :members: :imported-members: From 6eb9e7ce647a8d1064d7e20c223dcb505c90d9f1 Mon Sep 17 00:00:00 2001 From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com> Date: Mon, 23 Aug 2021 19:47:08 +0800 Subject: [PATCH 24/30] Create gensim_vec.py --- EduNLP/Pretrain/gensim_vec.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/EduNLP/Pretrain/gensim_vec.py b/EduNLP/Pretrain/gensim_vec.py index fde1bb43..acc3d67c 100644 --- a/EduNLP/Pretrain/gensim_vec.py +++ b/EduNLP/Pretrain/gensim_vec.py @@ -15,8 +15,7 @@ class GensimWordTokenizer(object): - def __init__(self, symbol="gm", general=False): - """ + """ Parameters ---------- @@ -45,6 +44,7 @@ def __init__(self, symbol="gm", general=False): >>> print(token_item.tokens[:10]) ['公式', '[FORMULA]', '如图', '[FIGURE]', '[FORMULA]', '约束条件', '公式', '[FORMULA]', '[SEP]', '[FORMULA]'] """ + def __init__(self, symbol="gm", general=False): self.symbol = symbol if general is True: self.tokenization_params = { @@ -72,8 +72,7 @@ def __call__(self, item): class GensimSegTokenizer(object): # pragma: no cover - def __init__(self, symbol="gms", depth=None, flatten=False, **kwargs): - """ + """ Parameters ---------- @@ -81,6 +80,7 @@ def __init__(self, symbol="gms", depth=None, flatten=False, **kwargs): gms fgm """ + def __init__(self, symbol="gms", depth=None, flatten=False, **kwargs): self.symbol = symbol self.tokenization_params = { "formula_params": { From 4c968c3237c649e1cc7663cfcd59f9a76154a85f Mon Sep 17 00:00:00 2001 From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com> Date: Mon, 23 Aug 2021 20:33:17 +0800 Subject: [PATCH 25/30] Create gensim_vec.py --- EduNLP/Pretrain/gensim_vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/EduNLP/Pretrain/gensim_vec.py b/EduNLP/Pretrain/gensim_vec.py index acc3d67c..afe37215 100644 --- a/EduNLP/Pretrain/gensim_vec.py +++ b/EduNLP/Pretrain/gensim_vec.py @@ -43,7 +43,7 @@ class GensimWordTokenizer(object): ... 
若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$") >>> print(token_item.tokens[:10]) ['公式', '[FORMULA]', '如图', '[FIGURE]', '[FORMULA]', '约束条件', '公式', '[FORMULA]', '[SEP]', '[FORMULA]'] - """ + """ def __init__(self, symbol="gm", general=False): self.symbol = symbol if general is True: From 24cc07d631f3b5929d6df2ef5912463a1dfa12f8 Mon Sep 17 00:00:00 2001 From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com> Date: Mon, 23 Aug 2021 20:34:40 +0800 Subject: [PATCH 26/30] Create gensim_vec.py --- EduNLP/Pretrain/gensim_vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/EduNLP/Pretrain/gensim_vec.py b/EduNLP/Pretrain/gensim_vec.py index afe37215..da5272f2 100644 --- a/EduNLP/Pretrain/gensim_vec.py +++ b/EduNLP/Pretrain/gensim_vec.py @@ -79,7 +79,7 @@ class GensimSegTokenizer(object): # pragma: no cover symbol: gms fgm - """ + """ def __init__(self, symbol="gms", depth=None, flatten=False, **kwargs): self.symbol = symbol self.tokenization_params = { From c7d451424b5f04028561ad5bc7eedcf48b19de5c Mon Sep 17 00:00:00 2001 From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com> Date: Mon, 23 Aug 2021 22:06:39 +0800 Subject: [PATCH 27/30] =?UTF-8?q?Create=20=E8=AF=AD=E4=B9=89=E6=88=90?= =?UTF-8?q?=E5=88=86=E5=88=86=E8=A7=A3.rst?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...11\346\210\220\345\210\206\345\210\206\350\247\243.rst" | 7 ------- 1 file changed, 7 deletions(-) diff --git "a/docs/source/tutorial/zh/seg/\350\257\255\344\271\211\346\210\220\345\210\206\345\210\206\350\247\243.rst" "b/docs/source/tutorial/zh/seg/\350\257\255\344\271\211\346\210\220\345\210\206\345\210\206\350\247\243.rst" index 0950dd87..e2106829 100644 --- "a/docs/source/tutorial/zh/seg/\350\257\255\344\271\211\346\210\220\345\210\206\345\210\206\350\247\243.rst" +++ "b/docs/source/tutorial/zh/seg/\350\257\255\344\271\211\346\210\220\345\210\206\345\210\206\350\247\243.rst" @@ -46,10 
+46,3 @@ >>> dict2str4sif(item, key_as_tag=False) '若复数$z=1+2 i+i^{3}$,则$|z|=$0$\\SIFSep$1$\\SIFSep$$\\sqrt{2}$$\\SIFSep$2' -详细示范 -++++++++++++++++++++++ - -.. toctree:: - :titlesonly: - - 语义成分分解的案例 <../../../build/blitz/utils/data.ipynb> From 9cd6f06429692a0416f37fd16c9d92dc867964d5 Mon Sep 17 00:00:00 2001 From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com> Date: Mon, 23 Aug 2021 22:08:50 +0800 Subject: [PATCH 28/30] Create d2v_bow_tfidf.ipynb --- examples/pretrain/gensim/d2v_bow_tfidf.ipynb | 519 ++++++++++++++----- 1 file changed, 383 insertions(+), 136 deletions(-) diff --git a/examples/pretrain/gensim/d2v_bow_tfidf.ipynb b/examples/pretrain/gensim/d2v_bow_tfidf.ipynb index 154279dc..bf70bec8 100644 --- a/examples/pretrain/gensim/d2v_bow_tfidf.ipynb +++ b/examples/pretrain/gensim/d2v_bow_tfidf.ipynb @@ -3,101 +3,109 @@ { "cell_type": "markdown", "source": [ - "# d2v_bow_tfidf" + "# 基于 gensim 的模型训练举例" ], "metadata": {} }, { "cell_type": "markdown", "source": [ - "## 1. load and tokenize test_items" + "## 概述\n", + "\n", + "您可以使用自己的数据和模型参数来训练和使用自己的模型。" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 导入模块" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 13, + "source": [ + "import json\n", + "from tqdm import tqdm\n", + "from EduNLP.Pretrain import GensimWordTokenizer, train_vector\n", + "from EduNLP.Vector import D2V, W2V\n", + "from EduNLP.SIF.segment import seg\n", + "from EduNLP.SIF.tokenization import tokenize\n", + "import time" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 准备模型训练数据" ], "metadata": {} }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 12, "source": [ - "from platform import processor\r\n", - "from gensim import corpora,models\r\n", - "# from collections import defaultdict\r\n", - "import json\r\n", - "from tqdm import tqdm\r\n", - "from EduNLP.Pretrain import GensimWordTokenizer,train_vector\r\n", - "from 
EduNLP.Vector import D2V\r\n", - "from EduNLP.SIF.segment import seg\r\n", - "from EduNLP.SIF.tokenization import tokenize\r\n", - "import time\r\n", - "\r\n", - "output_file_head = \"test\" # subject = english | liberal | science |all\r\n", - "baseDir = \"E:/Workustc/lunadata/d2v\"\r\n", - "# baseDir = \"/home/qlh/data_pretrain\"\r\n", - "work_file_path = baseDir + \"/data/\" + output_file_head + \"_raw.json\"\r\n", - "\r\n", - "test_items = [{'ques_content':'有公式$\\\\FormFigureID{wrong1?}$和公式$\\\\FormFigureBase64{wrong2?}$,如图$\\\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,若$x,y$满足约束条件$\\\\SIFSep$,则$z=x+7 y$的最大值为$\\\\SIFBlank$'},\r\n", - " {\"ques_content\":\"Human machine interface for lab abc computer applications\"},\r\n", - " {\"ques_content\": \"A survey of user opinion of computer system response time\"},\r\n", - " {\"ques_content\": \"The EPS user interface management system\"},\r\n", - " {\"ques_content\": \"System and human system engineering testing of EPS\"},\r\n", - " {\"ques_content\": \"Relation of user perceived response time to error measurement\"},\r\n", - " {\"ques_content\": \"The generation of random binary unordered trees\"},\r\n", - " {\"ques_content\": \"The intersection graph of paths in trees\"},\r\n", - " {\"ques_content\": \"Graph minors IV Widths of trees and well quasi ordering\"},\r\n", - " {\"ques_content\": \"Graph minors A survey\"}\r\n", - " ]\r\n", - "\r\n", - "def load_items():\r\n", - " for line in test_items:\r\n", - " yield line\r\n", - " # with open(work_file_path, 'r', encoding=\"utf-8\") as f:\r\n", - " # for line in f:\r\n", - " # yield json.loads(line)\r\n", - "\r\n", - "def data2Token():\r\n", - " # 线性分词,而不使用ast\r\n", - " tokenization_params = {\r\n", - " \"formula_params\": {\r\n", - " \"method\": \"linear\",\r\n", - " }\r\n", - " }\r\n", - " \r\n", - " token_items = []\r\n", - " count = 1\r\n", - " for item in tqdm(load_items(), \"sifing\"):\r\n", - " count = count + 1\r\n", - " # 
-------------------------------------------- # \r\n", - " # \"\"\"除文本、公式外,其他转化为特殊标记\"\"\"\r\n", - " # seg_ret = seg(item[\"ques_content\"], symbol=\"gmas\")\r\n", - " # token_item = tokenize(seg_ret, **tokenization_params)\r\n", - " tokenizer = GensimWordTokenizer(symbol=\"gmas\", general=True)\r\n", - " token_item = tokenizer(item[\"ques_content\"])\r\n", - "\r\n", - " # -------------------------------------------- # \r\n", - " if token_item:\r\n", - " # print(\"[i] = \", count)\r\n", - " # print(\"[tokens] = \", token_item)\r\n", - " token_items.append(token_item.tokens)\r\n", - " print(\"[data2Token] finish ========================> num = \",len(token_items))\r\n", - " return token_items\r\n", - "\r\n", - "token_items = data2Token()\r\n", - "token_items[0]" + "test_items = [{'ques_content':'有公式$\\\\FormFigureID{wrong1?}$和公式$\\\\FormFigureBase64{wrong2?}$,如图$\\\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,若$x,y$满足约束条件$\\\\SIFSep$,则$z=x+7 y$的最大值为$\\\\SIFBlank$'},\n", + " {\"ques_content\":\"Human machine interface for lab abc computer applications\"},\n", + " {\"ques_content\": \"A survey of user opinion of computer system response time\"},\n", + " {\"ques_content\": \"The EPS user interface management system\"},\n", + " {\"ques_content\": \"System and human system engineering testing of EPS\"},\n", + " {\"ques_content\": \"Relation of user perceived response time to error measurement\"},\n", + " {\"ques_content\": \"The generation of random binary unordered trees\"},\n", + " {\"ques_content\": \"The intersection graph of paths in trees\"},\n", + " {\"ques_content\": \"Graph minors IV Widths of trees and well quasi ordering\"},\n", + " {\"ques_content\": \"Graph minors A survey\"}\n", + " ]\n", + "\n", + "def load_items():\n", + " for line in test_items:\n", + " yield line\n", + "\n", + "\n", + "def data2Token():\n", + " # 线性分词\n", + " tokenization_params = {\n", + " \"formula_params\": {\n", + " \"method\": \"linear\",\n", + " }\n", + " }\n", + " \n", + " 
token_items = []\n", + " count = 1\n", + " for item in tqdm(load_items(), \"sifing\"):\n", + " count = count + 1\n", + " # -------------------------------------------- # \n", + " # \"\"\"除文本、公式外,其他转化为特殊标记\"\"\"\n", + " tokenizer = GensimWordTokenizer(symbol=\"gmas\", general=True)\n", + " token_item = tokenizer(item[\"ques_content\"])\n", + "\n", + " # -------------------------------------------- # \n", + " if token_item:\n", + " token_items.append(token_item.tokens)\n", + " print(\"[data2Token] finish ========================> num = \",len(token_items))\n", + " return token_items\n", + "\n", + "token_items = data2Token()\n", + "print(token_items[0])" ], "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ - "D:\\MySoftwares\\Anaconda\\envs\\data\\lib\\site-packages\\gensim\\similarities\\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.\n", - " warnings.warn(msg)\n", - "sifing: 10it [00:00, 18.57it/s]" + "sifing: 10it [00:00, 114.91it/s]" ] }, { "output_type": "stream", "name": "stdout", "text": [ - "[data2Token] finish ========================> num = 10\n" + "[data2Token] finish ========================> num = 10\n", + "['公式', '[FORMULA]', '公式', '[FORMULA]', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '[SEP]', 'z', '=', 'x', '+', '7', 'y', '最大值', '[MARK]']\n" ] }, { @@ -106,41 +114,13 @@ "text": [ "\n" ] - }, - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "['公式',\n", - " '[FORMULA]',\n", - " '公式',\n", - " '[FORMULA]',\n", - " '如图',\n", - " '[FIGURE]',\n", - " 'x',\n", - " ',',\n", - " 'y',\n", - " '约束条件',\n", - " '[SEP]',\n", - " 'z',\n", - " '=',\n", - " 'x',\n", - " '+',\n", - " '7',\n", - " 'y',\n", - " '最大值',\n", - " '[MARK]']" - ] - }, - "metadata": {}, - "execution_count": 1 } ], "metadata": {} }, { "cell_type": "code", - "execution_count": 2, 
+ "execution_count": 3, "source": [ "len(token_items[0])" ], @@ -153,7 +133,7 @@ ] }, "metadata": {}, - "execution_count": 2 + "execution_count": 3 } ], "metadata": { @@ -165,7 +145,67 @@ { "cell_type": "markdown", "source": [ - "## 2. train and test model by 'bow'" + "### 也可从文件导入数据\n", + "例如:" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 45, + "source": [ + "from EduData import get_data\n", + "\n", + "# 导入项目提供的数据,存放路径:\"../../data/\"\n", + "get_data(\"open-luna\", \"../../data/\")\n", + "\n", + "\n", + "def load_items():\n", + " with open(\"../../../data/OpenLUNA.json\", encoding=\"utf-8\") as f:\n", + " for line in f:\n", + " yield json.loads(line)" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "downloader, INFO http://base.ustc.edu.cn/data/OpenLUNA/OpenLUNA.json is saved as ../../data/OpenLUNA.json\n", + "downloader, INFO file existed, skipped\n" + ] + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 46, + "source": [ + "tokenizer = GensimWordTokenizer(symbol=\"gm\")\n", + "sif_items = []\n", + "for item in tqdm(load_items(), \"sifing\"):\n", + " sif_item = tokenizer(\n", + " item[\"stem\"]\n", + " )\n", + " if sif_item:\n", + " sif_items.append(sif_item.tokens)\n", + "\n", + "sif_items[0]\n" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## EduNLP.Vector.D2V 模块的训练方法" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "### 1. 
基于 bow 训练模型" ], "metadata": { "pycharm": { @@ -175,10 +215,8 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 6, "source": [ - "from EduNLP.Pretrain import train_vector\r\n", - "#10 dimension with fasstext method\r\n", "train_vector(token_items, \"../../../data/d2v/gensim_luna_stem_tf_\", method=\"bow\")" ], "outputs": [ @@ -197,7 +235,7 @@ ] }, "metadata": {}, - "execution_count": 3 + "execution_count": 6 } ], "metadata": { @@ -206,13 +244,18 @@ } } }, + { + "cell_type": "markdown", + "source": [ + "- 模型测试" + ], + "metadata": {} + }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 9, "source": [ - "from EduNLP.Vector import D2V\r\n", - "\r\n", - "d2v = D2V(\"../../../data/d2v/gensim_luna_stem_tf_bow.bin\", method = \"bow\")\r\n", + "d2v = D2V(\"../../../data/d2v/gensim_luna_stem_tf_bow.bin\", method = \"bow\")\n", "print(d2v(token_items[1]))" ], "outputs": [ @@ -233,16 +276,14 @@ { "cell_type": "markdown", "source": [ - "## 3. train and test model by 'tfidf'" + "### 2. 
基于 tfidf 训练模型" ], "metadata": {} }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "source": [ - "from EduNLP.Pretrain import train_vector\r\n", - "#10 dimension with fasstext method\r\n", "train_vector(token_items, \"../../../data/d2v/gensim_luna_stem_tf_\", method=\"tfidf\")" ], "outputs": [ @@ -262,46 +303,250 @@ ] }, "metadata": {}, - "execution_count": 5 + "execution_count": 7 } ], "metadata": {} }, + { + "cell_type": "markdown", + "source": [ + "- 模型测试" + ], + "metadata": {} + }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 11, "source": [ - "from EduNLP.Vector import D2V\r\n", - "\r\n", - "d2v = D2V(\"../../../data/d2v/gensim_luna_stem_tf_tfidf.bin\", method = \"tfidf\")\r\n", - "vec_size = d2v.vector_size\r\n", - "print(\"vec_size = \", vec_size)\r\n", - "d2v(token_items[1])" + "d2v = D2V(\"../../../data/d2v/gensim_luna_stem_tf_tfidf.bin\", method = \"tfidf\")\n", + "vec_size = d2v.vector_size\n", + "print(\"vec_size = \", vec_size)\n", + "print(d2v(token_items[1]))" ], "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ - "vec_size = 63\n" + "vec_size = 63\n", + "[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.37858374396389033, 0.37858374396389033, 0.37858374396389033, 0.2646186811599866, 0.37858374396389033, 0.2646186811599866, 0.37858374396389033, 0.37858374396389033, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n" + ] + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "### 3. 
基于 Doc2Vec 训练模型" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 18, + "source": [ + "# 10 dimension with doc2vec method\n", + "train_vector(token_items, \"../../../data/w2v/gensim_luna_stem_tf_\", 10, method=\"d2v\")\n" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "EduNLP, INFO Epoch #0: loss-0.0000 \n", + "EduNLP, INFO Epoch #1: loss-0.0000 \n", + "EduNLP, INFO Epoch #2: loss-0.0000 \n", + "EduNLP, INFO Epoch #3: loss-0.0000 \n", + "EduNLP, INFO Epoch #4: loss-0.0000 \n", + "EduNLP, INFO Epoch #5: loss-0.0000 \n", + "EduNLP, INFO Epoch #6: loss-0.0000 \n", + "EduNLP, INFO Epoch #7: loss-0.0000 \n", + "EduNLP, INFO Epoch #8: loss-0.0000 \n", + "EduNLP, INFO Epoch #9: loss-0.0000 \n", + "EduNLP, INFO model is saved to ../../../data/w2v/gensim_luna_stem_tf_d2v_10.bin\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ - "[(15, 0.37858374396389033),\n", - " (16, 0.37858374396389033),\n", - " (17, 0.37858374396389033),\n", - " (18, 0.2646186811599866),\n", - " (19, 0.37858374396389033),\n", - " (20, 0.2646186811599866),\n", - " (21, 0.37858374396389033),\n", - " (22, 0.37858374396389033)]" + "'../../../data/w2v/gensim_luna_stem_tf_d2v_10.bin'" ] }, "metadata": {}, - "execution_count": 6 + "execution_count": 18 + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 22, + "source": [ + "d2v = D2V(\"../../../data/w2v/gensim_luna_stem_tf_d2v_10.bin\", method=\"d2v\")\n", + "vec_size = d2v.vector_size\n", + "print(\"vec_size = \", vec_size)\n", + "print(d2v(token_items[1]))\n" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "vec_size = 10\n", + "[-0.00211227 0.00167636 0.02313529 -0.04260717 -0.01389424 -0.03898989\n", + " 0.01181044 0.01069339 -0.03934718 0.00038158]\n" + ] + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## EduNLP.Vector.W2V 模块支持的训练方法" + ], + "metadata": {} + }, + { + "cell_type": 
"markdown", + "source": [ + "### 1. 基于 FastText 训练模型" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 25, + "source": [ + "# 10 dimension with fasstext method\n", + "train_vector(token_items, \"../../../data/w2v/gensim_luna_stem_t_\",\n", + " 10, method=\"fasttext\")\n" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "EduNLP, INFO Epoch #0: loss-0.0000 \n", + "EduNLP, INFO Epoch #1: loss-0.0000 \n", + "EduNLP, INFO Epoch #2: loss-0.0000 \n", + "EduNLP, INFO Epoch #3: loss-0.0000 \n", + "EduNLP, INFO Epoch #4: loss-0.0000 \n", + "EduNLP, INFO model is saved to ../../../data/w2v/gensim_luna_stem_t_fasttext_10.bin\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'../../../data/w2v/gensim_luna_stem_t_fasttext_10.bin'" + ] + }, + "metadata": {}, + "execution_count": 25 + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "- 模型测试" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 41, + "source": [ + "w2v = W2V(\"../../../data/w2v/gensim_luna_stem_t_fasttext_10.bin\", method=\"fasttext\")\n", + "w2v[\"[FORMULA]\"]\n" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "array([-0.00434524, -0.00836839, -0.02108332, 0.00493213, 0.00461454,\n", + " 0.01070305, -0.01737931, 0.0210843 , -0.00525515, 0.00918209],\n", + " dtype=float32)" + ] + }, + "metadata": {}, + "execution_count": 41 + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "### 2. 
基于 cbow 训练模型" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 42, + "source": [ + "train_vector(token_items, \"../../../data/w2v/gensim_luna_stem_t_\", 10, method=\"cbow\")" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "EduNLP, INFO Epoch #0: loss-0.0000 \n", + "EduNLP, INFO Epoch #1: loss-0.0000 \n", + "EduNLP, INFO Epoch #2: loss-0.0000 \n", + "EduNLP, INFO Epoch #3: loss-0.0000 \n", + "EduNLP, INFO Epoch #4: loss-0.0000 \n", + "EduNLP, INFO model is saved to ../../../data/w2v/gensim_luna_stem_t_cbow_10.kv\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'../../../data/w2v/gensim_luna_stem_t_cbow_10.kv'" + ] + }, + "metadata": {}, + "execution_count": 42 + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "- 模型测试" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 43, + "source": [ + "w2v = W2V(\"../../../data/w2v/gensim_luna_stem_t_cbow_10.kv\",\n", + " method=\"fasttext\")\n", + "w2v[\"[FORMULA]\"]\n" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "array([-0.0156765 , 0.00329737, -0.04140369, -0.07689971, -0.01493463,\n", + " 0.02475806, -0.00877463, 0.05539609, -0.02750023, 0.0224804 ],\n", + " dtype=float32)" + ] + }, + "metadata": {}, + "execution_count": 43 } ], "metadata": {} @@ -309,9 +554,8 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" + "name": "python3", + "display_name": "Python 3.8.5 64-bit" }, "language_info": { "codemirror_mode": { @@ -323,7 +567,10 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.13" + "version": "3.8.5" + }, + "interpreter": { + "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a" } }, "nbformat": 4, From 5dd95d228f99dc4c79f29195c12a760be1c70221 Mon Sep 17 00:00:00 2001 From: BAOOOOOM 
<82091145+BAOOOOOM@users.noreply.github.com> Date: Mon, 23 Aug 2021 22:10:32 +0800 Subject: [PATCH 29/30] Create w2v_stem_text.ipynb --- examples/pretrain/gensim/w2v_stem_text.ipynb | 93 +++++++------------- 1 file changed, 34 insertions(+), 59 deletions(-) diff --git a/examples/pretrain/gensim/w2v_stem_text.ipynb b/examples/pretrain/gensim/w2v_stem_text.ipynb index 3c9b6ca9..0a0005cc 100644 --- a/examples/pretrain/gensim/w2v_stem_text.ipynb +++ b/examples/pretrain/gensim/w2v_stem_text.ipynb @@ -1,55 +1,28 @@ { "cells": [ - { - "cell_type": "markdown", - "source": [ - "# w2v_stem_text" - ], - "metadata": {} - }, { "cell_type": "code", "execution_count": 1, "source": [ - "import json\r\n", - "from tqdm import tqdm\r\n", - "\r\n", - "def load_items():\r\n", - " with open(\"../../../data/OpenLUNA.json\", encoding=\"utf-8\") as f:\r\n", - " for line in f:\r\n", - " yield json.loads(line)\r\n", - "\r\n", - "from EduNLP.Pretrain import train_vector, GensimWordTokenizer\r\n", - "\r\n", - "tokenizer = GensimWordTokenizer(symbol=\"fgm\")\r\n", - "\r\n", - "sif_items = [\r\n", - " tokenizer(item[\"stem\"]).tokens for item in tqdm(load_items(), \"sifing\")\r\n", - "]\r\n", - "\r\n", + "import json\n", + "from tqdm import tqdm\n", + "\n", + "def load_items():\n", + " with open(\"../../../data/OpenLUNA.json\", encoding=\"utf-8\") as f:\n", + " for line in f:\n", + " yield json.loads(line)\n", + "\n", + "from EduNLP.Pretrain import train_vector, GensimWordTokenizer\n", + "\n", + "tokenizer = GensimWordTokenizer(symbol=\"fgm\")\n", + "\n", + "sif_items = [\n", + " tokenizer(item[\"stem\"]).tokens for item in tqdm(load_items(), \"sifing\")\n", + "]\n", + "\n", "sif_items[0]" ], - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "d:\\env\\python3.8\\lib\\site-packages\\gensim\\similarities\\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package is unavailable. 
Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.\n", - " warnings.warn(msg)\n", - "sifing: 792it [00:00, 845.20it/s]\n" - ] - }, - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "['已知', '集合', '[FORMULA]', '[FORMULA]']" - ] - }, - "metadata": {}, - "execution_count": 1 - } - ], + "outputs": [], "metadata": { "collapsed": true } @@ -83,7 +56,7 @@ "cell_type": "code", "execution_count": 3, "source": [ - "# 100 dimension with skipgram method\r\n", + "# 100 dimension with skipgram method\n", "train_vector(sif_items, \"../../../data/w2v/gensim_luna_stem_t_\", 100)" ], "outputs": [ @@ -121,7 +94,7 @@ "cell_type": "code", "execution_count": 4, "source": [ - "# 50 dimension with cbow method\r\n", + "# 50 dimension with cbow method\n", "train_vector(sif_items, \"../../../data/w2v/gensim_luna_stem_t_\", 50, method=\"cbow\")" ], "outputs": [ @@ -159,7 +132,7 @@ "cell_type": "code", "execution_count": 5, "source": [ - "# 10 dimension with fasstext method\r\n", + "# 10 dimension with fasttext method\n", "train_vector(sif_items, \"../../../data/w2v/gensim_luna_stem_t_\", 10, method=\"fasttext\")" ], "outputs": [ @@ -197,9 +170,9 @@ "cell_type": "code", "execution_count": 6, "source": [ - "from EduNLP.Vector import W2V\r\n", - "\r\n", - "w2v = W2V(\"../../../data/w2v/gensim_luna_stem_t_sg_100.kv\")\r\n", + "from EduNLP.Vector import W2V\n", + "\n", + "w2v = W2V(\"../../../data/w2v/gensim_luna_stem_t_sg_100.kv\")\n", "w2v[\"[FORMULA]\"]" ], "outputs": [ @@ -244,21 +217,23 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" + "name": "python3", + "display_name": "Python 3.8.5 64-bit" }, "language_info": { + "name": "python", + "version": "3.8.5", + "mimetype": "text/x-python", "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", + "pygments_lexer": "ipython3", 
"nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" + "file_extension": ".py" + }, + "interpreter": { + "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a" } }, "nbformat": 4, From f55a8760174b260d7c256d757d19a4262f6db77c Mon Sep 17 00:00:00 2001 From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com> Date: Mon, 23 Aug 2021 22:19:08 +0800 Subject: [PATCH 30/30] Create sif.ipynb --- examples/sif/sif.ipynb | 848 ++++++++++++++++++++++++++++++----------- 1 file changed, 618 insertions(+), 230 deletions(-) diff --git a/examples/sif/sif.ipynb b/examples/sif/sif.ipynb index 3376cd6d..63685077 100644 --- a/examples/sif/sif.ipynb +++ b/examples/sif/sif.ipynb @@ -2,35 +2,39 @@ "cells": [ { "cell_type": "markdown", + "source": [ + "# SIF4Sci 使用示例\n", + "\n", + "## 概述\n", + "\n", + "SIFSci 是一个提供试题切分和标注的模块。它可定制化的将文本切分为令牌(token)序列,为后续试题的向量化做准备。" + ], "metadata": { "collapsed": true, "pycharm": { "name": "#%% md\n" } - }, + } + }, + { + "cell_type": "markdown", "source": [ - "# Code for beginner to learn how to use SIF4Sci\n", + "本文将以下面这道题目(来源自 LUNA 题库)为例,展示 SIFSci 的使用方法。 \n", "\n", - "In this notebook, we will show you the basic usage to apply SIF to prepare data for conducting scientific experiments.\n", - "\n", - "We use the demo item (an exercise from LUNA) shown in the following Figure.\n", - "![Figure](../../asset/_static/item.png).\n", - "The SIF expression of this item can be written as follows:" - ] + "![Figure](../../asset/_static/item.png)" + ], + "metadata": {} }, { - "cell_type": "code", - "execution_count": 1, - "outputs": [ - { - "data": { - "text/plain": "'如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\\\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\\\SIFChoice$$\\\\FigureID{1}$'" - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } + "cell_type": "markdown", + "source": [ + "- 
符合 [SIF 格式](https://edunlp.readthedocs.io/en/docs_dev/tutorial/zh/sif.html) 的题目录入格式为:" ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 5, "source": [ "item = {\n", " \"stem\": r\"如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\SIFChoice$$\\FigureID{1}$\",\n", @@ -38,6 +42,18 @@ "}\n", "item[\"stem\"]" ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\\\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\\\SIFChoice$$\\\\FigureID{1}$'" + ] + }, + "metadata": {}, + "execution_count": 5 + } + ], "metadata": { "collapsed": false, "pycharm": { @@ -45,26 +61,35 @@ } } }, + { + "cell_type": "markdown", + "source": [ + "- 加载图片:`$\\\\FigureID{1}$`" + ], + "metadata": {} + }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 6, + "source": [ + "from PIL import Image\n", + "img = Image.open(\"../../asset/_static/item_figure.png\")\n", + "figures = {\"1\": img}\n", + "img" + ], "outputs": [ { + "output_type": "execute_result", "data": { - "text/plain": "", - "image/png": "\n" + "text/plain": [ + "" + ], + "image/png": "" }, - "execution_count": 2, "metadata": {}, - "output_type": "execute_result" + "execution_count": 6 } ], - "source": [ - "from PIL import Image\n", - "img = Image.open(\"../../asset/_static/item_figure.png\")\n", - "figures = {\"1\": img}\n", - "img" - ], "metadata": { "collapsed": false, "pycharm": { @@ -75,7 +100,7 @@ { "cell_type": "markdown", "source": [ - "## Preparation" + "## 导入模块" ], "metadata": { "collapsed": false @@ -83,11 +108,11 @@ }, { "cell_type": "code", - "execution_count": 3, - "outputs": [], + "execution_count": 2, "source": [ - "from EduNLP.SIF import sif4sci" + "from EduNLP.SIF import 
sif4sci, is_sif, to_sif" ], + "outputs": [], "metadata": { "collapsed": false, "pycharm": { @@ -98,23 +123,85 @@ { "cell_type": "markdown", "source": [ - "## Verification\n", - "\n", - "## Auto Correction" + "## 验证题目格式" ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 7, + "source": [ + "is_sif(item['stem'])" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": {}, + "execution_count": 7 } - } + ], + "metadata": {} }, { "cell_type": "markdown", "source": [ - "## Segment and Tokenization\n", + "- 若发现题目因为公式没有包含在 `$$` 中而不符合 SIF 格式,则可以使用 `to_sif` 模块转成标准格式。示例如下:" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 8, + "source": [ + "text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...'\n", + "is_sif(text)" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "False" + ] + }, + "metadata": {}, + "execution_count": 8 + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 9, + "source": [ + "text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...'\n", + "to_sif(text)\n" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'某校一个课外学习小组为研究某作物的发芽率$y$和温度$x$(单位...'" + ] + }, + "metadata": {}, + "execution_count": 9 + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## 题目切分及令牌化\n", "\n", - "After we verify an item obeys SIF, we can further process it, i.e., segment and tokenization." 
现在我们得到了符合标准格式的题目文本,接下来可以对题目做进一步的预处理,例如:切分和令牌化。" ], "metadata": { "collapsed": false, @@ -126,7 +213,7 @@ { "cell_type": "markdown", "source": [ - "### Segment" + "### 题目切分" ], "metadata": { "collapsed": false, @@ -135,162 +222,220 @@ } } }, + { + "cell_type": "markdown", + "source": [ + "#### 基本切分\n", + "分离文本、公式、图片和特殊符号。" + ], + "metadata": {} + }, { "cell_type": "code", + "execution_count": 12, "source": [ - "sif4sci(item[\"stem\"], figures=figures, tokenization=False, symbol=\"tfgm\")\n" + "segments = sif4sci(item[\"stem\"], figures=figures, tokenization=False)\n", + "segments" ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - }, - "execution_count": 4, "outputs": [ { + "output_type": "execute_result", "data": { - "text/plain": "['[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[MARK]', '[FIGURE]']" + "text/plain": [ + "['如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形', 'ABC', '的斜边', 'BC', ', 直角边', 'AB', ', ', 'AC', '.', '\\\\bigtriangleup ABC', '的三边所围成的区域记为', 'I', ',黑色部分记为', 'II', ', 其余部分记为', 'III', '.在整个图形中随机取一点,此点取自', 'I,II,III', '的概率分别记为', 'p_1,p_2,p_3', ',则', '\\\\SIFChoice', \\FigureID{1}]" + ] }, - "execution_count": 4, "metadata": {}, - "output_type": "execute_result" + "execution_count": 12 } - ] + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "- 文本部分" + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 13, + "source": [ + "segments.text_segments" + ], "outputs": [ { + "output_type": "execute_result", "data": { - "text/plain": "['如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形', 'ABC', '的斜边', 'BC', ', 直角边', 'AB', ', ', 'AC', '.', '\\\\bigtriangleup ABC', '的三边所围成的区域记为', 'I', ',黑色部分记为', 'II', ', 其余部分记为', 'III', '.在整个图形中随机取一点,此点取自', 'I,II,III', '的概率分别记为', 
'p_1,p_2,p_3', ',则', '\\\\SIFChoice', \\FigureID{1}]" + "text/plain": [ + "['如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形',\n", + " '的斜边',\n", + " ', 直角边',\n", + " ', ',\n", + " '.',\n", + " '的三边所围成的区域记为',\n", + " ',黑色部分记为',\n", + " ', 其余部分记为',\n", + " '.在整个图形中随机取一点,此点取自',\n", + " '的概率分别记为',\n", + " ',则']" + ] }, - "execution_count": 5, "metadata": {}, - "output_type": "execute_result" + "execution_count": 13 } ], + "metadata": {} + }, + { + "cell_type": "markdown", "source": [ - "segments = sif4sci(item[\"stem\"], figures=figures, tokenization=False)\n", - "segments" + "- 公式部分" ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } + "metadata": {} }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 15, + "source": [ + "segments.formula_segments\n" + ], "outputs": [ { + "output_type": "execute_result", "data": { - "text/plain": "['如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形',\n '的斜边',\n ', 直角边',\n ', ',\n '.',\n '的三边所围成的区域记为',\n ',黑色部分记为',\n ', 其余部分记为',\n '.在整个图形中随机取一点,此点取自',\n '的概率分别记为',\n ',则']" + "text/plain": [ + "['ABC',\n", + " 'BC',\n", + " 'AB',\n", + " 'AC',\n", + " '\\\\bigtriangleup ABC',\n", + " 'I',\n", + " 'II',\n", + " 'III',\n", + " 'I,II,III',\n", + " 'p_1,p_2,p_3']" + ] }, - "execution_count": 6, "metadata": {}, - "output_type": "execute_result" + "execution_count": 15 } ], + "metadata": {} + }, + { + "cell_type": "markdown", "source": [ - "segments.text_segments" + "- 图片部分" ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } + "metadata": {} }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 16, + "source": [ + "segments.figure_segments" + ], "outputs": [ { + "output_type": "execute_result", "data": { - "text/plain": "[\\FigureID{1}]" + "text/plain": [ + "[\\FigureID{1}]" + ] }, - "execution_count": 7, "metadata": {}, - "output_type": "execute_result" + "execution_count": 16 } ], - "source": [ - "segments.figure_segments" - ], - "metadata": 
{ - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } + "metadata": {} }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 17, + "source": [ + "segments.figure_segments[0].figure" + ], "outputs": [ { + "output_type": "execute_result", "data": { - "text/plain": "", - "image/png": "\n" + "text/plain": [ + "" + ], + "image/png": "" }, - "execution_count": 8, "metadata": {}, - "output_type": "execute_result" + "execution_count": 17 } ], + "metadata": {} + }, + { + "cell_type": "markdown", "source": [ - "segments.figure_segments[0].figure" + "- 特殊符号" ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } + "metadata": {} }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 19, + "source": [ + "segments.ques_mark_segments" + ], "outputs": [ { + "output_type": "execute_result", "data": { - "text/plain": "['ABC',\n 'BC',\n 'AB',\n 'AC',\n '\\\\bigtriangleup ABC',\n 'I',\n 'II',\n 'III',\n 'I,II,III',\n 'p_1,p_2,p_3']" + "text/plain": [ + "['\\\\SIFChoice']" + ] }, - "execution_count": 9, "metadata": {}, - "output_type": "execute_result" + "execution_count": 19 } ], + "metadata": {} + }, + { + "cell_type": "markdown", "source": [ - "segments.formula_segments" + "#### 标记化切分 \n", + "如果您不注重题目文本和公式的具体内容,仅仅是对题目的整体(或部分)构成感兴趣,那么可以通过修改 `symbol` 参数来将不同的成分转化成特定标记,方便您的研究。" ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + " - symbol:\n", + " - \"t\": text\n", + " - \"f\": formula\n", + " - \"g\": figure\n", + " - \"m\": question mark" + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, + "source": [ + "sif4sci(item[\"stem\"], figures=figures, tokenization=False, symbol=\"tfgm\")" + ], "outputs": [ { + "output_type": "execute_result", "data": { - "text/plain": "['\\\\SIFChoice']" + "text/plain": [ + "['[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', 
'[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[MARK]', '[FIGURE]']" + ] }, - "execution_count": 10, "metadata": {}, - "output_type": "execute_result" + "execution_count": 11 } ], - "source": [ - "segments.ques_mark_segments" - ], "metadata": { "collapsed": false, "pycharm": { @@ -301,7 +446,11 @@ { "cell_type": "markdown", "source": [ - "### Tokenization" + "### 令牌化\n", + "\n", + "为了方便后续向量化表征试题,本模块提供题目文本的令牌化解析(Tokenization),即将题目转换成令牌序列。 \n", + "\n", + "根据构成题目的元素类型,解析功能分为 **“文本解析”** 和 **“公式解析”** 两部分。更具体的过程解析参见 [令牌化](../Tokenizer/tokenizer.ipynb)。" ], "metadata": { "collapsed": false, @@ -313,10 +462,10 @@ { "cell_type": "code", "execution_count": 20, - "outputs": [], "source": [ "tokens = sif4sci(item[\"stem\"], figures=figures, tokenization=True)" ], + "outputs": [], "metadata": { "collapsed": false, "pycharm": { @@ -327,7 +476,7 @@ { "cell_type": "markdown", "source": [ - "#### Text" + "- 文本解析结果" ], "metadata": { "collapsed": false, @@ -339,19 +488,53 @@ { "cell_type": "code", "execution_count": 12, + "source": [ + "tokens.text_tokens" + ], "outputs": [ { + "output_type": "execute_result", "data": { - "text/plain": "['如图',\n '古希腊',\n '数学家',\n '希波',\n '克拉底',\n '研究',\n '几何图形',\n '此图',\n '三个',\n '半圆',\n '三个',\n '半圆',\n '直径',\n '直角三角形',\n '斜边',\n '直角',\n '三边',\n '围成',\n '区域',\n '记',\n '黑色',\n '记',\n '其余部分',\n '记',\n '图形',\n '中',\n '随机',\n '取',\n '一点',\n '此点',\n '取自',\n '概率',\n '记']" + "text/plain": [ + "['如图',\n", + " '古希腊',\n", + " '数学家',\n", + " '希波',\n", + " '克拉底',\n", + " '研究',\n", + " '几何图形',\n", + " '此图',\n", + " '三个',\n", + " '半圆',\n", + " '三个',\n", + " '半圆',\n", + " '直径',\n", + " '直角三角形',\n", + " '斜边',\n", + " '直角',\n", + " '三边',\n", + " '围成',\n", + " '区域',\n", + " '记',\n", + " '黑色',\n", + " '记',\n", + " '其余部分',\n", + " '记',\n", + " '图形',\n", + " '中',\n", + " '随机',\n", + " '取',\n", + " '一点',\n", + " '此点',\n", + " '取自',\n", 
+ " '概率',\n", + " '记']" + ] }, - "execution_count": 12, "metadata": {}, - "output_type": "execute_result" + "execution_count": 12 } ], - "source": [ - "tokens.text_tokens" - ], "metadata": { "collapsed": false, "pycharm": { @@ -362,7 +545,7 @@ { "cell_type": "markdown", "source": [ - "#### Formula" + "#### 公式解析结果" ], "metadata": { "collapsed": false, @@ -374,19 +557,45 @@ { "cell_type": "code", "execution_count": 13, + "source": [ + "tokens.formula_tokens" + ], "outputs": [ { + "output_type": "execute_result", "data": { - "text/plain": "['ABC',\n 'BC',\n 'AB',\n 'AC',\n '\\\\bigtriangleup',\n 'ABC',\n 'I',\n 'II',\n 'III',\n 'I',\n ',',\n 'II',\n ',',\n 'III',\n 'p',\n '_',\n '1',\n ',',\n 'p',\n '_',\n '2',\n ',',\n 'p',\n '_',\n '3']" + "text/plain": [ + "['ABC',\n", + " 'BC',\n", + " 'AB',\n", + " 'AC',\n", + " '\\\\bigtriangleup',\n", + " 'ABC',\n", + " 'I',\n", + " 'II',\n", + " 'III',\n", + " 'I',\n", + " ',',\n", + " 'II',\n", + " ',',\n", + " 'III',\n", + " 'p',\n", + " '_',\n", + " '1',\n", + " ',',\n", + " 'p',\n", + " '_',\n", + " '2',\n", + " ',',\n", + " 'p',\n", + " '_',\n", + " '3']" + ] }, - "execution_count": 13, "metadata": {}, - "output_type": "execute_result" + "execution_count": 13 } ], - "source": [ - "tokens.formula_tokens" - ], "metadata": { "collapsed": false, "pycharm": { @@ -395,18 +604,22 @@ } }, { - "cell_type": "code", - "execution_count": 14, - "outputs": [ - { - "data": { - "text/plain": "['A',\n 'B',\n 'C',\n 'B',\n 'C',\n 'A',\n 'B',\n 'A',\n 'C',\n '\\\\bigtriangleup',\n 'A',\n 'B',\n 'C',\n 'I',\n 'I',\n 'I',\n 'I',\n 'I',\n 'I',\n 'I',\n ',',\n 'I',\n 'I',\n ',',\n 'I',\n 'I',\n 'I',\n 'p',\n '1',\n '_',\n ',',\n 'p',\n '2',\n '_',\n ',',\n 'p',\n '3',\n '_']" - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } + "cell_type": "markdown", + "source": [ + "- 自定义参数,得到定制化解析结果" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "(1)如果您想按 latex 
语法标记拆分公式的各个部分,并得到顺序序列结果,输出方法(`method`)可以选择:`linear`" ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 37, "source": [ "sif4sci(\n", " item[\"stem\"],\n", @@ -414,12 +627,47 @@ " tokenization=True,\n", " tokenization_params={\n", " \"formula_params\": {\n", - " \"method\": \"ast\",\n", - " \"return_type\": \"list\"\n", + " \"method\": \"linear\",\n", " }\n", " }\n", ").formula_tokens" ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['ABC',\n", + " 'BC',\n", + " 'AB',\n", + " 'AC',\n", + " '\\\\bigtriangleup',\n", + " 'ABC',\n", + " 'I',\n", + " 'II',\n", + " 'III',\n", + " 'I',\n", + " ',',\n", + " 'II',\n", + " ',',\n", + " 'III',\n", + " 'p',\n", + " '_',\n", + " '1',\n", + " ',',\n", + " 'p',\n", + " '_',\n", + " '2',\n", + " ',',\n", + " 'p',\n", + " '_',\n", + " '3']" + ] + }, + "metadata": {}, + "execution_count": 37 + } + ], "metadata": { "collapsed": false, "pycharm": { @@ -428,18 +676,17 @@ } }, { - "cell_type": "code", - "execution_count": 15, - "outputs": [ - { - "data": { - "text/plain": "['mathord',\n 'mathord',\n 'mathord',\n 'mathord',\n 'mathord',\n 'mathord',\n 'mathord',\n 'mathord',\n 'mathord',\n '\\\\bigtriangleup',\n 'mathord',\n 'mathord',\n 'mathord',\n 'mathord',\n 'mathord',\n 'mathord',\n 'mathord',\n 'mathord',\n 'mathord',\n 'mathord',\n ',',\n 'mathord',\n 'mathord',\n ',',\n 'mathord',\n 'mathord',\n 'mathord',\n 'mathord',\n 'textord',\n '_',\n ',',\n 'mathord',\n 'textord',\n '_',\n ',',\n 'mathord',\n 'textord',\n '_']" - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } + "cell_type": "markdown", + "source": [ + "(2) 如果您想得到公式解析出的语法分析树序列,输出方法可以选择:`ast`\n", + "> 抽象语法分析树,简称语法树(Syntax tree),是源代码语法结构的一种抽象表示。它以树状的形式表现编程语言的语法结构,树上的每个节点都表示源代码中的一种结构。 \n", + "> 因此,ast 可以看做是公式的语法结构表征。" ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 39, "source": [ "sif4sci(\n", " item[\"stem\"],\n", @@ -448,11 +695,30 @@ " 
tokenization_params={\n", " \"formula_params\":{\n", " \"method\": \"ast\",\n", - " \"return_type\": \"list\",\n", - " \"ord2token\": True\n", " }\n", " }\n", - ").formula_tokens" + ").formula_tokens\n" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ]" + ] + }, + "metadata": {}, + "execution_count": 39 + } ], "metadata": { "collapsed": false, @@ -462,53 +728,78 @@ } }, { - "cell_type": "code", - "execution_count": 16, - "outputs": [ - { - "data": { - "text/plain": "['mathord_0',\n 'mathord_1',\n 'mathord_2',\n 'mathord_0',\n 'mathord_1',\n 'mathord_0',\n 'mathord_1',\n 'mathord_0',\n 'mathord_1',\n '\\\\bigtriangleup',\n 'mathord_0',\n 'mathord_1',\n 'mathord_2',\n 'mathord_0',\n 'mathord_0',\n 'mathord_0',\n 'mathord_0',\n 'mathord_0',\n 'mathord_0',\n 'mathord_0',\n ',',\n 'mathord_0',\n 'mathord_0',\n ',',\n 'mathord_0',\n 'mathord_0',\n 'mathord_0',\n 'mathord_0',\n 'textord',\n '_',\n ',',\n 'mathord_0',\n 'textord',\n '_',\n ',',\n 'mathord_0',\n 'textord',\n '_']" - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } + "cell_type": "markdown", + "source": [ + "- 语法树展示:" ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 109, "source": [ - "sif4sci(\n", + "f = sif4sci(\n", " item[\"stem\"],\n", " figures=figures,\n", " tokenization=True,\n", " tokenization_params={\n", " \"formula_params\":{\n", " \"method\": \"ast\",\n", - " \"return_type\": \"list\",\n", + " \"return_type\": \"ast\",\n", " \"ord2token\": True,\n", - " \"var_numbering\": True\n", + " \"var_numbering\": True,\n", " }\n", " }\n", - ").formula_tokens" + ").formula_tokens\n", + "f\n" ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } - }, - { - "cell_type": "code", - "execution_count": 17, "outputs": [ { + "output_type": "execute_result", "data": { - "text/plain": "[,\n ,\n ,\n 
,\n ,\n ,\n ,\n ,\n ,\n ]" + "text/plain": [ + "[,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ]" + ] }, - "execution_count": 17, "metadata": {}, - "output_type": "execute_result" + "execution_count": 109 } ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 110, + "source": [ + "for i in range(0, len(f)):\n", + " ForestPlotter().export(\n", + " f[i], root_list=[node for node in f[i]],\n", + " )\n", + "# plt.show()\n" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "(3)如果您只是关心公式的结构和类型,并不关心变量具体是什么,比如二元二次方程 `x^2 + y = 1` ,它从公式结构和类型上来说,和 `w^2 + z = 1` 没有区别。 \n", + "此时,您可以设置如下参数:`ord2token = True`,将公式变量名转换成 token" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 40, "source": [ "sif4sci(\n", " item[\"stem\"],\n", @@ -517,13 +808,61 @@ " tokenization_params={\n", " \"formula_params\":{\n", " \"method\": \"ast\",\n", - " \"return_type\": \"formula\",\n", + " \"return_type\": \"list\",\n", " \"ord2token\": True,\n", - " \"var_numbering\": True\n", " }\n", " }\n", ").formula_tokens" ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " '\\\\bigtriangleup',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " ',',\n", + " 'mathord',\n", + " 'mathord',\n", + " ',',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'textord',\n", + " '\\\\supsub',\n", + " ',',\n", + " 'mathord',\n", + " 'textord',\n", + " '\\\\supsub',\n", + " ',',\n", + " 'mathord',\n", + " 'textord',\n", + " '\\\\supsub']" + ] + }, + "metadata": {}, + "execution_count": 40 + } + ], "metadata": { 
"collapsed": false, "pycharm": { @@ -532,18 +871,15 @@ } }, { - "cell_type": "code", - "execution_count": 18, - "outputs": [ - { - "data": { - "text/plain": "[,\n ,\n ,\n ,\n ,\n ,\n ,\n ,\n ,\n ]" - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } + "cell_type": "markdown", + "source": [ + "(4) 如果您除了 (3) 中提供的功能之外,还需要区分不同的变量。此时可以另外设置参数:`var_numbering=True`" ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 44, "source": [ "sif4sci(\n", " item[\"stem\"],\n", @@ -552,14 +888,61 @@ " tokenization_params={\n", " \"formula_params\":{\n", " \"method\": \"ast\",\n", - " \"return_type\": \"ast\",\n", " \"ord2token\": True,\n", + " \"return_type\": \"list\",\n", " \"var_numbering\": True\n", " }\n", " }\n", - ").formula_tokens\n", - "\n", - "#### Figure" + ").formula_tokens" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['mathord_0',\n", + " 'mathord_1',\n", + " 'mathord_2',\n", + " 'mathord_1',\n", + " 'mathord_2',\n", + " 'mathord_0',\n", + " 'mathord_1',\n", + " 'mathord_0',\n", + " 'mathord_2',\n", + " '\\\\bigtriangleup',\n", + " 'mathord_0',\n", + " 'mathord_1',\n", + " 'mathord_2',\n", + " 'mathord_3',\n", + " 'mathord_3',\n", + " 'mathord_3',\n", + " 'mathord_3',\n", + " 'mathord_3',\n", + " 'mathord_3',\n", + " 'mathord_3',\n", + " ',',\n", + " 'mathord_3',\n", + " 'mathord_3',\n", + " ',',\n", + " 'mathord_3',\n", + " 'mathord_3',\n", + " 'mathord_3',\n", + " 'mathord_4',\n", + " 'textord',\n", + " '\\\\supsub',\n", + " ',',\n", + " 'mathord_4',\n", + " 'textord',\n", + " '\\\\supsub',\n", + " ',',\n", + " 'mathord_4',\n", + " 'textord',\n", + " '\\\\supsub']" + ] + }, + "metadata": {}, + "execution_count": 44 + } ], "metadata": { "collapsed": false, @@ -571,9 +954,9 @@ { "cell_type": "markdown", "source": [ - "## Downstream tasks\n", + "## 综合训练\n", "\n", - "### Word to vector" + "综合上述方法,将题目转换成令牌序列,为后续向量化做准备。" ], "metadata": { "collapsed": false, @@ -584,20 
+967,23 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 96, + "source": [ + "sif4sci(item[\"stem\"], figures=figures, tokenization=True,\n", + " symbol=\"fgm\")" + ], "outputs": [ { + "output_type": "execute_result", "data": { - "text/plain": "['如图', '古希腊', '数学家', '希波', '克拉底', '研究', '几何图形', '此图', '三个', '半圆', '三个', '半圆', '直径', '直角三角形', '[FORMULA]', '斜边', '[FORMULA]', '直角', '[FORMULA]', '[FORMULA]', '[FORMULA]', '三边', '围成', '区域', '记', '[FORMULA]', '黑色', '记', '[FORMULA]', '其余部分', '记', '[FORMULA]', '图形', '中', '随机', '取', '一点', '此点', '取自', '[FORMULA]', '概率', '记', '[FORMULA]', '[MARK]', '[FIGURE]']" + "text/plain": [ + "['如图', '古希腊', '数学家', '希波', '克拉底', '研究', '几何图形', '此图', '三个', '半圆', '三个', '半圆', '直径', '直角三角形', '[FORMULA]', '斜边', '[FORMULA]', '直角', '[FORMULA]', '[FORMULA]', '[FORMULA]', '三边', '围成', '区域', '记', '[FORMULA]', '黑色', '记', '[FORMULA]', '其余部分', '记', '[FORMULA]', '图形', '中', '随机', '取', '一点', '此点', '取自', '[FORMULA]', '概率', '记', '[FORMULA]', '[MARK]', '[FIGURE]']" + ] }, - "execution_count": 19, "metadata": {}, - "output_type": "execute_result" + "execution_count": 96 } ], - "source": [ - "sif4sci(item[\"stem\"], figures=figures, tokenization=True, symbol=\"fgm\")" - ], "metadata": { "collapsed": false, "pycharm": { @@ -608,23 +994,25 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" + "name": "python3", + "display_name": "Python 3.8.5 64-bit" }, "language_info": { + "name": "python", + "version": "3.8.5", + "mimetype": "text/x-python", "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", + "pygments_lexer": "ipython3", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" + "file_extension": ".py" + }, + "interpreter": { + "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a" } }, "nbformat": 4, - "nbformat_minor": 0 + 
"nbformat_minor": 2 }