From 87b2b635a1f26d5b099e8f3cbedd00e03afb2a91 Mon Sep 17 00:00:00 2001
From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com>
Date: Mon, 23 Aug 2021 12:26:33 +0800
Subject: [PATCH 01/30] =?UTF-8?q?Create=20=E4=BB=A4=E7=89=8C=E5=8C=96.rst?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.../zh/tokenize/\344\273\244\347\211\214\345\214\226.rst" | 1 +
1 file changed, 1 insertion(+)
diff --git "a/docs/source/tutorial/zh/tokenize/\344\273\244\347\211\214\345\214\226.rst" "b/docs/source/tutorial/zh/tokenize/\344\273\244\347\211\214\345\214\226.rst"
index 9782bece..276b219a 100644
--- "a/docs/source/tutorial/zh/tokenize/\344\273\244\347\211\214\345\214\226.rst"
+++ "b/docs/source/tutorial/zh/tokenize/\344\273\244\347\211\214\345\214\226.rst"
@@ -24,5 +24,6 @@ Examples
:titlesonly:
../tokenization/TextTokenizer
+ ../tokenization/PureTextTokenizer
../tokenization/GensimSegTokenizer
../tokenization/GensimWordTokenizer
From 4816fa43970c8ff62e08befc6509f353363413bf Mon Sep 17 00:00:00 2001
From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com>
Date: Mon, 23 Aug 2021 12:35:38 +0800
Subject: [PATCH 02/30] Create PureTextTokenizer.ipynb
---
.../zh/tokenization/PureTextTokenizer.ipynb | 32 +++++++++++++++++++
1 file changed, 32 insertions(+)
create mode 100644 docs/source/tutorial/zh/tokenization/PureTextTokenizer.ipynb
diff --git a/docs/source/tutorial/zh/tokenization/PureTextTokenizer.ipynb b/docs/source/tutorial/zh/tokenization/PureTextTokenizer.ipynb
new file mode 100644
index 00000000..14f955b4
--- /dev/null
+++ b/docs/source/tutorial/zh/tokenization/PureTextTokenizer.ipynb
@@ -0,0 +1,32 @@
+PureTextTokenizer
+=================
+
+即纯净型文本令牌解析器,在默认情况下对传入的item中的图片、标签、分隔符、题目空缺符等部分则转换成特殊字符进行保护,并对特殊公式(例如:$\\FormFigureID{...}$, $\\FormFigureBase64{...}$)进行筛除,从而对文本、纯文本公式进行令牌化操作。此外,此令牌解析器对文本、公式均采用线性的分析方法,并提供的key参数用于对传入的item进行预处理,待未来根据需求进行开发。
+
+
+Examples
+----------
+
+::
+
+ >>> tokenizer = PureTextTokenizer()
+ >>> items = ["有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\
+ ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$"]
+ >>> tokens = tokenizer(items)
+ >>> next(tokens)[:10]
+ ['公式', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[SEP]', 'z']
+ >>> items = ["已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$"]
+ >>> tokens = tokenizer(items)
+ >>> next(tokens) # doctest: +NORMALIZE_WHITESPACE
+ ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<',
+ '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',',
+ '\\quad', 'A', '\\cap', 'B', '=']
+ >>> items = [{
+ ... "stem": "已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$",
+ ... "options": ["1", "2"]
+ ... }]
+ >>> tokens = tokenizer(items, key=lambda x: x["stem"])
+ >>> next(tokens) # doctest: +NORMALIZE_WHITESPACE
+ ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<',
+ '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',',
+ '\\quad', 'A', '\\cap', 'B', '=']
From d46aa06c7f985ab93f1fb75fbe2d56adbc675b28 Mon Sep 17 00:00:00 2001
From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com>
Date: Mon, 23 Aug 2021 12:38:01 +0800
Subject: [PATCH 03/30] =?UTF-8?q?Create=20=E4=BB=A4=E7=89=8C=E5=8C=96.rst?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.../zh/tokenize/\344\273\244\347\211\214\345\214\226.rst" | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git "a/docs/source/tutorial/zh/tokenize/\344\273\244\347\211\214\345\214\226.rst" "b/docs/source/tutorial/zh/tokenize/\344\273\244\347\211\214\345\214\226.rst"
index 276b219a..230aa200 100644
--- "a/docs/source/tutorial/zh/tokenize/\344\273\244\347\211\214\345\214\226.rst"
+++ "b/docs/source/tutorial/zh/tokenize/\344\273\244\347\211\214\345\214\226.rst"
@@ -17,7 +17,7 @@ Examples
-通过查看"./EduNLP/Tokenizer/tokenizer.py"及"./EduNLP/Pretrain/gensim_vec.py"可以查看更多令牌化器,下面是一个完整的令牌化器列表
+通过查看 ``./EduNLP/Tokenizer/tokenizer.py`` 及 ``./EduNLP/Pretrain/gensim_vec.py`` 可以查看更多令牌化器,下面是一个完整的令牌化器列表
.. toctree::
:maxdepth: 1
From bc8db6a143bc1df776d19f92b73fb8cff2663b82 Mon Sep 17 00:00:00 2001
From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com>
Date: Mon, 23 Aug 2021 12:39:45 +0800
Subject: [PATCH 04/30] Create PureTextTokenizer.ipynb
---
docs/source/tutorial/zh/tokenization/PureTextTokenizer.ipynb | 1 -
1 file changed, 1 deletion(-)
diff --git a/docs/source/tutorial/zh/tokenization/PureTextTokenizer.ipynb b/docs/source/tutorial/zh/tokenization/PureTextTokenizer.ipynb
index 14f955b4..12181e94 100644
--- a/docs/source/tutorial/zh/tokenization/PureTextTokenizer.ipynb
+++ b/docs/source/tutorial/zh/tokenization/PureTextTokenizer.ipynb
@@ -3,7 +3,6 @@ PureTextTokenizer
即纯净型文本令牌解析器,在默认情况下对传入的item中的图片、标签、分隔符、题目空缺符等部分则转换成特殊字符进行保护,并对特殊公式(例如:$\\FormFigureID{...}$, $\\FormFigureBase64{...}$)进行筛除,从而对文本、纯文本公式进行令牌化操作。此外,此令牌解析器对文本、公式均采用线性的分析方法,并提供的key参数用于对传入的item进行预处理,待未来根据需求进行开发。
-
Examples
----------
From 1d05290d1644e857502e59e15762766feb5a0006 Mon Sep 17 00:00:00 2001
From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com>
Date: Mon, 23 Aug 2021 16:39:30 +0800
Subject: [PATCH 05/30] Create pretrain.rst
---
docs/source/ap/pretrain.rst | 6 ++++++
1 file changed, 6 insertions(+)
create mode 100644 docs/source/ap/pretrain.rst
diff --git a/docs/source/ap/pretrain.rst b/docs/source/ap/pretrain.rst
new file mode 100644
index 00000000..36c631e2
--- /dev/null
+++ b/docs/source/ap/pretrain.rst
@@ -0,0 +1,6 @@
+EduNLP.Pretrain.gensim_vec
+==============
+
+.. automodule:: EduNLP.Pretrain.gensim_vec
+ :members:
+ :imported-members:
From ac3020de465e6006059c7aa7525fdc4e5a327f89 Mon Sep 17 00:00:00 2001
From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com>
Date: Mon, 23 Aug 2021 16:40:04 +0800
Subject: [PATCH 06/30] Delete docs/source/ap directory
---
docs/source/ap/pretrain.rst | 6 ------
1 file changed, 6 deletions(-)
delete mode 100644 docs/source/ap/pretrain.rst
diff --git a/docs/source/ap/pretrain.rst b/docs/source/ap/pretrain.rst
deleted file mode 100644
index 36c631e2..00000000
--- a/docs/source/ap/pretrain.rst
+++ /dev/null
@@ -1,6 +0,0 @@
-EduNLP.Pretrain.gensim_vec
-==============
-
-.. automodule:: EduNLP.Pretrain.gensim_vec
- :members:
- :imported-members:
From 53c5da642f52ef44fa83fcc17f9a0426c5fc969d Mon Sep 17 00:00:00 2001
From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com>
Date: Mon, 23 Aug 2021 16:40:41 +0800
Subject: [PATCH 07/30] Create Pretrain.rst
---
docs/source/api/Pretrain.rst | 6 ++++++
1 file changed, 6 insertions(+)
create mode 100644 docs/source/api/Pretrain.rst
diff --git a/docs/source/api/Pretrain.rst b/docs/source/api/Pretrain.rst
new file mode 100644
index 00000000..36c631e2
--- /dev/null
+++ b/docs/source/api/Pretrain.rst
@@ -0,0 +1,6 @@
+EduNLP.Pretrain.gensim_vec
+==============
+
+.. automodule:: EduNLP.Pretrain.gensim_vec
+ :members:
+ :imported-members:
From 13d1aced345327e34f0dc8eb49b375b0aeef6dc9 Mon Sep 17 00:00:00 2001
From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com>
Date: Mon, 23 Aug 2021 16:42:14 +0800
Subject: [PATCH 08/30] Create PureTextTokenizer.rst
---
.../{PureTextTokenizer.ipynb => PureTextTokenizer.rst} | 0
1 file changed, 0 insertions(+), 0 deletions(-)
rename docs/source/tutorial/zh/tokenization/{PureTextTokenizer.ipynb => PureTextTokenizer.rst} (100%)
diff --git a/docs/source/tutorial/zh/tokenization/PureTextTokenizer.ipynb b/docs/source/tutorial/zh/tokenization/PureTextTokenizer.rst
similarity index 100%
rename from docs/source/tutorial/zh/tokenization/PureTextTokenizer.ipynb
rename to docs/source/tutorial/zh/tokenization/PureTextTokenizer.rst
From 9fd12127c71f3e8a92b5408a491508e4470da930 Mon Sep 17 00:00:00 2001
From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com>
Date: Mon, 23 Aug 2021 16:47:09 +0800
Subject: [PATCH 09/30] Create pretrain.rst
---
docs/source/api/{Pretrain.rst => pretrain.rst} | 0
1 file changed, 0 insertions(+), 0 deletions(-)
rename docs/source/api/{Pretrain.rst => pretrain.rst} (100%)
diff --git a/docs/source/api/Pretrain.rst b/docs/source/api/pretrain.rst
similarity index 100%
rename from docs/source/api/Pretrain.rst
rename to docs/source/api/pretrain.rst
From 2798fc5d39aae0d2c4aad42413bc8499d894d813 Mon Sep 17 00:00:00 2001
From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com>
Date: Mon, 23 Aug 2021 16:47:27 +0800
Subject: [PATCH 10/30] Create index.rst
---
docs/source/index.rst | 1 +
1 file changed, 1 insertion(+)
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 13f8b2c6..96dc50ff 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -168,3 +168,4 @@ If this repository is helpful for you, please cite our work
api/i2v
api/sif
api/formula
+ api/pretrain
From c309caae103dd348932fe3bf29d1656f7c78763d Mon Sep 17 00:00:00 2001
From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com>
Date: Mon, 23 Aug 2021 17:08:26 +0800
Subject: [PATCH 11/30] Create ModelZoo.rst
---
docs/source/api/ModelZoo.rst | 16 ++++++++++++++++
1 file changed, 16 insertions(+)
create mode 100644 docs/source/api/ModelZoo.rst
diff --git a/docs/source/api/ModelZoo.rst b/docs/source/api/ModelZoo.rst
new file mode 100644
index 00000000..8a87d953
--- /dev/null
+++ b/docs/source/api/ModelZoo.rst
@@ -0,0 +1,16 @@
+ModelZoo
+==============
+
+rnn
+-----------
+
+.. automodule:: EduNLP.ModelZoo.rnn
+ :members:
+ :imported-members:
+
+utils
+-----------
+
+.. automodule:: EduNLP.ModelZoo.utils
+ :members:
+ :imported-members:
From a5eac35eb7a645a77a53e2e8afa880ce135ae4fb Mon Sep 17 00:00:00 2001
From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com>
Date: Mon, 23 Aug 2021 17:11:52 +0800
Subject: [PATCH 12/30] Create index.rst
---
docs/source/index.rst | 3 +++
1 file changed, 3 insertions(+)
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 96dc50ff..72e24a7a 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -167,5 +167,8 @@ If this repository is helpful for you, please cite our work
api/index
api/i2v
api/sif
+ api/tokenizer
api/formula
api/pretrain
+ api/ModelZoo
+
From ff588d88701615b4fd9a738620c7150bf898ee14 Mon Sep 17 00:00:00 2001
From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com>
Date: Mon, 23 Aug 2021 17:13:11 +0800
Subject: [PATCH 13/30] Create tokenizer.rst
---
docs/source/api/tokenizer.rst | 6 ++++++
1 file changed, 6 insertions(+)
create mode 100644 docs/source/api/tokenizer.rst
diff --git a/docs/source/api/tokenizer.rst b/docs/source/api/tokenizer.rst
new file mode 100644
index 00000000..63d27f48
--- /dev/null
+++ b/docs/source/api/tokenizer.rst
@@ -0,0 +1,6 @@
+EduNLP.Tokenizer
+=====================================
+
+.. automodule:: EduNLP.Tokenizer
+ :members:
+ :imported-members:
From 4d32acd74703eea0fd525ba0d82e66af5829fc82 Mon Sep 17 00:00:00 2001
From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com>
Date: Mon, 23 Aug 2021 17:25:43 +0800
Subject: [PATCH 14/30] Create vector.rst
---
docs/source/api/vector.rst | 16 ++++++++++++++++
1 file changed, 16 insertions(+)
create mode 100644 docs/source/api/vector.rst
diff --git a/docs/source/api/vector.rst b/docs/source/api/vector.rst
new file mode 100644
index 00000000..b8b43d58
--- /dev/null
+++ b/docs/source/api/vector.rst
@@ -0,0 +1,16 @@
+EduNLP.Vector
+==========================
+
+Vector
+---------------
+
+.. automodule:: EduNLP.Vector
+ :members:
+ :imported-members:
+
+rnn
+-----------
+
+.. automodule:: EduNLP.Vector.rnn
+ :members:
+ :imported-members:
From e9a117edbf23d0fb23795c2c38feadcd9c7ef078 Mon Sep 17 00:00:00 2001
From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com>
Date: Mon, 23 Aug 2021 17:28:56 +0800
Subject: [PATCH 15/30] Create utils.rst
---
docs/source/api/utils.rst | 6 ++++++
1 file changed, 6 insertions(+)
create mode 100644 docs/source/api/utils.rst
diff --git a/docs/source/api/utils.rst b/docs/source/api/utils.rst
new file mode 100644
index 00000000..9ad570bf
--- /dev/null
+++ b/docs/source/api/utils.rst
@@ -0,0 +1,6 @@
+EduNLP.utils
+====================
+
+.. automodule:: EduNLP.utils
+ :members:
+ :imported-members:
From 6fd38433b483e55380bec77d22d10646e3b834b9 Mon Sep 17 00:00:00 2001
From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com>
Date: Mon, 23 Aug 2021 17:29:54 +0800
Subject: [PATCH 16/30] Create pretrain.rst
---
docs/source/api/pretrain.rst | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/docs/source/api/pretrain.rst b/docs/source/api/pretrain.rst
index 36c631e2..474d389d 100644
--- a/docs/source/api/pretrain.rst
+++ b/docs/source/api/pretrain.rst
@@ -1,6 +1,6 @@
-EduNLP.Pretrain.gensim_vec
+EduNLP.Pretrain
==============
-.. automodule:: EduNLP.Pretrain.gensim_vec
+.. automodule:: EduNLP.Pretrain
:members:
:imported-members:
From bd2f7f18ea6cdf3035e7322442196055146e4371 Mon Sep 17 00:00:00 2001
From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com>
Date: Mon, 23 Aug 2021 19:11:01 +0800
Subject: [PATCH 17/30] Create index.rst
---
docs/source/index.rst | 2 ++
1 file changed, 2 insertions(+)
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 72e24a7a..16107eae 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -171,4 +171,6 @@ If this repository is helpful for you, please cite our work
api/formula
api/pretrain
api/ModelZoo
+ api/vector
+ api/utils
From 58423c25782c6615bee05ff401ce23bc5ddf4d85 Mon Sep 17 00:00:00 2001
From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com>
Date: Mon, 23 Aug 2021 19:14:02 +0800
Subject: [PATCH 18/30] Create pretrain.rst
---
docs/source/api/pretrain.rst | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docs/source/api/pretrain.rst b/docs/source/api/pretrain.rst
index 474d389d..977a0923 100644
--- a/docs/source/api/pretrain.rst
+++ b/docs/source/api/pretrain.rst
@@ -1,6 +1,6 @@
EduNLP.Pretrain
==============
-.. automodule:: EduNLP.Pretrain
+.. automodule:: EduNLP.Pretrain.gensim_vec
:members:
:imported-members:
From c8dfd43cfa5810d0fa950465855e99bda6bf737c Mon Sep 17 00:00:00 2001
From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com>
Date: Mon, 23 Aug 2021 19:18:01 +0800
Subject: [PATCH 19/30] Create index.rst
---
docs/source/api/index.rst | 41 +++++++++++++++++++++++++++++++++++++++
1 file changed, 41 insertions(+)
diff --git a/docs/source/api/index.rst b/docs/source/api/index.rst
index 044ed6d3..683c993c 100644
--- a/docs/source/api/index.rst
+++ b/docs/source/api/index.rst
@@ -1,2 +1,43 @@
EduNLP
======
+
+EduNLP.SIF
+----------------------
+.. automodule:: EduNLP.SIF.sif
+ :members:
+ :imported-members:
+
+EduNLP.Formula
+---------------------
+
+.. automodule:: EduNLP.Formula.ast
+ :members:
+ :imported-members:
+
+EduNLP.I2V
+-----------------
+
+.. automodule:: EduNLP.I2V.i2v
+ :members:
+ :imported-members:
+
+EduNLP.Pretrain
+-------------------
+
+.. automodule:: EduNLP.Pretrain.gensim_vec
+ :members:
+ :imported-members:
+
+EduNLP.Tokenizer
+----------------------
+
+.. automodule:: EduNLP.Tokenizer
+ :members:
+ :imported-members:
+
+EduNLP.Vector
+---------------
+
+.. automodule:: EduNLP.Vector
+ :members:
+ :imported-members:
From 7d6528e2ecdff075ea9be77de0f955f3558fb80a Mon Sep 17 00:00:00 2001
From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com>
Date: Mon, 23 Aug 2021 19:35:46 +0800
Subject: [PATCH 20/30] Create pretrain.rst
---
docs/source/api/pretrain.rst | 13 +++++++++++--
1 file changed, 11 insertions(+), 2 deletions(-)
diff --git a/docs/source/api/pretrain.rst b/docs/source/api/pretrain.rst
index 977a0923..e56289ab 100644
--- a/docs/source/api/pretrain.rst
+++ b/docs/source/api/pretrain.rst
@@ -1,6 +1,15 @@
EduNLP.Pretrain
-==============
+==================
-.. automodule:: EduNLP.Pretrain.gensim_vec
+.. automodule:: EduNLP.Pretrain
+ :members:
+ :imported-members:
+
+
+
+EduNLP.I2V
+============
+
+.. automodule:: EduNLP.I2V.i2v
:members:
:imported-members:
From cca587052274ef111fc9d6b4f92978c86b5ace9e Mon Sep 17 00:00:00 2001
From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com>
Date: Mon, 23 Aug 2021 19:40:11 +0800
Subject: [PATCH 21/30] Create ModelZoo.rst
---
docs/source/api/ModelZoo.rst | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docs/source/api/ModelZoo.rst b/docs/source/api/ModelZoo.rst
index 8a87d953..ffdc764d 100644
--- a/docs/source/api/ModelZoo.rst
+++ b/docs/source/api/ModelZoo.rst
@@ -1,4 +1,4 @@
-ModelZoo
+EduNLP.ModelZoo
==============
rnn
From f8f01661ed27b76a935d884980ed7d0bea30a308 Mon Sep 17 00:00:00 2001
From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com>
Date: Mon, 23 Aug 2021 19:44:42 +0800
Subject: [PATCH 22/30] Create pretrain.rst
---
docs/source/api/pretrain.rst | 9 ---------
1 file changed, 9 deletions(-)
diff --git a/docs/source/api/pretrain.rst b/docs/source/api/pretrain.rst
index e56289ab..f418eda0 100644
--- a/docs/source/api/pretrain.rst
+++ b/docs/source/api/pretrain.rst
@@ -4,12 +4,3 @@ EduNLP.Pretrain
.. automodule:: EduNLP.Pretrain
:members:
:imported-members:
-
-
-
-EduNLP.I2V
-============
-
-.. automodule:: EduNLP.I2V.i2v
- :members:
- :imported-members:
From 9e0d35504aa53789e56e8240818d69a5dc84965c Mon Sep 17 00:00:00 2001
From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com>
Date: Mon, 23 Aug 2021 19:45:17 +0800
Subject: [PATCH 23/30] Create index.rst
---
docs/source/api/index.rst | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docs/source/api/index.rst b/docs/source/api/index.rst
index 683c993c..30a14fc2 100644
--- a/docs/source/api/index.rst
+++ b/docs/source/api/index.rst
@@ -24,7 +24,7 @@ EduNLP.I2V
EduNLP.Pretrain
-------------------
-.. automodule:: EduNLP.Pretrain.gensim_vec
+.. automodule:: EduNLP.Pretrain
:members:
:imported-members:
From 6eb9e7ce647a8d1064d7e20c223dcb505c90d9f1 Mon Sep 17 00:00:00 2001
From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com>
Date: Mon, 23 Aug 2021 19:47:08 +0800
Subject: [PATCH 24/30] Create gensim_vec.py
---
EduNLP/Pretrain/gensim_vec.py | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/EduNLP/Pretrain/gensim_vec.py b/EduNLP/Pretrain/gensim_vec.py
index fde1bb43..acc3d67c 100644
--- a/EduNLP/Pretrain/gensim_vec.py
+++ b/EduNLP/Pretrain/gensim_vec.py
@@ -15,8 +15,7 @@
class GensimWordTokenizer(object):
- def __init__(self, symbol="gm", general=False):
- """
+ """
Parameters
----------
@@ -45,6 +44,7 @@ def __init__(self, symbol="gm", general=False):
>>> print(token_item.tokens[:10])
['公式', '[FORMULA]', '如图', '[FIGURE]', '[FORMULA]', '约束条件', '公式', '[FORMULA]', '[SEP]', '[FORMULA]']
"""
+ def __init__(self, symbol="gm", general=False):
self.symbol = symbol
if general is True:
self.tokenization_params = {
@@ -72,8 +72,7 @@ def __call__(self, item):
class GensimSegTokenizer(object): # pragma: no cover
- def __init__(self, symbol="gms", depth=None, flatten=False, **kwargs):
- """
+ """
Parameters
----------
@@ -81,6 +80,7 @@ def __init__(self, symbol="gms", depth=None, flatten=False, **kwargs):
gms
fgm
"""
+ def __init__(self, symbol="gms", depth=None, flatten=False, **kwargs):
self.symbol = symbol
self.tokenization_params = {
"formula_params": {
From 4c968c3237c649e1cc7663cfcd59f9a76154a85f Mon Sep 17 00:00:00 2001
From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com>
Date: Mon, 23 Aug 2021 20:33:17 +0800
Subject: [PATCH 25/30] Create gensim_vec.py
---
EduNLP/Pretrain/gensim_vec.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/EduNLP/Pretrain/gensim_vec.py b/EduNLP/Pretrain/gensim_vec.py
index acc3d67c..afe37215 100644
--- a/EduNLP/Pretrain/gensim_vec.py
+++ b/EduNLP/Pretrain/gensim_vec.py
@@ -43,7 +43,7 @@ class GensimWordTokenizer(object):
... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$")
>>> print(token_item.tokens[:10])
['公式', '[FORMULA]', '如图', '[FIGURE]', '[FORMULA]', '约束条件', '公式', '[FORMULA]', '[SEP]', '[FORMULA]']
- """
+ """
def __init__(self, symbol="gm", general=False):
self.symbol = symbol
if general is True:
From 24cc07d631f3b5929d6df2ef5912463a1dfa12f8 Mon Sep 17 00:00:00 2001
From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com>
Date: Mon, 23 Aug 2021 20:34:40 +0800
Subject: [PATCH 26/30] Create gensim_vec.py
---
EduNLP/Pretrain/gensim_vec.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/EduNLP/Pretrain/gensim_vec.py b/EduNLP/Pretrain/gensim_vec.py
index afe37215..da5272f2 100644
--- a/EduNLP/Pretrain/gensim_vec.py
+++ b/EduNLP/Pretrain/gensim_vec.py
@@ -79,7 +79,7 @@ class GensimSegTokenizer(object): # pragma: no cover
symbol:
gms
fgm
- """
+ """
def __init__(self, symbol="gms", depth=None, flatten=False, **kwargs):
self.symbol = symbol
self.tokenization_params = {
From c7d451424b5f04028561ad5bc7eedcf48b19de5c Mon Sep 17 00:00:00 2001
From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com>
Date: Mon, 23 Aug 2021 22:06:39 +0800
Subject: [PATCH 27/30] =?UTF-8?q?Create=20=E8=AF=AD=E4=B9=89=E6=88=90?=
=?UTF-8?q?=E5=88=86=E5=88=86=E8=A7=A3.rst?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
...11\346\210\220\345\210\206\345\210\206\350\247\243.rst" | 7 -------
1 file changed, 7 deletions(-)
diff --git "a/docs/source/tutorial/zh/seg/\350\257\255\344\271\211\346\210\220\345\210\206\345\210\206\350\247\243.rst" "b/docs/source/tutorial/zh/seg/\350\257\255\344\271\211\346\210\220\345\210\206\345\210\206\350\247\243.rst"
index 0950dd87..e2106829 100644
--- "a/docs/source/tutorial/zh/seg/\350\257\255\344\271\211\346\210\220\345\210\206\345\210\206\350\247\243.rst"
+++ "b/docs/source/tutorial/zh/seg/\350\257\255\344\271\211\346\210\220\345\210\206\345\210\206\350\247\243.rst"
@@ -46,10 +46,3 @@
>>> dict2str4sif(item, key_as_tag=False)
'若复数$z=1+2 i+i^{3}$,则$|z|=$0$\\SIFSep$1$\\SIFSep$$\\sqrt{2}$$\\SIFSep$2'
-详细示范
-++++++++++++++++++++++
-
-.. toctree::
- :titlesonly:
-
- 语义成分分解的案例 <../../../build/blitz/utils/data.ipynb>
From 9cd6f06429692a0416f37fd16c9d92dc867964d5 Mon Sep 17 00:00:00 2001
From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com>
Date: Mon, 23 Aug 2021 22:08:50 +0800
Subject: [PATCH 28/30] Create d2v_bow_tfidf.ipynb
---
examples/pretrain/gensim/d2v_bow_tfidf.ipynb | 519 ++++++++++++++-----
1 file changed, 383 insertions(+), 136 deletions(-)
diff --git a/examples/pretrain/gensim/d2v_bow_tfidf.ipynb b/examples/pretrain/gensim/d2v_bow_tfidf.ipynb
index 154279dc..bf70bec8 100644
--- a/examples/pretrain/gensim/d2v_bow_tfidf.ipynb
+++ b/examples/pretrain/gensim/d2v_bow_tfidf.ipynb
@@ -3,101 +3,109 @@
{
"cell_type": "markdown",
"source": [
- "# d2v_bow_tfidf"
+ "# 基于 gensim 的模型训练举例"
],
"metadata": {}
},
{
"cell_type": "markdown",
"source": [
- "## 1. load and tokenize test_items"
+ "## 概述\n",
+ "\n",
+ "您可以使用自己的数据和模型参数来训练和使用自己的模型。"
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## 导入模块"
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "source": [
+ "import json\n",
+ "from tqdm import tqdm\n",
+ "from EduNLP.Pretrain import GensimWordTokenizer, train_vector\n",
+ "from EduNLP.Vector import D2V, W2V\n",
+ "from EduNLP.SIF.segment import seg\n",
+ "from EduNLP.SIF.tokenization import tokenize\n",
+ "import time"
+ ],
+ "outputs": [],
+ "metadata": {}
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## 准备模型训练数据"
],
"metadata": {}
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 12,
"source": [
- "from platform import processor\r\n",
- "from gensim import corpora,models\r\n",
- "# from collections import defaultdict\r\n",
- "import json\r\n",
- "from tqdm import tqdm\r\n",
- "from EduNLP.Pretrain import GensimWordTokenizer,train_vector\r\n",
- "from EduNLP.Vector import D2V\r\n",
- "from EduNLP.SIF.segment import seg\r\n",
- "from EduNLP.SIF.tokenization import tokenize\r\n",
- "import time\r\n",
- "\r\n",
- "output_file_head = \"test\" # subject = english | liberal | science |all\r\n",
- "baseDir = \"E:/Workustc/lunadata/d2v\"\r\n",
- "# baseDir = \"/home/qlh/data_pretrain\"\r\n",
- "work_file_path = baseDir + \"/data/\" + output_file_head + \"_raw.json\"\r\n",
- "\r\n",
- "test_items = [{'ques_content':'有公式$\\\\FormFigureID{wrong1?}$和公式$\\\\FormFigureBase64{wrong2?}$,如图$\\\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,若$x,y$满足约束条件$\\\\SIFSep$,则$z=x+7 y$的最大值为$\\\\SIFBlank$'},\r\n",
- " {\"ques_content\":\"Human machine interface for lab abc computer applications\"},\r\n",
- " {\"ques_content\": \"A survey of user opinion of computer system response time\"},\r\n",
- " {\"ques_content\": \"The EPS user interface management system\"},\r\n",
- " {\"ques_content\": \"System and human system engineering testing of EPS\"},\r\n",
- " {\"ques_content\": \"Relation of user perceived response time to error measurement\"},\r\n",
- " {\"ques_content\": \"The generation of random binary unordered trees\"},\r\n",
- " {\"ques_content\": \"The intersection graph of paths in trees\"},\r\n",
- " {\"ques_content\": \"Graph minors IV Widths of trees and well quasi ordering\"},\r\n",
- " {\"ques_content\": \"Graph minors A survey\"}\r\n",
- " ]\r\n",
- "\r\n",
- "def load_items():\r\n",
- " for line in test_items:\r\n",
- " yield line\r\n",
- " # with open(work_file_path, 'r', encoding=\"utf-8\") as f:\r\n",
- " # for line in f:\r\n",
- " # yield json.loads(line)\r\n",
- "\r\n",
- "def data2Token():\r\n",
- " # 线性分词,而不使用ast\r\n",
- " tokenization_params = {\r\n",
- " \"formula_params\": {\r\n",
- " \"method\": \"linear\",\r\n",
- " }\r\n",
- " }\r\n",
- " \r\n",
- " token_items = []\r\n",
- " count = 1\r\n",
- " for item in tqdm(load_items(), \"sifing\"):\r\n",
- " count = count + 1\r\n",
- " # -------------------------------------------- # \r\n",
- " # \"\"\"除文本、公式外,其他转化为特殊标记\"\"\"\r\n",
- " # seg_ret = seg(item[\"ques_content\"], symbol=\"gmas\")\r\n",
- " # token_item = tokenize(seg_ret, **tokenization_params)\r\n",
- " tokenizer = GensimWordTokenizer(symbol=\"gmas\", general=True)\r\n",
- " token_item = tokenizer(item[\"ques_content\"])\r\n",
- "\r\n",
- " # -------------------------------------------- # \r\n",
- " if token_item:\r\n",
- " # print(\"[i] = \", count)\r\n",
- " # print(\"[tokens] = \", token_item)\r\n",
- " token_items.append(token_item.tokens)\r\n",
- " print(\"[data2Token] finish ========================> num = \",len(token_items))\r\n",
- " return token_items\r\n",
- "\r\n",
- "token_items = data2Token()\r\n",
- "token_items[0]"
+ "test_items = [{'ques_content':'有公式$\\\\FormFigureID{wrong1?}$和公式$\\\\FormFigureBase64{wrong2?}$,如图$\\\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,若$x,y$满足约束条件$\\\\SIFSep$,则$z=x+7 y$的最大值为$\\\\SIFBlank$'},\n",
+ " {\"ques_content\":\"Human machine interface for lab abc computer applications\"},\n",
+ " {\"ques_content\": \"A survey of user opinion of computer system response time\"},\n",
+ " {\"ques_content\": \"The EPS user interface management system\"},\n",
+ " {\"ques_content\": \"System and human system engineering testing of EPS\"},\n",
+ " {\"ques_content\": \"Relation of user perceived response time to error measurement\"},\n",
+ " {\"ques_content\": \"The generation of random binary unordered trees\"},\n",
+ " {\"ques_content\": \"The intersection graph of paths in trees\"},\n",
+ " {\"ques_content\": \"Graph minors IV Widths of trees and well quasi ordering\"},\n",
+ " {\"ques_content\": \"Graph minors A survey\"}\n",
+ " ]\n",
+ "\n",
+ "def load_items():\n",
+ " for line in test_items:\n",
+ " yield line\n",
+ "\n",
+ "\n",
+ "def data2Token():\n",
+ " # 线性分词\n",
+ " tokenization_params = {\n",
+ " \"formula_params\": {\n",
+ " \"method\": \"linear\",\n",
+ " }\n",
+ " }\n",
+ " \n",
+ " token_items = []\n",
+ " count = 1\n",
+ " for item in tqdm(load_items(), \"sifing\"):\n",
+ " count = count + 1\n",
+ " # -------------------------------------------- # \n",
+ " # \"\"\"除文本、公式外,其他转化为特殊标记\"\"\"\n",
+ " tokenizer = GensimWordTokenizer(symbol=\"gmas\", general=True)\n",
+ " token_item = tokenizer(item[\"ques_content\"])\n",
+ "\n",
+ " # -------------------------------------------- # \n",
+ " if token_item:\n",
+ " token_items.append(token_item.tokens)\n",
+ " print(\"[data2Token] finish ========================> num = \",len(token_items))\n",
+ " return token_items\n",
+ "\n",
+ "token_items = data2Token()\n",
+ "print(token_items[0])"
],
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
- "D:\\MySoftwares\\Anaconda\\envs\\data\\lib\\site-packages\\gensim\\similarities\\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.\n",
- " warnings.warn(msg)\n",
- "sifing: 10it [00:00, 18.57it/s]"
+ "sifing: 10it [00:00, 114.91it/s]"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
- "[data2Token] finish ========================> num = 10\n"
+ "[data2Token] finish ========================> num = 10\n",
+ "['公式', '[FORMULA]', '公式', '[FORMULA]', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '[SEP]', 'z', '=', 'x', '+', '7', 'y', '最大值', '[MARK]']\n"
]
},
{
@@ -106,41 +114,13 @@
"text": [
"\n"
]
- },
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- "['公式',\n",
- " '[FORMULA]',\n",
- " '公式',\n",
- " '[FORMULA]',\n",
- " '如图',\n",
- " '[FIGURE]',\n",
- " 'x',\n",
- " ',',\n",
- " 'y',\n",
- " '约束条件',\n",
- " '[SEP]',\n",
- " 'z',\n",
- " '=',\n",
- " 'x',\n",
- " '+',\n",
- " '7',\n",
- " 'y',\n",
- " '最大值',\n",
- " '[MARK]']"
- ]
- },
- "metadata": {},
- "execution_count": 1
}
],
"metadata": {}
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 3,
"source": [
"len(token_items[0])"
],
@@ -153,7 +133,7 @@
]
},
"metadata": {},
- "execution_count": 2
+ "execution_count": 3
}
],
"metadata": {
@@ -165,7 +145,67 @@
{
"cell_type": "markdown",
"source": [
- "## 2. train and test model by 'bow'"
+ "### 也可从文件导入数据\n",
+ "例如:"
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "source": [
+ "from EduData import get_data\n",
+ "\n",
+ "# 导入项目提供的数据,存放路径:\"../../data/\"\n",
+ "get_data(\"open-luna\", \"../../data/\")\n",
+ "\n",
+ "\n",
+ "def load_items():\n",
+ " with open(\"../../../data/OpenLUNA.json\", encoding=\"utf-8\") as f:\n",
+ " for line in f:\n",
+ " yield json.loads(line)"
+ ],
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "downloader, INFO http://base.ustc.edu.cn/data/OpenLUNA/OpenLUNA.json is saved as ../../data/OpenLUNA.json\n",
+ "downloader, INFO file existed, skipped\n"
+ ]
+ }
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "source": [
+ "tokenizer = GensimWordTokenizer(symbol=\"gm\")\n",
+ "sif_items = []\n",
+ "for item in tqdm(load_items(), \"sifing\"):\n",
+ " sif_item = tokenizer(\n",
+ " item[\"stem\"]\n",
+ " )\n",
+ " if sif_item:\n",
+ " sif_items.append(sif_item.tokens)\n",
+ "\n",
+ "sif_items[0]\n"
+ ],
+ "outputs": [],
+ "metadata": {}
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## EduNLP.Vector.D2V 模块的训练方法"
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### 1. 基于 bow 训练模型"
],
"metadata": {
"pycharm": {
@@ -175,10 +215,8 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 6,
"source": [
- "from EduNLP.Pretrain import train_vector\r\n",
- "#10 dimension with fasstext method\r\n",
"train_vector(token_items, \"../../../data/d2v/gensim_luna_stem_tf_\", method=\"bow\")"
],
"outputs": [
@@ -197,7 +235,7 @@
]
},
"metadata": {},
- "execution_count": 3
+ "execution_count": 6
}
],
"metadata": {
@@ -206,13 +244,18 @@
}
}
},
+ {
+ "cell_type": "markdown",
+ "source": [
+ "- 模型测试"
+ ],
+ "metadata": {}
+ },
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 9,
"source": [
- "from EduNLP.Vector import D2V\r\n",
- "\r\n",
- "d2v = D2V(\"../../../data/d2v/gensim_luna_stem_tf_bow.bin\", method = \"bow\")\r\n",
+ "d2v = D2V(\"../../../data/d2v/gensim_luna_stem_tf_bow.bin\", method = \"bow\")\n",
"print(d2v(token_items[1]))"
],
"outputs": [
@@ -233,16 +276,14 @@
{
"cell_type": "markdown",
"source": [
- "## 3. train and test model by 'tfidf'"
+ "### 2. 基于 tfidf 训练模型"
],
"metadata": {}
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 7,
"source": [
- "from EduNLP.Pretrain import train_vector\r\n",
- "#10 dimension with fasstext method\r\n",
"train_vector(token_items, \"../../../data/d2v/gensim_luna_stem_tf_\", method=\"tfidf\")"
],
"outputs": [
@@ -262,46 +303,250 @@
]
},
"metadata": {},
- "execution_count": 5
+ "execution_count": 7
}
],
"metadata": {}
},
+ {
+ "cell_type": "markdown",
+ "source": [
+ "- 模型测试"
+ ],
+ "metadata": {}
+ },
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 11,
"source": [
- "from EduNLP.Vector import D2V\r\n",
- "\r\n",
- "d2v = D2V(\"../../../data/d2v/gensim_luna_stem_tf_tfidf.bin\", method = \"tfidf\")\r\n",
- "vec_size = d2v.vector_size\r\n",
- "print(\"vec_size = \", vec_size)\r\n",
- "d2v(token_items[1])"
+ "d2v = D2V(\"../../../data/d2v/gensim_luna_stem_tf_tfidf.bin\", method = \"tfidf\")\n",
+ "vec_size = d2v.vector_size\n",
+ "print(\"vec_size = \", vec_size)\n",
+ "print(d2v(token_items[1]))"
],
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
- "vec_size = 63\n"
+ "vec_size = 63\n",
+ "[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.37858374396389033, 0.37858374396389033, 0.37858374396389033, 0.2646186811599866, 0.37858374396389033, 0.2646186811599866, 0.37858374396389033, 0.37858374396389033, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n"
+ ]
+ }
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### 3. 基于 Doc2Vec 训练模型"
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "source": [
+ "# 10 dimension with doc2vec method\n",
+ "train_vector(token_items, \"../../../data/w2v/gensim_luna_stem_tf_\", 10, method=\"d2v\")\n"
+ ],
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "EduNLP, INFO Epoch #0: loss-0.0000 \n",
+ "EduNLP, INFO Epoch #1: loss-0.0000 \n",
+ "EduNLP, INFO Epoch #2: loss-0.0000 \n",
+ "EduNLP, INFO Epoch #3: loss-0.0000 \n",
+ "EduNLP, INFO Epoch #4: loss-0.0000 \n",
+ "EduNLP, INFO Epoch #5: loss-0.0000 \n",
+ "EduNLP, INFO Epoch #6: loss-0.0000 \n",
+ "EduNLP, INFO Epoch #7: loss-0.0000 \n",
+ "EduNLP, INFO Epoch #8: loss-0.0000 \n",
+ "EduNLP, INFO Epoch #9: loss-0.0000 \n",
+ "EduNLP, INFO model is saved to ../../../data/w2v/gensim_luna_stem_tf_d2v_10.bin\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
- "[(15, 0.37858374396389033),\n",
- " (16, 0.37858374396389033),\n",
- " (17, 0.37858374396389033),\n",
- " (18, 0.2646186811599866),\n",
- " (19, 0.37858374396389033),\n",
- " (20, 0.2646186811599866),\n",
- " (21, 0.37858374396389033),\n",
- " (22, 0.37858374396389033)]"
+ "'../../../data/w2v/gensim_luna_stem_tf_d2v_10.bin'"
]
},
"metadata": {},
- "execution_count": 6
+ "execution_count": 18
+ }
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "source": [
+ "d2v = D2V(\"../../../data/w2v/gensim_luna_stem_tf_d2v_10.bin\", method=\"d2v\")\n",
+ "vec_size = d2v.vector_size\n",
+ "print(\"vec_size = \", vec_size)\n",
+ "print(d2v(token_items[1]))\n"
+ ],
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "vec_size = 10\n",
+ "[-0.00211227 0.00167636 0.02313529 -0.04260717 -0.01389424 -0.03898989\n",
+ " 0.01181044 0.01069339 -0.03934718 0.00038158]\n"
+ ]
+ }
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## EduNLP.Vector.W2V 模块支持的训练方法"
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### 1. 基于 FastText 训练模型"
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "source": [
+ "# 10 dimension with fasttext method\n",
+ "train_vector(token_items, \"../../../data/w2v/gensim_luna_stem_t_\",\n",
+ " 10, method=\"fasttext\")\n"
+ ],
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "EduNLP, INFO Epoch #0: loss-0.0000 \n",
+ "EduNLP, INFO Epoch #1: loss-0.0000 \n",
+ "EduNLP, INFO Epoch #2: loss-0.0000 \n",
+ "EduNLP, INFO Epoch #3: loss-0.0000 \n",
+ "EduNLP, INFO Epoch #4: loss-0.0000 \n",
+ "EduNLP, INFO model is saved to ../../../data/w2v/gensim_luna_stem_t_fasttext_10.bin\n"
+ ]
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "'../../../data/w2v/gensim_luna_stem_t_fasttext_10.bin'"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 25
+ }
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "- 模型测试"
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "source": [
+ "w2v = W2V(\"../../../data/w2v/gensim_luna_stem_t_fasttext_10.bin\", method=\"fasttext\")\n",
+ "w2v[\"[FORMULA]\"]\n"
+ ],
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "array([-0.00434524, -0.00836839, -0.02108332, 0.00493213, 0.00461454,\n",
+ " 0.01070305, -0.01737931, 0.0210843 , -0.00525515, 0.00918209],\n",
+ " dtype=float32)"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 41
+ }
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### 2. 基于 cbow 训练模型"
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "source": [
+ "train_vector(token_items, \"../../../data/w2v/gensim_luna_stem_t_\", 10, method=\"cbow\")"
+ ],
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "EduNLP, INFO Epoch #0: loss-0.0000 \n",
+ "EduNLP, INFO Epoch #1: loss-0.0000 \n",
+ "EduNLP, INFO Epoch #2: loss-0.0000 \n",
+ "EduNLP, INFO Epoch #3: loss-0.0000 \n",
+ "EduNLP, INFO Epoch #4: loss-0.0000 \n",
+ "EduNLP, INFO model is saved to ../../../data/w2v/gensim_luna_stem_t_cbow_10.kv\n"
+ ]
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "'../../../data/w2v/gensim_luna_stem_t_cbow_10.kv'"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 42
+ }
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "- 模型测试"
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "source": [
+ "w2v = W2V(\"../../../data/w2v/gensim_luna_stem_t_cbow_10.kv\",\n",
+ " method=\"fasttext\")\n",
+ "w2v[\"[FORMULA]\"]\n"
+ ],
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "array([-0.0156765 , 0.00329737, -0.04140369, -0.07689971, -0.01493463,\n",
+ " 0.02475806, -0.00877463, 0.05539609, -0.02750023, 0.0224804 ],\n",
+ " dtype=float32)"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 43
}
],
"metadata": {}
@@ -309,9 +554,8 @@
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
+ "name": "python3",
+ "display_name": "Python 3.8.5 64-bit"
},
"language_info": {
"codemirror_mode": {
@@ -323,7 +567,10 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.13"
+ "version": "3.8.5"
+ },
+ "interpreter": {
+ "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a"
}
},
"nbformat": 4,
From 5dd95d228f99dc4c79f29195c12a760be1c70221 Mon Sep 17 00:00:00 2001
From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com>
Date: Mon, 23 Aug 2021 22:10:32 +0800
Subject: [PATCH 29/30] Create w2v_stem_text.ipynb
---
examples/pretrain/gensim/w2v_stem_text.ipynb | 93 +++++++-------------
1 file changed, 34 insertions(+), 59 deletions(-)
diff --git a/examples/pretrain/gensim/w2v_stem_text.ipynb b/examples/pretrain/gensim/w2v_stem_text.ipynb
index 3c9b6ca9..0a0005cc 100644
--- a/examples/pretrain/gensim/w2v_stem_text.ipynb
+++ b/examples/pretrain/gensim/w2v_stem_text.ipynb
@@ -1,55 +1,28 @@
{
"cells": [
- {
- "cell_type": "markdown",
- "source": [
- "# w2v_stem_text"
- ],
- "metadata": {}
- },
{
"cell_type": "code",
"execution_count": 1,
"source": [
- "import json\r\n",
- "from tqdm import tqdm\r\n",
- "\r\n",
- "def load_items():\r\n",
- " with open(\"../../../data/OpenLUNA.json\", encoding=\"utf-8\") as f:\r\n",
- " for line in f:\r\n",
- " yield json.loads(line)\r\n",
- "\r\n",
- "from EduNLP.Pretrain import train_vector, GensimWordTokenizer\r\n",
- "\r\n",
- "tokenizer = GensimWordTokenizer(symbol=\"fgm\")\r\n",
- "\r\n",
- "sif_items = [\r\n",
- " tokenizer(item[\"stem\"]).tokens for item in tqdm(load_items(), \"sifing\")\r\n",
- "]\r\n",
- "\r\n",
+ "import json\n",
+ "from tqdm import tqdm\n",
+ "\n",
+ "def load_items():\n",
+ " with open(\"../../../data/OpenLUNA.json\", encoding=\"utf-8\") as f:\n",
+ " for line in f:\n",
+ " yield json.loads(line)\n",
+ "\n",
+ "from EduNLP.Pretrain import train_vector, GensimWordTokenizer\n",
+ "\n",
+ "tokenizer = GensimWordTokenizer(symbol=\"fgm\")\n",
+ "\n",
+ "sif_items = [\n",
+ " tokenizer(item[\"stem\"]).tokens for item in tqdm(load_items(), \"sifing\")\n",
+ "]\n",
+ "\n",
"sif_items[0]"
],
- "outputs": [
- {
- "output_type": "stream",
- "name": "stderr",
- "text": [
- "d:\\env\\python3.8\\lib\\site-packages\\gensim\\similarities\\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.\n",
- " warnings.warn(msg)\n",
- "sifing: 792it [00:00, 845.20it/s]\n"
- ]
- },
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- "['已知', '集合', '[FORMULA]', '[FORMULA]']"
- ]
- },
- "metadata": {},
- "execution_count": 1
- }
- ],
+ "outputs": [],
"metadata": {
"collapsed": true
}
@@ -83,7 +56,7 @@
"cell_type": "code",
"execution_count": 3,
"source": [
- "# 100 dimension with skipgram method\r\n",
+ "# 100 dimension with skipgram method\n",
"train_vector(sif_items, \"../../../data/w2v/gensim_luna_stem_t_\", 100)"
],
"outputs": [
@@ -121,7 +94,7 @@
"cell_type": "code",
"execution_count": 4,
"source": [
- "# 50 dimension with cbow method\r\n",
+ "# 50 dimension with cbow method\n",
"train_vector(sif_items, \"../../../data/w2v/gensim_luna_stem_t_\", 50, method=\"cbow\")"
],
"outputs": [
@@ -159,7 +132,7 @@
"cell_type": "code",
"execution_count": 5,
"source": [
- "# 10 dimension with fasstext method\r\n",
+ "# 10 dimension with fasttext method\n",
"train_vector(sif_items, \"../../../data/w2v/gensim_luna_stem_t_\", 10, method=\"fasttext\")"
],
"outputs": [
@@ -197,9 +170,9 @@
"cell_type": "code",
"execution_count": 6,
"source": [
- "from EduNLP.Vector import W2V\r\n",
- "\r\n",
- "w2v = W2V(\"../../../data/w2v/gensim_luna_stem_t_sg_100.kv\")\r\n",
+ "from EduNLP.Vector import W2V\n",
+ "\n",
+ "w2v = W2V(\"../../../data/w2v/gensim_luna_stem_t_sg_100.kv\")\n",
"w2v[\"[FORMULA]\"]"
],
"outputs": [
@@ -244,21 +217,23 @@
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
+ "name": "python3",
+ "display_name": "Python 3.8.5 64-bit"
},
"language_info": {
+ "name": "python",
+ "version": "3.8.5",
+ "mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
- "version": 2
+ "version": 3
},
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
+ "pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
- "pygments_lexer": "ipython2",
- "version": "2.7.6"
+ "file_extension": ".py"
+ },
+ "interpreter": {
+ "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a"
}
},
"nbformat": 4,
From f55a8760174b260d7c256d757d19a4262f6db77c Mon Sep 17 00:00:00 2001
From: BAOOOOOM <82091145+BAOOOOOM@users.noreply.github.com>
Date: Mon, 23 Aug 2021 22:19:08 +0800
Subject: [PATCH 30/30] Create sif.ipynb
---
examples/sif/sif.ipynb | 848 ++++++++++++++++++++++++++++++-----------
1 file changed, 618 insertions(+), 230 deletions(-)
diff --git a/examples/sif/sif.ipynb b/examples/sif/sif.ipynb
index 3376cd6d..63685077 100644
--- a/examples/sif/sif.ipynb
+++ b/examples/sif/sif.ipynb
@@ -2,35 +2,39 @@
"cells": [
{
"cell_type": "markdown",
+ "source": [
+ "# SIF4Sci 使用示例\n",
+ "\n",
+ "## 概述\n",
+ "\n",
+ "SIF4Sci 是一个提供试题切分和标注的模块。它可定制化地将文本切分为令牌(token)序列,为后续试题的向量化做准备。"
+ ],
"metadata": {
"collapsed": true,
"pycharm": {
"name": "#%% md\n"
}
- },
+ }
+ },
+ {
+ "cell_type": "markdown",
"source": [
- "# Code for beginner to learn how to use SIF4Sci\n",
+ "本文将以下面这道题目(来自 LUNA 题库)为例,展示 SIF4Sci 的使用方法。 \n",
"\n",
- "In this notebook, we will show you the basic usage to apply SIF to prepare data for conducting scientific experiments.\n",
- "\n",
- "We use the demo item (an exercise from LUNA) shown in the following Figure.\n",
- ".\n",
- "The SIF expression of this item can be written as follows:"
- ]
+ ""
+ ],
+ "metadata": {}
},
{
- "cell_type": "code",
- "execution_count": 1,
- "outputs": [
- {
- "data": {
- "text/plain": "'如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\\\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\\\SIFChoice$$\\\\FigureID{1}$'"
- },
- "execution_count": 1,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "cell_type": "markdown",
+ "source": [
+ "- 符合 [SIF 格式](https://edunlp.readthedocs.io/en/docs_dev/tutorial/zh/sif.html) 的题目录入格式为:"
],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
"source": [
"item = {\n",
" \"stem\": r\"如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\SIFChoice$$\\FigureID{1}$\",\n",
@@ -38,6 +42,18 @@
"}\n",
"item[\"stem\"]"
],
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "'如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\\\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\\\SIFChoice$$\\\\FigureID{1}$'"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 5
+ }
+ ],
"metadata": {
"collapsed": false,
"pycharm": {
@@ -45,26 +61,35 @@
}
}
},
+ {
+ "cell_type": "markdown",
+ "source": [
+ "- 加载图片:`$\\\\FigureID{1}$`"
+ ],
+ "metadata": {}
+ },
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 6,
+ "source": [
+ "from PIL import Image\n",
+ "img = Image.open(\"../../asset/_static/item_figure.png\")\n",
+ "figures = {\"1\": img}\n",
+ "img"
+ ],
"outputs": [
{
+ "output_type": "execute_result",
"data": {
- "text/plain": "",
- "image/png": "\n"
+ "text/plain": [
+ ""
+ ],
+ "image/png": ""
},
- "execution_count": 2,
"metadata": {},
- "output_type": "execute_result"
+ "execution_count": 6
}
],
- "source": [
- "from PIL import Image\n",
- "img = Image.open(\"../../asset/_static/item_figure.png\")\n",
- "figures = {\"1\": img}\n",
- "img"
- ],
"metadata": {
"collapsed": false,
"pycharm": {
@@ -75,7 +100,7 @@
{
"cell_type": "markdown",
"source": [
- "## Preparation"
+ "## 导入模块"
],
"metadata": {
"collapsed": false
@@ -83,11 +108,11 @@
},
{
"cell_type": "code",
- "execution_count": 3,
- "outputs": [],
+ "execution_count": 2,
"source": [
- "from EduNLP.SIF import sif4sci"
+ "from EduNLP.SIF import sif4sci, is_sif, to_sif"
],
+ "outputs": [],
"metadata": {
"collapsed": false,
"pycharm": {
@@ -98,23 +123,85 @@
{
"cell_type": "markdown",
"source": [
- "## Verification\n",
- "\n",
- "## Auto Correction"
+ "## 验证题目格式"
],
- "metadata": {
- "collapsed": false,
- "pycharm": {
- "name": "#%% md\n"
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "source": [
+ "is_sif(item['stem'])"
+ ],
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 7
}
- }
+ ],
+ "metadata": {}
},
{
"cell_type": "markdown",
"source": [
- "## Segment and Tokenization\n",
+ "- 若发现题目因为公式没有包含在 `$...$` 中而不符合 SIF 格式,则可以使用 `to_sif` 函数将其转成标准格式。示例如下:"
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "source": [
+ "text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...'\n",
+ "is_sif(text)"
+ ],
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "False"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 8
+ }
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "source": [
+ "text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...'\n",
+ "to_sif(text)\n"
+ ],
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "'某校一个课外学习小组为研究某作物的发芽率$y$和温度$x$(单位...'"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 9
+ }
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## 题目切分及令牌化\n",
"\n",
- "After we verify an item obeys SIF, we can further process it, i.e., segment and tokenization."
+ "现在我们得到了符合标准格式的题目文本,接下来可以对题目做进一步的处理,例如:切分和令牌化。"
],
"metadata": {
"collapsed": false,
@@ -126,7 +213,7 @@
{
"cell_type": "markdown",
"source": [
- "### Segment"
+ "### 题目切分"
],
"metadata": {
"collapsed": false,
@@ -135,162 +222,220 @@
}
}
},
+ {
+ "cell_type": "markdown",
+ "source": [
+ "#### 基本切分\n",
+ "分离文本、公式、图片和特殊符号。"
+ ],
+ "metadata": {}
+ },
{
"cell_type": "code",
+ "execution_count": 12,
"source": [
- "sif4sci(item[\"stem\"], figures=figures, tokenization=False, symbol=\"tfgm\")\n"
+ "segments = sif4sci(item[\"stem\"], figures=figures, tokenization=False)\n",
+ "segments"
],
- "metadata": {
- "collapsed": false,
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "execution_count": 4,
"outputs": [
{
+ "output_type": "execute_result",
"data": {
- "text/plain": "['[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[MARK]', '[FIGURE]']"
+ "text/plain": [
+ "['如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形', 'ABC', '的斜边', 'BC', ', 直角边', 'AB', ', ', 'AC', '.', '\\\\bigtriangleup ABC', '的三边所围成的区域记为', 'I', ',黑色部分记为', 'II', ', 其余部分记为', 'III', '.在整个图形中随机取一点,此点取自', 'I,II,III', '的概率分别记为', 'p_1,p_2,p_3', ',则', '\\\\SIFChoice', \\FigureID{1}]"
+ ]
},
- "execution_count": 4,
"metadata": {},
- "output_type": "execute_result"
+ "execution_count": 12
}
- ]
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "- 文本部分"
+ ],
+ "metadata": {}
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 13,
+ "source": [
+ "segments.text_segments"
+ ],
"outputs": [
{
+ "output_type": "execute_result",
"data": {
- "text/plain": "['如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形', 'ABC', '的斜边', 'BC', ', 直角边', 'AB', ', ', 'AC', '.', '\\\\bigtriangleup ABC', '的三边所围成的区域记为', 'I', ',黑色部分记为', 'II', ', 其余部分记为', 'III', '.在整个图形中随机取一点,此点取自', 'I,II,III', '的概率分别记为', 'p_1,p_2,p_3', ',则', '\\\\SIFChoice', \\FigureID{1}]"
+ "text/plain": [
+ "['如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形',\n",
+ " '的斜边',\n",
+ " ', 直角边',\n",
+ " ', ',\n",
+ " '.',\n",
+ " '的三边所围成的区域记为',\n",
+ " ',黑色部分记为',\n",
+ " ', 其余部分记为',\n",
+ " '.在整个图形中随机取一点,此点取自',\n",
+ " '的概率分别记为',\n",
+ " ',则']"
+ ]
},
- "execution_count": 5,
"metadata": {},
- "output_type": "execute_result"
+ "execution_count": 13
}
],
+ "metadata": {}
+ },
+ {
+ "cell_type": "markdown",
"source": [
- "segments = sif4sci(item[\"stem\"], figures=figures, tokenization=False)\n",
- "segments"
+ "- 公式部分"
],
- "metadata": {
- "collapsed": false,
- "pycharm": {
- "name": "#%%\n"
- }
- }
+ "metadata": {}
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 15,
+ "source": [
+ "segments.formula_segments\n"
+ ],
"outputs": [
{
+ "output_type": "execute_result",
"data": {
- "text/plain": "['如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形',\n '的斜边',\n ', 直角边',\n ', ',\n '.',\n '的三边所围成的区域记为',\n ',黑色部分记为',\n ', 其余部分记为',\n '.在整个图形中随机取一点,此点取自',\n '的概率分别记为',\n ',则']"
+ "text/plain": [
+ "['ABC',\n",
+ " 'BC',\n",
+ " 'AB',\n",
+ " 'AC',\n",
+ " '\\\\bigtriangleup ABC',\n",
+ " 'I',\n",
+ " 'II',\n",
+ " 'III',\n",
+ " 'I,II,III',\n",
+ " 'p_1,p_2,p_3']"
+ ]
},
- "execution_count": 6,
"metadata": {},
- "output_type": "execute_result"
+ "execution_count": 15
}
],
+ "metadata": {}
+ },
+ {
+ "cell_type": "markdown",
"source": [
- "segments.text_segments"
+ "- 图片部分"
],
- "metadata": {
- "collapsed": false,
- "pycharm": {
- "name": "#%%\n"
- }
- }
+ "metadata": {}
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 16,
+ "source": [
+ "segments.figure_segments"
+ ],
"outputs": [
{
+ "output_type": "execute_result",
"data": {
- "text/plain": "[\\FigureID{1}]"
+ "text/plain": [
+ "[\\FigureID{1}]"
+ ]
},
- "execution_count": 7,
"metadata": {},
- "output_type": "execute_result"
+ "execution_count": 16
}
],
- "source": [
- "segments.figure_segments"
- ],
- "metadata": {
- "collapsed": false,
- "pycharm": {
- "name": "#%%\n"
- }
- }
+ "metadata": {}
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 17,
+ "source": [
+ "segments.figure_segments[0].figure"
+ ],
"outputs": [
{
+ "output_type": "execute_result",
"data": {
- "text/plain": "",
- "image/png": "\n"
+ "text/plain": [
+ ""
+ ],
+ "image/png": ""
},
- "execution_count": 8,
"metadata": {},
- "output_type": "execute_result"
+ "execution_count": 17
}
],
+ "metadata": {}
+ },
+ {
+ "cell_type": "markdown",
"source": [
- "segments.figure_segments[0].figure"
+ "- 特殊符号"
],
- "metadata": {
- "collapsed": false,
- "pycharm": {
- "name": "#%%\n"
- }
- }
+ "metadata": {}
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 19,
+ "source": [
+ "segments.ques_mark_segments"
+ ],
"outputs": [
{
+ "output_type": "execute_result",
"data": {
- "text/plain": "['ABC',\n 'BC',\n 'AB',\n 'AC',\n '\\\\bigtriangleup ABC',\n 'I',\n 'II',\n 'III',\n 'I,II,III',\n 'p_1,p_2,p_3']"
+ "text/plain": [
+ "['\\\\SIFChoice']"
+ ]
},
- "execution_count": 9,
"metadata": {},
- "output_type": "execute_result"
+ "execution_count": 19
}
],
+ "metadata": {}
+ },
+ {
+ "cell_type": "markdown",
"source": [
- "segments.formula_segments"
+ "#### 标记化切分 \n",
+ "如果您不注重题目文本和公式的具体内容,仅仅是对题目的整体(或部分)构成感兴趣,那么可以通过修改 `symbol` 参数来将不同的成分转化成特定标记,方便您的研究。"
],
- "metadata": {
- "collapsed": false,
- "pycharm": {
- "name": "#%%\n"
- }
- }
+ "metadata": {}
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ " - symbol:\n",
+ " - \"t\": text\n",
+ " - \"f\": formula\n",
+ " - \"g\": figure\n",
+ " - \"m\": question mark"
+ ],
+ "metadata": {}
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 11,
+ "source": [
+ "sif4sci(item[\"stem\"], figures=figures, tokenization=False, symbol=\"tfgm\")"
+ ],
"outputs": [
{
+ "output_type": "execute_result",
"data": {
- "text/plain": "['\\\\SIFChoice']"
+ "text/plain": [
+ "['[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[MARK]', '[FIGURE]']"
+ ]
},
- "execution_count": 10,
"metadata": {},
- "output_type": "execute_result"
+ "execution_count": 11
}
],
- "source": [
- "segments.ques_mark_segments"
- ],
"metadata": {
"collapsed": false,
"pycharm": {
@@ -301,7 +446,11 @@
{
"cell_type": "markdown",
"source": [
- "### Tokenization"
+ "### 令牌化\n",
+ "\n",
+ "为了方便后续向量化表征试题,本模块提供题目文本的令牌化解析(Tokenization),即将题目转换成令牌序列。 \n",
+ "\n",
+ "根据构成题目的元素类型,解析功能分为 **“文本解析”** 和 **“公式解析”** 两部分。更具体的过程解析参见 [令牌化](../Tokenizer/tokenizer.ipynb)。"
],
"metadata": {
"collapsed": false,
@@ -313,10 +462,10 @@
{
"cell_type": "code",
"execution_count": 20,
- "outputs": [],
"source": [
"tokens = sif4sci(item[\"stem\"], figures=figures, tokenization=True)"
],
+ "outputs": [],
"metadata": {
"collapsed": false,
"pycharm": {
@@ -327,7 +476,7 @@
{
"cell_type": "markdown",
"source": [
- "#### Text"
+ "- 文本解析结果"
],
"metadata": {
"collapsed": false,
@@ -339,19 +488,53 @@
{
"cell_type": "code",
"execution_count": 12,
+ "source": [
+ "tokens.text_tokens"
+ ],
"outputs": [
{
+ "output_type": "execute_result",
"data": {
- "text/plain": "['如图',\n '古希腊',\n '数学家',\n '希波',\n '克拉底',\n '研究',\n '几何图形',\n '此图',\n '三个',\n '半圆',\n '三个',\n '半圆',\n '直径',\n '直角三角形',\n '斜边',\n '直角',\n '三边',\n '围成',\n '区域',\n '记',\n '黑色',\n '记',\n '其余部分',\n '记',\n '图形',\n '中',\n '随机',\n '取',\n '一点',\n '此点',\n '取自',\n '概率',\n '记']"
+ "text/plain": [
+ "['如图',\n",
+ " '古希腊',\n",
+ " '数学家',\n",
+ " '希波',\n",
+ " '克拉底',\n",
+ " '研究',\n",
+ " '几何图形',\n",
+ " '此图',\n",
+ " '三个',\n",
+ " '半圆',\n",
+ " '三个',\n",
+ " '半圆',\n",
+ " '直径',\n",
+ " '直角三角形',\n",
+ " '斜边',\n",
+ " '直角',\n",
+ " '三边',\n",
+ " '围成',\n",
+ " '区域',\n",
+ " '记',\n",
+ " '黑色',\n",
+ " '记',\n",
+ " '其余部分',\n",
+ " '记',\n",
+ " '图形',\n",
+ " '中',\n",
+ " '随机',\n",
+ " '取',\n",
+ " '一点',\n",
+ " '此点',\n",
+ " '取自',\n",
+ " '概率',\n",
+ " '记']"
+ ]
},
- "execution_count": 12,
"metadata": {},
- "output_type": "execute_result"
+ "execution_count": 12
}
],
- "source": [
- "tokens.text_tokens"
- ],
"metadata": {
"collapsed": false,
"pycharm": {
@@ -362,7 +545,7 @@
{
"cell_type": "markdown",
"source": [
- "#### Formula"
+ "#### 公式解析结果"
],
"metadata": {
"collapsed": false,
@@ -374,19 +557,45 @@
{
"cell_type": "code",
"execution_count": 13,
+ "source": [
+ "tokens.formula_tokens"
+ ],
"outputs": [
{
+ "output_type": "execute_result",
"data": {
- "text/plain": "['ABC',\n 'BC',\n 'AB',\n 'AC',\n '\\\\bigtriangleup',\n 'ABC',\n 'I',\n 'II',\n 'III',\n 'I',\n ',',\n 'II',\n ',',\n 'III',\n 'p',\n '_',\n '1',\n ',',\n 'p',\n '_',\n '2',\n ',',\n 'p',\n '_',\n '3']"
+ "text/plain": [
+ "['ABC',\n",
+ " 'BC',\n",
+ " 'AB',\n",
+ " 'AC',\n",
+ " '\\\\bigtriangleup',\n",
+ " 'ABC',\n",
+ " 'I',\n",
+ " 'II',\n",
+ " 'III',\n",
+ " 'I',\n",
+ " ',',\n",
+ " 'II',\n",
+ " ',',\n",
+ " 'III',\n",
+ " 'p',\n",
+ " '_',\n",
+ " '1',\n",
+ " ',',\n",
+ " 'p',\n",
+ " '_',\n",
+ " '2',\n",
+ " ',',\n",
+ " 'p',\n",
+ " '_',\n",
+ " '3']"
+ ]
},
- "execution_count": 13,
"metadata": {},
- "output_type": "execute_result"
+ "execution_count": 13
}
],
- "source": [
- "tokens.formula_tokens"
- ],
"metadata": {
"collapsed": false,
"pycharm": {
@@ -395,18 +604,22 @@
}
},
{
- "cell_type": "code",
- "execution_count": 14,
- "outputs": [
- {
- "data": {
- "text/plain": "['A',\n 'B',\n 'C',\n 'B',\n 'C',\n 'A',\n 'B',\n 'A',\n 'C',\n '\\\\bigtriangleup',\n 'A',\n 'B',\n 'C',\n 'I',\n 'I',\n 'I',\n 'I',\n 'I',\n 'I',\n 'I',\n ',',\n 'I',\n 'I',\n ',',\n 'I',\n 'I',\n 'I',\n 'p',\n '1',\n '_',\n ',',\n 'p',\n '2',\n '_',\n ',',\n 'p',\n '3',\n '_']"
- },
- "execution_count": 14,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "cell_type": "markdown",
+ "source": [
+ "- 自定义参数,得到定制化解析结果"
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "(1)如果您想按 LaTeX 语法标记拆分公式的各个部分,并得到顺序序列结果,输出方法(`method`)可以选择:`linear`"
],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
"source": [
"sif4sci(\n",
" item[\"stem\"],\n",
@@ -414,12 +627,47 @@
" tokenization=True,\n",
" tokenization_params={\n",
" \"formula_params\": {\n",
- " \"method\": \"ast\",\n",
- " \"return_type\": \"list\"\n",
+ " \"method\": \"linear\",\n",
" }\n",
" }\n",
").formula_tokens"
],
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "['ABC',\n",
+ " 'BC',\n",
+ " 'AB',\n",
+ " 'AC',\n",
+ " '\\\\bigtriangleup',\n",
+ " 'ABC',\n",
+ " 'I',\n",
+ " 'II',\n",
+ " 'III',\n",
+ " 'I',\n",
+ " ',',\n",
+ " 'II',\n",
+ " ',',\n",
+ " 'III',\n",
+ " 'p',\n",
+ " '_',\n",
+ " '1',\n",
+ " ',',\n",
+ " 'p',\n",
+ " '_',\n",
+ " '2',\n",
+ " ',',\n",
+ " 'p',\n",
+ " '_',\n",
+ " '3']"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 37
+ }
+ ],
"metadata": {
"collapsed": false,
"pycharm": {
@@ -428,18 +676,17 @@
}
},
{
- "cell_type": "code",
- "execution_count": 15,
- "outputs": [
- {
- "data": {
- "text/plain": "['mathord',\n 'mathord',\n 'mathord',\n 'mathord',\n 'mathord',\n 'mathord',\n 'mathord',\n 'mathord',\n 'mathord',\n '\\\\bigtriangleup',\n 'mathord',\n 'mathord',\n 'mathord',\n 'mathord',\n 'mathord',\n 'mathord',\n 'mathord',\n 'mathord',\n 'mathord',\n 'mathord',\n ',',\n 'mathord',\n 'mathord',\n ',',\n 'mathord',\n 'mathord',\n 'mathord',\n 'mathord',\n 'textord',\n '_',\n ',',\n 'mathord',\n 'textord',\n '_',\n ',',\n 'mathord',\n 'textord',\n '_']"
- },
- "execution_count": 15,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "cell_type": "markdown",
+ "source": [
+ "(2) 如果您想得到公式解析出的语法分析树序列,输出方法可以选择:`ast`\n",
+ "> 抽象语法分析树,简称语法树(Syntax tree),是源代码语法结构的一种抽象表示。它以树状的形式表现编程语言的语法结构,树上的每个节点都表示源代码中的一种结构。 \n",
+ "> 因此,ast 可以看做是公式的语法结构表征。"
],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
"source": [
"sif4sci(\n",
" item[\"stem\"],\n",
@@ -448,11 +695,30 @@
" tokenization_params={\n",
" \"formula_params\":{\n",
" \"method\": \"ast\",\n",
- " \"return_type\": \"list\",\n",
- " \"ord2token\": True\n",
" }\n",
" }\n",
- ").formula_tokens"
+ ").formula_tokens\n"
+ ],
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "[,\n",
+ " ,\n",
+ " ,\n",
+ " ,\n",
+ " ,\n",
+ " ,\n",
+ " ,\n",
+ " ,\n",
+ " ,\n",
+ " ]"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 39
+ }
],
"metadata": {
"collapsed": false,
@@ -462,53 +728,78 @@
}
},
{
- "cell_type": "code",
- "execution_count": 16,
- "outputs": [
- {
- "data": {
- "text/plain": "['mathord_0',\n 'mathord_1',\n 'mathord_2',\n 'mathord_0',\n 'mathord_1',\n 'mathord_0',\n 'mathord_1',\n 'mathord_0',\n 'mathord_1',\n '\\\\bigtriangleup',\n 'mathord_0',\n 'mathord_1',\n 'mathord_2',\n 'mathord_0',\n 'mathord_0',\n 'mathord_0',\n 'mathord_0',\n 'mathord_0',\n 'mathord_0',\n 'mathord_0',\n ',',\n 'mathord_0',\n 'mathord_0',\n ',',\n 'mathord_0',\n 'mathord_0',\n 'mathord_0',\n 'mathord_0',\n 'textord',\n '_',\n ',',\n 'mathord_0',\n 'textord',\n '_',\n ',',\n 'mathord_0',\n 'textord',\n '_']"
- },
- "execution_count": 16,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "cell_type": "markdown",
+ "source": [
+ "- 语法树展示:"
],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 109,
"source": [
- "sif4sci(\n",
+ "f = sif4sci(\n",
" item[\"stem\"],\n",
" figures=figures,\n",
" tokenization=True,\n",
" tokenization_params={\n",
" \"formula_params\":{\n",
" \"method\": \"ast\",\n",
- " \"return_type\": \"list\",\n",
+ " \"return_type\": \"ast\",\n",
" \"ord2token\": True,\n",
- " \"var_numbering\": True\n",
+ " \"var_numbering\": True,\n",
" }\n",
" }\n",
- ").formula_tokens"
+ ").formula_tokens\n",
+ "f\n"
],
- "metadata": {
- "collapsed": false,
- "pycharm": {
- "name": "#%%\n"
- }
- }
- },
- {
- "cell_type": "code",
- "execution_count": 17,
"outputs": [
{
+ "output_type": "execute_result",
"data": {
- "text/plain": "[,\n ,\n ,\n ,\n ,\n ,\n ,\n ,\n ,\n ]"
+ "text/plain": [
+ "[,\n",
+ " ,\n",
+ " ,\n",
+ " ,\n",
+ " ,\n",
+ " ,\n",
+ " ,\n",
+ " ,\n",
+ " ,\n",
+ " ]"
+ ]
},
- "execution_count": 17,
"metadata": {},
- "output_type": "execute_result"
+ "execution_count": 109
}
],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 110,
+ "source": [
+ "for i in range(0, len(f)):\n",
+ " ForestPlotter().export(\n",
+ " f[i], root_list=[node for node in f[i]],\n",
+ " )\n",
+ "# plt.show()\n"
+ ],
+ "outputs": [],
+ "metadata": {}
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "(3)如果您只是关心公式的结构和类型,并不关心变量具体是什么,比如二元二次方程 `x^2 + y = 1` ,它从公式结构和类型上来说,和 `w^2 + z = 1` 没有区别。 \n",
+ "此时,您可以设置如下参数:`ord2token = True`,将公式变量名转换成 token"
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
"source": [
"sif4sci(\n",
" item[\"stem\"],\n",
@@ -517,13 +808,61 @@
" tokenization_params={\n",
" \"formula_params\":{\n",
" \"method\": \"ast\",\n",
- " \"return_type\": \"formula\",\n",
+ " \"return_type\": \"list\",\n",
" \"ord2token\": True,\n",
- " \"var_numbering\": True\n",
" }\n",
" }\n",
").formula_tokens"
],
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "['mathord',\n",
+ " 'mathord',\n",
+ " 'mathord',\n",
+ " 'mathord',\n",
+ " 'mathord',\n",
+ " 'mathord',\n",
+ " 'mathord',\n",
+ " 'mathord',\n",
+ " 'mathord',\n",
+ " '\\\\bigtriangleup',\n",
+ " 'mathord',\n",
+ " 'mathord',\n",
+ " 'mathord',\n",
+ " 'mathord',\n",
+ " 'mathord',\n",
+ " 'mathord',\n",
+ " 'mathord',\n",
+ " 'mathord',\n",
+ " 'mathord',\n",
+ " 'mathord',\n",
+ " ',',\n",
+ " 'mathord',\n",
+ " 'mathord',\n",
+ " ',',\n",
+ " 'mathord',\n",
+ " 'mathord',\n",
+ " 'mathord',\n",
+ " 'mathord',\n",
+ " 'textord',\n",
+ " '\\\\supsub',\n",
+ " ',',\n",
+ " 'mathord',\n",
+ " 'textord',\n",
+ " '\\\\supsub',\n",
+ " ',',\n",
+ " 'mathord',\n",
+ " 'textord',\n",
+ " '\\\\supsub']"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 40
+ }
+ ],
"metadata": {
"collapsed": false,
"pycharm": {
@@ -532,18 +871,15 @@
}
},
{
- "cell_type": "code",
- "execution_count": 18,
- "outputs": [
- {
- "data": {
- "text/plain": "[,\n ,\n ,\n ,\n ,\n ,\n ,\n ,\n ,\n ]"
- },
- "execution_count": 18,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "cell_type": "markdown",
+ "source": [
+ "(4) 如果您在 (3) 所提供功能的基础上,还需要区分不同的变量,可以另外设置参数:`var_numbering=True`,此时不同的变量会带有不同的编号(例如 `mathord_0`、`mathord_1`)。"
],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
"source": [
"sif4sci(\n",
" item[\"stem\"],\n",
@@ -552,14 +888,61 @@
" tokenization_params={\n",
" \"formula_params\":{\n",
" \"method\": \"ast\",\n",
- " \"return_type\": \"ast\",\n",
" \"ord2token\": True,\n",
+ " \"return_type\": \"list\",\n",
" \"var_numbering\": True\n",
" }\n",
" }\n",
- ").formula_tokens\n",
- "\n",
- "#### Figure"
+ ").formula_tokens"
+ ],
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "['mathord_0',\n",
+ " 'mathord_1',\n",
+ " 'mathord_2',\n",
+ " 'mathord_1',\n",
+ " 'mathord_2',\n",
+ " 'mathord_0',\n",
+ " 'mathord_1',\n",
+ " 'mathord_0',\n",
+ " 'mathord_2',\n",
+ " '\\\\bigtriangleup',\n",
+ " 'mathord_0',\n",
+ " 'mathord_1',\n",
+ " 'mathord_2',\n",
+ " 'mathord_3',\n",
+ " 'mathord_3',\n",
+ " 'mathord_3',\n",
+ " 'mathord_3',\n",
+ " 'mathord_3',\n",
+ " 'mathord_3',\n",
+ " 'mathord_3',\n",
+ " ',',\n",
+ " 'mathord_3',\n",
+ " 'mathord_3',\n",
+ " ',',\n",
+ " 'mathord_3',\n",
+ " 'mathord_3',\n",
+ " 'mathord_3',\n",
+ " 'mathord_4',\n",
+ " 'textord',\n",
+ " '\\\\supsub',\n",
+ " ',',\n",
+ " 'mathord_4',\n",
+ " 'textord',\n",
+ " '\\\\supsub',\n",
+ " ',',\n",
+ " 'mathord_4',\n",
+ " 'textord',\n",
+ " '\\\\supsub']"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 44
+ }
],
"metadata": {
"collapsed": false,
@@ -571,9 +954,9 @@
{
"cell_type": "markdown",
"source": [
- "## Downstream tasks\n",
+ "## 综合训练\n",
"\n",
- "### Word to vector"
+ "综合上述方法,将题目转换成令牌序列,为后续向量化做准备。"
],
"metadata": {
"collapsed": false,
@@ -584,20 +967,23 @@
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 96,
+ "source": [
+ "sif4sci(item[\"stem\"], figures=figures, tokenization=True,\n",
+ " symbol=\"fgm\")"
+ ],
"outputs": [
{
+ "output_type": "execute_result",
"data": {
- "text/plain": "['如图', '古希腊', '数学家', '希波', '克拉底', '研究', '几何图形', '此图', '三个', '半圆', '三个', '半圆', '直径', '直角三角形', '[FORMULA]', '斜边', '[FORMULA]', '直角', '[FORMULA]', '[FORMULA]', '[FORMULA]', '三边', '围成', '区域', '记', '[FORMULA]', '黑色', '记', '[FORMULA]', '其余部分', '记', '[FORMULA]', '图形', '中', '随机', '取', '一点', '此点', '取自', '[FORMULA]', '概率', '记', '[FORMULA]', '[MARK]', '[FIGURE]']"
+ "text/plain": [
+ "['如图', '古希腊', '数学家', '希波', '克拉底', '研究', '几何图形', '此图', '三个', '半圆', '三个', '半圆', '直径', '直角三角形', '[FORMULA]', '斜边', '[FORMULA]', '直角', '[FORMULA]', '[FORMULA]', '[FORMULA]', '三边', '围成', '区域', '记', '[FORMULA]', '黑色', '记', '[FORMULA]', '其余部分', '记', '[FORMULA]', '图形', '中', '随机', '取', '一点', '此点', '取自', '[FORMULA]', '概率', '记', '[FORMULA]', '[MARK]', '[FIGURE]']"
+ ]
},
- "execution_count": 19,
"metadata": {},
- "output_type": "execute_result"
+ "execution_count": 96
}
],
- "source": [
- "sif4sci(item[\"stem\"], figures=figures, tokenization=True, symbol=\"fgm\")"
- ],
"metadata": {
"collapsed": false,
"pycharm": {
@@ -608,23 +994,25 @@
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
+ "name": "python3",
+ "display_name": "Python 3.8.5 64-bit"
},
"language_info": {
+ "name": "python",
+ "version": "3.8.5",
+ "mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
- "version": 2
+ "version": 3
},
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
+ "pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
- "pygments_lexer": "ipython2",
- "version": "2.7.6"
+ "file_extension": ".py"
+ },
+ "interpreter": {
+ "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a"
}
},
"nbformat": 4,
- "nbformat_minor": 0
+ "nbformat_minor": 2
}