From 654df05f9df1b543684a8e23263af01772c0b945 Mon Sep 17 00:00:00 2001 From: QinLonghu <1097824882@qq.com> Date: Tue, 31 Aug 2021 14:08:53 +0800 Subject: [PATCH 1/9] [FEATURE] modify is_sif,to_sif,sif4sci --- EduNLP/SIF/parser/parser.py | 5 +++-- EduNLP/SIF/sif.py | 35 +++++++++++++++++++---------------- tests/test_sif/test_sif.py | 2 +- 3 files changed, 23 insertions(+), 19 deletions(-) diff --git a/EduNLP/SIF/parser/parser.py b/EduNLP/SIF/parser/parser.py index db290946..bab1b3b9 100644 --- a/EduNLP/SIF/parser/parser.py +++ b/EduNLP/SIF/parser/parser.py @@ -2,7 +2,7 @@ class Parser: - def __init__(self, data): + def __init__(self, data, check_formula=True): self.lookahead = 0 self.head = 0 self.text = data @@ -13,6 +13,7 @@ def __init__(self, data): self.warnning = 0 self.fomula_illegal_flag = 0 self.fomula_illegal_message = '' + self.check_formula = check_formula # 定义特殊变量 self.len_bracket = len('$\\SIFChoice$') @@ -231,7 +232,7 @@ def get_token(self): self.call_error() return self.error # 检查latex公式的完整性和可解析性 - if not self._is_formula_legal(self.text[formula_start:self.head]): + if self.check_formula and not self._is_formula_legal(self.text[formula_start:self.head]): self.call_error() return self.error self.head += 1 diff --git a/EduNLP/SIF/sif.py b/EduNLP/SIF/sif.py index af4fa63a..f075710f 100644 --- a/EduNLP/SIF/sif.py +++ b/EduNLP/SIF/sif.py @@ -10,39 +10,43 @@ __all__ = ["is_sif", "to_sif", "sif4sci"] -def is_sif(item): +def is_sif(item, check_formula=True): r""" Parameters ---------- item + check_formula: bool + True if check the validity of formulas in items + False if not check the validity of formulas in items, which is faster Returns ------- when item can not be parsed correctly, raise Error; - when item doesn't need to be modified, return Ture; - when item needs to be modified, return False; + when item doesn't need to be modified, return Ture and original item ; + when item needs to be modified, return False and modified item; Examples -------- >>> 
text = '若$x,y$满足约束条件' \ ... '$\\left\\{\\begin{array}{c}2 x+y-2 \\leq 0 \\\\ x-y-1 \\geq 0 \\\\ y+1 \\geq 0\\end{array}\\right.$,' \ ... '则$z=x+7 y$的最大值$\\SIFUnderline$' - >>> is_sif(text) + >>> flag, _ = is_sif(text) + >>> print(flag) True >>> text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...' >>> is_sif(text) - False + (False, '某校一个课外学习小组为研究某作物的发芽率$y$和温度$x$(单位...') """ - item_parser = Parser(item) + item_parser = Parser(item, check_formula) item_parser.description_list() if item_parser.fomula_illegal_flag: raise ValueError(item_parser.fomula_illegal_message) if item_parser.error_flag == 0 and item_parser.modify_flag == 0: - return True - return False + return True, item + return False, item_parser.text -def to_sif(item): +def to_sif(item, check_formula=True): r""" Parameters ---------- @@ -59,14 +63,12 @@ def to_sif(item): >>> siftext '某校一个课外学习小组为研究某作物的发芽率$y$和温度$x$(单位...' """ - item_parser = Parser(item) - item_parser.description_list() - item = item_parser.text - return item + _, sif_item = is_sif(item, check_formula) + return sif_item def sif4sci(item: str, figures: (dict, bool) = None, safe=True, symbol: str = None, tokenization=True, - tokenization_params=None, errors="raise"): + tokenization_params=None, errors="raise", check_formula=True): r""" Default to use linear Tokenizer, change the tokenizer by specifying tokenization_params @@ -189,8 +191,9 @@ def sif4sci(item: str, figures: (dict, bool) = None, safe=True, symbol: str = No [['已知'], ['说法', '中', '正确']] """ try: - if safe is True and is_sif(item) is not True: - item = to_sif(item) + if safe is True: + flag, sif_item = is_sif(item) + item = sif_item if flag is not True else item ret = seg(item, figures, symbol) diff --git a/tests/test_sif/test_sif.py b/tests/test_sif/test_sif.py index 210441d2..0f1febce 100644 --- a/tests/test_sif/test_sif.py +++ b/tests/test_sif/test_sif.py @@ -11,7 +11,7 @@ def test_is_sif(): text = '若$x,y$满足约束条件' \ '$\\left\\{\\begin{array}{c}2 x+y-2 \\leq 0 \\\\ x-y-1 \\geq 0 \\\\ y+1 \\geq 
0\\end{array}\\right.$,' \ '则$z=x+7 y$的最大值$\\SIFUnderline$' - assert is_sif(text) == 1 + assert is_sif(text)[0] == 1 text = '公式需要满足完整性,完整的公式如' \ '$\\begin{matrix} a & b \\\\ c & d \\end{matrix}$' \ From 0a06e06ff13c41c545829563ed798e10a102e51c Mon Sep 17 00:00:00 2001 From: QinLonghu <1097824882@qq.com> Date: Tue, 31 Aug 2021 14:18:34 +0800 Subject: [PATCH 2/9] [FEATURE]add params in sif to decided whether to check formulas --- EduNLP/SIF/sif.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/EduNLP/SIF/sif.py b/EduNLP/SIF/sif.py index f075710f..dd5c40a8 100644 --- a/EduNLP/SIF/sif.py +++ b/EduNLP/SIF/sif.py @@ -192,7 +192,7 @@ def sif4sci(item: str, figures: (dict, bool) = None, safe=True, symbol: str = No """ try: if safe is True: - flag, sif_item = is_sif(item) + flag, sif_item = is_sif(item, check_formula) item = sif_item if flag is not True else item ret = seg(item, figures, symbol) From ef6f7c470458174a66169e8b93747e7fa5d2e7e8 Mon Sep 17 00:00:00 2001 From: QinLonghu <1097824882@qq.com> Date: Fri, 3 Sep 2021 16:53:36 +0800 Subject: [PATCH 3/9] modify examples --- examples/sif/sif.ipynb | 353 ++++++++++++-------------------- examples/sif/sif_addition.ipynb | 208 ++++++++++--------- 2 files changed, 243 insertions(+), 318 deletions(-) diff --git a/examples/sif/sif.ipynb b/examples/sif/sif.ipynb index 2076e126..50de8158 100644 --- a/examples/sif/sif.ipynb +++ b/examples/sif/sif.ipynb @@ -34,12 +34,12 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 1, "source": [ - "item = {\n", - " \"stem\": r\"如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\SIFChoice$$\\FigureID{1}$\",\n", - " \"options\": [\"$p_1=p_2$\", \"$p_1=p_3$\", \"$p_2=p_3$\", \"$p_1=p_2+p_3$\"]\n", - "}\n", + "item = {\r\n", + " \"stem\": r\"如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, 
$AC$.$\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\SIFChoice$$\\FigureID{1}$\",\r\n", + " \"options\": [\"$p_1=p_2$\", \"$p_1=p_3$\", \"$p_2=p_3$\", \"$p_1=p_2+p_3$\"]\r\n", + "}\r\n", "item[\"stem\"]" ], "outputs": [ @@ -51,7 +51,7 @@ ] }, "metadata": {}, - "execution_count": 5 + "execution_count": 1 } ], "metadata": { @@ -70,24 +70,24 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 2, "source": [ - "from PIL import Image\n", - "img = Image.open(\"../../asset/_static/item_figure.png\")\n", - "figures = {\"1\": img}\n", + "from PIL import Image\r\n", + "img = Image.open(\"../../asset/_static/item_figure.png\")\r\n", + "figures = {\"1\": img}\r\n", "img" ], "outputs": [ { "output_type": "execute_result", "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAOYAAACICAYAAADzlJeRAAAe3klEQVR4nO2df1RT9/3/nyE/SWJyBdSUOgs72mKP1XhK24j2CGon4FagbtTMtpZt6rDzR7rjKt+2C+rnHGxdBbr2xB9zgc4j1NMJbpXg2hlsp2DnjrSHVaqcgquWbFqNLUgQkvf3DwgiSSA/7uUm8f04530O3Ny87zu5ed7X+8fr/XoJCCEEFAolrIjhuwEUCsUTKkwKJQyhwqRQwhAR3w2IBJqamuBwOHy+rtFokJKSMo4tokQ7VJgAGhoa0NzcDLvdjn//+99oa2uDw+FAa2srAIBhmFHf73K58O233wIAYmNj8cADDwAAZs+ejblz50Kr1UKn00Emk3H6OSjRg+Bum5Vtbm5Gc3MzrFYrPvnkE7S2toJhGPT09KC3t5f160mlUsTGxqKrqwsMw2D69OlYunQpdDodMjMzWb8eJTqIemHa7XbU19fjT3/6E/72t79BoVDA6XSiq6uL13YxDIOuri48/vjjyM/PR2ZmJpKSknhtEyV8iEph2mw21NbW4tChQzh16hSkUulQVzMckcvliImJgUwmQ3Z2NgwGA7RaLd/NovBIVAmzuroar732Gs6fPw8AuHnzJs8tCg61Wg2FQoFXX30VK1asGHOMS4k+Il6YNpsNf/jDH7Bz504ACGvLGChyuRxOpxOLFy/G+vXr6Zj0LiJihdnU1IQ33ngDdXV1cLlcoy5nRAMMw0AkEmHnzp14/vnn+W4OhWMiTpjNzc0oLCxEa2sr7HY7380Zd5RKJWQyGRVolBMxwrTZbFi/fj0++OAD3Lhxg+/m8A4VaHQT9sK02+347W9/i8rKyqgaP7KFUqmESqXCn//8Z+h0Or6bQ2GJsPaV3bdvH5KTk7Fv3z4qSh90dXXh66+/RlZWFjZs2HBXdu+jEhKGdHZ2Ep1OR9RqNQFAi59FJpORSZMmEYvFwuLdqCNlZRdYrI/iD2FnMfft24dZs2ahqamJjiUDxOFw4MqVK9Dr9XjqqadYsJ5tKJ+fjUNsNI4SGHw/Gdxcv36d5OXlUSvJUhGLxWTSpEmksbEx6HtyoSyNACBr6li80RS/CAuL2dDQgPvvvx/vv/8+tZIs0dfXhytXruBHP/oRSkpKAq+grRw78QrK0thvG2
VseJ+VLS0txbZt2+ikBYeoVCrMmzcP1dXVfrr3WbB2/nlsPnk/dgr+Dw9eOImN07lrX0NDA4Db+17/+c9/4uuvv/Z67uzZs5GcnAwASEpKuqNEFXya65/97GeEYRjeu313QxGLxSQ5OZmcO3duzPtSt2YNqSOEkAtlJA2Df7NAT08PsVqtxGg0Ep1OR2JjYwkAwjAMYRiGiESigD6TUqkkDMMQpVJJABCtVkuys7OJ0WgMqQsfDvAizOvXr5NHH3106MbQMn4lLi6OHDlyxOe9cY8rb5fQhHnu3Dny8ssvk+TkZCISicbtQcwwDImNjSUZGRnEZDKR9vb2ED7F+DPuwmxvbycpKSlEIpHw/iO9W8uUKVPIO++843lzLpSRNcOXRurWEKSVkUAXSxobG0lhYSFJSEggKpUqYEvIdpHL5USpVJLExERiMpnI9evXQ/kJjwvjKsyzZ8+SxMRE3n+YtIBMnDiRvP3227dvzoUykjZChBfK0vwWZk9PDzGbzSQxMTGshydyuZwoFAqSnZ3N8novu4ybMM+ePUs0Gg3vN4aW2yUuLm5InAPH0siAwbxAytKGn+u7O9vZ2Uk2bdpEJkyYMDTWi5TCMAxJSEggZrN5nFTgP+MizPb2dmopw7TExcWRjRs3BnxPOzs7ycqVK4lcLue9qxpqUSqVYSdQzoXZ3t5OkpKSeP/yafFdGIbxW5w9PT3EaDSS+Ph4IhaLeW87m8Ut0JqaGk414Q+cCpOKMnIKwzBk165do97PqqoqotFoiEKh4L29XBa1Wk10Oh2vM7mcCfP69eskJSWF9y+ZFv/LlClTvC6luDcVqFQq3ts4noVhGGI0GklPTw9XMvEJJ54/DocDjzzyCFpaWtiuOuqRy+WQSCQecW6Tk5OhVqvvODcnJwcA8NFHH+G///0vrl27ht7eXvT19QUdnnPKlCloaGgYiixfX1+PZ555Bt98802QnyiyUSgUiI+Px5EjR8Y1ciEnwnzqqafw/vvvo6+vj+2qowaGYYYiuLtFl5OTM5RuQavVshodr6GhAXa7Hc3Nzaivr4fNZsNXX30FAIiJiUF/f//QuSqVCrt27cJf/vIXfPTRR9RdEkB8fDxKSkqwevXqcbke68Kkvq+eiEQiKJVKdHV1ITU1FUuXLkV6enpY5Dxxi7WjowN//etfceLECXz33XdwuVx3iJUyEFZ00aJFOHjwIPfpLtjsFzc2NpKEhATexwZ8F7frmUgkIjqdjpSUlESU7+bhw4eJRqMharWayOVyIpVKSUxMDO/fazgUsVhMZsyY4ZfPcSiwJszOzk4ybdo03r84vopMJiNKpZJotdqIE+JwTCYTSU9PJ+3t7aSqqoqsWLGCnDt3jphMJpKZmUkkEknULZMEU6ZMmULOnj3L2X1gRZg9PT0kNTWV9y+Lj6JWq0liYiLZvn17xDlKD6enp4esWLGC/PKXvyQ9PT3k448/Jh9//DHZtGkTKS0tveM8s9k8tDtEKBTyfg/4KhqNhrMHMCvCNBqNUb+2NVKMKpWKPPfcc5w+NceLc+fOEa1WS6qqqoYE6S49PT0kMzOTWK1Wj/e1t7eT7du3k8TERCKXy3m/L3yUhISEUXfrBEvIwmxvbw9rp2U2i1KpJNnZ2WHhGcIWVVVVRKfTkXPnznmI0l06OzuJVqslnZ2dPus5e/Ysee6554hSqSRSqZT3ezWexdf6byiEPCs7b948NDU1hVJFWCMSiSCXy5Gbm4utW7dGzU55h8OBoqIi2Gw2mEymMdecRSIRioqKYLFYRp2RdDgceOutt1BSUoLu7m5Oco6GI1OmTMHevXvx5JNPslJfSMLct28fNm/eHJVxemQyGWQyGX7yk59g27Zt0Gg0fDeJNWw2G/R6PXJycpCamur3+86cOYOLFy+itLR0zHMdDgcqKirw6quvoru7Gz09PaE0OSKIj4/Hhx9+yIojQtDCtNlsmDt3Lmw2W8iNCCcUCgViYmKwbt06bNmyJe
pS4NXX18NgMMBsNge1Trl//34sXLgwoLQMFRUV2Lx5M3p6etDd3R3wNSOJpKQkWK3WkHtWQUfJW7duXVS5aYnFYjAMg+LiYvzvf//Djh07ok6UxcXFKC8vR2NjY9DOAyaTCZWVlQENX55//nlcuXIFBw4cQHJyMmJjY4O6diTQ0dGBH/zgByFnnwvKYjY3NyM9PT1qurAMw2DZsmV46623ok6MwIB3T15eHhYuXIglS5aEXN/UqVORl5cHq9Ua1Pf1u9/9Djt27IDdbofT6Qy5PeGGRCKBVqvF6dOng64jKGEuWrQIVqs16IuGCxMmTMC9994Ls9kctQl5mpqaUFBQAJPJBJFIxFq9/f392Lp1a9C/A5vNhnXr1uH48eNR84AfjkqlwpYtW1BUVBTU+wMWZjRYS7FYDIVCgW3btmH9+vV8N4czysrKcOTIEVRVVaGtrY31+v/xj3/gxo0bwQWUHqShoQErV67E9evXo26CKJTJoIDHmC+++GJEi5JhGOTn56O9vT1qRWm326HX63Hx4kVYLBZORAkACxYsQGtrK2pra4OuIz09HZcvX8ZvfvMbxMfHs9e4MOCbb75BTk5OUOPNgCxmJFtLsViMyZMn47333ovabisAtLa2oqCgABs3bsTUqVM5v96sWbOQlZUFs9kc8k4Zm82GvLw8fPbZZ7h58yZLLeQXsViM/Px8HDhwIKD3BWQxI9VaMgyDRYsWoaWlJapFWVFRAb1eD7PZPC6iBICWlhaYzWbo9fqQZyI1Gg0aGxuxefPmqJmE6+vrw9GjR1FfXx/Q+/y2mE1NTcjKyoq4fZZxcXF45ZVXYDAY+G4KZzgcDhgMBjgcDphMJpw5c2bc23D16lVUVlaipqaGlfqamprw5JNP4sqVK6zUxzfJycn48ssv/T7fb4tpMpkiSpRisRjTpk3DyZMno1qUHR0dyMjIwJw5c/Dzn/+cF1ECQEJCAlJSUrBjxw5W6tPpdDh//jx0Oh3kcjkrdfLJ1atX8fvf/97v8/0SpsPhwKFDkZO+dOLEifjhD3+IL774gvcIAVxSW1uLvLw8mEwmzJo1i+/mYNmyZTh27NhQ9q5QYRgmarq23333HYqLi/3u7vslzOrqalbXwLgkISEB5eXlOHz4MPfhH3ikqKgIlZWVsFqtQQfe4oKamhoYDAZ0dHSwVmdxcTEsFgsSExNZq5MPent7/e5R+DXGnDlzJlpbW0NuGNdoNBpUVVUhPT2d76ZwhtsBfenSpViwYAHfzfGKvztRAqWjowNZWVkR8Vv0hUqlwhdffDHmpogxLWZra6vPJKLhRGJiIiwWS1SL0j0BZzQaw1aUwIBX0KpVq1BYWMhqvUlJSWhsbERqamrE9OBG4nQ6/Vo6GVOYe/bsCfs1pXvvvRcnT54c17if482OHTtgMBhgsVgi4kc5ffp0yGQy7N69m9V6GYbBxx9/jLS0tIh0hu/u7kZJScmYY80xhXngwIGwDWMokUiQkpKClpaWqNnAPBK3A/qNGzewc+dOzrx4uKC0tBTvvvsu6xvpZTIZTpw4gSeeeMIjCHYk4HQ6x1zXHHWM2dHRgYceeiisJhfcyOVyzJkzB3V1dRE/Y+eL5uZmFBQUwGg0IiEhge/mBMXUqVOh1+tRU1PDyWbzTZs2oaKiIuIcX8Za1xzVYtbX18PlcrHeqFCRSCSYM2cOjh8/HrWi3L17NwoLC1FTUxOxogSAS5cuoaSkBAUFBSF7BnmjrKwMRqMREydOZL1uLrl27Rqam5t9vj6qMA8dOhSW48v7778fx48fj8rlEIfDgYKCAnz66aewWq24dOkS300KGZFIhKVLlwa9BWosDAYDnnvuuYjq1nZ3d6OystLn6z67sg6HAxMnTuTkKRcKKSkpaGxsjEpL6XZAX7t2LaZPn853c1jn7bffRk5ODlasWMFJ/Xq9HrW1tWH3m/VFQkKCT5dDnxazqakp7CzStGnTgt41H+7U1tYObWiORlECgNlsxm
uvvcbZOmRVVRUeffTRiJi1BgaWlXxNjPkU5ocffhhWkz6JiYk4ceJEVEWrc2MwGFBZWQmLxRJW3znbnDlzBjU1NdDr9Zz5XR87dgyPPPIIxGIxJ/WzSVdXF959912vr/nsyoZT+BCNRgOLxRJ165Tu/YdPP/10QGEkIx22d6KMxG63Y968eRHhIeSrO+vTYv7rX//itEH+kpCQgJqamqgTZUNDAzIyMlBaWnpXiRIYuKdz5sxBcXExJ/UzDAOr1Ypp06ZxUj+bOBwOr37FXoVpt9vDIoK2SqXCiy++GHWbm4uLi7F169aQwkhGOkuWLMHp06cD3kDsLxqNBseOHUNcXBwn9bOJt904XoXZ3NzMu7uTWCyGVqvlbIqdD+x2O7KysgAA27dvHzMtQbRTVVWFoqIiVneiDMe9P1SlUnFSPxt0dXXhk08+8TjuU5h8RyybPHkyjhw5wmsb2KSpqQkZGRnYuHEjK7Fdo4GWlhZUVVUhLy+PsyWO1atXY/HixWE9U+ut1+BVmGfPnuW1K5uQkID33nsvapZFdu/ejaKiItTU1ECpVPLdnLDi6tWreOmll1BQUMDZNQ4ePBjWezm/+uorjweTV2F+9tln49Igb0TTuNLhcECv1+PTTz+FxWKJCi8eLpg6dSo0Gg3Kyso4qV8mk4X1eFOpVHq453kVJl8/ILFYjMWLF0fFuLK1tRXz5s1DTk4OVq5cyVssnkihpKSE1bAkIwnn8abL5fIYZ/ucleWD+Ph4HDx4kJdrs0l1dTX0ej2qqqrGLYxkpHPmzBmYzWYYDAbOMsi5x5tCoZCT+oOlt7fX4zPfIUzi6se3Xd9BoIoF1ApgHD3yGIaB2WwOOzfAQHA4HCgsLMSRI0fQ2NiIq1ev8t2kiKKtrQ0mk4mVGLW++OMf/xh2Xdre3l6cP3/+jmN3CFMQI8K1K/+DTCwGbnYD4+QLLBaL8dhjjyEzM3N8LsgB7ng0c+bMwQsvvEC7rkHS39+PnJwczoYzDMPgjTfeCLsu7UgvJY+ubMfFi4iJCTptZlCoVCpUVFSM6zXZpL6+Hnl5eSgpKQmLMJKRTmpqKux2O2e/iWeffRbf//73Oak7WDo7O+/432Nxx27/dvAvMaCWDPzpugV81wfIZIBUCPQOWlOxGJAPntM7zMIOPw4ncGOY6R3x2gSXGEaj0dM5nThxy+kCAQBBDCRCIQQjj0MAsUiEGBD09/fDCcHA+MHZDycAxIggFbhunx8jgjRGENAXNhbFxcU4ffo0rFbrXe8wwCYmkwlZWVlISUnhZIbebDZj8eLFuHbtGut1B8PIYY+HabTbrw9GLegDbt4C4BwQJQA4nED/rUEBigfGoDe6gRu3AKkMEA8elwsHusI3uoF+IaBwe/p7vjbpnslesm4R9DsJYoRiSEUiCOGCk4w8LoY4hqDPRQAIIBLGQAACp9OJGJEYUmEMBK5+9LoA8dD/TrAVj8FutyMjIwMA8PLLL1NRsox7MqiwsJCTyUitVouVK1eG7S6U0fusfX1AL24LSyYEnH23/46RDEwSqSUAhIBw8LjLCQyehlu3AJHw9nuGvaaWSlFd9a6nWIgTzhghRAJgQHTigb+JE06BAMJBoxcjGCk2AYRC0bAPJYAwZtDSDuIKOE2vJ01NTZg3bx6MRiP14uGQS5cuobS0FHl5eZzU//rrr2PChAmc1B0qvoXpFpbDOSAsMQCh884Jof5bgxZzsLhfixllOnrwNZFIhPzlyzFn9hzv5xF3d3XkceL9+DhRVlYGg8EAq9Ua1m5e0QKXYUlkMhn2798flt5YQ8J0ufrQ6yIDyxXEOeyUQaspV9y2lsCgYCXDllQGu7YOJwDh7eNCIdDvvP2ewddUKhXeLC8FiRF6eToIICDu7isAkAFLJxBCCIL+IbNHfLyffdxhJC9evBhxYSQjHTYS5PoiNzc3LCfshn7TAggAVz/iJ09GTP+ICRKHE8AIa4k+oNcJSBWD3Vnh4OuDY1P3cb
ET6O67/Z6bt6BQxWHz/9sCgUQGydBkDEF/fx/6BwUoEQrgdPaht78Pvf1OYKhbGwO4+geOuwSD7yfod7pAQOB09sM15v+B0dzcjKysLKxatQrLly8P8N0UNuAyLInJZAo7q+kRwaChoQF5eXkjBtxiQNbH2rpmXFwcLl++HBHOBBUVFdizZw/MZjN1GOCZhIQE6PV6NDY2sv7b4Ttix8hIBv71AmVCwDn2af6gUChQVFQU9qJ0e/GcOHECVquVijIMuHr1KoxGI/R6Pet179q1i1erOdJ100OYdwhGMdgdheP2ZFCISKVS/OpXv2KnMo4Il2SwFE/YTpDrRqvVYtKkSazWGQgjDZWHMDUaze1wF90jZltDJBKsZbglg6V4wnaCXDe7du3iLXv1mBYzKSmJsxCK4W4twzUZLMUTLhLk5ubmYsqUKazVFwgjrbXXMSYXDr7hbC1tNhvmzZsHtVqNX//619SLJwJoaWmByWRiPScKH1ZTLpdj9uzZdxzzKsz4+HjWLy4UCvHMM8+wXm+oNDQ0ICsrC6WlpWGdDJbiCRcJcnNzczF58mTW6vMHiUTiEZ7VqzC5MOcPP/xw2EVR37FjB7Zu3QqLxXLXhpGMdLhIkPvSSy+Nq1dXT08PUlJS7jjmVZhs77pXq9XYsGEDq3WGwvBksNu3b6dePBEO2wlyc3NzIZFIxj6RJaRSqUfgOa/CfOCBB1h9YgiFwrDZBN3c3IyMjAysWrUKy5Yt47s5FBZgOyyJRqPBY489xkLL/OPBBx/0OOZVmAsWLGBtsVUkEmH58uVhMekTLclgKZ6wnSB3w4YNkEqlLLRsdEQiERYvXuxx3KswdTodawGfFQoF1q1bx0pdwRKNyWApnrC5EyUzM3NcjIlSqfS6ddCrMBmGYW1m9p577uE1IZA7jOTChQtpGMm7gNTUVNhsNlRXV4dUj0wmQ05ODkut8o3D4fAaocGnrywbYpLL5di4cWPI9QRLdXU1CgoKUFVVFbXJYCmesLUTpbCwkHOrqdVqvV7DpzCfeOKJkPvYTqeTs7Teo+FwOGAwGHDkyBFYLBbqgH6XwVaCXJ1Ox2mEA6lUiqefftrraz6FqdPpQs74NXPmzHHPP2Kz2ZCVlYX77rsPL7zwAvXiuUu5dOkSjEZjyDlR1qxZw1KLPJFKpT5XK0btyjqdwe/1EolE474c4U4GW1JSctclg6V4wkaC3CVLlnAWsCs+Pt7DscCNT2HKZDI8/vjjQV/U12wTV9BksBRvhJogV6fTcZJSYazVilE3Sj/77LNBO7Q7nc5xydhFk8FSxiKUBLkymQxz585lv1EAfvGLX/h8bVRhZmZmBt2dTU1N5XxGyx1GkiaDpYxGqAly8/PzIRCwGyh84cKFo86/jCpMhmHw0EMPBXxRqVSKJ598MuD3BUJZWRmKiopgtVrDLpASJfwIJUFueno6q+NMhmG8BDm/kzFj/qxatSrg/WmjzTaFit1uh16vx8WLF2GxWKgDOsVvgk2Qy7aDzMSJE8fUx5jCzM3NDfjCEonE52xTKLS2tiIrKws5OTlYvnw59eKhBEywCXLZGmeq1Wrs2rVrzPPGFGYwnvaPPvpoQOf7gzsZrNlspslgKUET7E6UH//4x6xcPy4uzi9j51f4yg0bNkCtVvt1YZFIFNIyy0hoMlgK2wSTIHfJkiUhp6f011oCXgI+++Lee+/F119/7dfFKyoqguoCj6SjowMFBQV4+umnacQ6CuucOXMGFy9eRGlp6ZjnOhwOyOVy+CkXryQnJ+PLL7/061y/HwEGgwEKhWLM8wghrIwvaTJYCtcEkiBXJpOFtPynVqtx+PBhv8/3W5ijLYYO5+bNmyELs6ioCOXl5bBardSLh8IpJpMJlZWVfoUlCTZmlUgkQn5+fkCzu34Lk2EY/PSnPx0z5Mj3vvc9vy8+EpvNhoyMDKjVapoMljIuBJIgN9iZWZVKhTfffDOg9wQ0ml23bt2Y3dmZM2cG1AA3TU
1NyMrKgtFopGEkKeOKvwlytVptwB5AarUa+/fvD7gbHJAwtVot8vPzfVrNYGdk3clgLRYLTQZL4QV/wpI88sgjAdUpFouxaNGioCZCA57/3bZtm09PIIVCEdD4kiaDpYQTYyXITUlJCchi3nPPPTh48GBQbQlYmBqNBqtWrfJqmgOZkaXJYCnhyGhhSZKSkvxeLomLi8OxY8eCnskNasX09ddf92o1XS6XXw2pqKhAYWEhzGYzDSNJCStaWlpgNpt9Oh/4EwharVZjx44dIa1OBCVMmUyGoqIij4kgh8Mx6pSyO4wkTQZLCWdGS5A7Vhws97hy9erVIbXBb8+fkTgcDsyePRsXLly447iv6lpbW1FQUIC1a9fSiHWUiODo0aNQq9XYsmXL0LH4+Hhcu3bN6/kSiQQPP/wwjh8/HvJe5KCd/2QyGd555507Nnv6inZQW1uLgoICmEwmKkpKxOAtQe5oET1mz57NiiiBEIQJDMRDefbZZ4ca4q3/bTAYUFlZCYvFQpPBUiKOkQlyfW2YTklJwQcffMBa1I7Q3OUxMBHk9vYZvh3LnQz2vvvuo8lgKRHLyAS53nZZpaSkoLGxkdVQrSELUyaT4dChQ3eY+OHJYGkYSUqkMzxB7vAJT3dAALZFCbAgTGDAIygtLQ0zZsygyWApUYk7Qa5UKoVQKERsbCy0Wi0nogQCEKZlrQACwYgyvxxuX50lS5bAarWitbUVf//736kXDyXqKC0txX/+8x/I5XLk5OTg9OnTnGUaCGi5pK18PmZ8/grIniwAFqwVZGPvmjqQPVk4cOAAtmzZgsuXL7PeSK1Wy8oXkJ6ePvR3RUVFUHFGKRSRSIS+vj5OrxGAMNtQPn8GPn+FYM9AfGVY1gqQjbpBoVIo0U9HRwc6OjrueMgDACxrIcjeO/TvmjqCPViLtdiDYOTh/xiz7SgOnVqD3MGLtJXPR/beNJRtpqKk3D0kJSWNEGUbyucLIMgG6ggBGSy5tQIIslvw4IzgruO3MNuOHsIp7EX24PhyFSpByElspP4ClLuWgV7kJpThAtmD4SYqa08d1mAW7g9SH34Ksw1HD53CmrrBJ8KFMmDTDMwvpxM8lLuXtvJV2HQqDWWVG+Gpvxl4sCwXwfYn/RtjtpVj/ozP8cqwp4JlrQDZe9egbsSTgkK5Oxi0lrO4mWPxy2K2HT2EU2uGq78N51sApD2IILvQFEqEcwGfnwLSgh1EjoEfcTwGu7Gv3JalZe0MbDoFrKnzZsIplLuAtvPg0sl0TIspEAyIcG/2bceC7L1pKLtAgpoGplCigun3YxaAU59f8Pqypfy2800wBL0fk0K56xlcu0wru4CTQ8sTnuv9wUCFSaGEQls55s/YhFNDB9JQdiH0ZUQqTAolDGFldwmFQmEXKkwKJQz5/6GFNae7ZI7lAAAAAElFTkSuQmCC", "text/plain": [ - "" - ], - "image/png": "" + "" + ] }, "metadata": {}, - "execution_count": 6 + "execution_count": 2 } ], "metadata": { @@ -108,11 +108,20 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "source": [ "from EduNLP.SIF import sif4sci, is_sif, to_sif" ], - "outputs": [], + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "D:\\MySoftwares\\Anaconda\\envs\\data\\lib\\site-packages\\gensim\\similarities\\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is 
disabled, because the optional Levenshtein package is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.\n", + " warnings.warn(msg)\n" + ] + } + ], "metadata": { "collapsed": false, "pycharm": { @@ -129,7 +138,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 4, "source": [ "is_sif(item['stem'])" ], @@ -138,11 +147,12 @@ "output_type": "execute_result", "data": { "text/plain": [ - "True" + "(True,\n", + " '如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\\\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\\\SIFChoice$$\\\\FigureID{1}$')" ] }, "metadata": {}, - "execution_count": 7 + "execution_count": 4 } ], "metadata": {} @@ -156,9 +166,9 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 5, "source": [ - "text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...'\n", + "text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...'\r\n", "is_sif(text)" ], "outputs": [ @@ -166,21 +176,21 @@ "output_type": "execute_result", "data": { "text/plain": [ - "False" + "(False, '某校一个课外学习小组为研究某作物的发芽率$y$和温度$x$(单位...')" ] }, "metadata": {}, - "execution_count": 8 + "execution_count": 5 } ], "metadata": {} }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 6, "source": [ - "text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...'\n", - "to_sif(text)\n" + "text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...'\r\n", + "to_sif(text)\r\n" ], "outputs": [ { @@ -191,7 +201,7 @@ ] }, "metadata": {}, - "execution_count": 9 + "execution_count": 6 } ], "metadata": {} @@ -232,9 +242,9 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 7, "source": [ - "segments = sif4sci(item[\"stem\"], figures=figures, tokenization=False)\n", + "segments = sif4sci(item[\"stem\"], figures=figures, tokenization=False)\r\n", "segments" ], "outputs": [ @@ -246,7 +256,7 @@ ] }, "metadata": {}, - "execution_count": 12 + "execution_count": 7 } ], 
"metadata": {} @@ -260,7 +270,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 8, "source": [ "segments.text_segments" ], @@ -283,7 +293,7 @@ ] }, "metadata": {}, - "execution_count": 13 + "execution_count": 8 } ], "metadata": {} @@ -297,9 +307,9 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 9, "source": [ - "segments.formula_segments\n" + "segments.formula_segments\r\n" ], "outputs": [ { @@ -319,7 +329,7 @@ ] }, "metadata": {}, - "execution_count": 15 + "execution_count": 9 } ], "metadata": {} @@ -333,7 +343,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 10, "source": [ "segments.figure_segments" ], @@ -346,14 +356,14 @@ ] }, "metadata": {}, - "execution_count": 16 + "execution_count": 10 } ], "metadata": {} }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 11, "source": [ "segments.figure_segments[0].figure" ], @@ -361,13 +371,13 @@ { "output_type": "execute_result", "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAOYAAACICAYAAADzlJeRAAAe3klEQVR4nO2df1RT9/3/nyE/SWJyBdSUOgs72mKP1XhK24j2CGon4FagbtTMtpZt6rDzR7rjKt+2C+rnHGxdBbr2xB9zgc4j1NMJbpXg2hlsp2DnjrSHVaqcgquWbFqNLUgQkvf3DwgiSSA/7uUm8f04530O3Ny87zu5ed7X+8fr/XoJCCEEFAolrIjhuwEUCsUTKkwKJQyhwqRQwhAR3w2IBJqamuBwOHy+rtFokJKSMo4tokQ7VJgAGhoa0NzcDLvdjn//+99oa2uDw+FAa2srAIBhmFHf73K58O233wIAYmNj8cADDwAAZs+ejblz50Kr1UKn00Emk3H6OSjRg+Bum5Vtbm5Gc3MzrFYrPvnkE7S2toJhGPT09KC3t5f160mlUsTGxqKrqwsMw2D69OlYunQpdDodMjMzWb8eJTqIemHa7XbU19fjT3/6E/72t79BoVDA6XSiq6uL13YxDIOuri48/vjjyM/PR2ZmJpKSknhtEyV8iEph2mw21NbW4tChQzh16hSkUulQVzMckcvliImJgUwmQ3Z2NgwGA7RaLd/NovBIVAmzuroar732Gs6fPw8AuHnzJs8tCg61Wg2FQoFXX30VK1asGHOMS4k+Il6YNpsNf/jDH7Bz504ACGvLGChyuRxOpxOLFy/G+vXr6Zj0LiJihdnU1IQ33ngDdXV1cLlcoy5nRAMMw0AkEmHnzp14/vnn+W4OhWMiTpjNzc0oLCxEa2sr7HY7380Zd5RKJWQyGRVolBMxwrTZbFi/fj0++OAD3Lhxg+/m8A4VaHQT9sK02+347W9/i8rKyqgaP7KFUqmESqXCn//8Z+h0Or6bQ2GJsPaV3bdvH5KTk7Fv3z4qSh90dXXh66+/RlZWFjZs2HBXdu+jEhKGdHZ2Ep1OR9RqNQFAi59FJpORSZMmEYvFwuLdqCNlZRdYrI/iD2FnMfft24dZs2ahqamJjiUDxOFw4MqVK9Dr9XjqqadYsJ5tKJ+fjUNsNI4SGHw/Gdxcv36d5OXlUSvJUhGLxWTSpEmksbEx6HtyoSyNACBr6li80RS/CAuL2dDQgPvvvx/vv/8+tZIs0dfXhytXruBHP/oRSkpKAq+grRw78QrK0thvG2VseJ+VLS0txbZt2+ikBYeoVCrMmzcP1dXVfrr3WbB2/nlsPnk/dgr+Dw9eOImN07lrX0NDA4Db+17/+c9/4uuvv/Z67uzZs5GcnAwASEpKuqNEFXya65/97GeEYRjeu313QxGLxSQ5OZmcO3duzPtSt2YNqSOEkAtlJA2Df7NAT08PsVqtxGg0Ep1OR2JjYwkAwjAMYRiGiESigD6TUqkkDMMQpVJJABCtVkuys7OJ0WgMqQsfDvAizOvXr5NHH3106MbQMn4lLi6OHDlyxOe9cY8rb5fQhHnu3Dny8ssvk+TkZCISicbtQcwwDImNjSUZGRnEZDKR9vb2ED7F+DPuwmxvbycpKSlEIpHw/iO9W8uUKVPIO++843lzLpSRNcOXRurWEKSVkUAXSxobG0lhYSFJSEggKpUqYEvIdpHL5USpVJLExERiMpnI9evXQ/kJjwvjKsyzZ8+SxMRE3n+YtIBMnDiRvP3227dvzoUykjZChBfK0vwWZk9PDzGbzSQxMTGshydyuZwoFAqSnZ3N8novu4ybMM+ePUs0Gg3vN4aW2yUuLm5InAPH0siAwbxAytKGn+u7O9vZ2Uk2bdpEJkyYMDTWi5TCMAxJSEggZrN5nFTgP+MizPb2dmopw7TExcWRjRs3BnxPOzs7ycqVK4lcLue9qxpqUSqVYSdQzoXZ3t5OkpKSeP/yafFdGIbxW5w9PT3EaDSS+Ph4IhaLeW87m8Ut0JqaGk414Q+cCpOKMnIKwzBk165do97PqqoqotFoiEKh4L29XBa1Wk10Oh2vM7mcCfP69eskJSWF9y+ZFv/LlClTvC6luDc
VqFQq3ts4noVhGGI0GklPTw9XMvEJJ54/DocDjzzyCFpaWtiuOuqRy+WQSCQecW6Tk5OhVqvvODcnJwcA8NFHH+G///0vrl27ht7eXvT19QUdnnPKlCloaGgYiixfX1+PZ555Bt98802QnyiyUSgUiI+Px5EjR8Y1ciEnwnzqqafw/vvvo6+vj+2qowaGYYYiuLtFl5OTM5RuQavVshodr6GhAXa7Hc3Nzaivr4fNZsNXX30FAIiJiUF/f//QuSqVCrt27cJf/vIXfPTRR9RdEkB8fDxKSkqwevXqcbke68Kkvq+eiEQiKJVKdHV1ITU1FUuXLkV6enpY5Dxxi7WjowN//etfceLECXz33XdwuVx3iJUyEFZ00aJFOHjwIPfpLtjsFzc2NpKEhATexwZ8F7frmUgkIjqdjpSUlESU7+bhw4eJRqMharWayOVyIpVKSUxMDO/fazgUsVhMZsyY4ZfPcSiwJszOzk4ybdo03r84vopMJiNKpZJotdqIE+JwTCYTSU9PJ+3t7aSqqoqsWLGCnDt3jphMJpKZmUkkEknULZMEU6ZMmULOnj3L2X1gRZg9PT0kNTWV9y+Lj6JWq0liYiLZvn17xDlKD6enp4esWLGC/PKXvyQ9PT3k448/Jh9//DHZtGkTKS0tveM8s9k8tDtEKBTyfg/4KhqNhrMHMCvCNBqNUb+2NVKMKpWKPPfcc5w+NceLc+fOEa1WS6qqqoYE6S49PT0kMzOTWK1Wj/e1t7eT7du3k8TERCKXy3m/L3yUhISEUXfrBEvIwmxvbw9rp2U2i1KpJNnZ2WHhGcIWVVVVRKfTkXPnznmI0l06OzuJVqslnZ2dPus5e/Ysee6554hSqSRSqZT3ezWexdf6byiEPCs7b948NDU1hVJFWCMSiSCXy5Gbm4utW7dGzU55h8OBoqIi2Gw2mEymMdecRSIRioqKYLFYRp2RdDgceOutt1BSUoLu7m5Oco6GI1OmTMHevXvx5JNPslJfSMLct28fNm/eHJVxemQyGWQyGX7yk59g27Zt0Gg0fDeJNWw2G/R6PXJycpCamur3+86cOYOLFy+itLR0zHMdDgcqKirw6quvoru7Gz09PaE0OSKIj4/Hhx9+yIojQtDCtNlsmDt3Lmw2W8iNCCcUCgViYmKwbt06bNmyJepS4NXX18NgMMBsNge1Trl//34sXLgwoLQMFRUV2Lx5M3p6etDd3R3wNSOJpKQkWK3WkHtWQUfJW7duXVS5aYnFYjAMg+LiYvzvf//Djh07ok6UxcXFKC8vR2NjY9DOAyaTCZWVlQENX55//nlcuXIFBw4cQHJyMmJjY4O6diTQ0dGBH/zgByFnnwvKYjY3NyM9PT1qurAMw2DZsmV46623ok6MwIB3T15eHhYuXIglS5aEXN/UqVORl5cHq9Ua1Pf1u9/9Djt27IDdbofT6Qy5PeGGRCKBVqvF6dOng64jKGEuWrQIVqs16IuGCxMmTMC9994Ls9kctQl5mpqaUFBQAJPJBJFIxFq9/f392Lp1a9C/A5vNhnXr1uH48eNR84AfjkqlwpYtW1BUVBTU+wMWZjRYS7FYDIVCgW3btmH9+vV8N4czysrKcOTIEVRVVaGtrY31+v/xj3/gxo0bwQWUHqShoQErV67E9evXo26CKJTJoIDHmC+++GJEi5JhGOTn56O9vT1qRWm326HX63Hx4kVYLBZORAkACxYsQGtrK2pra4OuIz09HZcvX8ZvfvMbxMfHs9e4MOCbb75BTk5OUOPNgCxmJFtLsViMyZMn47333ovabisAtLa2oqCgABs3bsTUqVM5v96sWbOQlZUFs9kc8k4Zm82GvLw8fPbZZ7h58yZLLeQXsViM/Px8HDhwIKD3BWQxI9VaMgyDRYsWoaWlJapFWVFRAb1eD7PZPC6iBICWlhaYzWbo9fqQZyI1Gg0aGxuxefPmqJmE6+vrw9GjR1FfXx/Q+/y2mE1NTcjKyoq4fZZxcXF45ZV
XYDAY+G4KZzgcDhgMBjgcDphMJpw5c2bc23D16lVUVlaipqaGlfqamprw5JNP4sqVK6zUxzfJycn48ssv/T7fb4tpMpkiSpRisRjTpk3DyZMno1qUHR0dyMjIwJw5c/Dzn/+cF1ECQEJCAlJSUrBjxw5W6tPpdDh//jx0Oh3kcjkrdfLJ1atX8fvf/97v8/0SpsPhwKFDkZO+dOLEifjhD3+IL774gvcIAVxSW1uLvLw8mEwmzJo1i+/mYNmyZTh27NhQ9q5QYRgmarq23333HYqLi/3u7vslzOrqalbXwLgkISEB5eXlOHz4MPfhH3ikqKgIlZWVsFqtQQfe4oKamhoYDAZ0dHSwVmdxcTEsFgsSExNZq5MPent7/e5R+DXGnDlzJlpbW0NuGNdoNBpUVVUhPT2d76ZwhtsBfenSpViwYAHfzfGKvztRAqWjowNZWVkR8Vv0hUqlwhdffDHmpogxLWZra6vPJKLhRGJiIiwWS1SL0j0BZzQaw1aUwIBX0KpVq1BYWMhqvUlJSWhsbERqamrE9OBG4nQ6/Vo6GVOYe/bsCfs1pXvvvRcnT54c17if482OHTtgMBhgsVgi4kc5ffp0yGQy7N69m9V6GYbBxx9/jLS0tIh0hu/u7kZJScmYY80xhXngwIGwDWMokUiQkpKClpaWqNnAPBK3A/qNGzewc+dOzrx4uKC0tBTvvvsu6xvpZTIZTpw4gSeeeMIjCHYk4HQ6x1zXHHWM2dHRgYceeiisJhfcyOVyzJkzB3V1dRE/Y+eL5uZmFBQUwGg0IiEhge/mBMXUqVOh1+tRU1PDyWbzTZs2oaKiIuIcX8Za1xzVYtbX18PlcrHeqFCRSCSYM2cOjh8/HrWi3L17NwoLC1FTUxOxogSAS5cuoaSkBAUFBSF7BnmjrKwMRqMREydOZL1uLrl27Rqam5t9vj6qMA8dOhSW48v7778fx48fj8rlEIfDgYKCAnz66aewWq24dOkS300KGZFIhKVLlwa9BWosDAYDnnvuuYjq1nZ3d6OystLn6z67sg6HAxMnTuTkKRcKKSkpaGxsjEpL6XZAX7t2LaZPn853c1jn7bffRk5ODlasWMFJ/Xq9HrW1tWH3m/VFQkKCT5dDnxazqakp7CzStGnTgt41H+7U1tYObWiORlECgNlsxmuvvcbZOmRVVRUeffTRiJi1BgaWlXxNjPkU5ocffhhWkz6JiYk4ceJEVEWrc2MwGFBZWQmLxRJW3znbnDlzBjU1NdDr9Zz5XR87dgyPPPIIxGIxJ/WzSVdXF959912vr/nsyoZT+BCNRgOLxRJ165Tu/YdPP/10QGEkIx22d6KMxG63Y968eRHhIeSrO+vTYv7rX//itEH+kpCQgJqamqgTZUNDAzIyMlBaWnpXiRIYuKdz5sxBcXExJ/UzDAOr1Ypp06ZxUj+bOBwOr37FXoVpt9vDIoK2SqXCiy++GHWbm4uLi7F169aQwkhGOkuWLMHp06cD3kDsLxqNBseOHUNcXBwn9bOJt904XoXZ3NzMu7uTWCyGVqvlbIqdD+x2O7KysgAA27dvHzMtQbRTVVWFoqIiVneiDMe9P1SlUnFSPxt0dXXhk08+8TjuU5h8RyybPHkyjhw5wmsb2KSpqQkZGRnYuHEjK7Fdo4GWlhZUVVUhLy+PsyWO1atXY/HixWE9U+ut1+BVmGfPnuW1K5uQkID33nsvapZFdu/ejaKiItTU1ECpVPLdnLDi6tWreOmll1BQUMDZNQ4ePBjWezm/+uorjweTV2F+9tln49Igb0TTuNLhcECv1+PTTz+FxWKJCi8eLpg6dSo0Gg3Kyso4qV8mk4X1eFOpVHq453kVJl8/ILFYjMWLF0fFuLK1tRXz5s1DTk4OVq5cyVssnkihpKSE1bAkIwnn8abL5fIYZ/ucleWD+Ph4HDx4kJdrs0l1dTX0ej2qqqrGLYxkpHPmzBmYzWYYDAbOMsi5x5tCoZCT+oOlt7fX4zPfIUz
i6se3Xd9BoIoF1ApgHD3yGIaB2WwOOzfAQHA4HCgsLMSRI0fQ2NiIq1ev8t2kiKKtrQ0mk4mVGLW++OMf/xh2Xdre3l6cP3/+jmN3CFMQI8K1K/+DTCwGbnYD4+QLLBaL8dhjjyEzM3N8LsgB7ng0c+bMwQsvvEC7rkHS39+PnJwczoYzDMPgjTfeCLsu7UgvJY+ubMfFi4iJCTptZlCoVCpUVFSM6zXZpL6+Hnl5eSgpKQmLMJKRTmpqKux2O2e/iWeffRbf//73Oak7WDo7O+/432Nxx27/dvAvMaCWDPzpugV81wfIZIBUCPQOWlOxGJAPntM7zMIOPw4ncGOY6R3x2gSXGEaj0dM5nThxy+kCAQBBDCRCIQQjj0MAsUiEGBD09/fDCcHA+MHZDycAxIggFbhunx8jgjRGENAXNhbFxcU4ffo0rFbrXe8wwCYmkwlZWVlISUnhZIbebDZj8eLFuHbtGut1B8PIYY+HabTbrw9GLegDbt4C4BwQJQA4nED/rUEBigfGoDe6gRu3AKkMEA8elwsHusI3uoF+IaBwe/p7vjbpnslesm4R9DsJYoRiSEUiCOGCk4w8LoY4hqDPRQAIIBLGQAACp9OJGJEYUmEMBK5+9LoA8dD/TrAVj8FutyMjIwMA8PLLL1NRsox7MqiwsJCTyUitVouVK1eG7S6U0fusfX1AL24LSyYEnH23/46RDEwSqSUAhIBw8LjLCQyehlu3AJHw9nuGvaaWSlFd9a6nWIgTzhghRAJgQHTigb+JE06BAMJBoxcjGCk2AYRC0bAPJYAwZtDSDuIKOE2vJ01NTZg3bx6MRiP14uGQS5cuobS0FHl5eZzU//rrr2PChAmc1B0qvoXpFpbDOSAsMQCh884Jof5bgxZzsLhfixllOnrwNZFIhPzlyzFn9hzv5xF3d3XkceL9+DhRVlYGg8EAq9Ua1m5e0QKXYUlkMhn2798flt5YQ8J0ufrQ6yIDyxXEOeyUQaspV9y2lsCgYCXDllQGu7YOJwDh7eNCIdDvvP2ewddUKhXeLC8FiRF6eToIICDu7isAkAFLJxBCCIL+IbNHfLyffdxhJC9evBhxYSQjHTYS5PoiNzc3LCfshn7TAggAVz/iJ09GTP+ICRKHE8AIa4k+oNcJSBWD3Vnh4OuDY1P3cbET6O67/Z6bt6BQxWHz/9sCgUQGydBkDEF/fx/6BwUoEQrgdPaht78Pvf1OYKhbGwO4+geOuwSD7yfod7pAQOB09sM15v+B0dzcjKysLKxatQrLly8P8N0UNuAyLInJZAo7q+kRwaChoQF5eXkjBtxiQNbH2rpmXFwcLl++HBHOBBUVFdizZw/MZjN1GOCZhIQE6PV6NDY2sv7b4Ttix8hIBv71AmVCwDn2af6gUChQVFQU9qJ0e/GcOHECVquVijIMuHr1KoxGI/R6Pet179q1i1erOdJ100OYdwhGMdgdheP2ZFCISKVS/OpXv2KnMo4Il2SwFE/YTpDrRqvVYtKkSazWGQgjDZWHMDUaze1wF90jZltDJBKsZbglg6V4wnaCXDe7du3iLXv1mBYzKSmJsxCK4W4twzUZLMUTLhLk5ubmYsqUKazVFwgjrbXXMSYXDr7hbC1tNhvmzZsHtVqNX//619SLJwJoaWmByWRiPScKH1ZTLpdj9uzZdxzzKsz4+HjWLy4UCvHMM8+wXm+oNDQ0ICsrC6WlpWGdDJbiCRcJcnNzczF58mTW6vMHiUTiEZ7VqzC5MOcPP/xw2EVR37FjB7Zu3QqLxXLXhpGMdLhIkPvSSy+Nq1dXT08PUlJS7jjmVZhs77pXq9XYsGEDq3WGwvBksNu3b6dePBEO2wlyc3NzIZFIxj6RJaRSqUfgOa/CfOCBB1h9YgiFwrDZBN3c3IyMjAysWrUKy5Yt47s5FBZgOyyJRqPBY489xkLL/OPBBx/0OOZVmAsWLGBtsVUkEmH58uVhMekTLclgKZ6wnSB3w4YNkEqlLLR
sdEQiERYvXuxx3KswdTodawGfFQoF1q1bx0pdwRKNyWApnrC5EyUzM3NcjIlSqfS6ddCrMBmGYW1m9p577uE1IZA7jOTChQtpGMm7gNTUVNhsNlRXV4dUj0wmQ05ODkut8o3D4fAaocGnrywbYpLL5di4cWPI9QRLdXU1CgoKUFVVFbXJYCmesLUTpbCwkHOrqdVqvV7DpzCfeOKJkPvYTqeTs7Teo+FwOGAwGHDkyBFYLBbqgH6XwVaCXJ1Ox2mEA6lUiqefftrraz6FqdPpQs74NXPmzHHPP2Kz2ZCVlYX77rsPL7zwAvXiuUu5dOkSjEZjyDlR1qxZw1KLPJFKpT5XK0btyjqdwe/1EolE474c4U4GW1JSctclg6V4wkaC3CVLlnAWsCs+Pt7DscCNT2HKZDI8/vjjQV/U12wTV9BksBRvhJogV6fTcZJSYazVilE3Sj/77LNBO7Q7nc5xydhFk8FSxiKUBLkymQxz585lv1EAfvGLX/h8bVRhZmZmBt2dTU1N5XxGyx1GkiaDpYxGqAly8/PzIRCwGyh84cKFo86/jCpMhmHw0EMPBXxRqVSKJ598MuD3BUJZWRmKiopgtVrDLpASJfwIJUFueno6q+NMhmG8BDm/kzFj/qxatSrg/WmjzTaFit1uh16vx8WLF2GxWKgDOsVvgk2Qy7aDzMSJE8fUx5jCzM3NDfjCEonE52xTKLS2tiIrKws5OTlYvnw59eKhBEywCXLZGmeq1Wrs2rVrzPPGFGYwnvaPPvpoQOf7gzsZrNlspslgKUET7E6UH//4x6xcPy4uzi9j51f4yg0bNkCtVvt1YZFIFNIyy0hoMlgK2wSTIHfJkiUhp6f011oCXgI+++Lee+/F119/7dfFKyoqguoCj6SjowMFBQV4+umnacQ6CuucOXMGFy9eRGlp6ZjnOhwOyOVy+CkXryQnJ+PLL7/061y/HwEGgwEKhWLM8wghrIwvaTJYCtcEkiBXJpOFtPynVqtx+PBhv8/3W5ijLYYO5+bNmyELs6ioCOXl5bBardSLh8IpJpMJlZWVfoUlCTZmlUgkQn5+fkCzu34Lk2EY/PSnPx0z5Mj3vvc9vy8+EpvNhoyMDKjVapoMljIuBJIgN9iZWZVKhTfffDOg9wQ0ml23bt2Y3dmZM2cG1AA3TU1NyMrKgtFopGEkKeOKvwlytVptwB5AarUa+/fvD7gbHJAwtVot8vPzfVrNYGdk3clgLRYLTQZL4QV/wpI88sgjAdUpFouxaNGioCZCA57/3bZtm09PIIVCEdD4kiaDpYQTYyXITUlJCchi3nPPPTh48GBQbQlYmBqNBqtWrfJqmgOZkaXJYCnhyGhhSZKSkvxeLomLi8OxY8eCnskNasX09ddf92o1XS6XXw2pqKhAYWEhzGYzDSNJCStaWlpgNpt9Oh/4EwharVZjx44dIa1OBCVMmUyGoqIij4kgh8Mx6pSyO4wkTQZLCWdGS5A7Vhws97hy9erVIbXBb8+fkTgcDsyePRsXLly447iv6lpbW1FQUIC1a9fSiHWUiODo0aNQq9XYsmXL0LH4+Hhcu3bN6/kSiQQPP/wwjh8/HvJe5KCd/2QyGd555507Nnv6inZQW1uLgoICmEwmKkpKxOAtQe5oET1mz57NiiiBEIQJDMRDefbZZ4ca4q3/bTAYUFlZCYvFQpPBUiKOkQlyfW2YTklJwQcffMBa1I7Q3OUxMBHk9vYZvh3LnQz2vvvuo8lgKRHLyAS53nZZpaSkoLGxkdVQrSELUyaT4dChQ3eY+OHJYGkYSUqkMzxB7vAJT3dAALZFCbAgTGDAIygtLQ0zZsygyWApUYk7Qa5UKoVQKERsbCy0Wi0nogQCEKZlrQACwYgyvxxuX50lS5bAarWitbUVf//736kXDyXqKC0txX/+8x/I5XLk5OTg9OnTnGUaCGi5pK18PmZ8/grIniwAFqwVZGPvmjqQPVk4cOAAtmzZgsuXL7PeSK1Wy8o
XkJ6ePvR3RUVFUHFGKRSRSIS+vj5OrxGAMNtQPn8GPn+FYM9AfGVY1gqQjbpBoVIo0U9HRwc6OjrueMgDACxrIcjeO/TvmjqCPViLtdiDYOTh/xiz7SgOnVqD3MGLtJXPR/beNJRtpqKk3D0kJSWNEGUbyucLIMgG6ggBGSy5tQIIslvw4IzgruO3MNuOHsIp7EX24PhyFSpByElspP4ClLuWgV7kJpThAtmD4SYqa08d1mAW7g9SH34Ksw1HD53CmrrBJ8KFMmDTDMwvpxM8lLuXtvJV2HQqDWWVG+Gpvxl4sCwXwfYn/RtjtpVj/ozP8cqwp4JlrQDZe9egbsSTgkK5Oxi0lrO4mWPxy2K2HT2EU2uGq78N51sApD2IILvQFEqEcwGfnwLSgh1EjoEfcTwGu7Gv3JalZe0MbDoFrKnzZsIplLuAtvPg0sl0TIspEAyIcG/2bceC7L1pKLtAgpoGplCigun3YxaAU59f8Pqypfy2800wBL0fk0K56xlcu0wru4CTQ8sTnuv9wUCFSaGEQls55s/YhFNDB9JQdiH0ZUQqTAolDGFldwmFQmEXKkwKJQz5/6GFNae7ZI7lAAAAAElFTkSuQmCC", "text/plain": [ - "" - ], - "image/png": "" + "" + ] }, "metadata": {}, - "execution_count": 17 + "execution_count": 11 } ], "metadata": {} @@ -381,7 +391,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 12, "source": [ "segments.ques_mark_segments" ], @@ -394,7 +404,7 @@ ] }, "metadata": {}, - "execution_count": 19 + "execution_count": 12 } ], "metadata": {} @@ -420,7 +430,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 13, "source": [ "sif4sci(item[\"stem\"], figures=figures, tokenization=False, symbol=\"tfgm\")" ], @@ -433,7 +443,7 @@ ] }, "metadata": {}, - "execution_count": 11 + "execution_count": 13 } ], "metadata": { @@ -461,7 +471,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 14, "source": [ "tokens = sif4sci(item[\"stem\"], figures=figures, tokenization=True)" ], @@ -487,7 +497,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 15, "source": [ "tokens.text_tokens" ], @@ -532,7 +542,7 @@ ] }, "metadata": {}, - "execution_count": 12 + "execution_count": 15 } ], "metadata": { @@ -556,7 +566,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 16, "source": [ "tokens.formula_tokens" ], @@ -593,7 +603,7 @@ ] }, "metadata": {}, - "execution_count": 13 + "execution_count": 16 } ], "metadata": { @@ -619,17 +629,17 @@ }, { 
"cell_type": "code", - "execution_count": 37, + "execution_count": 17, "source": [ - "sif4sci(\n", - " item[\"stem\"],\n", - " figures=figures,\n", - " tokenization=True,\n", - " tokenization_params={\n", - " \"formula_params\": {\n", - " \"method\": \"linear\",\n", - " }\n", - " }\n", + "sif4sci(\r\n", + " item[\"stem\"],\r\n", + " figures=figures,\r\n", + " tokenization=True,\r\n", + " tokenization_params={\r\n", + " \"formula_params\": {\r\n", + " \"method\": \"linear\",\r\n", + " }\r\n", + " }\r\n", ").formula_tokens" ], "outputs": [ @@ -665,7 +675,7 @@ ] }, "metadata": {}, - "execution_count": 37 + "execution_count": 17 } ], "metadata": { @@ -686,18 +696,18 @@ }, { "cell_type": "code", - "execution_count": 39, - "source": [ - "sif4sci(\n", - " item[\"stem\"],\n", - " figures=figures,\n", - " tokenization=True,\n", - " tokenization_params={\n", - " \"formula_params\":{\n", - " \"method\": \"ast\",\n", - " }\n", - " }\n", - ").formula_tokens\n" + "execution_count": 18, + "source": [ + "sif4sci(\r\n", + " item[\"stem\"],\r\n", + " figures=figures,\r\n", + " tokenization=True,\r\n", + " tokenization_params={\r\n", + " \"formula_params\":{\r\n", + " \"method\": \"ast\",\r\n", + " }\r\n", + " }\r\n", + ").formula_tokens\r\n" ], "outputs": [ { @@ -717,7 +727,7 @@ ] }, "metadata": {}, - "execution_count": 39 + "execution_count": 18 } ], "metadata": { @@ -736,49 +746,49 @@ }, { "cell_type": "code", - "execution_count": 109, + "execution_count": 19, "source": [ - "f = sif4sci(\n", - " item[\"stem\"],\n", - " figures=figures,\n", - " tokenization=True,\n", - " tokenization_params={\n", - " \"formula_params\":{\n", - " \"method\": \"ast\",\n", - " \"return_type\": \"ast\",\n", - " \"ord2token\": True,\n", - " \"var_numbering\": True,\n", - " }\n", - " }\n", - ").formula_tokens\n", - "f\n" + "f = sif4sci(\r\n", + " item[\"stem\"],\r\n", + " figures=figures,\r\n", + " tokenization=True,\r\n", + " tokenization_params={\r\n", + " \"formula_params\":{\r\n", + " \"method\": 
\"ast\",\r\n", + " \"return_type\": \"ast\",\r\n", + " \"ord2token\": True,\r\n", + " \"var_numbering\": True,\r\n", + " }\r\n", + " }\r\n", + ").formula_tokens\r\n", + "f\r\n" ], "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ - "[,\n", - " ,\n", - " ,\n", - " ,\n", - " ,\n", - " ,\n", - " ,\n", - " ,\n", - " ,\n", - " ]" + "[,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ]" ] }, "metadata": {}, - "execution_count": 109 + "execution_count": 19 } ], "metadata": {} }, { "cell_type": "code", - "execution_count": 110, + "execution_count": 20, "source": [ "for i in range(0, len(f)):\n", " ForestPlotter().export(\n", @@ -786,7 +796,19 @@ " )\n", "# plt.show()\n" ], - "outputs": [], + "outputs": [ + { + "output_type": "error", + "ename": "NameError", + "evalue": "name 'ForestPlotter' is not defined", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mf\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m ForestPlotter().export(\n\u001b[0m\u001b[0;32m 3\u001b[0m \u001b[0mf\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mi\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mroot_list\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mnode\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mnode\u001b[0m \u001b[1;32min\u001b[0m 
\u001b[0mf\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mi\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m )\n\u001b[0;32m 5\u001b[0m \u001b[1;31m# plt.show()\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mNameError\u001b[0m: name 'ForestPlotter' is not defined" + ] + } + ], "metadata": {} }, { @@ -799,7 +821,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": null, "source": [ "sif4sci(\n", " item[\"stem\"],\n", @@ -814,55 +836,7 @@ " }\n", ").formula_tokens" ], - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "['mathord',\n", - " 'mathord',\n", - " 'mathord',\n", - " 'mathord',\n", - " 'mathord',\n", - " 'mathord',\n", - " 'mathord',\n", - " 'mathord',\n", - " 'mathord',\n", - " '\\\\bigtriangleup',\n", - " 'mathord',\n", - " 'mathord',\n", - " 'mathord',\n", - " 'mathord',\n", - " 'mathord',\n", - " 'mathord',\n", - " 'mathord',\n", - " 'mathord',\n", - " 'mathord',\n", - " 'mathord',\n", - " ',',\n", - " 'mathord',\n", - " 'mathord',\n", - " ',',\n", - " 'mathord',\n", - " 'mathord',\n", - " 'mathord',\n", - " 'mathord',\n", - " 'textord',\n", - " '\\\\supsub',\n", - " ',',\n", - " 'mathord',\n", - " 'textord',\n", - " '\\\\supsub',\n", - " ',',\n", - " 'mathord',\n", - " 'textord',\n", - " '\\\\supsub']" - ] - }, - "metadata": {}, - "execution_count": 40 - } - ], + "outputs": [], "metadata": { "collapsed": false, "pycharm": { @@ -879,7 +853,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": null, "source": [ "sif4sci(\n", " item[\"stem\"],\n", @@ -895,55 +869,7 @@ " }\n", ").formula_tokens" ], - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "['mathord_0',\n", - " 'mathord_1',\n", - " 'mathord_2',\n", - " 'mathord_1',\n", - " 'mathord_2',\n", - " 'mathord_0',\n", - " 'mathord_1',\n", - " 
'mathord_0',\n", - " 'mathord_2',\n", - " '\\\\bigtriangleup',\n", - " 'mathord_0',\n", - " 'mathord_1',\n", - " 'mathord_2',\n", - " 'mathord_3',\n", - " 'mathord_3',\n", - " 'mathord_3',\n", - " 'mathord_3',\n", - " 'mathord_3',\n", - " 'mathord_3',\n", - " 'mathord_3',\n", - " ',',\n", - " 'mathord_3',\n", - " 'mathord_3',\n", - " ',',\n", - " 'mathord_3',\n", - " 'mathord_3',\n", - " 'mathord_3',\n", - " 'mathord_4',\n", - " 'textord',\n", - " '\\\\supsub',\n", - " ',',\n", - " 'mathord_4',\n", - " 'textord',\n", - " '\\\\supsub',\n", - " ',',\n", - " 'mathord_4',\n", - " 'textord',\n", - " '\\\\supsub']" - ] - }, - "metadata": {}, - "execution_count": 44 - } - ], + "outputs": [], "metadata": { "collapsed": false, "pycharm": { @@ -967,23 +893,12 @@ }, { "cell_type": "code", - "execution_count": 96, + "execution_count": null, "source": [ "sif4sci(item[\"stem\"], figures=figures, tokenization=True,\n", " symbol=\"fgm\")" ], - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "['如图', '古希腊', '数学家', '希波', '克拉底', '研究', '几何图形', '此图', '三个', '半圆', '三个', '半圆', '直径', '直角三角形', '[FORMULA]', '斜边', '[FORMULA]', '直角', '[FORMULA]', '[FORMULA]', '[FORMULA]', '三边', '围成', '区域', '记', '[FORMULA]', '黑色', '记', '[FORMULA]', '其余部分', '记', '[FORMULA]', '图形', '中', '随机', '取', '一点', '此点', '取自', '[FORMULA]', '概率', '记', '[FORMULA]', '[MARK]', '[FIGURE]']" - ] - }, - "metadata": {}, - "execution_count": 96 - } - ], + "outputs": [], "metadata": { "collapsed": false, "pycharm": { @@ -995,11 +910,11 @@ "metadata": { "kernelspec": { "name": "python3", - "display_name": "Python 3.8.5 64-bit" + "display_name": "Python 3.6.13 64-bit ('data': conda)" }, "language_info": { "name": "python", - "version": "3.8.5", + "version": "3.6.13", "mimetype": "text/x-python", "codemirror_mode": { "name": "ipython", @@ -1010,7 +925,7 @@ "file_extension": ".py" }, "interpreter": { - "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a" + "hash": 
"776957673adb719a00031a24ed5efd2fa5ce8a13405e5193f8d278edd3805d55" } }, "nbformat": 4, diff --git a/examples/sif/sif_addition.ipynb b/examples/sif/sif_addition.ipynb index 57830c43..c9f5424e 100644 --- a/examples/sif/sif_addition.ipynb +++ b/examples/sif/sif_addition.ipynb @@ -9,11 +9,20 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "source": [ "from EduNLP.SIF import is_sif, to_sif,sif4sci" ], - "outputs": [], + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "D:\\MySoftwares\\Anaconda\\envs\\data\\lib\\site-packages\\gensim\\similarities\\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.\n", + " warnings.warn(msg)\n" + ] + } + ], "metadata": {} }, { @@ -25,9 +34,9 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "source": [ - " text = '若$x,y$满足约束条件' \\\r\n", + "text = '若$x,y$满足约束条件' \\\r\n", " '$\\\\left\\\\{\\\\begin{array}{c}2 x+y-2 \\\\leq 0 \\\\\\\\ x-y-1 \\\\geq 0 \\\\\\\\ y+1 \\\\geq 0\\\\end{array}\\\\right.$,' \\\r\n", " '则$z=x+7 y$的最大值$\\\\SIFUnderline$'\r\n", " \r\n", @@ -38,18 +47,19 @@ "output_type": "execute_result", "data": { "text/plain": [ - "True" + "(True,\n", + " '若$x,y$满足约束条件$\\\\left\\\\{\\\\begin{array}{c}2 x+y-2 \\\\leq 0 \\\\\\\\ x-y-1 \\\\geq 0 \\\\\\\\ y+1 \\\\geq 0\\\\end{array}\\\\right.$,则$z=x+7 y$的最大值$\\\\SIFUnderline$')" ] }, "metadata": {}, - "execution_count": 4 + "execution_count": 2 } ], "metadata": {} }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "source": [ "text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...'\r\n", "is_sif(text)" @@ -59,11 +69,11 @@ "output_type": "execute_result", "data": { "text/plain": [ - "False" + "(False, '某校一个课外学习小组为研究某作物的发芽率$y$和温度$x$(单位...')" ] }, "metadata": {}, - "execution_count": 5 + "execution_count": 3 } ], 
"metadata": {} @@ -77,7 +87,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "source": [ "text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...'\r\n", "to_sif(text)" @@ -91,7 +101,7 @@ ] }, "metadata": {}, - "execution_count": 6 + "execution_count": 4 } ], "metadata": {} @@ -110,11 +120,11 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 5, "source": [ - " test_item = r\"如图所示,则$\\bigtriangleup ABC$的面积是$\\SIFBlank$。$\\FigureID{1}$\"\r\n", - " t1 = sif4sci(test_item)\r\n", - " t1" + "test_item = r\"如图所示,则$\\bigtriangleup ABC$的面积是$\\SIFBlank$。$\\FigureID{1}$\"\r\n", + "t1 = sif4sci(test_item)\r\n", + "t1" ], "outputs": [ { @@ -125,14 +135,14 @@ ] }, "metadata": {}, - "execution_count": 14 + "execution_count": 5 } ], "metadata": {} }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 6, "source": [ "t1.describe()" ], @@ -145,16 +155,16 @@ ] }, "metadata": {}, - "execution_count": 15 + "execution_count": 6 } ], "metadata": {} }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 7, "source": [ - "with t1.filter('fgm'):\n", + "with t1.filter('fgm'):\r\n", " print(t1)" ], "outputs": [ @@ -170,9 +180,9 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 8, "source": [ - "with t1.filter(keep='t'):\n", + "with t1.filter(keep='t'):\r\n", " print(t1)" ], "outputs": [ @@ -188,9 +198,9 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 9, "source": [ - "with t1.filter():\n", + "with t1.filter():\r\n", " print(t1)" ], "outputs": [ @@ -206,7 +216,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 10, "source": [ "t1.text_tokens" ], @@ -219,14 +229,14 @@ ] }, "metadata": {}, - "execution_count": 20 + "execution_count": 10 } ], "metadata": {} }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 11, "source": [ "t1.formula_tokens" ], @@ -239,14 +249,14 @@ ] }, "metadata": {}, - "execution_count": 23 + 
"execution_count": 11 } ], "metadata": {} }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 12, "source": [ "t1.figure_tokens" ], @@ -259,14 +269,14 @@ ] }, "metadata": {}, - "execution_count": 24 + "execution_count": 12 } ], "metadata": {} }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 13, "source": [ "t1.ques_mark_tokens" ], @@ -279,14 +289,14 @@ ] }, "metadata": {}, - "execution_count": 25 + "execution_count": 13 } ], "metadata": {} }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 14, "source": [ "sif4sci(test_item, symbol=\"gm\", tokenization_params={\"formula_params\": {\"method\": \"ast\"}})" ], @@ -299,14 +309,14 @@ ] }, "metadata": {}, - "execution_count": 26 + "execution_count": 14 } ], "metadata": {} }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 15, "source": [ "sif4sci(test_item, symbol=\"tfgm\")" ], @@ -319,14 +329,14 @@ ] }, "metadata": {}, - "execution_count": 27 + "execution_count": 15 } ], "metadata": {} }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 16, "source": [ "sif4sci(test_item, symbol=\"gm\", tokenization_params={\"formula_params\": {\"method\": \"ast\", \"return_type\": \"list\"}})" ], @@ -339,43 +349,43 @@ ] }, "metadata": {}, - "execution_count": 28 + "execution_count": 16 } ], "metadata": {} }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 17, "source": [ - " test_item_1 = {\n", - " \"stem\": r\"若$x=2$, $y=\\sqrt{x}$,则下列说法正确的是$\\SIFChoice$\",\n", - " \"options\": [r\"$x < y$\", r\"$y = x$\", r\"$y < x$\"]\n", - " }" + "test_item_1 = {\r\n", + " \"stem\": r\"若$x=2$, $y=\\sqrt{x}$,则下列说法正确的是$\\SIFChoice$\",\r\n", + " \"options\": [r\"$x < y$\", r\"$y = x$\", r\"$y < x$\"]\r\n", + "}" ], "outputs": [], "metadata": {} }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 18, "source": [ - " tls = [\n", - " sif4sci(e, symbol=\"gm\",\n", - " tokenization_params={\n", - " 
\"formula_params\": {\n", - " \"method\": \"ast\", \"return_type\": \"list\", \"ord2token\": True, \"var_numbering\": True,\n", - " \"link_variable\": False}\n", - " })\n", - " for e in ([test_item_1[\"stem\"]] + test_item_1[\"options\"])\n", - " ]" + "tls = [\r\n", + " sif4sci(e, symbol=\"gm\",\r\n", + " tokenization_params={\r\n", + " \"formula_params\": {\r\n", + " \"method\": \"ast\", \"return_type\": \"list\", \"ord2token\": True, \"var_numbering\": True,\r\n", + " \"link_variable\": False}\r\n", + " })\r\n", + " for e in ([test_item_1[\"stem\"]] + test_item_1[\"options\"])\r\n", + "]" ], "outputs": [], "metadata": {} }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 19, "source": [ "tls" ], @@ -391,14 +401,14 @@ ] }, "metadata": {}, - "execution_count": 33 + "execution_count": 19 } ], "metadata": {} }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 20, "source": [ "tls[1:]" ], @@ -413,18 +423,18 @@ ] }, "metadata": {}, - "execution_count": 34 + "execution_count": 20 } ], "metadata": {} }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 21, "source": [ - "from EduNLP.utils import dict2str4sif\n", - "\n", - "test_item_1_str = dict2str4sif(test_item_1, tag_mode=\"head\", add_list_no_tag=False)\n", + "from EduNLP.utils import dict2str4sif\r\n", + "\r\n", + "test_item_1_str = dict2str4sif(test_item_1, tag_mode=\"head\", add_list_no_tag=False)\r\n", "test_item_1_str " ], "outputs": [ @@ -436,21 +446,21 @@ ] }, "metadata": {}, - "execution_count": 35 + "execution_count": 21 } ], "metadata": {} }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 22, "source": [ - "tl1 = sif4sci(\n", - " test_item_1_str, \n", - " symbol=\"gm\", \n", - " tokenization_params={\n", - " \"formula_params\": {\"method\": \"ast\", \"return_type\": \"list\", \"ord2token\": True}\n", - " })\n", + "tl1 = sif4sci(\r\n", + " test_item_1_str, \r\n", + " symbol=\"gm\", \r\n", + " tokenization_params={\r\n", + " 
\"formula_params\": {\"method\": \"ast\", \"return_type\": \"list\", \"ord2token\": True}\r\n", + " })\r\n", " " ], "outputs": [], @@ -458,7 +468,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 23, "source": [ "tl1.get_segments()[0]" ], @@ -471,14 +481,14 @@ ] }, "metadata": {}, - "execution_count": 37 + "execution_count": 23 } ], "metadata": {} }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 24, "source": [ "tl1.get_segments()[1:3]" ], @@ -492,14 +502,14 @@ ] }, "metadata": {}, - "execution_count": 38 + "execution_count": 24 } ], "metadata": {} }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 25, "source": [ "tl1.get_segments(add_seg_type=False)[0:3]" ], @@ -514,14 +524,14 @@ ] }, "metadata": {}, - "execution_count": 39 + "execution_count": 25 } ], "metadata": {} }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 26, "source": [ "test_item_2 = {\"options\": [r\"$x < y$\", r\"$y = x$\", r\"$y < x$\"]}" ], @@ -530,7 +540,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 27, "source": [ "test_item_2_str = dict2str4sif(test_item_2, tag_mode=\"head\", add_list_no_tag=False)" ], @@ -539,7 +549,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 28, "source": [ "test_item_2_str" ], @@ -552,17 +562,17 @@ ] }, "metadata": {}, - "execution_count": 43 + "execution_count": 28 } ], "metadata": {} }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 29, "source": [ - "tl2 = sif4sci(test_item_2_str, symbol=\"gms\",\n", - " tokenization_params={\"formula_params\": {\"method\": \"ast\", \"return_type\": \"list\"}})\n", + "tl2 = sif4sci(test_item_2_str, symbol=\"gms\",\r\n", + " tokenization_params={\"formula_params\": {\"method\": \"ast\", \"return_type\": \"list\"}})\r\n", "tl2 " ], "outputs": [ @@ -574,14 +584,14 @@ ] }, "metadata": {}, - "execution_count": 44 + "execution_count": 29 } ], "metadata": {} }, { 
"cell_type": "code", - "execution_count": 45, + "execution_count": 30, "source": [ "tl2.get_segments(add_seg_type=False)" ], @@ -599,14 +609,14 @@ ] }, "metadata": {}, - "execution_count": 45 + "execution_count": 30 } ], "metadata": {} }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 31, "source": [ "tl2.get_segments(add_seg_type=False, drop=\"s\")" ], @@ -619,16 +629,16 @@ ] }, "metadata": {}, - "execution_count": 46 + "execution_count": 31 } ], "metadata": {} }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 32, "source": [ - "tl3 = sif4sci(test_item_1[\"stem\"], symbol=\"gs\")\n", + "tl3 = sif4sci(test_item_1[\"stem\"], symbol=\"gs\")\r\n", "tl3.text_segments" ], "outputs": [ @@ -640,14 +650,14 @@ ] }, "metadata": {}, - "execution_count": 47 + "execution_count": 32 } ], "metadata": {} }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 33, "source": [ "tl3.formula_segments" ], @@ -660,14 +670,14 @@ ] }, "metadata": {}, - "execution_count": 48 + "execution_count": 33 } ], "metadata": {} }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 34, "source": [ "tl3.figure_segments" ], @@ -680,14 +690,14 @@ ] }, "metadata": {}, - "execution_count": 49 + "execution_count": 34 } ], "metadata": {} }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 35, "source": [ "tl3.ques_mark_segments" ], @@ -700,7 +710,7 @@ ] }, "metadata": {}, - "execution_count": 50 + "execution_count": 35 } ], "metadata": {} @@ -717,7 +727,7 @@ "orig_nbformat": 4, "language_info": { "name": "python", - "version": "3.8.5", + "version": "3.6.13", "mimetype": "text/x-python", "codemirror_mode": { "name": "ipython", @@ -729,10 +739,10 @@ }, "kernelspec": { "name": "python3", - "display_name": "Python 3.8.5 64-bit" + "display_name": "Python 3.6.13 64-bit ('data': conda)" }, "interpreter": { - "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a" + "hash": 
"776957673adb719a00031a24ed5efd2fa5ce8a13405e5193f8d278edd3805d55" } }, "nbformat": 4, From 338e7273a88a805901a46814241f70589b57cba7 Mon Sep 17 00:00:00 2001 From: QinLonghu <1097824882@qq.com> Date: Fri, 3 Sep 2021 22:46:22 +0800 Subject: [PATCH 4/9] [feature] modify params and comments in sif4sci --- EduNLP/SIF/sif.py | 63 ++++++++++++++++++++++++++++---------- tests/test_sif/test_sif.py | 14 +++++++++ 2 files changed, 60 insertions(+), 17 deletions(-) diff --git a/EduNLP/SIF/sif.py b/EduNLP/SIF/sif.py index dd5c40a8..494eee77 100644 --- a/EduNLP/SIF/sif.py +++ b/EduNLP/SIF/sif.py @@ -67,26 +67,48 @@ def to_sif(item, check_formula=True): return sif_item -def sif4sci(item: str, figures: (dict, bool) = None, safe=True, symbol: str = None, tokenization=True, - tokenization_params=None, errors="raise", check_formula=True): +def sif4sci(item: str, figures: (dict, bool) = None, safe_mode: int = 2, symbol: str = None, tokenization=True, + tokenization_params=None, errors="raise"): r""" Default to use linear Tokenizer, change the tokenizer by specifying tokenization_params Parameters ---------- - item - figures - safe - symbol - tokenization - tokenization_params: - method: which tokenizer to be used, "linear" or "ast" - The parameters only useful for "linear": - - The parameters only useful for "ast": - ord2token: whether to transfer the variables (mathord) and constants (textord) to special tokens. 
- var_numbering: whether to use number suffix to denote different variables + item: str + figures: + when it is a dict, it means the id-to-instance information for figures in 'FormFigureID{...}' format, + when it is a bool, it means whether to instantiate figures in 'FormFigureBase64{...}' format + safe_mode: int + when safe = 2, use is_sif and check formula in item + when safe = 1, use is_sif but don't check formula in item + when safe = 0, don't use is_sif and don't check anything in item + symbol: str + The combination of "t","f","g","m","a","s", which determine what types of segments to be symbolize. + "t": text, + "f": formula, + "g": figuew, + "m": mask, + "a": tab, + "s": sep + tokenization: bool + whether to tokenize item after segmentation + tokenization_params: dict + the dict of text_params, formula_params and figure_params in tokenization + For formula_params: + method: which tokenizer to be used, "linear" or "ast" + The parameters only useful for "linear": + skip_figure_formula: whether to skip the formula in figure format + symbolize_figure_formula: whether to symbolize the formula in figure format + The parameters only useful for "ast": + ord2token: whether to transfer the variables (mathord) and constants (textord) to special tokens. 
+ var_numbering: whether to use number suffix to denote different variables + return_type: 'list' or 'ast' + More parameters can be found in the definition in SIF.tokenization.formula + For figure_params: + figure_instance:whether to return instance of figures in tokens + For text_params: + See definition in SIF.tokenization.text errors: warn raise @@ -191,9 +213,16 @@ def sif4sci(item: str, figures: (dict, bool) = None, safe=True, symbol: str = No [['已知'], ['说法', '中', '正确']] """ try: - if safe is True: - flag, sif_item = is_sif(item, check_formula) - item = sif_item if flag is not True else item + if safe_mode == 2: + _, item = is_sif(item, check_formula=True) + elif safe_mode == 1: + _, item = is_sif(item, check_formula=False) + elif safe_mode == 0: + pass # do nothing + else: + raise KeyError( + "Unknown safe_mode %s, use only 0 or 1 or 2." % safe_mode + ) ret = seg(item, figures, symbol) diff --git a/tests/test_sif/test_sif.py b/tests/test_sif/test_sif.py index 0f1febce..53ec2663 100644 --- a/tests/test_sif/test_sif.py +++ b/tests/test_sif/test_sif.py @@ -57,3 +57,17 @@ def test_sci4sif(figure0, figure1, figure0_base64, figure1_base64): "figure_params": {"figure_instance": True} } )) + repr(sif4sci( + r"如图所示,则$\bigtriangleup ABC$的面积是$\SIFBlank$。$\FigureID{1}$", safe_mode=0 + )) + repr(sif4sci( + r"如图所示,则$\bigtriangleup ABC$的面积是$\SIFBlank$。$\FigureID{1}$", safe_mode=1 + )) + repr(sif4sci( + r"如图所示,则$\bigtriangleup ABC$的面积是$\SIFBlank$。$\FigureID{1}$", safe_mode=2 + )) + + with pytest.raises(KeyError): + repr(sif4sci( + r"如图所示,则$\bigtriangleup ABC$的面积是$\SIFBlank$。$\FigureID{1}$", safe_mode=3 + )) From 9b72e3247c572489d113045eb3d222792a19694a Mon Sep 17 00:00:00 2001 From: QinLonghu <1097824882@qq.com> Date: Sat, 11 Sep 2021 16:52:47 +0800 Subject: [PATCH 5/9] [FEATURE] change the usage of is_sif,to_sif,sif4sci --- EduNLP/SIF/sif.py | 75 ++++++++++++++++++++++++-------------- tests/test_sif/test_sif.py | 8 +++- 2 files changed, 55 insertions(+), 28 deletions(-) 
diff --git a/EduNLP/SIF/sif.py b/EduNLP/SIF/sif.py index 494eee77..309b25a7 100644 --- a/EduNLP/SIF/sif.py +++ b/EduNLP/SIF/sif.py @@ -10,51 +10,65 @@ __all__ = ["is_sif", "to_sif", "sif4sci"] -def is_sif(item, check_formula=True): +def is_sif(item, check_formula=True, cache=False): r""" Parameters ---------- - item + item: str check_formula: bool - True if check the validity of formulas in items - False if not check the validity of formulas in items, which is faster + whether to check the formulas when parsing item. + + True if check the validity of formulas in item + False if not check the validity of formulas in item, which is faster + cache: bool + whether to put the parsed item in return. + + when True, the format of return is (bool, Parser) + when False, the format of return is bool Returns ------- when item can not be parsed correctly, raise Error; - when item doesn't need to be modified, return Ture and original item ; - when item needs to be modified, return False and modified item; + when item is in stardarded format originally, return Ture (and the Parser of item); + when item isn't in stardarded format originally, return False (and the Parser of item); Examples -------- >>> text = '若$x,y$满足约束条件' \ ... '$\\left\\{\\begin{array}{c}2 x+y-2 \\leq 0 \\\\ x-y-1 \\geq 0 \\\\ y+1 \\geq 0\\end{array}\\right.$,' \ ... '则$z=x+7 y$的最大值$\\SIFUnderline$' - >>> flag, _ = is_sif(text) - >>> print(flag) + >>> is_sif(text) True >>> text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...' 
- >>> is_sif(text) - (False, '某校一个课外学习小组为研究某作物的发芽率$y$和温度$x$(单位...') + >>> ret = is_sif(text, cache=True) + >>> ret # doctest: +ELLIPSIS + (False, ) """ item_parser = Parser(item, check_formula) item_parser.description_list() if item_parser.fomula_illegal_flag: raise ValueError(item_parser.fomula_illegal_message) - if item_parser.error_flag == 0 and item_parser.modify_flag == 0: - return True, item - return False, item_parser.text + ret = True if item_parser.error_flag == 0 and item_parser.modify_flag == 0 else False + if cache is True: + return ret, item_parser + else: + return ret -def to_sif(item, check_formula=True): +def to_sif(item, check_formula=True, cache_parser: Parser = None): r""" Parameters ---------- - item + item: str + check_formula: bool + whether to check the formulas when parsing item (only work when cache_parser=None). + cache_parser: Parser + the saved parser of item from is_sif. Returns ------- - item + str + the parsed item Examples -------- @@ -62,9 +76,17 @@ def to_sif(item, check_formula=True): >>> siftext = to_sif(text) >>> siftext '某校一个课外学习小组为研究某作物的发芽率$y$和温度$x$(单位...' + >>> ret = is_sif(text, cache=True) + >>> ret # doctest: +ELLIPSIS + (False, ) + >>> to_sif(text, cache_parser=ret[1]) + '某校一个课外学习小组为研究某作物的发芽率$y$和温度$x$(单位... + """ - _, sif_item = is_sif(item, check_formula) - return sif_item + if cache_parser is not None: + return cache_parser.text + else: + return is_sif(item, check_formula, cache=True)[1].text def sif4sci(item: str, figures: (dict, bool) = None, safe_mode: int = 2, symbol: str = None, tokenization=True, @@ -87,9 +109,9 @@ def sif4sci(item: str, figures: (dict, bool) = None, safe_mode: int = 2, symbol: The combination of "t","f","g","m","a","s", which determine what types of segments to be symbolize. 
"t": text, "f": formula, - "g": figuew, + "g": figure, "m": mask, - "a": tab, + "a": tag, "s": sep tokenization: bool whether to tokenize item after segmentation @@ -213,13 +235,12 @@ def sif4sci(item: str, figures: (dict, bool) = None, safe_mode: int = 2, symbol: [['已知'], ['说法', '中', '正确']] """ try: - if safe_mode == 2: - _, item = is_sif(item, check_formula=True) - elif safe_mode == 1: - _, item = is_sif(item, check_formula=False) - elif safe_mode == 0: - pass # do nothing - else: + if safe_mode in [1, 2]: + check_formula = True if safe_mode == 1 else False + sif, item_parser = is_sif(item, check_formula=check_formula, cache=True) + if sif is not True: + item = to_sif(item, cache_parser=item_parser) + elif safe_mode != 0: raise KeyError( "Unknown safe_mode %s, use only 0 or 1 or 2." % safe_mode ) diff --git a/tests/test_sif/test_sif.py b/tests/test_sif/test_sif.py index 53ec2663..b596e927 100644 --- a/tests/test_sif/test_sif.py +++ b/tests/test_sif/test_sif.py @@ -11,7 +11,7 @@ def test_is_sif(): text = '若$x,y$满足约束条件' \ '$\\left\\{\\begin{array}{c}2 x+y-2 \\leq 0 \\\\ x-y-1 \\geq 0 \\\\ y+1 \\geq 0\\end{array}\\right.$,' \ '则$z=x+7 y$的最大值$\\SIFUnderline$' - assert is_sif(text)[0] == 1 + assert is_sif(text) == 1 text = '公式需要满足完整性,完整的公式如' \ '$\\begin{matrix} a & b \\\\ c & d \\end{matrix}$' \ @@ -31,6 +31,12 @@ def test_to_sif(): siftext = to_sif(text) print(siftext) + ret = is_sif(text, cache=True) + assert ret[0] == 0 + if ret[0] is not True: + siftext = to_sif(text, cache_parser=ret[1]) + print(siftext) + def test_sci4sif(figure0, figure1, figure0_base64, figure1_base64): repr(sif4sci( From 8f2856f7a2ad4fe443ff613bf70bfd1791ffdf28 Mon Sep 17 00:00:00 2001 From: QinLonghu <1097824882@qq.com> Date: Sun, 26 Sep 2021 17:07:53 +0800 Subject: [PATCH 6/9] [FEATURE] change the param name in sif.py --- EduNLP/SIF/sif.py | 28 ++++++++++++++-------------- tests/test_sif/test_sif.py | 10 +++++----- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git 
a/EduNLP/SIF/sif.py b/EduNLP/SIF/sif.py index 309b25a7..9c9a650e 100644 --- a/EduNLP/SIF/sif.py +++ b/EduNLP/SIF/sif.py @@ -10,7 +10,7 @@ __all__ = ["is_sif", "to_sif", "sif4sci"] -def is_sif(item, check_formula=True, cache=False): +def is_sif(item, check_formula=True, return_parser=False): r""" Parameters ---------- @@ -20,8 +20,8 @@ def is_sif(item, check_formula=True, cache=False): True if check the validity of formulas in item False if not check the validity of formulas in item, which is faster - cache: bool - whether to put the parsed item in return. + return_parser: bool + whether to put the parsed item in return. when True, the format of return is (bool, Parser) when False, the format of return is bool @@ -40,7 +40,7 @@ def is_sif(item, check_formula=True, cache=False): >>> is_sif(text) True >>> text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...' - >>> ret = is_sif(text, cache=True) + >>> ret = is_sif(text, return_parser=True) >>> ret # doctest: +ELLIPSIS (False, ) """ @@ -49,7 +49,7 @@ def is_sif(item, check_formula=True, cache=False): if item_parser.fomula_illegal_flag: raise ValueError(item_parser.fomula_illegal_message) ret = True if item_parser.error_flag == 0 and item_parser.modify_flag == 0 else False - if cache is True: + if return_parser is True: return ret, item_parser else: return ret @@ -76,7 +76,7 @@ def to_sif(item, check_formula=True, cache_parser: Parser = None): >>> siftext = to_sif(text) >>> siftext '某校一个课外学习小组为研究某作物的发芽率$y$和温度$x$(单位...' 
- >>> ret = is_sif(text, cache=True) + >>> ret = is_sif(text, return_parser=True) >>> ret # doctest: +ELLIPSIS (False, ) >>> to_sif(text, cache_parser=ret[1]) @@ -86,10 +86,10 @@ def to_sif(item, check_formula=True, cache_parser: Parser = None): if cache_parser is not None: return cache_parser.text else: - return is_sif(item, check_formula, cache=True)[1].text + return is_sif(item, check_formula, return_parser=True)[1].text -def sif4sci(item: str, figures: (dict, bool) = None, safe_mode: int = 2, symbol: str = None, tokenization=True, +def sif4sci(item: str, figures: (dict, bool) = None, mode: int = 2, symbol: str = None, tokenization=True, tokenization_params=None, errors="raise"): r""" @@ -101,7 +101,7 @@ def sif4sci(item: str, figures: (dict, bool) = None, safe_mode: int = 2, symbol: figures: when it is a dict, it means the id-to-instance information for figures in 'FormFigureID{...}' format, when it is a bool, it means whether to instantiate figures in 'FormFigureBase64{...}' format - safe_mode: int + mode: int when safe = 2, use is_sif and check formula in item when safe = 1, use is_sif but don't check formula in item when safe = 0, don't use is_sif and don't check anything in item @@ -235,14 +235,14 @@ def sif4sci(item: str, figures: (dict, bool) = None, safe_mode: int = 2, symbol: [['已知'], ['说法', '中', '正确']] """ try: - if safe_mode in [1, 2]: - check_formula = True if safe_mode == 1 else False - sif, item_parser = is_sif(item, check_formula=check_formula, cache=True) + if mode in [1, 2]: + check_formula = True if mode == 1 else False + sif, item_parser = is_sif(item, check_formula=check_formula, return_parser=True) if sif is not True: item = to_sif(item, cache_parser=item_parser) - elif safe_mode != 0: + elif mode != 0: raise KeyError( - "Unknown safe_mode %s, use only 0 or 1 or 2." % safe_mode + "Unknown mode %s, use only 0 or 1 or 2." 
% mode ) ret = seg(item, figures, symbol) diff --git a/tests/test_sif/test_sif.py b/tests/test_sif/test_sif.py index b596e927..86606ed4 100644 --- a/tests/test_sif/test_sif.py +++ b/tests/test_sif/test_sif.py @@ -31,7 +31,7 @@ def test_to_sif(): siftext = to_sif(text) print(siftext) - ret = is_sif(text, cache=True) + ret = is_sif(text, return_parser=True) assert ret[0] == 0 if ret[0] is not True: siftext = to_sif(text, cache_parser=ret[1]) @@ -64,16 +64,16 @@ def test_sci4sif(figure0, figure1, figure0_base64, figure1_base64): } )) repr(sif4sci( - r"如图所示,则$\bigtriangleup ABC$的面积是$\SIFBlank$。$\FigureID{1}$", safe_mode=0 + r"如图所示,则$\bigtriangleup ABC$的面积是$\SIFBlank$。$\FigureID{1}$", mode=0 )) repr(sif4sci( - r"如图所示,则$\bigtriangleup ABC$的面积是$\SIFBlank$。$\FigureID{1}$", safe_mode=1 + r"如图所示,则$\bigtriangleup ABC$的面积是$\SIFBlank$。$\FigureID{1}$", mode=1 )) repr(sif4sci( - r"如图所示,则$\bigtriangleup ABC$的面积是$\SIFBlank$。$\FigureID{1}$", safe_mode=2 + r"如图所示,则$\bigtriangleup ABC$的面积是$\SIFBlank$。$\FigureID{1}$", mode=2 )) with pytest.raises(KeyError): repr(sif4sci( - r"如图所示,则$\bigtriangleup ABC$的面积是$\SIFBlank$。$\FigureID{1}$", safe_mode=3 + r"如图所示,则$\bigtriangleup ABC$的面积是$\SIFBlank$。$\FigureID{1}$", mode=3 )) From 24cc80b5d6e4395ca9a4b943bd07d8352b9c1d3f Mon Sep 17 00:00:00 2001 From: QinLonghu <1097824882@qq.com> Date: Sun, 26 Sep 2021 17:15:58 +0800 Subject: [PATCH 7/9] [examples] fix the sif examples --- examples/sif/sif.ipynb | 224 +++++++++++++++++++++++++++++------------ 1 file changed, 159 insertions(+), 65 deletions(-) diff --git a/examples/sif/sif.ipynb b/examples/sif/sif.ipynb index 50de8158..5df4f41b 100644 --- a/examples/sif/sif.ipynb +++ b/examples/sif/sif.ipynb @@ -83,7 +83,7 @@ "data": { "image/png": "", "text/plain": [ - "" + "" ] }, "metadata": {}, @@ -147,8 +147,7 @@ "output_type": "execute_result", "data": { "text/plain": [ - "(True,\n", - " '如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\\\\bigtriangleup 
ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\\\SIFChoice$$\\\\FigureID{1}$')" + "True" ] }, "metadata": {}, @@ -176,7 +175,7 @@ "output_type": "execute_result", "data": { "text/plain": [ - "(False, '某校一个课外学习小组为研究某作物的发芽率$y$和温度$x$(单位...')" + "False" ] }, "metadata": {}, @@ -373,7 +372,7 @@ "data": { "image/png": "", "text/plain": [ - "" + "" ] }, "metadata": {}, @@ -768,16 +767,16 @@ "output_type": "execute_result", "data": { "text/plain": [ - "[,\n", - " ,\n", - " ,\n", - " ,\n", - " ,\n", - " ,\n", - " ,\n", - " ,\n", - " ,\n", - " ]" + "[,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ]" ] }, "metadata": {}, @@ -790,25 +789,13 @@ "cell_type": "code", "execution_count": 20, "source": [ - "for i in range(0, len(f)):\n", - " ForestPlotter().export(\n", - " f[i], root_list=[node for node in f[i]],\n", - " )\n", - "# plt.show()\n" - ], - "outputs": [ - { - "output_type": "error", - "ename": "NameError", - "evalue": "name 'ForestPlotter' is not defined", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mf\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m ForestPlotter().export(\n\u001b[0m\u001b[0;32m 3\u001b[0m \u001b[0mf\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mi\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mroot_list\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mnode\u001b[0m \u001b[1;32mfor\u001b[0m 
\u001b[0mnode\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mf\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mi\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m )\n\u001b[0;32m 5\u001b[0m \u001b[1;31m# plt.show()\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;31mNameError\u001b[0m: name 'ForestPlotter' is not defined" - ] - } + "# for i in range(0, len(f)):\r\n", + "# ForestPlotter().export(\r\n", + "# f[i], root_list=[node for node in f[i]],\r\n", + "# )\r\n", + "# plt.show()\r\n" ], + "outputs": [], "metadata": {} }, { @@ -821,22 +808,70 @@ }, { "cell_type": "code", - "execution_count": null, - "source": [ - "sif4sci(\n", - " item[\"stem\"],\n", - " figures=figures,\n", - " tokenization=True,\n", - " tokenization_params={\n", - " \"formula_params\":{\n", - " \"method\": \"ast\",\n", - " \"return_type\": \"list\",\n", - " \"ord2token\": True,\n", - " }\n", - " }\n", + "execution_count": 21, + "source": [ + "sif4sci(\r\n", + " item[\"stem\"],\r\n", + " figures=figures,\r\n", + " tokenization=True,\r\n", + " tokenization_params={\r\n", + " \"formula_params\":{\r\n", + " \"method\": \"ast\",\r\n", + " \"return_type\": \"list\",\r\n", + " \"ord2token\": True,\r\n", + " }\r\n", + " }\r\n", ").formula_tokens" ], - "outputs": [], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " '\\\\bigtriangleup',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " ',',\n", + " 'mathord',\n", + " 'mathord',\n", + " ',',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + " 'mathord',\n", + 
" 'textord',\n", + " '\\\\supsub',\n", + " ',',\n", + " 'mathord',\n", + " 'textord',\n", + " '\\\\supsub',\n", + " ',',\n", + " 'mathord',\n", + " 'textord',\n", + " '\\\\supsub']" + ] + }, + "metadata": {}, + "execution_count": 21 + } + ], "metadata": { "collapsed": false, "pycharm": { @@ -853,23 +888,71 @@ }, { "cell_type": "code", - "execution_count": null, - "source": [ - "sif4sci(\n", - " item[\"stem\"],\n", - " figures=figures,\n", - " tokenization=True,\n", - " tokenization_params={\n", - " \"formula_params\":{\n", - " \"method\": \"ast\",\n", - " \"ord2token\": True,\n", - " \"return_type\": \"list\",\n", - " \"var_numbering\": True\n", - " }\n", - " }\n", + "execution_count": 22, + "source": [ + "sif4sci(\r\n", + " item[\"stem\"],\r\n", + " figures=figures,\r\n", + " tokenization=True,\r\n", + " tokenization_params={\r\n", + " \"formula_params\":{\r\n", + " \"method\": \"ast\",\r\n", + " \"ord2token\": True,\r\n", + " \"return_type\": \"list\",\r\n", + " \"var_numbering\": True\r\n", + " }\r\n", + " }\r\n", ").formula_tokens" ], - "outputs": [], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['mathord_0',\n", + " 'mathord_1',\n", + " 'mathord_2',\n", + " 'mathord_1',\n", + " 'mathord_2',\n", + " 'mathord_0',\n", + " 'mathord_1',\n", + " 'mathord_0',\n", + " 'mathord_2',\n", + " '\\\\bigtriangleup',\n", + " 'mathord_0',\n", + " 'mathord_1',\n", + " 'mathord_2',\n", + " 'mathord_3',\n", + " 'mathord_3',\n", + " 'mathord_3',\n", + " 'mathord_3',\n", + " 'mathord_3',\n", + " 'mathord_3',\n", + " 'mathord_3',\n", + " ',',\n", + " 'mathord_3',\n", + " 'mathord_3',\n", + " ',',\n", + " 'mathord_3',\n", + " 'mathord_3',\n", + " 'mathord_3',\n", + " 'mathord_4',\n", + " 'textord',\n", + " '\\\\supsub',\n", + " ',',\n", + " 'mathord_4',\n", + " 'textord',\n", + " '\\\\supsub',\n", + " ',',\n", + " 'mathord_4',\n", + " 'textord',\n", + " '\\\\supsub']" + ] + }, + "metadata": {}, + "execution_count": 22 + } + ], "metadata": { 
"collapsed": false, "pycharm": { @@ -893,12 +976,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "source": [ - "sif4sci(item[\"stem\"], figures=figures, tokenization=True,\n", + "sif4sci(item[\"stem\"], figures=figures, tokenization=True,\r\n", " symbol=\"fgm\")" ], - "outputs": [], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['如图', '古希腊', '数学家', '希波', '克拉底', '研究', '几何图形', '此图', '三个', '半圆', '三个', '半圆', '直径', '直角三角形', '[FORMULA]', '斜边', '[FORMULA]', '直角', '[FORMULA]', '[FORMULA]', '[FORMULA]', '三边', '围成', '区域', '记', '[FORMULA]', '黑色', '记', '[FORMULA]', '其余部分', '记', '[FORMULA]', '图形', '中', '随机', '取', '一点', '此点', '取自', '[FORMULA]', '概率', '记', '[FORMULA]', '[MARK]', '[FIGURE]']" + ] + }, + "metadata": {}, + "execution_count": 23 + } + ], "metadata": { "collapsed": false, "pycharm": { From 1d3889b7625b69305d6024ce5cf8f6590c50e2e9 Mon Sep 17 00:00:00 2001 From: QinLonghu <1097824882@qq.com> Date: Sun, 10 Oct 2021 12:44:50 +0800 Subject: [PATCH 8/9] change the param name in sif.py and update examples --- EduNLP/SIF/sif.py | 16 +- examples/sif/sif_addition.ipynb | 617 +++++++++++++++++--------------- tests/test_sif/test_sif.py | 2 +- 3 files changed, 344 insertions(+), 291 deletions(-) diff --git a/EduNLP/SIF/sif.py b/EduNLP/SIF/sif.py index 9c9a650e..ed5c99cd 100644 --- a/EduNLP/SIF/sif.py +++ b/EduNLP/SIF/sif.py @@ -55,15 +55,15 @@ def is_sif(item, check_formula=True, return_parser=False): return ret -def to_sif(item, check_formula=True, cache_parser: Parser = None): +def to_sif(item, check_formula=True, parser: Parser = None): r""" Parameters ---------- item: str check_formula: bool - whether to check the formulas when parsing item (only work when cache_parser=None). - cache_parser: Parser - the saved parser of item from is_sif. + whether to check the formulas when parsing item (only work when parser=None). + parser: Parser + the parser of item returned from is_sif. 
Returns ------- @@ -79,12 +79,12 @@ def to_sif(item, check_formula=True, cache_parser: Parser = None): >>> ret = is_sif(text, return_parser=True) >>> ret # doctest: +ELLIPSIS (False, ) - >>> to_sif(text, cache_parser=ret[1]) + >>> to_sif(text, parser=ret[1]) '某校一个课外学习小组为研究某作物的发芽率$y$和温度$x$(单位... """ - if cache_parser is not None: - return cache_parser.text + if parser is not None: + return parser.text else: return is_sif(item, check_formula, return_parser=True)[1].text @@ -239,7 +239,7 @@ def sif4sci(item: str, figures: (dict, bool) = None, mode: int = 2, symbol: str check_formula = True if mode == 1 else False sif, item_parser = is_sif(item, check_formula=check_formula, return_parser=True) if sif is not True: - item = to_sif(item, cache_parser=item_parser) + item = to_sif(item, parser=item_parser) elif mode != 0: raise KeyError( "Unknown mode %s, use only 0 or 1 or 2." % mode diff --git a/examples/sif/sif_addition.ipynb b/examples/sif/sif_addition.ipynb index c9f5424e..7a2a1b20 100644 --- a/examples/sif/sif_addition.ipynb +++ b/examples/sif/sif_addition.ipynb @@ -2,112 +2,166 @@ "cells": [ { "cell_type": "markdown", + "metadata": {}, "source": [ "# sif_addition" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 1, - "source": [ - "from EduNLP.SIF import is_sif, to_sif,sif4sci" - ], + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stderr", + "output_type": "stream", "text": [ "D:\\MySoftwares\\Anaconda\\envs\\data\\lib\\site-packages\\gensim\\similarities\\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package is unavailable. Install Levenhstein (e.g. 
`pip install python-Levenshtein`) to suppress this warning.\n", " warnings.warn(msg)\n" ] } ], - "metadata": {} + "source": [ + "from EduNLP.SIF import is_sif, to_sif,sif4sci" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## is_sif" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 2, - "source": [ - "text = '若$x,y$满足约束条件' \\\r\n", - " '$\\\\left\\\\{\\\\begin{array}{c}2 x+y-2 \\\\leq 0 \\\\\\\\ x-y-1 \\\\geq 0 \\\\\\\\ y+1 \\\\geq 0\\\\end{array}\\\\right.$,' \\\r\n", - " '则$z=x+7 y$的最大值$\\\\SIFUnderline$'\r\n", - " \r\n", - "is_sif(text)\r\n" - ], + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ - "(True,\n", - " '若$x,y$满足约束条件$\\\\left\\\\{\\\\begin{array}{c}2 x+y-2 \\\\leq 0 \\\\\\\\ x-y-1 \\\\geq 0 \\\\\\\\ y+1 \\\\geq 0\\\\end{array}\\\\right.$,则$z=x+7 y$的最大值$\\\\SIFUnderline$')" + "True" ] }, + "execution_count": 2, "metadata": {}, - "execution_count": 2 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "text = '若$x,y$满足约束条件' \\\n", + " '$\\\\left\\\\{\\\\begin{array}{c}2 x+y-2 \\\\leq 0 \\\\\\\\ x-y-1 \\\\geq 0 \\\\\\\\ y+1 \\\\geq 0\\\\end{array}\\\\right.$,' \\\n", + " '则$z=x+7 y$的最大值$\\\\SIFUnderline$'\n", + " \n", + "is_sif(text)\n" + ] }, { "cell_type": "code", "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...'\r\n", + "text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...'\n", "is_sif(text)" - ], + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ - "(False, '某校一个课外学习小组为研究某作物的发芽率$y$和温度$x$(单位...')" + "(False, )" ] }, + "execution_count": 4, "metadata": {}, - "execution_count": 3 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "text = 
'某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...'\n", + "is_sif(text, return_parser=True)" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## to_sif" - ], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 4, - "source": [ - "text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...'\r\n", - "to_sif(text)" - ], + "execution_count": 5, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "'某校一个课外学习小组为研究某作物的发芽率$y$和温度$x$(单位...'" ] }, + "execution_count": 5, "metadata": {}, - "execution_count": 4 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...'\n", + "to_sif(text)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1]siftext : 某校一个课外学习小组为研究某作物的发芽率$y$和温度$x$(单位... ,consume time [0.018142223358154297s]\n", + "[2]return : (False, )\n", + "[2]siftext : 某校一个课外学习小组为研究某作物的发芽率$y$和温度$x$(单位... 
,consume time [0.008990764617919922s]\n" + ] + } + ], + "source": [ + "import time\n", + "# ------------不使用‘加速’机制--------------- #\n", + "text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...'*150\n", + "start = time.time()\n", + "if not is_sif(text):\n", + " siftext = to_sif(text)\n", + "print(\"[1]siftext : {} ,consume time [{}s]\".format(siftext[:35], time.time() - start))\n", + "\n", + "# ------------使用‘加速’机制--------------- #\n", + "start = time.time()\n", + "ret = is_sif(text, return_parser=True)\n", + "print(\"[2]return : \", ret)\n", + "if ret[0] is not True:\n", + " siftext = to_sif(text, parser=ret[1])\n", + "print(\"[2]siftext : {} ,consume time [{}s]\".format(siftext[:35], time.time() - start))" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## sif4sci\n", " to_symbolize:\n", @@ -115,283 +169,279 @@ " - \"f\": formula\n", " - \"g\": figure\n", " - \"m\": question mark" - ], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 5, - "source": [ - "test_item = r\"如图所示,则$\\bigtriangleup ABC$的面积是$\\SIFBlank$。$\\FigureID{1}$\"\r\n", - "t1 = sif4sci(test_item)\r\n", - "t1" - ], + "execution_count": 7, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "['如图所示', '\\\\bigtriangleup', 'ABC', '面积', '\\\\SIFBlank', \\FigureID{1}]" ] }, + "execution_count": 7, "metadata": {}, - "execution_count": 5 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "test_item = r\"如图所示,则$\\bigtriangleup ABC$的面积是$\\SIFBlank$。$\\FigureID{1}$\"\n", + "t1 = sif4sci(test_item)\n", + "t1" + ] }, { "cell_type": "code", - "execution_count": 6, - "source": [ - "t1.describe()" - ], + "execution_count": 8, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "{'t': 2, 'f': 2, 'g': 1, 'm': 1}" ] }, + "execution_count": 8, "metadata": {}, - "execution_count": 6 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "t1.describe()" + ] }, { "cell_type": 
"code", - "execution_count": 7, - "source": [ - "with t1.filter('fgm'):\r\n", - " print(t1)" - ], + "execution_count": 9, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "['如图所示', '面积']\n" ] } ], - "metadata": {} + "source": [ + "with t1.filter('fgm'):\n", + " print(t1)" + ] }, { "cell_type": "code", - "execution_count": 8, - "source": [ - "with t1.filter(keep='t'):\r\n", - " print(t1)" - ], + "execution_count": 10, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "['如图所示', '面积']\n" ] } ], - "metadata": {} + "source": [ + "with t1.filter(keep='t'):\n", + " print(t1)" + ] }, { "cell_type": "code", - "execution_count": 9, - "source": [ - "with t1.filter():\r\n", - " print(t1)" - ], + "execution_count": 11, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "['如图所示', '\\\\bigtriangleup', 'ABC', '面积', '\\\\SIFBlank', \\FigureID{1}]\n" ] } ], - "metadata": {} + "source": [ + "with t1.filter():\n", + " print(t1)" + ] }, { "cell_type": "code", - "execution_count": 10, - "source": [ - "t1.text_tokens" - ], + "execution_count": 12, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "['如图所示', '面积']" ] }, + "execution_count": 12, "metadata": {}, - "execution_count": 10 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "t1.text_tokens" + ] }, { "cell_type": "code", - "execution_count": 11, - "source": [ - "t1.formula_tokens" - ], + "execution_count": 13, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "['\\\\bigtriangleup', 'ABC']" ] }, + "execution_count": 13, "metadata": {}, - "execution_count": 11 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "t1.formula_tokens" + ] }, { "cell_type": "code", - "execution_count": 12, - "source": [ - 
"t1.figure_tokens" - ], + "execution_count": 14, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "[\\FigureID{1}]" ] }, + "execution_count": 14, "metadata": {}, - "execution_count": 12 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "t1.figure_tokens" + ] }, { "cell_type": "code", - "execution_count": 13, - "source": [ - "t1.ques_mark_tokens" - ], + "execution_count": 15, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "['\\\\SIFBlank']" ] }, + "execution_count": 15, "metadata": {}, - "execution_count": 13 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "t1.ques_mark_tokens" + ] }, { "cell_type": "code", - "execution_count": 14, - "source": [ - "sif4sci(test_item, symbol=\"gm\", tokenization_params={\"formula_params\": {\"method\": \"ast\"}})" - ], + "execution_count": 16, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "['如图所示', , '面积', '[MARK]', '[FIGURE]']" ] }, + "execution_count": 16, "metadata": {}, - "execution_count": 14 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "sif4sci(test_item, symbol=\"gm\", tokenization_params={\"formula_params\": {\"method\": \"ast\"}})" + ] }, { "cell_type": "code", - "execution_count": 15, - "source": [ - "sif4sci(test_item, symbol=\"tfgm\")" - ], + "execution_count": 17, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "['[TEXT]', '[FORMULA]', '[TEXT]', '[MARK]', '[TEXT]', '[FIGURE]']" ] }, + "execution_count": 17, "metadata": {}, - "execution_count": 15 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "sif4sci(test_item, symbol=\"tfgm\")" + ] }, { "cell_type": "code", - "execution_count": 16, - "source": [ - "sif4sci(test_item, symbol=\"gm\", tokenization_params={\"formula_params\": {\"method\": \"ast\", \"return_type\": \"list\"}})" - ], + 
"execution_count": 18, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "['如图所示', '\\\\bigtriangleup', 'A', 'B', 'C', '面积', '[MARK]', '[FIGURE]']" ] }, + "execution_count": 18, "metadata": {}, - "execution_count": 16 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "sif4sci(test_item, symbol=\"gm\", tokenization_params={\"formula_params\": {\"method\": \"ast\", \"return_type\": \"list\"}})" + ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 19, + "metadata": {}, + "outputs": [], "source": [ - "test_item_1 = {\r\n", - " \"stem\": r\"若$x=2$, $y=\\sqrt{x}$,则下列说法正确的是$\\SIFChoice$\",\r\n", - " \"options\": [r\"$x < y$\", r\"$y = x$\", r\"$y < x$\"]\r\n", + "test_item_1 = {\n", + " \"stem\": r\"若$x=2$, $y=\\sqrt{x}$,则下列说法正确的是$\\SIFChoice$\",\n", + " \"options\": [r\"$x < y$\", r\"$y = x$\", r\"$y < x$\"]\n", "}" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 20, + "metadata": {}, + "outputs": [], "source": [ - "tls = [\r\n", - " sif4sci(e, symbol=\"gm\",\r\n", - " tokenization_params={\r\n", - " \"formula_params\": {\r\n", - " \"method\": \"ast\", \"return_type\": \"list\", \"ord2token\": True, \"var_numbering\": True,\r\n", - " \"link_variable\": False}\r\n", - " })\r\n", - " for e in ([test_item_1[\"stem\"]] + test_item_1[\"options\"])\r\n", + "tls = [\n", + " sif4sci(e, symbol=\"gm\",\n", + " tokenization_params={\n", + " \"formula_params\": {\n", + " \"method\": \"ast\", \"return_type\": \"list\", \"ord2token\": True, \"var_numbering\": True,\n", + " \"link_variable\": False}\n", + " })\n", + " for e in ([test_item_1[\"stem\"]] + test_item_1[\"options\"])\n", "]" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 19, - "source": [ - "tls" - ], + "execution_count": 21, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ 
"[['mathord_0', '=', 'textord', 'mathord_1', '=', 'mathord_0', '{ }', '\\\\sqrt', '说法', '正确', '[MARK]'],\n", @@ -400,21 +450,21 @@ " ['mathord_0', '<', 'mathord_1']]" ] }, + "execution_count": 21, "metadata": {}, - "execution_count": 19 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "tls" + ] }, { "cell_type": "code", - "execution_count": 20, - "source": [ - "tls[1:]" - ], + "execution_count": 22, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "[['mathord_0', '<', 'mathord_1'],\n", @@ -422,100 +472,100 @@ " ['mathord_0', '<', 'mathord_1']]" ] }, + "execution_count": 22, "metadata": {}, - "execution_count": 20 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "tls[1:]" + ] }, { "cell_type": "code", - "execution_count": 21, - "source": [ - "from EduNLP.utils import dict2str4sif\r\n", - "\r\n", - "test_item_1_str = dict2str4sif(test_item_1, tag_mode=\"head\", add_list_no_tag=False)\r\n", - "test_item_1_str " - ], + "execution_count": 23, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "'$\\\\SIFTag{stem}$若$x=2$, $y=\\\\sqrt{x}$,则下列说法正确的是$\\\\SIFChoice$$\\\\SIFTag{options}$$x < y$$\\\\SIFSep$$y = x$$\\\\SIFSep$$y < x$'" ] }, + "execution_count": 23, "metadata": {}, - "execution_count": 21 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "from EduNLP.utils import dict2str4sif\n", + "\n", + "test_item_1_str = dict2str4sif(test_item_1, tag_mode=\"head\", add_list_no_tag=False)\n", + "test_item_1_str " + ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 24, + "metadata": {}, + "outputs": [], "source": [ - "tl1 = sif4sci(\r\n", - " test_item_1_str, \r\n", - " symbol=\"gm\", \r\n", - " tokenization_params={\r\n", - " \"formula_params\": {\"method\": \"ast\", \"return_type\": \"list\", \"ord2token\": True}\r\n", - " })\r\n", + "tl1 = sif4sci(\n", + " test_item_1_str, \n", + " symbol=\"gm\", 
\n", + " tokenization_params={\n", + " \"formula_params\": {\"method\": \"ast\", \"return_type\": \"list\", \"ord2token\": True}\n", + " })\n", " " - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 23, - "source": [ - "tl1.get_segments()[0]" - ], + "execution_count": 25, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "['\\\\SIFTag{stem}']" ] }, + "execution_count": 25, "metadata": {}, - "execution_count": 23 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "tl1.get_segments()[0]" + ] }, { "cell_type": "code", - "execution_count": 24, - "source": [ - "tl1.get_segments()[1:3]" - ], + "execution_count": 26, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "[['[TEXT_BEGIN]', '[TEXT_END]'],\n", " ['[FORMULA_BEGIN]', 'mathord', '=', 'textord', '[FORMULA_END]']]" ] }, + "execution_count": 26, "metadata": {}, - "execution_count": 24 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "tl1.get_segments()[1:3]" + ] }, { "cell_type": "code", - "execution_count": 25, - "source": [ - "tl1.get_segments(add_seg_type=False)[0:3]" - ], + "execution_count": 27, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "[['\\\\SIFTag{stem}'],\n", @@ -523,81 +573,81 @@ " ['mathord', '=', 'mathord', '{ }', '\\\\sqrt']]" ] }, + "execution_count": 27, "metadata": {}, - "execution_count": 25 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "tl1.get_segments(add_seg_type=False)[0:3]" + ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 28, + "metadata": {}, + "outputs": [], "source": [ "test_item_2 = {\"options\": [r\"$x < y$\", r\"$y = x$\", r\"$y < x$\"]}" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 29, + "metadata": {}, + "outputs": [], "source": [ "test_item_2_str 
= dict2str4sif(test_item_2, tag_mode=\"head\", add_list_no_tag=False)" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 28, - "source": [ - "test_item_2_str" - ], + "execution_count": 30, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "'$\\\\SIFTag{options}$$x < y$$\\\\SIFSep$$y = x$$\\\\SIFSep$$y < x$'" ] }, + "execution_count": 30, "metadata": {}, - "execution_count": 28 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "test_item_2_str" + ] }, { "cell_type": "code", - "execution_count": 29, - "source": [ - "tl2 = sif4sci(test_item_2_str, symbol=\"gms\",\r\n", - " tokenization_params={\"formula_params\": {\"method\": \"ast\", \"return_type\": \"list\"}})\r\n", - "tl2 " - ], + "execution_count": 31, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "['\\\\SIFTag{options}', 'x', '<', 'y', '[SEP]', 'y', '=', 'x', '[SEP]', 'y', '<', 'x']" ] }, + "execution_count": 31, "metadata": {}, - "execution_count": 29 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "tl2 = sif4sci(test_item_2_str, symbol=\"gms\",\n", + " tokenization_params={\"formula_params\": {\"method\": \"ast\", \"return_type\": \"list\"}})\n", + "tl2 " + ] }, { "cell_type": "code", - "execution_count": 30, - "source": [ - "tl2.get_segments(add_seg_type=False)" - ], + "execution_count": 32, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "[['\\\\SIFTag{options}'],\n", @@ -608,143 +658,146 @@ " ['y', '<', 'x']]" ] }, + "execution_count": 32, "metadata": {}, - "execution_count": 30 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "tl2.get_segments(add_seg_type=False)" + ] }, { "cell_type": "code", - "execution_count": 31, - "source": [ - "tl2.get_segments(add_seg_type=False, drop=\"s\")" - ], + "execution_count": 33, + "metadata": {}, "outputs": [ { - "output_type": 
"execute_result", "data": { "text/plain": [ "[['\\\\SIFTag{options}'], ['x', '<', 'y'], ['y', '=', 'x'], ['y', '<', 'x']]" ] }, + "execution_count": 33, "metadata": {}, - "execution_count": 31 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "tl2.get_segments(add_seg_type=False, drop=\"s\")" + ] }, { "cell_type": "code", - "execution_count": 32, - "source": [ - "tl3 = sif4sci(test_item_1[\"stem\"], symbol=\"gs\")\r\n", - "tl3.text_segments" - ], + "execution_count": 34, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "[['说法', '正确']]" ] }, + "execution_count": 34, "metadata": {}, - "execution_count": 32 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "tl3 = sif4sci(test_item_1[\"stem\"], symbol=\"gs\")\n", + "tl3.text_segments" + ] }, { "cell_type": "code", - "execution_count": 33, - "source": [ - "tl3.formula_segments" - ], + "execution_count": 35, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "[['x', '=', '2'], ['y', '=', '\\\\sqrt', '{', 'x', '}']]" ] }, + "execution_count": 35, "metadata": {}, - "execution_count": 33 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "tl3.formula_segments" + ] }, { "cell_type": "code", - "execution_count": 34, - "source": [ - "tl3.figure_segments" - ], + "execution_count": 36, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "[]" ] }, + "execution_count": 36, "metadata": {}, - "execution_count": 34 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "tl3.figure_segments" + ] }, { "cell_type": "code", - "execution_count": 35, - "source": [ - "tl3.ques_mark_segments" - ], + "execution_count": 37, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "[['\\\\SIFChoice']]" ] }, + "execution_count": 37, "metadata": {}, - "execution_count": 35 + "output_type": 
"execute_result" } ], - "metadata": {} + "source": [ + "tl3.ques_mark_segments" + ] }, { "cell_type": "code", "execution_count": null, - "source": [], + "metadata": {}, "outputs": [], - "metadata": {} + "source": [] } ], "metadata": { - "orig_nbformat": 4, + "interpreter": { + "hash": "776957673adb719a00031a24ed5efd2fa5ce8a13405e5193f8d278edd3805d55" + }, + "kernelspec": { + "display_name": "Python 3.6.13 64-bit ('data': conda)", + "name": "python3" + }, "language_info": { - "name": "python", - "version": "3.6.13", - "mimetype": "text/x-python", "codemirror_mode": { "name": "ipython", "version": 3 }, - "pygments_lexer": "ipython3", + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", "nbconvert_exporter": "python", - "file_extension": ".py" - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3.6.13 64-bit ('data': conda)" + "pygments_lexer": "ipython3", + "version": "3.6.13" }, - "interpreter": { - "hash": "776957673adb719a00031a24ed5efd2fa5ce8a13405e5193f8d278edd3805d55" - } + "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 -} \ No newline at end of file +} diff --git a/tests/test_sif/test_sif.py b/tests/test_sif/test_sif.py index 86606ed4..02d30132 100644 --- a/tests/test_sif/test_sif.py +++ b/tests/test_sif/test_sif.py @@ -34,7 +34,7 @@ def test_to_sif(): ret = is_sif(text, return_parser=True) assert ret[0] == 0 if ret[0] is not True: - siftext = to_sif(text, cache_parser=ret[1]) + siftext = to_sif(text, parser=ret[1]) print(siftext) From f82e1860368490f00c5038a07cfea60fa2179b10 Mon Sep 17 00:00:00 2001 From: tswsxk Date: Sun, 10 Oct 2021 19:16:19 +0800 Subject: [PATCH 9/9] Update sif.py --- EduNLP/SIF/sif.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/EduNLP/SIF/sif.py b/EduNLP/SIF/sif.py index ed5c99cd..93489d0f 100644 --- a/EduNLP/SIF/sif.py +++ b/EduNLP/SIF/sif.py @@ -28,7 +28,7 @@ def is_sif(item, check_formula=True, return_parser=False): Returns ------- - when item can not be parsed 
correctly, raise ValueError; when item is in standard format originally, return True (and the Parser of item); when item isn't in standard format originally, return False (and the Parser of item);