diff --git a/EduNLP/SIF/parser/parser.py b/EduNLP/SIF/parser/parser.py index 471bb450..b9b1e269 100644 --- a/EduNLP/SIF/parser/parser.py +++ b/EduNLP/SIF/parser/parser.py @@ -15,7 +15,7 @@ class Parser: description_list use Parser to process and describe the txt """ - def __init__(self, data): + def __init__(self, data, check_formula=True): self.lookahead = 0 self.head = 0 self.text = data @@ -26,6 +26,7 @@ def __init__(self, data): self.warnning = 0 self.fomula_illegal_flag = 0 self.fomula_illegal_message = '' + self.check_formula = check_formula # 定义特殊变量 self.len_bracket = len('$\\SIFChoice$') @@ -254,8 +255,9 @@ def get_token(self): if self.head >= len(self.text): self.call_error() return self.error - # 检查 latex 公式的完整性和可解析性 - if not self._is_formula_legal(self.text[formula_start:self.head]): + + # 检查latex公式的完整性和可解析性 + if self.check_formula and not self._is_formula_legal(self.text[formula_start:self.head]): self.call_error() return self.error self.head += 1 diff --git a/EduNLP/SIF/sif.py b/EduNLP/SIF/sif.py index 68787131..41518966 100644 --- a/EduNLP/SIF/sif.py +++ b/EduNLP/SIF/sif.py @@ -10,7 +10,7 @@ __all__ = ["is_sif", "to_sif", "sif4sci"] -def is_sif(item): +def is_sif(item, check_formula=True, return_parser=False): r""" the part aims to check whether the input is sif format @@ -18,13 +18,23 @@ def is_sif(item): ---------- item:str a raw item which respects stem + check_formula: bool + whether to check the formulas when parsing item. + + True if check the validity of formulas in item + False if not check the validity of formulas in item, which is faster + return_parser: bool + whether to put the parsed item in return. 
+ + when True, the format of return is (bool, Parser) + when False, the format of return is bool Returns ------- bool - when item can not be parsed correctly, raise Error; - when item doesn't need to be modified, return Ture; - when item needs to be modified, return False; + when item can not be parsed correctly, raise ValueError; + when item is in standard format originally, return True (and the Parser of item); + when item isn't in standard format originally, return False (and the Parser of item); Examples -------- @@ -34,19 +44,22 @@ def is_sif(item): >>> is_sif(text) True >>> text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...' - >>> is_sif(text) - False + >>> ret = is_sif(text, return_parser=True) + >>> ret # doctest: +ELLIPSIS + (False, <EduNLP.SIF.parser.parser.Parser object at ...>) """ - item_parser = Parser(item) + item_parser = Parser(item, check_formula) item_parser.description_list() if item_parser.fomula_illegal_flag: raise ValueError(item_parser.fomula_illegal_message) - if item_parser.error_flag == 0 and item_parser.modify_flag == 0: - return True - return False + ret = True if item_parser.error_flag == 0 and item_parser.modify_flag == 0 else False + if return_parser is True: + return ret, item_parser + else: + return ret -def to_sif(item): +def to_sif(item, check_formula=True, parser: Parser = None): r""" the part aims to switch item to sif formate @@ -54,6 +67,10 @@ def to_sif(item): ---------- items:str a raw item which respects stem + check_formula: bool + whether to check the formulas when parsing item (only works when parser=None). + parser: Parser + the parser of item returned from is_sif. Returns ------- @@ -66,14 +83,20 @@ def to_sif(item): >>> siftext = to_sif(text) >>> siftext '某校一个课外学习小组为研究某作物的发芽率$y$和温度$x$(单位...' + >>> ret = is_sif(text, return_parser=True) + >>> ret # doctest: +ELLIPSIS + (False, <EduNLP.SIF.parser.parser.Parser object at ...>) + >>> to_sif(text, parser=ret[1]) + '某校一个课外学习小组为研究某作物的发芽率$y$和温度$x$(单位... 
+ """ - item_parser = Parser(item) - item_parser.description_list() - item = item_parser.text - return item + if parser is not None: + return parser.text + else: + return is_sif(item, check_formula, return_parser=True)[1].text -def sif4sci(item: str, figures: (dict, bool) = None, safe=True, symbol: str = None, tokenization=True, +def sif4sci(item: str, figures: (dict, bool) = None, mode: int = 2, symbol: str = None, tokenization=True, tokenization_params=None, errors="raise"): r""" @@ -84,12 +107,15 @@ def sif4sci(item: str, figures: (dict, bool) = None, safe=True, symbol: str = No item:str a raw item which respects stem figures:dict - {"FigureID": Base64 encoding of the figure} + when it is a dict, it means the id-to-instance for figures in 'FormFigureID{...}' format, + when it is a bool, it means whether to instantiate figures in 'FormFigureBase64{...}' format - safe:bool - Check whether the text conforms to the sif format + mode: int + when mode = 2, use is_sif and check formula in item + when mode = 1, use is_sif but don't check formula in item + when mode = 0, don't use is_sif and don't check anything in item - symbol:str + symbol: str select the methods to symbolize: "t": text "f": formula @@ -98,17 +124,26 @@ def sif4sci(item: str, figures: (dict, bool) = None, safe=True, symbol: str = No "a": tag "s": sep - tokenization:bool - True: tokenize the item + tokenization: bool + whether to tokenize item after segmentation tokenization_params: - method: which tokenizer to be used, "linear" or "ast" - - The parameters only useful for "linear": None + the dict of text_params, formula_params and figure_params in tokenization + For formula_params: + method: which tokenizer to be used, "linear" or "ast" + The parameters only useful for "linear": + skip_figure_formula: whether to skip the formula in figure format + symbolize_figure_formula: whether to symbolize the formula in figure format + The parameters only useful for "ast": + ord2token: whether to transfer the 
variables (mathord) and constants (textord) to special tokens. + var_numbering: whether to use number suffix to denote different variables + return_type: 'list' or 'ast' + More parameters can be found in the definition in SIF.tokenization.formula + For figure_params: + figure_instance:whether to return instance of figures in tokens + For text_params: + See definition in SIF.tokenization.text - The parameters only useful for "ast": - ord2token: whether to transfer the variables (mathord) and constants (textord) to special tokens. - var_numbering: whether to use number suffix to denote different variables errors: warn, raise, @@ -214,8 +249,15 @@ def sif4sci(item: str, figures: (dict, bool) = None, safe=True, symbol: str = No [['已知'], ['说法', '中', '正确']] """ try: - if safe is True and is_sif(item) is not True: - item = to_sif(item) + if mode in [1, 2]: + check_formula = True if mode == 1 else False + sif, item_parser = is_sif(item, check_formula=check_formula, return_parser=True) + if sif is not True: + item = to_sif(item, parser=item_parser) + elif mode != 0: + raise KeyError( + "Unknown mode %s, use only 0 or 1 or 2." 
% mode + ) ret = seg(item, figures, symbol) diff --git a/examples/sif/sif.ipynb b/examples/sif/sif.ipynb index 25affe58..3758f3f8 100644 --- a/examples/sif/sif.ipynb +++ b/examples/sif/sif.ipynb @@ -34,12 +34,12 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 1, "source": [ - "item = {\n", - " \"stem\": r\"如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\SIFChoice$$\\FigureID{1}$\",\n", - " \"options\": [\"$p_1=p_2$\", \"$p_1=p_3$\", \"$p_2=p_3$\", \"$p_1=p_2+p_3$\"]\n", - "}\n", + "item = {\r\n", + " \"stem\": r\"如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\SIFChoice$$\\FigureID{1}$\",\r\n", + " \"options\": [\"$p_1=p_2$\", \"$p_1=p_3$\", \"$p_2=p_3$\", \"$p_1=p_2+p_3$\"]\r\n", + "}\r\n", "item[\"stem\"]" ], "outputs": [ @@ -51,7 +51,7 @@ ] }, "metadata": {}, - "execution_count": 5 + "execution_count": 1 } ], "metadata": { @@ -70,24 +70,24 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 2, "source": [ - "from PIL import Image\n", - "img = Image.open(\"../../asset/_static/item_figure.png\")\n", - "figures = {\"1\": img}\n", + "from PIL import Image\r\n", + "img = Image.open(\"../../asset/_static/item_figure.png\")\r\n", + "figures = {\"1\": img}\r\n", "img" ], "outputs": [ { "output_type": "execute_result", "data": { + "image/png": "", "text/plain": [ - "" - ], - "image/png": "" + "" + ] }, "metadata": {}, - "execution_count": 6 + "execution_count": 2 } ], "metadata": { @@ -108,11 +108,20 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "source": [ "from EduNLP.SIF import sif4sci, is_sif, to_sif" ], - "outputs": [], + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + 
"D:\\MySoftwares\\Anaconda\\envs\\data\\lib\\site-packages\\gensim\\similarities\\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.\n", + " warnings.warn(msg)\n" + ] + } + ], "metadata": { "collapsed": false, "pycharm": { @@ -129,7 +138,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 4, "source": [ "is_sif(item['stem'])" ], @@ -142,7 +151,7 @@ ] }, "metadata": {}, - "execution_count": 7 + "execution_count": 4 } ], "metadata": {} @@ -156,9 +165,9 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 5, "source": [ - "text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...'\n", + "text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...'\r\n", "is_sif(text)" ], "outputs": [ @@ -170,17 +179,17 @@ ] }, "metadata": {}, - "execution_count": 8 + "execution_count": 5 } ], "metadata": {} }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 6, "source": [ - "text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...'\n", - "to_sif(text)\n" + "text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...'\r\n", + "to_sif(text)\r\n" ], "outputs": [ { @@ -191,7 +200,7 @@ ] }, "metadata": {}, - "execution_count": 9 + "execution_count": 6 } ], "metadata": {} @@ -232,9 +241,9 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 7, "source": [ - "segments = sif4sci(item[\"stem\"], figures=figures, tokenization=False)\n", + "segments = sif4sci(item[\"stem\"], figures=figures, tokenization=False)\r\n", "segments" ], "outputs": [ @@ -246,7 +255,7 @@ ] }, "metadata": {}, - "execution_count": 12 + "execution_count": 7 } ], "metadata": {} @@ -260,7 +269,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 8, "source": [ "segments.text_segments" ], @@ -283,7 +292,7 @@ ] }, "metadata": {}, - "execution_count": 13 + "execution_count": 8 } ], "metadata": {} @@ -297,9 +306,9 @@ 
}, { "cell_type": "code", - "execution_count": 15, + "execution_count": 9, "source": [ - "segments.formula_segments\n" + "segments.formula_segments\r\n" ], "outputs": [ { @@ -319,7 +328,7 @@ ] }, "metadata": {}, - "execution_count": 15 + "execution_count": 9 } ], "metadata": {} @@ -333,7 +342,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 10, "source": [ "segments.figure_segments" ], @@ -346,14 +355,14 @@ ] }, "metadata": {}, - "execution_count": 16 + "execution_count": 10 } ], "metadata": {} }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 11, "source": [ "segments.figure_segments[0].figure" ], @@ -361,13 +370,13 @@ { "output_type": "execute_result", "data": { + "image/png": "", "text/plain": [ - "" - ], - "image/png": "" + "" + ] }, "metadata": {}, - "execution_count": 17 + "execution_count": 11 } ], "metadata": {} @@ -381,7 +390,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 12, "source": [ "segments.ques_mark_segments" ], @@ -394,7 +403,7 @@ ] }, "metadata": {}, - "execution_count": 19 + "execution_count": 12 } ], "metadata": {} @@ -420,7 +429,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 13, "source": [ "sif4sci(item[\"stem\"], figures=figures, tokenization=False, symbol=\"tfgm\")" ], @@ -433,7 +442,7 @@ ] }, "metadata": {}, - "execution_count": 11 + "execution_count": 13 } ], "metadata": { @@ -461,7 +470,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 14, "source": [ "tokens = sif4sci(item[\"stem\"], figures=figures, tokenization=True)" ], @@ -487,7 +496,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 15, "source": [ "tokens.text_tokens" ], @@ -532,7 +541,7 @@ ] }, "metadata": {}, - "execution_count": 12 + "execution_count": 15 } ], "metadata": { @@ -556,7 +565,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 16, "source": [ "tokens.formula_tokens" ], @@ -593,7 
+602,7 @@ ] }, "metadata": {}, - "execution_count": 13 + "execution_count": 16 } ], "metadata": { @@ -619,17 +628,17 @@ }, { "cell_type": "code", - "execution_count": 37, - "source": [ - "sif4sci(\n", - " item[\"stem\"],\n", - " figures=figures,\n", - " tokenization=True,\n", - " tokenization_params={\n", - " \"formula_params\": {\n", - " \"method\": \"linear\",\n", - " }\n", - " }\n", + "execution_count": 17, + "source": [ + "sif4sci(\r\n", + " item[\"stem\"],\r\n", + " figures=figures,\r\n", + " tokenization=True,\r\n", + " tokenization_params={\r\n", + " \"formula_params\": {\r\n", + " \"method\": \"linear\",\r\n", + " }\r\n", + " }\r\n", ").formula_tokens" ], "outputs": [ @@ -665,7 +674,7 @@ ] }, "metadata": {}, - "execution_count": 37 + "execution_count": 17 } ], "metadata": { @@ -686,18 +695,18 @@ }, { "cell_type": "code", - "execution_count": 39, - "source": [ - "sif4sci(\n", - " item[\"stem\"],\n", - " figures=figures,\n", - " tokenization=True,\n", - " tokenization_params={\n", - " \"formula_params\":{\n", - " \"method\": \"ast\",\n", - " }\n", - " }\n", - ").formula_tokens\n" + "execution_count": 18, + "source": [ + "sif4sci(\r\n", + " item[\"stem\"],\r\n", + " figures=figures,\r\n", + " tokenization=True,\r\n", + " tokenization_params={\r\n", + " \"formula_params\":{\r\n", + " \"method\": \"ast\",\r\n", + " }\r\n", + " }\r\n", + ").formula_tokens\r\n" ], "outputs": [ { @@ -717,7 +726,7 @@ ] }, "metadata": {}, - "execution_count": 39 + "execution_count": 18 } ], "metadata": { @@ -736,55 +745,55 @@ }, { "cell_type": "code", - "execution_count": 109, - "source": [ - "f = sif4sci(\n", - " item[\"stem\"],\n", - " figures=figures,\n", - " tokenization=True,\n", - " tokenization_params={\n", - " \"formula_params\":{\n", - " \"method\": \"ast\",\n", - " \"return_type\": \"ast\",\n", - " \"ord2token\": True,\n", - " \"var_numbering\": True,\n", - " }\n", - " }\n", - ").formula_tokens\n", - "f\n" + "execution_count": 19, + "source": [ + "f = sif4sci(\r\n", + " 
item[\"stem\"],\r\n", + " figures=figures,\r\n", + " tokenization=True,\r\n", + " tokenization_params={\r\n", + " \"formula_params\":{\r\n", + " \"method\": \"ast\",\r\n", + " \"return_type\": \"ast\",\r\n", + " \"ord2token\": True,\r\n", + " \"var_numbering\": True,\r\n", + " }\r\n", + " }\r\n", + ").formula_tokens\r\n", + "f\r\n" ], "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ - "[,\n", - " ,\n", - " ,\n", - " ,\n", - " ,\n", - " ,\n", - " ,\n", - " ,\n", - " ,\n", - " ]" + "[,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ]" ] }, "metadata": {}, - "execution_count": 109 + "execution_count": 19 } ], "metadata": {} }, { "cell_type": "code", - "execution_count": 110, + "execution_count": 20, "source": [ - "for i in range(0, len(f)):\n", - " ForestPlotter().export(\n", - " f[i], root_list=[node for node in f[i]],\n", - " )\n", - "# plt.show()\n" + "# for i in range(0, len(f)):\r\n", + "# ForestPlotter().export(\r\n", + "# f[i], root_list=[node for node in f[i]],\r\n", + "# )\r\n", + "# plt.show()\r\n" ], "outputs": [], "metadata": {} @@ -799,19 +808,19 @@ }, { "cell_type": "code", - "execution_count": 40, - "source": [ - "sif4sci(\n", - " item[\"stem\"],\n", - " figures=figures,\n", - " tokenization=True,\n", - " tokenization_params={\n", - " \"formula_params\":{\n", - " \"method\": \"ast\",\n", - " \"return_type\": \"list\",\n", - " \"ord2token\": True,\n", - " }\n", - " }\n", + "execution_count": 21, + "source": [ + "sif4sci(\r\n", + " item[\"stem\"],\r\n", + " figures=figures,\r\n", + " tokenization=True,\r\n", + " tokenization_params={\r\n", + " \"formula_params\":{\r\n", + " \"method\": \"ast\",\r\n", + " \"return_type\": \"list\",\r\n", + " \"ord2token\": True,\r\n", + " }\r\n", + " }\r\n", ").formula_tokens" ], "outputs": [ @@ -860,7 +869,7 @@ ] }, "metadata": {}, - "execution_count": 40 + "execution_count": 21 } ], "metadata": { @@ -879,20 +888,20 @@ }, { "cell_type": "code", - 
"execution_count": 44, - "source": [ - "sif4sci(\n", - " item[\"stem\"],\n", - " figures=figures,\n", - " tokenization=True,\n", - " tokenization_params={\n", - " \"formula_params\":{\n", - " \"method\": \"ast\",\n", - " \"ord2token\": True,\n", - " \"return_type\": \"list\",\n", - " \"var_numbering\": True\n", - " }\n", - " }\n", + "execution_count": 22, + "source": [ + "sif4sci(\r\n", + " item[\"stem\"],\r\n", + " figures=figures,\r\n", + " tokenization=True,\r\n", + " tokenization_params={\r\n", + " \"formula_params\":{\r\n", + " \"method\": \"ast\",\r\n", + " \"ord2token\": True,\r\n", + " \"return_type\": \"list\",\r\n", + " \"var_numbering\": True\r\n", + " }\r\n", + " }\r\n", ").formula_tokens" ], "outputs": [ @@ -941,7 +950,7 @@ ] }, "metadata": {}, - "execution_count": 44 + "execution_count": 22 } ], "metadata": { @@ -967,9 +976,9 @@ }, { "cell_type": "code", - "execution_count": 96, + "execution_count": 23, "source": [ - "sif4sci(item[\"stem\"], figures=figures, tokenization=True,\n", + "sif4sci(item[\"stem\"], figures=figures, tokenization=True,\r\n", " symbol=\"fgm\")" ], "outputs": [ @@ -981,7 +990,7 @@ ] }, "metadata": {}, - "execution_count": 96 + "execution_count": 23 } ], "metadata": { @@ -995,11 +1004,11 @@ "metadata": { "kernelspec": { "name": "python3", - "display_name": "Python 3.8.5 64-bit" + "display_name": "Python 3.6.13 64-bit ('data': conda)" }, "language_info": { "name": "python", - "version": "3.8.5", + "version": "3.6.13", "mimetype": "text/x-python", "codemirror_mode": { "name": "ipython", @@ -1010,7 +1019,7 @@ "file_extension": ".py" }, "interpreter": { - "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a" + "hash": "776957673adb719a00031a24ed5efd2fa5ce8a13405e5193f8d278edd3805d55" } }, "nbformat": 4, diff --git a/examples/sif/sif_addition.ipynb b/examples/sif/sif_addition.ipynb index 57830c43..7a2a1b20 100644 --- a/examples/sif/sif_addition.ipynb +++ b/examples/sif/sif_addition.ipynb @@ -2,102 +2,166 @@ 
"cells": [ { "cell_type": "markdown", + "metadata": {}, "source": [ "# sif_addition" - ], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "D:\\MySoftwares\\Anaconda\\envs\\data\\lib\\site-packages\\gensim\\similarities\\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.\n", + " warnings.warn(msg)\n" + ] + } + ], "source": [ "from EduNLP.SIF import is_sif, to_sif,sif4sci" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## is_sif" - ], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 4, - "source": [ - " text = '若$x,y$满足约束条件' \\\r\n", - " '$\\\\left\\\\{\\\\begin{array}{c}2 x+y-2 \\\\leq 0 \\\\\\\\ x-y-1 \\\\geq 0 \\\\\\\\ y+1 \\\\geq 0\\\\end{array}\\\\right.$,' \\\r\n", - " '则$z=x+7 y$的最大值$\\\\SIFUnderline$'\r\n", - " \r\n", - "is_sif(text)\r\n" - ], + "execution_count": 2, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "True" ] }, + "execution_count": 2, "metadata": {}, - "execution_count": 4 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "text = '若$x,y$满足约束条件' \\\n", + " '$\\\\left\\\\{\\\\begin{array}{c}2 x+y-2 \\\\leq 0 \\\\\\\\ x-y-1 \\\\geq 0 \\\\\\\\ y+1 \\\\geq 0\\\\end{array}\\\\right.$,' \\\n", + " '则$z=x+7 y$的最大值$\\\\SIFUnderline$'\n", + " \n", + "is_sif(text)\n" + ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...'\r\n", + "text = 
'某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...'\n", "is_sif(text)" - ], + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ - "False" + "(False, )" ] }, + "execution_count": 4, "metadata": {}, - "execution_count": 5 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...'\n", + "is_sif(text, return_parser=True)" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## to_sif" - ], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 6, - "source": [ - "text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...'\r\n", - "to_sif(text)" - ], + "execution_count": 5, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "'某校一个课外学习小组为研究某作物的发芽率$y$和温度$x$(单位...'" ] }, + "execution_count": 5, "metadata": {}, - "execution_count": 6 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...'\n", + "to_sif(text)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1]siftext : 某校一个课外学习小组为研究某作物的发芽率$y$和温度$x$(单位... ,consume time [0.018142223358154297s]\n", + "[2]return : (False, )\n", + "[2]siftext : 某校一个课外学习小组为研究某作物的发芽率$y$和温度$x$(单位... 
,consume time [0.008990764617919922s]\n" + ] + } + ], + "source": [ + "import time\n", + "# ------------不使用‘加速’机制--------------- #\n", + "text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...'*150\n", + "start = time.time()\n", + "if not is_sif(text):\n", + " siftext = to_sif(text)\n", + "print(\"[1]siftext : {} ,consume time [{}s]\".format(siftext[:35], time.time() - start))\n", + "\n", + "# ------------使用‘加速’机制--------------- #\n", + "start = time.time()\n", + "ret = is_sif(text, return_parser=True)\n", + "print(\"[2]return : \", ret)\n", + "if ret[0] is not True:\n", + " siftext = to_sif(text, parser=ret[1])\n", + "print(\"[2]siftext : {} ,consume time [{}s]\".format(siftext[:35], time.time() - start))" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## sif4sci\n", " to_symbolize:\n", @@ -105,283 +169,279 @@ " - \"f\": formula\n", " - \"g\": figure\n", " - \"m\": question mark" - ], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 14, - "source": [ - " test_item = r\"如图所示,则$\\bigtriangleup ABC$的面积是$\\SIFBlank$。$\\FigureID{1}$\"\r\n", - " t1 = sif4sci(test_item)\r\n", - " t1" - ], + "execution_count": 7, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "['如图所示', '\\\\bigtriangleup', 'ABC', '面积', '\\\\SIFBlank', \\FigureID{1}]" ] }, + "execution_count": 7, "metadata": {}, - "execution_count": 14 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "test_item = r\"如图所示,则$\\bigtriangleup ABC$的面积是$\\SIFBlank$。$\\FigureID{1}$\"\n", + "t1 = sif4sci(test_item)\n", + "t1" + ] }, { "cell_type": "code", - "execution_count": 15, - "source": [ - "t1.describe()" - ], + "execution_count": 8, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "{'t': 2, 'f': 2, 'g': 1, 'm': 1}" ] }, + "execution_count": 8, "metadata": {}, - "execution_count": 15 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "t1.describe()" + ] }, { 
"cell_type": "code", - "execution_count": 17, - "source": [ - "with t1.filter('fgm'):\n", - " print(t1)" - ], + "execution_count": 9, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "['如图所示', '面积']\n" ] } ], - "metadata": {} + "source": [ + "with t1.filter('fgm'):\n", + " print(t1)" + ] }, { "cell_type": "code", - "execution_count": 18, - "source": [ - "with t1.filter(keep='t'):\n", - " print(t1)" - ], + "execution_count": 10, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "['如图所示', '面积']\n" ] } ], - "metadata": {} + "source": [ + "with t1.filter(keep='t'):\n", + " print(t1)" + ] }, { "cell_type": "code", - "execution_count": 19, - "source": [ - "with t1.filter():\n", - " print(t1)" - ], + "execution_count": 11, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "['如图所示', '\\\\bigtriangleup', 'ABC', '面积', '\\\\SIFBlank', \\FigureID{1}]\n" ] } ], - "metadata": {} + "source": [ + "with t1.filter():\n", + " print(t1)" + ] }, { "cell_type": "code", - "execution_count": 20, - "source": [ - "t1.text_tokens" - ], + "execution_count": 12, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "['如图所示', '面积']" ] }, + "execution_count": 12, "metadata": {}, - "execution_count": 20 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "t1.text_tokens" + ] }, { "cell_type": "code", - "execution_count": 23, - "source": [ - "t1.formula_tokens" - ], + "execution_count": 13, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "['\\\\bigtriangleup', 'ABC']" ] }, + "execution_count": 13, "metadata": {}, - "execution_count": 23 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "t1.formula_tokens" + ] }, { "cell_type": "code", - "execution_count": 24, - "source": [ - 
"t1.figure_tokens" - ], + "execution_count": 14, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "[\\FigureID{1}]" ] }, + "execution_count": 14, "metadata": {}, - "execution_count": 24 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "t1.figure_tokens" + ] }, { "cell_type": "code", - "execution_count": 25, - "source": [ - "t1.ques_mark_tokens" - ], + "execution_count": 15, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "['\\\\SIFBlank']" ] }, + "execution_count": 15, "metadata": {}, - "execution_count": 25 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "t1.ques_mark_tokens" + ] }, { "cell_type": "code", - "execution_count": 26, - "source": [ - "sif4sci(test_item, symbol=\"gm\", tokenization_params={\"formula_params\": {\"method\": \"ast\"}})" - ], + "execution_count": 16, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "['如图所示', , '面积', '[MARK]', '[FIGURE]']" ] }, + "execution_count": 16, "metadata": {}, - "execution_count": 26 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "sif4sci(test_item, symbol=\"gm\", tokenization_params={\"formula_params\": {\"method\": \"ast\"}})" + ] }, { "cell_type": "code", - "execution_count": 27, - "source": [ - "sif4sci(test_item, symbol=\"tfgm\")" - ], + "execution_count": 17, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "['[TEXT]', '[FORMULA]', '[TEXT]', '[MARK]', '[TEXT]', '[FIGURE]']" ] }, + "execution_count": 17, "metadata": {}, - "execution_count": 27 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "sif4sci(test_item, symbol=\"tfgm\")" + ] }, { "cell_type": "code", - "execution_count": 28, - "source": [ - "sif4sci(test_item, symbol=\"gm\", tokenization_params={\"formula_params\": {\"method\": \"ast\", \"return_type\": \"list\"}})" - ], + 
"execution_count": 18, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "['如图所示', '\\\\bigtriangleup', 'A', 'B', 'C', '面积', '[MARK]', '[FIGURE]']" ] }, + "execution_count": 18, "metadata": {}, - "execution_count": 28 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "sif4sci(test_item, symbol=\"gm\", tokenization_params={\"formula_params\": {\"method\": \"ast\", \"return_type\": \"list\"}})" + ] }, { "cell_type": "code", - "execution_count": 29, - "source": [ - " test_item_1 = {\n", - " \"stem\": r\"若$x=2$, $y=\\sqrt{x}$,则下列说法正确的是$\\SIFChoice$\",\n", - " \"options\": [r\"$x < y$\", r\"$y = x$\", r\"$y < x$\"]\n", - " }" - ], + "execution_count": 19, + "metadata": {}, "outputs": [], - "metadata": {} + "source": [ + "test_item_1 = {\n", + " \"stem\": r\"若$x=2$, $y=\\sqrt{x}$,则下列说法正确的是$\\SIFChoice$\",\n", + " \"options\": [r\"$x < y$\", r\"$y = x$\", r\"$y < x$\"]\n", + "}" + ] }, { "cell_type": "code", - "execution_count": 30, - "source": [ - " tls = [\n", - " sif4sci(e, symbol=\"gm\",\n", - " tokenization_params={\n", - " \"formula_params\": {\n", - " \"method\": \"ast\", \"return_type\": \"list\", \"ord2token\": True, \"var_numbering\": True,\n", - " \"link_variable\": False}\n", - " })\n", - " for e in ([test_item_1[\"stem\"]] + test_item_1[\"options\"])\n", - " ]" - ], + "execution_count": 20, + "metadata": {}, "outputs": [], - "metadata": {} + "source": [ + "tls = [\n", + " sif4sci(e, symbol=\"gm\",\n", + " tokenization_params={\n", + " \"formula_params\": {\n", + " \"method\": \"ast\", \"return_type\": \"list\", \"ord2token\": True, \"var_numbering\": True,\n", + " \"link_variable\": False}\n", + " })\n", + " for e in ([test_item_1[\"stem\"]] + test_item_1[\"options\"])\n", + "]" + ] }, { "cell_type": "code", - "execution_count": 33, - "source": [ - "tls" - ], + "execution_count": 21, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "[['mathord_0', 
'=', 'textord', 'mathord_1', '=', 'mathord_0', '{ }', '\\\\sqrt', '说法', '正确', '[MARK]'],\n", @@ -390,21 +450,21 @@ " ['mathord_0', '<', 'mathord_1']]" ] }, + "execution_count": 21, "metadata": {}, - "execution_count": 33 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "tls" + ] }, { "cell_type": "code", - "execution_count": 34, - "source": [ - "tls[1:]" - ], + "execution_count": 22, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "[['mathord_0', '<', 'mathord_1'],\n", @@ -412,38 +472,43 @@ " ['mathord_0', '<', 'mathord_1']]" ] }, + "execution_count": 22, "metadata": {}, - "execution_count": 34 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "tls[1:]" + ] }, { "cell_type": "code", - "execution_count": 35, - "source": [ - "from EduNLP.utils import dict2str4sif\n", - "\n", - "test_item_1_str = dict2str4sif(test_item_1, tag_mode=\"head\", add_list_no_tag=False)\n", - "test_item_1_str " - ], + "execution_count": 23, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "'$\\\\SIFTag{stem}$若$x=2$, $y=\\\\sqrt{x}$,则下列说法正确的是$\\\\SIFChoice$$\\\\SIFTag{options}$$x < y$$\\\\SIFSep$$y = x$$\\\\SIFSep$$y < x$'" ] }, + "execution_count": 23, "metadata": {}, - "execution_count": 35 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "from EduNLP.utils import dict2str4sif\n", + "\n", + "test_item_1_str = dict2str4sif(test_item_1, tag_mode=\"head\", add_list_no_tag=False)\n", + "test_item_1_str " + ] }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 24, + "metadata": {}, + "outputs": [], "source": [ "tl1 = sif4sci(\n", " test_item_1_str, \n", @@ -452,60 +517,55 @@ " \"formula_params\": {\"method\": \"ast\", \"return_type\": \"list\", \"ord2token\": True}\n", " })\n", " " - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 37, - "source": [ - "tl1.get_segments()[0]" - ], + 
"execution_count": 25, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "['\\\\SIFTag{stem}']" ] }, + "execution_count": 25, "metadata": {}, - "execution_count": 37 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "tl1.get_segments()[0]" + ] }, { "cell_type": "code", - "execution_count": 38, - "source": [ - "tl1.get_segments()[1:3]" - ], + "execution_count": 26, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "[['[TEXT_BEGIN]', '[TEXT_END]'],\n", " ['[FORMULA_BEGIN]', 'mathord', '=', 'textord', '[FORMULA_END]']]" ] }, + "execution_count": 26, "metadata": {}, - "execution_count": 38 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "tl1.get_segments()[1:3]" + ] }, { "cell_type": "code", - "execution_count": 39, - "source": [ - "tl1.get_segments(add_seg_type=False)[0:3]" - ], + "execution_count": 27, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "[['\\\\SIFTag{stem}'],\n", @@ -513,81 +573,81 @@ " ['mathord', '=', 'mathord', '{ }', '\\\\sqrt']]" ] }, + "execution_count": 27, "metadata": {}, - "execution_count": 39 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "tl1.get_segments(add_seg_type=False)[0:3]" + ] }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 28, + "metadata": {}, + "outputs": [], "source": [ "test_item_2 = {\"options\": [r\"$x < y$\", r\"$y = x$\", r\"$y < x$\"]}" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 29, + "metadata": {}, + "outputs": [], "source": [ "test_item_2_str = dict2str4sif(test_item_2, tag_mode=\"head\", add_list_no_tag=False)" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 43, - "source": [ - "test_item_2_str" - ], + "execution_count": 30, + "metadata": {}, "outputs": [ { - "output_type": 
"execute_result", "data": { "text/plain": [ "'$\\\\SIFTag{options}$$x < y$$\\\\SIFSep$$y = x$$\\\\SIFSep$$y < x$'" ] }, + "execution_count": 30, "metadata": {}, - "execution_count": 43 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "test_item_2_str" + ] }, { "cell_type": "code", - "execution_count": 44, - "source": [ - "tl2 = sif4sci(test_item_2_str, symbol=\"gms\",\n", - " tokenization_params={\"formula_params\": {\"method\": \"ast\", \"return_type\": \"list\"}})\n", - "tl2 " - ], + "execution_count": 31, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "['\\\\SIFTag{options}', 'x', '<', 'y', '[SEP]', 'y', '=', 'x', '[SEP]', 'y', '<', 'x']" ] }, + "execution_count": 31, "metadata": {}, - "execution_count": 44 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "tl2 = sif4sci(test_item_2_str, symbol=\"gms\",\n", + " tokenization_params={\"formula_params\": {\"method\": \"ast\", \"return_type\": \"list\"}})\n", + "tl2 " + ] }, { "cell_type": "code", - "execution_count": 45, - "source": [ - "tl2.get_segments(add_seg_type=False)" - ], + "execution_count": 32, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "[['\\\\SIFTag{options}'],\n", @@ -598,143 +658,146 @@ " ['y', '<', 'x']]" ] }, + "execution_count": 32, "metadata": {}, - "execution_count": 45 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "tl2.get_segments(add_seg_type=False)" + ] }, { "cell_type": "code", - "execution_count": 46, - "source": [ - "tl2.get_segments(add_seg_type=False, drop=\"s\")" - ], + "execution_count": 33, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "[['\\\\SIFTag{options}'], ['x', '<', 'y'], ['y', '=', 'x'], ['y', '<', 'x']]" ] }, + "execution_count": 33, "metadata": {}, - "execution_count": 46 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + 
"tl2.get_segments(add_seg_type=False, drop=\"s\")" + ] }, { "cell_type": "code", - "execution_count": 47, - "source": [ - "tl3 = sif4sci(test_item_1[\"stem\"], symbol=\"gs\")\n", - "tl3.text_segments" - ], + "execution_count": 34, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "[['说法', '正确']]" ] }, + "execution_count": 34, "metadata": {}, - "execution_count": 47 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "tl3 = sif4sci(test_item_1[\"stem\"], symbol=\"gs\")\n", + "tl3.text_segments" + ] }, { "cell_type": "code", - "execution_count": 48, - "source": [ - "tl3.formula_segments" - ], + "execution_count": 35, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "[['x', '=', '2'], ['y', '=', '\\\\sqrt', '{', 'x', '}']]" ] }, + "execution_count": 35, "metadata": {}, - "execution_count": 48 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "tl3.formula_segments" + ] }, { "cell_type": "code", - "execution_count": 49, - "source": [ - "tl3.figure_segments" - ], + "execution_count": 36, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "[]" ] }, + "execution_count": 36, "metadata": {}, - "execution_count": 49 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "tl3.figure_segments" + ] }, { "cell_type": "code", - "execution_count": 50, - "source": [ - "tl3.ques_mark_segments" - ], + "execution_count": 37, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "[['\\\\SIFChoice']]" ] }, + "execution_count": 37, "metadata": {}, - "execution_count": 50 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "tl3.ques_mark_segments" + ] }, { "cell_type": "code", "execution_count": null, - "source": [], + "metadata": {}, "outputs": [], - "metadata": {} + "source": [] } ], "metadata": { - "orig_nbformat": 4, + "interpreter": { + 
"hash": "776957673adb719a00031a24ed5efd2fa5ce8a13405e5193f8d278edd3805d55" + }, + "kernelspec": { + "display_name": "Python 3.6.13 64-bit ('data': conda)", + "name": "python3" + }, "language_info": { - "name": "python", - "version": "3.8.5", - "mimetype": "text/x-python", "codemirror_mode": { "name": "ipython", "version": 3 }, - "pygments_lexer": "ipython3", + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", "nbconvert_exporter": "python", - "file_extension": ".py" - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3.8.5 64-bit" + "pygments_lexer": "ipython3", + "version": "3.6.13" }, - "interpreter": { - "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a" - } + "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 -} \ No newline at end of file +} diff --git a/examples/tokenizer/tokenizier.ipynb b/examples/tokenizer/tokenizier.ipynb index 8dcec093..1f52994d 100644 --- a/examples/tokenizer/tokenizier.ipynb +++ b/examples/tokenizer/tokenizier.ipynb @@ -3,82 +3,76 @@ { "cell_type": "code", "execution_count": 1, - "source": [ - "from EduNLP.Tokenizer import PureTextTokenizer, TextTokenizer, get_tokenizer" - ], + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stderr", + "output_type": "stream", "text": [ "D:\\MySoftwares\\Anaconda\\envs\\data\\lib\\site-packages\\gensim\\similarities\\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package is unavailable. Install Levenhstein (e.g. 
`pip install python-Levenshtein`) to suppress this warning.\n", " warnings.warn(msg)\n" ] } ], - "metadata": {} + "source": [ + "from EduNLP.Tokenizer import PureTextTokenizer, TextTokenizer, get_tokenizer" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ - "# TextTokenizer and PureTextTokenizer\r\n", - "\r\n", - "- ‘text’ Tokenizer ignores and skips the FormulaFigures and tokenize latex Formulas as Text\r\n", + "# TextTokenizer and PureTextTokenizer\n", + "\n", + "- ‘text’ Tokenizer ignores and skips the FormulaFigures and tokenize latex Formulas as Text\n", "- ‘pure_text’ Tokenizer symbolizes the FormulaFigures as [FUMULA] and tokenize latex Formulas as Text" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## TextTokenizer" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 2, - "source": [ - "items = [{\r\n", - " \"stem\": \"已知集合$A=\\\\left\\\\{x \\\\mid x^{2}-3 x-4<0\\\\right\\\\}, \\\\quad B=\\\\{-4,1,3,5\\\\}, \\\\quad$ 则 $A \\\\cap B=$\",\r\n", - " \"options\": [\"1\", \"2\"]\r\n", - " }]\r\n", - "tokenizer = get_tokenizer(\"text\") # tokenizer = TextTokenizer()\r\n", - "tokens = tokenizer(items, key=lambda x: x[\"stem\"])\r\n", - "print(next(tokens))" - ], + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "['已知', '集合', 'A', '=', '\\\\left', '\\\\{', 'x', '\\\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', '0', '\\\\right', '\\\\}', ',', '\\\\quad', 'B', '=', '\\\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\\\}', ',', '\\\\quad', 'A', '\\\\cap', 'B', '=']\n" ] } ], - "metadata": {} + "source": [ + "items = [{\n", + " \"stem\": \"已知集合$A=\\\\left\\\\{x \\\\mid x^{2}-3 x-4<0\\\\right\\\\}, \\\\quad B=\\\\{-4,1,3,5\\\\}, \\\\quad$ 则 $A \\\\cap B=$\",\n", + " \"options\": [\"1\", \"2\"]\n", + " }]\n", + "tokenizer = get_tokenizer(\"text\") # tokenizer = TextTokenizer()\n", + "tokens = tokenizer(items, 
key=lambda x: x[\"stem\"])\n", + "print(next(tokens))" + ] }, { "cell_type": "code", "execution_count": 3, + "metadata": {}, + "outputs": [], "source": [ "items = [\"有公式$\\\\FormFigureID{wrong1?}$,如图$\\\\FigureID{088f15ea-xxx}$,若$x,y$满足约束条件公式$\\\\FormFigureBase64{wrong2?}$,$\\\\SIFSep$,则$z=x+7 y$的最大值为$\\\\SIFBlank$\"]" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 4, - "source": [ - "\r\n", - "tokenizer = get_tokenizer(\"text\") # tokenizer = TextTokenizer()\r\n", - "tokens = [t for t in tokenizer(items)]\r\n", - "tokens" - ], + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "[['公式',\n", @@ -102,30 +96,31 @@ " '[MARK]']]" ] }, + "execution_count": 4, "metadata": {}, - "execution_count": 4 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "\n", + "tokenizer = get_tokenizer(\"text\") # tokenizer = TextTokenizer()\n", + "tokens = [t for t in tokenizer(items)]\n", + "tokens" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## PureTextTokenizer" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 5, - "source": [ - "tokenizer = get_tokenizer(\"pure_text\") # tokenizer = PureTextTokenizer()\r\n", - "tokens = [t for t in tokenizer(items)]\r\n", - "tokens" - ], + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "[['公式',\n", @@ -147,17 +142,25 @@ " '[MARK]']]" ] }, + "execution_count": 5, "metadata": {}, - "execution_count": 5 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "tokenizer = get_tokenizer(\"pure_text\") # tokenizer = PureTextTokenizer()\n", + "tokens = [t for t in tokenizer(items)]\n", + "tokens" + ] } ], "metadata": { + "interpreter": { + "hash": "776957673adb719a00031a24ed5efd2fa5ce8a13405e5193f8d278edd3805d55" + }, "kernelspec": { - "name": "python3", - "display_name": "Python 3.6.13 64-bit ('data': conda)" + "display_name": "Python 3.6.13 64-bit 
('data': conda)", + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -170,11 +173,8 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.13" - }, - "interpreter": { - "hash": "776957673adb719a00031a24ed5efd2fa5ce8a13405e5193f8d278edd3805d55" } }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/tests/test_sif/test_sif.py b/tests/test_sif/test_sif.py index 210441d2..02d30132 100644 --- a/tests/test_sif/test_sif.py +++ b/tests/test_sif/test_sif.py @@ -31,6 +31,12 @@ def test_to_sif(): siftext = to_sif(text) print(siftext) + ret = is_sif(text, return_parser=True) + assert ret[0] == 0 + if ret[0] is not True: + siftext = to_sif(text, parser=ret[1]) + print(siftext) + def test_sci4sif(figure0, figure1, figure0_base64, figure1_base64): repr(sif4sci( @@ -57,3 +63,17 @@ def test_sci4sif(figure0, figure1, figure0_base64, figure1_base64): "figure_params": {"figure_instance": True} } )) + repr(sif4sci( + r"如图所示,则$\bigtriangleup ABC$的面积是$\SIFBlank$。$\FigureID{1}$", mode=0 + )) + repr(sif4sci( + r"如图所示,则$\bigtriangleup ABC$的面积是$\SIFBlank$。$\FigureID{1}$", mode=1 + )) + repr(sif4sci( + r"如图所示,则$\bigtriangleup ABC$的面积是$\SIFBlank$。$\FigureID{1}$", mode=2 + )) + + with pytest.raises(KeyError): + repr(sif4sci( + r"如图所示,则$\bigtriangleup ABC$的面积是$\SIFBlank$。$\FigureID{1}$", mode=3 + ))