bigdata-ustc · KenelmQLH · Oct 10, 2021 · Aug 31, 2021 · Aug 31, 2021 · Sep 3, 2021
diff --git a/EduNLP/SIF/parser/parser.py b/EduNLP/SIF/parser/parser.py
@@ -15,7 +15,7 @@ class Parser:
     description_list
         use Parser to process and describe the txt
     """
-    def __init__(self, data):
+    def __init__(self, data, check_formula=True):
         self.lookahead = 0
         self.head = 0
         self.text = data
@@ -26,6 +26,7 @@ def __init__(self, data):
         self.warnning = 0
         self.fomula_illegal_flag = 0
         self.fomula_illegal_message = ''
+        self.check_formula = check_formula
 
         # 定义特殊变量
         self.len_bracket = len('$\\SIFChoice$')
@@ -254,8 +255,9 @@ def get_token(self):
             if self.head >= len(self.text):
                 self.call_error()
                 return self.error
-            # 检查 latex 公式的完整性和可解析性
-            if not self._is_formula_legal(self.text[formula_start:self.head]):
+
+            # 检查latex公式的完整性和可解析性
+            if self.check_formula and not self._is_formula_legal(self.text[formula_start:self.head]):
                 self.call_error()
                 return self.error
             self.head += 1

diff --git a/EduNLP/SIF/sif.py b/EduNLP/SIF/sif.py
@@ -10,21 +10,31 @@
 __all__ = ["is_sif", "to_sif", "sif4sci"]
 
 
-def is_sif(item):
+def is_sif(item, check_formula=True, return_parser=False):
     r"""
     the part aims to check whether the input is sif format
 
     Parameters
     ----------
     item:str
         a raw item which respects stem
+    check_formula: bool
+        whether to check the formulas when parsing item.
+
+        True: check the validity of formulas in item
+        False: do not check the validity of formulas in item, which is faster
+    return_parser: bool
+        whether to put the parsed item in return.
+
+        when True, the format of return is (bool, Parser)
+        when False, the format of return is bool
 
     Returns
     -------
     bool
-        when item can not be parsed correctly, raise Error;
-        when item doesn't need to be modified, return Ture;
-        when item needs to be modified, return False;
+        when item can not be parsed correctly, raise ValueError;
+        when item is already in standard format, return True (and the Parser of item);
+        when item is not in standard format originally, return False (and the Parser of item);
 
     Examples
     --------
@@ -34,26 +44,33 @@ def is_sif(item):
     >>> is_sif(text)
     True
     >>> text = '某校一个课外学习小组为研究某作物的发芽率y和温度x（单位...'
-    >>> is_sif(text)
-    False
+    >>> ret = is_sif(text, return_parser=True)
+    >>> ret # doctest: +ELLIPSIS
+    (False, <EduNLP.SIF.parser.parser.Parser object...>)
     """
-    item_parser = Parser(item)
+    item_parser = Parser(item, check_formula)
     item_parser.description_list()
     if item_parser.fomula_illegal_flag:
         raise ValueError(item_parser.fomula_illegal_message)
-    if item_parser.error_flag == 0 and item_parser.modify_flag == 0:
-        return True
-    return False
+    ret = True if item_parser.error_flag == 0 and item_parser.modify_flag == 0 else False
+    if return_parser is True:
+        return ret, item_parser
+    else:
+        return ret
 
 
-def to_sif(item):
+def to_sif(item, check_formula=True, parser: Parser = None):
     r"""
     the part aims to switch item to sif formate
 
     Parameters
     ----------
     items:str
         a raw item which respects stem
+    check_formula: bool
+        whether to check the formulas when parsing item (only takes effect when parser is None).
+    parser: Parser
+        the parser of item returned from is_sif.
 
     Returns
     -------
@@ -66,14 +83,20 @@ def to_sif(item):
     >>> siftext = to_sif(text)
     >>> siftext
     '某校一个课外学习小组为研究某作物的发芽率$y$和温度$x$（单位...'
+    >>> ret = is_sif(text, return_parser=True)
+    >>> ret # doctest: +ELLIPSIS
+    (False, <EduNLP.SIF.parser.parser.Parser object...>)
+    >>> to_sif(text, parser=ret[1])
+    '某校一个课外学习小组为研究某作物的发芽率$y$和温度$x$（单位...'
+
     """
-    item_parser = Parser(item)
-    item_parser.description_list()
-    item = item_parser.text
-    return item
+    if parser is not None:
+        return parser.text
+    else:
+        return is_sif(item, check_formula, return_parser=True)[1].text
 
 
-def sif4sci(item: str, figures: (dict, bool) = None, safe=True, symbol: str = None, tokenization=True,
+def sif4sci(item: str, figures: (dict, bool) = None, mode: int = 2, symbol: str = None, tokenization=True,
             tokenization_params=None, errors="raise"):
     r"""
 
@@ -84,12 +107,15 @@ def sif4sci(item: str, figures: (dict, bool) = None, safe=True, symbol: str = No
     item:str
         a raw item which respects stem
     figures:dict
-        {"FigureID": Base64 encoding of the figure}
+        when it is a dict, it means the id-to-instance for figures in 'FormFigureID{...}' format,
+        when it is a bool, it means whether to instantiate figures in 'FormFigureBase64{...}' format
 
-    safe:bool
-        Check whether the text conforms to the sif format
+    mode: int
+        when mode = 2, use is_sif and check formula in item
+        when mode = 1, use is_sif but don't check formula in item
+        when mode = 0, don't use is_sif and don't check anything in item
 
-    symbol:str
+    symbol: str
         select the methods to symbolize:
             "t": text
             "f": formula
@@ -98,17 +124,26 @@ def sif4sci(item: str, figures: (dict, bool) = None, safe=True, symbol: str = No
             "a": tag
             "s": sep
 
-    tokenization:bool
-        True: tokenize the item
+    tokenization: bool
+        whether to tokenize item after segmentation
 
     tokenization_params:
-        method: which tokenizer to be used, "linear" or "ast"
-
-        The parameters only useful for "linear": None
+        the dict of text_params, formula_params and figure_params in tokenization
+        For formula_params:
+            method: which tokenizer to be used, "linear" or "ast"
+            The parameters only useful for "linear":
+                skip_figure_formula: whether to skip the formula in figure format
+                symbolize_figure_formula: whether to symbolize the formula in figure format
+            The parameters only useful for "ast":
+                ord2token: whether to transfer the variables (mathord) and constants (textord) to special tokens.
+                var_numbering: whether to use number suffix to denote different variables
+                return_type: 'list' or 'ast'
+            More parameters can be found in the definition in SIF.tokenization.formula
+        For figure_params:
+            figure_instance: whether to return instance of figures in tokens
+        For text_params:
+            See definition in SIF.tokenization.text
 
-        The parameters only useful for "ast":
-            ord2token: whether to transfer the variables (mathord) and constants (textord) to special tokens.
-            var_numbering: whether to use number suffix to denote different variables
     errors:
         warn,
         raise,
@@ -214,8 +249,15 @@ def sif4sci(item: str, figures: (dict, bool) = None, safe=True, symbol: str = No
     [['已知'], ['说法', '中', '正确']]
     """
     try:
-        if safe is True and is_sif(item) is not True:
-            item = to_sif(item)
+        if mode in [1, 2]:
+            check_formula = mode == 2  # mode 2 checks formulas; mode 1 skips the check
+            sif, item_parser = is_sif(item, check_formula=check_formula, return_parser=True)
+            if sif is not True:
+                item = to_sif(item, parser=item_parser)
+        elif mode != 0:
+            raise KeyError(
+                "Unknown mode %s, use only 0 or 1 or 2." % mode
+            )
 
         ret = seg(item, figures, symbol)