diff --git a/AUTHORS.md b/AUTHORS.md index 4188b5a6..b52ccdc6 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -12,5 +12,7 @@ [Longhu Qin](https://github.com/KenelmQLH) +[Pingzhi Li](https://github.com/pingzhiLi) -The stared contributors are the corresponding authors. \ No newline at end of file + +The starred contributors are the corresponding authors. diff --git a/EduNLP/SIF/parser/parser.py b/EduNLP/SIF/parser/parser.py index 0ebd498d..db290946 100644 --- a/EduNLP/SIF/parser/parser.py +++ b/EduNLP/SIF/parser/parser.py @@ -76,7 +76,7 @@ def _is_formula_legal(self, formula_str): """ legal_tags = ['FormFigureID', 'FormFigureBase64', 'FigureID', 'FigureBase64', - 'SIFBlank', 'SIFChoice', 'SIFTag', 'SIFSep', 'SIFUnderline'] + 'SIFBlank', 'SIFChoice', 'SIFTag', 'SIFSep', 'SIFUnderline', 'textf'] for tag in legal_tags: if tag in formula_str: return True diff --git a/EduNLP/SIF/segment/segment.py b/EduNLP/SIF/segment/segment.py index a6d7f7c8..93c5713f 100644 --- a/EduNLP/SIF/segment/segment.py +++ b/EduNLP/SIF/segment/segment.py @@ -84,7 +84,11 @@ def __init__(self, item, figures: dict = None): self._ques_mark_segments = [] self._tag_segments = [] self._sep_segments = [] - segments = re.split(r"(\$.+?\$)", item) + + # strip each $\textf{text,style}$ wrapper from the item, keeping only its inner text + item_no_textf = "".join(re.split(r"\$\\textf\{([^,]+?),b?d?i?t?u?w?}\$", item)) + + segments = re.split(r"(\$.+?\$)", item_no_textf) for segment in segments: if not segment: continue @@ -294,6 +298,10 @@ def seg(item, figures=None, symbol=None): >>> s2 = seg(test_item_1_str_2, symbol="fgm") >>> s2.tag_segments ['\\SIFTag{stem}', '\\SIFTag{options}'] + >>> test_item_2 = r"已知$y=x$,则以下说法中$\textf{正确,b}$的是" + >>> s2 = seg(test_item_2) + >>> s2.text_segments + ['已知', ',则以下说法中正确的是'] """ segments = SegmentList(item, figures) if symbol is not None: diff --git a/EduNLP/SIF/sif.py b/EduNLP/SIF/sif.py index fa893f54..af4fa63a 100644 --- a/EduNLP/SIF/sif.py +++ b/EduNLP/SIF/sif.py @@ -182,6 +182,11 @@ def sif4sci(item: str, figures: (dict, 
bool) = None, safe=True, symbol: str = No [] >>> tl3.ques_mark_segments [['\\SIFChoice']] + >>> test_item_3 = r"已知$y=x$,则以下说法中$\textf{正确,b}$的是" + >>> tl4 = sif4sci(test_item_3) + Warning: there is some chinese characters in formula! + >>> tl4.text_segments + [['已知'], ['说法', '中', '正确']] """ try: if safe is True and is_sif(item) is not True: diff --git a/tests/test_sif/test_segement.py b/tests/test_sif/test_segement.py index 40f44624..04d43750 100644 --- a/tests/test_sif/test_segement.py +++ b/tests/test_sif/test_segement.py @@ -19,6 +19,10 @@ def test_segment(figure0, figure1, figure0_base64, figure1_base64): r"如图所示,则$\FormFigureBase64{%s}$的面积是$\SIFBlank$。$\FigureBase64{%s}$" % (figure0_base64, figure1_base64), figures=True ) - with pytest.raises(TypeError): s.append("123") + seg_test_text = seg( + r"如图所示,有三组$\textf{机器人,bu}$在踢$\textf{足球,b}$", + figures=True + ) + assert seg_test_text.text_segments == ['如图所示,有三组机器人在踢足球']