From f8d3115cc27dae8b581e98e858cf45e079ceb1b2 Mon Sep 17 00:00:00 2001 From: pingzhili Date: Fri, 13 Aug 2021 02:14:42 +0800 Subject: [PATCH 01/18] add text format segmentation --- EduNLP/SIF/parser/parser.py | 2 +- EduNLP/SIF/segment/__init__.py | 2 +- EduNLP/SIF/segment/segment.py | 20 +++++++++++++++++--- tests/test_sif/test_segement.py | 6 +++++- 4 files changed, 24 insertions(+), 6 deletions(-) diff --git a/EduNLP/SIF/parser/parser.py b/EduNLP/SIF/parser/parser.py index 0ebd498d..db290946 100644 --- a/EduNLP/SIF/parser/parser.py +++ b/EduNLP/SIF/parser/parser.py @@ -76,7 +76,7 @@ def _is_formula_legal(self, formula_str): """ legal_tags = ['FormFigureID', 'FormFigureBase64', 'FigureID', 'FigureBase64', - 'SIFBlank', 'SIFChoice', 'SIFTag', 'SIFSep', 'SIFUnderline'] + 'SIFBlank', 'SIFChoice', 'SIFTag', 'SIFSep', 'SIFUnderline', 'textf'] for tag in legal_tags: if tag in formula_str: return True diff --git a/EduNLP/SIF/segment/__init__.py b/EduNLP/SIF/segment/__init__.py index 3393d28e..70024b17 100644 --- a/EduNLP/SIF/segment/__init__.py +++ b/EduNLP/SIF/segment/__init__.py @@ -2,4 +2,4 @@ # 2021/5/18 @ tongshiwei from .segment import (SegmentList, TextSegment, FigureFormulaSegment, LatexFormulaSegment, FigureSegment, - QuesMarkSegment, Figure, TagSegment, SepSegment, seg) + QuesMarkSegment, Figure, TagSegment, SepSegment, seg, TextFSegment) diff --git a/EduNLP/SIF/segment/segment.py b/EduNLP/SIF/segment/segment.py index a6d7f7c8..a95304f5 100644 --- a/EduNLP/SIF/segment/segment.py +++ b/EduNLP/SIF/segment/segment.py @@ -75,6 +75,10 @@ class SepSegment(str): pass +class TextFSegment(str): + pass + + class SegmentList(object): def __init__(self, item, figures: dict = None): self._segments = [] @@ -104,6 +108,9 @@ def __init__(self, item, figures: dict = None): self.append(TagSegment(segment[1:-1])) elif re.match(r"\$\\SIFSep\$", segment): self.append(SepSegment(segment[1:-1])) + elif re.match(r"\$\\textf\{[^,]+?,b?d?i?t?u?w?}\$", segment): + seg_capture = 
re.match(r"\$\\textf\{([^,]+?),b?d?i?t?u?w?}\$", segment) + self.append(TextFSegment(seg_capture.group(1))) else: self.append(LatexFormulaSegment(segment[1:-1])) self._seg_idx = None @@ -115,8 +122,12 @@ def __len__(self): return len(self._segments) def append(self, segment) -> None: - if isinstance(segment, TextSegment): - self._text_segments.append(len(self)) + if isinstance(segment, TextSegment) or isinstance(segment, TextFSegment): + if len(self._text_segments) != 0 and self._text_segments[-1] == len(self) - 1: + self._segments[-1] = self._segments[-1] + segment + else: + self._text_segments.append(len(self)) + self._segments.append(segment) elif isinstance(segment, (LatexFormulaSegment, FigureFormulaSegment)): self._formula_segments.append(len(self)) elif isinstance(segment, FigureSegment): @@ -129,7 +140,10 @@ def append(self, segment) -> None: self._sep_segments.append(len(self)) else: raise TypeError("Unknown Segment Type: %s" % type(segment)) - self._segments.append(segment) + if isinstance(segment, TextFSegment) or isinstance(segment, TextSegment): + pass + else: + self._segments.append(segment) @property def segments(self): diff --git a/tests/test_sif/test_segement.py b/tests/test_sif/test_segement.py index 40f44624..04d43750 100644 --- a/tests/test_sif/test_segement.py +++ b/tests/test_sif/test_segement.py @@ -19,6 +19,10 @@ def test_segment(figure0, figure1, figure0_base64, figure1_base64): r"如图所示,则$\FormFigureBase64{%s}$的面积是$\SIFBlank$。$\FigureBase64{%s}$" % (figure0_base64, figure1_base64), figures=True ) - with pytest.raises(TypeError): s.append("123") + seg_test_text = seg( + r"如图所示,有三组$\textf{机器人,bu}$在踢$\textf{足球,b}$", + figures=True + ) + assert seg_test_text.text_segments == ['如图所示,有三组机器人在踢足球'] From 5d0e931fde44fd7a3aeec49c4e706a744759746e Mon Sep 17 00:00:00 2001 From: pingzhi Date: Fri, 13 Aug 2021 02:21:17 +0800 Subject: [PATCH 02/18] Update AUTHORS.md --- AUTHORS.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git 
a/AUTHORS.md b/AUTHORS.md index 4188b5a6..b52ccdc6 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -12,5 +12,7 @@ [Longhu Qin](https://github.com/KenelmQLH) +[Pingzhi Li](https://github.com/pingzhiLi) -The stared contributors are the corresponding authors. \ No newline at end of file + +The stared contributors are the corresponding authors. From fe104a44a92513eb26d72393cff2d6f6833857aa Mon Sep 17 00:00:00 2001 From: pingzhi Date: Fri, 13 Aug 2021 02:30:09 +0800 Subject: [PATCH 03/18] remove class TextFSegment --- EduNLP/SIF/segment/segment.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/EduNLP/SIF/segment/segment.py b/EduNLP/SIF/segment/segment.py index a95304f5..59ece582 100644 --- a/EduNLP/SIF/segment/segment.py +++ b/EduNLP/SIF/segment/segment.py @@ -75,10 +75,6 @@ class SepSegment(str): pass -class TextFSegment(str): - pass - - class SegmentList(object): def __init__(self, item, figures: dict = None): self._segments = [] @@ -110,7 +106,7 @@ def __init__(self, item, figures: dict = None): self.append(SepSegment(segment[1:-1])) elif re.match(r"\$\\textf\{[^,]+?,b?d?i?t?u?w?}\$", segment): seg_capture = re.match(r"\$\\textf\{([^,]+?),b?d?i?t?u?w?}\$", segment) - self.append(TextFSegment(seg_capture.group(1))) + self.append(TextSegment(seg_capture.group(1))) else: self.append(LatexFormulaSegment(segment[1:-1])) self._seg_idx = None @@ -122,7 +118,7 @@ def __len__(self): return len(self._segments) def append(self, segment) -> None: - if isinstance(segment, TextSegment) or isinstance(segment, TextFSegment): + if isinstance(segment, TextSegment): if len(self._text_segments) != 0 and self._text_segments[-1] == len(self) - 1: self._segments[-1] = self._segments[-1] + segment else: @@ -140,7 +136,7 @@ def append(self, segment) -> None: self._sep_segments.append(len(self)) else: raise TypeError("Unknown Segment Type: %s" % type(segment)) - if isinstance(segment, TextFSegment) or isinstance(segment, TextSegment): + if isinstance(segment, 
TextSegment): pass else: self._segments.append(segment) From 5ed4fc13dc3eab233617eedb5171871d063eaf7d Mon Sep 17 00:00:00 2001 From: pingzhi Date: Fri, 13 Aug 2021 02:30:40 +0800 Subject: [PATCH 04/18] remove class TextFSegment --- EduNLP/SIF/segment/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/EduNLP/SIF/segment/__init__.py b/EduNLP/SIF/segment/__init__.py index 70024b17..3393d28e 100644 --- a/EduNLP/SIF/segment/__init__.py +++ b/EduNLP/SIF/segment/__init__.py @@ -2,4 +2,4 @@ # 2021/5/18 @ tongshiwei from .segment import (SegmentList, TextSegment, FigureFormulaSegment, LatexFormulaSegment, FigureSegment, - QuesMarkSegment, Figure, TagSegment, SepSegment, seg, TextFSegment) + QuesMarkSegment, Figure, TagSegment, SepSegment, seg) From 5674f915dbddc88fb817d254dc552d96421cba18 Mon Sep 17 00:00:00 2001 From: pingzhi Date: Sun, 15 Aug 2021 15:02:55 +0800 Subject: [PATCH 05/18] Add text format examples and fix type bug - add text format examples - fix the bug that new added text_segment may be type of string, rather than TextSegment --- EduNLP/SIF/segment/segment.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/EduNLP/SIF/segment/segment.py b/EduNLP/SIF/segment/segment.py index 59ece582..9c8585ad 100644 --- a/EduNLP/SIF/segment/segment.py +++ b/EduNLP/SIF/segment/segment.py @@ -120,7 +120,7 @@ def __len__(self): def append(self, segment) -> None: if isinstance(segment, TextSegment): if len(self._text_segments) != 0 and self._text_segments[-1] == len(self) - 1: - self._segments[-1] = self._segments[-1] + segment + self._segments[-1] = TextSegment(self._segments[-1] + segment) else: self._text_segments.append(len(self)) self._segments.append(segment) @@ -304,6 +304,10 @@ def seg(item, figures=None, symbol=None): >>> s2 = seg(test_item_1_str_2, symbol="fgm") >>> s2.tag_segments ['\\SIFTag{stem}', '\\SIFTag{options}'] + >>> test_item_2 = r"已知$y=x$,则以下说法中$\textf{正确,b}$的是" + >>> s2 = seg(test_item_2) + >>> 
s2.text_segments + ['已知', 'y=x', ',则以下说法中正确的是'] """ segments = SegmentList(item, figures) if symbol is not None: From 26c3a5f1cf70d0f05f37514a13ddfde67f188305 Mon Sep 17 00:00:00 2001 From: pingzhi Date: Sun, 15 Aug 2021 15:03:58 +0800 Subject: [PATCH 06/18] Add text format examples --- EduNLP/SIF/sif.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/EduNLP/SIF/sif.py b/EduNLP/SIF/sif.py index fa893f54..998d19c3 100644 --- a/EduNLP/SIF/sif.py +++ b/EduNLP/SIF/sif.py @@ -182,6 +182,10 @@ def sif4sci(item: str, figures: (dict, bool) = None, safe=True, symbol: str = No [] >>> tl3.ques_mark_segments [['\\SIFChoice']] + >>> test_item_3 = r"已知$y=x$,则以下说法中$\textf{正确,b}$的是" + >>> tl4 = sif4sci(test_item_3) + >>> tl4.text_segments + [['已知'], ['说法', '中', '正确']] """ try: if safe is True and is_sif(item) is not True: From 6f0403d4a4c2a8107559d3fb3a7d74801789dd53 Mon Sep 17 00:00:00 2001 From: pingzhi Date: Sun, 15 Aug 2021 22:14:08 +0800 Subject: [PATCH 07/18] Update sif.py --- EduNLP/SIF/sif.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/EduNLP/SIF/sif.py b/EduNLP/SIF/sif.py index 998d19c3..2b5e82b8 100644 --- a/EduNLP/SIF/sif.py +++ b/EduNLP/SIF/sif.py @@ -3,6 +3,7 @@ import traceback import warnings +import re from .segment import seg from .tokenization import tokenize, link_formulas from .parser import Parser @@ -59,7 +60,11 @@ def to_sif(item): >>> siftext '某校一个课外学习小组为研究某作物的发芽率$y$和温度$x$(单位...' 
""" - item_parser = Parser(item) + item_detextf = '' + textf_segs = re.split(r"\$\\textf\{([^,]+?),b?d?i?t?u?w?}\$", item) + for textf_seg in textf_segs: + item_detextf = item_detextf + textf_seg + item_parser = Parser(item_detextf) item_parser.description_list() item = item_parser.text return item From 3e4d6fae211e8a43c1e65cb0540a15a0c7fa7f7d Mon Sep 17 00:00:00 2001 From: pingzhi Date: Sun, 15 Aug 2021 22:15:23 +0800 Subject: [PATCH 08/18] Update segment.py --- EduNLP/SIF/segment/segment.py | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/EduNLP/SIF/segment/segment.py b/EduNLP/SIF/segment/segment.py index 9c8585ad..d0eed499 100644 --- a/EduNLP/SIF/segment/segment.py +++ b/EduNLP/SIF/segment/segment.py @@ -84,7 +84,11 @@ def __init__(self, item, figures: dict = None): self._ques_mark_segments = [] self._tag_segments = [] self._sep_segments = [] - segments = re.split(r"(\$.+?\$)", item) + item_detextf = '' + textf_segs = re.split(r"\$\\textf\{([^,]+?),b?d?i?t?u?w?}\$", item) + for textf_seg in textf_segs: + item_detextf = item_detextf + textf_seg + segments = re.split(r"(\$.+?\$)", item_detextf) for segment in segments: if not segment: continue @@ -104,9 +108,6 @@ def __init__(self, item, figures: dict = None): self.append(TagSegment(segment[1:-1])) elif re.match(r"\$\\SIFSep\$", segment): self.append(SepSegment(segment[1:-1])) - elif re.match(r"\$\\textf\{[^,]+?,b?d?i?t?u?w?}\$", segment): - seg_capture = re.match(r"\$\\textf\{([^,]+?),b?d?i?t?u?w?}\$", segment) - self.append(TextSegment(seg_capture.group(1))) else: self.append(LatexFormulaSegment(segment[1:-1])) self._seg_idx = None @@ -119,11 +120,7 @@ def __len__(self): def append(self, segment) -> None: if isinstance(segment, TextSegment): - if len(self._text_segments) != 0 and self._text_segments[-1] == len(self) - 1: - self._segments[-1] = TextSegment(self._segments[-1] + segment) - else: - self._text_segments.append(len(self)) - self._segments.append(segment) + 
self._text_segments.append(len(self)) elif isinstance(segment, (LatexFormulaSegment, FigureFormulaSegment)): self._formula_segments.append(len(self)) elif isinstance(segment, FigureSegment): @@ -136,10 +133,7 @@ def append(self, segment) -> None: self._sep_segments.append(len(self)) else: raise TypeError("Unknown Segment Type: %s" % type(segment)) - if isinstance(segment, TextSegment): - pass - else: - self._segments.append(segment) + self._segments.append(segment) @property def segments(self): @@ -307,7 +301,7 @@ def seg(item, figures=None, symbol=None): >>> test_item_2 = r"已知$y=x$,则以下说法中$\textf{正确,b}$的是" >>> s2 = seg(test_item_2) >>> s2.text_segments - ['已知', 'y=x', ',则以下说法中正确的是'] + ['已知', ',则以下说法中正确的是'] """ segments = SegmentList(item, figures) if symbol is not None: From 77a6f15b91a058d9a013df659d1e5147fc87eff6 Mon Sep 17 00:00:00 2001 From: pingzhi Date: Mon, 16 Aug 2021 16:38:11 +0800 Subject: [PATCH 09/18] Rollback Parser process in sif.py --- EduNLP/SIF/sif.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/EduNLP/SIF/sif.py b/EduNLP/SIF/sif.py index 2b5e82b8..998d19c3 100644 --- a/EduNLP/SIF/sif.py +++ b/EduNLP/SIF/sif.py @@ -3,7 +3,6 @@ import traceback import warnings -import re from .segment import seg from .tokenization import tokenize, link_formulas from .parser import Parser @@ -60,11 +59,7 @@ def to_sif(item): >>> siftext '某校一个课外学习小组为研究某作物的发芽率$y$和温度$x$(单位...' 
""" - item_detextf = '' - textf_segs = re.split(r"\$\\textf\{([^,]+?),b?d?i?t?u?w?}\$", item) - for textf_seg in textf_segs: - item_detextf = item_detextf + textf_seg - item_parser = Parser(item_detextf) + item_parser = Parser(item) item_parser.description_list() item = item_parser.text return item From 0d110dcdc35f7700d4e5de0f4e4976522424d52a Mon Sep 17 00:00:00 2001 From: pingzhi Date: Mon, 16 Aug 2021 16:39:10 +0800 Subject: [PATCH 10/18] Update parser.py --- EduNLP/SIF/parser/parser.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/EduNLP/SIF/parser/parser.py b/EduNLP/SIF/parser/parser.py index db290946..0d26585e 100644 --- a/EduNLP/SIF/parser/parser.py +++ b/EduNLP/SIF/parser/parser.py @@ -1,11 +1,11 @@ from EduNLP.Formula.ast import str2ast, katex_parse +import re class Parser: def __init__(self, data): self.lookahead = 0 self.head = 0 - self.text = data self.error_message = '' self.error_postion = 0 self.error_flag = 0 @@ -40,6 +40,13 @@ def __init__(self, data): '$', ',', '.', '?', '!', ':', ';', '\'', '\"', '(', ')', ' ', '_', '/', '|', '<', '>', '-', '[', ']', '—'] + # 去除 data 中的文本标注格式 + data_detextf = '' + textf_segs = re.split(r"\$\\textf\{([^,]+?),b?d?i?t?u?w?}\$", data) + for textf_seg in textf_segs: + data_detextf = data_detextf + textf_seg + self.text = data_detextf + def is_number(self, uchar): """判断一个unicode是否是数字""" if u'\u0030' <= uchar <= u'\u0039': From 3e49d88f6f84d5d50fd04fefe0b1a9bdf1522fa8 Mon Sep 17 00:00:00 2001 From: pingzhi Date: Tue, 17 Aug 2021 09:44:09 +0800 Subject: [PATCH 11/18] Update parser.py --- EduNLP/SIF/parser/parser.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/EduNLP/SIF/parser/parser.py b/EduNLP/SIF/parser/parser.py index 0d26585e..558cadae 100644 --- a/EduNLP/SIF/parser/parser.py +++ b/EduNLP/SIF/parser/parser.py @@ -6,6 +6,7 @@ class Parser: def __init__(self, data): self.lookahead = 0 self.head = 0 + self.data = data self.error_message = '' self.error_postion = 
0 self.error_flag = 0 @@ -39,13 +40,7 @@ def __init__(self, data): ';', '‘', '’', '“', '”', '(', ')', ' ', '、', '《', '》', '$', ',', '.', '?', '!', ':', ';', '\'', '\"', '(', ')', ' ', '_', '/', '|', '<', '>', '-', '[', ']', '—'] - - # 去除 data 中的文本标注格式 - data_detextf = '' - textf_segs = re.split(r"\$\\textf\{([^,]+?),b?d?i?t?u?w?}\$", data) - for textf_seg in textf_segs: - data_detextf = data_detextf + textf_seg - self.text = data_detextf + def is_number(self, uchar): """判断一个unicode是否是数字""" From 45eaa71c5847b253f2737723510231e314e00685 Mon Sep 17 00:00:00 2001 From: pingzhi Date: Tue, 17 Aug 2021 09:45:31 +0800 Subject: [PATCH 12/18] Update parser.py --- EduNLP/SIF/parser/parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/EduNLP/SIF/parser/parser.py b/EduNLP/SIF/parser/parser.py index 558cadae..2a33d229 100644 --- a/EduNLP/SIF/parser/parser.py +++ b/EduNLP/SIF/parser/parser.py @@ -6,7 +6,7 @@ class Parser: def __init__(self, data): self.lookahead = 0 self.head = 0 - self.data = data + self.text = data self.error_message = '' self.error_postion = 0 self.error_flag = 0 From 83ba24bf80d925444e6720b8ef0199e2aec35046 Mon Sep 17 00:00:00 2001 From: pingzhi Date: Tue, 17 Aug 2021 09:46:39 +0800 Subject: [PATCH 13/18] Update sif.py --- EduNLP/SIF/sif.py | 1 + 1 file changed, 1 insertion(+) diff --git a/EduNLP/SIF/sif.py b/EduNLP/SIF/sif.py index 998d19c3..af4fa63a 100644 --- a/EduNLP/SIF/sif.py +++ b/EduNLP/SIF/sif.py @@ -184,6 +184,7 @@ def sif4sci(item: str, figures: (dict, bool) = None, safe=True, symbol: str = No [['\\SIFChoice']] >>> test_item_3 = r"已知$y=x$,则以下说法中$\textf{正确,b}$的是" >>> tl4 = sif4sci(test_item_3) + Warning: there is some chinese characters in formula! 
>>> tl4.text_segments [['已知'], ['说法', '中', '正确']] """ From 74b9443f47b9fcff19c0a06ae58ceb851fe15e76 Mon Sep 17 00:00:00 2001 From: pingzhi Date: Tue, 17 Aug 2021 23:56:58 +0800 Subject: [PATCH 14/18] Update parser.py Delete a blank line which results in error --- EduNLP/SIF/parser/parser.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/EduNLP/SIF/parser/parser.py b/EduNLP/SIF/parser/parser.py index 2a33d229..db290946 100644 --- a/EduNLP/SIF/parser/parser.py +++ b/EduNLP/SIF/parser/parser.py @@ -1,5 +1,4 @@ from EduNLP.Formula.ast import str2ast, katex_parse -import re class Parser: @@ -40,7 +39,6 @@ def __init__(self, data): ';', '‘', '’', '“', '”', '(', ')', ' ', '、', '《', '》', '$', ',', '.', '?', '!', ':', ';', '\'', '\"', '(', ')', ' ', '_', '/', '|', '<', '>', '-', '[', ']', '—'] - def is_number(self, uchar): """判断一个unicode是否是数字""" From c3159c019244866bec53040ed46e65861d62f872 Mon Sep 17 00:00:00 2001 From: pingzhi Date: Thu, 19 Aug 2021 12:33:56 +0800 Subject: [PATCH 15/18] Update segment.py Rename variable and add annotation for removing `$\textf{}$` --- EduNLP/SIF/segment/segment.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/EduNLP/SIF/segment/segment.py b/EduNLP/SIF/segment/segment.py index d0eed499..ba0cf4c2 100644 --- a/EduNLP/SIF/segment/segment.py +++ b/EduNLP/SIF/segment/segment.py @@ -84,11 +84,13 @@ def __init__(self, item, figures: dict = None): self._ques_mark_segments = [] self._tag_segments = [] self._sep_segments = [] - item_detextf = '' - textf_segs = re.split(r"\$\\textf\{([^,]+?),b?d?i?t?u?w?}\$", item) - for textf_seg in textf_segs: - item_detextf = item_detextf + textf_seg - segments = re.split(r"(\$.+?\$)", item_detextf) + remove_textf_item = '' + remove_textf_segments = re.split(r"\$\\textf\{([^,]+?),b?d?i?t?u?w?}\$", item) + # 按照$\textf{}$切割,$\textf{}$段仅捕获文本内容 + for remove_textf_segment in remove_textf_segments: + remove_textf_item = remove_textf_item + remove_textf_segment + # 连接处理后的字符串 + 
segments = re.split(r"(\$.+?\$)", remove_textf_item) for segment in segments: if not segment: continue From b023a3469edc12db2bb710ffdfed43ebf6c39cfc Mon Sep 17 00:00:00 2001 From: tswsxk Date: Fri, 20 Aug 2021 19:42:24 +0800 Subject: [PATCH 16/18] [feature] rename variable and make code more Pythonic --- EduNLP/SIF/segment/segment.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/EduNLP/SIF/segment/segment.py b/EduNLP/SIF/segment/segment.py index ba0cf4c2..f7254cdb 100644 --- a/EduNLP/SIF/segment/segment.py +++ b/EduNLP/SIF/segment/segment.py @@ -84,13 +84,8 @@ def __init__(self, item, figures: dict = None): self._ques_mark_segments = [] self._tag_segments = [] self._sep_segments = [] - remove_textf_item = '' - remove_textf_segments = re.split(r"\$\\textf\{([^,]+?),b?d?i?t?u?w?}\$", item) - # 按照$\textf{}$切割,$\textf{}$段仅捕获文本内容 - for remove_textf_segment in remove_textf_segments: - remove_textf_item = remove_textf_item + remove_textf_segment - # 连接处理后的字符串 - segments = re.split(r"(\$.+?\$)", remove_textf_item) + item_no_textf = "".join(re.split(r"\$\\textf\{([^,]+?),b?d?i?t?u?w?}\$", item)) # remove $\textf{*} from the item$ + segments = re.split(r"(\$.+?\$)", item_no_textf) for segment in segments: if not segment: continue From 4ebaa356145d5145e692ce70325ed0e6f40452ef Mon Sep 17 00:00:00 2001 From: tswsxk Date: Fri, 20 Aug 2021 19:47:42 +0800 Subject: [PATCH 17/18] [fix] flake8 --- EduNLP/SIF/segment/segment.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/EduNLP/SIF/segment/segment.py b/EduNLP/SIF/segment/segment.py index f7254cdb..506f2281 100644 --- a/EduNLP/SIF/segment/segment.py +++ b/EduNLP/SIF/segment/segment.py @@ -84,7 +84,10 @@ def __init__(self, item, figures: dict = None): self._ques_mark_segments = [] self._tag_segments = [] self._sep_segments = [] - item_no_textf = "".join(re.split(r"\$\\textf\{([^,]+?),b?d?i?t?u?w?}\$", item)) # remove $\textf{*} from the item$ + + # remove $\textf{*} from the item$ + 
item_no_textf = "".join(re.split(r"\$\\textf\{([^,]+?),b?d?i?t?u?w?}\$", item)) + segments = re.split(r"(\$.+?\$)", item_no_textf) for segment in segments: if not segment: From c078ce3f2459a4c9d7a22b2046080bf8050121e8 Mon Sep 17 00:00:00 2001 From: tswsxk Date: Fri, 20 Aug 2021 19:52:39 +0800 Subject: [PATCH 18/18] [fix] FLAKE8 --- EduNLP/SIF/segment/segment.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/EduNLP/SIF/segment/segment.py b/EduNLP/SIF/segment/segment.py index 506f2281..93c5713f 100644 --- a/EduNLP/SIF/segment/segment.py +++ b/EduNLP/SIF/segment/segment.py @@ -84,10 +84,10 @@ def __init__(self, item, figures: dict = None): self._ques_mark_segments = [] self._tag_segments = [] self._sep_segments = [] - + # remove $\textf{*} from the item$ item_no_textf = "".join(re.split(r"\$\\textf\{([^,]+?),b?d?i?t?u?w?}\$", item)) - + segments = re.split(r"(\$.+?\$)", item_no_textf) for segment in segments: if not segment: