diff --git a/AUTHORS.md b/AUTHORS.md index 41423fa6..df16f050 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -16,5 +16,7 @@ [Meikai Bao](https://github.com/BAOOOOOM) +[Yuting Ning](https://github.com/nnnyt) + The stared contributors are the corresponding authors. diff --git a/EduNLP/Tokenizer/tokenizer.py b/EduNLP/Tokenizer/tokenizer.py index bb7b47e5..08b09e26 100644 --- a/EduNLP/Tokenizer/tokenizer.py +++ b/EduNLP/Tokenizer/tokenizer.py @@ -65,6 +65,12 @@ class TextTokenizer(Tokenizer): >>> tokens = tokenizer(items) >>> next(tokens)[:10] ['公式', '[FORMULA]', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[FORMULA]'] + >>> items = ["$\\SIFTag{stem_begin}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem_end}$\ + ... $\\SIFTag{options_begin}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$\ + ... $\\SIFTag{list_3}$2$\\SIFTag{options_end}$"] + >>> tokens = tokenizer(items) + >>> next(tokens)[:10] + ['[TAG]', '复数', 'z', '=', '1', '+', '2', 'i', '+', 'i'] """ def __init__(self, *args, **kwargs): diff --git a/tests/test_sif/test_tokenization.py b/tests/test_sif/test_tokenization.py index b904a71b..43dfde02 100644 --- a/tests/test_sif/test_tokenization.py +++ b/tests/test_sif/test_tokenization.py @@ -3,7 +3,7 @@ import pytest from EduNLP.SIF.constants import Symbol -from EduNLP.SIF.segment.segment import SegmentList +from EduNLP.SIF.segment.segment import SegmentList, LatexFormulaSegment from EduNLP.SIF.tokenization import text from EduNLP.SIF.tokenization import formula from EduNLP.SIF.tokenization.tokenization import TokenList @@ -32,3 +32,5 @@ def test_tokenization(): with pytest.raises(TypeError): tl.append("[Unknown]") + + tl.append(LatexFormulaSegment('x+y'), False)