From 1d0ad185bca2e01406dc25653ba5b895d6aa541e Mon Sep 17 00:00:00 2001
From: nnnyt <793313994@qq.com>
Date: Fri, 3 Sep 2021 16:05:26 +0800
Subject: [PATCH 1/2] [test] add test for tokenizer

---
 AUTHORS.md                             |  2 ++
 tests/test_tokenizer/test_tokenizer.py | 22 ++++++++++++++++++++++
 2 files changed, 24 insertions(+)

diff --git a/AUTHORS.md b/AUTHORS.md
index 41423fa6..df16f050 100644
--- a/AUTHORS.md
+++ b/AUTHORS.md
@@ -16,5 +16,7 @@
 
 [Meikai Bao](https://github.com/BAOOOOOM)
 
+[Yuting Ning](https://github.com/nnnyt)
+
 The starred contributors are the corresponding authors.
 
diff --git a/tests/test_tokenizer/test_tokenizer.py b/tests/test_tokenizer/test_tokenizer.py
index 9f7e03f8..acc9269c 100644
--- a/tests/test_tokenizer/test_tokenizer.py
+++ b/tests/test_tokenizer/test_tokenizer.py
@@ -8,3 +8,25 @@
 def test_tokenizer():
     with pytest.raises(KeyError):
         get_tokenizer("error")
+
+
+def test_text_tokenizer():
+    items = ["有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,若$x,y$满足约束条件公式\
+        $\\FormFigureBase64{wrong2?}$,$\\SIFSep$,\
+        则$z=x+7 y$的最大值为$\\SIFBlank$"]
+    tokenizer = get_tokenizer('text')
+    tokens = tokenizer(items)
+    token_list = next(tokens)[:10]
+    assert token_list == ['公式', '[FORMULA]', '如图', '[FIGURE]', 'x', ',',
+                          'y', '约束条件', '公式', '[FORMULA]'], token_list
+
+
+def test_pure_text_tokenizer():
+    items = ["有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,若$x,y$满足约束条件公式\
+        $\\FormFigureBase64{wrong2?}$,$\\SIFSep$,\
+        则$z=x+7 y$的最大值为$\\SIFBlank$"]
+    tokenizer = get_tokenizer('pure_text')
+    tokens = tokenizer(items)
+    token_list = next(tokens)[:10]
+    assert token_list == ['公式', '如图', '[FIGURE]', 'x', ',', 'y',
+                          '约束条件', '公式', '[SEP]', 'z'], token_list

From 469af6f09e81e3e2cb2854ac9f59b3f6209ef9b2 Mon Sep 17 00:00:00 2001
From: nnnyt <793313994@qq.com>
Date: Fri, 3 Sep 2021 20:47:25 +0800
Subject: [PATCH 2/2] [test] add test to cover the tokenization

---
 EduNLP/Tokenizer/tokenizer.py          |  6 ++++++
 tests/test_sif/test_tokenization.py    |  4 +++-
 tests/test_tokenizer/test_tokenizer.py | 22 ----------------------
 3 files changed, 9 insertions(+), 23 deletions(-)

diff --git a/EduNLP/Tokenizer/tokenizer.py b/EduNLP/Tokenizer/tokenizer.py
index bb7b47e5..08b09e26 100644
--- a/EduNLP/Tokenizer/tokenizer.py
+++ b/EduNLP/Tokenizer/tokenizer.py
@@ -65,6 +65,12 @@ class TextTokenizer(Tokenizer):
     >>> tokens = tokenizer(items)
     >>> next(tokens)[:10]
     ['公式', '[FORMULA]', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[FORMULA]']
+    >>> items = ["$\\SIFTag{stem_begin}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem_end}$\
+    ... $\\SIFTag{options_begin}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$\
$\\SIFTag{list_3}$2$\\SIFTag{options_end}$"] + >>> tokens = tokenizer(items) + >>> next(tokens)[:10] + ['[TAG]', '复数', 'z', '=', '1', '+', '2', 'i', '+', 'i'] """ def __init__(self, *args, **kwargs): diff --git a/tests/test_sif/test_tokenization.py b/tests/test_sif/test_tokenization.py index b904a71b..43dfde02 100644 --- a/tests/test_sif/test_tokenization.py +++ b/tests/test_sif/test_tokenization.py @@ -3,7 +3,7 @@ import pytest from EduNLP.SIF.constants import Symbol -from EduNLP.SIF.segment.segment import SegmentList +from EduNLP.SIF.segment.segment import SegmentList, LatexFormulaSegment from EduNLP.SIF.tokenization import text from EduNLP.SIF.tokenization import formula from EduNLP.SIF.tokenization.tokenization import TokenList @@ -32,3 +32,5 @@ def test_tokenization(): with pytest.raises(TypeError): tl.append("[Unknown]") + + tl.append(LatexFormulaSegment('x+y'), False) diff --git a/tests/test_tokenizer/test_tokenizer.py b/tests/test_tokenizer/test_tokenizer.py index acc9269c..9f7e03f8 100644 --- a/tests/test_tokenizer/test_tokenizer.py +++ b/tests/test_tokenizer/test_tokenizer.py @@ -8,25 +8,3 @@ def test_tokenizer(): with pytest.raises(KeyError): get_tokenizer("error") - - -def test_text_tokenizer(): - items = ["有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,若$x,y$满足约束条件公式\ - $\\FormFigureBase64{wrong2?}$,$\\SIFSep$,\ - 则$z=x+7 y$的最大值为$\\SIFBlank$"] - tokenizer = get_tokenizer('text') - tokens = tokenizer(items) - token_list = next(tokens)[:10] - assert token_list == ['公式', '[FORMULA]', '如图', '[FIGURE]', 'x', ',', - 'y', '约束条件', '公式', '[FORMULA]'], token_list - - -def test_pure_text_tokenzier(): - items = ["有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,若$x,y$满足约束条件公式\ - $\\FormFigureBase64{wrong2?}$,$\\SIFSep$,\ - 则$z=x+7 y$的最大值为$\\SIFBlank$"] - tokenizer = get_tokenizer('pure_text') - tokens = tokenizer(items) - token_list = next(tokens)[:10] - assert token_list == ['公式', '如图', '[FIGURE]', 'x', ',', 'y', - '约束条件', '公式', '[SEP]', 'z'], token_list