forked from JuliaStrings/TinySegmenter.jl
-
Notifications
You must be signed in to change notification settings - Fork 0
/
test_tinysegmenter.py
45 lines (35 loc) · 1.28 KB
/
test_tinysegmenter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# coding: utf-8
#
# Usage: py.test -v test_tinysegmenter.py
#
# `pip install -r requirements.txt` is required.
from __future__ import unicode_literals
import io
import subprocess
import tinysegmenter
import pytest
def test_ctypes():
    """Check that ``tinysegmenter._ctype`` returns the expected one-letter code.

    Each sample character is passed to ``_ctype`` on its own and the result
    is compared with the single-letter code recorded next to it.
    """
    classify = tinysegmenter._ctype

    # (sample character, expected code) pairs, checked in this order.
    samples = (
        ('一', 'M'),
        ('〆', 'H'),
        ('名', 'H'),
        ('あ', 'I'),
        ('ア', 'K'),
        ('Z', 'A'),
        ('9', 'N'),
    )
    for char, expected in samples:
        assert classify(char) == expected
def test_tokenize():
    """Check ``tinysegmenter.tokenize`` on two sentences and on empty input."""
    segment = tinysegmenter.tokenize

    expected_plain = ["私", "の", "名前", "は", "中野", "です"]
    assert segment("私の名前は中野です") == expected_plain

    # Mixed Latin / digit / Japanese input; the digits "2" and "5" are
    # expected as separate tokens.
    expected_mixed = [
        "TinySegmenter", "は", "2", "5", "kB", "で",
        "書か", "れ", "て", "い", "ます", "。",
    ]
    assert segment("TinySegmenterは25kBで書かれています。") == expected_mixed

    # Empty input yields an empty token list.
    assert segment("") == []
def test_timemachine(tmpdir):
    """Tokenize the sample text and compare it with the stored reference output.

    Reads ``timemachineu8j.txt`` (UTF-8) from the sibling ``../test``
    directory, tokenizes it, writes the tokens joined by ``' | '`` to
    ``tokenized.txt`` inside ``tmpdir``, and requires ``diff -u`` against
    ``timemachineu8j.tokenized.txt`` to exit with status 0.

    The ``../test`` directory is located relative to this test file rather
    than the current working directory, so the test does not depend on the
    directory pytest was launched from.

    Args:
        tmpdir: pytest's temporary-directory fixture for this test.
    """
    import os  # function-scope import keeps this block self-contained

    fixture_dir = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), os.pardir, 'test')

    with io.open(os.path.join(fixture_dir, 'timemachineu8j.txt'),
                 encoding='utf-8') as f:
        text = f.read()

    toks = tinysegmenter.tokenize(text)

    out = tmpdir.join("tokenized.txt")
    out.write_text(' | '.join(toks), encoding='utf-8')
    print(str(out))  # pytest shows this output only when the test fails

    # The comparison relies on the external `diff` program being on PATH;
    # exit status 0 means the two files are identical.
    expected_path = os.path.join(fixture_dir, 'timemachineu8j.tokenized.txt')
    assert 0 == subprocess.call(["diff", "-u", expected_path, str(out)])