-
Notifications
You must be signed in to change notification settings - Fork 0
/
kanamatcher.py
107 lines (83 loc) · 3.52 KB
/
kanamatcher.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import functools
import itertools
import Levenshtein
import common
from needleman_wunsch import needleman_wunsch as align
from reading_splitter import get_readings, process_reading, split_reading
# Penalty added for every kanji character that ends up without ruby
# (applied per character in finalize_furigana).
NO_RUBY_PENALTY = 2
# Penalty per unit of Levenshtein distance between the given kana and the
# reading reconstructed from the resulting ruby (applied in finalize_furigana).
KANA_MISMATCH_PENALTY = 2
# Default cap on the number of alignments filter_alignments will examine.
MAX_NUM_ALIGNMENTS = 1000
def find_matches(a, b):
    """Group two aligned strings into runs of matching / differing positions.

    The inputs are walked in lockstep (stopping at the shorter one).
    Consecutive positions where the two characters are equal are merged into
    one run, and consecutive positions where they differ are merged into
    another.

    Args:
        a: First aligned string.
        b: Second aligned string.

    Returns:
        A list of ``(a_run, b_run)`` string pairs in input order. For a
        matching run both strings are identical; for a differing run they
        differ at every position. Empty input yields an empty list.
    """
    runs = []
    # groupby splits the stream each time the "characters are equal" flag
    # flips, which is exactly the run boundary we want.
    for _, group in itertools.groupby(zip(a, b),
                                      key=lambda pair: pair[0] == pair[1]):
        a_chars, b_chars = zip(*group)
        runs.append(("".join(a_chars), "".join(b_chars)))
    return runs
def filter_alignments(alignments, fill="-", limit=None):
    """Yield the distinct match groupings found among the first alignments.

    Each alignment is grouped with ``find_matches`` and stripped of fill
    characters with ``clear_fill``; a grouping that was already produced by
    an earlier alignment is skipped.

    Args:
        alignments: Iterable of ``(a, b)`` aligned pairs.
        fill: Gap marker to strip from the grouped runs.
        limit: Maximum number of alignments to examine. ``None`` selects
            ``MAX_NUM_ALIGNMENTS``; ``0`` examines none.

    Yields:
        Lists of ``(a_run, b_run)`` pairs, each distinct from the ones
        yielded before it.
    """
    # Only None means "use the default"; an explicit 0 must stay 0.
    if limit is None:
        limit = MAX_NUM_ALIGNMENTS
    seen = set()
    for a, b in itertools.islice(alignments, limit):
        match = clear_fill(find_matches(a, b), fill=fill)
        # Lists are unhashable, so use a tuple snapshot as the dedup key.
        key = tuple(match)
        if key in seen:
            continue
        seen.add(key)
        yield match
def clear_fill(l, fill='-'):
    """Remove every occurrence of ``fill`` from both strings of each pair.

    Args:
        l: Iterable of ``(a, b)`` string pairs.
        fill: Substring to delete from each string.

    Returns:
        A new list of ``(a, b)`` pairs with ``fill`` removed; the input is
        not modified.
    """
    return [(a.replace(fill, ''), b.replace(fill, '')) for a, b in l]
def finalize_furigana(l, return_score=False):
    """Convert grouped ``(text, kana)`` segments into furigana pairs and score them.

    Args:
        l: Re-iterable sequence of ``(kanji, kana)`` string pairs (it is
            traversed twice). An empty sequence is accepted.
        return_score: When true, return ``(furigana, total_score)`` instead
            of just the furigana list.

    Returns:
        A flat list of ``(text, reading)`` pairs, where ``reading`` is
        ``None`` for text that received no ruby. With ``return_score`` the
        list is paired with its score: the per-segment scores plus
        ``KANA_MISMATCH_PENALTY`` times the Levenshtein distance between the
        given kana and the reading reconstructed from the result.
    """
    def process_furigana(kanji, kana):
        # Only a non-empty segment that differs from its kana and consists
        # solely of kanji / ASCII digits is handed to split_reading.
        needs_ruby = (
            common.to_hiragana(kanji) != kana
            and len(kanji) != 0
            and all(common.is_kanji(k) or ord('0') <= ord(k) <= ord('9')
                    for k in kanji)
        )
        if needs_ruby:
            # NOTE(review): assumes split_reading(..., return_score=True)
            # returns (list_of_pairs, score) - confirm in reading_splitter.
            return split_reading(kanji, kana, return_score=True)
        # Everything else is kept as-is without ruby; any kanji it contains
        # is penalised per character.
        return [(kanji, None)], sum(NO_RUBY_PENALTY for k in kanji
                                    if common.is_kanji(k))

    # Accumulate directly instead of unpacking zip(*...), which raises
    # ValueError when there are no segments at all.
    furigana = []
    segment_score = 0
    for kanji, kana in l:
        pairs, score = process_furigana(kanji, common.to_hiragana(kana))
        furigana.extend(pairs)
        segment_score += score

    given_reading = common.to_hiragana("".join(kana for _, kana in l))
    # Text without ruby stands in for its own reading.
    produced_reading = common.to_hiragana(
        "".join(kana if kana is not None else kanji
                for kanji, kana in furigana))
    total_score = segment_score + KANA_MISMATCH_PENALTY * \
        Levenshtein.distance(given_reading, produced_reading)
    return (furigana, total_score) if return_score else furigana
def match_kana(kanji, kana, return_score=False):
    """Pick the lowest-scoring furigana assignment for ``kanji`` given ``kana``.

    Alignments from ``align(kanji, kana)`` are deduplicated by
    ``filter_alignments``, scored by ``finalize_furigana``, and the candidate
    with the smallest score is returned.

    Args:
        kanji: Text to annotate.
        kana: Reading of the whole text.
        return_score: When true, return ``(best_match, score)`` instead of
            just the best match.

    Returns:
        The best furigana list, optionally paired with its score.

    Raises:
        ValueError: If no alignment candidate is produced.
    """
    def stoponzero(alignments):
        # Stop scoring once a candidate reaches 0.
        # NOTE(review): presumably scores are never negative, so 0 cannot be
        # beaten - confirm against split_reading's scoring.
        for alignment in alignments:
            furigana, score = finalize_furigana(alignment, return_score=True)
            yield furigana, score
            if score == 0:
                break

    candidates = stoponzero(filter_alignments(align(kanji, kana)))
    best = min(candidates, key=lambda candidate: candidate[1], default=None)
    if best is None:
        # Same exception type min() would raise, with a usable message.
        raise ValueError("no alignment candidates for the given kanji/kana")
    best_match, score = best
    return (best_match, score) if return_score else best_match
def pretty_print(pairs, fill=" "):
    """Render furigana pairs as two column-aligned lines.

    Each pair becomes one cell terminated by ``"|"``. Within a cell the
    shorter of reading and text is right-padded with ``fill`` so both lines
    have the same length.

    Args:
        pairs: Iterable of ``(text, reading)`` pairs; ``reading`` may be
            ``None`` for text without ruby.
        fill: Padding string, expected to be a single character so the
            columns line up.

    Returns:
        A ``(ruby_line, text_line)`` tuple of strings.
    """
    ruby_cells = []
    text_cells = []
    for kanji, kana in pairs:
        reading = kana if kana is not None else ""
        # Both lines have equal length after every cell, so padding each
        # cell independently is equivalent to padding the running lines.
        width = max(len(reading), len(kanji))
        ruby_cells.append(reading + fill * (width - len(reading)) + "|")
        text_cells.append(kanji + fill * (width - len(kanji)) + "|")
    return "".join(ruby_cells), "".join(text_cells)
if __name__ == "__main__":
    # Demo: annotate a sample sentence with its reading and print the ruby
    # line above the text line.
    kanji = "強い相手を求めて空を飛び回る。なんでも溶かしてしまう高熱の炎を自分より弱いものに向けることはしない。"
    kana = "つよいあいてをもとめてそらをとびまわる。なんでもとかしてしまうこうねつのほのおをじぶんよりよわいものにむけることはしない。"
    print("\n".join(pretty_print(match_kana(kanji, kana))))