# Bag-of-Wordsを作ってみる

In [24]:
import MeCab

In [25]:
# Sample corpus: one sentence per list entry.
documents = [
    "布団が吹っ飛んだ",
    "今日と明日は京都に行きます",
    "今日は雨が降っています",
    "授業は明日の午後からです",
]

In [26]:
# Constants

# NOTE(review): not referenced anywhere in the visible code; presumably the
# end-of-sentence marker line emitted by the tokenizer -- confirm.
EOS = "EOS"
# Small katakana that split_mora attaches to the preceding mora
# instead of starting a new one.
KOMOJI_KANA = "ァィゥェォャュョ"

In [27]:
# Accumulates the first field of every token line parsed from the corpus.
word_list: list[str] = []

In [28]:
# Shared MeCab tagger used by every cell that tokenizes text.
# "-Ochasen" requests the ChaSen-style output format; the code below relies on
# each output line carrying whitespace-separated fields (field 0 and field 1
# are the ones read).
tagger  = MeCab.Tagger("-Ochasen")

In [29]:
# Tokenize every document and collect the first field of each token line.
for sentence in documents:
    parsed: str = tagger.parse(sentence)
    # The last line of the parser output is skipped
    # (presumably the end-of-sentence marker).
    word_list.extend(
        token_line.split()[0] for token_line in parsed.splitlines()[:-1]
    )

In [30]:
# Vocabulary: each distinct word gets the next free index, in order of first
# appearance in word_list.
word_to_id: dict[str, int] = {}
for word in word_list:
    # setdefault leaves an already-registered word untouched.
    word_to_id.setdefault(word, len(word_to_id))

In [31]:
def gen_bow(sentence: str):
    """Build a binary bag-of-words vector for ``sentence``.

    The sentence is parsed with the module-level ``tagger``. For every token
    line, the first whitespace-separated field is looked up in
    ``word_to_id``; if present, the slot at that index is set to 1. Words
    outside the vocabulary are ignored, and repeated words do not raise the
    value above 1.

    Args:
        sentence: Text to vectorize.

    Returns:
        A list of 0/1 ints with one slot per entry of ``word_to_id``.
    """
    vector = [0 for _ in word_to_id]
    token_lines = tagger.parse(sentence).splitlines()
    # The last line of the parser output is skipped
    # (presumably the end-of-sentence marker).
    for token_line in token_lines[:-1]:
        surface = token_line.split()[0]
        index = word_to_id.get(surface)
        if index is not None:
            vector[index] = 1
    return vector

In [32]:
# Vectorize a sentence that is not part of `documents`; only words already in
# word_to_id can set a slot.
bow = gen_bow("今日私がいただくのは、名店のロールキャベツです")

In [33]:
# Bare expression: in a notebook cell this displays the value of `bow`.
bow

[0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1]

# 読み仮名列完全一致区間を探したい

## 文章の読み仮名を取得する

# 子音の音韻類似度を計算する

## 読み仮名をローマ字に変換する

In [34]:
import romkan

In [35]:
def split_mora(kana: str):
    """Split a katakana string into a list of morae.

    Every character starts a new mora, except the small kana listed in
    ``KOMOJI_KANA``, which are appended to the preceding mora
    (e.g. ``"キョウ"`` -> ``["キョ", "ウ"]``).

    Args:
        kana: Katakana text to split.

    Returns:
        A list of strings, one per mora, in input order. An empty input
        yields an empty list.
    """
    mora_list: list[str] = []
    for char in kana:
        # A small kana is only merged when there is a mora to attach it to;
        # a leading small kana becomes its own mora instead of raising
        # IndexError on the empty list.
        if char in KOMOJI_KANA and mora_list:
            mora_list[-1] += char
        else:
            mora_list.append(char)
    return mora_list

In [36]:
# For every token of every document, print the token's morae converted to
# romaji, one list per token.
for sentence in documents:
    parsed = tagger.parse(sentence)
    # The last line of the parser output is skipped
    # (presumably the end-of-sentence marker).
    for line in parsed.splitlines()[:-1]:
        # The second whitespace-separated field is used as the kana reading.
        word = line.split()[1]
        print([romkan.to_roma(mora) for mora in split_mora(word)])

['fu', 'to', 'n']
['ga']
['fu', 'xtsu', 'to', 'n']
['da']
['kyo', 'u']
['to']
['a', 'shi', 'ta']
['ha']
['kyo', 'u', 'to']
['ni']
['i', 'ki']
['ma', 'su']
['kyo', 'u']
['ha']
['a', 'me']
['ga']
['fu', 'xtsu']
['te']
['i']
['ma', 'su']
['ju', 'gyo', 'u']
['ha']
['a', 'shi', 'ta']
['no']
['go', 'go']
['ka', 'ra']
['de', 'su']
