# BoWの動作理解

# 単純なBoWの実装

In [1]:
# Input corpus: one string per sentence/document. Tokens are obtained by plain
# whitespace splitting, so punctuation stays attached (e.g. "animals.").
morphemes = ['I like animals. But I don\'t like bags.']

# Assign each distinct token an integer ID in order of first appearance.
word2id = {}  # {token: ID}
for line in morphemes:
    for word in line.split():
        # setdefault inserts only when the token is new, so len(word2id)
        # is always the next unused ID at the moment of insertion.
        word2id.setdefault(word, len(word2id))

# Build the Bag of Words: one count vector per line, where position i holds
# the number of occurrences of the token whose ID is i.
bow_set = []
for line in morphemes:
    bow = [0] * len(word2id)
    for word in line.split():
        word_id = word2id.get(word)
        if word_id is None:
            # Token is not in the vocabulary: skip it explicitly instead of
            # hiding every possible error behind a bare `except: pass`.
            continue
        bow[word_id] += 1
    bow_set.append(bow)
print(*bow_set, sep="\n")

[2, 2, 1, 1, 1, 1]


# Gensimを使ったBoWの実装

In [6]:
from gensim.corpora import Dictionary
from gensim import matutils as mtu

# Build the gensim dictionary, feeding it one tokenized sentence at a time.
# gensim expects tokens that are already tokenized and normalized; here the
# tokenization is a plain whitespace split.
dct = Dictionary()
for tokens in map(str.split, morphemes):
    dct.add_documents([tokens])
word2id = dct.token2id  # token -> ID
print(word2id)

separator = "-" * 20
bow_set = []
# Convert every sentence to its BoW representation and print each stage.
for line in morphemes:
    # Sparse form: list of (word ID, word frequency) pairs.
    bow_format = dct.doc2bow(line.split())
    bow_set.append(bow_format)
    print(separator, line, separator, sep="\n")
    print("BoW format: (word ID, word frequency)")
    print(bow_format)
    # Expand the sparse pairs into a dense matrix with len(dct) terms, then
    # take the first row of its transpose as this sentence's vector.
    dense_matrix = mtu.corpus2dense([bow_format], num_terms=len(dct))
    bow = dense_matrix.T[0]
    print(separator, "BoW", sep="\n")
    # Turn the numpy values into plain Python ints before printing.
    print([int(count) for count in bow.tolist()])
    print(separator)

{'But': 0, 'I': 1, 'animals.': 2, 'bags.': 3, "don't": 4, 'like': 5}
--------------------
I like animals. But I don't like bags.
--------------------
BoW format: (word ID, word frequency)
[(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (5, 2)]
--------------------
BoW
[1, 2, 1, 1, 1, 2]
--------------------


# Scikit-learnを使ったBoWの実装

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# Token pattern: any run of one or more word characters between word
# boundaries, so single-character tokens are counted as well.
token_pattern = u'(?u)\\b\\w+\\b'
vectorizer = CountVectorizer(token_pattern=token_pattern)
# Fit the vectorizer on the sentences and transform them in one step.
bag = vectorizer.fit_transform(morphemes)

In [9]:
# Dense view of the fitted count matrix.
print(bag.toarray())
# Mapping of token -> column index in the matrix above.
print(vectorizer.vocabulary_)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when available and fall back for older
# releases; .tolist() keeps the printed result a plain Python list.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names)

[[1 1 1 1 2 2 1]]
{'i': 4, 'like': 5, 'animals': 0, 'but': 2, 'don': 3, 't': 6, 'bags': 1}
['animals', 'bags', 'but', 'don', 'i', 'like', 't']
