In [12]:
import glob
import MeCab
import ipadic
from gensim.models import Word2Vec
import re
import numpy as np

In [13]:
files = glob.glob("sample_data/*.txt")

documents = []

for file_path in files:
    with open(file_path, "r", encoding="utf-8") as f:
        documents.append(f.read())

In [14]:
document_list = []

m = MeCab.Tagger(ipadic.MECAB_ARGS)
pattern = re.compile(r"[ぁ-んァ-ヶ一-龥々]")

for document in documents:
    m1 = m.parse(document)
    
    noun_list =[]
    for row in m1.split("\n"):
        target_word = row.split("\t")[0]
        if target_word == "EOS":
            break
        else:
            word = row.split("\t")[1]
            if word[:2] == "名詞":
                if pattern.findall(target_word) != []:
                   noun_list.append(target_word)

    document_list.append(noun_list)

In [15]:
model = Word2Vec(document_list, min_count=1)

In [16]:
model.wv["放送"]

array([-0.01223801,  0.00899587, -0.00742968,  0.00768361,  0.00733178,
       -0.0109793 ,  0.01513781,  0.01975149, -0.0187292 ,  0.00405822,
       -0.01087262, -0.00713097,  0.0066231 ,  0.0051896 , -0.00376334,
       -0.01117102,  0.00956615, -0.01695116,  0.00156113, -0.0269539 ,
        0.01944072, -0.00366285,  0.01575053, -0.00824062, -0.01383275,
       -0.00673301, -0.01022001,  0.00027153, -0.00880882,  0.00927916,
        0.01530651,  0.00194911,  0.00422172, -0.00035427, -0.00509598,
        0.01499727, -0.00834502, -0.00342295, -0.00518718, -0.0175535 ,
        0.00845263, -0.01097074, -0.00749972,  0.00063815,  0.00681255,
       -0.00838283, -0.00290042, -0.00911621,  0.0117795 ,  0.00662136,
        0.00911904, -0.00320179, -0.00363818,  0.00017367, -0.00949273,
        0.00931522,  0.01036452, -0.00371016, -0.02073824, -0.00818164,
        0.00437739, -0.00243505, -0.00645005, -0.00339031, -0.02134898,
        0.01282998,  0.00800594,  0.00387142, -0.02135232,  0.01

In [17]:
def cos_sim(vec_a, vec_b):
    dot = np.dot(vec_a, vec_b)
    norm_a = np.linalg.norm(vec_a)
    norm_b = np.linalg.norm(vec_b)
    return dot / (norm_a * norm_b)

In [18]:
cos_sim(model.wv["放送"], model.wv["視聴"])

0.54794896

In [19]:
model.wv.most_similar("視聴", topn=5)

[('の', 0.703569233417511),
 ('日本', 0.6732020974159241),
 ('こと', 0.6616107225418091),
 ('ため', 0.65836101770401),
 ('ん', 0.6570427417755127)]

#### ストップワードを取得

In [20]:
stop_words = []
with open("stop_words.txt", "r", encoding="utf-8") as f:
    for row in f.readlines():
        stop_words.append(row.strip())

In [26]:
document_list = []

m = MeCab.Tagger(ipadic.MECAB_ARGS)
pattern = re.compile(r"[ぁ-んァ-ヶ一-龥々]")

for document in documents:
    m1 = m.parse(document)
    
    noun_list =[]
    for row in m1.split("\n"):
        target_word = row.split("\t")[0]
        if target_word == "EOS":
            break
        else:
            word = row.split("\t")[1]
            if word[:2] == "名詞":
                if pattern.findall(target_word) != [] and target_word not in stop_words:
                   noun_list.append(target_word)

    document_list.append(noun_list)

In [27]:
model = Word2Vec(document_list, min_count=1)

In [28]:
cos_sim(model.wv["放送"], model.wv["視聴"])

0.27328742

In [29]:
model.wv.most_similar("視聴", topn=5)

[('記事', 0.49594712257385254),
 ('話題', 0.49297645688056946),
 ('番組', 0.48523572087287903),
 ('ネット', 0.4701656103134155),
 ('批判', 0.45258843898773193)]

#### 文章の類似度

In [33]:
sentence = ["私", "は", "報道", "アナウンサー", "です"]

word_vec = []

for word in sentence:
    if word in model.wv:
        word_vec.append(model.wv[word])

In [34]:
word_vec

[array([-0.00250608,  0.00617311, -0.00019012,  0.01070487,  0.00344567,
        -0.00387439,  0.00242922,  0.01281414,  0.0019681 , -0.00227902,
         0.00147246, -0.01029242,  0.00373494,  0.00741207,  0.00418108,
        -0.00866058, -0.00465111, -0.00234443, -0.00560736, -0.01059185,
         0.00306205,  0.00746861, -0.00432432,  0.0074281 , -0.00310169,
        -0.00461752, -0.00097507, -0.00542675, -0.01225483, -0.00456709,
         0.00529901, -0.00666004,  0.00168128, -0.00074992, -0.0051614 ,
         0.00753335,  0.00361974,  0.00594313, -0.00202876, -0.00848688,
         0.00376762, -0.00562216, -0.01415814,  0.00955153,  0.00207945,
        -0.01175285, -0.01211939, -0.00452767,  0.00493611,  0.00865615,
         0.00829657, -0.01244357, -0.00217279, -0.00393765, -0.00883374,
         0.01009202,  0.00680818, -0.00452568, -0.01388145, -0.00408897,
         0.00031419,  0.00930025,  0.00415155, -0.00817937, -0.01619464,
         0.01157124, -0.00071848,  0.00059852, -0.0

In [36]:
np.array(word_vec).shape

(2, 100)

In [None]:
np.mean(word_vec, axis=0)

array([-2.5070109e-05,  3.2929011e-04], dtype=float32)

In [41]:
def cal_vec(sentence, model):
    word_vec = [model.wv[word]for word in sentence if word in model.wv]
    sentence_vec = np.mean(word_vec, axis=0)
    
    return sentence_vec

In [45]:
vec1 = cal_vec(document_list[38], model)
vec2 = cal_vec(document_list[40], model)

In [46]:
cos_sim(vec1, vec2)

0.91900086