# Define the preprocessing function (tokenize the text and assign word ids)

In [1]:
import numpy as np

In [2]:
# Vocabulary shared by every preprocess() call. Ids are handed out in
# first-seen order, so the mappings keep growing when several texts are
# preprocessed in the same session.
word_to_id = {}
id_to_word = {}


def preprocess(text):
    """Tokenize ``text`` and encode it as a sequence of word ids.

    The text is lower-cased, '.' and ',' are separated from the preceding
    word so they become tokens of their own, and the result is split on
    whitespace. Every token not yet present in the module-level vocabulary
    is given the next free id.

    Args:
        text: The raw text to process.

    Returns:
        A tuple ``(corpus, words, word_to_id, id_to_word)`` where ``corpus``
        is an integer array holding the id of every token in order of
        appearance (``corpus[i] == word_to_id[words[i]]``), ``words`` is the
        list of tokens, and the two dicts are the module-level vocabulary
        mappings (the same objects on every call).
    """
    text = text.lower()
    text = text.replace('.', ' .')
    text = text.replace(',', ' ,')
    # split() without a separator collapses runs of whitespace, so repeated
    # spaces or an empty text never produce an empty-string "word".
    words = text.split()

    for word in words:
        if word not in word_to_id:
            new_id = len(word_to_id)
            word_to_id[word] = new_id
            id_to_word[new_id] = word

    # One id per token of the text (not one per vocabulary entry); dtype is
    # given explicitly so an empty text still yields an integer array.
    corpus = np.array([word_to_id[word] for word in words], dtype=int)
    return corpus, words, word_to_id, id_to_word

# Build the co-occurrence matrix

In [3]:
def create_co_matrix(corpus, windowsize=1, vocab_size=None):
    """Build a word co-occurrence matrix from a sequence of word ids.

    For every position in ``corpus`` the ids found up to ``windowsize``
    positions to the left and to the right are counted as co-occurring with
    the id at that position.

    Args:
        corpus: 1-D sequence of non-negative integer word ids, one per token.
        windowsize: Number of neighbours considered on each side.
        vocab_size: Number of distinct word ids, i.e. the side length of the
            returned matrix. When omitted it is taken as the largest id in
            ``corpus`` plus one (0 for an empty corpus).

    Returns:
        An int32 array of shape ``(vocab_size, vocab_size)`` where entry
        ``[a, b]`` is how many times id ``b`` appeared within the window
        around an occurrence of id ``a``.
    """
    corpus = np.asarray(corpus)
    corpus_len = len(corpus)
    if vocab_size is None:
        # The matrix is indexed by word id, so it must be sized by the
        # vocabulary and not by the number of tokens in the corpus.
        vocab_size = int(corpus.max()) + 1 if corpus_len else 0
    co_matrix = np.zeros((vocab_size, vocab_size), dtype=np.int32)

    for idx, word_id in enumerate(corpus):
        for offset in range(1, windowsize + 1):
            left_idx = idx - offset
            right_idx = idx + offset

            if left_idx >= 0:
                co_matrix[word_id, corpus[left_idx]] += 1

            # Positions are bounded by the corpus length.
            if right_idx < corpus_len:
                co_matrix[word_id, corpus[right_idx]] += 1
    return co_matrix

In [4]:
# My own version: takes the raw text and runs preprocess internally

def create_co_matrix(text, windowsize=1):
    """Build a word co-occurrence matrix directly from raw text.

    The text is run through ``preprocess`` and, for every token, the tokens
    found up to ``windowsize`` positions to the left and to the right are
    counted as co-occurring with it.

    Args:
        text: The raw text to analyse.
        windowsize: Number of neighbours considered on each side.

    Returns:
        An int32 square array with one row/column per entry of the
        ``word_to_id`` mapping returned by ``preprocess``; entry ``[a, b]``
        is how many times the word with id ``b`` appeared within the window
        around an occurrence of the word with id ``a``.
    """
    corpus, words, word_to_id, id_to_word = preprocess(text)
    # Rows/columns are indexed by word id, so the matrix is sized by the
    # vocabulary; window positions are bounded by the number of tokens.
    # Keeping the two apart is what lets the last tokens of a text with
    # repeated words still see their right-hand neighbours.
    vocab_size = len(word_to_id)
    n_words = len(words)
    co_matrix = np.zeros((vocab_size, vocab_size), dtype=np.int32)

    for idn, word in enumerate(words):
        word_id = word_to_id[word]
        for i in range(1, windowsize + 1):
            left_idx = idn - i
            right_idx = idn + i

            if left_idx >= 0:
                co_matrix[word_id, word_to_id[words[left_idx]]] += 1

            if right_idx < n_words:
                co_matrix[word_id, word_to_id[words[right_idx]]] += 1
    return co_matrix

In [5]:
# Build the co-occurrence matrix of a sample sentence, using the default
# window size of one neighbour on each side.
sample = "Today, most people buy tofu at supermarkets, but the significance of tofu in the Japanese diet has not changed a bit."
a = create_co_matrix(sample)

# Compute the cosine similarity

In [6]:
def coss_similarity(x, y, eps=1e-8):
    """Return the cosine similarity of the vectors ``x`` and ``y``.

    Each vector is scaled by the square root of (its sum of squares plus
    ``eps``) and the dot product of the two scaled vectors is returned.

    Args:
        x: First vector (NumPy array).
        y: Second vector (NumPy array).
        eps: Small constant added under the square root so that an all-zero
            vector does not cause a division by zero.

    Returns:
        The dot product of the two scaled vectors.
    """
    def _scaled(vec):
        # eps sits inside the square root, next to the sum of squares.
        length = np.sqrt(np.sum(vec ** 2) + eps)
        return vec / length

    return np.dot(_scaled(x), _scaled(y))

# Display the similarity ranking

In [7]:
def most_similar(query, word_to_id, id_to_word, word_matrix, top=5):
    """Print the words whose vectors are most similar to that of ``query``.

    The row of ``word_matrix`` belonging to ``query`` is compared against
    the row of every word id using ``coss_similarity``, and the best
    matches (excluding ``query`` itself) are printed as ``word:similarity``
    in descending order of similarity.

    Args:
        query: The word to look up.
        word_to_id: Mapping from word to word id.
        id_to_word: Mapping from word id to word; its length determines how
            many rows of ``word_matrix`` are compared.
        word_matrix: 2-D array whose row ``i`` is the vector of word id ``i``.
        top: Maximum number of words to print. Nothing is printed when it
            is zero or negative.

    Returns:
        None. If ``query`` is not in ``word_to_id`` a "not found" message is
        printed instead of a ranking.
    """
    if query not in word_to_id:
        print('%s is not found' % query)
        return None

    query_id = word_to_id[query]
    query_vec = word_matrix[query_id]

    vocab_size = len(id_to_word)
    # similarity[i] holds the similarity between word id i and the query.
    similarity = np.zeros(vocab_size)
    for i in range(vocab_size):
        similarity[i] = coss_similarity(word_matrix[i], query_vec)

    # Guard explicitly: counting printed lines up to `top` would never stop
    # for top <= 0 and would dump the whole vocabulary.
    if top <= 0:
        return None

    count = 0
    # argsort on the negated scores yields word ids from most to least similar.
    for i in np.argsort(-1 * similarity):
        if i == query_id:
            continue
        print('{}:{}'.format(id_to_word[i], similarity[i]))
        count += 1
        if count >= top:
            break
    return None
        

In [8]:
# End-to-end example: rank the words most similar to 'tofu' in one sentence.
text = "Today, most people buy tofu at supermarkets, but the significance of tofu in the Japanese diet has not changed a bit."
# preprocess returns (corpus, words, word_to_id, id_to_word); only
# c (word_to_id) and d (id_to_word) are used below.
a,b,c,d = preprocess(text) 
# This create_co_matrix takes the raw text and calls preprocess itself.
matrix = create_co_matrix(text)
# Prints up to five 'word:similarity' lines (default top=5).
most_similar('tofu', c,d,matrix)

significance:0.35355338926744856
people:0.35355338926744856
supermarkets:0.35355338926744856
the:0.24999999937500006
a:0.0
