In [6]:
import sys
sys.path.append('..')
import numpy as np

In [7]:
def preprocessing(text):
    
    text = text.lower()
    text = text.replace('.', ' .')
    words = text.split(' ')
    word_to_id = {}
    id_to_word = {}
    
    for word in words:
        if word not in word_to_id:
            new_id = len(word_to_id)
            word_to_id[word] = new_id
            id_to_word[new_id] = word
        
    corpus = np.array([word_to_id[w] for w in words])
    
    return corpus, word_to_id, id_to_word

In [11]:
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocessing(text)

print("coupus:     ", corpus)
print("word_to_id: ", word_to_id)
print("id_to_word: ", id_to_word)

coupus:      [0 1 2 3 4 1 5 6]
word_to_id:  {'you': 0, 'say': 1, 'goodbye': 2, 'and': 3, 'i': 4, 'hello': 5, '.': 6}
id_to_word:  {0: 'you', 1: 'say', 2: 'goodbye', 3: 'and', 4: 'i', 5: 'hello', 6: '.'}


In [13]:
def create_to_matrix(corpus, vocab_size, window_size=1):
    corpus_size = len(corpus)
    co_matrix = np.zeros((vocab_size, vocab_size), dtype=np.int32)
    
    for idx, word_id in enumerate(corpus):
        for i in range(1, window_size+1):
            left_idx = idx - i
            right_idx = idx + i
            
            if left_idx > 0:
                left_word_id = corpus[left_idx]
                co_matrix[word_id, left_word_id] += 1
                
            if right_idx < corpus_size:
                right_word_id = corpus[right_idx]
                co_matrix[word_id, right_word_id] += 1
                
    return co_matrix

In [21]:
vocab_size = len(word_to_id)
co_matrix = create_to_matrix(corpus, vocab_size)

co_matrix

array([[0, 1, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 1, 1, 0],
       [0, 1, 0, 1, 0, 0, 0],
       [0, 0, 1, 0, 1, 0, 0],
       [0, 1, 0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 1, 0]])

In [29]:
def cos_similiarity(x, y, eps=1e-8):
    
    # 분모 0이 되지않게 하기 위해서 epsilon을 더해준다
    nx = x / np.sqrt(np.sum(x**2) + eps)
    ny = y / np.sqrt(np.sum(y**2) + eps)
    
    return np.dot(nx, ny)

In [31]:
C = co_matrix

c0 = C[word_to_id['you']]
c1 = C[word_to_id['say']]
c2 = C[word_to_id['goodbye']]
c3 = C[word_to_id['and']]
c4 = C[word_to_id['i']]
c5 = C[word_to_id['hello']]
c6 = C[word_to_id['.']]

print(c0)
print(c4)
print("cos_similiarity(c0, c1): ", cos_similiarity(c0, c4))

[0 1 0 0 0 0 0]
[0 1 0 1 0 0 0]
cos_similiarity(c0, c1):  0.7071067758832467
