In [1]:
# Example sentence used as the raw input for the tokenization steps that follow.
text = 'You say goodbye and I say hello.'

In [2]:
# Lower-case the sentence so that differently cased spellings of a word compare equal.
text = text.lower()
text

'you say goodbye and i say hello.'

In [3]:
# Detach every period from the preceding word so it becomes its own token.
# Any ' .' that is already separated is collapsed back to '.' first, which makes
# the cell idempotent: re-running it no longer piles up extra spaces before the
# period (those would turn into empty tokens when splitting on ' ').
text = text.replace(' .', '.').replace('.', ' .')
text

'you say goodbye and i say hello .'

In [4]:
# Tokenize on runs of whitespace. Unlike split(' '), split() never produces
# empty-string tokens when two spaces are adjacent or the text starts/ends
# with a blank; for the sentence above the result is unchanged.
words = text.split()
words

['you', 'say', 'goodbye', 'and', 'i', 'say', 'hello', '.']

In [5]:
import re

In [6]:
# Raw string: '\w' inside a normal string literal is an invalid escape sequence
# (DeprecationWarning since Python 3.6, SyntaxWarning since 3.12).
# The capturing group makes re.split keep the matched words in the result,
# interleaved with the separators between them.
re.split(r'(\w+)', text)

['',
 'you',
 ' ',
 'say',
 ' ',
 'goodbye',
 ' ',
 'and',
 ' ',
 'i',
 ' ',
 'say',
 ' ',
 'hello',
 ' .']

In [7]:
# Two lookup tables kept in lockstep: word -> ID and ID -> word.
word_to_id = {}
id_to_word = {}

for word in words:
    # A word seen earlier already owns an ID; leave both tables untouched.
    if word in word_to_id:
        continue
    # The current table size is the next unused ID, so IDs follow the order
    # in which words first appear.
    new_id = len(word_to_id)
    id_to_word[new_id] = word
    word_to_id[word] = new_id

In [8]:
# Display the word -> ID table.
word_to_id

{'you': 0, 'say': 1, 'goodbye': 2, 'and': 3, 'i': 4, 'hello': 5, '.': 6}

In [9]:
# Display the ID -> word table.
id_to_word

{0: 'you', 1: 'say', 2: 'goodbye', 3: 'and', 4: 'i', 5: 'hello', 6: '.'}

In [10]:
# Look up the word stored under ID 2.
id_to_word[2]

'goodbye'

In [11]:
# Look up the ID assigned to the word 'hello'.
word_to_id['hello']

5

In [12]:
import numpy as np

# Encode the token list as the matching sequence of integer word IDs
# (repeated words map to the same ID).
corpus = [word_to_id[token] for token in words]
corpus

[0, 1, 2, 3, 4, 1, 5, 6]

In [13]:
# Convert the ID list to a NumPy array. Note that this rebinds `corpus` from a
# list to an ndarray.
corpus = np.array(corpus)
corpus

array([0, 1, 2, 3, 4, 1, 5, 6])

In [14]:
import sys
# Put the parent directory on the module search path. Presumably this is what
# makes the `common` package importable in the cells below — confirm against
# the repository layout.
sys.path.append('..')

In [15]:
from common.util import preprocess

# Run the library helper on a new sentence; it returns the ID array plus both
# lookup tables, replacing the hand-built ones from the cells above.
# NOTE(review): "Sumer" looks like a typo for "Summer"; left unchanged because
# the recorded outputs of the following cells depend on this exact string.
corpus, word_to_id, id_to_word = preprocess("Sumer is very cold.")

In [16]:
# Display the ID array returned by preprocess.
corpus

array([0, 1, 2, 3, 4])

In [17]:
# Display the word -> ID table returned by preprocess.
word_to_id

{'sumer': 0, 'is': 1, 'very': 2, 'cold': 3, '.': 4}

In [18]:
# Display the ID -> word table returned by preprocess.
id_to_word

{0: 'sumer', 1: 'is', 2: 'very', 3: 'cold', 4: '.'}

In [19]:
# print() shows the array's str() form (no "array(...)" wrapper, no commas).
print(corpus)

[0 1 2 3 4]


In [20]:
from common.util import create_co_matrix

In [21]:
# Derive the vocabulary size from the lookup table instead of hard-coding 5, so
# the call stays correct if the sentence passed to preprocess changes.
# The last argument (1) is presumably the context window size — confirm against
# common.util.create_co_matrix.
co_matrix = create_co_matrix(corpus, len(word_to_id), 1)

In [22]:
# Display the matrix returned by create_co_matrix.
co_matrix

array([[0, 1, 0, 0, 0],
       [1, 0, 1, 0, 0],
       [0, 1, 0, 1, 0],
       [0, 0, 1, 0, 1],
       [0, 0, 0, 1, 0]], dtype=int32)

In [23]:
# Select the matrix row indexed by the ID of 'sumer'.
print(co_matrix[word_to_id['sumer']])

[0 1 0 0 0]


In [24]:
# enumerate() returns a lazy iterator, so the cell output is only the object's
# repr; iterate over it to see the (index, value) pairs.
enumerate(corpus)

<enumerate at 0x7f7cf981a840>

In [25]:
# Walk the corpus, printing each position `i` next to the word ID `j` found there.
for i, j in enumerate(corpus):
    print(i, j)

0 0
1 1
2 2
3 3
4 4


In [26]:
# Number of entries in the corpus array.
len(corpus)

5

In [27]:
from common.util import cos_similarity

In [30]:
# Rebuild the corpus and lookup tables from the original example sentence.
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)
# No window argument is passed here, so create_co_matrix falls back to whatever
# default it defines.
C = create_co_matrix(corpus, vocab_size)

# Compare the matrix rows belonging to 'you' and 'i'.
c0 = C[word_to_id['you']]
c1 = C[word_to_id['i']]
print(cos_similarity(c0, c1))

0.7071067691154799


In [32]:
def most_similar(query, word_to_id, id_to_word, word_matrix, top=5):
    """Print the words whose vectors are most similar to the vector of `query`.

    Args:
        query: Word to look up.
        word_to_id: Mapping from word to integer ID.
        id_to_word: Mapping from integer ID to word.
        word_matrix: Indexable by word ID; each row is passed to
            `cos_similarity` as that word's vector.
        top: Maximum number of result rows to print. Zero or a negative
            value prints the query header only.

    Returns:
        None. Results are written to stdout, one ' word: score ' row per hit,
        ordered from highest to lowest similarity. If `query` is not in
        `word_to_id`, a 'Cannot find ...' message is printed instead.
    """
    if query not in word_to_id:
        print('Cannot find %s.' % query)
        return

    print('\n[query] ' + query)
    query_id = word_to_id[query]
    query_vec = word_matrix[query_id]

    # Score every vocabulary entry against the query vector.
    vocab_size = len(id_to_word)
    similarity = np.zeros(vocab_size)
    for i in range(vocab_size):
        similarity[i] = cos_similarity(word_matrix[i], query_vec)

    # argsort is ascending, so sorting the negated scores walks the IDs from
    # most to least similar.
    count = 0
    for i in (-1 * similarity).argsort():
        # Check the limit *before* printing: the original checked afterwards,
        # which still emitted one row when top was 0 or negative.
        if count >= top:
            return
        # The query itself is not reported as its own neighbour.
        if id_to_word[i] == query:
            continue
        print(' %s: %s ' % (id_to_word[i], similarity[i]))
        count += 1

In [33]:
# Rank the other vocabulary words by similarity to 'you', using the rows of C
# as word vectors and printing at most 5 results.
most_similar('you', word_to_id, id_to_word, C, 5)


[query] you
 goodbye: 0.7071067691154799 
 i: 0.7071067691154799 
 hello: 0.7071067691154799 
 say: 0.0 
 and: 0.0 
