In [1]:
import numpy as np
import tensorflow as tf 
from tensorflow.keras.layers import Dense, Embedding, Lambda
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import string

# Summary

In [2]:
sentences = [
    'The cat sat on the mat',
    'The dog sat on the log',
    'Cats and dogs are great?',
    'Sitting on the log is fun',
    'dogs are great companion',
    'On a sunny day a cat sat on the mat.',
    'Cats and dogs sitting on the log'
]

In [3]:
# word2vec
sentences = [''.join(ch for ch in s if ch not in string.punctuation) for s in sentences]
sentences = [s.lower() for s in sentences]
sentences

['the cat sat on the mat',
 'the dog sat on the log',
 'cats and dogs are great',
 'sitting on the log is fun',
 'dogs are great companion',
 'on a sunny day a cat sat on the mat',
 'cats and dogs sitting on the log']

In [4]:
# word index
word_index = {}

# loop with each sentece
for sentence in sentences:
    # split with blank
    words = sentence.split()
    # each word
    for word in words:
        # check new words
        if word not in word_index:
            # index starts from 1
            word_index[word] = len(word_index) + 1
            
# +1 for a padding token
vocab_size = len(word_index) + 1
# create CBOW pair 
cbow_pairs = []
window_size = 2

for sentence in sentences:
    # chunk to words -> to list
    words = sentence.split()
    
    for i, word in enumerate(words):
        # emply context list
        context = [0] * (2 * window_size)
        # target word
        target = word_index[word]
        
        context_words = []
        # 타켓 단어 주변의 window_size 기준으로 순회
        for j in range(max(0, i - window_size), min(i + window_size + 1, len(words))):
            # if not target word
            if j != i:
                context_words.append(word_index[words[j]])

        # 컨텍스트 단어를 컨텍스트 리스트에 붙여줌
        context[:len(context_words)] = context_words
        # update all pairs
        cbow_pairs.append((context, target))

X, y = zip(*cbow_pairs)
X = np.array(X)
y = to_categorical(y, num_classes=vocab_size)

embed_size = 32

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=window_size*2))
model.add(Lambda(lambda x: tf.reduce_mean(x, axis=1), output_shape=(embed_size,)))
model.add(Dense(vocab_size, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam')

# fit the model
model.fit(X, y, epochs=10, verbose=1)

Epoch 1/10




[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 3.0034  
Epoch 2/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 2.9976   
Epoch 3/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 2.9919 
Epoch 4/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 2.9873
Epoch 5/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 999us/step - loss: 2.9830
Epoch 6/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 999us/step - loss: 2.9785
Epoch 7/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 2.9730  
Epoch 8/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1000us/step - loss: 2.9676
Epoch 9/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 2.9631 
Epoch 10/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1000us/step - loss: 2.9585


<keras.src.callbacks.history.History at 0x1fdf32d5d80>

In [5]:
word_index

{'the': 1,
 'cat': 2,
 'sat': 3,
 'on': 4,
 'mat': 5,
 'dog': 6,
 'log': 7,
 'cats': 8,
 'and': 9,
 'dogs': 10,
 'are': 11,
 'great': 12,
 'sitting': 13,
 'is': 14,
 'fun': 15,
 'companion': 16,
 'a': 17,
 'sunny': 18,
 'day': 19}

In [6]:
# call fitted embedding layers
embeddings = model.get_layer('embedding').get_weights()[0]

# most similar word
def find_similar_words(word, embeddings, word_index, index_word, num_similar=5):
    # word index
    word_idx = word_index[word]
    
    # embedding the word
    word_embedding = embeddings[word_idx]
    
    # calculate similarity
    dot_products = np.dot(embeddings, word_embedding)
    norms = np.linalg.norm(embeddings, axis=1) * np.linalg.norm(word_embedding)
    similarities = dot_products / norms
    
    # get top n words
    top_indices = np.argsort(similarities)[::-1][:num_similar + 1]
    
    # words info
    similar_words = [index_word[i] for i in top_indices if i != word_idx]
    
    return similar_words

In [7]:
# index-word dicstionary using reverse-mapping 
index_word = {v: k for k, v in word_index.items()}

word = 'dog'
similar_words = find_similar_words(word, embeddings, word_index, index_word)
print(f"Words simiar to {word}:", similar_words)

Words simiar to dog: ['sat', 'log', 'great', 'dogs', 'mat']


# Step by Step

In [24]:
sentences = [''.join(ch for ch in s if ch not in string.punctuation) for s in sentences]
print(sentences)

['the cat sat on the mat', 'the dog sat on the log', 'cats and dogs are great', 'sitting on the log is fun', 'dogs are great companion', 'on a sunny day a cat sat on the mat', 'cats and dogs sitting on the log']


In [26]:
sentences = [s.lower() for s in sentences]
print(sentences)

['the cat sat on the mat', 'the dog sat on the log', 'cats and dogs are great', 'sitting on the log is fun', 'dogs are great companion', 'on a sunny day a cat sat on the mat', 'cats and dogs sitting on the log']


In [27]:
# word index
word_index = {}
for sentence in sentences:
    # split a sentence
    words = sentence.split()
    
    for word in words:
        # add new words
        if word not in word_index:
            # index starts from 1
            word_index[word] = len(word_index) + 1
            
word_index

{'the': 1,
 'cat': 2,
 'sat': 3,
 'on': 4,
 'mat': 5,
 'dog': 6,
 'log': 7,
 'cats': 8,
 'and': 9,
 'dogs': 10,
 'are': 11,
 'great': 12,
 'sitting': 13,
 'is': 14,
 'fun': 15,
 'companion': 16,
 'a': 17,
 'sunny': 18,
 'day': 19}

In [28]:
# add 1 for padding token
vocab_size = len(word_index) + 1

In [33]:
# create CBOW pair
cbow_pairs = []
window_size = 2

for sentence in sentences:
    # split 
    words = sentence.split()
    # 단어를 대상으로, padding부터
    for i, word in enumerate(words):
        # empty context list
        context = [0] * (2 * window_size)
        # target word
        target = word_index[word]
        
        context_words = []
        # 타겟 단어 주변의 window_size 기준으로 순회
        for j in range(max(0, i - window_size), min(i + window_size +1, len(words))):
            # if not target word
            if j != i:
                context_words.append(word_index[words[j]])
                
        # append context words to the context list
        context[:len(context_words)] = context_words
        # update total CBOW pairs
        cbow_pairs.append((context, target))
cbow_pairs

[([2, 3, 0, 0], 1),
 ([1, 3, 4, 0], 2),
 ([1, 2, 4, 1], 3),
 ([2, 3, 1, 5], 4),
 ([3, 4, 5, 0], 1),
 ([4, 1, 0, 0], 5),
 ([6, 3, 0, 0], 1),
 ([1, 3, 4, 0], 6),
 ([1, 6, 4, 1], 3),
 ([6, 3, 1, 7], 4),
 ([3, 4, 7, 0], 1),
 ([4, 1, 0, 0], 7),
 ([9, 10, 0, 0], 8),
 ([8, 10, 11, 0], 9),
 ([8, 9, 11, 12], 10),
 ([9, 10, 12, 0], 11),
 ([10, 11, 0, 0], 12),
 ([4, 1, 0, 0], 13),
 ([13, 1, 7, 0], 4),
 ([13, 4, 7, 14], 1),
 ([4, 1, 14, 15], 7),
 ([1, 7, 15, 0], 14),
 ([7, 14, 0, 0], 15),
 ([11, 12, 0, 0], 10),
 ([10, 12, 16, 0], 11),
 ([10, 11, 16, 0], 12),
 ([11, 12, 0, 0], 16),
 ([17, 18, 0, 0], 4),
 ([4, 18, 19, 0], 17),
 ([4, 17, 19, 17], 18),
 ([17, 18, 17, 2], 19),
 ([18, 19, 2, 3], 17),
 ([19, 17, 3, 4], 2),
 ([17, 2, 4, 1], 3),
 ([2, 3, 1, 5], 4),
 ([3, 4, 5, 0], 1),
 ([4, 1, 0, 0], 5),
 ([9, 10, 0, 0], 8),
 ([8, 10, 13, 0], 9),
 ([8, 9, 13, 4], 10),
 ([9, 10, 4, 1], 13),
 ([10, 13, 1, 7], 4),
 ([13, 4, 7, 0], 1),
 ([4, 1, 0, 0], 7)]

In [37]:
# CBOW pair 생성
cbow_pairs = []
window_size = 2

# 각 문장에 대해 다음을 반복
for sentence in sentences:
    print(f"Sentence: {sentence}")
    # 단어 단위로 쪼갬 -> 리스트화
    words = sentence.split()
    print(f"words: {words}")
    # 단어를 대상으로, padding부터
    for i, word in enumerate(words):
        # 빈 컨텍스트 리스트 생성
        context = [0] * (2 * window_size)
        print(f"context: {context}")
        # target 단어 지정
        target = word_index[word]
        print(f"target: {target}")

        context_words = []
        # 타겟 단어 주변의 window_size 기준으로 순회
        for j in range(max(0, i - window_size), min(i + window_size + 1, len(words))):
            # 타겟 단어가 아니면
            if j != i:
                context_words.append(word_index[words[j]])
                print(f"context_words: {context_words}")

        # 컨텍스트 단어를 컨텍스트 리스트에 적절히 붙여줌
        context[:len(context_words)] = context_words
        # 전체 페어에 업데이트
        cbow_pairs.append((context, target))
        break
    break
print(f"cbow_pairs: {cbow_pairs}")

Sentence: the cat sat on the mat
words: ['the', 'cat', 'sat', 'on', 'the', 'mat']
context: [0, 0, 0, 0]
target: 1
context_words: [2]
context_words: [2, 3]
cbow_pairs: [([2, 3, 0, 0], 1)]


In [38]:
# convert CBOW pairs to numy arrays
X, y = zip(*cbow_pairs)
X = np.array(X)
y = to_categorical(y, num_classes=vocab_size)
X

array([[2, 3, 0, 0]])

In [39]:
y

array([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.]])

In [44]:
embed_size = 100

In [45]:
# CBOW model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=window_size*2))
model.add(Lambda(lambda x: tf.reduce_mean(x, axis=1), output_shape=(embed_size,)))
model.add(Dense(vocab_size, activation='softmax'))

# compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam')

# fit the model
model.fit(X, y, epochs=100, verbose=1)

Epoch 1/100




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 304ms/step - loss: 2.9919
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 2.9740
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 2.9562
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 2.9383
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 2.9203
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - loss: 2.9023
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - loss: 2.8842
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 2.8659
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 2.8475
Epoch 10/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 2.8290
Epoch 11/100
[1m1/1

<keras.src.callbacks.history.History at 0x24fe5fad3f0>

In [46]:
model.summary()

In [47]:
# call fitted embedding layer
embeddings = model.get_layer('embedding_3').get_weights()[0]

# find similar words
def find_similar_words(word, embeddings, word_index, index_word, num_similar=5):
    # 조회하려는 단어 인덱스
    word_idx = word_index[word]
    
    # 해당 단어의 임베딩
    word_embedding = embeddings[word_idx]
    
    # 다른 단어들과의 임베딩 유사도 계산
    dot_products = np.dot(embeddings, word_embedding)
    norms = np.linalg.norm(embeddings, axis=1) * np.linalg.norm(word_embedding)
    similarities = dot_products / norms
    
    # top n words
    top_indices = np.argsort(similarities)[::-1][:num_similar + 1]
    
    # word info
    similar_words = [index_word[i] for i in top_indices if i != word_idx]
    
    return similar_words

In [48]:
# index-word dictionary
index_word = {v: k for k, v in word_index.items()}

# 유사한 단어 조회
word = 'dogs'
similar_words = find_similar_words(word, embeddings, word_index, index_word)
print(f"Words similar to {word}:", similar_words)

Words similar to dogs: ['companion', 'fun', 'is', 'and', 'a']


# gensim

In [8]:
import nltk
nltk.download('brown')
from nltk.corpus import brown
sentences = brown.sents()

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\ahn28\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [9]:
from gensim.models import Word2Vec

# skip-gram
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, sg=1)

In [10]:
embeddings = model.wv.vectors

In [11]:
embeddings[0]

array([ 0.28386372,  0.16857015,  0.02566925,  0.12797242, -0.16663218,
       -0.18080568,  0.4037561 ,  0.74926656, -0.5022986 , -0.35059214,
        0.148686  ,  0.14265426,  0.06597403, -0.0648908 , -0.13575295,
       -0.25068554,  0.26755822,  0.35911483, -0.1750535 , -0.68587005,
        0.24349415,  0.34507617,  0.7936737 ,  0.12782378, -0.02200755,
       -0.09693623, -0.40120584, -0.06938685, -0.25231257, -0.05461184,
        0.35753107, -0.03850134,  0.28087682, -0.03143716,  0.06614433,
        0.01403842,  0.43434635, -0.34011945, -0.00513735, -0.02668072,
        0.10534933, -0.02426742,  0.1346769 ,  0.28468725,  0.21306434,
       -0.29480767, -0.21846709,  0.02209007, -0.254086  ,  0.5008504 ,
       -0.43472016, -0.09542044, -0.43034324, -0.13490951, -0.26371765,
       -0.34530982,  0.26402265, -0.14021672, -0.18306564,  0.19043384,
       -0.03649803, -0.07938743,  0.06226121,  0.03157981, -0.7338423 ,
        0.3755388 ,  0.64164317,  0.01932277, -0.07967233, -0.14

In [12]:
word = 'college'
similar_words = model.wv.most_similar(word)
print(f"Words similar to '{word}':", similar_words)

Words similar to 'college': [('university', 0.9198744297027588), ('mature', 0.9194548726081848), ('student', 0.9132493734359741), ('finding', 0.9063502550125122), ('friendly', 0.8989046812057495), ('academic', 0.8983222842216492), ('spending', 0.8981701731681824), ('romantic', 0.896339476108551), ('childhood', 0.8946845531463623), ('school', 0.8939958810806274)]


In [14]:
from scipy.spatial.distance import cosine

def sentence_avg_embedding(sentence, embeddings, wv_index):
    words = sentence.split()
    avg_embedding = np.mean([embeddings[wv_index[word]] for word in words if word in wv_index], axis=0)
    return avg_embedding

sentences = [
    'a sweater with diamond pattern',
    'a white top of jewerly image',
    'black jeans with dark stripes'
]

avg_embeddings = [sentence_avg_embedding(sentence, embeddings, model.wv.key_to_index) for sentence in sentences]

similarities = [
    cosine(avg_embeddings[0], avg_embeddings[1]),
    cosine(avg_embeddings[0], avg_embeddings[2]),
    cosine(avg_embeddings[1], avg_embeddings[2]),
]

print('Cosine similarities: ', similarities)

Cosine similarities:  [0.09836462932179657, 0.11110471945308831, 0.17547510167991065]
