In [1]:
import numpy as np
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
# Define small corpus
corpus = ["the quick brown fox jumped over the lazy dog"]

In [3]:
# Tokenize the corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1  # Adding 1 for padding token
window_size = 2  # Context window size

In [4]:
# Prepare training data (context, target word pairs)
data, labels = [], []
for sentence in tokenizer.texts_to_sequences(corpus):
    for i in range(window_size, len(sentence) - window_size):
        context = [sentence[j] for j in range(i - window_size, i + window_size + 1) if j != i]
        target = sentence[i]
        data.append(context)
        labels.append(target)

In [5]:
# Convert to numpy arrays
data = np.array(data)
labels = np.array(labels)

In [6]:
# Define and train the model
model = models.Sequential([
    layers.Embedding(input_dim=total_words, output_dim=50, input_length=window_size * 2),
    layers.GlobalAveragePooling1D(),
    layers.Dense(total_words, activation='softmax')
])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(data, labels, epochs=50, verbose=1)



Epoch 1/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.2000 - loss: 2.1889
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 133ms/step - accuracy: 0.4000 - loss: 2.1835
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 149ms/step - accuracy: 0.4000 - loss: 2.1780
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 151ms/step - accuracy: 0.6000 - loss: 2.1726
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 100ms/step - accuracy: 0.6000 - loss: 2.1671
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 119ms/step - accuracy: 0.6000 - loss: 2.1616
Epoch 7/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step - accuracy: 0.6000 - loss: 2.1562
Epoch 8/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 101ms/step - accuracy: 0.6000 - loss: 2.1507
Epoch 9/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

<keras.src.callbacks.history.History at 0x260f180d3d0>

In [7]:
# Get word embeddings and find most similar words to a target
word_embeddings = model.layers[0].get_weights()[0]
target_word = 'quick'  # Example target word

In [8]:
target_idx = tokenizer.word_index.get(target_word)
if target_idx:
    target_embedding = word_embeddings[target_idx - 1]
    similarities = np.dot(word_embeddings, target_embedding) / (np.linalg.norm(word_embeddings, axis=1) * np.linalg.norm(target_embedding))
    similar_idx = similarities.argsort()[-5:][::-1]
    similar_words = [word for word, idx in tokenizer.word_index.items() if idx in similar_idx]
    print(f"Most similar words to '{target_word}': {similar_words}")

Most similar words to 'quick': ['the', 'quick', 'brown', 'fox', 'over']
