## 1. Import required libraries

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

## 2. Tokenize data and build vocabulary
The `Tokenizer` class is used to tokenize the text and build a vocabulary. An out-of-vocabulary token (`<OOV>`) is specified so that words not present in the vocabulary are mapped to a dedicated index instead of being dropped.

In [None]:
# Sample corpus: each string is one document; context windows are built per
# document and never span two of them.
corpus = [
    "the quick brown fox jumps",
    "over the lazy dog",
]


# Tokenize and create vocabulary
# Words unseen during fit_on_texts are mapped to the '<OOV>' entry.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(corpus)
# word_index maps word -> integer index; it is reused later to look up
# embedding rows and to build the reverse index_to_word mapping.
word_index = tokenizer.word_index
# +1 leaves room for index 0, which this notebook treats as the padding token
# (find_similar_words skips it) and which is not a key of word_index.
vocab_size = len(word_index) + 1

## 3. Preprocess data to generate target and context words within a specified window


1.   **texts_to_sequences** method of the tokenizer is used to convert the input sentences in the corpus into sequences of integers. Each unique word in the corpus is assigned a unique integer index. The resulting sequences variable is a list of lists, where each inner list represents the sequence of word indices.
2.   For each **target word**, a context window is defined by selecting the words within a certain range around the **target word**. The **left_window** and **right_window** variables determine the boundaries of the context window. The **context words** are then extracted from the document by slicing it based on these boundaries.



In [None]:
import numpy as np

# Window size handed to generate_data; the padded context width and the
# Embedding input_length are both context_window * 2.
context_window = 2

def generate_data(corpus, window_size, tokenizer):
    """Build CBOW training pairs (padded context words -> target word).

    Every word of every document becomes a target; its context is the up to
    ``window_size`` words on its left plus the up to ``window_size`` words on
    its right, within the same document.

    Args:
        corpus: Texts accepted by ``tokenizer.texts_to_sequences``.
        window_size: Number of context words taken on EACH side of the target.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.

    Returns:
        Tuple ``(contexts, targets)`` of NumPy arrays. ``contexts`` has one row
        per target, padded by ``pad_sequences`` to ``2 * window_size`` columns;
        ``targets`` holds the matching target word indices.
    """
    sequences = tokenizer.texts_to_sequences(corpus)
    contexts, targets = [], []
    for doc in sequences:
        doc_len = len(doc)
        for position, target_word in enumerate(doc):
            left_window = max(0, position - window_size)
            # Slice ends are exclusive, so the right bound needs the extra +1;
            # without it only window_size - 1 words are taken on the right.
            right_window = min(position + window_size + 1, doc_len)
            context_words = doc[left_window:position] + doc[position + 1:right_window]

            # A word with no neighbours (single-word document, or
            # window_size == 0) has nothing to be predicted from; skip it
            # rather than emit an all-padding context row.
            if not context_words:
                continue

            contexts.append(context_words)
            targets.append(target_word)

    # Pad to the width implied by the window_size ARGUMENT, not the notebook
    # global, so the function is correct for any window it is called with.
    contexts = pad_sequences(contexts, maxlen=window_size * 2)

    return np.array(contexts), np.array(targets)

# Build the training arrays from the sample corpus: X_train holds the padded
# context word indices, y_train the target word index for each row.
X_train, y_train = generate_data(corpus, context_window, tokenizer)

## 4.	Define our model


*   **Embedding Layer:** This layer is responsible for creating word embeddings. It takes **integer word indices** (not one-hot vectors) as input and maps each index to a dense vector of fixed size (**embedding_dim**). The input_dim is set to vocab_size, which is the size of the vocabulary, and input_length is set to `context_window * 2`, the length of the padded context sequences.
*   **GlobalAveragePooling1D Layer:** This layer calculates the average of all the embeddings in the sequence dimension. It helps reduce the dimensionality of the data before passing it to the next layer.
*   **Dense Layer:** This is the output layer with a number of units equal to the vocabulary size (**vocab_size**). The activation function is set to 'softmax', which is appropriate for a multi-class classification problem. It outputs a probability distribution over the vocabulary, indicating the likelihood of each word being the target word.


In [None]:
# Size of each learned word vector.
embedding_dim = 100

# CBOW-style classifier: embed the context word indices, average them into a
# single vector, then predict the target word with a softmax over the vocabulary.
model = tf.keras.Sequential([
    # mask_zero=True marks index 0 (the padding value) as masked, so the
    # pooling layer below averages only the real context words instead of
    # diluting short contexts with the padding embedding.
    # NOTE: every context row must contain at least one non-zero index;
    # an all-padding row would be averaged over zero steps.
    tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        input_length=context_window*2,
        mask_zero=True,
    ),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(vocab_size, activation='softmax')
])

# Targets are integer word indices (not one-hot), hence the sparse loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Print the layer stack with output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4, 100)            1000      
                                                                 
 global_average_pooling1d (  (None, 100)               0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 10)                1010      
                                                                 
Total params: 2010 (7.85 KB)
Trainable params: 2010 (7.85 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Training hyperparameters.
epochs = 50
batch_size = 16

# Train on the (context, target) arrays produced by generate_data.
# The returned History object is not stored; as the last expression of the
# cell, its repr is what the notebook displays after the epoch log.
model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x791c7db57760>

## 7. Test the learned word embeddings



*   **get_word_vector** function. This function retrieves the learned embedding vector for a given word from the **learned_embeddings** matrix.
*   Once the target word vector is obtained, the function computes the **cosine similarity** between the target word vector and all other word vectors in the embedding space. This is done by taking the **dot product** of the **target vector** with each row of the **learned_embeddings** matrix. The result is a vector of cosine similarity scores between the target word and all other words in the vocabulary.
*   The indices of the words with the **highest cosine similarity** scores are then identified. The np.argsort function returns the indices that would sort the distances array in **ascending order**. By taking the **last top_n elements**, we get the indices of the **top_n** words with the highest similarity scores.
*   **The indices** are used to retrieve the actual words from the index_to_word dictionary, excluding the **padding token** (index 0). The result is a list of words that are most similar to the input word, based on the learned word embeddings.


In [None]:
# learned embeddings
# Layer 0 of the Sequential model is the Embedding layer; its first weight
# array is the embedding matrix, one row per vocabulary index.
learned_embeddings = model.layers[0].get_weights()[0]

# Reverse lookup (integer index -> word) for turning embedding rows back into words.
index_to_word = {i: w for w, i in word_index.items()}

def get_word_vector(word):
    """Look up the learned embedding row for ``word``.

    Words missing from ``word_index`` resolve to the row of the '<OOV>' token.
    """
    oov_row = word_index['<OOV>']
    row = word_index.get(word, oov_row)
    return learned_embeddings[row]

# Query word used by the examples below.
word = 'fox'
# Example: get the learned embedding vector for the query word
word_vector = get_word_vector(word)

# Find similar words to a given word
def find_similar_words(word, top_n=2):
    """Return the ``top_n`` vocabulary words most cosine-similar to ``word``.

    Args:
        word: Query word; unknown words fall back to the '<OOV>' embedding
            (via ``get_word_vector``).
        top_n: Maximum number of words to return; values <= 0 give ``[]``.

    Returns:
        List of words ordered from most to least similar. The padding row
        (index 0) and the query word itself are never included.
    """
    target_vector = get_word_vector(word)

    # Divide the dot products by the vector norms so the scores are true
    # cosine similarities; a raw dot product favours large-norm embeddings.
    norms = np.linalg.norm(learned_embeddings, axis=1) * np.linalg.norm(target_vector)
    # Guard against a zero-norm row so the division cannot produce inf/NaN.
    similarities = (learned_embeddings @ target_vector) / np.maximum(norms, np.finfo(float).eps)

    # Drop padding and the query word BEFORE truncating, so the caller really
    # gets top_n neighbours instead of the query word plus top_n - 1 others.
    query_index = word_index.get(word, word_index['<OOV>'])
    excluded = {0, query_index}

    # argsort is ascending; reverse it to rank from most to least similar.
    ranked_indices = np.argsort(similarities)[::-1]
    similar_words = [
        index_to_word[int(index)]
        for index in ranked_indices
        if int(index) not in excluded
    ]
    return similar_words[:max(top_n, 0)]

# Example: find the words most similar to the query word
similar_words = find_similar_words(word, 3)
print(f"Similar words to {word}:", similar_words)

Similar words to fox: ['brown', 'quick', 'fox']
