In [1]:
# Import necessary libraries
import numpy as np                              # Used for numerical operations and array handling

from tensorflow.keras.models import Sequential   # Allows building a neural network model layer-by-layer
from tensorflow.keras.layers import Dense, Embedding, Lambda  # Required layers for CBOW model

from tensorflow.keras.preprocessing.text import Tokenizer     # Converts text into sequences of integers
from tensorflow.keras.preprocessing.sequence import pad_sequences  # Helps to pad sequences to equal length

from tensorflow.keras.utils import to_categorical             # Converts target values to one-hot encoding

import tensorflow.keras.backend as K                          # Allows defining custom operations (used for averaging embeddings)


In [2]:
# If dataset is to be imported in exam, we can write:
# Example: loading a text dataset from a file
# with open("dataset.txt", "r") as f:
#     text = f.read()
#
# Or from CSV:
# import pandas as pd
# data = pd.read_csv("file.csv")
# text = " ".join(data['column_name'])

In [3]:
# If no file is loaded, fallback sample sentences:
#text = [
# "Machine learning models can learn word embeddings",
#  "Continuous Bag of Words is one Word2Vec model",
#   "Neural networks are powerful tools for NLP tasks"
#]

In [4]:
# convert words to numerical indices
tokenizer = Tokenizer()

# Fit the tokenizer on the given text data 
tokenizer.fit_on_texts(text)

# Vocabulary size = total unique words + 1 (indexing starts from 1)
vocab_size = len(tokenizer.word_index) + 1


# Convert the text into sequences of integers based on the learned vocabulary
sequences = tokenizer.texts_to_sequences(text)


# Window size: Number of context words to take from each side of the target word
window_size = 2

# Dimension of the embedding vector for each word
embedding_dim = 10


# Lists to store context words (inputs) and target word (output)
contexts = []
targets = []


In [6]:
# Generate Training Data for CBOW Model
for sentence in sequences:                          # Go through each sentence in the dataset
    for i, word in enumerate(sentence):             # For each word in the sentence
        # Define the context window boundaries around the target word
        start = max(0, i - window_size)
        end = i + window_size + 1

        # Collect context words (exclude the target word itself)
        context_words = [sentence[j] for j in range(start, end)
                         if j != i and j < len(sentence)]

        # Pad context if fewer words (for words near the beginning or end of sentence)
        if len(context_words) < window_size * 2:
            context_words = [0]*(window_size*2 - len(context_words)) + context_words
        
        # Store context and corresponding target word
        contexts.append(context_words)
        targets.append(word)

# Convert lists to NumPy arrays for model training
X = np.array(contexts)

# Convert target words to one-hot encoded format
y = to_categorical(targets, num_classes=vocab_size)


In [9]:
# Build the CBOW Model
model = Sequential([
    # Embedding layer converts word indices into dense vector representations
    Embedding(vocab_size, embedding_dim, input_length=window_size * 2),

    # Lambda layer takes the average of all context word embeddings (CBOW concept)
    Lambda(lambda x: K.mean(x, axis=1)),

    # Output layer with softmax to predict the target word from vocabulary
    Dense(vocab_size, activation='softmax')
])

# Compile the model with appropriate loss function and optimizer
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Display model structure
model.summary()

# Train the model with training data
model.fit(X, y, epochs=10, batch_size=16)


Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 4, 10)             240       
                                                                 
 lambda_2 (Lambda)           (None, 10)                0         
                                                                 
 dense_2 (Dense)             (None, 24)                264       
                                                                 
Total params: 504
Trainable params: 504
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1765ebace20>

In [8]:
# Extract the learned word embeddings (weights of the embedding layer)
weights = model.get_weights()[0]

# Open a file to save the word vectors in a readable format
with open("vectors_simple.txt", "w") as f:
    # Write vocabulary size and embedding dimension in the first line (standard format)
    f.write(f"{vocab_size} {embedding_dim}\n")

    # For each word in the vocabulary, write the word followed by its vector values
    for word, i in tokenizer.word_index.items():
        vector = weights[i]                          # Get embedding vector of the word
        vector_str = ' '.join(map(str, vector))      # Convert numeric values to string
        f.write(f"{word} {vector_str}\n")            # Write to file in "word val1 val2 ..." format

# Final message to indicate completion
print("Training complete and embeddings saved to vectors_simple.txt")


Training complete and embeddings saved to vectors_simple.txt
