In [None]:
Implement the Continuous Bag of Words (CBOW) model for the given text
(document 1) using the steps below:
a. Data preparation
b. Generate training data
c. Train model
d. Output

In [2]:
!pip install nltk

Defaulting to user installation because normal site-packages is not writeable
Collecting nltk
  Downloading nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Collecting click (from nltk)
  Downloading click-8.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2025.11.3-cp313-cp313-win_amd64.whl.metadata (41 kB)
Collecting tqdm (from nltk)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading nltk-3.9.2-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------ --------------------------------- 0.3/1.5 MB ? eta -:--:--
   ------------- -------------------------- 0.5/1.5 MB 1.1 MB/s eta 0:00:01
   -------------------- ------------------- 0.8/1.5 MB 1.2 MB/s eta 0:00:01
   --------------------------- ------------ 1.0/1.5 MB 1.2 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 1.3 MB/s  0:00:01



In [3]:
# Step a: Data Preparation
import nltk
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
import numpy as np

In [9]:
# Fetch the 'punkt' and 'punkt_tab' NLTK data packages (a no-op when they are
# already up to date). Presumably these are the tokenizer models needed by
# word_tokenize below - confirm against the installed NLTK version.
nltk.download('punkt')
# Last expression of the cell: its return value is what the notebook displays.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Vaishnavi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Vaishnavi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [10]:
# Example document; swap in any other string to train on different text.
text = "I love natural language processing and I love deep learning"

# Normalise case first, then split into word-level tokens with NLTK.
lowercased = text.lower()
tokens = word_tokenize(lowercased)

print("Tokens:", tokens)

Tokens: ['i', 'love', 'natural', 'language', 'processing', 'and', 'i', 'love', 'deep', 'learning']


In [11]:
# Build the vocabulary: word -> integer index, plus the reverse lookup.
#
# `tokens` is wrapped in a list so the Tokenizer treats it as ONE document that
# is already tokenized and registers every token verbatim. Passing the bare
# list makes it treat each token as a raw text and re-apply its own punctuation
# filter / whitespace split, so tokens such as "." or "state-of-the-art" never
# get an index under their own spelling and the later `word2idx[token]`
# lookups raise KeyError as soon as the document contains punctuation.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([tokens])
word2idx = tokenizer.word_index
idx2word = {v: k for k, v in word2idx.items()}

# Word indices start at 1; index 0 is left unused (available for padding),
# hence the +1 so every index fits in the embedding / one-hot dimension.
vocab_size = len(word2idx) + 1

print("\nWord to Index Mapping:", word2idx)


Word to Index Mapping: {'i': 1, 'love': 2, 'natural': 3, 'language': 4, 'processing': 5, 'and': 6, 'deep': 7, 'learning': 8}


In [12]:
# Step b: Generate Training Data for CBOW
# Every position that has a full window on both sides becomes one sample:
# the indices of the surrounding words (context) and of the centre word (target).
window_size = 2
data = []

for center in range(window_size, len(tokens) - window_size):
    left_words = tokens[center - window_size:center]
    right_words = tokens[center + 1:center + window_size + 1]
    context_ids = [word2idx[w] for w in left_words + right_words]
    data.append((context_ids, word2idx[tokens[center]]))

# Show the first few samples as words rather than indices.
print("\nSample training data (context → target):")
for context_ids, target_id in data[:3]:
    context_words = [idx2word[idx] for idx in context_ids]
    print(context_words, "→", idx2word[target_id])


Sample training data (context → target):
['i', 'love', 'language', 'processing'] → natural
['love', 'natural', 'processing', 'and'] → language
['natural', 'language', 'and', 'i'] → processing


In [13]:
# Step c: Train CBOW Model using Keras
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, Input, Lambda
from tensorflow.keras.models import Model
from tensorflow.keras.utils import set_random_seed

In [14]:
# Split the (context, target) pairs into model inputs and labels.
# contexts: one row of context-word indices per sample.
contexts = np.array([ctx for ctx, _ in data])
# targets: one-hot rows of width vocab_size built from the target indices.
target_ids = np.array([tgt for _, tgt in data])
targets = to_categorical(target_ids, num_classes=vocab_size)

In [15]:
# Define the CBOW model: embed each context word, average the context
# embeddings, and predict the centre word with a softmax over the vocabulary.
EMBEDDING_DIM = 8   # length of each learned word vector
EPOCHS = 100        # passes over the (tiny) training set
SEED = 42

# Seed Python, NumPy and TensorFlow so weight initialisation and shuffling -
# and therefore the learned embeddings - are repeatable on the same setup.
set_random_seed(SEED)

context_input = Input(shape=(2 * window_size,))
# No `input_length`: the argument is deprecated in Keras 3 and the sequence
# length is already fixed by the Input shape above.
embedding = Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM)(context_input)
# Mean over the context-word axis; built-in equivalent of a Lambda wrapping
# the legacy backend mean(x, axis=1).
avg = GlobalAveragePooling1D()(embedding)
output = Dense(vocab_size, activation='softmax')(avg)

model = Model(inputs=context_input, outputs=output)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model; keep the History object instead of echoing its repr.
history = model.fit(contexts, targets, epochs=EPOCHS, verbose=0)






<keras.src.callbacks.history.History at 0x2b36305a900>

In [16]:
# Step d: Output - Display Word Embeddings
# The first weight array of the model belongs to the Embedding layer;
# row k of that matrix is the vector learned for the word with index k.
weights = model.get_weights()[0]
print("\nWord Embeddings (each row corresponds to a word):")
for token in word2idx:
    print(token, ":", weights[word2idx[token]])


Word Embeddings (each row corresponds to a word):
i : [ 0.06297023  0.06411362 -0.04012642  0.01483477 -0.08171912 -0.11863741
 -0.19943026  0.21736698]
love : [ 0.22274497  0.03360213 -0.09703432 -0.02216818  0.0513423   0.22848998
 -0.0599842   0.02821499]
natural : [ 0.10064117 -0.07367028  0.05822498  0.02383697 -0.15704936  0.00652158
  0.08021242 -0.05821476]
language : [ 0.18608834  0.02572224 -0.19292735 -0.02684356  0.06667159 -0.04226843
 -0.1634473   0.18225907]
processing : [ 0.24213779  0.01662255 -0.05952685  0.00363164  0.00861415  0.18526335
 -0.14050922  0.06213596]
and : [-0.01709026 -0.0552813   0.07804796  0.09277067 -0.07956363 -0.09527668
 -0.02475209 -0.063516  ]
deep : [-0.07836723 -0.05767421 -0.00531642  0.09657529  0.0087047  -0.06305738
 -0.17726074  0.10483325]
learning : [-0.05968617  0.09768292  0.08049244  0.09734058 -0.07399023 -0.12950647
 -0.10908422  0.13061422]
