In [1]:
# ============================================
# Continuous Bag of Words (CBOW) Model - Hybrid Version
# ============================================
# 1️⃣ Import Libraries
import numpy as np
import re
import random


In [8]:
# -------------------------
# 2️⃣ Data Preparation
# -------------------------
# A slightly bigger corpus (like your advanced file)
text = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules called a program.
People create programs to direct processes."""

# Basic text cleaning
text = re.sub('[^A-Za-z]+', ' ', text)   # remove special characters
text = text.lower()                      # lowercase
corpus = text.split()                    # split into words

print("Sample Corpus:", corpus[:20])
print("Total words:", len(corpus))

Sample Corpus: ['we', 'are', 'about', 'to', 'study', 'the', 'idea', 'of', 'a', 'computational', 'process', 'computational', 'processes', 'are', 'abstract', 'beings', 'that', 'inhabit', 'computers', 'as']
Total words: 50


In [9]:
# Build vocabulary
vocab = set(corpus)
vocab_size = len(vocab)
word2idx = {word: i for i, word in enumerate(vocab)}
idx2word = {i: word for word, i in word2idx.items()}

print("Vocabulary Size:", vocab_size)

Vocabulary Size: 36


In [10]:
# -------------------------
# 3️⃣ Generate Training Data
# -------------------------

data = []
for i in range(2, len(corpus) - 2):
    context = [corpus[i - 2], corpus[i - 1], corpus[i + 1], corpus[i + 2]]
    target = corpus[i]
    data.append((context, target))

print("\nSample Training Data:")
for c, t in data[:5]:
    print(f"{c} -> {t}")

print("\nTotal (context, target) pairs:", len(data))


Sample Training Data:
['we', 'are', 'to', 'study'] -> about
['are', 'about', 'study', 'the'] -> to
['about', 'to', 'the', 'idea'] -> study
['to', 'study', 'idea', 'of'] -> the
['study', 'the', 'of', 'a'] -> idea

Total (context, target) pairs: 46


In [11]:
# -------------------------
# 4️⃣ Model Setup (CBOW)
# -------------------------
def one_hot_encoding(word):
    one_hot = np.zeros(vocab_size)
    one_hot[word2idx[word]] = 1
    return one_hot

def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

embedding_dim = 10
W1 = np.random.rand(vocab_size, embedding_dim)
W2 = np.random.rand(embedding_dim, vocab_size)


In [12]:
# -------------------------
# 5️⃣ Training the Model
# -------------------------
def cbow_forward(context_words):
    hidden = np.mean([W1[word2idx[w]] for w in context_words], axis=0)
    output = np.dot(W2.T, hidden)
    prediction = softmax(output)
    return prediction, hidden

lr = 0.01
epochs = 1000

for epoch in range(epochs):
    loss = 0
    for context, target in data:
        y_pred, h = cbow_forward(context)
        target_one_hot = one_hot_encoding(target)
        error = y_pred - target_one_hot

        # Gradients
        dW2 = np.outer(h, error)
        dW1 = np.zeros_like(W1)
        for w in context:
            dW1[word2idx[w]] += np.dot(W2, error)

        # Update weights
        W1 -= lr * dW1
        W2 -= lr * dW2

        loss += -np.sum(target_one_hot * np.log(y_pred + 1e-9))

    if (epoch + 1) % 200 == 0:
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss:.4f}")


Epoch 200/1000, Loss: 24.7859
Epoch 400/1000, Loss: 4.2255
Epoch 600/1000, Loss: 1.8380
Epoch 800/1000, Loss: 1.1076
Epoch 1000/1000, Loss: 0.7731


In [7]:
# -------------------------
# 6️⃣ Output Predictions
# -------------------------
print("\n=== Sample Predictions ===")
for context, target in random.sample(data, 5):
    prediction, _ = cbow_forward(context)
    predicted_word = idx2word[np.argmax(prediction)]
    print(f"Context: {context} | Predicted: {predicted_word} | Actual: {target}")


=== Sample Predictions ===
Context: ['is', 'directed', 'a', 'pattern'] | Predicted: by | Actual: by
Context: ['of', 'a', 'process', 'computational'] | Predicted: computational | Actual: computational
Context: ['the', 'idea', 'a', 'computational'] | Predicted: of | Actual: of
Context: ['idea', 'of', 'computational', 'process'] | Predicted: a | Actual: a
Context: ['program', 'people', 'programs', 'to'] | Predicted: create | Actual: create
