In [128]:
import matplotlib.pyplot as plt #used for creating static, animated visulaisation like graphs, histograms, etc
import seaborn as sns #it is also used for data visulaisation
import matplotlib as mpl #it is base library used for adusting font size, color, etc
import matplotlib.pylab as pylab #it is collection of function
import numpy as np #used for data manipulation and numerical operations
%matplotlib inline

Data Preparation

In [129]:
import re #used to import regular expression for tokenisation, stemming, lemmatisation

In [130]:
# Example sentence
sentences = """We are about to study the idea of a computational process. Computational processes are abstract beings that inhabit computers. As they evolve, processes manipulate other abstract things called data. The evolution of a process is directed by a pattern of rules called a program. People create programs to direct processes. In effect, we conjure the spirits of the computer with our spells."""

Clean Data

In [131]:
# Remove special characters
sentences = re.sub('[^A-Za-z\s]', '', sentences)

# Remove single-letter words (optional)
sentences = re.sub(r'\b\w\b', '', sentences).strip()

# Lowercase all characters
sentences = sentences.lower()


Vocabulary

In [132]:
# Split sentence into words
words = sentences.split()

# Create vocabulary set
vocab = set(words)  #It remove duplicates words as it is set


In [133]:
vocab_size = len(vocab) #it gives the total number of unique words in vocab
word_to_ix = {word: i for i, word in enumerate(vocab)}
ix_to_word = {i: word for i, word in enumerate(vocab)}
embed_dim= 10
context_size= 2

Context-target pairs

In [134]:
# Prepare the CBOW context-target data
data = []
for i in range(2, len(words) - 2):
    context = [words[i - 2], words[i - 1], words[i + 1], words[i + 2]]
    target = words[i]
    data.append((context, target))

In [135]:
# Print first 5 context-target pairs
print(data[:5])

[(['we', 'are', 'to', 'study'], 'about'), (['are', 'about', 'study', 'the'], 'to'), (['about', 'to', 'the', 'idea'], 'study'), (['to', 'study', 'idea', 'of'], 'the'), (['study', 'the', 'of', 'computational'], 'idea')]


Embeddings and Theta Initialisation

In [136]:
embeddings= np.random.random_sample((vocab_size, embed_dim))
theta = np.random.uniform(-1, 1, (2 * context_size * embed_dim, vocab_size))

Models

In [137]:
def linear(m, theta): #it performs matrix multiplication of m and theta
    w=theta
    return m.dot(w)

In [138]:
def log_softmax(x):
    e_x = np.exp(x - np.max(x))
    return np.log(e_x / e_x.sum(axis=-1, keepdims=True))

In [139]:
def NLLLoss(logs, targets):
    out = logs[range(len(targets)), targets]
    return -out.mean()

In [140]:
def forward(context_idxs, theta):
    m = embeddings[context_idxs].reshape(1, -1)
    n = linear(m, theta)
    o = log_softmax(n)
    return m, n, o

In [141]:
def backward(preds, theta, target_idx):
    m, n, o = preds
    target_vector = np.zeros_like(o)
    target_vector[0, target_idx] = 1
    dlog = o - target_vector
    dw = m.T.dot(dlog)
    return dw

In [142]:
def optimize(theta, grad, lr=0.03):
    theta -= lr * grad
    return theta


In [143]:
# Training loop

In [144]:
epochs = 100
lr = 0.01
for epoch in range(epochs):
    total_loss = 0
    for context, target in data:
        context_idxs = np.array([word_to_ix[w] for w in context])
        target_idx = word_to_ix[target]
        
        # Forward pass
        preds = forward(context_idxs, theta)
        
        # Loss calculation
        _, _, log_probs = preds
        loss = NLLLoss(log_probs, [target_idx])
        total_loss += loss
        
        # Backward pass and optimization
        grad = backward(preds, theta, target_idx)
        theta = optimize(theta, grad, lr)

    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {total_loss / len(data)}")


Epoch 0, Loss: 4.562663621333068
Epoch 10, Loss: 3.7171285190997
Epoch 20, Loss: 3.5160265076750554
Epoch 30, Loss: 3.416271123564801
Epoch 40, Loss: 3.3542948994820905
Epoch 50, Loss: 3.3112002352466083
Epoch 60, Loss: 3.2792036867228056
Epoch 70, Loss: 3.254395790415956
Epoch 80, Loss: 3.234549855081747
Epoch 90, Loss: 3.218287292183718


In [145]:
# Prediction function

In [146]:
def predict(words):
    context_idxs = np.array([word_to_ix[w] for w in words])
    _, _, log_probs = forward(context_idxs, theta)
    word = ix_to_word[np.argmax(log_probs)]
    return word

In [147]:
# Testing the prediction

In [148]:
print(predict(['we', 'are', 'to', 'study']))

about


In [149]:
# Accuracy function

In [150]:
def accuracy():
    wrong = 0
    for context, target in data:
        if predict(context) != target:
            wrong += 1
    return 1 - (wrong / len(data))

In [151]:
print("Accuracy:", accuracy())

Accuracy: 0.9814814814814815


In [152]:
predict(['are','abstract','that','inhabit'])

'beings'