In [None]:
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt


nltk.download("punkt")

In [None]:
nltk.data.path.append('.')

In [28]:
path = "shakespeare.txt"
with open(path, 'r') as file:
     data: str = file.read()

data = re.sub(r'[,!?;-]', '.',data)
data = word_tokenize(data)
data = [token.lower() for token in data if token.isalpha() or token == '.']

In [29]:
print(f"The Number of tokens = {len(data)} \n {data[:20]}")

The Number of tokens = 1084877 
 ['this', 'is', 'the', 'etext', 'file', 'presented', 'by', 'project', 'gutenberg', '.', 'and', 'is', 'presented', 'in', 'cooperation', 'with', 'world', 'library', '.', 'inc']


In [30]:
fdist = nltk.FreqDist(word for word in data)
print("Size of vocabulary: ",len(fdist) )
print("Most frequent tokens: ",fdist.most_common(20) )

Size of vocabulary:  22892
Most frequent tokens:  [('.', 197374), ('the', 27597), ('and', 26724), ('i', 22059), ('to', 19188), ('of', 18180), ('a', 14654), ('you', 13828), ('my', 12465), ('that', 11503), ('is', 10995), ('in', 10990), ('not', 9481), ('for', 8241), ('with', 7994), ('me', 7769), ('it', 7718), ('be', 7081), ('your', 6873), ('his', 6851)]


In [31]:
def get_dict(data: list[str]) -> tuple:
    words = sorted(list(set(data)))
    n = len(words)
    idx = 0
    word2Ind = {}
    Ind2word = {}
    for k in words:
        word2Ind[k] = idx
        Ind2word[idx] = k
        idx += 1
    return word2Ind, Ind2word


word2Ind, Ind2word = get_dict(data)
V = len(word2Ind)
print("Size of vocabulary: ", V)

Size of vocabulary:  22892


In [32]:
print("Index of the word 'king' :  ",word2Ind['king'] )
print("Word which has index 2743:  ",Ind2word[2743] )

Index of the word 'king' :   11116
Word which has index 2743:   butcherly


In [33]:
def init_parameters(N: int, V: int, random_seed: int= 1) -> dict:
    np.random.seed(random_seed)
    parameters = {}

    parameters["W1"] = np.random.rand(N, V)
    parameters["b1"] = np.zeros(shape=(N, 1))

    parameters["W2"] = np.random.rand(V, N)
    parameters["b2"] = np.zeros(shape=(V, 1))

    return parameters

In [34]:
tmp_N = 4
tmp_V = 10
params = init_parameters(tmp_N,tmp_V)
assert params['W1'].shape == ((tmp_N,tmp_V))
assert params['W2'].shape == ((tmp_V,tmp_N))
print(f"W1 shape: {params['W1'].shape}")
print(f"W2 shape: {params['W2'].shape}")
print(f"b1 shape: {params['b1'].shape}")
print(f"b2 shape: {params['b2'].shape}")

W1 shape: (4, 10)
W2 shape: (10, 4)
b1 shape: (4, 1)
b2 shape: (10, 1)


In [35]:
from typing import List, Union

def softmax(z: Union[float, List[float]]) -> Union[float, List[float]]:
    y = np.exp(z) / np.sum(np.exp(z), 0, keepdims=True)

    return y

def sigmoid(z: Union[float, List[float]]) -> Union[float, List[float]]:
    return 1.0 / (1.0 + np.exp(-z))

def relu(z: Union[float, List[float]]) -> Union[float, List[float]]:
    return np.maximum(z, 0)

In [36]:
tmp = np.array([[1,2,3],
                [1,1,1]
               ])
tmp_sm = softmax(tmp)
display(tmp_sm)

array([[0.5       , 0.73105858, 0.88079708],
       [0.5       , 0.26894142, 0.11920292]])

In [37]:
def forward_propagation(x: np.ndarray, params: dict) -> tuple:
    z1 = np.dot(params['W1'], x) + params['b1']
    h = relu(z1)

    z2 = np.dot(params['W2'], h) + params['b2']
    y_hat = z2

    return y_hat, h

In [38]:
tmp_x = np.array([[0,1,0,0,0,0,0,0,0,0]]).T
tmp_z, tmp_h = forward_propagation(tmp_x, params)
print("call forward_prop")
print()
# Look at output
print(f"z has shape {tmp_z.shape}")
print("z has values:")
print(tmp_z)

call forward_prop

z has shape (10, 1)
z has values:
[[1.82887324]
 [1.38466287]
 [0.60100484]
 [0.83284144]
 [1.37936774]
 [1.20420827]
 [1.26273995]
 [2.0149547 ]
 [1.10825153]
 [1.93910889]]


In [39]:
def cross_entropy_loss(y_hat: np.ndarray, y: np.ndarray,
                       batch_size: int) -> Union[float, List[float]]:
    logprobs = np.multiply(np.log(y_hat), y)
    cost = -1/batch_size * np.sum(logprobs)
    cost = np.squeeze(cost)
    return cost

In [40]:
def backward_propagation(x: np.ndarray, y: np.ndarray, y_hat: np.ndarray, h: np.ndarray,
                         params: dict, batch_size: int) -> dict:
    grads_params = {}

    l1 = np.dot(params['W2'].T ,(y_hat - y))

    grads_params['W1'] = np.dot(l1, x.T) / batch_size
    grads_params['W2'] = np.dot((y_hat - y), h.T) / batch_size

    grads_params['b1'] = np.sum(l1, axis=1, keepdims= True) / batch_size
    grads_params['b2'] = np.sum((y_hat - y), axis= 1, keepdims= True) / batch_size

    return grads_params

In [41]:
from collections import defaultdict

def get_idx(words, word2Ind):
    idx = []
    for word in words:
        idx = idx + [word2Ind[word]]
    return idx

def pack_idx_with_frequency(context_words, word2Ind):
    freq_dict = defaultdict(int)
    for word in context_words:
        freq_dict[word] += 1
    idxs = get_idx(context_words, word2Ind)
    packed = []
    for i in range(len(idxs)):
        idx = idxs[i]
        freq = freq_dict[context_words[i]]
        packed.append((idx, freq))
    return packed

def get_vectors(data, word2Ind, V, C):
    i = C
    while True:
        y = np.zeros(V)
        x = np.zeros(V)
        center_word = data[i]
        y[word2Ind[center_word]] = 1
        context_words = data[(i - C) : i] + data[(i + 1) : (i + C + 1)]
        num_ctx_words = len(context_words)
        for idx, freq in pack_idx_with_frequency(context_words, word2Ind):
            x[idx] = freq / num_ctx_words
        yield x, y
        i += 1
        if i >= len(data) - C:
            print("i is being set to", C)
            i = C

def get_batches(data, word2Ind, V, C, batch_size):
    batch_x = []
    batch_y = []
    for x, y in get_vectors(data, word2Ind, V, C):
        if len(batch_x) < batch_size:
            batch_x.append(x)
            batch_y.append(y)
        else:
            yield np.array(batch_x).T, np.array(batch_y).T
            batch_x = []
            batch_y = []

In [42]:
def gradient_descent(data: list[str], word2Ind: dict, N: int,
                     V: int, epochs: int, alpha: float= 0.03,
                     random_seed: int= 282) -> dict:

    param = init_parameters(N,V, random_seed=random_seed)

    batch_size = 128
    epoch = 0
    C = 2

    for x, y in get_batches(data, word2Ind, V, C, batch_size):

        z, h = forward_propagation(x, param)
        yhat = softmax(z)

        cost = cross_entropy_loss(yhat, y, batch_size)
        if ( (epoch+1) % 10 == 0):
            print(f"iters: {epoch + 1} cost: {cost:.6f}")


        grads = backward_propagation(x,y, yhat, h, param, batch_size)


        param['W1'] = param['W1'] - alpha * grads['W1']
        param['W2'] = param['W2'] - alpha * grads['W2']
        param['b1'] = param['b1'] - alpha * grads['b1']
        param['b2'] = param['b2'] - alpha * grads['b2']

        epoch +=1
        if epoch == epochs:
            break
        if epoch % 100 == 0:
            alpha *= 0.66

    return param

In [43]:
C = 2
N = 50
word2Ind, Ind2word = get_dict(data)
V = len(word2Ind)
num_iters = 150
print("Call gradient_descent")
params = gradient_descent(data, word2Ind, N, V, num_iters)

Call gradient_descent
iters: 10 cost: 10.515535
iters: 20 cost: 10.707387
iters: 30 cost: 10.320237
iters: 40 cost: 10.211154
iters: 50 cost: 10.123117
iters: 60 cost: 10.107501
iters: 70 cost: 9.980620
iters: 80 cost: 9.905205
iters: 90 cost: 9.822018
iters: 100 cost: 9.705264
iters: 110 cost: 9.742620
iters: 120 cost: 9.485970
iters: 130 cost: 9.366885
iters: 140 cost: 9.579672
iters: 150 cost: 9.518378
