In [22]:
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
from collections import defaultdict


In [23]:
import re                                                           #  Load the Regex-modul
with open('../data/shakespeare.txt') as f:
    data = f.read()                                                 #  Read in the data
data = re.sub(r'[,!?;-]', '.',data)                                 #  Punktuations are replaced by .
data = nltk.word_tokenize(data)                                     #  Tokenize string to words
data = [ ch.lower() for ch in data if ch.isalpha() or ch == '.'] 

In [24]:
def get_dict(data):
    words = sorted(list(set(data)))
    n = len(words)
    idx = 0
    word2Ind = {}
    Ind2word = {}
    for k in words:
        word2Ind[k] = idx
        Ind2word[idx] = k
        idx += 1
    return word2Ind, Ind2word

In [25]:
word2Ind, Ind2word = get_dict(data)
V = len(word2Ind)
print("Size of vocabulary: ", V)

Size of vocabulary:  5778


## make a NN eith one hiddeb layer. 
* it's input and output have V*1 dimmention. V is size of vocabulary
* first layer activation function is ReLu
$$f(h)=\max (0,h)$$
* second layer activation function is softMax
$$ \text{softmax}(z_i) = \frac{e^{z_i} }{\sum_{i=0}^{V-1} e^{z_i} }$$

* Implement the forward propagation $z$ according to equations (1) to (3).

$$\begin{align}
 h= W_1 \  X + b_1  \tag{1} \\
 a= ReLU(h)  \tag{2} \\
 z= W_2 \  a + b_2   \tag{3} \\
\end{align}$$

In [26]:
def initialize_model(N,V, random_seed=1):
    np.random.seed(random_seed)
    W1 = np.random.rand(N,V)
    W2 = np.random.rand(V,N)
    b1 = np.random.rand(N,1)
    b2 = np.random.rand(V,1)

    return W1, W2, b1, b2

In [27]:
def softmax(z):
    e_z = np.exp(z)
    yhat = e_z/np.sum(e_z,axis=0)
    return yhat

In [28]:
def forward_prop(x, W1, W2, b1, b2):
    h = np.dot(W1,x)+b1
    h = np.maximum(0,h)
    z = np.dot(W2,h)+b2
    return z, h

## Training the Model - Backpropagation
The formulas that you will implement for backpropagation are the following.

$$\begin{align}
 \frac{\partial J}{\partial \mathbf{W_1}} = \rm{ReLU}\left ( \mathbf{W_2^\top} (\mathbf{\hat{y}} - \mathbf{y})\right )\mathbf{x}^\top \\
 \frac{\partial J}{\partial \mathbf{W_2}} = (\mathbf{\hat{y}} - \mathbf{y})\mathbf{h^\top} \\
 \frac{\partial J}{\partial \mathbf{b_1}} = \rm{ReLU}\left ( \mathbf{W_2^\top} (\mathbf{\hat{y}} - \mathbf{y})\right ) \\
 \frac{\partial J}{\partial \mathbf{b_2}} = \mathbf{\hat{y}} - \mathbf{y} 
\end{align}$$

In [29]:
def back_prop(x, yhat, y, h, W1, W2, b1, b2, batch_size):
 
    l1 = np.dot(W2.T,(yhat-y))
    l1 = np.maximum(0,l1)
    grad_W1 = (1/batch_size)*np.dot(l1,x.T)
    grad_W2 = (1/batch_size)*np.dot(yhat-y,h.T)
    grad_b1 = np.sum((1/batch_size)*np.dot(l1,x.T),axis=1,keepdims=True)
    grad_b2 = np.sum((1/batch_size)*np.dot(yhat-y,h.T),axis=1,keepdims=True)
    
    return grad_W1, grad_W2, grad_b1, grad_b2

## Gradient Descent

In [30]:
def compute_cost(y, yhat, batch_size):
    # cost function 
    logprobs = np.multiply(np.log(yhat),y) + np.multiply(np.log(1 - yhat), 1 - y)
    cost = - 1/batch_size * np.sum(logprobs)
    cost = np.squeeze(cost)
    return cost

In [31]:
def get_idx(words, word2Ind):
    idx = []
    for word in words:
        idx = idx + [word2Ind[word]]
    return idx

In [32]:
def pack_idx_with_frequency(context_words, word2Ind):
    freq_dict = defaultdict(int)
    for word in context_words:
        freq_dict[word] += 1
    idxs = get_idx(context_words, word2Ind)
    packed = []
    for i in range(len(idxs)):
        idx = idxs[i]
        freq = freq_dict[context_words[i]]
        packed.append((idx, freq))
    return packed

In [33]:

def get_vectors(data, word2Ind, V, C):
    i = C
    while True:
        y = np.zeros(V)
        x = np.zeros(V)
        center_word = data[i]
        y[word2Ind[center_word]] = 1
        context_words = data[(i - C):i] + data[(i+1):(i+C+1)]
        num_ctx_words = len(context_words)
        for idx, freq in pack_idx_with_frequency(context_words, word2Ind):
            x[idx] = freq/num_ctx_words
        yield x, y
        i += 1
        if i >= len(data):
            print('i is being set to 0')
            i = 0


In [34]:
def get_batches(data, word2Ind, V, C, batch_size):
    batch_x = []
    batch_y = []
    for x, y in get_vectors(data, word2Ind, V, C):
        while len(batch_x) < batch_size:
            batch_x.append(x)
            batch_y.append(y)
        else:
            yield np.array(batch_x).T, np.array(batch_y).T
            batch = []


In [35]:
def gradient_descent(data, word2Ind, N, V, num_iters, alpha=0.03):
    

    W1, W2, b1, b2 = initialize_model(N,V, random_seed=200)
    batch_size = 128
    iters = 0
    C = 2
    for x, y in get_batches(data, word2Ind, V, C, batch_size):
        z, h = forward_prop(x, W1, W2, b1, b2)
        yhat = softmax(z)
        cost = compute_cost(y, yhat, batch_size)
        if ( (iters+1) % 10 == 0):
            print(f"iters: {iters + 1} cost: {cost:.6f}")
        grad_W1, grad_W2, grad_b1, grad_b2 = back_prop(x, yhat, y, h, W1, W2, b1, b2, batch_size)
        
        # Update weights and biases
        W1 -= alpha*grad_W1 
        W2 -= alpha*grad_W2
        b1 -= alpha*grad_b1
        b2 -= alpha*grad_b2
        
        iters += 1 
        if iters == num_iters: 
            break
        if iters % 100 == 0:
            alpha *= 0.66
            
    return W1, W2, b1, b2

In [36]:
# test your function
C = 2
N = 50
word2Ind, Ind2word = get_dict(data)
V = len(word2Ind)
num_iters = 150
print("Call gradient_descent")
W1, W2, b1, b2 = gradient_descent(data, word2Ind, N, V, num_iters)

Call gradient_descent
iters: 10 cost: 0.107177
iters: 20 cost: 0.038644
iters: 30 cost: 0.023685
iters: 40 cost: 0.017095
iters: 50 cost: 0.013380
iters: 60 cost: 0.010995
iters: 70 cost: 0.009333
iters: 80 cost: 0.008108
iters: 90 cost: 0.007168
iters: 100 cost: 0.006423
iters: 110 cost: 0.005992
iters: 120 cost: 0.005633
iters: 130 cost: 0.005314
iters: 140 cost: 0.005030
iters: 150 cost: 0.004774


## Embeding vector is:


In [39]:
embed = (W1.T+W2)*0.5
embed.shape

(5778, 50)