# Download & Preprocess the IMDB Dataset

In [34]:
# Download reviews.txt and labels.txt from here: https://github.com/udacity/deep-learning/tree/master/sentiment-network

def pretty_print_review_and_label(i):
   print(labels[i] + "\t:\t" + reviews[i][:80] + "...")

g = open('reviews.txt','r') # What we know!
reviews = list(map(lambda x:x[:-1],g.readlines()))
g.close()

g = open('labels.txt','r') # What we WANT to know!
labels = list(map(lambda x:x[:-1].upper(),g.readlines()))
g.close()


# Preprocess dataset:

import sys

f = open('reviews.txt')
raw_reviews = f.readlines()
f.close()

f = open('labels.txt')
raw_labels = f.readlines()
f.close()

tokens = list(map(lambda x:set(x.split(" ")),raw_reviews))

vocab = set()
for sent in tokens:
    for word in sent:
        if(len(word)>0):
            vocab.add(word)
vocab = list(vocab)

word2index = {}
for i,word in enumerate(vocab):
    word2index[word]=i

input_dataset = list()
for sent in tokens:
    sent_indices = list()
    for word in sent:
        try:
            sent_indices.append(word2index[word])
        except:
            ""
    input_dataset.append(list(set(sent_indices)))

target_dataset = list()
for label in raw_labels:
    if label == 'positive\n':
        target_dataset.append(1)
    else:
        target_dataset.append(0)

# The Surprising Power of Averaged Word Vectors

In [35]:
import numpy as np
norms = np.sum(weights_0_1 * weights_0_1,axis=1)
norms.resize(norms.shape[0],1)
normed_weights = weights_0_1 * norms

def make_sent_vect(words):
    indices = list(map(lambda x:word2index[x],filter(lambda x:x in word2index,words)))
    return np.mean(normed_weights[indices],axis=0)

reviews2vectors = list()
for review in tokens: # tokenized reviews
    reviews2vectors.append(make_sent_vect(review))
reviews2vectors = np.array(reviews2vectors)

def most_similar_reviews(review):
    v = make_sent_vect(review)
    scores = Counter()
    for i,val in enumerate(reviews2vectors.dot(v)):
        scores[i] = val
    most_similar = list()
    
    for idx,score in scores.most_common(3):
        most_similar.append(raw_reviews[idx][0:40])
    return most_similar

most_similar_reviews(['boring','awful'])

['this tim burton remake of the original  ',
 'certainly one of the dozen or so worst m',
 'boring and appallingly acted  summer phe']

# Matrices that Change Absolutely Nothing

In [37]:
import numpy as np

a = np.array([1,2,3])
b = np.array([0.1,0.2,0.3])
c = np.array([-1,-0.5,0])
d = np.array([0,0,0])

identity = np.eye(3)
print(identity)

[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]


In [38]:
print(a.dot(identity))
print(b.dot(identity))
print(c.dot(identity))
print(d.dot(identity))

[1. 2. 3.]
[0.1 0.2 0.3]
[-1.  -0.5  0. ]
[0. 0. 0.]


In [39]:
this = np.array([2,4,6])
movie = np.array([10,10,10])
rocks = np.array([1,1,1])

print(this + movie + rocks)
print((this.dot(identity) + movie).dot(identity) + rocks)

[13 15 17]
[13. 15. 17.]


# Forward Propagation in Python

In [40]:
import numpy as np

def softmax(x_):
    x = np.atleast_2d(x_)
    temp = np.exp(x)
    return temp / np.sum(temp, axis=1, keepdims=True)

word_vects = {}
word_vects['yankees'] = np.array([[0.,0.,0.]])
word_vects['bears'] = np.array([[0.,0.,0.]])
word_vects['braves'] = np.array([[0.,0.,0.]])
word_vects['red'] = np.array([[0.,0.,0.]])
word_vects['socks'] = np.array([[0.,0.,0.]])
word_vects['lose'] = np.array([[0.,0.,0.]])
word_vects['defeat'] = np.array([[0.,0.,0.]])
word_vects['beat'] = np.array([[0.,0.,0.]])
word_vects['tie'] = np.array([[0.,0.,0.]])

sent2output = np.random.rand(3,len(word_vects))

identity = np.eye(3)

In [41]:
layer_0 = word_vects['red']
layer_1 = layer_0.dot(identity) + word_vects['socks']
layer_2 = layer_1.dot(identity) + word_vects['defeat']

pred = softmax(layer_2.dot(sent2output))
print(pred)

[[0.11111111 0.11111111 0.11111111 0.11111111 0.11111111 0.11111111
  0.11111111 0.11111111 0.11111111]]


# How do we Backpropagate into this?

In [46]:
y = np.array([1,0,0,0,0,0,0,0,0]) # target one-hot vector for "yankees"

pred_delta = pred - y
layer_2_delta = pred_delta.dot(sent2output.T)
defeat_delta = layer_2_delta * 1 # can ignore the "1" like prev. chapter
layer_1_delta = layer_2_delta.dot(identity.T)
socks_delta = layer_1_delta * 1 # again... can ignore the "1"
layer_0_delta = layer_1_delta.dot(identity.T)
alpha = 0.01
word_vects['red'] -= layer_0_delta * alpha
word_vects['socks'] -= socks_delta * alpha
word_vects['defeat'] -= defeat_delta * alpha
identity -= np.outer(layer_0,layer_1_delta) * alpha
identity -= np.outer(layer_1,layer_2_delta) * alpha
sent2output -= np.outer(layer_2,pred_delta) * alpha

# Let's Train it!

In [135]:
import sys,random,math
from collections import Counter
import numpy as np

f = open('tasksv11/en/qa1_single-supporting-fact_train.txt','r')
raw = f.readlines()
f.close()

tokens = list()
for line in raw[0:1000]:
    tokens.append(line.lower().replace("\n","").split(" ")[1:])
    
tokens2 = [["Mary","had","a","little","lamb"],
         ["Its","fleec","was","white","as","snow"],
          ["And","everywhere","that","Mary","went"],
          ["The","lamb","was","sure","to","go"]]


tokens = tokens2
print(tokens[0:3])

[['Mary', 'had', 'a', 'little', 'lamb'], ['Its', 'fleec', 'was', 'white', 'as', 'snow'], ['And', 'everywhere', 'that', 'Mary', 'went']]


In [136]:
vocab = set()
for sent in tokens:
    for word in sent:
        vocab.add(word)

vocab = list(vocab)

word2index = {}
for i,word in enumerate(vocab):
    word2index[word]=i
    
def words2indices(sentence):
    idx = list()
    for word in sentence:
        idx.append(word2index[word])
    return idx

def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

In [137]:
np.random.seed(1)
embed_size = 10

# word embeddings
embed = (np.random.rand(len(vocab),embed_size) - 0.5) * 0.1 #[19,10]

# embedding -> embedding (initially the identity matrix)
recurrent = np.eye(embed_size) #[10,10]

# sentence embedding for empty sentence
start = np.zeros(embed_size) #[10,]

# embedding -> output weights
decoder = (np.random.rand(embed_size, len(vocab)) - 0.5) * 0.1 #[10,19]

# one hot lookups (for loss function)
one_hot = np.eye(len(vocab)) #[19,19]

In [143]:
print(one_hot.shape)

(19, 19)


# Forward Propagation with Arbitrary Length

In [123]:
def predict(sent):
    
    layers = list()
    layer = {}
    layer['hidden'] = start
    layers.append(layer)

    loss = 0

    # forward propagate
    preds = list()
    for target_i in range(len(sent)):

        layer = {}

        # try to predict the next term
        layer['pred'] = softmax(layers[-1]['hidden'].dot(decoder)) #[10,][10,19] = [1,19]

        loss += -np.log(layer['pred'][sent[target_i]]) #will be zero if 1

        # generate the next hidden state
        layer['hidden'] = layers[-1]['hidden'].dot(recurrent) + embed[sent[target_i]] #[10,1][10,10] + [1,10]
        layers.append(layer)
        
    return layers, loss

In [147]:
np.log(-.1)

  """Entry point for launching an IPython kernel.


nan

In [149]:
for i in reversed(range(10)):
    print(i)

9
8
7
6
5
4
3
2
1
0


# Backpropagation with Arbitrary Length

In [160]:
# forward
for iter in range(8):
    alpha = 0.001
    sent = words2indices(tokens[iter%len(tokens)][1:]) #Sentence less the first word, with an interator 
    
    layers,loss = predict(sent) #returns multiple pred layers for each word [19,]

    # back propagate
    for layer_idx in reversed(range(len(layers))):
        print(layer_idx)
        layer = layers[layer_idx]
        target = sent[layer_idx-1] #because the sent is reduced by 1 in length and there is a start layer added when 
        #passed to predict(sent)

        if(layer_idx > 0):  # if not the first layer
            layer['output_delta'] = layer['pred'] - one_hot[target] #takes a particular row away [19,]
            new_hidden_delta = layer['output_delta'].dot(decoder.transpose()) #[19,][10,19] = [10,]
            
            # if the last layer - don't pull from a later one becasue it doesn't exist
            if(layer_idx == len(layers)-1):
                layer['hidden_delta'] = new_hidden_delta
            else:
                layer['hidden_delta'] = new_hidden_delta + layers[layer_idx+1]['hidden_delta'].dot(recurrent.transpose())
        else: # if the first layer
            layer['hidden_delta'] = layers[layer_idx+1]['hidden_delta'].dot(recurrent.transpose())

4
3
2
1
0
5
4
3
2
1
0
4
3
2
1
0
5
4
3
2
1
0
4
3
2
1
0
5
4
3
2
1
0
4
3
2
1
0
5
4
3
2
1
0


# Weight Update with Arbitrary Length

In [23]:
# forward
for iter in range(30000):
    alpha = 0.001
    sent = words2indices(tokens[iter%len(tokens)][1:])

    layers,loss = predict(sent) 

    # back propagate
    for layer_idx in reversed(range(len(layers))):
        layer = layers[layer_idx]
        target = sent[layer_idx-1]

        if(layer_idx > 0):
            layer['output_delta'] = layer['pred'] - one_hot[target]
            new_hidden_delta = layer['output_delta'].dot(decoder.transpose())

            # if the last layer - don't pull from a 
            # later one becasue it doesn't exist
            if(layer_idx == len(layers)-1):
                layer['hidden_delta'] = new_hidden_delta
            else:
                layer['hidden_delta'] = new_hidden_delta + layers[layer_idx+1]['hidden_delta'].dot(recurrent.transpose())
        else:
            layer['hidden_delta'] = layers[layer_idx+1]['hidden_delta'].dot(recurrent.transpose())

    # update weights
    start -= layers[0]['hidden_delta'] * alpha / float(len(sent))
    for layer_idx,layer in enumerate(layers[1:]):
        
        decoder -= np.outer(layers[layer_idx]['hidden'], layer['output_delta']) * alpha / float(len(sent))
        
        embed_idx = sent[layer_idx]
        embed[embed_idx] -= layers[layer_idx]['hidden_delta'] * alpha / float(len(sent))
        recurrent -= np.outer(layers[layer_idx]['hidden'], layer['hidden_delta']) * alpha / float(len(sent))
        
    if(iter % 1000 == 0):
        print("Perplexity:" + str(np.exp(loss/len(sent))))

Perplexity:81.85296380363512
Perplexity:81.78970032261698
Perplexity:81.67314284979052
Perplexity:81.42144676213766
Perplexity:80.86988263139762
Perplexity:79.6221240578163
Perplexity:76.46406101512835
Perplexity:65.40885487605719
Perplexity:36.68148775577676
Perplexity:21.711917220385033
Perplexity:19.28387588531635
Perplexity:18.091844701810007
Perplexity:16.627776483301755
Perplexity:14.40479654576648
Perplexity:11.247572071819219
Perplexity:8.310679694809787
Perplexity:6.80480088929391
Perplexity:5.937614049200221
Perplexity:5.386824398662348
Perplexity:5.0412438406364535
Perplexity:4.800284865691207
Perplexity:4.6356249865486
Perplexity:4.536240952832585
Perplexity:4.4824700901374905
Perplexity:4.43308159829629
Perplexity:4.367723190561668
Perplexity:4.288501110533486
Perplexity:4.202243417729626
Perplexity:4.115490271573415
Perplexity:4.03263619290129


# Execution and Output Analysis

In [24]:
sent_index = 4

l,_ = predict(words2indices(tokens[sent_index]))

print(tokens[sent_index])

for i,each_layer in enumerate(l[1:-1]):
    input = tokens[sent_index][i]
    true = tokens[sent_index][i+1]
    pred = vocab[each_layer['pred'].argmax()]
    print("Prev Input:" + input + (' ' * (12 - len(input))) +\
          "True:" + true + (" " * (15 - len(true))) + "Pred:" + pred)

['sandra', 'moved', 'to', 'the', 'garden.']
Prev Input:sandra      True:moved          Pred:is
Prev Input:moved       True:to             Pred:to
Prev Input:to          True:the            Pred:the
Prev Input:the         True:garden.        Pred:bedroom.
