In [1]:
import sys,random,math
from collections import Counter
import numpy as np

f = open('../original/tasksv11/en/qa1_single-supporting-fact_train.txt','r')
raw = f.readlines()
f.close()


In [2]:
for line in raw[0:5]:
    print(line.lower().
          replace("\n","").
          replace("\t","").
          replace(".","").
          replace("?","").
          replace("1","").split(" ")[1:]) 

['mary', 'moved', 'to', 'the', 'bathroom']
['john', 'went', 'to', 'the', 'hallway']
['where', 'is', 'mary', 'bathroom']
['daniel', 'went', 'back', 'to', 'the', 'hallway']
['sandra', 'moved', 'to', 'the', 'garden']


In [3]:
tokens = list()
for line in raw[0:5]:
    tokens.append(line.lower().
          replace("\n","").
          replace("\t","").
          replace(".","").
          replace("?","").
          replace("1","").split(" ")[1:]
                 )
    
print(tokens)

[['mary', 'moved', 'to', 'the', 'bathroom'], ['john', 'went', 'to', 'the', 'hallway'], ['where', 'is', 'mary', 'bathroom'], ['daniel', 'went', 'back', 'to', 'the', 'hallway'], ['sandra', 'moved', 'to', 'the', 'garden']]


In [4]:
vocab = set()
for sent in tokens:
    for word in sent:
        vocab.add(word)

vocab = list(vocab)

word2index = {}
for i,word in enumerate(vocab):
    word2index[word]=i
    
def words2indices(sentence):
    idx = list()
    for word in sentence:
        idx.append(word2index[word])
    return idx

def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

In [6]:
print(len(vocab))
print(sum([len(x) for x in tokens]))

14
25


In [11]:
np.random.seed(1)
embed_size = 10

# word embeddings
embed = (np.random.rand(len(vocab),embed_size) - 0.5) * 0.1 #[14,10]

# embedding -> embedding (initially the identity matrix)
recurrent = np.eye(embed_size) #[10,10]

# sentence embedding for empty sentence
start = np.zeros(embed_size) #[10,]

# embedding -> output weights
decoder = (np.random.rand(embed_size, len(vocab)) - 0.5) * 0.1 #[10,14]

# one hot lookups (for loss function)
one_hot = np.eye(len(vocab)) #[14,14]

In [12]:
def predict(sent):
    
    layers = list()
    layer = {}
    layer['hidden'] = start
    layers.append(layer)

    loss = 0

    # forward propagate
    preds = list()
    for target_i in range(len(sent)):

        layer = {}

        # try to predict the next term
        layer['pred'] = softmax(layers[-1]['hidden'].dot(decoder)) #[10,][10,14] = [1,19]

        loss += -np.log(layer['pred'][sent[target_i]]) #will be zero if 1

        # generate the next hidden state
        layer['hidden'] = layers[-1]['hidden'].dot(recurrent) + embed[sent[target_i]] #[10,1][10,10] + [1,10]
        layers.append(layer)
        
    return layers, loss

In [15]:
tokens[1]

['john', 'went', 'to', 'the', 'hallway']

In [16]:
tokens[1][1:]

['went', 'to', 'the', 'hallway']

In [32]:
tmp_sent = words2indices(tokens[1][1:])
tmp_sent

[13, 6, 5, 12]

In [47]:
layer,_ = predict([13,6,5,12])

In [48]:
layer

[{'hidden': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])},
 {'pred': array([0.07142857, 0.07142857, 0.07142857, 0.07142857, 0.07142857,
         0.07142857, 0.07142857, 0.07142857, 0.07142857, 0.07142857,
         0.07142857, 0.07142857, 0.07142857, 0.07142857]),
  'hidden': array([ 0.00857593,  0.04695957,  0.00610302, -0.04813527,  0.03006327,
         -0.02670257,  0.03071052, -0.01121394,  0.03635419,  0.02471216])},
 {'pred': array([0.0710932 , 0.07162197, 0.07144105, 0.07157902, 0.07184681,
         0.07153482, 0.07137346, 0.07120162, 0.07166611, 0.07176285,
         0.07120926, 0.0712069 , 0.07125751, 0.07120542]),
  'hidden': array([-0.03119063,  0.03836517,  0.02554304, -0.05671734, -0.01494139,
         -0.02311293,  0.04708998, -0.00972502,  0.08081366,  0.03336767])},
 {'pred': array([0.07112876, 0.07197887, 0.07149297, 0.07165562, 0.07181289,
         0.07158648, 0.07119637, 0.07107706, 0.07179413, 0.07189793,
         0.07122969, 0.07105498, 0.07108696, 0.0710073 ]),
 

In [24]:
layer[2]

{'pred': array([0.0710932 , 0.07162197, 0.07144105, 0.07157902, 0.07184681,
        0.07153482, 0.07137346, 0.07120162, 0.07166611, 0.07176285,
        0.07120926, 0.0712069 , 0.07125751, 0.07120542]),
 'hidden': array([-0.03119063,  0.03836517,  0.02554304, -0.05671734, -0.01494139,
        -0.02311293,  0.04708998, -0.00972502,  0.08081366,  0.03336767])}

In [40]:
#step0 
h0 = start
#step1
p1 = softmax(h0.dot(decoder)) #equal odds for all words
h1 = h0.dot(recurrent) + embed[tmp_sent[0]] #random vector weights one row for each token
#step2
p2 = softmax(h1.dot(decoder))
h2 = h1.dot(recurrent) + embed[tmp_sent[1]]
#and so on

In [43]:
# forward
for iter in range(1):
    alpha = 0.001
    sent = words2indices(tokens[iter%len(tokens)][1:]) #Sentence less the first word, with an interator 
    
    layers,loss = predict(sent) #returns multiple pred layers for each word [19,]

    # back propagate
    for layer_idx in reversed(range(len(layers))):
        layer = layers[layer_idx]
        target = sent[layer_idx-1] #because the sent is reduced by 1 in length and there is a start layer added when 
        #passed to predict(sent)

        if(layer_idx > 0):  # if not the first layer
            layer['output_delta'] = layer['pred'] - one_hot[target] #takes a particular row away [19,]
            new_hidden_delta = layer['output_delta'].dot(decoder.transpose()) #[19,][10,19] = [10,]
            
            # if the last layer - don't pull from a later one becasue it doesn't exist
            if(layer_idx == len(layers)-1):
                layer['hidden_delta'] = new_hidden_delta
            else:
                layer['hidden_delta'] = new_hidden_delta + layers[layer_idx+1]['hidden_delta'].dot(recurrent.transpose())
        else: # if the first layer
            layer['hidden_delta'] = layers[layer_idx+1]['hidden_delta'].dot(recurrent.transpose())

In [58]:
np.outer(layers[2]['hidden'],layers[2]['output_delta']).shape

(10, 14)

In [46]:
layers

[{'hidden': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
  'hidden_delta': array([ 0.01240052,  0.01205607,  0.02418878, -0.05836742, -0.02181459,
         -0.00636029, -0.0034963 , -0.00752625,  0.03472336,  0.04612733])},
 {'pred': array([0.07142857, 0.07142857, 0.07142857, 0.07142857, 0.07142857,
         0.07142857, 0.07142857, 0.07142857, 0.07142857, 0.07142857,
         0.07142857, 0.07142857, 0.07142857, 0.07142857]),
  'hidden': array([ 0.04034019, -0.03625253, -0.03607237,  0.03073913, -0.01023232,
         -0.03346458,  0.04275086, -0.01522341,  0.02508121,  0.0225998 ]),
  'output_delta': array([ 0.07142857,  0.07142857,  0.07142857,  0.07142857,  0.07142857,
          0.07142857,  0.07142857, -0.92857143,  0.07142857,  0.07142857,
          0.07142857,  0.07142857,  0.07142857,  0.07142857]),
  'hidden_delta': array([ 0.01240052,  0.01205607,  0.02418878, -0.05836742, -0.02181459,
         -0.00636029, -0.0034963 , -0.00752625,  0.03472336,  0.04612733])},
 {'pred': arr