In [1]:
import numpy as np
from collections import Counter

In [2]:
f = open('reviews.txt','r')
raw_reviews = f.readlines()
f.close

f = open("labels.txt",'r')
raw_labels = f.readlines()
f.close()

In [3]:
tokens = list(map(lambda x:set(x.split(" ")),raw_reviews))

In [4]:
vocab = set()

for token in tokens:
    for word in token:
        if(len(word) > 0):
            vocab.add(word)
vocab = list(vocab)

In [5]:
word2index = {}
for i,word in enumerate(vocab):
    word2index[word] = i

In [6]:
input_dataset = []

for sent in tokens:
    sentence = []
    for word in sent:
        if len(word) > 0:            
            sentence.append(word2index[word])
    input_dataset.append(sentence)

In [7]:
len(input_dataset)

25000

In [8]:
target_dataset = []
for score in raw_labels:
    if(score == "positive\n"):
        target_dataset.append(1)
    else:
        target_dataset.append(0)

In [9]:
weights_0_1 = np.load('weights/c_11.npy')

In [10]:
norms = np.sum(weights_0_1 * weights_0_1,axis=1)
norms.resize(norms.shape[0],1)
normed_weights = weights_0_1 * norms

def make_sent_vect(words):
    indices = list(map(lambda x:word2index[x],filter(lambda x:x in word2index,words)))
    return np.mean(normed_weights[indices],axis=0)

reviews2vectors = list()
for review in tokens: # tokenized reviews
    reviews2vectors.append(make_sent_vect(review))
reviews2vectors = np.array(reviews2vectors)

def most_similar_reviews(review):
    v = make_sent_vect(review)
    scores = Counter()
    for i,val in enumerate(reviews2vectors.dot(v)):
        scores[i] = val
    most_similar = list()
    
    for idx,score in scores.most_common(3):
        most_similar.append(raw_reviews[idx][0:40])
    return most_similar



In [11]:
most_similar_reviews(['awful'])

['at a panel discussion that i attended af',
 'photography was too jumpy to follow . da',
 'my short comment for this flick is go pi']

# sentence embedding using identity matrix

In [12]:
a = np.array([1,2,3])
b = np.array([0.1,0.2,0.3])
c = np.array([-1,-0.5,0])
d = np.array([0,0,0])

identity = np.eye(3)

In [13]:
print(identity)

[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]


In [14]:
print(a.dot(identity))
print(b.dot(identity))
print(c.dot(identity))

[1. 2. 3.]
[0.1 0.2 0.3]
[-1.  -0.5  0. ]


In [15]:
this = np.array([2,4,6])
movie = np.array([10,10,10])
rocks = np.array([1,1,1])

print(this + movie + rocks)
print((this.dot(identity) + movie).dot(identity) + rocks)

[13 15 17]
[13. 15. 17.]


# forward probagation

In [16]:
def softmax(x_):
    x = np.atleast_2d(x_)
    temp = np.exp(x)
    return temp / np.sum(temp, axis=1, keepdims=True)

In [17]:
word_vects = {}
word_vects['yankees'] = np.array([[0.,0.,0.]]) # word embeddings
word_vects['bears'] = np.array([[0.,0.,0.]])
word_vects['braves'] = np.array([[0.,0.,0.]])
word_vects['red'] = np.array([[0.,0.,0.]])
word_vects['sox'] = np.array([[0.,0.,0.]])
word_vects['lose'] = np.array([[0.,0.,0.]])
word_vects['defeat'] = np.array([[0.,0.,0.]])
word_vects['beat'] = np.array([[0.,0.,0.]])
word_vects['tie'] = np.array([[0.,0.,0.]])

In [18]:
# sentence embedding to output classification weights
sent2output = np.random.rand(3,len(word_vects))
identity = np.eye(3) # transition weights

In [19]:
# “red sox defeat” -> “yankees”
# forward probagation
layer_0 = word_vects['red']
layer_1 = layer_0.dot(identity) + word_vects['sox'] # create sentence embedding
layer_2 = layer_1.dot(identity) + word_vects['defeat']

In [20]:
pred = softmax(layer_2.dot(sent2output)) # predicts all over the vocab
print(pred)
print(word_vects['red'])

[[0.11111111 0.11111111 0.11111111 0.11111111 0.11111111 0.11111111
  0.11111111 0.11111111 0.11111111]]
[[0. 0. 0.]]


# backprobagate

In [21]:
y = np.array([1,0,0,0,0,0,0,0,0]) # one hot vector for yankess
pred_delta = pred - y
layer_2_delta = pred_delta.dot(sent2output.T)
defeat_delta = layer_2_delta * 1
layer_1_delta = layer_2_delta.dot(identity.T)
sox_delta = layer_1_delta * 1
layer_0_delta = layer_1_delta.dot(identity.T)
alpha = 0.01

word_vects['red'] -= layer_0_delta * alpha
word_vects['sox'] -= sox_delta * alpha
word_vects['defeat'] -= defeat_delta * alpha

identity -= np.outer(layer_0,layer_1_delta) * alpha
identity -= np.outer(layer_1, layer_2_delta) * alpha
sent2output -= np.outer(layer_2,pred_delta) * alpha

# time for real shit

In [22]:
x = np.array([1,2,3])
xx = np.array([4,5,6])

print(np.outer(x,xx))

print('----------')
print(np.inner(x,xx))
print(x * xx)
print(14+18)

[[ 4  5  6]
 [ 8 10 12]
 [12 15 18]]
----------
32
[ 4 10 18]
32


In [23]:
x = np.array([[1,2,3],[4,5,6]])

print(np.sum(x,axis=0))
print(x)


[5 7 9]
[[1 2 3]
 [4 5 6]]


In [24]:
print(np.sum(x,axis=1))
print(x)

[ 6 15]
[[1 2 3]
 [4 5 6]]


In [25]:
import sys,random,math
from collections import Counter
import numpy as np


In [26]:
f = open('data/tasksv11/en/qa1_single-supporting-fact_train.txt','r')
raw = f.readlines()
f.close()

In [27]:
tokens = []
for line in raw[:1000]:
    tokens.append(line.lower().replace("\n",'').split(' ')[1:])
print(tokens[0:4])

[['mary', 'moved', 'to', 'the', 'bathroom.'], ['john', 'went', 'to', 'the', 'hallway.'], ['where', 'is', 'mary?', '\tbathroom\t1'], ['daniel', 'went', 'back', 'to', 'the', 'hallway.']]


In [28]:
vocab = set()
for sent in tokens:
    for word in sent:
        vocab.add(word)

vocab = list(vocab)

word2index = {}
for i,word in enumerate(vocab):
    word2index[word]=i

In [29]:
def word2indices(sentence):
    idx = list()
    for word in sentence:
        idx.append(word2index[word])
    return idx

def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

In [30]:
np.random.seed(1)
embed_size = 10

# word embeddings
embed = (np.random.rand(len(vocab),embed_size) - 0.5) * 0.1

# embedding -> embedding (initially the identity matrix)
recurrent = np.eye(embed_size)

# sentence embedding for empty sentence
start = np.zeros(embed_size)

# embedding -> output weights
decoder = (np.random.rand(embed_size, len(vocab)) - 0.5) * 0.1

# one hot lookups (for loss function)
one_hot = np.eye(len(vocab))

In [31]:
start

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [32]:
def predict(sent):
    
    layers = list()
    layer = {}
    layer['hidden'] = start
    layers.append(layer)
    
    loss = 0
    preds = [] # forward probagation
    for target_i in range(len(sent)):
        layer = {}
        # predict next term
        layer['pred'] = softmax(layers[-1]['hidden'].dot(decoder))
        loss += -np.log(layer['pred'][sent[target_i]])
        #generate next hidden state
        layer['hidden'] = layers[-1]['hidden'].dot(recurrent) +\
                            embed[sent[target_i]]
        layers.append(layer)
    return layers, loss

In [33]:
for iter in range(30000): # forward
    alpha = 0.001
    
    sent = word2indices(tokens[iter%len(tokens)][1:])
    layers, loss = predict(sent)
    
    for layer_idx in reversed(range(len(layers))): # backprobagate
        layer = layers[layer_idx]
        
        target = sent[layer_idx-1]
        
        if(layer_idx > 0): # if not the first layer
            layer['output_delta'] = layer['pred'] - one_hot[target]
            new_hidden_delta = layer['output_delta'].dot(decoder.transpose())
            
            if(layer_idx == len(layers) -1): # if not the last layer of the list, 1 before last
                layer['hidden_delta'] = new_hidden_delta
            else:
                layer['hidden_delta'] = new_hidden_delta +\
                layers[layer_idx +1]['hidden_delta'].dot(recurrent.transpose())
        
        else: # if the first layer
            layer['hidden_delta'] = layers[layer_idx+1]['hidden_delta'].dot(recurrent.transpose())
    
    # update weights
    
    start -= layers[0]['hidden_delta'] * alpha / float(len(sent))
    
    for layer_idx, layer in enumerate(layers[1:]):
        
        decoder -= np.outer(layers[layer_idx]['hidden'],layer['output_delta']) * \
                           alpha  / float(len(sent)) # layer_2 , pred_delta
        embed_idx = sent[layer_idx]
        embed[embed_idx] -= layers[layer_idx]['hidden_delta'] * \
                            alpha / float(len(sent)) # layer_1 layer_2 delta
        recurrent -= np.outer(layers[layer_idx] ['hidden'],\
                                layer['hidden_delta']) * alpha / float(len(sent))
                                # layer_0, layer_1_delta
    if(iter % 1000 == 0):
        print("Perplexity: "+str(np.exp(loss/len(sent))))

Perplexity: 82.01779692170828
Perplexity: 81.73470648410135
Perplexity: 81.35553094905504
Perplexity: 80.72991918489456
Perplexity: 79.55243749150551
Perplexity: 77.03505237345513
Perplexity: 70.27199258597715
Perplexity: 42.242267815761544
Perplexity: 23.70721676631971
Perplexity: 19.816426169789054
Perplexity: 18.527723962281033
Perplexity: 17.312146066720306
Perplexity: 15.561221454971136
Perplexity: 12.819843954001655
Perplexity: 9.526145929743759
Perplexity: 7.4141972341255595
Perplexity: 6.373127080517985
Perplexity: 5.659862351996081
Perplexity: 5.180206882104858
Perplexity: 4.895841973175592
Perplexity: 4.703595379698135
Perplexity: 4.573120558832889
Perplexity: 4.492772790751694
Perplexity: 4.4423439237315625
Perplexity: 4.396430529486272
Perplexity: 4.341330964755117
Perplexity: 4.27714591602516
Perplexity: 4.212008611316266
Perplexity: 4.156077957531741
Perplexity: 4.11730670251008


In [34]:
def predict_sentence(sent_index):
    for i,each_layer in enumerate(l[1:-1]):
        input = tokens[sent_index][i]
        true = tokens[sent_index][i+1]
        pred = vocab[each_layer['pred'].argmax()]
        print("Prev Input: "+input + (' ' * (12 - len(input)))+\
             "True: "+true+ (" "*(15-len(true))) + ' Pred: '+pred)

In [35]:
sent_index = random.randrange(1,1000)

l,_ = predict(word2indices(tokens[sent_index]))

print(tokens[sent_index])

predict_sentence(sent_index)
    

['mary', 'moved', 'to', 'the', 'kitchen.']
Prev Input: mary        True: moved           Pred: is
Prev Input: moved       True: to              Pred: to
Prev Input: to          True: the             Pred: the
Prev Input: the         True: kitchen.        Pred: bedroom.


In [36]:
len(layers)

5