# Read in Data

In [74]:
import sys

f = open("../original/reviews.txt")
raw_reviews = f.readlines()
f.close()

print(type(raw_reviews))
print(len(raw_reviews))

f = open("../original/labels.txt")
raw_labels = f.readlines()
f.close()

<class 'list'>
25000


# Preprocess Data

In [75]:
#practice
tmp_review = raw_reviews[234]
print(f"There are {len(tmp)} characters in this review.")
tmp_words = tmp_review.split(" ")
print(f"Composed of {len(tmp_words)} words defined by space.")
tmp_unique_words = set(tmp_words)
print(f"Composed of {len(tmp_unique_words)} unique words")

There are 2220 characters in this review.
Composed of 472 words defined by space.
Composed of 224 unique words


In [76]:
tokens = list(map(lambda x:set(x.split(" ")), raw_reviews))
print(len(tokens))

25000


In [77]:
#Get the unique vocab
vocab = set()
for review in tokens:
    for word in review:
        if(len(word)>0):
            vocab.add(word)
vocab = list(vocab)
print(f"There are {len(vocab)} unique words across all {len(tokens)} reviews in the dataset.")

There are 74074 unique words across all 25000 reviews in the dataset.


In [78]:
#Give it numbers
word2index = {}
for i, word in enumerate(vocab):
    word2index[word] = i

#each word gets a number
word2index["brave"]

16240

In [79]:
#covert review to a set of unique numbers representing the words in it. 
input_dataset = list()
for review in tokens:
    review_index = list()
    for word in review:
        try:
            review_index.append(word2index[word])
        except:
            ""
    input_dataset.append(list(set(review_index)))
len(input_dataset)

25000

In [80]:
#explainer, input dataset - just numbers related to words
input_dataset[342][0:4]

[27140, 41479, 1042, 40467]

In [81]:
target_dataset = list()
for label in raw_labels:
    if label == "positive\n":
        target_dataset.append(1)
    else:
        target_dataset.append(0)

# Model

In [110]:
import numpy as np
np.random.seed(42)

alpha, iterations = (0.01, 2)
hidden_size = 100

w01 = 0.2*np.random.random((len(vocab), hidden_size)) - 0.1 #74074, 100
w12 = 0.2*np.random.random((hidden_size, 1)) - 0.1

correct, total = (0,0)

sigmoid = lambda x: 1/(1+np.exp(-x))

In [111]:
#explainer, the input dataset contains list of number related to a word 
#rather than multiple can just add the weights

tmp_input = input_dataset[342][0:4]
print(w01[tmp_input].shape) #grab 4 of the weight vectors 
tmp_sum = np.sum(w01[tmp_input], axis = 1)
sigmoid(tmp_sum)

(4, 100)


array([0.26509341, 0.69362389, 0.63203359, 0.75871518])

In [112]:
for i in range(len(input_dataset)-1000): #first 24k reviews
    
    error = 0
    x,y = (input_dataset[i], target_dataset[i])
    layer_1 = sigmoid(np.sum(w01[x], axis = 0)) #sum across the 4 rows of weights, will always be (100,)
    layer_2 = sigmoid(np.dot(layer_1, w12)) #(1)
    
    error += (layer_2 - y)**2
    
    layer_2_delta = layer_2 - y
    layer_1_delta = layer_2_delta.dot(w12.T) #? no derivative ? (100,)
    
    w01[x] -= layer_1_delta* alpha #only update the relevant weights
    w12 -= np.outer(layer_1, layer_2_delta) * alpha
    
    if(np.abs(layer_2_delta) < 0.5):
        correct += 1
    total +=1
    
    if(i % 1000 == 9):
        print(f"Iter: {i}, Trn-Error: {error}, Correct: {correct}, Train-Acc: {correct/float(total):.3f}")

Iter: 9, Trn-Error: [0.34185575], Correct: 4, Train-Acc: 0.400
Iter: 1009, Trn-Error: [0.11091182], Correct: 535, Train-Acc: 0.530
Iter: 2009, Trn-Error: [0.31535068], Correct: 1287, Train-Acc: 0.640
Iter: 3009, Trn-Error: [0.0547328], Correct: 2114, Train-Acc: 0.702
Iter: 4009, Trn-Error: [0.13114956], Correct: 2915, Train-Acc: 0.727
Iter: 5009, Trn-Error: [0.00477812], Correct: 3715, Train-Acc: 0.742
Iter: 6009, Trn-Error: [0.44216895], Correct: 4536, Train-Acc: 0.755
Iter: 7009, Trn-Error: [0.00066642], Correct: 5390, Train-Acc: 0.769
Iter: 8009, Trn-Error: [0.00644237], Correct: 6255, Train-Acc: 0.781
Iter: 9009, Trn-Error: [0.86182665], Correct: 7107, Train-Acc: 0.789
Iter: 10009, Trn-Error: [0.48895955], Correct: 7973, Train-Acc: 0.797
Iter: 11009, Trn-Error: [0.0004263], Correct: 8825, Train-Acc: 0.802
Iter: 12009, Trn-Error: [2.91075966e-06], Correct: 9672, Train-Acc: 0.805
Iter: 13009, Trn-Error: [0.0006242], Correct: 10547, Train-Acc: 0.811
Iter: 14009, Trn-Error: [0.11730786

In [113]:
correct, total = (0,0)
for i in range(len(input_dataset) - 1000, len(input_dataset)):
    
    x = input_dataset[i]
    y = target_dataset[i]
    
    layer_1 = sigmoid(np.sum(w01[x], axis = 0))
    layer_2 = sigmoid(np.dot(layer_1, w12))
    
    if(np.abs(layer_2 -y) < 0.5):
        correct += 1
    total += 1
    
print(f"Test Correct {correct}, Test Accuracy {correct/float(total):.3f}.")
        

Test Correct 850, Test Accuracy 0.850.


# Comparing the Word Embeddings

In [119]:
w_brave = w01[16240]
w_beautiful = w01[21333]
square_diff = sum((w_brave - w_beautiful)**2)

In [123]:
import math
-math.sqrt(square_diff)

-1.0330431891639722

In [131]:
from collections import Counter
import math

def similar(target = "brave"):
    scores = Counter()
    target_index = word2index[target] #16420
    for word, index in word2index.items():
        raw_diff = w01[index] - w01[target_index] #100 - 100
        square_diff = raw_diff * raw_diff #square the diff
        scores[word] = -math.sqrt(sum(square_diff)) #sqrt the sum of all the diff, using a counter here and then most common 
    return scores

In [132]:
brave_score = similar("brave")

In [138]:
brave_score.most_common(10)

[('brave', -0.0),
 ('ramone', -0.6518620020234145),
 ('cautions', -0.6570388689372177),
 ('culminating', -0.660379190166895),
 ('risdon', -0.664876917580303),
 ('familiarly', -0.6661246325585951),
 ('ncos', -0.6673909117361397),
 ('scorpion', -0.6679715838956195),
 ('wobbles', -0.6689169531906572),
 ('maniquen', -0.6703335445914319)]