In [2]:
import sys

f = open('reviews-full.txt','r')
#f = open('reviews_10.txt','r')
raw_reviews = f.readlines() #returns 10 lines
f.close()
#print(len(raw_reviews))

f = open('labels-full.txt', 'r')
#f = open('labels_10.txt', 'r')
raw_labels = f.readlines() #returns 10 labels
f.close()
#print(len(raw_labels))

fn = lambda x:set(x.split(" ")) #function that splits the review and creates a set of words in a single review

#tokens is a list of sets of words in each review word 
tokens = list(map(fn,raw_reviews)) #takes the set of words for a given review and converts into list of words
#print(tokens) #-- len=10 -- [{'believe','pomp',..} ,{'insane','it',...} ...]

vocab = set() #vocab of all unique words in all reviews
for sent in tokens: #for each review
    #print(sent)
    for word in sent: #for each word in review
        if(len(word)>0): #if word is not empty space, add to vocab
            vocab.add(word)
vocab = list(vocab)
#print(vocab) #-- len=989 -- ['michael', 'bet', 'slow',...]

word2index = {} #dictionary of words and corresponding index
for i,word in enumerate(vocab):
    word2index[word]=i
#print(word2index) #-- {'michael': 0, 'bet': 1, 'slow': 2, ...}
#print(word2index["expect"]) = 512

input_dataset = list() # this is just a list of index of the words in review
for sent in tokens:
    sent_indices = list()
    for word in sent:
        try:
            sent_indices.append(word2index[word])
        except:
            ""
    input_dataset.append(list(set(sent_indices)))
#print(len(input_dataset)) #-- len=10 -- [512, 514, 515,...]


target_dataset = list() #list of labels, converted into 0 and 1s
for label in raw_labels:
    if label == 'positive\n':
        target_dataset.append(1)
    else:
        target_dataset.append(0)
#print(target_dataset) - [1, 0, 1,...]

In [3]:
import numpy as np
np.random.seed(1)

#Sigmoid function returns a value between 0 and 1, only positive numbers, treating it as a probability
def sigmoid(x): 
    #print(f"pre-sigmoid= {x}")
    #a = 1/(1 + np.exp(-x))
    #print(f"sigmoid = {a}")
    return 1/(1 + np.exp(-x))

alpha, iterations = (0.01, 1)
hidden_size = 100

weights_0_1 = 0.2*np.random.random((len(vocab),hidden_size)) - 0.1 #shape=(989,100)
weights_1_2 = 0.2*np.random.random((hidden_size,1)) - 0.1 #shape=(100,1)
#print("weights_0_1", weights_0_1[512])

correct,total = (0,0)
for iter in range(iterations):
    for i in range(len(input_dataset)-1000):  #for each review in form of index numbers [512, 514, 515,...]
        x,y = (input_dataset[i],target_dataset[i]) #take each review in form of index of words
        print(x)
        
        
        #Since the onehot vector is a bunch of 0 and 1, its dotproduct with weights matrix will be equal to sum of weights (weights*1+weights*0)
        #Thus, lets save time and sum up weights of all words in review, axis=0 means add first column to give us layer-1
        #This layer_1 is called "embedding layer" and we do not need layer_0.
        #weights_0_1[x].shape = (100,) - Its a vector of 100 weights
        layer_1 = sigmoid(np.sum(weights_0_1[x],axis=0)) #weights[nparray] to get weights only of indexes in nparray
        #print(f"layer_1={i}",layer_1)

        layer_2 = sigmoid(np.dot(layer_1,weights_1_2))
        layer_2_delta = layer_2 - y                        
        layer_1_delta = layer_2_delta.dot(weights_1_2.T) #No reludervative since we used sigmoid  fn  
        weights_0_1[x] -= layer_1_delta * alpha #adjust weight for each review
        weights_1_2 -= np.outer(layer_1,layer_2_delta) * alpha #Outer multiplication produces a matrix
        #print(weights_1_2.shape) #-(100,)

        if(np.abs(layer_2_delta) < 0.5):
            correct += 1
        total += 1
        if(i % 10 == 9):
            progress = str(i/float(len(input_dataset)))
            sys.stdout.write('\rIter:'+str(iter)\
                             +' Progress:'+progress[2:4]\
                             +'.'+progress[4:6]\
                             +'% Training Accuracy:'\
                             + str(correct/float(total)) + '%')
    print()
    
correct,total = (0,0)
for i in range(len(input_dataset)-1000,len(input_dataset)):
    x = input_dataset[i]
    y = target_dataset[i]

    layer_1 = sigmoid(np.sum(weights_0_1[x],axis=0))
    layer_2 = sigmoid(np.dot(layer_1,weights_1_2))

    if(np.abs(layer_2 - y) < 0.5):
        correct += 1
        total += 1
        

print("Test Accuracy:" + str(correct / float(total)))
print("end of test")

Iter:0 Progress:95.99% Training Accuracy:0.832125%75031263%%%
Iter:1 Progress:95.99% Training Accuracy:0.8662916666666667%
Iter:2 Progress:95.99% Training Accuracy:0.88425%7030143075%
Iter:3 Progress:95.99% Training Accuracy:0.8973125%00229191%
Iter:4 Progress:95.99% Training Accuracy:0.9075083333333334%
Iter:5 Progress:95.99% Training Accuracy:0.9159166666666667%
Iter:6 Progress:95.99% Training Accuracy:0.9230773809523809%
Iter:7 Progress:95.99% Training Accuracy:0.9294166666666667%
Iter:8 Progress:95.99% Training Accuracy:0.9350925925925926%
Iter:9 Progress:95.99% Training Accuracy:0.9401875%77086545%
Test Accuracy:1.0
end of test


In [97]:
from collections import Counter
import math 

def similar(target='believe'):
    target_index = word2index[target]
    scores = Counter()
    for word,index in word2index.items():
        raw_difference = weights_0_1[index] - (weights_0_1[target_index])
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))

    return scores.most_common(10)

In [111]:
print(similar('fellow'))

[('fellow', -0.0), ('celebrities', -0.6802127785236843), ('covers', -0.6850515746886674), ('facing', -0.699074121432117), ('extravagant', -0.6994251782421944), ('fascinating', -0.7008795225057505), ('campers', -0.7031968054894266), ('photograph', -0.7060434355660766), ('faults', -0.7079402503159996), ('tend', -0.7083191561637148)]


## Averaged word vector from Chap 12

In [8]:
import numpy as np
from collections import Counter

norms = np.sum(weights_0_1 * weights_0_1,axis=1)
#print(norms.shape) #--shape=(74074,) -- [0.35052413 0.40099685 0.32382926 ...]
norms.resize(norms.shape[0],1)
normed_weights = weights_0_1 * norms

def make_sent_vect(words):
    indices = list(map(lambda x:word2index[x],filter(lambda x:x in word2index,words)))
    return np.mean(normed_weights[indices],axis=0)

reviews2vectors = list()
for review in tokens: # tokenized reviews
    reviews2vectors.append(make_sent_vect(review))
reviews2vectors = np.array(reviews2vectors)

def most_similar_reviews(review):
    v = make_sent_vect(review)
    scores = Counter()
    for i,val in enumerate(reviews2vectors.dot(v)):
        scores[i] = val
    most_similar = list()
    
    for idx,score in scores.most_common(3):
        most_similar.append(raw_reviews[idx][0:40])
    return most_similar

#most_similar_reviews(['boring','awful'])

74074
