### Capturing word correlation in input data

In [1]:
import numpy as np
onehots = {}
onehots['cat'] = np.array([1,0,0,0])
onehots['the'] = np.array([0,1,0,0])
onehots['dog'] = np.array([0,0,1,0])
onehots['sat'] = np.array([0,0,0,1])

sentence = ['the','cat','sat']
x = onehots[sentence[0]]+onehots[sentence[1]]+onehots[sentence[2]]

print("Sentence Encoding:" + str(x))

Sentence Encoding:[1 1 0 1]


### Predicitng movie reviews

In [2]:
import sys


In [3]:
# reading the reviews and labels into a list
f = open('reviews.txt')
raw_reviews = f.readlines()
f.close()

f = open('labels.txt')
raw_labels = f.readlines()
f.close()

In [4]:
# mapping each review as tokens
tokens = list(map(lambda x:set(x.split(" ")),raw_reviews))

In [5]:
"""
creating a set of vocab taking each word from tokens and if len > 0
we add it to the list of words
"""
vocab = set()
for sent in tokens:
    for word in sent:
        if(len(word)>0):
            vocab.add(word)
vocab = list(vocab)

In [6]:
#enumerating each word to find assign index for each of the words

word2index = {}
for i,word in enumerate(vocab):
    word2index[word]=i

In [7]:
# Converting each sentence to index and then creating set of each token to avoid duplicates

input_dataset = list()
for sent in tokens:
    sent_indices = list()
    for word in sent:
        try:
            sent_indices.append(word2index[word])
        except:
            ""
    input_dataset.append(list(set(sent_indices)))

In [8]:
# Converting labels to 0 and 1s
target_dataset = list()
for labels in raw_labels:
    if labels == 'positive\n':
        target_dataset.append(1)
    else:
        target_dataset.append(0)

In [9]:
import numpy as np
np.random.seed(1)

In [10]:
def sigmoid(x):
    return 1/(1+np.exp(-x))

In [11]:
alpha, iteration = (0.01,2)
hidden_size = 100

In [12]:
weights_0_1 = 0.2*np.random.random((len(vocab),hidden_size)) - 0.1
weights_1_2 = 0.2*np.random.random((hidden_size,1)) - 0.1

#### Training with 2 layer network

In [14]:
correct,total = (0,0)
for iter in range(iteration):
    for i in range(len(input_dataset)-1000):
        x,y = (input_dataset[i],target_dataset[i])
        layer_1 = sigmoid(np.sum(weights_0_1[x],axis=0))
        layer_2 = sigmoid(np.dot(layer_1,weights_1_2))
        
        layer_2_delta = layer_2 - y
        layer_1_delta = layer_2_delta.dot(weights_1_2.T)
        
        weights_0_1[x] -= layer_1_delta * alpha
        weights_1_2 -= np.outer(layer_1,layer_2_delta)*alpha
        
        if(np.abs(layer_2_delta)<0.5):
            correct += 1
        total += 1
        if(i % 10 == 9):
            progress = str(i/float(len(input_dataset)))
            sys.stdout.write('\rIter:'+str(iter)\
                +' Progress:'+progress[2:4]\
                             +'.'+progress[4:6]\
                +'% Training Accuracy:'\
                + str(correct/float(total)) + '%')
    print()
correct,total = (0,0)
for i in range(len(input_dataset)-1000,len(input_dataset)):
    x = input_dataset[i]
    y = target_dataset[i]
    layer_1 = sigmoid(np.sum(weights_0_1[x],axis=0)) 
    layer_2 = sigmoid(np.dot(layer_1,weights_1_2))
    correct += 1
    total += 1
print("Test Accuracy:" + str(correct / float(total)))

Iter:0 Progress:00.03% Training Accuracy:0.2%Iter:0 Progress:00.07% Training Accuracy:0.1%Iter:0 Progress:00.11% Training Accuracy:0.06666666666666667%Iter:0 Progress:00.15% Training Accuracy:0.15%Iter:0 Progress:00.19% Training Accuracy:0.18%Iter:0 Progress:00.23% Training Accuracy:0.15%Iter:0 Progress:00.27% Training Accuracy:0.15714285714285714%Iter:0 Progress:00.31% Training Accuracy:0.15%Iter:0 Progress:00.35% Training Accuracy:0.14444444444444443%Iter:0 Progress:00.39% Training Accuracy:0.16%Iter:0 Progress:00.43% Training Accuracy:0.17272727272727273%Iter:0 Progress:00.47% Training Accuracy:0.2%Iter:0 Progress:00.51% Training Accuracy:0.19230769230769232%Iter:0 Progress:00.55% Training Accuracy:0.19285714285714287%Iter:0 Progress:00.59% Training Accuracy:0.20666666666666667%Iter:0 Progress:00.63% Training Accuracy:0.2125%Iter:0 Progress:00.67% Training Accuracy:0.2235294117647059%Iter:0 Progress:00.71% Training Accuracy:0.22777777777777777%Iter:0 Progress:00.7

Iter:0 Progress:05.63% Training Accuracy:0.5765957446808511%Iter:0 Progress:05.67% Training Accuracy:0.576056338028169%Iter:0 Progress:05.71% Training Accuracy:0.5783216783216784%Iter:0 Progress:05.75% Training Accuracy:0.5791666666666667%Iter:0 Progress:05.79% Training Accuracy:0.58%Iter:0 Progress:05.83% Training Accuracy:0.5821917808219178%Iter:0 Progress:05.87% Training Accuracy:0.582312925170068%Iter:0 Progress:05.91% Training Accuracy:0.5831081081081081%Iter:0 Progress:05.95% Training Accuracy:0.5852348993288591%Iter:0 Progress:05.99% Training Accuracy:0.5873333333333334%Iter:0 Progress:06.03% Training Accuracy:0.590066225165563%Iter:0 Progress:06.07% Training Accuracy:0.5921052631578947%Iter:0 Progress:06.11% Training Accuracy:0.5941176470588235%Iter:0 Progress:06.15% Training Accuracy:0.5954545454545455%Iter:0 Progress:06.19% Training Accuracy:0.5961290322580645%Iter:0 Progress:06.23% Training Accuracy:0.5967948717948718%Iter:0 Progress:06.27% Training Accuracy

Iter:0 Progress:11.67% Training Accuracy:0.6914383561643835%Iter:0 Progress:11.71% Training Accuracy:0.6918088737201366%Iter:0 Progress:11.75% Training Accuracy:0.6918367346938775%Iter:0 Progress:11.79% Training Accuracy:0.6922033898305084%Iter:0 Progress:11.83% Training Accuracy:0.6922297297297297%Iter:0 Progress:11.87% Training Accuracy:0.6922558922558922%Iter:0 Progress:11.91% Training Accuracy:0.6919463087248322%Iter:0 Progress:11.95% Training Accuracy:0.6919732441471572%Iter:0 Progress:11.99% Training Accuracy:0.6923333333333334%Iter:0 Progress:12.03% Training Accuracy:0.6930232558139535%Iter:0 Progress:12.07% Training Accuracy:0.6927152317880795%Iter:0 Progress:12.11% Training Accuracy:0.6937293729372938%Iter:0 Progress:12.15% Training Accuracy:0.6947368421052632%Iter:0 Progress:12.19% Training Accuracy:0.6950819672131148%Iter:0 Progress:12.23% Training Accuracy:0.696078431372549%Iter:0 Progress:12.27% Training Accuracy:0.6960912052117264%Iter:0 Progress:12.31% T

Iter:0 Progress:17.83% Training Accuracy:0.7278026905829597%Iter:0 Progress:17.87% Training Accuracy:0.7281879194630873%Iter:0 Progress:17.91% Training Accuracy:0.7279017857142858%Iter:0 Progress:17.95% Training Accuracy:0.7280623608017818%Iter:0 Progress:17.99% Training Accuracy:0.7282222222222222%Iter:0 Progress:18.03% Training Accuracy:0.7288248337028825%Iter:0 Progress:18.07% Training Accuracy:0.7292035398230089%Iter:0 Progress:18.11% Training Accuracy:0.7291390728476821%Iter:0 Progress:18.15% Training Accuracy:0.7290748898678414%Iter:0 Progress:18.19% Training Accuracy:0.729010989010989%Iter:0 Progress:18.23% Training Accuracy:0.7291666666666666%Iter:0 Progress:18.27% Training Accuracy:0.7297592997811816%Iter:0 Progress:18.31% Training Accuracy:0.7296943231441048%Iter:0 Progress:18.35% Training Accuracy:0.7294117647058823%Iter:0 Progress:18.39% Training Accuracy:0.7295652173913043%Iter:0 Progress:18.43% Training Accuracy:0.7297180043383948%Iter:0 Progress:18.47% T

Iter:0 Progress:24.03% Training Accuracy:0.75108153078203%Iter:0 Progress:24.07% Training Accuracy:0.751328903654485%Iter:0 Progress:24.11% Training Accuracy:0.751575456053068%Iter:0 Progress:24.15% Training Accuracy:0.7509933774834437%Iter:0 Progress:24.19% Training Accuracy:0.7510743801652893%Iter:0 Progress:24.23% Training Accuracy:0.7504950495049505%Iter:0 Progress:24.27% Training Accuracy:0.7505766062602965%Iter:0 Progress:24.31% Training Accuracy:0.7508223684210527%Iter:0 Progress:24.35% Training Accuracy:0.7510673234811166%Iter:0 Progress:24.39% Training Accuracy:0.7513114754098361%Iter:0 Progress:24.43% Training Accuracy:0.7515548281505728%Iter:0 Progress:24.47% Training Accuracy:0.7519607843137255%Iter:0 Progress:24.51% Training Accuracy:0.7522022838499184%Iter:0 Progress:24.55% Training Accuracy:0.7522801302931597%Iter:0 Progress:24.59% Training Accuracy:0.7526829268292683%Iter:0 Progress:24.63% Training Accuracy:0.7530844155844156%Iter:0 Progress:24.67% Trai

Iter:0 Progress:30.03% Training Accuracy:0.7716378162450066%Iter:0 Progress:30.07% Training Accuracy:0.7716755319148936%Iter:0 Progress:30.11% Training Accuracy:0.7717131474103586%Iter:0 Progress:30.15% Training Accuracy:0.7718832891246684%Iter:0 Progress:30.19% Training Accuracy:0.7720529801324504%Iter:0 Progress:30.23% Training Accuracy:0.7723544973544973%Iter:0 Progress:30.27% Training Accuracy:0.772655217965654%Iter:0 Progress:30.31% Training Accuracy:0.7726912928759895%Iter:0 Progress:30.35% Training Accuracy:0.7728590250329381%Iter:0 Progress:30.39% Training Accuracy:0.7726315789473684%Iter:0 Progress:30.43% Training Accuracy:0.7725361366622865%Iter:0 Progress:30.47% Training Accuracy:0.7724409448818897%Iter:0 Progress:30.51% Training Accuracy:0.772608125819135%Iter:0 Progress:30.55% Training Accuracy:0.7727748691099476%Iter:0 Progress:30.59% Training Accuracy:0.7729411764705882%Iter:0 Progress:30.63% Training Accuracy:0.7728459530026109%Iter:0 Progress:30.67% Tr

Iter:0 Progress:35.43% Training Accuracy:0.7826185101580135%Iter:0 Progress:35.47% Training Accuracy:0.782750845546787%Iter:0 Progress:35.51% Training Accuracy:0.7828828828828829%Iter:0 Progress:35.55% Training Accuracy:0.7830146231721035%Iter:0 Progress:35.59% Training Accuracy:0.7832584269662921%Iter:0 Progress:35.63% Training Accuracy:0.7832772166105499%Iter:0 Progress:35.67% Training Accuracy:0.7835201793721973%Iter:0 Progress:35.71% Training Accuracy:0.7836506159014558%Iter:0 Progress:35.75% Training Accuracy:0.7835570469798657%Iter:0 Progress:35.79% Training Accuracy:0.7837988826815643%Iter:0 Progress:35.83% Training Accuracy:0.7838169642857142%Iter:0 Progress:35.87% Training Accuracy:0.7836120401337793%Iter:0 Progress:35.91% Training Accuracy:0.7837416481069043%Iter:0 Progress:35.95% Training Accuracy:0.7839822024471635%Iter:0 Progress:35.99% Training Accuracy:0.7841111111111111%Iter:0 Progress:36.03% Training Accuracy:0.7842397336293008%Iter:0 Progress:36.07% T

Iter:0 Progress:41.03% Training Accuracy:0.7947368421052632%Iter:0 Progress:41.07% Training Accuracy:0.7949367088607595%Iter:0 Progress:41.11% Training Accuracy:0.7948443579766536%Iter:0 Progress:41.15% Training Accuracy:0.7947521865889213%Iter:0 Progress:41.19% Training Accuracy:0.7947572815533981%Iter:0 Progress:41.23% Training Accuracy:0.7948593598448108%Iter:0 Progress:41.27% Training Accuracy:0.7948643410852713%Iter:0 Progress:41.31% Training Accuracy:0.7947725072604066%Iter:0 Progress:41.35% Training Accuracy:0.7946808510638298%Iter:0 Progress:41.39% Training Accuracy:0.7948792270531401%Iter:0 Progress:41.43% Training Accuracy:0.79507722007722%Iter:0 Progress:41.47% Training Accuracy:0.795274831243973%Iter:0 Progress:41.51% Training Accuracy:0.7953757225433526%Iter:0 Progress:41.55% Training Accuracy:0.7951876804619826%Iter:0 Progress:41.59% Training Accuracy:0.7951923076923076%Iter:0 Progress:41.63% Training Accuracy:0.7953890489913544%Iter:0 Progress:41.67% Tra

Iter:0 Progress:47.03% Training Accuracy:0.8016156462585035%Iter:0 Progress:47.07% Training Accuracy:0.8016992353440952%Iter:0 Progress:47.11% Training Accuracy:0.8016129032258065%Iter:0 Progress:47.15% Training Accuracy:0.8014418999151823%Iter:0 Progress:47.19% Training Accuracy:0.8015254237288135%Iter:0 Progress:47.23% Training Accuracy:0.8016088060965284%Iter:0 Progress:47.27% Training Accuracy:0.8016920473773266%Iter:0 Progress:47.31% Training Accuracy:0.8018596787827557%Iter:0 Progress:47.35% Training Accuracy:0.802027027027027%Iter:0 Progress:47.39% Training Accuracy:0.8020253164556962%Iter:0 Progress:47.43% Training Accuracy:0.8019392917369309%Iter:0 Progress:47.47% Training Accuracy:0.802106149957877%Iter:0 Progress:47.51% Training Accuracy:0.8021885521885522%Iter:0 Progress:47.55% Training Accuracy:0.8021026072329689%Iter:0 Progress:47.59% Training Accuracy:0.802016806722689%Iter:0 Progress:47.63% Training Accuracy:0.8020990764063812%Iter:0 Progress:47.67% Tra

Iter:0 Progress:52.79% Training Accuracy:0.8076515151515151%Iter:0 Progress:52.83% Training Accuracy:0.8076457229371689%Iter:0 Progress:52.87% Training Accuracy:0.8076399394856278%Iter:0 Progress:52.91% Training Accuracy:0.8077097505668934%Iter:0 Progress:52.95% Training Accuracy:0.8077794561933535%Iter:0 Progress:52.99% Training Accuracy:0.8079245283018868%Iter:0 Progress:53.03% Training Accuracy:0.8080693815987934%Iter:0 Progress:53.07% Training Accuracy:0.8080633006782215%Iter:0 Progress:53.11% Training Accuracy:0.8079819277108434%Iter:0 Progress:53.15% Training Accuracy:0.8080511662904439%Iter:0 Progress:53.19% Training Accuracy:0.8080451127819549%Iter:0 Progress:53.23% Training Accuracy:0.8081141998497371%Iter:0 Progress:53.27% Training Accuracy:0.8081081081081081%Iter:0 Progress:53.31% Training Accuracy:0.8080270067516879%Iter:0 Progress:53.35% Training Accuracy:0.8078710644677661%Iter:0 Progress:53.39% Training Accuracy:0.8077153558052435%Iter:0 Progress:53.43% 

Iter:0 Progress:95.99% Training Accuracy:0.83275%6515214673%
Iter:1 Progress:95.99% Training Accuracy:0.8665%38570535528%
Test Accuracy:1.0


#### Comparing word embeddings

In [18]:
from collections import Counter
import math 

def similar(target='beautiful'):
    target_index = word2index[target]
    scores = Counter()
    for word,index in word2index.items():
        raw_difference = weights_0_1[index] - (weights_0_1[target_index])
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))

    return scores.most_common(10)

In [19]:
print(similar('beautiful'))

[('beautiful', -0.0), ('great', -0.7200525239027686), ('appreciated', -0.7336346571486223), ('true', -0.7431760229710972), ('masterpiece', -0.748550344826609), ('ride', -0.7499993307722741), ('innocent', -0.7524499504157217), ('simple', -0.7651353019024564), ('sweet', -0.7683005370533182), ('worlds', -0.7689800163230051)]


In [20]:
print(similar('terrible'))

[('terrible', -0.0), ('badly', -0.7505059803595728), ('avoid', -0.756439386511795), ('boring', -0.7942688893924995), ('annoying', -0.7945404447524166), ('worse', -0.810109328606382), ('horrible', -0.8172600822957408), ('disappointment', -0.8269188772893417), ('fails', -0.8394884990136618), ('lacks', -0.8435591977629718)]
