### Capturing word correlation in input data

In [1]:
import numpy as np
# Toy four-word vocabulary: every word gets a vector with a single 1 in its own slot.
onehots = {
    'cat': np.array([1, 0, 0, 0]),
    'the': np.array([0, 1, 0, 0]),
    'dog': np.array([0, 0, 1, 0]),
    'sat': np.array([0, 0, 0, 1]),
}

# A sentence is encoded as the element-wise sum of its words' one-hot vectors,
# so the result marks which vocabulary words occur in the sentence.
sentence = ['the', 'cat', 'sat']
x = sum(onehots[word] for word in sentence)

print("Sentence Encoding:" + str(x))

Sentence Encoding:[1 1 0 1]


### Predicting movie reviews

In [2]:
import sys


In [3]:
# Read the reviews and their labels into lists, one entry per line of each file.
# `with` guarantees each file handle is closed even if reading raises.
with open('reviews.txt') as reviews_file:
    raw_reviews = reviews_file.readlines()

with open('labels.txt') as labels_file:
    raw_labels = labels_file.readlines()

In [4]:
# Turn each review into the set of its space-separated tokens
# (a set, so a word repeated within one review is kept only once).
tokens = [set(review.split(" ")) for review in raw_reviews]

In [5]:
"""
creating a set of vocab taking each word from tokens and if len > 0
we add it to the list of words
"""
# Gather every distinct non-empty token across all reviews, then freeze the
# result into a list so each word has a fixed position.
vocab = list({word for sent in tokens for word in sent if len(word) > 0})

In [6]:
# Map every vocabulary word to its position in `vocab`.

word2index = {word: index for index, word in enumerate(vocab)}

In [7]:
# Convert each review's tokens into a duplicate-free list of vocabulary indices.
# Tokens that have no entry in `word2index` (e.g. the empty string, which is
# excluded when the vocabulary is built) are skipped explicitly instead of being
# hidden behind a bare `except`, which would also swallow unrelated errors.

input_dataset = list()
for sent in tokens:
    sent_indices = {word2index[word] for word in sent if word in word2index}
    input_dataset.append(list(sent_indices))

In [9]:
# Convert the textual labels to 1 (positive) or 0 (anything else).
# Each line is stripped before comparing so that a final line without a
# trailing newline, or stray surrounding whitespace, is still recognised.
target_dataset = [1 if label.strip() == 'positive' else 0 for label in raw_labels]

In [11]:
import numpy as np
# Seed NumPy's global random generator so the randomly initialised weights are reproducible.
np.random.seed(1)

In [12]:
def sigmoid(x):
    """Return the logistic function 1 / (1 + e^(-x)) of ``x``.

    ``x`` may be a scalar or a NumPy array; for arrays the function is
    applied element-wise and the result has the same shape.
    """
    decay = np.exp(-x)
    return 1/(decay+1)

In [13]:
# Hyperparameters of the network.
alpha = 0.01        # step size applied to every weight update
iteration = 2       # number of passes over the training reviews
hidden_size = 100   # number of units in the hidden layer

In [14]:
# Initialise both weight matrices with uniform random values in [-0.1, 0.1):
# np.random.random draws from [0, 1), which is then scaled by 0.2 and shifted by -0.1.
# weights_0_1 has one row per vocabulary word; weights_1_2 maps the hidden layer to one output.
weights_0_1 = np.random.random((len(vocab),hidden_size))*0.2 - 0.1
weights_1_2 = np.random.random((hidden_size,1))*0.2 - 0.1

In [21]:
# Train the two-layer network one review at a time on every review except the
# last 1000, then measure accuracy on those 1000 held-out reviews.
#
# Each input `x` is a list of word indices, so summing the selected rows of
# weights_0_1 gives the hidden-layer pre-activation for that review.
train_size = len(input_dataset)-1000

# The counters are not reset between passes, so the training accuracy printed
# below is cumulative over all passes so far.
correct,total = (0,0)
for epoch in range(iteration):  # `epoch` rather than `iter`, which would shadow the builtin
    for i in range(train_size):
        x,y = (input_dataset[i],target_dataset[i])

        # Forward pass: hidden layer from the review's word rows, then one output.
        layer_1 = sigmoid(np.sum(weights_0_1[x],axis=0))
        layer_2 = sigmoid(np.dot(layer_1,weights_1_2))

        # Backward pass: output error, propagated back through weights_1_2.
        # NOTE(review): layer_1_delta is not scaled by the sigmoid derivative
        # layer_1 * (1 - layer_1); left unchanged because altering it would
        # change the training results.
        layer_2_delta = layer_2 - y
        layer_1_delta = layer_2_delta.dot(weights_1_2.T)

        # Only the rows of the words present in this review are updated.
        weights_0_1[x] -= layer_1_delta * alpha
        weights_1_2 -= np.outer(layer_1,layer_2_delta)*alpha

        # A prediction counts as correct when it is within 0.5 of the target.
        if(np.abs(layer_2_delta)<0.5):
            correct += 1
        total += 1

        # Report progress every 10th review, overwriting the same console line.
        # NOTE(review): progress is measured against the full dataset length, so
        # it never reaches 100%, and the accuracy is a fraction in [0, 1] even
        # though the message appends '%'.
        if(i % 10 == 9):
            progress = str(i/float(len(input_dataset)))
            sys.stdout.write('\rIter:'+str(epoch)\
                +' Progress:'+progress[2:4]\
                             +'.'+progress[4:6]\
                +'% Training Accuracy:'\
                + str(correct/float(total)) + '%')
    print()

# Evaluate on the held-out reviews; the weights are not updated here.
correct,total = (0,0)
for i in range(train_size,len(input_dataset)):
    x = input_dataset[i]
    y = target_dataset[i]
    layer_1 = sigmoid(np.sum(weights_0_1[x],axis=0))
    layer_2 = sigmoid(np.dot(layer_1,weights_1_2))
    # Same correctness criterion as in training; without this condition the
    # indented `correct += 1` is a syntax error.
    if(np.abs(layer_2 - y) < 0.5):
        correct += 1
    total += 1
print("Test Accuracy:" + str(correct / float(total)))

Iter:0 Progress:00.03% Training Accuracy:0.3%Iter:0 Progress:00.07% Training Accuracy:0.25%Iter:0 Progress:00.11% Training Accuracy:0.2%Iter:0 Progress:00.15% Training Accuracy:0.2%Iter:0 Progress:00.19% Training Accuracy:0.2%Iter:0 Progress:00.23% Training Accuracy:0.2%Iter:0 Progress:00.27% Training Accuracy:0.18571428571428572%Iter:0 Progress:00.31% Training Accuracy:0.2%Iter:0 Progress:00.35% Training Accuracy:0.2222222222222222%Iter:0 Progress:00.39% Training Accuracy:0.21%Iter:0 Progress:00.43% Training Accuracy:0.21818181818181817%Iter:0 Progress:00.47% Training Accuracy:0.20833333333333334%Iter:0 Progress:00.51% Training Accuracy:0.2%Iter:0 Progress:00.55% Training Accuracy:0.2%Iter:0 Progress:00.59% Training Accuracy:0.2%Iter:0 Progress:00.63% Training Accuracy:0.19375%Iter:0 Progress:00.67% Training Accuracy:0.20588235294117646%Iter:0 Progress:00.71% Training Accuracy:0.21666666666666667%Iter:0 Progress:00.75% Training Accuracy:0.22631578947368422%Iter:0 P

Iter:0 Progress:05.67% Training Accuracy:0.545774647887324%Iter:0 Progress:05.71% Training Accuracy:0.5475524475524476%Iter:0 Progress:05.75% Training Accuracy:0.5486111111111112%Iter:0 Progress:05.79% Training Accuracy:0.5503448275862068%Iter:0 Progress:05.83% Training Accuracy:0.5506849315068493%Iter:0 Progress:05.87% Training Accuracy:0.5510204081632653%Iter:0 Progress:05.91% Training Accuracy:0.5527027027027027%Iter:0 Progress:05.95% Training Accuracy:0.553020134228188%Iter:0 Progress:05.99% Training Accuracy:0.556%Iter:0 Progress:06.03% Training Accuracy:0.5582781456953643%Iter:0 Progress:06.07% Training Accuracy:0.5605263157894737%Iter:0 Progress:06.11% Training Accuracy:0.5627450980392157%Iter:0 Progress:06.15% Training Accuracy:0.564935064935065%Iter:0 Progress:06.19% Training Accuracy:0.5651612903225807%Iter:0 Progress:06.23% Training Accuracy:0.5653846153846154%Iter:0 Progress:06.27% Training Accuracy:0.5668789808917197%Iter:0 Progress:06.31% Training Accurac

Iter:0 Progress:11.71% Training Accuracy:0.6802047781569965%Iter:0 Progress:11.75% Training Accuracy:0.6802721088435374%Iter:0 Progress:11.79% Training Accuracy:0.680677966101695%Iter:0 Progress:11.83% Training Accuracy:0.6807432432432432%Iter:0 Progress:11.87% Training Accuracy:0.6804713804713804%Iter:0 Progress:11.91% Training Accuracy:0.6805369127516778%Iter:0 Progress:11.95% Training Accuracy:0.6806020066889632%Iter:0 Progress:11.99% Training Accuracy:0.6813333333333333%Iter:0 Progress:12.03% Training Accuracy:0.6813953488372093%Iter:0 Progress:12.07% Training Accuracy:0.680794701986755%Iter:0 Progress:12.11% Training Accuracy:0.6818481848184819%Iter:0 Progress:12.15% Training Accuracy:0.6825657894736842%Iter:0 Progress:12.19% Training Accuracy:0.6826229508196722%Iter:0 Progress:12.23% Training Accuracy:0.6833333333333333%Iter:0 Progress:12.27% Training Accuracy:0.6833876221498372%Iter:0 Progress:12.31% Training Accuracy:0.6837662337662338%Iter:0 Progress:12.35% Tr

Iter:0 Progress:17.67% Training Accuracy:0.7194570135746606%Iter:0 Progress:17.71% Training Accuracy:0.7198645598194131%Iter:0 Progress:17.75% Training Accuracy:0.7193693693693693%Iter:0 Progress:17.79% Training Accuracy:0.7195505617977528%Iter:0 Progress:17.83% Training Accuracy:0.7201793721973094%Iter:0 Progress:17.87% Training Accuracy:0.7203579418344519%Iter:0 Progress:17.91% Training Accuracy:0.7198660714285714%Iter:0 Progress:17.95% Training Accuracy:0.7200445434298441%Iter:0 Progress:17.99% Training Accuracy:0.7204444444444444%Iter:0 Progress:18.03% Training Accuracy:0.7210643015521064%Iter:0 Progress:18.07% Training Accuracy:0.7214601769911504%Iter:0 Progress:18.11% Training Accuracy:0.7216335540838852%Iter:0 Progress:18.15% Training Accuracy:0.7215859030837004%Iter:0 Progress:18.19% Training Accuracy:0.7215384615384616%Iter:0 Progress:18.23% Training Accuracy:0.7214912280701754%Iter:0 Progress:18.27% Training Accuracy:0.7221006564551422%Iter:0 Progress:18.31% 

Iter:0 Progress:23.51% Training Accuracy:0.7433673469387755%Iter:0 Progress:23.55% Training Accuracy:0.7432937181663837%Iter:0 Progress:23.59% Training Accuracy:0.7433898305084746%Iter:0 Progress:23.63% Training Accuracy:0.7431472081218274%Iter:0 Progress:23.67% Training Accuracy:0.7432432432432432%Iter:0 Progress:23.71% Training Accuracy:0.7433389544688027%Iter:0 Progress:23.75% Training Accuracy:0.7436026936026936%Iter:0 Progress:23.79% Training Accuracy:0.7440336134453781%Iter:0 Progress:23.83% Training Accuracy:0.7439597315436242%Iter:0 Progress:23.87% Training Accuracy:0.7442211055276382%Iter:0 Progress:23.91% Training Accuracy:0.7441471571906354%Iter:0 Progress:23.95% Training Accuracy:0.7442404006677796%Iter:0 Progress:23.99% Training Accuracy:0.7443333333333333%Iter:0 Progress:24.03% Training Accuracy:0.7444259567387688%Iter:0 Progress:24.07% Training Accuracy:0.7446843853820598%Iter:0 Progress:24.11% Training Accuracy:0.7449419568822554%Iter:0 Progress:24.15% 

Iter:0 Progress:29.79% Training Accuracy:0.7667114093959732%Iter:0 Progress:29.83% Training Accuracy:0.7667560321715817%Iter:0 Progress:29.87% Training Accuracy:0.7666666666666667%Iter:0 Progress:29.91% Training Accuracy:0.7667112299465241%Iter:0 Progress:29.95% Training Accuracy:0.7667556742323097%Iter:0 Progress:29.99% Training Accuracy:0.7668%Iter:0 Progress:30.03% Training Accuracy:0.7669773635153129%Iter:0 Progress:30.07% Training Accuracy:0.7670212765957447%Iter:0 Progress:30.11% Training Accuracy:0.7670650730411687%Iter:0 Progress:30.15% Training Accuracy:0.7672413793103449%Iter:0 Progress:30.19% Training Accuracy:0.7674172185430463%Iter:0 Progress:30.23% Training Accuracy:0.7677248677248677%Iter:0 Progress:30.27% Training Accuracy:0.7680317040951122%Iter:0 Progress:30.31% Training Accuracy:0.7680738786279684%Iter:0 Progress:30.35% Training Accuracy:0.7682476943346509%Iter:0 Progress:30.39% Training Accuracy:0.7680263157894737%Iter:0 Progress:30.43% Training Acc

Iter:0 Progress:35.59% Training Accuracy:0.7795505617977528%Iter:0 Progress:35.63% Training Accuracy:0.7795735129068463%Iter:0 Progress:35.67% Training Accuracy:0.7798206278026906%Iter:0 Progress:35.71% Training Accuracy:0.7799552071668533%Iter:0 Progress:35.75% Training Accuracy:0.7798657718120805%Iter:0 Progress:35.79% Training Accuracy:0.7801117318435754%Iter:0 Progress:35.83% Training Accuracy:0.7801339285714286%Iter:0 Progress:35.87% Training Accuracy:0.7799331103678929%Iter:0 Progress:35.91% Training Accuracy:0.7800668151447662%Iter:0 Progress:35.95% Training Accuracy:0.7803114571746385%Iter:0 Progress:35.99% Training Accuracy:0.7804444444444445%Iter:0 Progress:36.03% Training Accuracy:0.7805771365149834%Iter:0 Progress:36.07% Training Accuracy:0.7803769401330377%Iter:0 Progress:36.11% Training Accuracy:0.7805094130675526%Iter:0 Progress:36.15% Training Accuracy:0.7807522123893805%Iter:0 Progress:36.19% Training Accuracy:0.7808839779005525%Iter:0 Progress:36.23% 

Iter:0 Progress:41.71% Training Accuracy:0.7922339405560882%Iter:0 Progress:41.75% Training Accuracy:0.7923371647509578%Iter:0 Progress:41.79% Training Accuracy:0.7923444976076555%Iter:0 Progress:41.83% Training Accuracy:0.7924474187380497%Iter:0 Progress:41.87% Training Accuracy:0.7924546322827125%Iter:0 Progress:41.91% Training Accuracy:0.792557251908397%Iter:0 Progress:41.95% Training Accuracy:0.7926596758817922%Iter:0 Progress:41.99% Training Accuracy:0.7926666666666666%Iter:0 Progress:42.03% Training Accuracy:0.7927687916270219%Iter:0 Progress:42.07% Training Accuracy:0.7929657794676807%Iter:0 Progress:42.11% Training Accuracy:0.7930674264007598%Iter:0 Progress:42.15% Training Accuracy:0.7930740037950664%Iter:0 Progress:42.19% Training Accuracy:0.7930805687203791%Iter:0 Progress:42.23% Training Accuracy:0.7932765151515152%Iter:0 Progress:42.27% Training Accuracy:0.7929990539262063%Iter:0 Progress:42.31% Training Accuracy:0.7929111531190927%Iter:0 Progress:42.35% T

Iter:0 Progress:47.79% Training Accuracy:0.7990794979079497%Iter:0 Progress:47.83% Training Accuracy:0.7990802675585285%Iter:0 Progress:47.87% Training Accuracy:0.7992481203007519%Iter:0 Progress:47.91% Training Accuracy:0.7994156928213689%Iter:0 Progress:47.95% Training Accuracy:0.7994995829858215%Iter:0 Progress:47.99% Training Accuracy:0.7996666666666666%Iter:0 Progress:48.03% Training Accuracy:0.7998334721065778%Iter:0 Progress:48.07% Training Accuracy:0.8%Iter:0 Progress:48.11% Training Accuracy:0.800166251039069%Iter:0 Progress:48.15% Training Accuracy:0.8001661129568106%Iter:0 Progress:48.19% Training Accuracy:0.8003319502074688%Iter:0 Progress:48.23% Training Accuracy:0.8004145936981758%Iter:0 Progress:48.27% Training Accuracy:0.8005799502899752%Iter:0 Progress:48.31% Training Accuracy:0.8006622516556291%Iter:0 Progress:48.35% Training Accuracy:0.8006617038875103%Iter:0 Progress:48.39% Training Accuracy:0.8006611570247933%Iter:0 Progress:48.43% Training Accurac

Iter:0 Progress:54.11% Training Accuracy:0.8059866962305987%Iter:0 Progress:54.15% Training Accuracy:0.8059822747415066%Iter:0 Progress:54.19% Training Accuracy:0.8059778597785978%Iter:0 Progress:54.23% Training Accuracy:0.806047197640118%Iter:0 Progress:54.27% Training Accuracy:0.8059690493736182%Iter:0 Progress:54.31% Training Accuracy:0.8060382916053019%Iter:0 Progress:54.35% Training Accuracy:0.8060338484179543%Iter:0 Progress:54.39% Training Accuracy:0.8061764705882353%Iter:0 Progress:54.43% Training Accuracy:0.8061719324026451%Iter:0 Progress:54.47% Training Accuracy:0.8061674008810573%Iter:0 Progress:54.51% Training Accuracy:0.8062362435803375%Iter:0 Progress:54.55% Training Accuracy:0.8062316715542522%Iter:0 Progress:54.59% Training Accuracy:0.8063736263736264%Iter:0 Progress:54.63% Training Accuracy:0.8065153733528551%Iter:0 Progress:54.67% Training Accuracy:0.8065106071689832%Iter:0 Progress:54.71% Training Accuracy:0.806359649122807%Iter:0 Progress:54.75% Tr

Iter:0 Progress:95.99% Training Accuracy:0.830625%497707378%
Iter:1 Progress:95.99% Training Accuracy:0.8655%44863513232%
Test Accuracy:0.846
