In [23]:
import sys

In [2]:
f = open('reviews.txt')
raw_reviews = f.readlines()
f.close()

f = open("labels.txt")
raw_labels = f.readlines()
f.close()

In [3]:
tokens = list(map(lambda x:set(x.split(" ")),raw_reviews))

# or 
#def test(x):
#    return set(x.split(" "))

#xx = list(map(test,raw_reviews))

#for review in raw_reviews:
#    review = set(review.split(" "))
#    tokens.append(review)

In [4]:
vocab = set()
for sentence in tokens:
    for word in sentence:
        if(len(word) > 0):
            vocab.add(word)
vocab = list(vocab)
# the vocab is just words

In [5]:
len(vocab)

74074

In [6]:
word2index = {}
index2word = {}
for i,word in enumerate(vocab):
    word2index[word] = i
    index2word[i] = word

In [7]:
input_dataset = list()
for sentence in tokens:
    sent_indices = list()
    for word in sentence:
        try:
            sent_indices.append(word2index[word])
        except:
            ""
    input_dataset.append(list(set(sent_indices)))

In [8]:
target_dataset = list()
for label in raw_labels:
    if label == "positive\n":
        target_dataset.append(1)
    else:
        target_dataset.append(0)

In [9]:
import numpy as np
import time
np.random.seed(1)

In [10]:
def sigmoid(x):
    return 1/(1 + np.exp(-x))


alpha, iterations = (0.01, 10)
hidden_size = 100

# test vocab

In [11]:
weights_0_1 = 0.2 * np.random.random((len(vocab),hidden_size)) - 0.1
weights_1_2 = 0.2 * np.random.random((hidden_size,1)) -0.1

print(weights_0_1.shape)
print(weights_1_2.shape)

(74074, 100)
(100, 1)


In [12]:
correct, total = (0,0)

start_time = time.time()

for iter in range(iterations):
    for i in range(len(input_dataset) - 1000):
        
        x,y = (input_dataset[i],target_dataset[i])
        
        layer_1 = sigmoid(np.sum(weights_0_1[x],axis=0))
        layer_2 = sigmoid(np.dot(layer_1,weights_1_2))
        
        layer_2_delta = layer_2 - y
        layer_1_delta = layer_2_delta.dot(weights_1_2.T)
        
        weights_0_1[x] -= layer_1_delta * alpha
        weights_1_2 -= np.outer(layer_1,layer_2_delta) * alpha
        
        if(np.abs(layer_2_delta) < 0.5): # we are only targeting for NEG anything is near 0 is NEG
            # remember sigmoid only returns one label in our case is [0,1][NEG,POS]
            correct +=1
        total +=1
        
        if(i % 10 == 9):
            progress = str(i/float(len(input_dataset)))
            sys.stdout.write("\rIter:"+str(iter)\
                            +" Progress:"+progress[2:4] \
                            +'.'+progress[4:6]\
                            +"% Training Accuracy:"\
                            + str(correct/float(total)) + "%")
    print("")
print("DONE: ",time.time()-start_time," seconds")

Iter:0 Progress:95.99% Training Accuracy:0.8315416666666666%%
Iter:1 Progress:95.99% Training Accuracy:0.8658958333333333%
Iter:2 Progress:95.99% Training Accuracy:0.8840416666666666%
Iter:3 Progress:95.99% Training Accuracy:0.8972916666666667%
Iter:4 Progress:95.99% Training Accuracy:0.9074666666666666%
Iter:5 Progress:95.99% Training Accuracy:0.9158263888888889%
Iter:6 Progress:95.99% Training Accuracy:0.9229940476190476%
Iter:7 Progress:95.99% Training Accuracy:0.9293177083333334%
Iter:8 Progress:95.99% Training Accuracy:0.9349907407407407%
Iter:9 Progress:95.99% Training Accuracy:0.940075%030209592%
DONE:  92.77481365203857  seconds


In [13]:
x = np.array([0,1,2,3,4,5,6,7,8,9,10])
y = np.array([1,2,3,4,5,6,7,8,9,10,11,12,13])

y[x]

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [14]:
correct,total = (0,0)
start_time = time.time()
for i in range(len(input_dataset)-1000, len(input_dataset)):
    x,y = (input_dataset[i], target_dataset[i])
    
    layer_1 = sigmoid(np.sum(weights_0_1[x],axis=0))
    layer_2 = sigmoid(np.dot(layer_1,weights_1_2))
    
    if (np.abs(layer_2-y) < 0.5):
        correct +=1
    #else:
        #print(np.abs(layer_2-y))
        #print(x)
        sent_word = []
        #for words in x:
            #sent_word.append(index2word[words])
        #print(sent_word)
        
    total +=1
    
print("Test Acc:"+str(correct/float(total)))
print("Test took: ",time.time()-start_time," seconds")

Test Acc:0.835
Test took:  0.26628732681274414  seconds


In [15]:
from collections import Counter
import math

In [2]:
def similar(target="beautiful"):
    target_index = word2index[target]
    scores = Counter()
    for word, index in word2index.items():
        raw_differenece = weights_0_1[index] - (weights_0_1[target_index])
        squared_difference = raw_differenece * raw_differenece
        scores[word] = -math.sqrt(sum(squared_difference))
    return scores.most_common(10)

In [18]:
similar('best')

[('best', -0.0),
 ('cry', -0.7256825122909031),
 ('sixties', -0.7386168825350984),
 ('rule', -0.741802668068826),
 ('fever', -0.7436934215771109),
 ('expressions', -0.7524943933825698),
 ('columbine', -0.7557919424609318),
 ('cb', -0.7566002488482337),
 ('energetic', -0.760822806053922),
 ('contrast', -0.761974052904571)]

# fill in blank

In [1]:
import random
import sys
import numpy as np
from collections import Counter
np.random.seed(1)
import math
random.seed(1)

In [2]:
f = open('reviews.txt')
raw_reviews = f.readlines()
f.close()

In [3]:
tokens = list(map(lambda x:(x.split(" ")),raw_reviews))
wordcnt = Counter()

In [4]:
for sent in tokens:
    for word in sent:
        wordcnt[word] -=1
vocab = list(set(map(lambda x:x[0], wordcnt.most_common())))

In [5]:
word2index = {}

for i,word in enumerate(vocab):
    word2index[word] = i    

In [6]:
concatenated = list()
input_dataset = list()

for sent in tokens:
    sent_indices = list()
    for word in sent:
        try:
            sent_indices.append(word2index[word])
            concatenated.append(word2index[word])
        except:
            ""
    input_dataset.append(sent_indices)
concatenated = np.array(concatenated)

random.shuffle(input_dataset)

In [7]:
len(concatenated)

7459318

In [8]:
alpha, iterations = (0.05, 2)
hidden_size, window, negative = (50,3,5)

# the 5 is for negative sampling six words removes one 5 remains 

weights_0_1 = (np.random.rand(len(vocab),hidden_size) - 0.5) * 0.2
weights_1_2 = np.random.rand(len(vocab),hidden_size) * 0

In [9]:
layer_2_target = np.zeros(negative + 1)
layer_2_target[0] = 1

In [10]:
def similar(target="beautiful"):
    target_index = word2index[target]
    scores = Counter()
    
    for word,index in word2index.items():
        raw_differenece = weights_0_1[index] - (weights_0_1[target_index])
        squared_difference = raw_differenece * raw_differenece
        scores[word] = -math.sqrt(sum(squared_difference))
    return scores.most_common(10)

In [11]:
def sigmoid(x):
    return 1/(1+np.exp(-x))

In [12]:
i = 0
for rev_i, review in enumerate(input_dataset * iterations):
    #print("-"*50)
    #print(rev_i)
    for target_i in range(len(review)):
        
        
        target_samples = [review[target_i]] + list(concatenated\
                                                   [(np.random.rand(negative) * len(concatenated)).astype('int').tolist()])
       
    
        left_context = review[max(0,target_i-window):target_i]
        right_context = review[target_i+1:min(len(review),target_i+window)]
        
        
        #print("-"*50)
        #print("L:",left_context," INDEX: ",max(0,target_i-window),"::",target_i)
        #print("R:",right_context, "INDEX: ",target_i+1,"::",min(len(review),target_i+window))
        
        layer_1 = np.mean(weights_0_1[left_context+right_context],axis=0)
        layer_2 = sigmoid(layer_1.dot(weights_1_2[target_samples].T))
        
        layer_2_delta = layer_2 - layer_2_target
        layer_1_delta = layer_2_delta.dot(weights_1_2[target_samples])
        
        weights_0_1[left_context + right_context] -= layer_1_delta * alpha
        weights_1_2[target_samples] -= np.outer(layer_2_delta,layer_1) * alpha
    
    if(rev_i % 250 ==0):
        sys.stdout.write('\rProgress:'+str(rev_i/float(len(input_dataset)
            *iterations)) + "   " + str(similar('terrible')))
    sys.stdout.write('\rProgress:'+str(rev_i/float(len(input_dataset)
        *iterations)))
print(similar('terrible'))

Progress:0.99998 [('terrible', -0.0), ('horrible', -4.580052125015765), ('brilliant', -4.707014959228188), ('pathetic', -4.935225742906431), ('fantastic', -5.204615310376495), ('marvelous', -5.219520020225393), ('bad', -5.307147113307617), ('dreadful', -5.345749368099789), ('lousy', -5.353383335854187), ('remarkable', -5.377272808568338)]145)]]0906)]854)][('terrible', -0.0), ('horrible', -4.3843805721269975), ('brilliant', -4.949278670212796), ('bad', -5.121799003280398), ('pathetic', -5.129414793143894), ('lousy', -5.372828029238985), ('dreadful', -5.4468148398323635), ('marvelous', -5.468793923687044), ('remarkable', -5.570266412630283), ('mediocre', -5.600927031848216)]


In [13]:
print(similar('terrible'))

[('terrible', -0.0), ('horrible', -4.3843805721269975), ('brilliant', -4.949278670212796), ('bad', -5.121799003280398), ('pathetic', -5.129414793143894), ('lousy', -5.372828029238985), ('dreadful', -5.4468148398323635), ('marvelous', -5.468793923687044), ('remarkable', -5.570266412630283), ('mediocre', -5.600927031848216)]


In [16]:
def analogy(positive=['terrible','good'], negative=['bad']):
    norms = np.sum(weights_0_1 * weights_0_1,axis=1)
    norms.resize(norms.shape[0],1)
    normed_weights = weights_0_1 * norms
    query_vect = np.zeros(len(weights_0_1[0]))
    
    for word in positive:
        query_vect += normed_weights[word2index[word]]
    for word in negative:
        query_vect -= normed_weights[word2index[word]]
    
    scores = Counter()
    for word, index in word2index.items():
        raw_difference = weights_0_1[index] - query_vect
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))
    return scores.most_common(10)[1:]
    

In [18]:
analogy(['terrible','good'],['bad'])

[('superb', -399.544790747415),
 ('nice', -399.79417955982973),
 ('perfect', -399.848305619875),
 ('fine', -399.9527703380172),
 ('great', -400.2069702613134),
 ('solid', -400.38308684476567),
 ('decent', -400.3948964892706),
 ('limited', -400.4099071858375),
 ('terrible', -400.53290829492676)]

In [31]:
analogy(['elizabeth','he'],['she'])

[('allen', -321.99842325630004),
 ('fred', -322.17980396516015),
 ('brian', -322.27220933016065),
 ('john', -322.32048774625423),
 ('morgan', -322.51835834233833),
 ('smith', -322.60271100207007),
 ('glenn', -322.60417461397964),
 ('laurel', -322.6131991568636),
 ('elizabeth', -322.70200124078)]

In [32]:
analogy(['smith','she'],['he'])

[('j', -576.1628630899065),
 ('dr', -576.1831012053653),
 ('smith', -576.4603420725058),
 ('ms', -576.51740999599),
 ('mr', -576.8740918910538),
 ('br', -577.4360748035441),
 ('p', -577.4789729506316),
 ('mrs', -577.5421953642326),
 ('l', -577.5973639681565)]