# Preprocessing

In [1]:
import sys , numpy as np

f = open("reviews.txt")
raw_reviews = f.readlines()
f.close()

f = open('labels.txt')
raw_labels = f.readlines()
f.close()

tokens = list(map(lambda x:set(x.split(' ')),raw_reviews))

vocab = set()
for sent in tokens:
    for word in sent:
        if(len(word)>0):
            vocab.add(word)
vocab = list(vocab)

word2index = {}
index2word = {}
for i, word in enumerate(vocab):
    word2index[word] = i
    index2word[i] = word

    
input_dataset = list()
for sent in tokens:
    sent_indices = list()
    for word in sent:
        try:
            sent_indices.append(word2index[word])
        except:
            ""
    input_dataset.append(list(set(sent_indices)))

target_dataset = list()

for label in raw_labels:
    if label == "positive\n":
        target_dataset.append(1)
    else:
        target_dataset.append(0)
        

In [2]:
test = []
for sent in tokens:
    sentence = list()
    print(sent)
    for word in sent:
        try:
            sentence.append(word2index[word])
        except:
            ""
        
    print(sentence)
    break
    

{'', 'knew', 'the', 'in', 'what', 'comedy', 'life', 'than', 'survive', 'of', 'that', 'can', 'welcome', 'bromwell', 'expect', 'programs', 'students', 'which', 'scramble', 'burn', 'think', 'see', 'right', 'profession', 'immediately', 'it', 'pity', 'my', 'believe', 'repeatedly', 'pomp', 'teaching', 'remind', 'down', 'fetched', 'who', 'much', 'at', 'episode', 'pathetic', 'is', 'student', 'isn', 'lead', 'time', '\n', 'as', 'me', 'insightful', 'satire', 'your', 'cartoon', 'here', 'high', 'whole', 'one', 'pettiness', 'line', 'inspector', 'same', 'closer', 'ran', 'i', 'about', 'many', 'financially', 'situation', 'recalled', 'saw', 'their', 'schools', 'to', 'sack', 'm', 'some', 'a', 'all', 'years', 'adults', 'when', 'such', 'far', 's', 'classic', '.', 'age', 'tried', 'other', 'school', 'reality', 'through', 't', 'and', 'teachers'}
[36614, 41228, 53226, 70571, 32940, 55234, 43594, 68870, 64530, 47077, 49104, 40383, 1754, 21380, 64288, 10916, 59930, 50904, 42757, 49459, 60262, 11224, 15838, 31887

In [3]:
def sigmoid(x):
    return 1/(1+np.exp(-x))

In [4]:

alpha, iterations = (0.01,2)
hidden_size = 100

weights_0_1 = 0.2 * np.random.random((len(vocab),hidden_size)) -0.1
weights_1_2 = 0.2 * np.random.random((hidden_size,1)) - 0.1

correct, total = (0,0)
for iter in range(iterations):
    for i in range(len(input_dataset) - 1000): # train on the first 24k reviews
        
        x,y  = (input_dataset[i], target_dataset[i]) # embed + sigmoid, softmax + linear
        layer_1 = sigmoid(np.sum(weights_0_1[x], axis=0))
        layer_2 = sigmoid(np.dot(layer_1, weights_1_2))
        
        # backprobagation
        
        layer_2_delta = layer_2 - y
        layer_1_delta = layer_2_delta.dot(weights_1_2.T)
        
        # updating weights
        weights_0_1[x] -= layer_1_delta * alpha
        weights_1_2 -= np.outer(layer_1, layer_2_delta) * alpha
        
        if(np.abs(layer_2_delta) < 0.5):
            correct += 1
        total += 1
        if(i % 10 == 9):
            progress = str(i/float(len(input_dataset)))
            sys.stdout.write('\rIter:'+str(iter)\
                             +' Progress:'+progress[2:4]\
                             +'.'+progress[4:6]\
                             +'% Training Accuracy:'\
                             + str(correct/float(total)) + '%')
    print()
correct,total = (0,0)
for i in range(len(input_dataset)-1000,len(input_dataset)):
    x = input_dataset[i]    
    y = target_dataset[i]
    layer_1 = sigmoid(np.sum(weights_0_1[x],axis=0))
    layer_2 = sigmoid(np.dot(layer_1,weights_1_2))
    #print("layer_2: ",layer_2, " y:",y," abs: ",np.abs(layer_2 - y))
    if(np.abs(layer_2 - y) < 0.5):
        
        correct += 1
    total += 1
print("Test Accuracy:" + str(correct / float(total)))
        

Iter:0 Progress:95.99% Training Accuracy:0.8334166666666667%%
Iter:1 Progress:95.99% Training Accuracy:0.866625%83100646%%
Test Accuracy:0.85


In [5]:
x = np.array( [ [[1,2],[3,4]],[[5,6],[7,8]] ] )

xx = np.sum(x,axis=0)

print(x[0] + x[1])

print(xx)

[[ 6  8]
 [10 12]]
[[ 6  8]
 [10 12]]


In [6]:
from collections import Counter
import math

In [7]:
def similar(target="beautiful"):
    target_index = word2index[target]
    scores = Counter()
    for word,index in word2index.items():
        raw_difference = weights_0_1[index] - (weights_0_1[target_index])
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))
    return scores.most_common(10)


In [8]:
similar()

[('beautiful', -0.0),
 ('innocent', -0.71915803977794),
 ('enjoy', -0.7221940370957898),
 ('best', -0.7366459998658659),
 ('appreciate', -0.7427724438020783),
 ('enjoyed', -0.7547304406986338),
 ('recommended', -0.7608382050958593),
 ('each', -0.7615837983147733),
 ('normal', -0.7621274484641513),
 ('believable', -0.772758476175448)]

In [9]:
import random,sys, numpy as np
from collections import Counter
import math

In [10]:
np.random.seed(1)
random.seed(1)
f = open("reviews.txt")
raw_reviews = f.readlines()
f.close()

In [11]:
tokens = list(map(lambda x:(x.split(" ")),raw_reviews))

wordcnt = Counter()
for sent in tokens:
    for word in sent:
        wordcnt[word] -= 1
vocab = list(set(map(lambda x:x[0], wordcnt.most_common())))

word2index = {}
index2word={}
for i,word in enumerate(vocab):
    word2index[word] = i
    index2word[i] = word
concatenated = list()
input_dataset = list()
for sent in tokens:
    sent_indices = list()
    for word in sent:
        try:
            sent_indices.append(word2index[word])
            concatenated.append(word2index[word])
        except:
            ""
    input_dataset.append(sent_indices)
concatenated = np.array(concatenated)
random.shuffle(input_dataset)

# Fill in the blank

In [12]:
alpha, iterations = (0.05,2)
hidden_size, window, negative = (50,2,5)
weights_0_1  = (np.random.rand(len(vocab),hidden_size)-0.5) * 0.2
weights_1_2 = np.random.rand(len(vocab),hidden_size) * 0

layer_2_target = np.zeros(negative +1 )
layer_2_target[0] = 1

def similar(target="beautiful"):
    target_index = word2index[target]
    scores = Counter()
    for word, index in word2index.items():
        raw_difference = weights_0_1[index] - (weights_0_1[target_index])
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))
    return scores.most_common(10)

for rev_i, review in enumerate(input_dataset * iterations):
    for target_i in range(len(review)):
        target_samples = [review[target_i]] + list(concatenated\
                                                  [(np.random.rand(negative) * len(concatenated)).astype('int').tolist()])
        
        left_context = review[max(0, target_i- window): target_i]
        right_context = review[target_i+1:min(len(review), target_i+ window)]
        
        layer_1 = np.mean(weights_0_1[left_context+right_context], axis=0)
        layer_2 = sigmoid(layer_1.dot(weights_1_2[target_samples].T))
        
        layer_2_delta = layer_2 - layer_2_target
        layer_1_delta = layer_2_delta.dot(weights_1_2[target_samples])
        
        weights_0_1[left_context + right_context] -= layer_1_delta * alpha
        weights_1_2[target_samples] -= np.outer(layer_2_delta, layer_1) * alpha
    if(rev_i % 250 == 0):
        sys.stdout.write('\rProgress:'+str(rev_i/float(len(input_dataset)
            *iterations)) + "   " + str(similar('terrible')))
    sys.stdout.write('\rProgress:'+str(rev_i/float(len(input_dataset)
        *iterations)))
print(similar('terrible'))
        

Progress:0.99998 [('terrible', -0.0), ('horrible', -3.0418519348493334), ('brilliant', -3.0659159417986097), ('superb', -3.5914897457022836), ('phenomenal', -3.7419380497490566), ('pathetic', -3.7928123086276204), ('masterful', -3.8638942709585518), ('marvelous', -4.090077240728033), ('miserable', -4.163623533084494), ('mediocre', -4.197460186227692)]]379)]]][('terrible', -0.0), ('horrible', -2.947738785698142), ('brilliant', -3.3190312240670092), ('pathetic', -3.748914761300403), ('superb', -3.8098457789498843), ('phenomenal', -3.817682213748524), ('masterful', -3.9697964128553425), ('bad', -4.066490330857878), ('marvelous', -4.1540812847915385), ('dreadful', -4.249103866468683)]


In [13]:
np.save("weights/c_11.npy",weights_0_1)
index2word[2483]

'actionless'

In [26]:
print(similar("suck"))
weights_0_1 = np.load("weights/c_11.npy")

[('suck', -0.0), ('disappoint', -3.0310371031641217), ('frighten', -3.135028002060621), ('possess', -3.161809523983756), ('recite', -3.307001844290352), ('cancel', -3.3083585580891013), ('entice', -3.364506092089718), ('boil', -3.367613788176068), ('attest', -3.3917474763751283), ('deter', -3.402522953069194)]


In [27]:
def analogy(positive=['terrible','good'],negative=['bad']):
    norms = np.sum(weights_0_1 * weights_0_1,axis=1)
    norms.resize(norms.shape[0],1)
    normed_weights = weights_0_1 * norms
    query_vect = np.zeros(len(weights_0_1[0]))
    for word in positive:
        query_vect += normed_weights[word2index[word]]
    for word in negative:
        query_vect -= normed_weights[word2index[word]]
    
    scores = Counter()
    for word,index in word2index.items():
        raw_difference = weights_0_1[index] - query_vect
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))
    return scores.most_common(10)[1:]

In [28]:
analogy(['elizabeth','he'],['she'])

[('christopher', -187.44533320219765),
 ('tom', -187.97540856574366),
 ('you', -188.02273718883094),
 ('fred', -188.06487692373773),
 ('william', -188.08266801592822),
 ('this', -188.0962465666065),
 ('him', -188.17000086532454),
 ('been', -188.17238878207488),
 ('mr', -188.17536554105487)]

In [62]:
norms = np.sum(weights_0_1 * weights_0_1,axis=1)
norms.resize(norms.shape[0],1)
normed_weights = weights_0_1 * norms

def make_sent_vect(words):
    indices = list(map(lambda x:word2index[x],\
                      filter(lambda x:x in word2index,words)))
    return np.mean(normed_weights[indices],axis=0)

reviews2vectors = list()

In [63]:
for review in tokens:
    reviews2vectors.append(make_sent_vect(review))
reviews2vectors = np.array(reviews2vectors)

In [64]:
def most_similar_reviews(review):
    v = make_sent_vect(review)
    scores = Counter()
    for i, val in enumerate(reviews2vectors.dot(v)):
        scores[i] = val
    most_similar = list()
    
    for idx, score in scores.most_common(3):
        most_similar.append(raw_reviews[idx][0:100])
    return most_similar

In [66]:
most_similar_reviews(['great'])

['i think this show is definitely the greatest show . jessica alba does such a great job in it . micha',
 'i have seen this movie . this movie is the best according today  s need . dowry in marriages is the ',
 'some of the filmmakers who are participating in this series have made some really great films but th']