# First we will begin with a basic review of numpy

In [48]:
import numpy as np

In [2]:
a = np.arange(5)

In [4]:
b = np.arange(10,15)

In [5]:
a.dot(b)

130

In [7]:
np.sum(a*b)

130

In [8]:
a*b

array([ 0, 11, 24, 39, 56])

In [9]:
row_vector = np.array([[1, 3, 2]])

In [10]:
col_vector = np.array([[2], [0], [1]])

In [11]:
row_vector.dot(col_vector)

array([[4]])

In [12]:
col_vector.dot(row_vector)

array([[2, 6, 4],
       [0, 0, 0],
       [1, 3, 2]])

In [14]:
col_vector.ravel(),row_vector.ravel()

(array([2, 0, 1]), array([1, 3, 2]))

In [15]:
np.outer(col_vector.ravel(),row_vector.ravel())

array([[2, 6, 4],
       [0, 0, 0],
       [1, 3, 2]])

# Now onto the real stuff
### Comparing word embeddings to fill in the blank

In [65]:
import sys,random,math
from collections import Counter
from nltk import word_tokenize

In [66]:
#Add a seed for randomizing choice which will be explained later
np.random.seed(1)

In [88]:
#This is a list of movie reviews of Imdb file will be uploaded to github later.Example from Andrew Trask's book Grokking
#Deep Learning

f = open('reviews.txt')
raw_reviews = f.readlines()
f.close()

In [70]:
#tokens = word_tokenize(raw_reviews)

In [89]:
#tokens
#split the review array into small arrays of words of each sentence

tokens = list(map(lambda x:(x.split(" ")),raw_reviews))

In [90]:
wordcnt = Counter()

In [91]:
for sent in tokens:
    for word in sent:
        wordcnt[word] -= 1

In [92]:
# String operations to remove punctuations
import string

In [75]:
## We use the popular stopwords defined in the nltk corpus. 
from nltk.corpus import stopwords
stop_words = stopwords.words('english')


In [76]:
stop_words = set(stopwords.words('english'))

In [78]:
tokens = [w for w in tokens if w not in stop_words]

In [79]:
for word in tokens:
    wordcnt[word] -= 1

In [93]:
vocab = list(set(map(lambda x:x[0],wordcnt.most_common())))

In [94]:
len(vocab)

74075

### Create word to index and reverse dictionary

In [95]:
word2index = {}
index2word = {}
for i,word in enumerate(vocab):
    word2index[word]=i
    index2word[i]=word

In [96]:
index2word[6085],word2index['scopes']

('instigation', 9357)

In [97]:
concatenated = list()
input_dataset = list()

In [98]:
for sent in tokens:
    sent_indices = list()
    for word in sent:
        try:
            sent_indices.append(word2index[word])
            concatenated.append(word2index[word])
        except:
            ""
    input_dataset.append(sent_indices)
    

In [102]:
#concatenated is a list of all the indices of the vocab
#input_dataset is a list of sentences with indices of words replacing the actual words
concatenated = np.array(concatenated)
random.shuffle(input_dataset)

In [115]:
alpha, iterations = (0.05, 2)
hidden_size,window,negative = (50,2,5)
weights_0_1 = (np.random.rand(len(vocab),hidden_size) - 0.5) * 0.2
weights_1_2 = np.random.rand(len(vocab),hidden_size)*0
layer_2_target = np.zeros(negative+1)
layer_2_target[0] = 1
def similar(target='beautiful'):
    target_index = word2index[target]

    scores = Counter()
    for word,index in word2index.items():
        
        raw_difference = weights_0_1[index] - (weights_0_1[target_index])
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))
    return scores.most_common(10)


In [116]:
weights_0_1

array([[ 0.04962836,  0.01988407, -0.02924894, ...,  0.02235409,
        -0.07866708, -0.08212026],
       [-0.02437445,  0.06014989, -0.04563997, ..., -0.03621135,
        -0.09106698, -0.0571341 ],
       [-0.05086565,  0.01672281,  0.02537102, ...,  0.04611479,
        -0.06692812, -0.09473534],
       ..., 
       [ 0.04327146,  0.06471183,  0.06579626, ..., -0.08170339,
         0.03575512, -0.09422431],
       [ 0.09846515,  0.05407922, -0.07341868, ...,  0.02439074,
        -0.01956672, -0.02716743],
       [ 0.09529182,  0.00331547, -0.05528874, ..., -0.08805801,
        -0.07556738, -0.00967705]])

In [117]:
def sigmoid(x):
    return 1/(1 + np.exp(-x))

In [118]:
import sys,random,math

In [119]:
for rev_i,review in enumerate(input_dataset * 10):
    for target_i in range(len(review)):
        
        target_samples = [review[target_i]]+list(concatenated\
    [(np.random.rand(negative)*len(concatenated)).astype('int').tolist()])
        left_context = review[max(0,target_i-window):target_i]
        right_context = review[target_i+1:min(len(review),target_i+window)]
        layer_1 = np.mean(weights_0_1[left_context+right_context],axis=0)
        layer_2 = sigmoid(layer_1.dot(weights_1_2[target_samples].T))
        layer_2_delta = layer_2 - layer_2_target
        layer_1_delta = layer_2_delta.dot(weights_1_2[target_samples])
        weights_0_1[left_context+right_context] -= layer_1_delta * alpha
        weights_1_2[target_samples] -= np.outer(layer_2_delta,layer_1)*alpha
    if(rev_i % 250 == 0):
        sys.stdout.write('\rProgress:'+str(rev_i/float(len(input_dataset)
        *iterations)) + "   " + str(similar('terrible')))
        sys.stdout.write('\rProgress:'+str(rev_i/float(len(input_dataset)
        *iterations)))
        

Progress:4.995

In [120]:
print(similar('terrible'))

[('terrible', -0.0), ('horrible', -2.4754866203702597), ('lame', -3.4387048575178283), ('dreadful', -3.5539998565731437), ('ridiculous', -3.6751871768620124), ('pathetic', -3.6799033008784447), ('brilliant', -3.8682180919265168), ('bad', -3.8752407833336346), ('stupid', -3.9329262050358285), ('dull', -4.038289791874596)]


In [114]:
weights_0_1

array([[ 0.41776661,  0.42647084,  0.60673674, ..., -1.1630179 ,
         0.38765544,  0.4210408 ],
       [ 0.56871654,  0.46928897, -0.45401779, ..., -0.23364384,
         0.04506904,  0.29215164],
       [ 0.12500887, -0.0272987 , -0.12779363, ...,  0.34572179,
         1.13721267, -0.16440506],
       ..., 
       [-0.51866034,  0.2395577 ,  0.08290923, ...,  0.35521976,
        -0.10325015, -0.29946646],
       [ 0.10205121, -0.30473111, -0.01627451, ...,  0.59436886,
         0.13367221, -0.27562559],
       [-0.21120256,  0.254616  ,  0.43840107, ...,  0.23422909,
        -0.12361702,  0.10401656]])

In [121]:
weights_0_1

array([[-0.02309237, -0.33961461, -0.61860407, ..., -0.44784579,
        -0.8079342 ,  0.02609339],
       [-0.3543827 , -0.45861281,  0.04627791, ...,  0.098861  ,
         0.06744006,  0.609333  ],
       [-0.08620186,  0.24950204,  0.01597825, ...,  0.24110192,
         0.07037026,  0.76157882],
       ..., 
       [-0.16956576, -0.35832148,  0.29649288, ..., -0.25356524,
        -0.18207844,  0.29944431],
       [-0.70936176,  0.12017149,  0.41595295, ...,  0.52804974,
         0.20563586,  0.32338842],
       [ 0.35931363,  0.53415009,  0.0473822 , ..., -0.06943643,
        -0.08977681, -0.35579795]])