# Download the IMDB Dataset

In [1]:
# Download reviews.txt and labels.txt from here: https://github.com/udacity/deep-learning/tree/master/sentiment-network

def pretty_print_review_and_label(i):
    print(labels[i] + "\t:\t" + reviews[i][:80] + "...")

g = open('reviews.txt','r') # What we know!
reviews = list(map(lambda x:x[:-1],g.readlines()))
g.close()

g = open('labels.txt','r') # What we WANT to know!
labels = list(map(lambda x:x[:-1].upper(),g.readlines()))
g.close()

# Capturing Word Correlation in Input Data

In [1]:
import numpy as np

onehots = {}
onehots['cat'] = np.array([1,0,0,0])
onehots['the'] = np.array([0,1,0,0])
onehots['dog'] = np.array([0,0,1,0])
onehots['sat'] = np.array([0,0,0,1])

sentence = ['the','cat','sat']
x = onehots[sentence[0]] + \
    onehots[sentence[1]] + \
    onehots[sentence[2]]

print("Sent Encoding:" + str(x))

Sent Encoding:[1 1 0 1]


# Predicting Movie Reviews

In [3]:
import sys

f = open('reviews.txt')
raw_reviews = f.readlines()
f.close()

f = open('labels.txt')
raw_labels = f.readlines()
f.close()

tokens = list(map(lambda x:set(x.split(" ")),raw_reviews))

vocab = set()
for sent in tokens:
    for word in sent:
        if(len(word)>0):
            vocab.add(word)
vocab = list(vocab)

word2index = {}
for i,word in enumerate(vocab):
    word2index[word]=i

input_dataset = list()
for sent in tokens:
    sent_indices = list()
    for word in sent:
        try:
            sent_indices.append(word2index[word])
        except:
            ""
    input_dataset.append(list(set(sent_indices)))

target_dataset = list()
for label in raw_labels:
    if label == 'positive\n':
        target_dataset.append(1)
    else:
        target_dataset.append(0)

In [25]:
# len(input_dataset)
print(len(input_dataset[0]))
print(len(input_dataset[1]))

93
92


In [31]:
weights_0_1 = 0.2*np.random.random((len(vocab),hidden_size)) - 0.1
weights_1_2 = 0.2*np.random.random((hidden_size,1)) - 0.1
weights_0_1

array([[-0.00496829, -0.00584673,  0.08647689, ...,  0.07929343,
        -0.02462016, -0.09272003],
       [-0.07496109, -0.05818407,  0.036516  , ..., -0.06402329,
        -0.00368486, -0.07602195],
       [-0.05729516, -0.06305535,  0.08788789, ..., -0.07202472,
        -0.02461672,  0.06744295],
       ...,
       [ 0.01453033, -0.05312117, -0.06007531, ...,  0.06570694,
        -0.05802503, -0.04057838],
       [ 0.023655  ,  0.01387484, -0.06564111, ...,  0.0359326 ,
        -0.03080101,  0.06428635],
       [-0.02340417, -0.07429468,  0.03666998, ..., -0.00843881,
        -0.01721167, -0.08416783]])

In [51]:
print(weights_0_1.shape)
print(len(x))
print(weights_0_1[x].shape)
print(weights_0_1[x])
print(np.sum(weights_0_1[x],axis=0))
print(np.sum(weights_0_1[x],axis=0).shape)

(74074, 100)
85
(85, 100)
[[ 0.028412    0.01134977  0.07356857 ... -0.05182767 -0.01069697
   0.0379558 ]
 [-0.14794523  0.00178728  0.04969902 ... -0.10289187  0.04516299
   0.00676864]
 [-0.01113565 -0.07415001 -0.09439084 ...  0.02064655  0.07429557
  -0.08237898]
 ...
 [ 0.06019313  0.04028161  0.0222981  ...  0.06971372 -0.02259204
  -0.09873509]
 [ 0.04626867  0.00520403  0.0013368  ...  0.00294918 -0.06310901
   0.04443481]
 [-0.02790076 -0.04467161 -0.07923572 ... -0.09485783 -0.00940906
   0.06883097]]
[-0.9264784  -1.09000105 -1.65522704 -0.9767029  -0.70920182 -0.49662162
 -0.34869325  0.93603314 -1.43915881 -0.58335806 -1.78930383 -0.32259843
 -1.12824982 -0.71220311 -0.14746003 -0.25407647 -0.96259694 -1.3288857
 -0.58783149 -0.75463787 -0.20694016 -1.57972657  0.43943489 -0.63987251
 -0.28310225  0.56592062 -0.76709032 -0.74594112 -0.52977236 -0.15208438
 -0.57027488 -0.70012338 -0.24919539 -0.44112736 -0.97997608 -0.43385452
 -0.54893872  0.12955892  0.41970498 -0.22270

In [48]:
len(input_dataset[0])
len(input_dataset[2])

232

In [46]:
import numpy as np
np.random.seed(1)

def sigmoid(x):
    return 1/(1 + np.exp(-x))

alpha, iterations = (0.01, 2)
hidden_size = 100

weights_0_1 = 0.2*np.random.random((len(vocab),hidden_size)) - 0.1
weights_1_2 = 0.2*np.random.random((hidden_size,1)) - 0.1

correct,total = (0,0)
for iter in range(iterations):
    
    # train on first 24,000
    for i in range(len(input_dataset)-1000):

        x,y = (input_dataset[i],target_dataset[i])
        
        print(len(x), y)
        
        layer_1 = sigmoid(np.sum(weights_0_1[x],axis=0)) #embed + sigmoid
        layer_2 = sigmoid(np.dot(layer_1,weights_1_2)) # linear + softmax

        layer_2_delta = layer_2 - y # compare pred with truth
        layer_1_delta = layer_2_delta.dot(weights_1_2.T) #backprop

        weights_0_1[x] -= layer_1_delta * alpha
        weights_1_2 -= np.outer(layer_1,layer_2_delta) * alpha

        if(np.abs(layer_2_delta) < 0.5):
            correct += 1
        total += 1
        if(i % 10 == 9):
            progress = str(i/float(len(input_dataset)))
            sys.stdout.write('\rIter:'+str(iter)\
                             +' Progress:'+progress[2:4]\
                             +'.'+progress[4:6]\
                             +'% Training Accuracy:'\
                             + str(correct/float(total)) + '%')
    print()
correct,total = (0,0)
for i in range(len(input_dataset)-1000,len(input_dataset)):

    x = input_dataset[i]
    y = target_dataset[i]

    layer_1 = sigmoid(np.sum(weights_0_1[x],axis=0))
    layer_2 = sigmoid(np.dot(layer_1,weights_1_2))
    
    if(np.abs(layer_2 - y) < 0.5):
        correct += 1
    total += 1
print("Test Accuracy:" + str(correct / float(total)))

93 1
92 0
232 1
408 0
115 1
98 0
84 1
106 0
85 1
231 0
Iter:0 Progress:00.03% Training Accuracy:0.4%113 1
78 0
80 1
212 0
225 1
113 0
249 1
52 0
203 1
157 0
Iter:0 Progress:00.07% Training Accuracy:0.25%174 1
192 0
57 1
104 0
191 1
80 0
138 1
92 0
85 1
97 0
Iter:0 Progress:00.11% Training Accuracy:0.23333333333333334%110 1
100 0
138 1
53 0
213 1
64 0
86 1
97 0
94 1
137 0
Iter:0 Progress:00.15% Training Accuracy:0.225%153 1
18 0
299 1
246 0
130 1
116 0
156 1
128 0
98 1
89 0
Iter:0 Progress:00.19% Training Accuracy:0.24%78 1
431 0
76 1
157 0
192 1
124 0
88 1
266 0
106 1
129 0
Iter:0 Progress:00.23% Training Accuracy:0.25%136 1
84 0
75 1
179 0
101 1
177 0
126 1
106 0
212 1
273 0
Iter:0 Progress:00.27% Training Accuracy:0.24285714285714285%209 1
178 0
47 1
137 0
165 1
113 0
140 1
144 0
216 1
84 0
Iter:0 Progress:00.31% Training Accuracy:0.25%190 1
126 0
106 1
85 0
251 1
83 0
149 1
141 0
81 1
186 0
Iter:0 Progress:00.35% Training Accuracy:0.24444444444444444%185 1
172 0
410 1
375 0
333 1
13

170 1
107 0
Iter:0 Progress:03.95% Training Accuracy:0.5242424242424243%116 1
85 0
135 1
108 0
52 1
448 0
95 1
393 0
267 1
238 0
Iter:0 Progress:03.99% Training Accuracy:0.527%94 1
83 0
110 1
173 0
152 1
152 0
115 1
106 0
276 1
241 0
Iter:0 Progress:04.03% Training Accuracy:0.5316831683168317%152 1
142 0
117 1
89 0
77 1
125 0
65 1
175 0
74 1
85 0
Iter:0 Progress:04.07% Training Accuracy:0.5333333333333333%247 1
109 0
82 1
39 0
94 1
97 0
30 1
62 0
342 1
129 0
Iter:0 Progress:04.11% Training Accuracy:0.5368932038834952%157 1
117 0
97 1
200 0
129 1
275 0
127 1
119 0
77 1
65 0
Iter:0 Progress:04.15% Training Accuracy:0.5403846153846154%159 1
119 0
94 1
48 0
270 1
107 0
120 1
254 0
94 1
80 0
Iter:0 Progress:04.19% Training Accuracy:0.540952380952381%61 1
105 0
143 1
51 0
109 1
69 0
130 1
98 0
124 1
39 0
Iter:0 Progress:04.23% Training Accuracy:0.5424528301886793%150 1
75 0
93 1
236 0
153 1
187 0
41 1
131 0
149 1
73 0
Iter:0 Progress:04.27% Training Accuracy:0.5439252336448598%84 1
147 0
133

85 1
107 0
Iter:0 Progress:07.43% Training Accuracy:0.6204301075268818%109 1
233 0
149 1
101 0
73 1
96 0
214 1
116 0
36 1
164 0
Iter:0 Progress:07.47% Training Accuracy:0.6224598930481283%104 1
325 0
210 1
88 0
67 1
227 0
131 1
105 0
292 1
95 0
Iter:0 Progress:07.51% Training Accuracy:0.6234042553191489%82 1
99 0
188 1
221 0
82 1
145 0
134 1
97 0
104 1
154 0
Iter:0 Progress:07.55% Training Accuracy:0.6238095238095238%159 1
247 0
220 1
132 0
217 1
241 0
226 1
277 0
114 1
218 0
Iter:0 Progress:07.59% Training Accuracy:0.6252631578947369%125 1
163 0
96 1
109 0
91 1
56 0
127 1
101 0
73 1
221 0
Iter:0 Progress:07.63% Training Accuracy:0.6267015706806283%89 1
114 0
324 1
135 0
164 1
141 0
309 1
77 0
220 1
123 0
Iter:0 Progress:07.67% Training Accuracy:0.6270833333333333%93 1
77 0
116 1
154 0
114 1
83 0
105 1
175 0
108 1
71 0
Iter:0 Progress:07.71% Training Accuracy:0.627979274611399%83 1
231 0
91 1
134 0
128 1
381 0
185 1
124 0
124 1
96 0
Iter:0 Progress:07.75% Training Accuracy:0.6288659793

Iter:0 Progress:10.79% Training Accuracy:0.68%116 1
117 0
100 1
89 0
158 1
112 0
184 1
135 0
440 1
106 0
Iter:0 Progress:10.83% Training Accuracy:0.6808118081180812%101 1
94 0
188 1
85 0
30 1
66 0
52 1
119 0
208 1
77 0
Iter:0 Progress:10.87% Training Accuracy:0.6808823529411765%45 1
219 0
50 1
158 0
183 1
208 0
156 1
200 0
105 1
89 0
Iter:0 Progress:10.91% Training Accuracy:0.680952380952381%99 1
103 0
87 1
47 0
113 1
93 0
84 1
107 0
197 1
88 0
Iter:0 Progress:10.95% Training Accuracy:0.6817518248175183%211 1
95 0
83 1
102 0
241 1
255 0
139 1
80 0
101 1
109 0
Iter:0 Progress:10.99% Training Accuracy:0.6821818181818182%94 1
67 0
120 1
115 0
177 1
49 0
79 1
182 0
103 1
167 0
Iter:0 Progress:11.03% Training Accuracy:0.6833333333333333%89 1
104 0
96 1
312 0
96 1
206 0
45 1
231 0
112 1
251 0
Iter:0 Progress:11.07% Training Accuracy:0.6841155234657039%131 1
140 0
155 1
94 0
134 1
214 0
83 1
111 0
270 1
92 0
Iter:0 Progress:11.11% Training Accuracy:0.6848920863309352%302 1
279 0
254 1
319 0
1

Iter:0 Progress:13.67% Training Accuracy:0.7064327485380117%302 1
101 0
128 1
101 0
86 1
160 0
109 1
198 0
78 1
145 0
Iter:0 Progress:13.71% Training Accuracy:0.7067055393586006%350 1
93 0
259 1
129 0
89 1
433 0
76 1
95 0
174 1
143 0
Iter:0 Progress:13.75% Training Accuracy:0.7066860465116279%105 1
93 0
110 1
142 0
85 1
114 0
127 1
97 0
146 1
284 0
Iter:0 Progress:13.79% Training Accuracy:0.7066666666666667%249 1
95 0
339 1
102 0
78 1
128 0
105 1
75 0
90 1
112 0
Iter:0 Progress:13.83% Training Accuracy:0.707514450867052%97 1
108 0
104 1
299 0
66 1
105 0
33 1
314 0
172 1
294 0
Iter:0 Progress:13.87% Training Accuracy:0.7077809798270893%197 1
87 0
128 1
175 0
96 1
223 0
200 1
72 0
120 1
92 0
Iter:0 Progress:13.91% Training Accuracy:0.7077586206896552%291 1
101 0
169 1
236 0
160 1
115 0
95 1
207 0
130 1
110 0
Iter:0 Progress:13.95% Training Accuracy:0.707163323782235%172 1
251 0
261 1
89 0
206 1
192 0
62 1
161 0
112 1
111 0
Iter:0 Progress:13.99% Training Accuracy:0.7071428571428572%97 1


202 1
91 0
Iter:0 Progress:16.87% Training Accuracy:0.7251184834123223%358 1
141 0
83 1
123 0
81 1
80 0
112 1
170 0
131 1
145 0
Iter:0 Progress:16.91% Training Accuracy:0.724822695035461%213 1
143 0
289 1
136 0
147 1
110 0
98 1
165 0
111 1
409 0
Iter:0 Progress:16.95% Training Accuracy:0.725%133 1
88 0
362 1
93 0
98 1
101 0
44 1
209 0
148 1
190 0
Iter:0 Progress:16.99% Training Accuracy:0.7247058823529412%113 1
179 0
109 1
164 0
140 1
98 0
103 1
137 0
87 1
259 0
Iter:0 Progress:17.03% Training Accuracy:0.7251173708920188%177 1
131 0
33 1
109 0
106 1
65 0
78 1
107 0
54 1
108 0
Iter:0 Progress:17.07% Training Accuracy:0.7252927400468384%54 1
79 0
54 1
63 0
31 1
276 0
100 1
93 0
274 1
131 0
Iter:0 Progress:17.11% Training Accuracy:0.7259345794392523%90 1
126 0
172 1
229 0
90 1
67 0
261 1
96 0
90 1
240 0
Iter:0 Progress:17.15% Training Accuracy:0.7265734265734266%83 1
45 0
81 1
174 0
66 1
139 0
61 1
75 0
309 1
457 0
Iter:0 Progress:17.19% Training Accuracy:0.7272093023255813%117 1
44 0
88 

162 1
110 0
237 1
88 0
116 1
168 0
Iter:0 Progress:21.35% Training Accuracy:0.7426966292134831%127 1
153 0
93 1
115 0
82 1
93 0
221 1
193 0
307 1
125 0
Iter:0 Progress:21.39% Training Accuracy:0.7429906542056075%119 1
302 0
198 1
70 0
200 1
58 0
69 1
81 0
283 1
131 0
Iter:0 Progress:21.43% Training Accuracy:0.7434701492537313%343 1
305 0
57 1
224 0
71 1
159 0
80 1
127 0
48 1
258 0
Iter:0 Progress:21.47% Training Accuracy:0.7435754189944134%394 1
91 0
98 1
253 0
135 1
294 0
172 1
50 0
79 1
94 0
Iter:0 Progress:21.51% Training Accuracy:0.7434944237918215%33 1
132 0
54 1
111 0
37 1
148 0
112 1
114 0
45 1
176 0
Iter:0 Progress:21.55% Training Accuracy:0.7432282003710575%84 1
56 0
158 1
97 0
50 1
57 0
111 1
91 0
109 1
50 0
Iter:0 Progress:21.59% Training Accuracy:0.7435185185185185%64 1
208 0
76 1
206 0
203 1
67 0
108 1
95 0
83 1
100 0
Iter:0 Progress:21.63% Training Accuracy:0.7436229205175601%98 1
133 0
121 1
189 0
171 1
105 0
81 1
105 0
105 1
113 0
Iter:0 Progress:21.67% Training Accurac

162 0
152 1
133 0
111 1
131 0
111 1
104 0
87 1
165 0
Iter:0 Progress:25.55% Training Accuracy:0.7571205007824726%91 1
83 0
109 1
298 0
119 1
153 0
66 1
49 0
72 1
98 0
Iter:0 Progress:25.59% Training Accuracy:0.7565625%103 1
86 0
98 1
356 0
112 1
122 0
89 1
80 0
111 1
117 0
Iter:0 Progress:25.63% Training Accuracy:0.7564742589703588%137 1
246 0
98 1
58 0
130 1
48 0
85 1
245 0
92 1
69 0
Iter:0 Progress:25.67% Training Accuracy:0.7568535825545172%60 1
56 0
94 1
66 0
133 1
97 0
99 1
111 0
86 1
77 0
Iter:0 Progress:25.71% Training Accuracy:0.7572317262830482%106 1
73 0
61 1
134 0
45 1
76 0
45 1
172 0
106 1
97 0
Iter:0 Progress:25.75% Training Accuracy:0.7574534161490684%385 1
173 0
66 1
84 0
86 1
95 0
89 1
77 0
145 1
96 0
Iter:0 Progress:25.79% Training Accuracy:0.7575193798449612%87 1
141 0
150 1
403 0
89 1
171 0
87 1
231 0
130 1
55 0
Iter:0 Progress:25.83% Training Accuracy:0.7577399380804953%136 1
102 0
72 1
44 0
28 1
233 0
113 1
153 0
61 1
88 0
Iter:0 Progress:25.87% Training Accuracy:0

70 0
Iter:0 Progress:29.35% Training Accuracy:0.7698910081743869%113 1
68 0
275 1
193 0
46 1
99 0
131 1
60 0
91 1
52 0
Iter:0 Progress:29.39% Training Accuracy:0.769795918367347%100 1
95 0
90 1
118 0
293 1
212 0
72 1
123 0
32 1
90 0
Iter:0 Progress:29.43% Training Accuracy:0.7699728260869565%96 1
198 0
35 1
89 0
93 1
95 0
138 1
98 0
125 1
122 0
Iter:0 Progress:29.47% Training Accuracy:0.7701492537313432%108 1
237 0
240 1
100 0
45 1
131 0
86 1
99 0
122 1
178 0
Iter:0 Progress:29.51% Training Accuracy:0.7700542005420055%77 1
101 0
78 1
97 0
140 1
144 0
91 1
280 0
113 1
242 0
Iter:0 Progress:29.55% Training Accuracy:0.7700947225981055%96 1
109 0
181 1
87 0
117 1
94 0
100 1
119 0
143 1
66 0
Iter:0 Progress:29.59% Training Accuracy:0.7701351351351351%181 1
241 0
132 1
79 0
260 1
149 0
146 1
228 0
108 1
92 0
Iter:0 Progress:29.63% Training Accuracy:0.7700404858299595%144 1
122 0
200 1
145 0
180 1
248 0
160 1
79 0
80 1
94 0
Iter:0 Progress:29.67% Training Accuracy:0.7702156334231806%98 1
81 0

KeyboardInterrupt: 

In [5]:
tokens[0]

{'',
 '\n',
 '.',
 'a',
 'about',
 'adults',
 'age',
 'all',
 'and',
 'as',
 'at',
 'believe',
 'bromwell',
 'burn',
 'can',
 'cartoon',
 'classic',
 'closer',
 'comedy',
 'down',
 'episode',
 'expect',
 'far',
 'fetched',
 'financially',
 'here',
 'high',
 'i',
 'immediately',
 'in',
 'insightful',
 'inspector',
 'is',
 'isn',
 'it',
 'knew',
 'lead',
 'life',
 'line',
 'm',
 'many',
 'me',
 'much',
 'my',
 'of',
 'one',
 'other',
 'pathetic',
 'pettiness',
 'pity',
 'pomp',
 'profession',
 'programs',
 'ran',
 'reality',
 'recalled',
 'remind',
 'repeatedly',
 'right',
 's',
 'sack',
 'same',
 'satire',
 'saw',
 'school',
 'schools',
 'scramble',
 'see',
 'situation',
 'some',
 'student',
 'students',
 'such',
 'survive',
 't',
 'teachers',
 'teaching',
 'than',
 'that',
 'the',
 'their',
 'think',
 'through',
 'time',
 'to',
 'tried',
 'welcome',
 'what',
 'when',
 'which',
 'who',
 'whole',
 'years',
 'your'}

# Comparing Word Embeddings

In [16]:
from collections import Counter
import math 

def similar(target='beautiful', strategy='sum'):
    target_index = word2index[target]
    scores = Counter()
    for word,index in word2index.items():
        raw_difference = weights_0_1[index] - (weights_0_1[target_index])
        squared_difference = raw_difference * raw_difference
        
        if strategy=='sum':        
            scores[word] = -math.sqrt(sum(squared_difference))
        elif strategy=='mean':        
            scores[word] = -math.sqrt(np.mean(squared_difference))
        else:
            print('wrong choice!!')
            break

    return scores.most_common(10)

In [17]:
print(similar('beautiful','sum'))

[('beautiful', -0.0), ('fantastic', -0.7217545928774438), ('awesome', -0.7415184764049142), ('unique', -0.7735825515270454), ('remember', -0.7747145981969797), ('touching', -0.7751590504710988), ('intense', -0.7762884339129738), ('especially', -0.7791695161292743), ('outstanding', -0.7828840421855388), ('beauty', -0.7856081810873574)]


In [18]:
print(similar('beautiful','mean'))

[('beautiful', -0.0), ('fantastic', -0.07217545928774438), ('awesome', -0.07415184764049143), ('unique', -0.07735825515270453), ('remember', -0.07747145981969797), ('touching', -0.07751590504710988), ('intense', -0.07762884339129736), ('especially', -0.07791695161292742), ('outstanding', -0.07828840421855388), ('beauty', -0.07856081810873575)]


In [19]:
print(similar('terrible','sum'))

[('terrible', -0.0), ('disappointment', -0.8112848248491831), ('mess', -0.8245448413004519), ('disappointing', -0.8252756728072085), ('dull', -0.8254397211766038), ('boring', -0.8291808501980075), ('avoid', -0.8323411759011137), ('lacks', -0.8423434102990935), ('badly', -0.8457370735436928), ('horrible', -0.8519148477328443)]


In [20]:
print(similar('terrible','mean'))

[('terrible', -0.0), ('disappointment', -0.08112848248491832), ('mess', -0.08245448413004518), ('disappointing', -0.08252756728072085), ('dull', -0.08254397211766035), ('boring', -0.08291808501980073), ('avoid', -0.08323411759011137), ('lacks', -0.08423434102990934), ('badly', -0.08457370735436928), ('horrible', -0.08519148477328443)]


# Filling in the Blank

In [24]:
import sys,random,math
from collections import Counter
import numpy as np

np.random.seed(1)
random.seed(1)
f = open('reviews.txt')
raw_reviews = f.readlines()
f.close()

tokens = list(map(lambda x:(x.split(" ")),raw_reviews))
wordcnt = Counter()
for sent in tokens:
    for word in sent:
        wordcnt[word] -= 1
vocab = list(set(map(lambda x:x[0],wordcnt.most_common())))

word2index = {}
for i,word in enumerate(vocab):
    word2index[word]=i

concatenated = list()
input_dataset = list()
for sent in tokens:
    sent_indices = list()
    for word in sent:
        try:
            sent_indices.append(word2index[word])
            concatenated.append(word2index[word])
        except:
            ""
    input_dataset.append(sent_indices)
concatenated = np.array(concatenated)
random.shuffle(input_dataset)

In [25]:
alpha, iterations = (0.05, 2)
hidden_size,window,negative = (50,2,5)

weights_0_1 = (np.random.rand(len(vocab),hidden_size) - 0.5) * 0.2
weights_1_2 = np.random.rand(len(vocab),hidden_size)*0

layer_2_target = np.zeros(negative+1)
layer_2_target[0] = 1

def similar(target='beautiful'):
    target_index = word2index[target]

    scores = Counter()
    for word,index in word2index.items():
        raw_difference = weights_0_1[index] - (weights_0_1[target_index])
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))
    return scores.most_common(10)

def sigmoid(x):
    return 1/(1 + np.exp(-x))


<enumerate at 0x7f300c25d6e0>

In [35]:
for rev_i,review in enumerate(input_dataset * iterations):
    for target_i in range(len(review)):
        
        # since it's really expensive to predict every vocabulary
        # we're only going to predict a random subset
        target_samples = [review[target_i]]+list(concatenated\
        [(np.random.rand(negative)*len(concatenated)).astype('int').tolist()])

        left_context = review[max(0,target_i-window):target_i]
        right_context = review[target_i+1:min(len(review),target_i+window)]

        layer_1 = np.mean(weights_0_1[left_context+right_context],axis=0)
        layer_2 = sigmoid(layer_1.dot(weights_1_2[target_samples].T))
        layer_2_delta = layer_2 - layer_2_target
        layer_1_delta = layer_2_delta.dot(weights_1_2[target_samples])

        weights_0_1[left_context+right_context] -= layer_1_delta * alpha
        weights_1_2[target_samples] -= np.outer(layer_2_delta,layer_1)*alpha

    if(rev_i % 250 == 0):
        sys.stdout.write('\rProgress:'+str(rev_i/float(len(input_dataset)
        *iterations)) + "   " + str(similar('terrible')))
        sys.stdout.write('\rProgress:'+str(rev_i/float(len(input_dataset)
        *iterations)))
print(similar('terrible'))

Progress:0.995   [('terrible', -0.0), ('horrible', -3.270958185212802), ('bad', -3.6983228478962484), ('great', -3.701502683594686), ('brilliant', -3.7670835913886442), ('pathetic', -3.768970167273131), ('marvelous', -3.963802649171542), ('superb', -3.997912597992555), ('phenomenal', -4.0486284426911405), ('fantastic', -4.095301542834451)]])]76)]11816)]4)]172)][('terrible', -0.0), ('horrible', -3.271244017284067), ('bad', -3.736405131069526), ('pathetic', -3.7774448238646148), ('brilliant', -4.010115442610921), ('lame', -4.079625928248646), ('marvelous', -4.133507091396703), ('phenomenal', -4.14003511358504), ('fantastic', -4.215900357350026), ('dreadful', -4.219385639824087)]


# King - Man + Woman ~= Queen

In [70]:
def analogy(positive=['terrible','good'],negative=['bad']):
    
    norms = np.sum(weights_0_1 * weights_0_1,axis=1)
    norms.resize(norms.shape[0],1)
    
    normed_weights = weights_0_1 * norms
    
    query_vect = np.zeros(len(weights_0_1[0]))
    for word in positive:
        query_vect += normed_weights[word2index[word]]
    for word in negative:
        query_vect -= normed_weights[word2index[word]]
    
    scores = Counter()
    for word,index in word2index.items():
        raw_difference = weights_0_1[index] - query_vect
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))
        
    return scores.most_common(10)[1:]

In [71]:
analogy(['terrible','good'],['bad'])

[('terrific', -210.46593317724228),
 ('perfect', -210.52652806032205),
 ('worth', -210.53162266358495),
 ('good', -210.55072184482773),
 ('terrible', -210.58429046605724),
 ('decent', -210.87945442008805),
 ('superb', -211.01143515971094),
 ('great', -211.1327058081335),
 ('worthy', -211.13577238103477)]

In [72]:
analogy(['elizabeth','he'],['she'])

[('simon', -193.82490698964878),
 ('obsessed', -193.91805919583555),
 ('stanwyck', -194.22311983847902),
 ('sandler', -194.22846640800597),
 ('branagh', -194.24551334589853),
 ('daniel', -194.24631020485714),
 ('peter', -194.29908544092078),
 ('tony', -194.31388897167716),
 ('aged', -194.35115773165094)]