In [1]:
import torch
from torch.autograd import Variable
import numpy as np
import torch.functional as F
import torch.nn.functional as F
import nltk

In [2]:
from nltk.corpus import reuters
corpus = [
    'he is a king',
    'she is a queen',
    'he is a man',
    'she is a woman',
    'warsaw is poland capital',
    'berlin is germany capital',
    'paris is france capital',   
]



In [3]:
def tokenize_corpus(corpus):
    tokens = [x.split() for x in corpus]
    return tokens

tokenized_corpus = tokenize_corpus(corpus)
print(tokenized_corpus)

[['he', 'is', 'a', 'king'], ['she', 'is', 'a', 'queen'], ['he', 'is', 'a', 'man'], ['she', 'is', 'a', 'woman'], ['warsaw', 'is', 'poland', 'capital'], ['berlin', 'is', 'germany', 'capital'], ['paris', 'is', 'france', 'capital']]


In [4]:
vocabulary = []
for sentence in tokenized_corpus:
    for token in sentence:
        if token not in vocabulary:
            vocabulary.append(token)

word2idx = {w: idx for (idx, w) in enumerate(vocabulary)}
idx2word = {idx: w for (idx, w) in enumerate(vocabulary)}

vocabulary_size = len(vocabulary)

In [10]:
import numpy as np
window_size = 2
idx_pairs = []
# for each sentence
for sentence in tokenized_corpus:
    indices = [word2idx[word] for word in sentence]
    # for each word, threated as center word
    for center_word_pos in range(len(indices)):
        # for each window position
        for w in range(-window_size, window_size + 1):
            context_word_pos = center_word_pos + w
            # make soure not jump out sentence
            if context_word_pos < 0 or context_word_pos >= len(indices) or center_word_pos == context_word_pos:
                continue
            context_word_idx = indices[context_word_pos]
            idx_pairs.append((indices[center_word_pos], context_word_idx))

idx_pairs = np.array(idx_pairs) # it will be useful to have this as numpy array

In [11]:
def get_input_layer(word_idx):
    x = torch.zeros(vocabulary_size).float()
    x[word_idx] = 1.0
    return x

In [39]:
embedding_dims = 5
W1 = Variable(torch.randn(embedding_dims, vocabulary_size).float(), requires_grad=True)
W2 = Variable(torch.randn(vocabulary_size, embedding_dims).float(), requires_grad=True)
num_epochs = 1000
learning_rate = 0.01

for epo in range(num_epochs):
    loss_val = 0
    for data, target in idx_pairs:
        x = Variable(get_input_layer(data)).float()
        y_true = Variable(torch.from_numpy(np.array([target])).long())

        z1 = torch.matmul(W1, x)
        z2 = torch.matmul(W2, z1)
    
        log_softmax = F.log_softmax(z2, dim=0)

        loss = F.nll_loss(log_softmax.view(1,-1), y_true)
        loss_val += loss.item()
        loss.backward()
        W1.data -= learning_rate * W1.grad.data
        W2.data -= learning_rate * W2.grad.data

        W1.grad.data.zero_()
        W2.grad.data.zero_()
    if epo % 10 == 0:
        print('epo =' + str(epo), 'loss = ' + str(loss_val/len(idx_pairs)))
#         print(f'Loss at epo {epo}: {loss_val/len(idx_pairs)}')

epo =0 loss = 4.414617017337254
epo =10 loss = 2.812441304751805
epo =20 loss = 2.4322106616837638
epo =30 loss = 2.23747455903462
epo =40 loss = 2.1135182227407183
epo =50 loss = 2.029567575454712
epo =60 loss = 1.968772886480604
epo =70 loss = 1.9210169128009251
epo =80 loss = 1.8814312934875488
epo =90 loss = 1.8480552094323295
epo =100 loss = 1.8200960516929627
epo =110 loss = 1.7969684038843428
epo =120 loss = 1.7779400382723127
epo =130 loss = 1.7621682252202715
epo =140 loss = 1.7488552689552308
epo =150 loss = 1.737350594997406
epo =160 loss = 1.727170237473079
epo =170 loss = 1.7179731096540178
epo =180 loss = 1.709526412827628
epo =190 loss = 1.701675043787275
epo =200 loss = 1.6943196586200169
epo =210 loss = 1.687399433340345
epo =220 loss = 1.6808804614203317
epo =230 loss = 1.6747456686837332
epo =240 loss = 1.6689882891518728
epo =250 loss = 1.663605192729405
epo =260 loss = 1.6585930057934353
epo =270 loss = 1.6539454392024449
epo =280 loss = 1.6496514575822012
epo =290

In [14]:
print(W1)

tensor([[ 0.6955, -0.6836,  1.3218,  1.5453,  0.2564,  0.5486,  1.8719,  1.7028,
          0.3864,  0.9638,  0.6274,  1.1836, -0.2145,  0.2023,  0.1888],
        [ 2.0977,  0.4495, -1.2802,  1.9009,  1.2276, -0.5630,  0.2409,  0.7693,
          1.6366,  0.4513,  1.7425,  1.0862,  0.0924,  1.1279,  0.7844],
        [ 0.5588,  0.5450,  0.9675, -0.1896,  1.8833,  1.8689, -0.3873,  0.4392,
         -1.1240,  0.2618, -1.4854, -0.4512,  0.5924, -0.6976,  0.1051],
        [-0.3624,  0.2806, -1.1393, -1.0024,  1.0812,  2.4830, -0.1696,  0.6498,
          1.1045,  0.4254,  0.9773, -1.5356,  0.9185,  1.6674,  1.0889],
        [ 0.4062, -0.2692,  0.1397, -0.7107,  0.5688,  1.1835,  0.6772, -1.3756,
          1.0456, -1.8099,  1.1467,  0.0613,  1.1736,  0.2118, -0.9420]],
       requires_grad=True)


In [18]:
print(W1.item)

<built-in method item of Tensor object at 0x7fa45e295dc8>


In [20]:
print(vocabulary)

['he', 'is', 'a', 'king', 'she', 'queen', 'man', 'woman', 'warsaw', 'poland', 'capital', 'berlin', 'germany', 'paris', 'france']


In [10]:
print(W1.data)

tensor([[-0.6650, -0.6515,  1.7643, -0.6612, -0.5184,  0.1936, -0.7601,  1.1543,
          0.0982,  1.0659,  1.0329,  0.8713, -1.1771,  1.2245, -0.0804],
        [-0.7244,  0.0624,  0.6872, -0.3318, -0.5093,  0.6087, -0.5052,  1.7931,
          0.3268, -0.1901,  0.4482, -0.5657, -2.5397, -0.2461, -1.9045],
        [-1.6269, -0.6633, -0.5409, -0.9472, -1.5115, -0.0293, -1.1832, -0.0305,
          0.3386, -1.2924,  1.9099,  0.0343,  1.6747,  1.2596, -0.2227],
        [ 0.2623, -0.3287,  1.0322, -0.5111, -0.2392,  1.1337,  0.1848,  0.3596,
          0.4911, -1.0449, -0.6596,  1.0359, -0.4786,  2.0907,  0.2720],
        [-0.9111,  0.5388, -0.0388, -0.9062, -1.2115, -1.7624, -0.6426, -2.1454,
         -0.9097, -0.1549, -2.4633, -0.5267, -1.9401, -1.0004, -0.3105]])


In [40]:
embedding = W1.data.numpy()

In [41]:
print(embedding)

[[-1.2215395   0.40352285  0.03259102 -1.550806   -1.7543684  -2.0483897
   0.11105409 -1.0981144  -1.8493615   0.8096374   0.26290727  2.3347015
  -1.2286545   0.26170424  0.22906108]
 [ 1.488256    0.5242639  -0.02792292  0.8890925   0.98666835  0.6626822
   2.5294178   1.160715   -2.804427    0.4686744  -2.4683678  -0.64828336
   0.16361573 -0.62877995  0.30988243]
 [ 0.94713867 -0.16445951 -1.6735532  -0.21499224  0.03861188  0.27641746
   1.8398671   0.5111441   1.3693995   3.1084561   0.74440926  0.40085322
   3.400279   -0.03269182  2.30318   ]
 [ 1.1791431  -0.29078224  2.9281821   0.6400242   0.90373707  0.39219022
   1.3582579   0.64811105  1.2456555   0.80786043  0.02840304  1.1714659
  -0.63218325 -1.1650461  -2.0187287 ]
 [ 0.78564245 -0.24239564  1.433433    1.776831    1.531726    1.4062285
   0.4802411   1.1585157   0.18140486 -0.21823777  1.6798475   2.0593903
  -0.07797164  3.0830123   1.5188963 ]]


In [44]:
print(embedding[0])

[-1.2215395   1.488256    0.94713867  1.1791431   0.78564245]


In [43]:
embedding = embedding.T

In [20]:
print(vocabulary)

['he', 'is', 'a', 'king', 'she', 'queen', 'man', 'woman', 'warsaw', 'poland', 'capital', 'berlin', 'germany', 'paris', 'france']


In [21]:
print(len(vocabulary))

15


In [45]:
def cosine_similarity(a,b):
    return np.dot(a,b) / ( (np.dot(a,a) **.5) * (np.dot(b,b) ** .5) )

In [46]:
e = {}
for i in range(len(vocabulary)):
    e[vocabulary[i]] = embedding[i]

In [47]:
print(e['king'])

[-1.550806    0.8890925  -0.21499224  0.6400242   1.776831  ]


In [49]:
print(cosine_similarity(e['poland'] - e['berlin'] + e['warsaw'], e['germany']))

0.7860721060454229


In [40]:
embedding2 = W2.data.numpy()
b = {}
for i in range(len(vocabulary)):
    b[vocabulary[i]] = embedding2[i]

In [41]:
b['he']

array([ 0.487155  ,  0.43970457, -0.3071918 ,  0.3034896 ,  0.4648041 ],
      dtype=float32)

In [53]:
print(np.dot(b['poland'], b['poland']))

4.370908


In [9]:
import torch


dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:6") # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Randomly initialize weights
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)

    # Compute and print loss
    loss = (y_pred - y).pow(2).sum().item()
    print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)

    # Update weights using gradient descent
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 27366980.0
1 24337892.0
2 24851178.0
3 25348252.0
4 23403956.0
5 18490446.0
6 12494085.0
7 7479215.0
8 4273249.0
9 2490020.0
10 1558904.625
11 1065366.25
12 789222.0
13 620933.8125
14 508837.9375
15 428078.8125
16 366265.0
17 316963.9375
18 276498.34375
19 242623.84375
20 213931.8125
21 189373.375
22 168215.5
23 149907.75
24 133968.296875
25 120014.5703125
26 107752.7734375
27 96944.9453125
28 87387.2421875
29 78919.40625
30 71392.109375
31 64685.15234375
32 58693.51953125
33 53330.37109375
34 48523.34765625
35 44202.81640625
36 40312.4296875
37 36806.13671875
38 33639.546875
39 30777.58203125
40 28185.3671875
41 25834.21875
42 23700.8828125
43 21760.79296875
44 19995.6484375
45 18387.8359375
46 16917.37890625
47 15577.505859375
48 14352.5390625
49 13232.05078125
50 12207.3232421875
51 11268.361328125
52 10407.634765625
53 9618.0244140625
54 8893.4638671875
55 8227.6630859375
56 7615.3916015625
57 7052.28955078125
58 6534.0439453125
59 6056.74169921875
60 5616.8232421875
61 5210.9423

370 0.0004226124146953225
371 0.0004101716913282871
372 0.00039761903462931514
373 0.0003858095151372254
374 0.0003740307001862675
375 0.00036375655326992273
376 0.00035318531445227563
377 0.0003421108121983707
378 0.00033289127168245614
379 0.0003230060974601656
380 0.0003136640880256891
381 0.0003049702208954841
382 0.0002967094478663057
383 0.0002878496306948364
384 0.00027999363373965025
385 0.000272721576038748
386 0.0002655550488270819
387 0.00025847918004728854
388 0.0002515523519832641
389 0.00024519854923710227
390 0.00023878642241470516
391 0.0002322124200873077
392 0.00022647152945864946
393 0.0002207747456850484
394 0.00021540505986195058
395 0.0002094554074574262
396 0.00020385533571243286
397 0.00019937986508011818
398 0.00019474170403555036
399 0.00018921207811217755
400 0.00018494410323910415
401 0.00018098877626471221
402 0.00017600991122890264
403 0.00017165187455248088
404 0.00016794964903965592
405 0.0001644461153773591
406 0.00016084901290014386
407 0.0001564796111

In [25]:
device = torch.device("cuda:6")
dtype = torch.float
test = torch.tensor([5, 3], device=device, dtype=dtype)
print(test)

tensor([5., 3.], device='cuda:6')


In [40]:
import torch

embedding_dimension = 100
dtype = torch.float
# device = torch.device("cpu")
device = torch.device("cuda:6") # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, vocabulary_size, embedding_dimension, vocabulary_size

# Create input and output data from idx_pairs
x = []
y = []
for pair in idx_pairs:
    cur_x = []
    cur_y = []
    for i in range(0, vocabulary_size):
        if(i == pair[0]):
            cur_x.append(1)
        else:
            cur_x.append(0)
        if(i == pair[1]):
            cur_y.append(1)
        else:
            cur_y.append(0)
    x.append(cur_x)
    y.append(cur_y)

print(len(x))


70


In [46]:

                

# # Create random input and output data
# x = torch.randn(N, D_in, device=device, dtype=dtype)
# y = torch.randn(N, D_out, device=device, dtype=dtype)

# Randomly initialize weights
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)
print(H, D_in, D_out)
learning_rate = 1e-3
loss_val = 0
for t in range(1):
    # Forward pass: compute predicted y
    for i in range(len(x)):
        x_vector = torch.tensor(x[i], device=device, dtype=dtype)        
        y_vector = torch.tensor(y[i], device=device, dtype=dtype)        
        z1 = torch.matmul(w1, x)
        z2 = torch.matmul(w2, z1)
        log_softmax = F.log_softmax(z2, dim=0)
        loss = F.nll_loss(log_softmax.view(1,-1), y_vector)
        loss_val += loss.item()
        loss.backward()
#         print(w1.size())
#         h_relu = h.clamp(min=0)
#         y_pred = h_relu.mm(w2)

        # Compute and print loss
#         loss = (y_pred - y).pow(2).sum().item()
        print(t, loss_val)
        W1.data -= learning_rate * W1.grad.data
        W2.data -= learning_rate * W2.grad.data

        W1.grad.data.zero_()
        W2.grad.data.zero_()

        # Backprop to compute gradients of w1 and w2 with respect to loss
#         grad_y_pred = 2.0 * (y_pred - y)
#         grad_w2 = h_relu.t().mm(grad_y_pred)
#         grad_h_relu = grad_y_pred.mm(w2.t())
#         grad_h = grad_h_relu.clone()
#         grad_h[h < 0] = 0
#         grad_w1 = x.t().mm(grad_h)

#         # Update weights using gradient descent
#         w1 -= learning_rate * grad_w1
#         w2 -= learning_rate * grad_w2

100 15 15
torch.Size([15, 100])
0 42256.65234375


In [35]:
print(w1[0])

tensor([-1.1420e-02,  6.1293e-01, -1.3923e-01, -1.8433e-02, -6.6051e-01,
         1.1946e-01,  1.3269e+00, -5.1187e-02, -5.2783e-01, -1.3794e-01,
        -9.5055e-02, -6.1692e-01, -1.9752e-01,  6.8335e-01,  6.0173e-01,
        -1.2278e-01, -3.3696e-01, -2.2094e-01, -4.1712e-01, -7.6910e-01,
        -5.6342e-01,  1.2546e-01,  7.3180e-01,  3.4895e-01,  1.6103e-01,
        -1.7344e-01, -8.9016e-04,  7.0031e-02,  9.9244e-01, -7.3614e-01,
        -9.4316e-01,  2.6637e-01, -6.9867e-01, -5.7772e-01, -4.3356e-01,
        -1.4589e+00, -7.3039e-01, -1.5080e+00, -6.0735e-02,  1.0355e-01,
        -2.2269e-01, -1.1411e+00, -1.6334e-01, -1.4689e+00, -3.1832e-01,
        -1.7231e-01, -3.4880e-01, -1.3733e-01, -2.1905e-01,  7.7471e-01,
         4.9175e-01,  5.5166e-01, -3.6932e-01,  4.7385e-01, -6.9067e-02,
        -1.1892e-01, -2.2312e+00,  1.5254e-01, -5.2823e-02, -1.3127e+00,
        -9.7139e-02, -1.1388e-01, -7.0674e-01, -7.9843e-02, -1.0846e+00,
         7.5075e-01,  7.3198e-01, -7.9156e-01, -6.2

In [None]:
print