In [1]:
import torch
from torch.autograd import Variable
import numpy as np
import torch.functional as F
import torch.nn.functional as F
import nltk

In [2]:
from nltk.corpus import reuters
corpus = [
    'he is a king',
    'she is a queen',
    'he is a man',
    'she is a woman',
    'warsaw is poland capital',
    'berlin is germany capital',
    'paris is france capital',   
]



In [3]:
def tokenize_corpus(corpus):
    tokens = [x.split() for x in corpus]
    return tokens

tokenized_corpus = tokenize_corpus(corpus)
print(tokenized_corpus)

[['he', 'is', 'a', 'king'], ['she', 'is', 'a', 'queen'], ['he', 'is', 'a', 'man'], ['she', 'is', 'a', 'woman'], ['warsaw', 'is', 'poland', 'capital'], ['berlin', 'is', 'germany', 'capital'], ['paris', 'is', 'france', 'capital']]


In [4]:
vocabulary = []
for sentence in tokenized_corpus:
    for token in sentence:
        if token not in vocabulary:
            vocabulary.append(token)

word2idx = {w: idx for (idx, w) in enumerate(vocabulary)}
idx2word = {idx: w for (idx, w) in enumerate(vocabulary)}

vocabulary_size = len(vocabulary)

In [5]:
import numpy as np
window_size = 2
idx_pairs = []
# for each sentence
for sentence in tokenized_corpus:
    indices = [word2idx[word] for word in sentence]
    # for each word, threated as center word
    for center_word_pos in range(len(indices)):
        # for each window position
        for w in range(-window_size, window_size + 1):
            context_word_pos = center_word_pos + w
            # make soure not jump out sentence
            if context_word_pos < 0 or context_word_pos >= len(indices) or center_word_pos == context_word_pos:
                continue
            context_word_idx = indices[context_word_pos]
            idx_pairs.append((indices[center_word_pos], context_word_idx))

idx_pairs = np.array(idx_pairs) # it will be useful to have this as numpy array

In [6]:
def get_input_layer(word_idx):
    x = torch.zeros(vocabulary_size).float()
    x[word_idx] = 1.0
    return x

In [7]:
embedding_dims = 5
W1 = Variable(torch.randn(embedding_dims, vocabulary_size).float(), requires_grad=True)
W2 = Variable(torch.randn(vocabulary_size, embedding_dims).float(), requires_grad=True)
num_epochs = 100
learning_rate = 0.01

for epo in range(num_epochs):
    loss_val = 0
    for data, target in idx_pairs:
        x = Variable(get_input_layer(data)).float()
        y_true = Variable(torch.from_numpy(np.array([target])).long())

        z1 = torch.matmul(W1, x)
        z2 = torch.matmul(W2, z1)
    
        log_softmax = F.log_softmax(z2, dim=0)

        loss = F.nll_loss(log_softmax.view(1,-1), y_true)
        loss_val += loss.item()
        loss.backward()
        W1.data -= learning_rate * W1.grad.data
        W2.data -= learning_rate * W2.grad.data

        W1.grad.data.zero_()
        W2.grad.data.zero_()
    if epo % 10 == 0:
        print('epo =' + str(epo), 'loss = ' + str(loss_val/len(idx_pairs)))
#         print(f'Loss at epo {epo}: {loss_val/len(idx_pairs)}')

epo =0 loss = 3.83390862260546
epo =10 loss = 2.752981013911111
epo =20 loss = 2.4478960990905763
epo =30 loss = 2.2490220512662615
epo =40 loss = 2.0967302407537187
epo =50 loss = 1.9795851520129613
epo =60 loss = 1.8949202265058245
epo =70 loss = 1.8356305922780718
epo =80 loss = 1.7933225870132445
epo =90 loss = 1.7619091647011893


In [8]:
print(W1)

tensor([[-0.6650, -0.6515,  1.7643, -0.6612, -0.5184,  0.1936, -0.7601,  1.1543,
          0.0982,  1.0659,  1.0329,  0.8713, -1.1771,  1.2245, -0.0804],
        [-0.7244,  0.0624,  0.6872, -0.3318, -0.5093,  0.6087, -0.5052,  1.7931,
          0.3268, -0.1901,  0.4482, -0.5657, -2.5397, -0.2461, -1.9045],
        [-1.6269, -0.6633, -0.5409, -0.9472, -1.5115, -0.0293, -1.1832, -0.0305,
          0.3386, -1.2924,  1.9099,  0.0343,  1.6747,  1.2596, -0.2227],
        [ 0.2623, -0.3287,  1.0322, -0.5111, -0.2392,  1.1337,  0.1848,  0.3596,
          0.4911, -1.0449, -0.6596,  1.0359, -0.4786,  2.0907,  0.2720],
        [-0.9111,  0.5388, -0.0388, -0.9062, -1.2115, -1.7624, -0.6426, -2.1454,
         -0.9097, -0.1549, -2.4633, -0.5267, -1.9401, -1.0004, -0.3105]],
       requires_grad=True)


In [18]:
print(W1.item)

<built-in method item of Tensor object at 0x7fa45e295dc8>


In [20]:
print(vocabulary)

['he', 'is', 'a', 'king', 'she', 'queen', 'man', 'woman', 'warsaw', 'poland', 'capital', 'berlin', 'germany', 'paris', 'france']


In [10]:
print(W1.data)

tensor([[-0.6650, -0.6515,  1.7643, -0.6612, -0.5184,  0.1936, -0.7601,  1.1543,
          0.0982,  1.0659,  1.0329,  0.8713, -1.1771,  1.2245, -0.0804],
        [-0.7244,  0.0624,  0.6872, -0.3318, -0.5093,  0.6087, -0.5052,  1.7931,
          0.3268, -0.1901,  0.4482, -0.5657, -2.5397, -0.2461, -1.9045],
        [-1.6269, -0.6633, -0.5409, -0.9472, -1.5115, -0.0293, -1.1832, -0.0305,
          0.3386, -1.2924,  1.9099,  0.0343,  1.6747,  1.2596, -0.2227],
        [ 0.2623, -0.3287,  1.0322, -0.5111, -0.2392,  1.1337,  0.1848,  0.3596,
          0.4911, -1.0449, -0.6596,  1.0359, -0.4786,  2.0907,  0.2720],
        [-0.9111,  0.5388, -0.0388, -0.9062, -1.2115, -1.7624, -0.6426, -2.1454,
         -0.9097, -0.1549, -2.4633, -0.5267, -1.9401, -1.0004, -0.3105]])


In [11]:
embedding = W1.data.numpy()

In [18]:
print(embedding)

[[-0.66498345 -0.7244271  -1.6269408   0.26228485 -0.9111005 ]
 [-0.65151286  0.06237222 -0.66330755 -0.32867888  0.5387854 ]
 [ 1.7643375   0.6872097  -0.5408893   1.0322185  -0.03879061]
 [-0.6611918  -0.3318367  -0.9471532  -0.51109403 -0.90621793]
 [-0.5184357  -0.5093096  -1.5115323  -0.23921917 -1.2114769 ]
 [ 0.19364242  0.6086715  -0.02925137  1.133663   -1.7624112 ]
 [-0.7601495  -0.5052044  -1.1832007   0.18479522 -0.6426129 ]
 [ 1.1543491   1.7930627  -0.03048167  0.35959837 -2.1454127 ]
 [ 0.09816273  0.32684568  0.33861876  0.49112305 -0.90969056]
 [ 1.0658635  -0.1900604  -1.2923604  -1.0449433  -0.15485954]
 [ 1.0328833   0.44815314  1.9099499  -0.6596172  -2.4632773 ]
 [ 0.8712858  -0.5656944   0.03429155  1.0358573  -0.526661  ]
 [-1.1771438  -2.5396633   1.6747174  -0.47864276 -1.9401181 ]
 [ 1.2244781  -0.24606352  1.2596201   2.0907245  -1.0003916 ]
 [-0.08039328 -1.9045221  -0.22272508  0.2720494  -0.31049582]]


In [16]:
print(embedding[0])

[-0.66498345 -0.7244271  -1.6269408   0.26228485 -0.9111005 ]


In [15]:
embedding = embedding.T

In [19]:
print(vocabulary)

['he', 'is', 'a', 'king', 'she', 'queen', 'man', 'woman', 'warsaw', 'poland', 'capital', 'berlin', 'germany', 'paris', 'france']


In [20]:
print(len(vocabulary))

15


In [23]:
e = {}
for i in range(len(vocabulary)):
    e[vocabulary[i]] = embedding[i]

In [24]:
print(e['king'])

[-0.6611918  -0.3318367  -0.9471532  -0.51109403 -0.90621793]


In [31]:
print(np.dot(e['he'], e['is']))

0.89012814


In [40]:
embedding2 = W2.data.numpy()
b = {}
for i in range(len(vocabulary)):
    b[vocabulary[i]] = embedding2[i]

In [41]:
b['he']

array([ 0.487155  ,  0.43970457, -0.3071918 ,  0.3034896 ,  0.4648041 ],
      dtype=float32)

In [53]:
print(np.dot(b['poland'], b['poland']))

4.370908


In [8]:
import torch


dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:6") # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Randomly initialize weights
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)

    # Compute and print loss
    loss = (y_pred - y).pow(2).sum().item()
    print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)

    # Update weights using gradient descent
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

NameError: name 'device' is not defined

In [25]:
device = torch.device("cuda:6")
dtype = torch.float
test = torch.tensor([5, 3], device=device, dtype=dtype)
print(test)

tensor([5., 3.], device='cuda:6')


In [40]:
import torch

embedding_dimension = 100
dtype = torch.float
# device = torch.device("cpu")
device = torch.device("cuda:6") # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, vocabulary_size, embedding_dimension, vocabulary_size

# Create input and output data from idx_pairs
x = []
y = []
for pair in idx_pairs:
    cur_x = []
    cur_y = []
    for i in range(0, vocabulary_size):
        if(i == pair[0]):
            cur_x.append(1)
        else:
            cur_x.append(0)
        if(i == pair[1]):
            cur_y.append(1)
        else:
            cur_y.append(0)
    x.append(cur_x)
    y.append(cur_y)

print(len(x))


70


In [46]:

                

# # Create random input and output data
# x = torch.randn(N, D_in, device=device, dtype=dtype)
# y = torch.randn(N, D_out, device=device, dtype=dtype)

# Randomly initialize weights
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)
print(H, D_in, D_out)
learning_rate = 1e-3
loss_val = 0
for t in range(1):
    # Forward pass: compute predicted y
    for i in range(len(x)):
        x_vector = torch.tensor(x[i], device=device, dtype=dtype)        
        y_vector = torch.tensor(y[i], device=device, dtype=dtype)        
        z1 = torch.matmul(w1, x)
        z2 = torch.matmul(w2, z1)
        log_softmax = F.log_softmax(z2, dim=0)
        loss = F.nll_loss(log_softmax.view(1,-1), y_vector)
        loss_val += loss.item()
        loss.backward()
#         print(w1.size())
#         h_relu = h.clamp(min=0)
#         y_pred = h_relu.mm(w2)

        # Compute and print loss
#         loss = (y_pred - y).pow(2).sum().item()
        print(t, loss_val)
        W1.data -= learning_rate * W1.grad.data
        W2.data -= learning_rate * W2.grad.data

        W1.grad.data.zero_()
        W2.grad.data.zero_()

        # Backprop to compute gradients of w1 and w2 with respect to loss
#         grad_y_pred = 2.0 * (y_pred - y)
#         grad_w2 = h_relu.t().mm(grad_y_pred)
#         grad_h_relu = grad_y_pred.mm(w2.t())
#         grad_h = grad_h_relu.clone()
#         grad_h[h < 0] = 0
#         grad_w1 = x.t().mm(grad_h)

#         # Update weights using gradient descent
#         w1 -= learning_rate * grad_w1
#         w2 -= learning_rate * grad_w2

100 15 15
torch.Size([15, 100])
0 42256.65234375


In [35]:
print(w1[0])

tensor([-1.1420e-02,  6.1293e-01, -1.3923e-01, -1.8433e-02, -6.6051e-01,
         1.1946e-01,  1.3269e+00, -5.1187e-02, -5.2783e-01, -1.3794e-01,
        -9.5055e-02, -6.1692e-01, -1.9752e-01,  6.8335e-01,  6.0173e-01,
        -1.2278e-01, -3.3696e-01, -2.2094e-01, -4.1712e-01, -7.6910e-01,
        -5.6342e-01,  1.2546e-01,  7.3180e-01,  3.4895e-01,  1.6103e-01,
        -1.7344e-01, -8.9016e-04,  7.0031e-02,  9.9244e-01, -7.3614e-01,
        -9.4316e-01,  2.6637e-01, -6.9867e-01, -5.7772e-01, -4.3356e-01,
        -1.4589e+00, -7.3039e-01, -1.5080e+00, -6.0735e-02,  1.0355e-01,
        -2.2269e-01, -1.1411e+00, -1.6334e-01, -1.4689e+00, -3.1832e-01,
        -1.7231e-01, -3.4880e-01, -1.3733e-01, -2.1905e-01,  7.7471e-01,
         4.9175e-01,  5.5166e-01, -3.6932e-01,  4.7385e-01, -6.9067e-02,
        -1.1892e-01, -2.2312e+00,  1.5254e-01, -5.2823e-02, -1.3127e+00,
        -9.7139e-02, -1.1388e-01, -7.0674e-01, -7.9843e-02, -1.0846e+00,
         7.5075e-01,  7.3198e-01, -7.9156e-01, -6.2

In [None]:
print