In [2]:
import torch
import torch.nn.functional as F

from helper import look

# Skip-gram

Vocabulary size 5:

$V = \{w_0, w_1, w_2, w_3, w_4\}$

One-hot of context word:

$o_c = o_1 = \begin{bmatrix}0 \\ 1 \\  0 \\  0 \\  0\end{bmatrix}$

3D embeddings for context words:

$E^I = \begin{bmatrix}-0.4024 & 1.0039 & -0.0477 &  0.8278 & -0.0195 \\ -0.8807 & 2.2967 & -0.1497 & -0.4184 & -1.0950 \\  0.4404 & 1.3009 &  0.0189 &  0.9072 &  1.0590 \end{bmatrix} $

Embedding of context word:

$e_c = E^I \cdot o_c = \begin{bmatrix} 1.0039 \\ 2.2967 \\ 1.3009\end{bmatrix}$

One-hot of target word:

$o_t = o_3 = \begin{bmatrix}0 \\ 0 \\  0 \\  1 \\  0\end{bmatrix}$

3D embeddings for target words:

$E^O = \begin{bmatrix} 1.7711 & -0.2803 & -0.6955 &  1.0053 & -0.1339 \\ -1.6385 &  0.1701 &  0.2023 &  0.4325 & -0.4994 \\ -2.2435 &  0.1664 &  1.4737 & -1.1486 & -0.0510 \end{bmatrix} = \begin{bmatrix} e_{w_0} & e_{w_1} & e_{w_2} & e_{w_3} & e_{w_4} \end{bmatrix}$

Each possible target embedding $e_{w_i}$ multiply with content embedding $e_c$ and turn to probabilities.

$\begin{bmatrix} \frac{e_{w_0} e_c}{\sum_{j=0}^{4} e_{w_j} e_c} & \frac{e_{w_1} e_c}{\sum_{j=0}^{4} e_{w_j} e_c} & \frac{e_{w_2} e_c}{\sum_{j=0}^{4} e_{w_j} e_c} & \frac{e_{w_3} e_c}{\sum_{j=0}^{4} e_{w_j} e_c} & \frac{e_{w_4} e_c}{\sum_{i=0}^{4} e_{w_j} e_c}\end{bmatrix}$

Target one-hot:

$o_t = \begin{bmatrix}0 \\ 0 \\  0 \\  1 \\  0\end{bmatrix}$

Maximizing the probability is minimizing the loss:

$L(\Theta) = \prod_{t = 1}^{T} \prod_{c \in C(t)} p(t | c)$

where $p(t|c) = \frac{e_t e_c}{\sum_{j=1}^{T} e_j e_c}$

$J(\Theta) = -\frac{1}{T} log L(\Theta)$

In [4]:
torch.manual_seed(42)

D = 3 # embedding size
V = 5 # vobavulary size

# context word, one-hot 
c = 1
o_c = torch.tensor(c)
look("$o_c$=", o_c)

# target word, one-hot
t = 3
o_t = torch.tensor(t).unsqueeze(0)
look("$o_t$=", o_t)

# embeddings for context words
E_i = torch.nn.Embedding(num_embeddings=V, embedding_dim=D)
look("$E^I=$", E_i.weight)
e_c = E_i(o_c) # context word, embedding
assert e_c.shape == (D, )
look("$e_c =$", e_c)

# embeddings for target words
E_o = torch.nn.Embedding(num_embeddings=V, embedding_dim=D)

# all embeddings of words
all_e_w = E_o(torch.tensor([range(V)])).squeeze()

# how similar are embedding of all words and context words
all_sim_w_with_c = torch.sum(all_e_w * e_c, dim=1)
assert all_sim_w_with_c.shape == (V,)
look("Similarities of all w=", all_sim_w_with_c)

# turn similarities into probabilities and apply log (because of )
y_hat = F.log_softmax(all_sim_w_with_c, dim=0).view(1, -1)
assert y_hat.shape == (1, V)

look("Output logits:", y_hat)

loss = F.nll_loss(y_hat, o_t)
look("nll loss:", loss)


$o_c$= 1

$o_t$= $\begin{bmatrix} 3\end{bmatrix}$

$E^I=$ $\begin{bmatrix} 0.337 & 0.129 & 0.234 \\ 0.23 & -1.12 & -0.186 \\ 2.21 & -0.638 & 0.462 \\ 0.267 & 0.535 & 0.809 \\ 1.11 & -1.69 & -0.989\end{bmatrix}$

$e_c =$ $\begin{bmatrix} 0.23 & -1.12 & -0.186\end{bmatrix}$

Similarities of all w= $\begin{bmatrix} -1.42 & 0.414 & 0.378 & -0.0956 & -0.0969\end{bmatrix}$

Output logits: $\begin{bmatrix} -3.03 & -1.2 & -1.24 & -1.71 & -1.71\end{bmatrix}$

nll loss: 1.71

# NLL Loss: Negative log-likelihood loss.

In [5]:
torch.manual_seed(42)
input = torch.rand(3, 5)
target = torch.tensor([1, 0, 4])

pred = F.log_softmax(input, dim=1)
fw_output = F.nll_loss(pred, target)

man_output = -torch.mean(torch.gather(pred, dim=1, index=target.view(-1, 1)))

look("input", input)
look("logit = log softmax input", pred)
look("target", target)
look("gather pred by target", torch.gather(pred, dim=1, index=target.view(-1, 1)))
look("nll loss", fw_output)

print("Framework output:", fw_output.item())
print("Manual output   :", man_output.item())

input $\begin{bmatrix} 0.882 & 0.915 & 0.383 & 0.959 & 0.39 \\ 0.601 & 0.257 & 0.794 & 0.941 & 0.133 \\ 0.935 & 0.594 & 0.869 & 0.568 & 0.741\end{bmatrix}$

logit = log softmax input $\begin{bmatrix} -1.47 & -1.43 & -1.97 & -1.39 & -1.96 \\ -1.6 & -1.94 & -1.41 & -1.26 & -2.07 \\ -1.43 & -1.77 & -1.49 & -1.79 & -1.62\end{bmatrix}$

target $\begin{bmatrix} 1 & 0 & 4\end{bmatrix}$

gather pred by target $\begin{bmatrix} -1.43 \\ -1.6 \\ -1.62\end{bmatrix}$

nll loss 1.55

Framework output: 1.551107406616211
Manual output   : 1.551107406616211


## Negative log-likelihood loss

Likelihood $$\prod_{i=1}^{n} \hat{y_i}^{y_i}(1-\hat{y_i})^{1 - y_i}$$

Log-likelihood $$\sum_{i=1}^{n} ({y_i}log(\hat{y_i}) + (1 - y_i)log(1-\hat{y_i}))$$

Minimazing log-likelihood $$L(\hat{y}, y) = -\sum_{i=1}^{n} ({y_i}log(\hat{y_i}) + (1 - y_i)log(1-\hat{y_i}))$$

# Skip-gram with negative sampling

In [6]:
torch.manual_seed(42)

D = 3 # embedding size
V = 5 # vobavulary size

# context word, one-hot 
c = 1
o_c = torch.tensor(c)
look("$o_c=$", o_c)

# target word, one-hot
t = 3
o_t = torch.tensor(t)
look("$o_t=$", o_t)

# sampled negative words
ns = [0, 2]
o_ns = torch.tensor(ns)
look("$o_ns=$", o_ns)

# embeddings for context words
E_i = torch.nn.Embedding(num_embeddings=V, embedding_dim=D)

e_c = E_i(o_c) # context word, embedding
assert e_c.shape == (D, )
look("$e_c=$", e_c)

# embeddings for target and negative words
E_o = torch.nn.Embedding(num_embeddings=V, embedding_dim=D)

# embedding of target word
e_t = E_o(o_t)
look("$e_t=$", e_t)

# all embeddings of negative words
e_ns = E_o(o_ns)
look("$e_ns=$", e_ns)

# how similar are context and target
sim_c_t = torch.sum(e_c * e_t)
look("$sim_{c,t}=$", sim_c_t)
# how similar are negative context words and target
sims_ns_t = torch.matmul(e_ns, e_t.unsqueeze(0).T).squeeze()
look("$sims_{ns,t}=$", sims_ns_t)

# concatenate similarities (first positive, other negative)
y_hat = torch.cat([sim_c_t.unsqueeze(0), sims_ns_t])
look("$\hat{y}=$", y_hat)

y = torch.zeros(y_hat.shape)
y[0] = 1
look("$y=$", y)

# calculate loss
loss = F.binary_cross_entropy_with_logits(y_hat, y)
look("BCE loss", loss)

$o_c=$ 1

$o_t=$ 3

$o_ns=$ $\begin{bmatrix} 0 & 2\end{bmatrix}$

$e_c=$ $\begin{bmatrix} 0.23 & -1.12 & -0.186\end{bmatrix}$

$e_t=$ $\begin{bmatrix} 0.282 & 0.0562 & 0.523\end{bmatrix}$

$e_ns=$ $\begin{bmatrix} 0.958 & 1.32 & 0.817 \\ 0.686 & -0.328 & 0.795\end{bmatrix}$

$sim_{c,t}=$ -0.0956

$sims_{ns,t}=$ $\begin{bmatrix} 0.771 & 0.59\end{bmatrix}$

$\hat{y}=$ $\begin{bmatrix} -0.0956 & 0.771 & 0.59\end{bmatrix}$

$y=$ $\begin{bmatrix} 1.0 & 0.0 & 0.0\end{bmatrix}$

BCE loss 0.975