In [1]:
import torch
from torch import nn
from torch.nn import functional as F

  from .autonotebook import tqdm as notebook_tqdm


- embedding 的计算过程
    + 嵌入表是一个形状为 (n, h) 的权重矩阵（n 为词表大小，h 为向量维度，即下文代码注释中的 m）
    + 索引方式：按索引查表，等价于 one-hot 编码 × 权重矩阵
    + input shape: (b, s)，元素为整型索引
    + embedding(input): (b, s, h)
    

## 1.1 简单前向

In [2]:
# Build a toy vocabulary: 10 words, each represented by a 3-dimensional vector
# (an n x m lookup table with n=10, m=3).
embedding = nn.Embedding(num_embeddings=10, embedding_dim=3)

In [3]:
# Display the learnable lookup table held by the embedding layer (one row per index).
embedding.weight

Parameter containing:
tensor([[-0.4422, -1.4363, -0.4358],
        [-1.6103, -1.4728,  0.9453],
        [-0.6646,  0.8704, -1.1941],
        [-0.3679, -0.6565, -0.8222],
        [ 0.1443, -0.2155, -1.8041],
        [-0.9040, -1.9084, -0.2792],
        [ 0.3402, -1.6956, -0.5691],
        [ 1.3087,  0.5668, -1.4190],
        [-1.1967,  0.0046, -0.9748],
        [ 0.2155, -0.1001,  1.6694]], requires_grad=True)

In [18]:
# The input holds indices into the embedding table: 2 sentences (batch b=2),
# each made of 4 words (sequence length s=4), i.e. shape (b, s) = (2, 4).
# torch.tensor with an explicit dtype replaces the legacy torch.LongTensor constructor.
inputs = torch.tensor([[1, 2, 4, 5], [4, 3, 2, 8]], dtype=torch.long)
inputs.dtype

torch.int64

In [19]:
# Run the lookup once and reuse the result instead of calling the layer twice:
# indices of shape (b, s) become vectors of shape (b, s, m).
embedded = embedding(inputs)
embedded, embedded.shape

(tensor([[[-1.6103, -1.4728,  0.9453],
          [-0.6646,  0.8704, -1.1941],
          [ 0.1443, -0.2155, -1.8041],
          [-0.9040, -1.9084, -0.2792]],
 
         [[ 0.1443, -0.2155, -1.8041],
          [-0.3679, -0.6565, -0.8222],
          [-0.6646,  0.8704, -1.1941],
          [-1.1967,  0.0046, -0.9748]]], grad_fn=<EmbeddingBackward0>),
 torch.Size([2, 4, 3]))

## 1.2 one-hot 矩阵算法

In [24]:
# Expand every single index into a 0/1 vector: (2, 4) -> (2, 4, 10).
# The one-hot width is taken from the embedding table so it always matches the
# vocabulary size instead of repeating the literal 10.
input_onehot = F.one_hot(inputs, num_classes=embedding.num_embeddings)
input_onehot, input_onehot.shape

(tensor([[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
          [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
          [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
          [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]],
 
         [[0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
          [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
          [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
          [0, 0, 0, 0, 0, 0, 0, 0, 1, 0]]]),
 torch.Size([2, 4, 10]))

In [25]:
# The word-vector table and its dtype.
lookup_table = embedding.weight
lookup_table.dtype, lookup_table

(torch.float32,
 Parameter containing:
 tensor([[-0.4422, -1.4363, -0.4358],
         [-1.6103, -1.4728,  0.9453],
         [-0.6646,  0.8704, -1.1941],
         [-0.3679, -0.6565, -0.8222],
         [ 0.1443, -0.2155, -1.8041],
         [-0.9040, -1.9084, -0.2792],
         [ 0.3402, -1.6956, -0.5691],
         [ 1.3087,  0.5668, -1.4190],
         [-1.1967,  0.0046, -0.9748],
         [ 0.2155, -0.1001,  1.6694]], requires_grad=True))

In [26]:
# Cast the integer one-hot tensor to float32, the dtype of the embedding weights.
input_onehot.to(torch.float32)

tensor([[[0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
         [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
         [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 1., 0.]]])

In [27]:
# Multiplying the float one-hot tensor by the weight matrix picks out one weight
# row per index, which reproduces the embedding lookup result.
onehot_float = input_onehot.to(torch.float32)
onehot_float @ embedding.weight

tensor([[[-1.6103, -1.4728,  0.9453],
         [-0.6646,  0.8704, -1.1941],
         [ 0.1443, -0.2155, -1.8041],
         [-0.9040, -1.9084, -0.2792]],

        [[ 0.1443, -0.2155, -1.8041],
         [-0.3679, -0.6565, -0.8222],
         [-0.6646,  0.8704, -1.1941],
         [-1.1967,  0.0046, -0.9748]]], grad_fn=<UnsafeViewBackward0>)

## 1.3 权重的初始化分布与 max_norm

In [32]:
# Fresh 3 x 5 table; show the mean and standard deviation of the initial weights
# next to the weights themselves.
embedding = nn.Embedding(num_embeddings=3, embedding_dim=5)
initial_weight = embedding.weight
initial_weight.mean(), initial_weight.std(), initial_weight

(tensor(-0.1346, grad_fn=<MeanBackward0>),
 tensor(1.0137, grad_fn=<StdBackward0>),
 Parameter containing:
 tensor([[-0.4547,  1.1200, -1.5495,  1.4884, -0.6632],
         [-0.1234, -0.7767, -0.1438,  0.0704, -0.5591],
         [ 1.5854, -1.1521, -0.7820, -1.2070,  1.1288]], requires_grad=True))

In [33]:
# 2-norm of every embedding vector (reduce over dim=1) and of every feature
# column (reduce over dim=0).
# torch.linalg.vector_norm replaces torch.norm, which the PyTorch documentation
# marks as deprecated; its default order is the same 2-norm.
weight = embedding.weight
torch.linalg.vector_norm(weight, dim=1), torch.linalg.vector_norm(weight, dim=0)

(tensor([2.5529, 0.9781, 2.6802], grad_fn=<NormBackward1>),
 tensor([1.6539, 1.7847, 1.7416, 1.9176, 1.4236], grad_fn=<NormBackward1>))

In [36]:
# Re-create the 3 x 5 table, this time with max_norm=3.
# NOTE: per the nn.Embedding documentation the renormalisation is carried out by
# forward(), which modifies the weight in place. The weights displayed here, right
# after construction and before any lookup, can therefore still contain rows whose
# norm is larger than 3.
embedding = nn.Embedding(num_embeddings=3, embedding_dim=5, max_norm=3)
weight_before_lookup = embedding.weight
weight_before_lookup.mean(), weight_before_lookup.std(), weight_before_lookup

(tensor(0.0374, grad_fn=<MeanBackward0>),
 tensor(1.2599, grad_fn=<StdBackward0>),
 Parameter containing:
 tensor([[-1.6550,  0.9733, -2.0622,  2.5498,  1.8684],
         [ 1.1724, -0.3173, -0.3685,  0.3944, -0.1885],
         [ 0.6279, -0.6441, -0.6993, -1.1012,  0.0101]], requires_grad=True))