In [88]:
import math
import os
import re
import string

import pandas as pd
import torch
import torch.nn as nn

In [89]:
# Root directory of the chatbot data. Set the CHATBOT_DATA_DIR environment variable to
# point somewhere else; when it is unset the original local path is used unchanged.
PATH = os.environ.get('CHATBOT_DATA_DIR', 'C:/Users/bill/Documents/projects/data/chatbot')
# GPT train / test text files live under <PATH>/gpt/
train_file = f'{PATH}/gpt/train.txt'
test_file = f'{PATH}/gpt/test.txt'

### LayerNorm
Given an input $x\in \mathbb{R}^{B \times T \times H}$, layer norm is defined by

$$
    \mathbf{y}_t = \frac{\mathbf{x}_t - \mu_t}{\sqrt{\sigma_t^2 + \epsilon}}\odot \boldsymbol{\gamma}_t + \boldsymbol{\beta}_t
$$
where 
- $\gamma \in \mathbb{R}^{T\times H}$ is the weight term
- $\beta \in \mathbb{R}^{T\times H}$ is the bias term
- $\mu_t$ is the average over all the hidden units
- $\sigma_t$ is the standard deviation over all the hidden units

In [90]:
class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with a learnable scale and shift.

    Args:
        hidden_size: shape of the `weight` and `bias` parameters (an int or a
            shape tuple; it is passed straight to `torch.ones` / `torch.zeros`).
        eps: constant added to the variance before taking the square root.
    """

    def __init__(self, hidden_size, eps=1e-12):
        super().__init__()
        self.variance_epsilon = eps
        # the scale starts at one and the shift at zero
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.bias = nn.Parameter(torch.zeros(hidden_size))

    def forward(self, x):
        """Normalise `x` along its last axis, then apply `weight` and `bias`."""
        mean = x.mean(-1, keepdim=True)
        centered = x - mean
        # biased variance: mean of the squared deviations over the last axis
        variance = centered.pow(2).mean(-1, keepdim=True)
        normalised = centered / torch.sqrt(variance + self.variance_epsilon)
        return self.weight * normalised + self.bias

In [91]:
# Random demo tensor of shape 8 x 10 x 768, matching the B x T x H notation used above.
# NOTE(review): the name `input` shadows Python's built-in input(); later cells refer to
# this tensor by that name, so renaming it means updating those cells too.
input = torch.randn(8, 10, 768)

In [92]:
# Hand-written LayerNorm whose parameters are shaped like one sample (all dims but the batch)
m = LayerNorm(input.shape[1:])
output = m(input)
# For each learnable parameter show its name, its shape and the first ten values of its first row
for name, param in m.named_parameters():
    print(name, param.shape, param[0][0:10])
output.shape

weight torch.Size([10, 768]) tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.], grad_fn=<SliceBackward>)
bias torch.Size([10, 768]) tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], grad_fn=<SliceBackward>)


torch.Size([8, 10, 768])

In [93]:
# Same inspection as for the hand-written layer, this time with PyTorch's built-in nn.LayerNorm
m = nn.LayerNorm(input.shape[1:])
output = m(input)
# For each learnable parameter show its name, its shape and the first ten values of its first row
for name, param in m.named_parameters():
    print(name, param.shape, param[0][0:10])
output.shape

weight torch.Size([10, 768]) tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.], grad_fn=<SliceBackward>)
bias torch.Size([10, 768]) tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], grad_fn=<SliceBackward>)


torch.Size([8, 10, 768])

Given $x\in \mathbb{R}^{B \times T \times H}$, `Conv1D(F, H)` is just a linear layer that outputs $y\in \mathbb{R}^{B \times T \times F}$

In [162]:
class Conv1D(nn.Module):
    """Conv1D layer as defined by Radford et al. for OpenAI GPT (also used in GPT-2).

    Works like a linear layer that maps the last dimension from H to F, except
    that the weight is stored transposed, with shape H x F.

    Args:
        F: number of output features.
        H: number of input features.
    """

    def __init__(self, F, H):
        super().__init__()
        self.F = F
        # weight is H x F, filled from a normal distribution with std 0.02
        self.weight = nn.Parameter(torch.empty(H, F))
        nn.init.normal_(self.weight, std=0.02)
        # bias is a zero vector of length F
        self.bias = nn.Parameter(torch.zeros(F))

    def forward(self, x):
        """Map `x` of shape (..., H) to (..., F) by computing `x @ weight + bias`."""
        leading_dims = x.shape[:-1]
        # collapse every leading dim so that one 2-D addmm does the whole product
        flat = x.view(-1, x.shape[-1])
        projected = torch.addmm(self.bias, flat, self.weight)
        # restore the leading dims, with F as the new last dim
        return projected.view(*leading_dims, self.F)
    
class Conv1DNew(nn.Module):
    """The same H -> F projection expressed with a stock `nn.Linear`.

    Args:
        F: number of output features.
        H: number of input features.
    """

    def __init__(self, F, H):
        super().__init__()
        linear = nn.Linear(H, F)
        # overwrite the default init: normal weights with std 0.02 and an all-zero bias
        nn.init.normal_(linear.weight, std=0.02)
        nn.init.constant_(linear.bias, 0)
        self.l = linear

    def forward(self, x):
        """Apply the wrapped linear layer to `x`."""
        return self.l(x)

In [165]:
import torch.nn.functional as F
# Reference layer that keeps its weight in the transposed (H x F) layout
m = Conv1D(20, 768)
output = m(input)

# nn.Linear keeps its weight as F x H, so hand it the transposed Conv1D weight
n = Conv1DNew(20, 768)
n.l.weight = nn.Parameter(m.weight.t())
output1 = n(input)

# The two layers go through different kernels (addmm vs. nn.Linear), which may round
# differently, so compare within a floating-point tolerance instead of bit-for-bit.
assert torch.allclose(output, output1, atol=1e-6)
print(f'input: {input.shape}, output: {output.shape}')

input: torch.Size([8, 10, 768]), output: torch.Size([8, 10, 20])


In [None]:
class Attention(nn.Module):
    """Causal multi-head self-attention built from two `Conv1D` projections.

    Args:
        nx: width of the input features; also used as the attention state size.
        n_ctx: side length of the lower-triangular causal mask.
        config: object providing `n_head`, the number of attention heads.
        scale: when True, attention scores are divided by sqrt(head_features).
    """

    def __init__(self, nx, n_ctx, config, scale=False):
        super().__init__()
        # the attention state has the same width as the input
        n_state = nx
        assert n_state % config.n_head == 0
        # lower-triangular mask of ones, stored as a (1, 1, n_ctx, n_ctx) buffer
        causal_mask = torch.tril(torch.ones(n_ctx, n_ctx))
        self.register_buffer("bias", causal_mask.view(1, 1, n_ctx, n_ctx))
        self.n_head = config.n_head
        self.split_size = n_state
        self.scale = scale
        # a single projection produces query, key and value side by side
        self.c_attn = Conv1D(n_state * 3, nx)
        self.c_proj = Conv1D(n_state, nx)

    def _attn(self, q, k, v):
        """Return softmax(mask(q @ k)) @ v; `k` must already be in its transposed layout."""
        scores = torch.matmul(q, k)
        if self.scale:
            scores = scores / math.sqrt(v.size(-1))
        n_query, n_key = scores.shape[-2:]
        # mask rows for the last n_query positions out of n_key
        mask = self.bias[:, :, n_key - n_query:n_key, :n_key]
        # visible scores stay as they are, hidden ones become -1e10 so softmax zeroes them
        scores = scores * mask - 1e10 * (1 - mask)
        probs = torch.softmax(scores, dim=-1)
        return torch.matmul(probs, v)

    def merge_heads(self, x):
        """(batch, head, seq_length, head_features) -> (batch, seq_length, head * head_features)."""
        x = x.permute(0, 2, 1, 3).contiguous()
        leading_dims = x.shape[:-2]
        # in Tensorflow implem: fct merge_states
        return x.view(*leading_dims, x.size(-2) * x.size(-1))

    def split_heads(self, x, k=False):
        """Split the last dim into `n_head` heads; with `k=True` return the key layout."""
        head_features = x.size(-1) // self.n_head
        # in Tensorflow implem: fct split_states
        x = x.view(*x.shape[:-1], self.n_head, head_features)
        # keys:   (batch, head, head_features, seq_length)
        # others: (batch, head, seq_length, head_features)
        order = (0, 2, 3, 1) if k else (0, 2, 1, 3)
        return x.permute(*order)

    def forward(self, x, layer_past=None):
        """Attend over `x` (plus cached keys/values in `layer_past`, if given).

        Returns:
            A tuple `(a, present)`: the projected attention output, and the stacked
            keys/values of this call for use as a later `layer_past`.
        """
        qkv = self.c_attn(x)
        query, key, value = qkv.split(self.split_size, dim=2)
        query = self.split_heads(query)
        key = self.split_heads(key, k=True)
        value = self.split_heads(value)
        if layer_past is not None:
            # cached keys were transposed for stacking (see `present`); undo that here
            past_key = layer_past[0].transpose(-2, -1)
            past_value = layer_past[1]
            key = torch.cat((past_key, key), dim=-1)
            value = torch.cat((past_value, value), dim=-2)
        # transpose the keys so they have the same shape as the values and can be stacked
        present = torch.stack((key.transpose(-2, -1), value))
        attended = self.merge_heads(self._attn(query, key, value))
        return self.c_proj(attended), present