In [25]:
import torch
from torch.utils.data import Dataset, DataLoader, SequentialSampler
import pandas as pd
import numpy as np
import h5py 
import pickle
import os
import sys
import copy
import math
from torch.utils.data.dataloader import default_collate

# Print NumPy arrays in full: raising the summarization threshold to
# sys.maxsize stops long arrays from being abbreviated with "...".
np.set_printoptions(threshold=sys.maxsize)

In [9]:
from transformers import BertTokenizer, BertModel
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader, SequentialSampler
from torch.nn import functional as F
from torch import nn, optim

In [10]:
def clones(module, N):
    """
    Build an ``nn.ModuleList`` of N independent copies of ``module``.

    Every entry is produced with ``copy.deepcopy``, so the copies start with
    equal parameter values but do not share parameter storage with each other
    or with the module that was passed in.
    """
    copies = []
    for _ in range(N):
        copies.append(copy.deepcopy(module))
    return nn.ModuleList(copies)

In [11]:
class Generator(nn.Module):
    '''
    Output head: a linear projection followed by a log-softmax.

    d_model = size of the last dimension of the input
    vocab = size of the last dimension of the output
    '''
    def __init__(self, d_model, vocab):
        super().__init__()
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        "Project x to vocab size and return log-probabilities over the last axis."
        logits = self.proj(x)
        return F.log_softmax(logits, dim=-1)
    
    
class LayerNorm(nn.Module):
    '''
    Layer normalization over the last dimension of the input.

    Each slice along the last axis is shifted by its mean and divided by
    (its standard deviation + eps); the result is multiplied by the learnable
    gain a_2 and offset by the learnable bias b_2.

    features = shape handed to torch.ones / torch.zeros to create a_2 and b_2
        (an int or a tuple); it has to broadcast against x
    eps = constant added to the standard deviation so the division is safe
    x = tensor to normalize; statistics are taken over its last axis with
        x.mean / x.std at their default settings
    '''
    def __init__(self, features, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))

    def forward(self, x):
        centered = x - x.mean(-1, keepdim=True)
        spread = x.std(-1, keepdim=True) + self.eps
        return self.a_2 * centered / spread + self.b_2

    
class SublayerConnection(nn.Module):
    """
    Residual wrapper around an arbitrary sub-layer.

    The input is normalized first, handed to the sub-layer, passed through
    dropout, and then added back onto the un-normalized input (the norm comes
    before the sub-layer rather than after it, for code simplicity).

    size = forwarded to LayerNorm as its `features` argument
    dropout = dropout probability applied to the sub-layer output
    """
    def __init__(self, size, dropout=0.1):
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        "Return x plus the dropped-out result of sublayer applied to norm(x)."
        normed = self.norm(x)
        branch = self.dropout(sublayer(normed))
        return x + branch
    


In [13]:
def attention(query, key, value, mask=None, dropout=None):
    """
    Compute scaled dot-product attention.

    query, key, value = tensors whose last two dimensions are
        (positions, features); any leading dimensions (batch, heads, ...)
        are handled by torch.matmul broadcasting
    mask = optional tensor broadcastable to the score matrix; positions where
        mask == 0 are filled with -1e9 before the softmax so they receive
        (numerically) zero attention weight
    dropout = optional callable (e.g. nn.Dropout) applied to the attention
        weights

    Returns a tuple (weighted sum of value, attention weights).
    """
    d_k = query.size(-1)  # feature size of the queries, used for scaling

    # matmul rather than mm: mm only accepts 2-D inputs, while the multi-head
    # caller passes tensors with leading batch/head dimensions.
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)

    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)

    p_attn = F.softmax(scores, dim=-1)

    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn

class MultiHeadedAttention(nn.Module):
    """
    Multi-head attention built from four d_model -> d_model linear layers:
    one each for the query, key and value projections and one for the output.
    """
    def __init__(self, h, d_model, dropout = 0.1):
        '''
        h = number of parallel attention heads; must divide d_model
        d_model = feature size of the inputs and of the output
        dropout = dropout probability applied to the attention weights
        '''
        super().__init__()
        assert d_model % h == 0
        # Per-head feature size (keys and values use the same size).
        self.h = h
        self.d_k = d_model // h
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None  # attention weights from the most recent forward call
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        "Project the inputs, attend per head, then merge heads and project again."
        if mask is not None:
            # Insert a dimension at index 1 so the mask broadcasts over heads.
            mask = mask.unsqueeze(1)
        nbatches = query.size(0)

        def split_heads(linear, tensor):
            # (nbatches, positions, d_model) -> (nbatches, h, positions, d_k)
            projected = linear(tensor)
            return projected.view(nbatches, -1, self.h, self.d_k).transpose(1, 2)

        # 1) Linear projections; the first three layers serve query, key, value.
        query = split_heads(self.linears[0], query)
        key = split_heads(self.linears[1], key)
        value = split_heads(self.linears[2], value)

        # 2) Attention over all heads at once; keep the weights for inspection.
        heads, self.attn = attention(query, key, value, mask = mask, dropout = self.dropout)

        # 3) Merge the heads back into one feature axis and apply the last layer.
        merged = heads.transpose(1, 2).contiguous().view(nbatches, -1, self.h * self.d_k)
        return self.linears[-1](merged)
        

In [14]:
# position-wise feedforward network
'''
In addition to attention sub-layers, each of the layers in our encoder and decoder contains 
a fully connected feed-forward network, which is applied to each position separately and 
identically. 
This consists of two linear transformations with a ReLU activation in between.
'''
class PositionwiseFeedForward(nn.Module):
    """
    Feed-forward block: Linear(d_model -> d_ff), ReLU, dropout,
    Linear(d_ff -> d_model).
    """
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        "Apply the two linear layers with ReLU and dropout in between."
        return self.w_2(self.dropout(F.relu(self.w_1(x))))

In [15]:
# Embeddings and Softmax
'''
We use learned embeddings to convert the input tokens and output tokens to vectors of dimension
d_model. 
We also use the usual learned linear transformation and softmax function to convert the decoder 
output to predicted next-token probabilities. In our model, we share the same weight matrix 
between the two embedding layers and the pre-softmax linear transformation.
Note: make_model in this notebook builds these layers separately and does not tie their weights.
'''
class Embeddings(nn.Module):
    """
    Embedding lookup whose output is multiplied by sqrt(d_model).

    d_model = size of each embedding vector
    vocab = number of rows in the lookup table
    """
    def __init__(self, d_model, vocab):
        super().__init__()
        self.d_model = d_model
        self.lut = nn.Embedding(vocab, d_model)

    def forward(self, x):
        "Look up the rows indexed by x and scale them by sqrt(d_model)."
        scale = math.sqrt(self.d_model)
        return self.lut(x) * scale

In [16]:
# positional encoding
'''
We add "positional encodings" to the input embeddings at the bottoms of the encoder and decoder 
stacks. The positional encodings have the same dimension d_model as the embeddings, 
so that the two can be summed.
'''

class PositionalEncoding(nn.Module):
    """
    Add fixed sinusoidal position encodings to the input, then apply dropout.

    d_model = size of the last dimension of the input (odd values are supported)
    dropout = dropout probability applied after the encodings are added
    max_len = number of positions precomputed; inputs longer than this along
        dimension 1 cannot be encoded
    """
    def __init__(self, d_model, dropout, max_len = 5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p = dropout)

        # Precompute the table once; frequencies are built in log space.
        pe = torch.zeros(max_len, d_model)
        # Explicit float dtype so the products below do not depend on implicit
        # int -> float type promotion.
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float) *
                             -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one fewer odd column than even column, so
        # trim div_term to match; for even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        pe = pe.unsqueeze(0)  # leading axis of size 1 broadcasts over the batch
        # A buffer moves with the module (.to / state_dict) but is not a
        # parameter, so it is never trained.
        self.register_buffer('pe', pe)

    def forward(self, x):
        "Add the first x.size(1) encodings to x and apply dropout."
        # Buffers do not track gradients, so no Variable(...) wrapper is needed.
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

In [17]:
# make encoder, which is pre-trained bert    
class Encoder(nn.Module):
    '''
    Pre-trained BERT encoder ('bert-base-uncased', 12 transformer layers),
    used as a fixed feature extractor: no fine-tuning.

    forward() switches the module to eval mode and runs BERT under
    torch.no_grad(), so no gradients flow into the BERT weights.
    '''
    def __init__(self):
        super(Encoder, self).__init__()
        self.model = BertModel.from_pretrained('bert-base-uncased')

    def forward(self, x, segs, mask):
        '''
        x = input token ids
        segs = segment (token type) ids
        mask = attention mask

        Returns the last-layer hidden states.
        '''
        self.eval()
        with torch.no_grad():
            # Pass by keyword: in transformers' BertModel.forward the second
            # positional parameter is attention_mask, so a positional `segs`
            # would collide with the attention_mask keyword.
            outputs = self.model(x, token_type_ids=segs, attention_mask=mask)
        # Index 0 is the last hidden state whether the model returns a plain
        # tuple or a ModelOutput; tuple-unpacking a ModelOutput is not reliable.
        return outputs[0]
        

In [18]:
# make decoder 
class DecoderLayer(nn.Module):
    """
    One decoder layer: self-attention, attention over the encoder memory,
    then a feed-forward block, each wrapped in its own SublayerConnection.

    size = forwarded to SublayerConnection; also stored as self.size
    self_attn = module called as self_attn(x, x, x, tgt_mask)
    src_attn = module called as src_attn(x, memory, memory, src_mask)
    feed_forward = module called as feed_forward(x)
    dropout = dropout probability for the three SublayerConnection wrappers
    """
    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super().__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 3)

    def forward(self, x, memory, src_mask, tgt_mask):
        "Run the three sub-layers in order and return the result."
        def attend_self(h):
            return self.self_attn(h, h, h, tgt_mask)

        def attend_memory(h):
            return self.src_attn(h, memory, memory, src_mask)

        self_block, src_block, ff_block = self.sublayer
        x = self_block(x, attend_self)
        x = src_block(x, attend_memory)
        return ff_block(x, self.feed_forward)
    
class Decoder(nn.Module):
    """
    Stack of N copies of a decoder layer followed by a final LayerNorm.

    layer = prototype layer; it is deep-copied N times and must expose `.size`
    N = number of layers in the stack
    """
    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, memory, src_mask, tgt_mask):
        "Feed x through every layer in turn, then normalize the result."
        hidden = x
        for block in self.layers:
            hidden = block(hidden, memory, src_mask, tgt_mask)
        return self.norm(hidden)

In [19]:
# masking 
def subsequent_mask(size):
    """
    Build a boolean mask of shape (1, size, size) that hides later positions.

    Entry [0, i, j] is True when j <= i (position i may look at position j)
    and False when j > i.
    """
    # Ones strictly above the diagonal mark the positions to hide.
    hidden = np.triu(np.ones((1, size, size), dtype='uint8'), k=1)
    return torch.from_numpy(hidden) == 0

In [20]:
# put to an encoder decoder
class EncoderDecoder(nn.Module):
    """
    Container that wires an encoder, a decoder, their input embeddings and a
    generator together.

    forward() returns the decoder output; the generator is stored on the
    module but is not applied by forward().
    """
    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator

    def forward(self, src, tgt, src_mask, tgt_mask):
        "Encode src, then decode tgt against the resulting memory."
        memory = self.encode(src, src_mask)
        return self.decode(memory, src_mask, tgt, tgt_mask)

    def encode(self, src, src_mask):
        "Embed src and run the encoder on it."
        # NOTE(review): the encoder is called with two arguments here, while
        # the Encoder class in this notebook declares forward(x, segs, mask)
        # -- confirm which encoder interface is intended.
        embedded = self.src_embed(src)
        return self.encoder(embedded, src_mask)

    def decode(self, memory, src_mask, tgt, tgt_mask):
        "Embed tgt and run the decoder on it together with the encoder memory."
        embedded = self.tgt_embed(tgt)
        return self.decoder(embedded, memory, src_mask, tgt_mask)

In [27]:
def make_model(src_vocab, tgt_vocab, N=1, 
               d_model=512, d_ff=2048, h=8, dropout=0.1):
    """
    Helper: construct a model from hyperparameters.

    src_vocab = vocabulary size for the source embedding table
    tgt_vocab = vocabulary size for the target embedding table and generator
    N = number of decoder layers
    d_model = feature size used by the decoder, embeddings and generator
    d_ff = hidden size of the position-wise feed-forward block
    h = number of attention heads
    dropout = dropout probability for the feed-forward block, the positional
        encoding and the decoder sub-layer wrappers

    Returns an EncoderDecoder whose encoder is the pre-trained BERT Encoder.
    Only the newly created (non-BERT) weight matrices are Xavier-initialized.

    NOTE(review): the decoder's src_attn layers expect memory with d_model
    features; confirm this matches the hidden size of the BERT encoder output.
    """
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)

    encoder = Encoder()
    decoder_layer = DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout)
    decoder = Decoder(decoder_layer, N)

    model = EncoderDecoder(
        encoder,
        decoder,
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab))

    # Initialize parameters with Glorot / fan_avg -- but never the encoder:
    # re-initializing it would overwrite the pre-trained BERT weights with
    # random values.
    pretrained_ids = {id(p) for p in encoder.parameters()}
    for p in model.parameters():
        if p.dim() > 1 and id(p) not in pretrained_ids:
            # In-place variant; nn.init.xavier_uniform is the deprecated name.
            nn.init.xavier_uniform_(p)
    return model

In [28]:
# Smoke test: build a small model (src_vocab=10, tgt_vocab=10, N=2 decoder
# layers) to check that construction runs end to end.
tmp_model = make_model(10, 10, 2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Scratch check for LayerNorm's `features` argument: passing a tuple to
# torch.ones yields a tensor of that shape, and nn.Parameter wraps it with
# requires_grad=True (see the printed output).
features = (4,6)
ones = torch.ones(features)
param = nn.Parameter(torch.ones(features))

print(param)


Parameter containing:
tensor([[1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.]], requires_grad=True)


NameError: name 'l' is not defined