In [52]:
import tiktoken
import torch
import torch.nn as nn 
import math
import os
import numpy as np
from torch.nn import functional as F
from model import Encoder

# -------- CONSTANTS -------- # 
# Everything a reader might tune lives here; each name is assigned exactly once.
device = "cpu" # "cuda:0" if torch.cuda.is_available() else "cpu"

# Token ids
PADDING = 50257               # id used to right-pad sequences up to CTX
VOCAB_SIZE = PADDING + 1      # embedding table must cover the padding id as well (= 50258)

# Optimisation
# NOTE: LR was previously assigned twice (6e-4, then 3e-4); only the later value
# ever took effect, so that is the one kept here.
LR = 3e-4 # 6e-4
DROPOUT = 0.2
BATCH_SIZE = 14 # 64

# Model shape
HEADS = 8
NX = 8                        # number of stacked encoder blocks
CTX = 200 # 256
EMBED_DIM = 584

# Token streams, memory-mapped read-only as uint16
train_data = np.memmap(os.path.join('./data', 'train.bin'), dtype=np.uint16, mode='r')
val_data = np.memmap(os.path.join('./data', 'val.bin'), dtype=np.uint16, mode='r')
# --------------------------- # 
enc = tiktoken.get_encoding("gpt2")

In [53]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a sequence-first tensor, then apply dropout.

    The encoding table is precomputed once for ``max_len`` positions and stored
    as a (non-trainable) buffer named ``pe`` with shape ``[max_len, 1, d_model]``.
    Even feature columns hold sines and odd feature columns hold cosines.

    Arguments:
        d_model: size of the last (embedding) dimension; may be even or odd.
        dropout: dropout probability applied to the sum of input and encoding.
        max_len: number of positions for which encodings are precomputed.
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)  # [max_len, 1]
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        angles = position * div_term  # [max_len, ceil(d_model / 2)]

        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(angles)
        # There are only floor(d_model / 2) odd columns. For an odd d_model that is
        # one fewer than the number of frequencies, so trim before assigning
        # (for an even d_model the slice is a no-op).
        pe[:, 0, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``

        Returns:
            Tensor of the same shape as ``x``: ``x`` plus the encodings for the
            first ``seq_len`` positions (broadcast over the batch axis), after dropout.
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [54]:
# Encode a single prompt terminated by the end-of-text marker, then right-pad
# the token ids with PADDING until the sequence is CTX long.
train_example = "What is the worst customer service experience you have ever had? " + '<|endoftext|>'
print(train_example)
idx = enc.encode(train_example, allowed_special={"<|endoftext|>"})
idx.extend([PADDING] * (CTX - len(idx)))
print(idx)


What is the worst customer service experience you have ever had? <|endoftext|>
[2061, 318, 262, 5290, 6491, 2139, 1998, 345, 423, 1683, 550, 30, 220, 50256, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 502

In [55]:
# Make B x T tensor: the same padded example repeated BATCH_SIZE times.
test_batch = torch.tensor([idx] * BATCH_SIZE).to(device)
print(test_batch.shape)

torch.Size([14, 200])


In [56]:
# Make the mask based on padding in tensor: 1 for real tokens, 0 for PADDING.
# A single vectorized comparison replaces the per-element Python loop over tensor
# scalars; `.long()` keeps the int64 dtype the list-based construction produced.
mask = (test_batch != PADDING).long()
print(mask.shape)

torch.Size([14, 200])


In [57]:
class TestEncoder(nn.Module):
    """Token embedding + sinusoidal positions, followed by NX chained ``Encoder`` blocks.

    NOTE(review): the recorded run of this notebook fails when the model is
    called, with ``TypeError: layer_norm(): argument 'input' (position 1) must
    be Tensor, not tuple``. ``Encoder`` is imported from ``model`` and is not
    visible here -- confirm whether it accepts/returns an ``(x, mask)`` tuple,
    since ``nn.Sequential`` feeds each block's output straight into the next.
    """

    def __init__(self):
        super().__init__()

        self.token_embedding_table = nn.Embedding(VOCAB_SIZE, EMBED_DIM)
        self.position_embedding_table = PositionalEncoding(EMBED_DIM, DROPOUT, CTX).to(device)
        blocks = [Encoder() for _ in range(NX)]
        self.encoder = nn.Sequential(*blocks)

    def forward(self, x, targets=None):
        """Run ``x = (token_ids, mask)`` through the embeddings and the encoder stack.

        ``targets`` is accepted but not used.
        """
        tokens, mask = x[0], x[1]

        embedded = self.token_embedding_table(tokens)  # B, T, C
        # PositionalEncoding works sequence-first, so swap to T, B, C and back again.
        time_major = embedded.transpose(0, 1).to(device)  # T, B, C
        positioned = self.position_embedding_table(time_major)  # T, B, C
        batch_major = positioned.transpose(1, 0).to(device)  # B, T, C

        # Feed into encoder
        return self.encoder((batch_major, mask))  # -> B, T, C


In [58]:
# Build the encoder stack, place it on the configured device, and push the
# padded test batch (with its padding mask) through it.
model = TestEncoder()
model = model.to(device)
logits = model((test_batch, mask))

TypeError: layer_norm(): argument 'input' (position 1) must be Tensor, not tuple

In [59]:
def _encode_and_pad(texts):
    """Encode each text with a trailing <|endoftext|> and right-pad its ids with PADDING up to CTX."""
    padded = []
    for text in texts:
        ids = enc.encode(text + '<|endoftext|>', allowed_special={"<|endoftext|>"})
        padded.append(ids + [PADDING] * (CTX - len(ids)))
    return padded


# Toy supervised pairs: model inputs and the outputs they should map to.
source = _encode_and_pad([
    "this is our input",
    "this is what we feed into our model! ",
    "kill me now kill me now kill me now",
])
targets = _encode_and_pad([
    "this is our output",
    "this is what our model should aim to output! ",
    "924-534-1000",
])

In [60]:
def subsequent_mask(size):
    """Return a ``[1, size, size]`` boolean mask that hides future positions.

    Entry ``[0, i, j]`` is True when position ``i`` may attend to position ``j``,
    i.e. when ``j <= i`` (the diagonal and everything below it).
    """
    positions = torch.arange(size)
    # Row index (query) against column index (key): allowed iff key <= query.
    allowed = positions.unsqueeze(0) <= positions.unsqueeze(1)
    return allowed.unsqueeze(0)

class Batch:
    """Container for one training batch together with the masks derived from it.

    Attributes set for every batch:
        src:      the source tensor as given.
        src_mask: True where ``src`` differs from ``pad``, with an extra axis
                  inserted before the last dimension.

    Attributes set only when ``tgt`` is given:
        tgt:      ``tgt`` without its last column.
        tgt_y:    ``tgt`` without its first column.
        tgt_mask: padding mask of ``tgt`` combined with the subsequent-position mask.
        ntokens:  number of entries of ``tgt_y`` that differ from ``pad``.
    """

    def __init__(self, src, tgt=None, pad=2):  # 2 = <blank>
        self.src = src
        self.src_mask = (src != pad).unsqueeze(-2)
        if tgt is None:
            return

        shifted_in, shifted_out = tgt[:, :-1], tgt[:, 1:]
        self.tgt = shifted_in
        self.tgt_y = shifted_out
        self.tgt_mask = self.make_std_mask(shifted_in, pad)
        self.ntokens = (shifted_out != pad).sum()

    @staticmethod
    def make_std_mask(tgt, pad):
        """Build a mask that hides both padding entries and future positions."""
        not_pad = (tgt != pad).unsqueeze(-2)
        # type_as aligns the subsequent-position mask with the padding mask before AND-ing.
        no_peek = subsequent_mask(tgt.size(-1)).type_as(not_pad)
        return not_pad & no_peek
    
"""
idk if we need this 
"""

'\nidk if we need this \n'

In [71]:

def get_unsupervised_batch(split, n_pad=5):
    """
    Generates batch data of BATCH_SIZE of inputs x which is of CTX and targets y

    Each row is a random window of ``CTX - n_pad`` consecutive tokens followed by
    ``n_pad`` PADDING ids, so every row is exactly CTX long. ``y`` is the same
    window shifted one token to the right (padded the same way).

    Arguments:
        split: ``'train'`` reads from ``train_data``; any other value reads from ``val_data``.
        n_pad: number of trailing PADDING ids per row (default 5, the previously
            hard-coded value). Must satisfy ``0 <= n_pad < CTX``.

    Returns:
        ``(x, y)``: two int64 tensors of shape ``[BATCH_SIZE, CTX]`` on ``device``.

    Raises:
        ValueError: if ``n_pad`` is outside ``[0, CTX)``.
    """
    if not 0 <= n_pad < CTX:
        raise ValueError(f"n_pad must be in [0, {CTX}), got {n_pad}")

    data = train_data if split == 'train' else val_data
    span = CTX - n_pad
    # Same random draw as before; plain ints make the memmap slicing explicit.
    starts = torch.randint(len(data) - CTX, (BATCH_SIZE,), device=device).tolist()
    pad = torch.full((n_pad,), PADDING, dtype=torch.int64)

    def _window(start):
        # Cast the uint16 slice to int64 in NumPy, then wrap it; this avoids
        # building a Python list of scalars for every row.
        tokens = torch.from_numpy(data[start:start + span].astype(np.int64))
        return torch.cat([tokens, pad])

    x = torch.stack([_window(s) for s in starts]).to(device)
    y = torch.stack([_window(s + 1) for s in starts]).to(device)
    return x, y

In [79]:
# Draw one training batch and show the last row of inputs and targets as text,
# dropping the 5 trailing padding ids before decoding.
src, tar = get_unsupervised_batch('train')
print(*(enc.decode(rows[13, :-5].tolist()) for rows in (src, tar)), sep="\n")

 office, she spent 90 minutes on a transition-planning conference call with Washington officials.

"It's a bit overwhelming, I have to admit," said Ward, 53, who has spent her entire career in government, including nearly 27 years working for the State of California, most of it in disaster planning.

But, she said, "It's in your blood, and when you're asked, it's hard to say 'no.'"

Despite her experience, or perhaps because of it, Ward confesses that she is squeamish about riding the trains beneath the San Francisco Bay between Oakland and San Francisco.

"I'm claustrophobic, I'm a little nervous about being under the water if a big one should hit," she said.

Nor would she drive on the San Francisco-Oakland Bay Bridge for six years after the 1989 Loma Prieta earthquake, when part of the bridge collapsed.

Ward
