# My Transformer

I wrote this to help me understand _Attention Is All You Need_: https://arxiv.org/abs/1706.03762

I cut out the Encoder and I'm using it to generate English words

In [1]:
from collections import defaultdict, Counter
import multiprocessing.pool
from math import sqrt, sin, cos
import os
import random
import sys
import string
import time
import numpy as np

import torch
from torch import tensor
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchtext
import wandb
from torchtext.data import RawField, ReversibleField, LabelField
from torchtext.datasets import WikiText2
from torchtext.datasets.language_modeling import LanguageModelingDataset


## Setup
 

In [2]:
# Basic Config
conf = {
        'attn_heads': 8,
        'bptt_len': 40,
        #'cuda_device_ids': [3, 2, 1, 0],  # I need better GPU coolng first
        'cuda_device_ids': [1,0],
        'd_model': 512,
        #'datafile': './city_names.txt', # from: https://www.britannica.com/topic/list-of-cities-and-towns-in-the-United-States-2023068
        #'datafile': './corncob_lowercase.txt',  # from: http://www.mieliestronk.com/corncob_lowercase.txt
        #'datafile': './alphabet_short.txt',  
        #'datafile': './dummy_data.txt', 
        #'dataset': 'WikiText2',
        'dataset': 'WikiText103',
        'dropout': 0.1,
        'learning_rate': 0.0001,
        'epochs_per_loop': 1,
        'total_training_loops': 20,
        'num_blocks_encoder': 0,
        'num_blocks_decoder': 6,
        #'minibatch_size': 32 * 16,
        'minibatch_size': 20,
        'optimizer': 'Adam',  
        #'optimizer': 'SGD',
        'random_seed': 0,
        #'warmup_steps': 50,
        }


# debugging
#conf['attn_heads'] = 1
#conf['d_model'] = 1
#conf['bptt_len'] = 2
#conf['datafile'] = './dummy_data.txt'  
#conf['num_blocks_decoder'] = 1
#conf['minibatch_size'] = 1
#conf['epochs_per_loop'] = 1


# Make sure d_model, heads, and d_key are compatible
assert conf['d_model'] % conf['attn_heads'] == 0, \
    f'attn_heads=%s does not evenly divide d_model=%s' % (conf['attn_heads'], 
                                                         conf['d_model'])
conf['d_key'] = conf['d_model'] / conf['attn_heads']

# Set up the RNGs for repeatability
if conf['random_seed']:
    torch.manual_seed(conf['random_seed'])
    torch.cuda.manual_seed(conf['random_seed'])
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(conf['random_seed'])
    
    
# Set up Cuda
print("Using", len(conf['cuda_device_ids']), "GPU(s):")
for i in conf['cuda_device_ids']:
    print("    cuda:%s:" % i, torch.cuda.get_device_name(i))

device = torch.device('cuda:' + str(conf['cuda_device_ids'][0]))

print()

# I use this bare FIXME:
bptt_len = conf['bptt_len']

# Logging
wandb = None
#wandb.init(project="official-transformer", config=conf)

Using 2 GPU(s):
    cuda:1: GeForce RTX 2080 Ti
    cuda:0: GeForce RTX 2080 Ti



In [3]:
class EmbeddingTransformer(nn.Module):
    def __init__(self, 
                 vocab, 
                 d_model=conf['d_model'], 
                 nhead=conf['attn_heads'], 
                 num_encoder_layers=conf['num_blocks_encoder'],
                 num_decoder_layers=conf['num_blocks_decoder'], 
                 dropout=conf['dropout']):
        super().__init__()
        
        self.vocab = vocab

        self.embedding = nn.Embedding(num_embeddings=len(vocab), 
                                      embedding_dim=d_model, 
                                      padding_idx=vocab.stoi['<pad>'])

        pe = self._position_encoding(bptt_len=conf['bptt_len'], d_model=d_model)
        self.register_buffer('position_encoding', pe)
        
        self.transformer = nn.Transformer(d_model=d_model, 
                                          nhead=nhead, 
                                          num_encoder_layers=num_encoder_layers,
                                          num_decoder_layers=num_decoder_layers, 
                                          dropout=dropout)

        self.linear = nn.Linear(d_model,len(vocab))

        tgt_mask = self.transformer.generate_square_subsequent_mask(sz=conf['bptt_len'])
        self.register_buffer('tgt_mask', tgt_mask)

    def _position_encoding(self, bptt_len, d_model):
        cols = [tensor([sin(pos/(10000**(i/d_model))) 
                        if i % 2 == 0 
                        else 
                        cos(pos/(10000**((i-1)/d_model))) 
                        for i in range(d_model)])
                for pos in range(bptt_len)]
        stack = torch.stack(cols, dim=0).unsqueeze(1)
        return stack

    def forward(self, src, tgt):
        src = self.embedding(src) + self.position_encoding[:src.shape[0],:,:]
        tgt = self.embedding(tgt) + self.position_encoding[:tgt.shape[0],:,:]
        #print('src.size(2)=', src.size(2))
        tgt_mask = self.tgt_mask[:tgt.shape[0],:tgt.shape[0]]
        #print('tgt=', tgt)
        #print('tgt_mask=', tgt_mask)
        transformed = self.transformer(src=src, tgt=tgt, tgt_mask=tgt_mask)
        out = self.linear(transformed)
        
        return out

## Load Data and Build the Model

In [4]:
# dataloader and vocab
#train_ds = load_dataset()
dataloader = getattr(torchtext.datasets, conf['dataset'])
train_ds, val_ds, test_ds = dataloader.iters(batch_size=conf['minibatch_size'], 
                                             bptt_len=2 * conf['bptt_len'])
vocab = train_ds.dataset.fields['text'].vocab

pad_token = '_'
pad_index = vocab.stoi[pad_token]

The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.
The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.
The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.


In [5]:
#os.environ['CUDA_LAUNCH_BLOCKING']='1'

In [6]:
# Create the model
model = EmbeddingTransformer(vocab=vocab,
                                d_model=conf['d_model'], 
                                nhead=conf['attn_heads'], 
                                num_encoder_layers=conf['num_blocks_encoder'],
                                num_decoder_layers=conf['num_blocks_decoder'], 
                                dropout=conf['dropout'])
#model = nn.DataParallel(model, device_ids=conf['cuda_device_ids'])
model = model.to(device)

# Define the Loss
#criterion = nn.CrossEntropyLoss(ignore_index=pad_index)
criterion = nn.CrossEntropyLoss()

## Training Helper Functions

In [13]:
def get_minibatches(dataset=train_ds):
    bptt_len = conf['bptt_len']
    for batch in dataset:
        #print('batch:', batch)
        if batch.text.shape[0] < bptt_len + 1:
            continue
        eo = batch.text[:bptt_len, :]
        di = batch.text[bptt_len:, :]
        y = batch.target[bptt_len:, :]
        yield eo, di, y
        
def accuracy(output, expected_indices):
    indices = torch.max(output, dim=-1)[1]
    indices = indices.squeeze()
    acc = (indices == expected_indices) / float(indices.numel())
    acc = float(acc.sum())
    return acc

def run_minibatch(eo, di, y, optimizer):
    """Runs one minibatch training and returns the loss and accuracy for that minibatch"""
    optimizer.zero_grad()
    eo, di, y = eo.to(device), di.to(device), y.to(device)
    y_pred = model(src=eo, tgt=di)
    acc = accuracy(y_pred, y)
    y_pred = y_pred.transpose(-2, -1)
    loss = criterion(y_pred, y)
    loss.backward()  # Not sure why, but this step logs a UserWarning
    optimizer.step()
    return loss.item(), acc

def test_set_accuracy(model):
    with torch.no_grad():
        eval_model = model.eval()
        accuracies = []
        minibatches = 0
        for eo, di, y in get_minibatches(test_ds):
            eo, di, y = eo.to(device), di.to(device), y.to(device)
            y_pred = model(src=eo, tgt=di)
            accuracies.append(accuracy(y_pred, y))
            minibatches += 1
    acc = 100 * tensor(accuracies, device=device).float().mean().item()
    return acc
            
def do_epoch(epoch, optimizer, model, bptt_len=conf['bptt_len']):
    """Runs one full training batch and returns the average loss,
    accuracy, and duration time in seconds"""
    model = model.train()
    t0 = time.time()
    losses = []
    train_accuracies = []
    for eo, di, y in get_minibatches():
        #print('eo.shape:', encoder_out.shape, 'di.shape', decoder_in.shape, 'y.shape:', y.shape)
        loss, train_acc = run_minibatch(eo, di, y, optimizer) 
        losses.append(loss)
        train_accuracies.append(train_acc)
    #losses = [run_minibatch(*args) for args in get_minibatches(train_ds)]
    tf = time.time()
    if losses:
        avg_loss = tensor(losses, device=device).float().mean().item()
        avg_train_accuracy = 100 * tensor(train_accuracies, device=device).float().mean().item()
    else:
        avg_loss = 0
        avg_train_accuracy = 0
    avg_test_accuracy = test_set_accuracy(model)
    return (avg_loss, avg_train_accuracy, avg_test_accuracy, tf-t0)

def train(optimizer, num_epochs=conf['epochs_per_loop'], start_epoch=0, model=model, 
          vocab=vocab, criterion=criterion):
    """Runs num_epochs training batches and prints out results"""
    for epoch in range(start_epoch, start_epoch+num_epochs):
        loss, train_accuracy, test_accuracy, seconds = do_epoch(epoch, optimizer, model)
        if wandb:
            wandb.log({'epoch': epoch,
                       'loss': loss,
                       'train_accuracy': train_accuracy,
                       'test_accuracy': test_accuracy,
                       'seconds': seconds})
        print('epoch:', epoch, '(%.1fs)' % seconds, 'loss=%f' % loss, 'train_accuracy=%.1f%%' % (train_accuracy), 'test_accuracy=%.1f%%' % (test_accuracy))   
    return epoch + 1

## Train the Model

In [8]:
# Define the Optimizer
#optimizer_class = getattr(torch.optim, conf['optimizer']) 
#lr = conf['learning_rate']
#optimizer = optimizer_class(model.parameters(), lr=lr)

In [9]:
epoch = 1

In [10]:
if 'learning_rate' in conf:
    lr = conf['learning_rate']
    optimizer = torch.optim.Adam(model.parameters(), lr=conf['learning_rate'])

for _ in range(conf['total_training_loops']):
    if 'warmup_steps' in conf:
        warmup_steps = conf['warmup_steps']
        lr = (conf['d_model']**-.5) * min(epoch**-.5, epoch * (warmup_steps**-1.5))
        optimizer = torch.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-9, lr=lr)
    print('lr = %.6f' % lr)
    epoch = train(optimizer=optimizer, num_epochs=conf['epochs_per_loop'], start_epoch=epoch)
    save_file_name = './official-transformer_%s_%s-layer_%s-epochs.pt' % (conf['dataset'], conf['num_blocks_decoder'], epoch-1)
    if hasattr(model, 'module'):
        torch.save(model.module.state_dict(), save_file_name)
    else:
        torch.save(model.state_dict(), save_file_name)
        

lr = 0.000100


RuntimeError: Expected object of device type cuda but got device type cpu for argument #3 'index' in call to _th_index_select

In [14]:
test_set_accuracy(model)

27.8202623128891

In [23]:
save_file_name = './official-transformer_%s_%s-layer_%s-epochs.pt' % (conf['dataset'], conf['num_blocks_decoder'], epoch-1)
torch.save(model.state_dict(), save_file_name)


In [27]:
if 'learning_rate' in conf:
    lr = conf['learning_rate']
    optimizer = torch.optim.Adam(model.parameters(), lr=conf['learning_rate'])

for _ in range(conf['total_training_loops']):
    if 'warmup_steps' in conf:
        warmup_steps = conf['warmup_steps']
        lr = (conf['d_model']**-.5) * min(epoch**-.5, epoch * (warmup_steps**-1.5))
        optimizer = torch.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-9, lr=lr)
    print('lr = %.6f' % lr)
    epoch = train(optimizer=optimizer, num_epochs=conf['epochs_per_loop'], start_epoch=epoch)
    save_file_name = './official-transformer_%s_%s-layer_%s-epochs.pt' % (conf['dataset'], conf['num_blocks_decoder'], epoch-1)
    if hasattr(model, 'module'):
        torch.save(model.module.state_dict(), save_file_name)
    else:
        torch.save(model.state_dict(), save_file_name)
        

lr = 0.000100


RuntimeError: CUDA out of memory. Tried to allocate 818.00 MiB (GPU 1; 10.76 GiB total capacity; 8.64 GiB already allocated; 18.25 MiB free; 1.57 GiB cached)

In [None]:
#torch.save(model.module.state_dict(), './my-transformer-wikitext-2-test-21.4_pct.pt')

## Sampling Helper Functions

In [16]:
unk_token = '<unk>'
pad_token = '<pad>'
eos_token = '<eos>'

In [17]:
def numericalize(tokens):
    """Takse a string and returns a tensor of vocab indices for the tokens"""
    indices = list([vocab.stoi[t] for t in tokens])
    return torch.tensor(indices).to(device).unsqueeze(1)

def tokenize(indices):
    "Takes a tensor of token indices and returns a string"
    tokens = [vocab.itos[i] for i in indices.squeeze()]
    return ' '.join(tokens)

def get_next_token(encoder_out, decoder_in, pos, model=model, deterministic=False):
    """Runs one step of auto-regression, returning the output token for
    position `pos`."""
    
    decoder_out = model(src=encoder_out, tgt=decoder_in)
    
    if deterministic:
        _, indices = torch.max(decoder_out, dim=-1)
    else:
        probs = nn.functional.softmax(decoder_out.float(), dim=-1)
        m = torch.distributions.multinomial.Multinomial(probs=probs)
        _, indices = torch.max(m.sample(), dim=-1)

    next_index = int(indices[pos,0])
    return next_index, vocab.itos[next_index]

def sample(prompt, deterministic=False, vocab=vocab, prnt=True):
    """Auto-regresses using prompt to create the encoder_out tensor"""
    bptt_len = conf['bptt_len']
    prompt_tokens = prompt.split()
    assert len(prompt_tokens) == bptt_len + 1, 'Prompt strings must be %s tokens long' % bptt_len + 1   
    with torch.no_grad():
        eval_model = model.eval()

        eo = numericalize(prompt_tokens[:bptt_len])
        di = numericalize(prompt_tokens[bptt_len:])
        out = []

        next_token = None
        next_index = None
        for pos in range(bptt_len):
            next_index, next_token = get_next_token(eo, di, pos=pos, model=eval_model, deterministic=deterministic)
            if next_token in (eos_token, pad_token):
                break
            if next_token is not None:
                out.append(next_token)
                if pos+1 < bptt_len:
                    di = torch.cat((di, numericalize([next_token])), dim=0)
        
    out = ' '.join(out)
    if prnt:
        print(prompt + '\n --> \n' + out)
    return out

In [18]:
def forced_sample(prompt, expected_output, deterministic=False, vocab=vocab, prnt=True):
    """Auto-regresses using prompt to create the encoder_out tensor"""
    bptt_len = conf['bptt_len']
    prompt_tokens = prompt.split()
    expected_out_tokens = expected_out.split()
    assert len(prompt_tokens) == bptt_len, 'Prompt strings must be %s tokens long' % bptt_len    
    with torch.no_grad():
        eval_model = model.eval()

        eo = numericalize(prompt_tokens)
        di = numericalize(expected_out_tokens)
        out = []
        #print('eo = ', eo)
        #print('eo.shape = ', eo.shape)
        #print('di = ', di)
        #print('di.shape = ', di.shape)

        next_token = None
        next_index = None
        for pos in range(bptt_len):
            print('di=', di)
            next_index, next_token = get_next_token(eo, di, pos=pos, model=eval_model, deterministic=deterministic)
            print('pos=', pos, 'next_index=', next_index, 'next_token=next_token')
            if next_token in (eos_token, pad_token):
                break
        
    out = ' '.join(out)
    if prnt:
        print(prompt + '\n --> \n' + out)
    return out

## Sample the model

In [None]:

test_set_accuracy(model)

In [None]:
for eo, di, y in get_minibatches(dataset=test_ds):
    if eo.shape[0] != 40:
        continue
    prompt_indices = torch.cat((eo[:,0], di[:1,0]), dim=0).unsqueeze(1)
    #print(prompt_indices)
    prompt = tokenize(prompt_indices)
    #print(prompt)

    print("========================")
    sample(prompt, deterministic=False)

In [24]:
prompt = """Born in Omaha , Nebraska , Malcolm X spent his teenage years living in a series of foster homes after his father 's death and his mother 's hospitalization . He engaged in several illicit activities there , eventually being sentenced"""

for _ in range(5):
    print("========================")
    sample(prompt, deterministic=False)

Born in Omaha , Nebraska , Malcolm X spent his teenage years living in a series of foster homes after his father 's death and his mother 's hospitalization . He engaged in several illicit activities there , eventually being sentenced
 --> 
to prison in 1909 .
Born in Omaha , Nebraska , Malcolm X spent his teenage years living in a series of foster homes after his father 's death and his mother 's hospitalization . He engaged in several illicit activities there , eventually being sentenced
 --> 
to time with her time to death . Moore continued to live in Atlanta with smallpox but her final appearance in the first week of eating . At age 18 , Hindley , a family of sons . At prison
Born in Omaha , Nebraska , Malcolm X spent his teenage years living in a series of foster homes after his father 's death and his mother 's hospitalization . He engaged in several illicit activities there , eventually being sentenced
 --> 
to money , was easier to breakdown .
Born in Omaha , Nebraska , Malcolm 

In [None]:
prompt = """Born in Omaha , Nebraska , Malcolm X spent his teenage years living in a series of foster homes after his father 's death and his mother 's hospitalization . He engaged in several illicit activities there , eventually being"""
expected_out="""sentenced to 10 years in prison in 1946 for larceny and breaking and entering . In prison , he joined the Nation of Islam , adopted the name Malcolm X , and quickly became one of the organization 's most"""
#forced_sample(prompt, expected_out, deterministic=True)

In [26]:
prompt = """There was the problematization of madness and illness arising out of social and medical practices , and defining a certain pattern of “ normalization “ ; a problematization of life , language , and labor in discursive practices that conformed to"""

for _ in range(5):
    print("========================")
    sample(prompt)

There was the problematization of madness and illness arising out of social and medical practices , and defining a certain pattern of “ normalization “ ; a problematization of life , language , and labor in discursive practices that conformed to
 --> 
balance others . In the last year firsthand was the main visitor of the nice and owe him directly with the investments . The cross was handed , and in the Stafford burial to the Conor elixirs the strong level
There was the problematization of madness and illness arising out of social and medical practices , and defining a certain pattern of “ normalization “ ; a problematization of life , language , and labor in discursive practices that conformed to
 --> 
be portrayed as well as Upshaw to get Abby Professor to prepare the photographers . Affleck 's role was preparing until the first Russian director was the authors inflammation of spectators and participated in the history of a commitment @-@
There was the problematization of madness and

In [None]:
batches = list(get_minibatches(test_ds))

In [None]:
batches[0]

In [None]:
eo=minibatches[0][0][:,0]
di=minibatches[0][1][:,0]
y=minibatches[0][2][:,0]
print('eo=', ''.join(tokenize(eo)), 'eo.shape=', eo.shape)
print('di=', ''.join(tokenize(di)), 'di.shape=', di.shape)
print('y=', ''.join(tokenize(y)), 'y.shape=', y.shape)


In [None]:
eval_model = model.eval()
with torch.no_grad():
    out = eval_model(src=eo.unsqueeze(1), tgt=di.unsqueeze(1))
    print(out.shape)
    indices = torch.max(out, dim=-1)[1]
    print(tokenize(indices))
    

In [None]:
minibatches=batches

In [None]:
bb = list(test_ds)

In [None]:
batch = bb[0]
print(batch.text)
print(batch.target)
bptt_len = conf['bptt_len']
eo = batch.text[:bptt_len, :]
di = batch.text[bptt_len:, :]
y = batch.target[bptt_len:, :]
print('eo=', eo)
print('di=', di)
print('y=', y)
print('eo_t=', ''.join(tokenize(eo[:,0])))
print('di_t=', ''.join(tokenize(di[:,0])))
print('y_t=', ''.join(tokenize(y[:,0])))

from torch.autograd.gradcheck import zero_gradients


def compute_jacobian(inputs, output):
    """
    :param inputs: Batch X Size (e.g. Depth X Width X Height)
    :param output: Batch X Classes
    :return: jacobian: Batch X Classes X Size
    """
    assert inputs.requires_grad

    num_classes = output.size()[0]

    jacobian = torch.zeros(num_classes, *inputs.size())
    grad_output = torch.zeros(*output.size())
    if inputs.is_cuda:
        grad_output = grad_output.to(inputs.device)
        jacobian = jacobian.to(inputs.device)

    for i in range(num_classes):
        zero_gradients(inputs)
        grad_output.zero_()
        grad_output[i, :, :] = 1
        output.backward(grad_output, retain_graph=True)
        jacobian[i] = inputs.grad.data

    jacobian = jacobian.squeeze()
    jacobian = torch.transpose(jacobian, dim0=0, dim1=1)

    return jacobian

bptt_len = conf['bptt_len']
#model = model.train()
#model = model.train()


size = (2, 1)
#print('size=', size)
eo = torch.ones(size).long()
di = torch.tensor([[1],[2]])
assert di.shape == size

class Nonsense(EmbeddingTransformer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.module = nn.Module()
        

cpu_model = EmbeddingTransformer(vocab=vocab,
                                d_model=conf['d_model'], 
                                nhead=conf['attn_heads'], 
                                num_encoder_layers=conf['num_blocks_encoder'],
                                num_decoder_layers=conf['num_blocks_decoder'], 
                                dropout=conf['dropout'])

#path=~/src/openai_scholars_program/sequence_models
d = torch.load('./91.9_Pct/official-transformer_WikiText2_6-layer_9-epochs.pt', map_location=torch.device('cpu'))
cpu_model.load_state_dict(d)

src=eo
tgt=di
self = cpu_model

print('tgt.shape=', tgt.shape)

src = self.embedding(src) # + self.position_encoding[:src.shape[0],:,:]
tgt = self.embedding(tgt) # + self.position_encoding[:tgt.shape[0],:,:]
#print('src.size(2)=', src.size(2))
tgt_mask = self.tgt_mask[:tgt.shape[0],:tgt.shape[0]]
print('tgt=', tgt)
print('tgt_mask=', tgt_mask)
transformed = self.transformer(src=src, tgt=tgt, tgt_mask=tgt_mask)
out = self.linear(transformed)
print('out.shape=', out.shape)

print('out:', out)


j = compute_jacobian(tgt, out)
print()
print('squeezed jacobian:')
print(j)


In [None]:
self.position_encoding

In [None]:
d