<a href="https://colab.research.google.com/github/jamadri/Transformers/blob/main/model_utils_and_training_drive.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Utils and Model

## Utils

### addAndNorm.py

In [1]:
"""Residual connection."""


def addAndNorm(x, blockOutput, norm):
    """Residual connection."""
    return norm(x + blockOutput)


### PositionalEncoding.py

In [2]:
"""Positional Encoding."""
import math
import torch


def positionalEncoding(x, dim_model):
    """Positional Encoding."""
    def sineOrCosine(i):
        """sin(alpha+pi/2) = cos(alpha)."""
        return math.pi/2*(i % 2 == 1)
    values = [
        [sineOrCosine(i) + pos/math.pow(10000, 2*(i//2)/dim_model) for
         i in range(dim_model)]
        for pos in range(x.shape[-2])  # seq_length
    ]
    return torch.sin(torch.tensor(values)).unsqueeze(0)

## Data

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
!ls /content/drive/MyDrive/Transformers/data

tiny_shakespeare.txt


In [5]:
tiny_shakespeare = open('/content/drive/MyDrive/Transformers/data/tiny_shakespeare.txt',
            'rb').read().decode(encoding='utf-8')

## Model

In [6]:
from torch import nn
import torch

class Transformer(nn.Module):
    """Main transformer block.

    Inputs: - model_parameters: ordered dictionary of key-values describing the
    layer parameters of the model:
      - dim_model: dimension of the model.
      - layers: dictionary of key-values describing specific layers
        - <layer_name>: dictionary of parameters for the specific multi-head
          layer
          - attention: dictionary of parameters for the specific attention
            function
            - dim_key: dimension of the key and query.
            - dim_value: dimension of the value.
          - nb_head: number of heads.

    """

    def __init__(self, model_parameters):
        """Initialize parameters."""
        super().__init__()
        self.dim_model = model_parameters["dim_model"]
        self.encoder = Encoder(model_parameters["encoder"])
        self.decoder = Decoder(model_parameters["decoder"])
        self.embedding = Embedding(model_parameters)
        self.toProba = nn.Sequential(
            nn.Linear(self.dim_model,
                      model_parameters["vocabulary_size"]),
            nn.Softmax()
        )
        self.dropout = nn.Dropout(0.1)

    def forward(self, x, lastOutput):
        """Apply a step forward."""
        encoderInput = self.embedding(x) + positionalEncoding(
            x, self.dim_model)
        decoderInput = self.embedding(lastOutput) + positionalEncoding(
            lastOutput, self.dim_model)
        encoderInput = self.dropout(encoderInput)
        decoderInput = self.dropout(decoderInput)
        encoderOutput = self.encoder(encoderInput)
        decoderOutput = self.decoder(decoderInput, encoderOutput)
        lastOutput = self.toProba(decoderOutput)
        return lastOutput


class Encoder(nn.Module):
    """Encoder."""

    def __init__(self, encoderConfig):
        """Initialize."""
        super().__init__()
        self.nb_layers = encoderConfig["nb_layers"]
        self.dim_model = encoderConfig["dim_model"]
        self.dim_feedforward = encoderConfig["feedforward"]["dim_feedforward"]
        self.norm = nn.LayerNorm(normalized_shape=self.dim_model,
                                 elementwise_affine=True, bias=True)
        self.multiheads = [MultiHeadAttention(encoderConfig["multihead"],
                                              masked=False)
                           for i in range(self.nb_layers)]
        self.feedforwards = [nn.Sequential(nn.Linear(self.dim_model,
                                                     self.dim_feedforward),
                                           nn.ReLU(),
                                           nn.Linear(self.dim_feedforward,
                                                     self.dim_model))
                             for i in range(self.nb_layers)]
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        """Forward."""
        for i in range(self.nb_layers):
            h1 = addAndNorm(x, self.dropout(self.multiheads[i](x, x,
                                                               x)),
                            self.norm)
            x = addAndNorm(h1, self.dropout(self.feedforwards[i](h1)),
                           self.norm)
        return x


class Decoder(nn.Module):
    """Decoder."""

    def __init__(self, decoderConfig):
        """Initialize."""
        super().__init__()
        self.dim_model = decoderConfig["dim_model"]
        self.norm = nn.LayerNorm(normalized_shape=self.dim_model)
        self.dim_feedforward = decoderConfig["feedforward"]["dim_feedforward"]
        self.nb_layers = decoderConfig["nb_layers"]
        self.layer = []
        self.multiheads1 = [MultiHeadAttention(decoderConfig["multihead"],
                                               masked=True)
                            for i in range(self.nb_layers)]
        self.multiheads2 = [MultiHeadAttention(decoderConfig["multihead"],
                                               masked=False)
                            for i in range(self.nb_layers)]
        self.feedforwards = [nn.Sequential(nn.Linear(self.dim_model,
                                                     self.dim_feedforward),
                                           nn.ReLU(),
                                           nn.Linear(self.dim_feedforward,
                                                     self.dim_model))
                             for i in range(self.nb_layers)]

    def forward(self, decoderInput, encoderOutput):
        """Forward."""
        for i in range(self.nb_layers):
            h1 = addAndNorm(decoderInput,
                            self.multiheads1[i](decoderInput,
                                                decoderInput,
                                                decoderInput),
                            self.norm)
            h2 = addAndNorm(h1,
                            self.multiheads2[i](h1,
                                                encoderOutput,
                                                encoderOutput),
                            self.norm)
            lastOutput = addAndNorm(h2,
                                    self.feedforwards[i](h2),
                                    self.norm)
        return lastOutput


class LonelyDecoder(nn.Module):
    """A lonely decoder."""

    def __init__(self, model_parameters):
        """Initialize."""
        super().__init__()
        self.decoderConfig = model_parameters["decoder"]
        self.dim_model = self.decoderConfig["dim_model"]
        self.norm = nn.LayerNorm(normalized_shape=self.dim_model)
        self.dim_feedforward = self.decoderConfig["feedforward"]["dim_feedforward"]
        self.nb_layers = self.decoderConfig["nb_layers"]
        self.embedding = Embedding(model_parameters)
        self.multiheads1 = nn.ModuleList([MultiHeadAttention(self.decoderConfig["multihead"],
                                               masked=True)
                            for i in range(self.nb_layers)])
        #self.multiheads2 = nn.ModuleList([MultiHeadAttention(self.decoderConfig["multihead"],
        #                                       masked=False)
        #                    for i in range(self.nb_layers)])
        self.feedforwards = nn.ModuleList([nn.Sequential(nn.Linear(self.dim_model,
                                                     self.dim_feedforward),
                                           nn.ReLU(),
                                           nn.Linear(self.dim_feedforward,
                                                     self.dim_model))
                             for i in range(self.nb_layers)])
        self.toLogit = nn.Linear(self.dim_model,
                                 model_parameters["vocabulary_size"])
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        """Forward."""
        x = self.embedding(x) + positionalEncoding(
            x, self.dim_model)
        x = self.dropout(x)
        for i in range(self.nb_layers):
            h1 = addAndNorm(x,
                            self.dropout(self.multiheads1[i](x, x, x)),
                            self.norm)
            #h2 = addAndNorm(h1,
            #                self.dropout(self.multiheads2[i](h1, h1, h1)),
            #                self.norm)
            layerOutput = addAndNorm(h1,
                                     self.dropout(self.feedforwards[i](h1)),
                                     self.norm)
        finalOutput = self.toLogit(layerOutput)
        return finalOutput


class ScaledDotProductAttention(nn.Module):
    """Scaled Dot-Product Attention."""

    def __init__(self, dim_model, masked=False):
        """Initialize.

        Inputs:
        - dim_model: model dimension
        - masked: prevents tokens to attend to the following ones.
        """
        super().__init__()
        self.dim_model = dim_model
        self.masked = masked
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, Q, K, V):
        """Forward.

        Inputs:
        - Q: query
        - K: key
        - V: value
        """
        matmul_0 = torch.matmul(Q, K.transpose(-2, -1))
        scaled = torch.divide(matmul_0, torch.Tensor([self.dim_model]))
        if self.masked:
            mask = torch.ones(scaled.shape)
            mask = mask - torch.tril(mask)*mask
            mask = torch.where(mask == 1, float('-inf'), 0)
            scaled = scaled + mask
        softmaxed = self.softmax(scaled)
        sdpa = torch.matmul(softmaxed, V)
        return sdpa


class MultiHeadAttention(nn.Module):
    """Multi-Head Attention.

    Inputs:
    - multi_head_config: dictionary
    """

    def __init__(self, multi_head_config, masked=False):
        """Initialize multi-head."""
        super().__init__()
        self.dim_key = multi_head_config["attention"]["dim_key"]
        self.dim_value = multi_head_config["attention"]["dim_value"]
        self.nb_heads = multi_head_config["nb_heads"]
        self.dim_model = self.dim_key * self.nb_heads

        self.WQs = nn.ModuleList([nn.Linear(self.dim_model, self.dim_key)
                    for i in range(self.nb_heads)])
        self.WKs = nn.ModuleList([nn.Linear(self.dim_model, self.dim_key)
                    for i in range(self.nb_heads)])
        self.WVs = nn.ModuleList([nn.Linear(self.dim_model, self.dim_value)
                    for i in range(self.nb_heads)])
        self.spda = ScaledDotProductAttention(self.dim_model, masked)

    def forward(self, Q, K, V):
        """One step of the multi-head block."""
        heads = [self.spda(self.WQs[i](Q),
                           self.WKs[i](K),
                           self.WVs[i](V))
                 for i in range(self.nb_heads)]
        return torch.cat([head for head in heads], -1)


class Embedding(nn.Module):
    """Embedding."""

    def __init__(self, model_parameters):
        """Initialize embedding."""
        super().__init__()
        self.embedding = nn.Linear(model_parameters["vocabulary_size"],
                                   model_parameters["dim_model"])

    def forward(self, x):
        """Forward step."""
        return self.embedding(x)

model_parameters = {
    "dim_model": 256,
    "vocabulary_size": 67,
    "batch_size": 64,
    "encoder": {
        "nb_layers": 1,
        "dim_model": 256,
        "multihead": {
            "attention": {
                "dim_model": 256,
                "dim_key": 128,
                "dim_value": 128
                },
            "nb_heads": 2
            },
        "feedforward": {
            "dim_feedforward": 256
            }
        },
    "decoder": {
        "nb_layers": 1,
        "vocabulary_size": 67,
        "dim_model": 256,
        "multihead": {
            "attention": {
                "dim_model": 256,
                "dim_key": 64,
                "dim_value": 64,
            },
            "nb_heads": 4
        },
        "feedforward": {
            "dim_feedforward": 256
        }
    }
}
decoder = LonelyDecoder(model_parameters)
print(decoder)

LonelyDecoder(
  (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  (embedding): Embedding(
    (embedding): Linear(in_features=67, out_features=256, bias=True)
  )
  (multiheads1): ModuleList(
    (0): MultiHeadAttention(
      (WQs): ModuleList(
        (0-3): 4 x Linear(in_features=256, out_features=64, bias=True)
      )
      (WKs): ModuleList(
        (0-3): 4 x Linear(in_features=256, out_features=64, bias=True)
      )
      (WVs): ModuleList(
        (0-3): 4 x Linear(in_features=256, out_features=64, bias=True)
      )
      (spda): ScaledDotProductAttention(
        (softmax): Softmax(dim=-1)
      )
    )
  )
  (feedforwards): ModuleList(
    (0): Sequential(
      (0): Linear(in_features=256, out_features=256, bias=True)
      (1): ReLU()
      (2): Linear(in_features=256, out_features=256, bias=True)
    )
  )
  (toLogit): Linear(in_features=256, out_features=67, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
)


# Training

In [7]:
# """Training."""
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
# # import Transformers.model.transformer.Transformer

text = tiny_shakespeare
print('Length of text: {} characters'.format(len(text)))
print(text[:250])

# unique characters in the file
vocab = sorted(set(text+"@"+"#")) # @ will be the initial character
                                  # and # the final character. They
                                  # are not in the text.
print('{} unique characters'.format(len(vocab)))

# Lookup tables
char2idx = {u:i for i, u in enumerate(vocab)}
idx2char = np.array(vocab)

text_as_int = np.array([char2idx[c] for c in text])
print("char2idx")
print(char2idx)
print("idx2char")
print(idx2char)
print("text_as_int")
print(text_as_int)
print ('{} ---- characters mapped to int ---- >{}'.format(repr(text[:13]), text_as_int[:13]))

# Create training examples:
seq_length = 8
examples_per_epoch = len(text)//(seq_length)

int_text_tensor = torch.tensor(text_as_int)
chunks = torch.chunk(int_text_tensor, examples_per_epoch, 0)
print(int_text_tensor)

examples = [chunk[:-1] for chunk in chunks]
targets = [chunk[1:] for chunk in chunks]
print(f"""There are {len(examples)} chunks of {seq_length} characters available for the
network training.""")



model_parameters = {
    "dim_model": 512,
    "vocabulary_size": 67,
    "batch_size": 64,
    "encoder": {
        "nb_layers": 1,
        "dim_model": 256,
        "multihead": {
            "attention": {
                "dim_model": 256,
                "dim_key": 128,
                "dim_value": 128
                },
            "nb_heads": 2
            },
        "feedforward": {
            "dim_feedforward": 256
            }
        },
    "decoder": {
        "nb_layers": 1,
        "vocabulary_size": 67,
        "dim_model": 512,
        "multihead": {
            "attention": {
                "dim_model": 512,
                "dim_key": 64,
                "dim_value": 64,
            },
            "nb_heads": 8
        },
        "feedforward": {
            "dim_feedforward": 512
        }
    }
}

# transformer = Transformer(model_parameters)
# x = torch.randn(10, 128)
# lastOutput = torch.randn(3, 128)
# transformer(x, lastOutput)

decoder = LonelyDecoder(model_parameters)
x = torch.randn(10, model_parameters["vocabulary_size"])
lastOutput = torch.randn(3, model_parameters["vocabulary_size"])
#decoder(x)


one_hot_examples = F.one_hot(torch.stack(examples[:-1]).long(),
                             model_parameters["vocabulary_size"]).float()
targets = torch.stack(targets[:-1]).long()

print(f"one_hot_examples shape: {one_hot_examples.shape}")
print(''.join([idx2char[i] for i in torch.max(one_hot_examples[0], dim=1)[1].tolist()]))
print(f"targets shape: {targets.shape}")
print(''.join([idx2char[i] for i in targets[0]]))
print(f"one_hot_examples[0].shape: {one_hot_examples[0].shape}")
print(f"targets[0].shape: {targets[0].shape}")
#decoder(one_hot_examples[0])
one_hot_examples[0]

loss_fn = nn.CrossEntropyLoss()
#loss = loss_fn(decoder(one_hot_examples[0]), one_hot_targets[0])
#loss.backward()
optimizer = torch.optim.Adam(decoder.parameters(), lr=1e-3, betas=(0.9, 0.98))

from torch.utils.data import DataLoader, Dataset

class customDataset(Dataset):
    def __init__(self, data, target):
        self.data  = data
        self.target = target
    def __len__(self):
        return self.data.shape[0]
    def __getitem__(self, idx):
        return self.data[idx], self.target[idx]

# dataset = customDataset(data, target)
dataset = customDataset(one_hot_examples, targets)
train_dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

def train_loop(dataloader, model, loss_fn, optimizer, t):
    """Train loop. Taken from pytorch tutorial."""
    size = len(dataloader.dataset)
    # Set the model to training mode - important for batch
    # normalization and dropout layers Unnecessary in this situation
    # but added for best practices
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        pred = model(X)
        loss =  loss_fn(pred.transpose(1,2), y) # loss_fn(pred.transpose(1,2)[:,:,-1], y[:,-1])

        # Backpropagation
        loss.backward()
        for name, param in model.named_parameters():
            if param.requires_grad:
                print(f"{name}: {param.data}")
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * 64 + len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
            print("Input")
            print(''.join([idx2char[i] for i in torch.max(X[0], dim=1)[1].tolist()]))
            print("Target")
            print(''.join([idx2char[i] for i in y[0]]))
            print("Prediction")
            print(''.join([idx2char[i] for i in torch.max(pred[0], dim=1)[1].tolist()]))
        torch.save({
            'epoch': t+1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss,
            }, f"model_epoch{t}.pt")

def run_n_epochs(epochs, dataloader, model, loss_fn, optimizer):
    for t in range(epochs):
        print(f"Epoch {t+1}\n-------------------------------")
        train_loop(dataloader, model, loss_fn, optimizer, t)
print(decoder)
decoder(next(iter(train_dataloader))[0])


Length of text: 1115394 characters
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

67 unique characters
char2idx
{'\n': 0, ' ': 1, '!': 2, '#': 3, '$': 4, '&': 5, "'": 6, ',': 7, '-': 8, '.': 9, '3': 10, ':': 11, ';': 12, '?': 13, '@': 14, 'A': 15, 'B': 16, 'C': 17, 'D': 18, 'E': 19, 'F': 20, 'G': 21, 'H': 22, 'I': 23, 'J': 24, 'K': 25, 'L': 26, 'M': 27, 'N': 28, 'O': 29, 'P': 30, 'Q': 31, 'R': 32, 'S': 33, 'T': 34, 'U': 35, 'V': 36, 'W': 37, 'X': 38, 'Y': 39, 'Z': 40, 'a': 41, 'b': 42, 'c': 43, 'd': 44, 'e': 45, 'f': 46, 'g': 47, 'h': 48, 'i': 49, 'j': 50, 'k': 51, 'l': 52, 'm': 53, 'n': 54, 'o': 55, 'p': 56, 'q': 57, 'r': 58, 's': 59, 't': 60, 'u': 61, 'v': 62, 'w': 63, 'x': 64, 'y': 65, 'z': 66}
idx2char
['\n' ' ' '!' '#' '$' '&' "'" ',' '-' '.' '3' ':' ';' '?' '@' 'A' 'B' 'C'
 'D' '

tensor([[[ 1.1532,  0.4577, -0.7640,  ...,  0.6408, -0.1626,  0.1651],
         [ 0.3437,  0.5492, -0.6154,  ...,  0.4403,  0.1573,  0.8275],
         [ 0.5086,  0.4925, -0.5348,  ..., -0.1011, -0.4964,  0.2324],
         ...,
         [-0.0185,  0.2282, -1.6808,  ...,  0.1551, -0.6751,  0.5988],
         [ 0.0041,  0.7301, -1.3693,  ...,  0.1777, -0.5002,  0.4568],
         [ 0.1935, -0.0676, -1.0820,  ...,  0.0388, -0.5048,  0.3791]],

        [[ 1.7181,  0.8787, -0.3345,  ...,  0.3341,  0.4758,  0.5261],
         [ 0.9315,  0.4030, -0.7516,  ...,  0.5031, -0.2916,  0.4182],
         [ 0.2445,  0.0314, -0.5543,  ...,  0.0166, -0.6494,  0.5519],
         ...,
         [-0.1584,  0.2671, -1.2621,  ..., -0.1479, -0.3506,  0.7022],
         [ 0.4705,  0.0717, -1.0611,  ...,  0.0998, -0.6710,  0.6897],
         [ 0.3777, -0.1349, -1.2209,  ..., -0.0382, -0.5848,  0.0406]],

        [[ 1.4505,  0.6350, -1.1654,  ...,  0.2120,  0.2089,  0.2286],
         [ 0.9760,  0.9969, -0.8701,  ...,  0

In [18]:
print("Train loop")
def train_loop(dataloader, model, loss_fn, optimizer, t):
    """Train loop. Taken from pytorch tutorial."""
    size = len(dataloader.dataset)
    # Set the model to training mode - important for batch
    # normalization and dropout layers Unnecessary in this situation
    # but added for best practices
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        pred = model(X)
        loss =  loss_fn(pred.transpose(1,2), y) # loss_fn(pred.transpose(1,2)[:,:,-1], y[:,-1])

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * 64 + len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
            print("Input")
            print(''.join([idx2char[i] for i in torch.max(X[0], dim=1)[1].tolist()]))
            print("Target")
            print(''.join([idx2char[i] for i in y[0]]))
            print("Prediction")
            print(''.join([idx2char[i] for i in torch.max(pred[0], dim=1)[1].tolist()]))
        torch.save({
            'epoch': t+1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss,
            }, f"model_epoch{t}.pt")
def run_n_epochs(epochs, dataloader, model, loss_fn, optimizer):
    for t in range(epochs):
        print(f"Epoch {t+1}\n-------------------------------")
        train_loop(dataloader, model, loss_fn, optimizer, t)

run_n_epochs(10, train_dataloader, decoder, loss_fn, optimizer)

Train loop
Epoch 1
-------------------------------
loss: 2.069202  [   64/123932]
Input
!
My sou
Target

My soul
Prediction


y so r
loss: 1.992315  [ 6464/123932]
Input
,
And aw
Target

And awf
Prediction
 And tna
loss: 1.925008  [12864/123932]
Input
:
Not ma
Target

Not mad
Prediction

Tot ten
loss: 2.041757  [19264/123932]
Input
nna
Live
Target
na
Live 
Prediction
dotToke 
loss: 2.013444  [25664/123932]
Input
, in hea
Target
 in heav
Prediction
 af te r
loss: 1.942425  [32064/123932]
Input
 who sha
Target
who shal
Prediction
tierwhal
loss: 2.010029  [38464/123932]
Input
imes to 
Target
mes to c
Prediction
ne tth t
loss: 2.021778  [44864/123932]
Input
less tha
Target
ess than
Prediction
l t thet
loss: 2.016076  [51264/123932]
Input
tent!

H
Target
ent!

HA
Prediction
hr  

KE
loss: 1.992336  [57664/123932]
Input
shall be
Target
hall bea
Prediction
 all te 
loss: 2.048733  [64064/123932]
Input
illiam L
Target
lliam Lo
Prediction
nl nneto
loss: 1.908268  [70464/123932]
Input
 wars in
T

KeyboardInterrupt: 

In [None]:
 checkpoint = torch.load("model_epoch9.pt")

In [None]:
checkpoint['model_state_dict'].keys()

odict_keys(['norm.weight', 'norm.bias', 'embedding.embedding.weight', 'embedding.embedding.bias', 'multiheads1.0.WQs.0.weight', 'multiheads1.0.WQs.0.bias', 'multiheads1.0.WQs.1.weight', 'multiheads1.0.WQs.1.bias', 'multiheads1.0.WQs.2.weight', 'multiheads1.0.WQs.2.bias', 'multiheads1.0.WQs.3.weight', 'multiheads1.0.WQs.3.bias', 'multiheads1.0.WQs.4.weight', 'multiheads1.0.WQs.4.bias', 'multiheads1.0.WQs.5.weight', 'multiheads1.0.WQs.5.bias', 'multiheads1.0.WQs.6.weight', 'multiheads1.0.WQs.6.bias', 'multiheads1.0.WQs.7.weight', 'multiheads1.0.WQs.7.bias', 'multiheads1.0.WKs.0.weight', 'multiheads1.0.WKs.0.bias', 'multiheads1.0.WKs.1.weight', 'multiheads1.0.WKs.1.bias', 'multiheads1.0.WKs.2.weight', 'multiheads1.0.WKs.2.bias', 'multiheads1.0.WKs.3.weight', 'multiheads1.0.WKs.3.bias', 'multiheads1.0.WKs.4.weight', 'multiheads1.0.WKs.4.bias', 'multiheads1.0.WKs.5.weight', 'multiheads1.0.WKs.5.bias', 'multiheads1.0.WKs.6.weight', 'multiheads1.0.WKs.6.bias', 'multiheads1.0.WKs.7.weight', 'm

In [None]:
mymodel = LonelyDecoder(model_parameters)
mymodel.load_state_dict(checkpoint['model_state_dict'])
myoptimizer = torch.optim.Adam(decoder.parameters(), lr=1e-2, betas=(0.9, 0.98))
myoptimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']
print(f"Loaded checkpoint at epoch {epoch} with loss {loss}")

def train_loop(dataloader, model, loss_fn, optimizer, t):
    """Train loop. Taken from pytorch tutorial."""
    size = len(dataloader.dataset)
    # Set the model to training mode - important for batch
    # normalization and dropout layers Unnecessary in this situation
    # but added for best practices
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred.transpose(1,2)[:,:,-1], y[:,-1])

        # Backpropagation
        loss.backward()
        # Optimizer step
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * 64 + len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
            print("Input")
            print(''.join([idx2char[i] for i in torch.max(X[0], dim=1)[1].tolist()]))
            print("Target")
            print(''.join([idx2char[i] for i in y[0]]))
            print("Prediction")
            print(''.join([idx2char[i] for i in torch.max(pred[0], dim=1)[1].tolist()]))
        torch.save({
            'epoch': t+1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss,
            }, f"model_epoch{t}.pt")
print("Train loop")
def run_n_epochs(epochs, dataloader, model, loss_fn, optimizer):
    for t in range(epochs):
        print(f"Epoch {t+1}\n-------------------------------")
        train_loop(dataloader, model, loss_fn, optimizer, t)
run_n_epochs(1, train_dataloader, mymodel, loss_fn, optimizer)

Loaded checkpoint at epoch 10 with loss 0.08523036539554596
Train loop
Epoch 1
-------------------------------
loss: 2.264168  [   64/21870]
Input
 at me, make their pastime at my sorrow:
They shou
Target
at me, make their pastime at my sorrow:
They shoul
Prediction
at me, make their pastime at my soreow:
They shous
loss: 2.465627  [ 6464/21870]
Input
t want their remedies.
Cousin, I am too young to b
Target
 want their remedies.
Cousin, I am too young to be
Prediction
 want their remedies.
Cousin, I am too young to be
loss: 2.635937  [12864/21870]
Input
h to me to be at enmity;
I hate it, and desire all
Target
 to me to be at enmity;
I hate it, and desire all 
Prediction
 to me to be at enmity;
I hate it, and desire alle


KeyboardInterrupt: 

In [23]:
decoder.eval()
totalString = "ALEXANDRE:"
print(f"input:\n{totalString}")
T=10
for w in range(500):
  input = [char2idx[c] for c in totalString[-8:]]
  one_hot_initString = F.one_hot(torch.tensor(input).long(),
                             model_parameters["vocabulary_size"]).float()
  # print(decoder(one_hot_initString.squeeze(0))[0][-1].shape)
  next_char = torch.distributions.Categorical(probs = nn.Softmax()(decoder(one_hot_initString.squeeze(0))[0][-1])/T).sample()
  totalString = totalString + idx2char[next_char]

# print(one_hot_initString.shape)
print("Prediction:")
print(totalString) #''.join([idx2char[i] for i in torch.max(decoder(one_hot_initString.squeeze(0))[0],1)[1].tolist()]))
print("---------------------------------------------")

input:
ALEXANDRE:
Prediction:
ALEXANDRE:
Tyrank
What fear of thy, beforesa come tyrance I was may.
I my very worly
What my Grean thou art the day, and some
And whith mid of the despectings, you steks here. Their his no med whe she hard: a such thou with toed.

KING RICHARD III:
Shall him the quirest handelose you prack: to cannour, streace's lanotevers of Engle here arm, we storm I knower fain she deatio. That wake foul count the pepolip:
In the lightly! every graciest ben his thesed am a.

POLIXETER:
The sping the joy his dischose, i
---------------------------------------------


In [None]:
x,y = next(iter(train_dataloader))
print(x.shape)
decoder.eval()
print(''.join([idx2char[i] for i in torch.max(x[30], dim=1)[1].tolist()]))
print(''.join([idx2char[i] for i in torch.max(decoder(x[30].squeeze(0))[0],1)[1].tolist()]))

In [None]:
x,y = next(iter(train_dataloader))
print(x.shape)
decoder.eval()
print(''.join([idx2char[i] for i in torch.max(x[30], dim=1)[1].tolist()]))
print(''.join([idx2char[i] for i in torch.max(decoder(x[30].squeeze(0))[0],1)[1].tolist()]))

decoder.eval()
totalString = "HECTOR:"
idxString = [char2idx[c] for c in totalString[-8:]]
tokenizedString = F.one_hot(torch.tensor(idxString).long(),
                             model_parameters["vocabulary_size"]).float()

print(tokenizedString.shape)
for w in range(25):
    #print(decoder(tokenizedString.squeeze(0))[0].shape)
    next_char_proba = nn.Softmax(dim=-1)(decoder(tokenizedString.squeeze(0))[:,-1])
    next_char = torch.max(decoder(tokenizedString.squeeze(0))[0],1)[1].tolist()[-1]
    totalString = totalString + idx2char[next_char]
    #print(f"next_char_proba.shape {next_char_proba.shape}")
    #print(f"tokenizedString.shape {tokenizedString.shape}")
    tokenizedString = torch.cat([tokenizedString[1:], next_char_proba], dim=0)
    # print(tokenizedString)
    #print(tokenizedString.shape)
print(one_hot_initString.shape)
print("###")
print(totalString) #''.join([idx2char[i] for i in torch.max(decoder(one_hot_initString.squeeze(0))[0],1)[1].tolist()]))
print("###")

torch.Size([64, 8, 67])
vers, Va
e y  aal
torch.Size([7, 67])
torch.Size([8, 67])
###
HECTOR:
Tore eeel tenea elet ael
###


In [None]:
print(decoder)

LonelyDecoder(
  (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  (embedding): Embedding(
    (embedding): Linear(in_features=67, out_features=256, bias=True)
  )
  (multiheads1): ModuleList(
    (0): MultiHeadAttention(
      (WQs): ModuleList(
        (0-7): 8 x Linear(in_features=256, out_features=32, bias=True)
      )
      (WKs): ModuleList(
        (0-7): 8 x Linear(in_features=256, out_features=32, bias=True)
      )
      (WVs): ModuleList(
        (0-7): 8 x Linear(in_features=256, out_features=32, bias=True)
      )
      (spda): ScaledDotProductAttention(
        (softmax): Softmax(dim=None)
      )
    )
  )
  (multiheads2): ModuleList(
    (0): MultiHeadAttention(
      (WQs): ModuleList(
        (0-7): 8 x Linear(in_features=256, out_features=32, bias=True)
      )
      (WKs): ModuleList(
        (0-7): 8 x Linear(in_features=256, out_features=32, bias=True)
      )
      (WVs): ModuleList(
        (0-7): 8 x Linear(in_features=256, out_features=32, bias=

In [None]:
train_dataloader = DataLoader(dataset, batch_size=64, shuffle=False)
x,y = next(iter(train_dataloader))
print(x.shape)
print(y.shape)
print(torch.max(x[0], dim=1)[1].tolist())
print(y[0])

torch.Size([64, 8, 67])
torch.Size([64, 8])
[20, 49, 58, 59, 60, 1, 17, 49]
tensor([49, 58, 59, 60,  1, 17, 49, 60])


In [None]:
# Example of target with class indices
loss = nn.CrossEntropyLoss()
target = torch.empty(3, dtype=torch.long).random_(5)
input = F.one_hot(target, num_classes=5).float()
print(target)
print(input)
output = loss(input, target)
print(output)

tensor([2, 2, 4])
tensor([[0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 1.]])
tensor(0.9048)


In [None]:
def run_n_epochs(epochs, dataloader, model, loss_fn, optimizer):
    for t in range(epochs):
        print(f"Epoch {5+t+1}\n-------------------------------")
        train_loop(dataloader, model, loss_fn, optimizer)

run_n_epochs(45, train_dataloader, decoder, loss_fn, optimizer)

Epoch 6
-------------------------------
loss: 4.009966  [   64/123932]
Target
irst Cit
Prediction
XHakevm'


  return self._call_impl(*args, **kwargs)


loss: 3.983451  [ 6464/123932]
Target
irst Cit
Prediction
qZ mfjj,
loss: 3.990031  [12864/123932]
Target
irst Cit
Prediction
obQQfQzZ
loss: 4.036373  [19264/123932]
Target
irst Cit
Prediction
whWQooX 
loss: 4.001715  [25664/123932]
Target
irst Cit
Prediction
-gQGakn,
loss: 4.010240  [32064/123932]
Target
irst Cit
Prediction
g,#.TTJy
loss: 4.002563  [38464/123932]
Target
irst Cit
Prediction
o-r.merx
loss: 4.007478  [44864/123932]
Target
irst Cit
Prediction
our
Miou
loss: 3.987219  [51264/123932]
Target
irst Cit
Prediction
asbib!bj
loss: 4.015161  [57664/123932]
Target
irst Cit
Prediction
XuyxVkuc
loss: 4.015061  [64064/123932]
Target
irst Cit
Prediction
yd,
wiad
loss: 4.015932  [70464/123932]
Target
irst Cit
Prediction
T&eqtqzt
loss: 3.983913  [76864/123932]
Target
irst Cit
Prediction

Wha.Wr'
loss: 4.002244  [83264/123932]
Target
irst Cit
Prediction
nd;-n-;P
loss: 4.000215  [89664/123932]
Target
irst Cit
Prediction
';'Pjjzn
loss: 4.019834  [96064/123932]
Target
irst Cit
Prediction

b;e

KeyboardInterrupt: 