In [27]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.optim.lr_scheduler import LRScheduler
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import TemplateProcessing
import pandas as pd
import math

In [2]:
# Setting hyperparameters
if torch.backends.mps.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
train_split = 0.9
test_split = 1 - train_split
batch_size = 16 # how many independent sequences will we process in parallel?
# block_size = 32 # what is the maximum context length for predictions?
eval_iters = 200

VOCAB_SIZE = 3700
D_MODEL = 512
N_HEADS = 8
DFF = 2048
N_LAYERS = 6
DROPOUT_RATE = 0.5
path_to_data = './dataset/archive/'
test_data_file = 'wmt14_translate_de-en_test.csv'
train_data_file ='wmt14_translate_de-en_train.csv'
validation_data_file ='wmt14_translate_de-en_validation.csv'
path_to_tokenizer = './tokenizer/'

In [3]:
chunksize = 1000  # Number of rows per chunk
num_chunks_to_read = 444  # Number of chunks to read

# Initialize an empty list to store chunks
chunks = []

# Read the CSV file in chunks
for chunk in pd.read_csv(path_to_data + train_data_file, chunksize=chunksize):
    chunks.append(chunk)
    if len(chunks) >= num_chunks_to_read:
        break

# Concatenate chunks into a single DataFrame
train_data = pd.concat(chunks, ignore_index=True)
english_train = train_data.values[:, 1]
german_train = train_data.values[:, 0]
val_data = pd.read_csv(path_to_data + validation_data_file)
english_validation = val_data.values[:, 1]
german_validation = val_data.values[:, 0]
test_data = pd.read_csv(path_to_data + test_data_file)
english_test = test_data.values[:, 1]
german_test = test_data.values[:, 0]

In [4]:
{test_data.loc[i,x] for i, x in enumerate(test_data)}

{'Consequently, they will be particularly motivated playing against their former coach.',
 'Ursprünglich war die Schulhofsanierung sogar schon in den Jahren 2008/2009 geplant, doch hohe unplanmäßige Ausgaben brachten eine Verschiebung.'}

In [5]:
def clean_complete_data(data):
    # Identify rows with NaN values in either column
    nan_indices = data[data.isna().any(axis=1)].index
    
    # Remove these rows from the DataFrame
    cleaned_df = data.drop(nan_indices)
    
    return cleaned_df
train_data = clean_complete_data(train_data)
val_data = clean_complete_data(val_data)
test_data = clean_complete_data(test_data)

In [6]:
def clean_data(english, german):
    # Find indices of NaN values in both lists
    nan_indices_german = {i for i, x in enumerate(german) if isinstance(x, float) and math.isnan(x)}
    nan_indices_english = {i for i, x in enumerate(english) if isinstance(x, float) and math.isnan(x)}
    
    nan_indices = nan_indices_german.union(nan_indices_english)

    # Remove elements at NaN indices from both lists
    english_cleaned = [x for i, x in enumerate(english) if i not in nan_indices]
    german_cleaned = [x for i, x in enumerate(german) if i not in nan_indices]
    return english_cleaned, german_cleaned
english_train, german_train = clean_data(english_train, german_train)
english_test, german_test = clean_data(english_test, german_test)
english_validation, german_validation = clean_data(english_validation, german_validation)

In [7]:
with open('input.txt', 'r', encoding='utf-8') as f:
    shakespear_data = f.read()

# chars = sorted(list(set(text)))
# vocab_size = len(chars)
# stoi = {s:i for i,s in enume rate(chars)}
# itos = {i:s for i,s in enumerate(chars)}
# encode = lambda s: [stoi[c] for c in s]
# decode = lambda l: ''.join([itos[i] for i in l])

In [8]:
# data = torch.tensor(encode(text), dtype=torch.long)
n = int(train_split*len(data))
shakespear_train_data = shakespear_data[:n]
shakespear_val_data = shakespear_data[n:]

# # data loading
# def get_batch(split):
#     # generate a small batch of data of inputs x and targets y
#     data = train_data if split == 'train' else val_data
#     ix = torch.randint(len(data) - block_size, (batch_size,))
#     x = torch.stack([data[i:i+block_size] for i in ix])
#     y = torch.stack([data[i+1:i+block_size+1] for i in ix])
#     x, y = x.to(device), y.to(device)
#     return x, y

In [9]:
class PositionalEncoding(nn.Module):
    """
    Positional Encoding 
    PE(pos,2i) =sin(pos/10000^(2i/dmodel))
    PE(pos,2i+1) =cos(pos/10000^(2i/dmodel))
    """
    def __init__(self):
        super().__init__()

    def get_angles(self, pos, i, d_model):
        """
        pos: (seq_length, 1)
        i: (1, d_model)
        d_model: int (dimension of embedding)

        return: (seq_length, d_model)
        """
        power = 2*(i//2)/ torch.tensor(d_model, dtype=torch.float32)
        return pos / (torch.pow(10000, power))

    def forward(self, inputs):
        """
        inputs: (batch_size, seq_length, d_model)
        """
        assert len(inputs.shape) == 3
        seq_length = inputs.shape[-2]
        d_model = inputs.shape[-1]
        angles = self.get_angles(
            torch.arange(seq_length).unsqueeze(1),
            torch.arange(d_model).unsqueeze(0),
            d_model
        )
        
        pe = torch.zeros(seq_length, d_model, device = inputs.device)
        pe[:, 0::2] = torch.sin(angles[:, 0::2])
        pe[:, 1::2] = torch.cos(angles[:, 1::2])
        pe.unsqueeze(0)
        return inputs + pe
        

In [10]:
"""
Positional encoding test
"""
test_pe_input = torch.tensor([
    [[1,2,3], [2,3,4]], 
    [[3,4,5], [4,5,6]]
]) #batch_size = 2, seq_length = 2, d_model = 3
pos1_i0 = torch.sin(torch.tensor(1/math.pow(10000,0)))
pos1_i1 = torch.cos(torch.tensor(1/math.pow(10000,0)))
pos1_i2 = torch.sin(torch.tensor(1/math.pow(10000,2/float(3))))
expected_pe = torch.tensor([
    [[0, 1, 0], [pos1_i0, pos1_i1, pos1_i2]], 
    [[0, 1, 0], [pos1_i0, pos1_i1, pos1_i2]]
]) + test_pe_input
assert (expected_pe == PositionalEncoding().forward(test_pe_input)).all()

In [105]:
class MultiHeadAttention(nn.Module):
    def __init__(self, n_heads, d_model):
        super().__init__()
        self.n_heads = n_heads
        self.d_model = d_model
        self.d_head = self.d_model // self.n_heads
        self.query_lin = nn.Linear(in_features = self.d_model, out_features = self.d_model).to(device)
        self.key_lin = nn.Linear(in_features = self.d_model, out_features = self.d_model).to(device)
        self.value_lin = nn.Linear(in_features = self.d_model, out_features = self.d_model).to(device)
        self.final_lin = nn.Linear(in_features = self.d_model, out_features = self.d_model).to(device)
        
        
    def scaled_dot_product_attention(self, query, key, value, mask):
        """
        softmax((QK.T)/sqrt(dk))V
        
        query: (batch_size, num_heads, seq_length, d_k)
        key: (batch_size, num_heads, seq_length, d_k)
        value: (batch_size, num_heads, seq_length, d_v)
        mask: (batch_size, 1, 1, seq_length)
        return: (batch_size, num_heads, seq_length, d_v)
        """
        assert len(query.shape) == len(key.shape) and len(query.shape) == len(value.shape)
        assert key.dtype == torch.float
        
        product = query @ (key.transpose(-1,-2))
        
        dk = torch.tensor(key.shape[-1], dtype = torch.float32)
        sqrt_dk = torch.sqrt(dk)
        scaled_product = product/sqrt_dk

        if mask is not None:
            scaled_product += mask * -1e9

        softmax = torch.softmax(scaled_product, dim = -1)
        attention = softmax @ value
        return attention

    def split_to_heads(self, inputs, batch_size):
        """
        input: (batch_size, seq_length, d_model)
        return: (batch_size, n_proj, seq_length, d_model//n_heads)
        """
        proj_inputs = inputs.view(batch_size, -1, self.n_heads, self.d_head)
        return proj_inputs.transpose(1, 2)

    def concat_from_heads(self, inputs, batch_size):
        """
        input: (batch_size, n_proj, seq_length, d_model//n_heads)
        return: (batch_size, seq_length, d_model)
        """
        return inputs.transpose(2,1).reshape(batch_size, -1, self.d_model)

    def forward(self, query, key, value, mask):
        """
        query: (batch_size, seq_length, d_model)
        key: (batch_size, seq_length, d_model)
        value: (batch_size, seq_length, d_model)
        mask: (batch_size, 1, 1, seq_length)
        
        return: (batch_size, seq_length, d_model)
        """

        batch_size = query.shape[0]
        queries = self.query_lin(query.to(torch.float32))
        keys = self.key_lin(key.to(torch.float32))
        values = self.value_lin(value.to(torch.float32))

        queries = self.split_to_heads(queries, batch_size)
        keys = self.split_to_heads(keys, batch_size)
        values = self.split_to_heads(values, batch_size)
        
        attention = self.scaled_dot_product_attention(queries, keys, values, mask)
        attention = self.concat_from_heads(attention, batch_size)
        outputs = self.final_lin(attention)
        return outputs

In [12]:
"""
Multi Head Attention Test
"""
def test_multi_head_attention_output_shape(input_shape = (32, 50, 64), n_heads = 8):
     # (batch_size, seq_length, d_model)
    mha = MultiHeadAttention(n_heads=n_heads, d_model=input_shape[-1])
    batch_size, seq_length, d_model = input_shape
    query = torch.rand(batch_size, seq_length, d_model)
    key = torch.rand(batch_size, seq_length, d_model)
    value = torch.rand(batch_size, seq_length, d_model)
    outputs = mha.forward(query, key, value, mask)
    assert outputs.shape == (batch_size, seq_length, d_model)
    
test_multi_head_attention_output_shape()

In [13]:
"""
Scaled Dot Product Attention Test
"""
def test_scaled_dot_product():
    test_scaled_dot_attention = torch.tensor([
        [[1,2,3], [2,3,4]], 
        [[3,4,5], [4,5,6]]
    ], dtype = torch.float32) #batch_size:2, seq_length: 2, d_model: 3
    test_mask = torch.tensor([
        [[0, 0]],
        [[0, 1]]
    ])# (batch_size, 1, seq_length)
    test_product = torch.tensor([[[14., 20.],
             [20., 29.]],
            [[50., 62.],
             [62., 77.]]], dtype = torch.float32)
    test_scaled_product = test_product/math.sqrt(3)
    test_scaled_product[1, :, 1] += -1e9 #applying mask
    expected_attention = torch.softmax(test_scaled_product, dim = -1) @ test_scaled_dot_attention
    assert (expected_attention == MultiHeadAttention(5, test_scaled_dot_attention.shape[-1]).scaled_dot_product_attention(test_scaled_dot_attention, test_scaled_dot_attention, test_scaled_dot_attention, test_mask)).all()

test_scaled_dot_product()

In [58]:
class FeedForward(nn.Module):
    def __init__(self, d_model, dff):
        super().__init__()
        self.feed_forward_inner_lin = nn.Linear(in_features = d_model, out_features = dff).to(device)
        self.feed_forward_relu = nn.ReLU()
        self.feed_forward_outer_lin = nn.Linear(in_features = dff, out_features = d_model).to(device)

    def forward(self, inputs):
        """
        inputs: (batch_size, seq_length, d_model)
        return: (batch_size, seq_length, d_model)
        """
        outputs = self.feed_forward_inner_lin(inputs)
        outputs = self.feed_forward_relu(outputs)
        outputs = self.feed_forward_outer_lin(outputs)
        return outputs
        

In [15]:
"""
Feed Forward Test
"""
def test_feed_forward_output_shape(input_shape = (32, 50, 64), dff = 2048):
     # (batch_size, seq_length, d_model)
    ffn = FeedForward(input_shape[-1] ,dff)
    batch_size, seq_length, d_model = input_shape
    inputs = torch.rand(batch_size, seq_length, d_model)
    outputs = ffn.forward(inputs)
    assert outputs.shape == (batch_size, seq_length, d_model)
    
test_feed_forward_output_shape()

In [57]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, n_heads, dff, dropout_rate = 0.5):
        super().__init__()
        self.multi_head_attention = MultiHeadAttention(n_heads=n_heads, d_model=d_model)
        self.attention_dropout = nn.Dropout(p = dropout_rate)
        self.attention_layer_norm = nn.LayerNorm(d_model).to(device)
        self.feed_forward = FeedForward(d_model, dff)
        self.feed_forward_layer_norm = nn.LayerNorm(d_model).to(device)
        self.feed_forward_dropout = nn.Dropout(p = dropout_rate)

    def forward(self, inputs, mask):
        attention = self.multi_head_attention(inputs,
                                           inputs,
                                           inputs,
                                           mask)
        attention = self.attention_dropout(attention)
        attention = self.attention_layer_norm(inputs + attention)

        outputs = self.feed_forward(attention)
        outpus = self.feed_forward_dropout(outputs)
        outputs = self.feed_forward_layer_norm(attention + outputs)
        return outputs
        

In [17]:
"""
Encoder Layer Test
"""
def test_encoder_layer_output_shape(input_shape = (32, 50, 64), n_heads = 8, dff = 2048):
     # (batch_size, seq_length, d_model)
    encoder_layer = EncoderLayer(d_model = input_shape[-1] , n_heads = n_heads, dff = dff)
    batch_size, seq_length, d_model = input_shape
    inputs = torch.rand(batch_size, seq_length, d_model)
    mask = torch.ones(batch_size, 1, 1, seq_length)
    outputs = encoder_layer.forward(inputs, mask)
    assert outputs.shape == (batch_size, seq_length, d_model)
    
test_encoder_layer_output_shape()

In [61]:
class Encoder(nn.Module):
    def __init__(self, 
                 d_model, 
                 n_heads, 
                 dff = 2048,
                 n_layers = 6, 
                 dropout_rate = 0.5):
        super().__init__()
        self.n_layers = n_layers
        self.encoder_layers = [EncoderLayer(d_model, n_heads, dff, dropout_rate) for _ in range(n_layers)]

    def forward(self, inputs, mask):
        outputs = self.encoder_layers[0](inputs, mask)
        for i in range(1, self.n_layers):
            outputs = self.encoder_layers[i](outputs, mask)

        return outputs

In [116]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, n_heads, dff, dropout_rate = 0.5):
        super().__init__()
        self.multi_head_masked_attention = MultiHeadAttention(n_heads=n_heads, d_model=d_model)
        self.masked_attention_dropout = nn.Dropout(p = dropout_rate)
        self.masked_attention_layer_norm = nn.LayerNorm(d_model).to(device)
        
        self.multi_head_attention = MultiHeadAttention(n_heads=n_heads, d_model=d_model)
        self.attention_dropout = nn.Dropout(p = dropout_rate)
        self.attention_layer_norm = nn.LayerNorm(d_model).to(device)
        
        self.feed_forward = FeedForward(d_model, dff)
        self.feed_forward_layer_norm = nn.LayerNorm(d_model).to(device)
        self.feed_forward_dropout = nn.Dropout(p = dropout_rate)

    def forward(self, inputs, enc_outputs, lookahead_mask, padding_mask):
        """
        inputs: (batch_size, generated_seq_length, d_model)
        enc_outputs: (batch_size, seq_length, d_model)
        lookahead_mask: (batch_size, 1, 1, seq_length)
        padding_mask: (batch_size, 1, 1, seq_length)
        
        """
        masked_attention = self.multi_head_masked_attention(inputs,
                                           inputs,
                                           inputs,
                                           lookahead_mask) # (batch_size, generated_seq_length, d_model)
        masked_attention = self.masked_attention_dropout(masked_attention) # (batch_size, generated_seq_length, d_model)
        masked_attention = self.masked_attention_layer_norm(inputs + masked_attention) # (batch_size, generated_seq_length, d_model)
        
        attention = self.multi_head_attention(masked_attention,
                                           enc_outputs,
                                           enc_outputs,
                                           padding_mask)

        attention = self.attention_dropout(attention)
        attention = self.attention_layer_norm(masked_attention + attention)

        outputs = self.feed_forward(attention)
        outpus = self.feed_forward_dropout(outputs)
        outputs = self.feed_forward_layer_norm(attention + outputs)
        return outputs

In [117]:
class Decoder(nn.Module):
    def __init__(self, 
                 d_model, 
                 n_heads, 
                 dff = 2048,
                 n_layers = 6, 
                 dropout_rate = 0.5):
        super().__init__()
        self.n_layers = n_layers
        self.d_model = d_model
        self.decoder_layers = [DecoderLayer(d_model, n_heads, dff, dropout_rate) for _ in range(n_layers)]

    def forward(self, inputs, enc_outputs, lookahead_mask, padding_mask):
        outputs = self.decoder_layers[0](inputs, enc_outputs, lookahead_mask, padding_mask)
        for i in range(1, self.n_layers):
            outputs = self.decoder_layers[i](outputs, enc_outputs, lookahead_mask, padding_mask)

        return outputs

In [118]:
class Transformer(nn.Module):
    def __init__(self,
                vocab_size,
                d_model = 512,
                n_heads = 8,
                dff = 2048,
                n_layers = 6,
                dropout_rate = 0.5):
        super().__init__()
        self.n_layers = n_layers
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding()
        self.input_dropout = nn.Dropout(dropout_rate)
        self.output_dropout = nn.Dropout(dropout_rate)
        self.encoder = Encoder(d_model, n_heads, dff, n_layers, dropout_rate)
        self.decoder = Decoder(d_model, n_heads, dff, n_layers, dropout_rate)
        self.fc_out = nn.Linear(d_model, vocab_size, bias = False).to(device)
        
        # Share the weights between the embedding and the output linear layer
        self.fc_out.weight = torch.nn.Parameter(self.embedding.weight)
        self.final_softmax = nn.Softmax(-1)

    def create_padding_mask(self, inputs):
        """
        inputs: (batch_size, seq_length)
        return: (batch_size, 1, 1, seq_length)
        """
        mask = (inputs == 0).unsqueeze(1).unsqueeze(2).to(torch.float32).to(device)
        return mask

    def create_lookahead_mask(self, inputs):
        seq_len = inputs.size(1)
        look_ahead_mask = 1 - torch.triu(torch.ones((seq_len, seq_len)), diagonal=1)
        return look_ahead_mask.to(device)

    def forward(self, inputs, outputs):
        """
        inputs: (batch_size, seq_length)
        outputs: (batch_size, generated_seq_length)
        """
        embedded_inputs = self.embedding(inputs) * torch.sqrt(torch.tensor(self.d_model, dtype=torch.float32)) # (batch_size, seq_length, d_model)
        pe_inputs = self.positional_encoding(embedded_inputs) # (batch_size, seq_length, d_model)
        pe_inputs = self.input_dropout(pe_inputs)# (batch_size, seq_length, d_model)
        
        embedded_outputs = self.embedding(outputs) * torch.sqrt(torch.tensor(self.d_model, dtype=torch.float32))# (batch_size, generated_seq_length, d_model)
        pe_outputs = self.positional_encoding(embedded_outputs) # (batch_size, generated_seq_length, d_model)
        pe_outputs = self.input_dropout(pe_outputs)  # (batch_size, generated_seq_length, d_model)

        padding_mask_encoder = self.create_padding_mask(inputs) #(batch_size, 1, 1, seq_length)
        encoded_outputs = self.encoder(pe_inputs, padding_mask_encoder)# (batch_size, seq_length, d_model)
        
        padding_mask_decoder = self.create_padding_mask(inputs) # (batch_size, 1, 1, seq_length)
        lookahead_mask_decoder = torch.maximum(
            self.create_padding_mask(outputs), # (batch_size, 1, 1, generated_seq_length)
            self.create_lookahead_mask(outputs) # (generated_seq_length, generated_seq_length)
        )

        decoded_outputs = self.decoder(pe_outputs,
                                      encoded_outputs,
                                      lookahead_mask_decoder,
                                      padding_mask_decoder) #(batch_size, generated_seq_length, d_model)

        outputs = self.fc_out(decoded_outputs) #(batch_size, generated_seq_length, vocab_size)
        outputs = self.final_softmax(outputs) #(batch_size, generated_seq_length, vocab_size))
        return outputs   

In [119]:
class CustomLRScheduler(LRScheduler):
    def __init__(self, optimizer, d_model, warmup_steps=4000, last_epoch=0, initial_lr = 1.7e-07):
        self.d_model = d_model
        self.warmup_steps = warmup_steps
        for param_group in optimizer.param_groups:
            param_group['initial_lr'] = param_group['lr']
        super(CustomLRScheduler, self).__init__(optimizer, last_epoch)
        
    
    def get_lr(self):
        step = self.last_epoch + 1

        arg1 = step ** -0.5
        arg2 = step * (self.warmup_steps ** -1.5)
        lr = (self.d_model ** -0.5) * min(arg1, arg2)
        
        return [lr for _ in self.optimizer.param_groups]

In [23]:
def testing():
    step = 1
    warmup_steps = 4000
    d_model = 512
    arg1 = step ** -0.5
    arg2 = step * (warmup_steps ** -1.5)
    lr = (d_model ** -0.5) * min(arg1, arg2)
    return lr
testing()

1.746928107421711e-07

In [24]:
class Tokenizer():
    def __init__(self, files):
        if files is not None:
            self.tokenizer = ByteLevelBPETokenizer()
            self.tokenizer.train(files=files, 
                                 vocab_size=VOCAB_SIZE, 
                                 min_frequency=2, 
                                 special_tokens=[
                                     "<PAD>",
                                     "<START>",
                                     "<END>",
                                     "<UNK>",
                                 ])
            self.tokenizer.pad_token = "<PAD>"
            self.tokenizer.pad_token_id = 0
            self.tokenizer.post_processor = TemplateProcessing(
                                            single="<START> $A <END>",
                                            pair="<START> $A <END> $B:1 <END>:1",
                                            special_tokens=[("<PAD", 0), ("<START>", 1), ("<END>", 2)],
                                        )
        else:
            self.tokenizer = ByteLevelBPETokenizer(path_to_tokenizer + 'vocab.json',
                                                  path_to_tokenizer + 'merges.txt')
            self.tokenizer.pad_token = "<PAD>"
            self.tokenizer.pad_token_id = 0
            self.tokenizer.post_processor = TemplateProcessing(
                                            single="<START> $A <END>",
                                            pair="<START> $A <END> $B:1 <END>:1",
                                            special_tokens=[("<PAD", 0), ("<START>", 1), ("<END>", 2)],
                                        )
    def encode(self, text):
        return self.tokenizer.encode(text)

    def decode(self, encoded_text):
        return self.tokenizer.decode(encoded_text)

In [25]:
# tknzr = Tokenizer(path_to_data + train_data_file)
# tknzr.tokenizer.save_model(path_to_tokenizer)
# tknzr.encode("<start>")

In [28]:
tokenizer = Tokenizer(None)
tokenizer.encode("test").ids

[1, 87, 395, 2]

In [29]:
def find_max_seq_length():
    max_length = 0
    sentence = ""
    tokenizer = Tokenizer(None)
    for i in english_train + english_test + english_validation + german_train + german_test + german_validation:
        encoded = tokenizer.encode(i)
        if max_length < len(encoded.ids):
            max_length = len(encoded.ids)
            sentence = i
    print(max_length) # 9331
    print(sentence)
# find_max_seq_length()

9331
Su-jeong (7) O" (8) O&A (9) O&A Army (10) O&C Railroad (11) O&G (12) O&K (13) O&M (14) O&M Hausser (15) O&O (16) O&O Defrag (17) O&W (18) O&Y (19) O&YH Union (20) O&Y Hope Union (21) O&g (22) O' (23) O'Ahu 'Alauahio (24) O'Ahu 'Amakihi (25) O'B (26) O'Bama (27) O'Bananon Publishing (28) O'Banion Middle School (29) O'Bannon (30) O'Bannon, Kentucky (31) O'Bannon, Louisville (32) O'Bannon (DD-987) (33) O'Bannon (DD 987) (34) O'Bannon (surname) (35) O'Bannon Mill (36) O'Bannon Publishing (37) O'Bannon Woods State Park (38) O'Berry Neuro-Medical Center (39) O'Biden (40) O'Bleness Memorial Hospital (41) O'Boylan (42) O'Boyle (43) O'Boyle Donegal (44) O'Braein, Tighernach (45) O'Braien (46) O'Brian (47) O'Brian White (48) O'Brian Woodbine (49) O'Brien (50) O'Brien's-Bridge (51) O'Brien's Bridge (52) O'Brien's Tower (53) O'Brien, Argentina (54) O'Brien, George Donoghue (55) O'Brien, John (56) O'Brien, Michael (57) O'Brien, OR (58) O'Brien, Or (59) O'Brien, Oregon (60) O'Brien, Sean (61) O

In [30]:
def find_max_seq_length_for_batch(data):
    max_length = 0
    for encoded in data:
        if max_length < len(encoded.ids):
            max_length = len(encoded.ids)
    return max_length

In [151]:
from torch.nn.utils.rnn import pad_sequence

def get_batch(split, tokenizer, batch_size = batch_size):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data), (batch_size,))
    x = [tokenizer.encode(i).ids for i in data.values[ix, 1]]
    y = [tokenizer.encode(i).ids for i in data.values[ix, 0]]
    x = pad_sequence([torch.tensor(i) for i in x], batch_first= True, padding_value=0)
    y = pad_sequence([torch.tensor(i) for i in y], batch_first= True, padding_value=0)
    # x, y = x.to(device), y.to(device)
    return x, y

def transformer_loss(target, pred):
    # Compute the cross-entropy loss
    return criterion(pred.transpose(-1, -2), target)
    

def model_predict(X, target_max_length = 9400, sos_token_output = 1, eos_token_output =2):
    X = X.unsqueeze(0)
    out_sentence = torch.tensor(sos_token_output)
    output = out_sentence.unsqueeze(axis=0).unsqueeze(0)
    logits = model(X, output)
    for _ in range(target_max_len):
        predictions = model(inputs, output) 
        prediction = predictions[:, -1:, :]
        predicted_id = torch.tensor(torch.argmax(prediction, axis=-1), dtype = torch.int32)
        if predicted_id == eos_token_output:
            return torch.squeeze(output, axis=0)
        # Concat the predicted word to the output sequence
        output = torch.concat([output, predicted_id], axis=-1)

def model_test(X, Y):
    outputs = Y[:,:-1]
    X, outputs = X.to(device), outputs.to(device)
    logits = model(X, outputs)
    loss = transformer_loss(Y[:,1:].to(device), logits)
    return loss
    
@torch.no_grad()
def estimate_loss(tokenizer, sos_token_output = 1, eval_iters = 5):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            # print("Evaluation iteration", k)
            X, Y = get_batch(split, tokenizer)
            loss = model_test(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [None]:
max_iters = 1000
eval_interval = 10
tokenizer = Tokenizer(None)
model = Transformer(VOCAB_SIZE,
                   D_MODEL,
                    N_HEADS,
                    DFF,
                    N_LAYERS,
                    DROPOUT_RATE)
m = model.to(device)
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.AdamW(model.parameters(), lr=1.7e-07, betas=(0.9, 0.98), eps = 1e-9)
scheduler = CustomLRScheduler(optimizer, d_model=512, warmup_steps=4000)

In [None]:
for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss(tokenizer)
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train', tokenizer)
    

    # evaluate the loss
    loss = model_test(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    scheduler.step()

3.7888 M parameters
step 0: train loss 8.2160, val loss 8.2157
step 10: train loss 8.2157, val loss 8.2159
step 20: train loss 8.2162, val loss 8.2158
step 30: train loss 8.2156, val loss 8.2149
step 40: train loss 8.2153, val loss 8.2160
step 50: train loss 8.2153, val loss 8.2147
