In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.datasets import TranslationDataset, Multi30k
from torchtext.data import Field, BucketIterator

import spacy
import numpy as np

import random
import math
import time

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [3]:
spacy_de = spacy.load('de_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

In [4]:
def tokenize_de(text):
    """
    Tokenizes German text from a string into a list of strings (tokens) and reverses it
    """
    return [tok.text for tok in spacy_de.tokenizer(text)][::-1]

def tokenize_en(text):
    """
    Tokenizes English text from a string into a list of strings (tokens)
    """
    return [tok.text for tok in spacy_en.tokenizer(text)]


In [5]:
SRC = Field(tokenize = tokenize_de, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)

TRG = Field(tokenize = tokenize_en, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)

In [7]:
train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'), 
                                                    fields = (SRC, TRG))

Two young, White males are outside near many bushes. 

Several men in hard hats are operating a giant pulley system. 

A little girl climbing into a wooden playhouse. 

A man in a blue shirt is standing on a ladder cleaning a window. 

Two men are at the stove preparing food. 

A man in green holds a guitar while the other man observes his shirt. 

A man is smiling at a stuffed lion 

A trendy girl talking on her cellphone while gliding slowly down the street. 

A woman with a large purse is walking by a gate. 

Boys dancing on poles in the middle of the night. 

A ballet class of five girls jumping in sequence. 

Four guys three wearing hats one not are jumping at the top of a staircase. 

A black dog and a spotted dog are fighting 

A man in a neon green and orange uniform is driving on a green tractor. 

Several women wait outside in a city. 

A lady in a black top with glasses is sprinkling powdered sugar on a bundt cake. 

A little girl is sitting in front of a large painted rainb

Two individuals, a male and a female, standing in a forested area around a tub. 

A sleeping baby is in someone's arms and wearing a pink striped outfit 

A member of an African tribe is watching the camera intently in tribal dress. 

A samurai warrior in full black dress takes his sword from the sheath on an outdoor training mat. 

Several young people sitting on a rail above a crowded beach. 

Five hikers, one facing towards the camera and the others facing away from it, are walking through a rocky riverbed. 

A security guard stands by a metal, lighted sculpture. 

Two women, one in green and the other in purple, washing a sidewalk. 

Two girls crouch in front of some bushes and talk on their phones. 

A young girl wearing a multicolored holding an orange ball in her right hand walking through bright green grass behind a house 

A young child playing with a toy while laying on the floor. 

A little boy skateboarder is doing a trick on his board while another young skateboarder watch

A woman is walking at dusk down an urban street. 

A little girl with brown hair is blowing the petals off of a flower. 

Three people on ATV's outside. 

A man on a four wheeler is flying through the air. 

A man on a four-wheeler jumps near a small building. 

A woman with dark hair wearing a bikini is sitting on a beach. 

Two hands are flipping some food in a cast iron pan, with a spatula. 

Two women hugging in a grassy, fenced field with a cow's behind in the background. 

A large ship approaching the dock with two men awaiting its arrival. 

A man in sandals and white cardigan sits on a green bench while talking on his cellphone. 

A man on stage singing into a microphone. 

Three guys riding on an elephant with a house-like structures and trees in the background. 

A mexican man sits under the hood of his truck. 

A boy in yellow glasses and red-haired girl pose for the camera. 

A group of people gather and holding different types of flags. 

A white baby boy crying over a tip

A lady wearing a helmet holding a bike. 

A young man standing in front of a red freight car holding a camera and taking a photo. 

A person in blue shorts and wearing a Walkman jogs. 

A man and a woman play on a tree while others look on. 

A group of young people are in a garage 

Three men are looking at a sign on a cross in a canyon. 

A young lady sits in front of a fence with a bucket. 

The woman tries to hide from work under a black sweatshirt, but her red corduroy pants give her away. 

A young man cleaning a statue with a brush. 

Two young men clutch rags in their hands as a elderly man tells them how to clean the large cross. 

A woman sits on a man's shoulders using a heavy brush as a hammer. 

Group setting up a wooden cross near a large boulder. 

A young boy picks up a paintbrush and grins. 

Four men in white shirts and baseball caps sitting around a table. 

A boy looking at a chopped log. 

Man with gray hair telling a story to a group of younger people on a bench. 

A woman with dirty blond-hair and glasses is cutting something. 

A man wearing a ball cap and a blue T-shirt rides a white horse down a dirt road through lush, green nature. 

A middle-aged woman cooking while her dog watches. 

Redhead woman in pigtails and glasses sewing on a sewing machine. 

Two men in a room looking at a computer screen. 

A gray dog runs along side a pool while a yellow dog jumps into the pool. 

A child is sliding down a hill on a sled. 

A man and a woman locking arms (wearing expensive clothing) next to glass display (perhaps retail stores) on the sidewalk in an urban setting. 

Two workers toil in a smelting factory. 

An old woman sits in a transit station next to a backlit advertisement. 

People waiting outside a building next to a mural. 

Three people push a piece of large machinery through the street. 

A man wearing a sleeveless shirt and construction helmet. 

A man with a beard and a hat is begging for money from people. 

The girl on the unicycle r

A woman in the middle of throwing a bowling ball in a bowling alley. 

One man is playing the drums while another is singing Karaoke. 

A brown horse stands near a black horse that is sitting on the ground. 

A kid looking over a carnival booth counter at the various toys. 

A man sleeps in a hammock next to water and a boat. 

Eleven little girls posing as a team in a pool. 

A man in a gray and black sweater is sitting on a stool in front of a group of cages. 

A very unusually dressed man sitting beside an ice cream cooler. 

A man in a brown shirt is sitting on the sidewalk and playing a guitar. 

A little girl with a tiara eating in someones lap. 

A black and brown dog is laying on a white shaggy carpet. 

The three girls sat on the beach. 

A little boy looks at his reflection in a burnished marble wall. 

Men and women on bikes stop to look at something. 

A little girl in pink dances with her hands on her hips. 

A man and woman enjoy a cigarette outside of a shop. 

An Asian 

Two bicyclists riding beside some railroad tracks. 

A couple enjoy a beautiful day bicycling on a bike trail wearing bicycling safety helmets and gloves. 

Women in bike helmets take break from long ride. 

A man wearing a black helmet, blue jacket, and khaki shirts rides his bike on the street next to a red car. 

Kid about to go on bike ride with parent 

Two little boys look back as they walk across a grassy area. 

Child on a small motorbike near a small pond. 

Two little blond girls in helmets are sitting on a red ATV. 

A boy in white plays baseball. 

A young boy with his tongue stuck out is climbing onto a wooden platform. 

A woman standing at the counter of a takeout window. 

A dog wearing a blue harness stands in the snow and pants. 

A woman is standing next to a Japanese version of Disney's Snow White. 

A woman orders a dish at a street kitchen vendor. 

Dogs pulling a sled in a sled race. 

A lone musician in black is on stage, playing an acoustic guitar and singing i

A man in a red shirt is jumping from a large rock formation. 

Several people are riding on a roller coaster while reacting to going through a loop in various ways. 

A young boy examines a field of pumpkins, with some already in his wheelbarrow. 

Guy playing banjo in the park as a duck enjoys the music. 

The girl in the purple top and shorts, wearing a hat, is laughing. 

Woman in a bikini top is walking on the beach 

A man in a dark jacket stands next to a man dressed in brown reaching down into a bag. 

A man on stage performing a concert for people. 

An ice cream truck is stopped in front of two small apartment buildings. 

A boy wearing black swimming trunks is standing in a fountain. 

A young Asian boy leaps for joy into a pool of water, his tongue stuck out for joy. 

Some kids playing near the street. 

This boy is playing on a playground with tires on it. 

A SCUBA diver swimming deep underwater with a turtle. 

A group of people carry things down a narrow road. 

A brown

A black dog and a white dog race in a grassy field while spectators look on. 

A bunch of birds outside of a building. 

A golfer prepares to take a shot on golf course surrounded by trees. 

A firefighter extinguishes a fire under the hood of a car. 

People gather around a table with many red, white, and blue decorations around in the spacious tent. 

A toddler with eggs in a bowl. 

Three teenagers are carrying wood down a street while one of the teenagers is smiling at the camera. 

A man stands in front of the Gateway Arch. 

A young boy covered in ink stands in front of a white door. 

A blond girl is stepping on vertical logs in the sand. 

A young girl with a purple shirt, waters grass. 

The cat looks away from the person next to it. 

A couple waiting at a crosswalk in a brightly lit city. 

A person in a yellow shirt stretching on a bridge. 

A gut wearing a plaid shirt with a mustache selling fish at a market. 

City workers are doing maintenance on a train track. 

A child

KeyboardInterrupt: 

In [29]:
SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data, min_freq = 2)

In [93]:
# # vars returns attribute and value pairs in a dict format
# for i in range(10):
#     text = vars(train_data.examples[i])['trg'] 
#     print(f"length:{len(text)} - body:{text}",'\n')

length:11 - body:['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.'] 

length:12 - body:['several', 'men', 'in', 'hard', 'hats', 'are', 'operating', 'a', 'giant', 'pulley', 'system', '.'] 

length:9 - body:['a', 'little', 'girl', 'climbing', 'into', 'a', 'wooden', 'playhouse', '.'] 

length:15 - body:['a', 'man', 'in', 'a', 'blue', 'shirt', 'is', 'standing', 'on', 'a', 'ladder', 'cleaning', 'a', 'window', '.'] 

length:9 - body:['two', 'men', 'are', 'at', 'the', 'stove', 'preparing', 'food', '.'] 

length:15 - body:['a', 'man', 'in', 'green', 'holds', 'a', 'guitar', 'while', 'the', 'other', 'man', 'observes', 'his', 'shirt', '.'] 

length:8 - body:['a', 'man', 'is', 'smiling', 'at', 'a', 'stuffed', 'lion'] 

length:14 - body:['a', 'trendy', 'girl', 'talking', 'on', 'her', 'cellphone', 'while', 'gliding', 'slowly', 'down', 'the', 'street', '.'] 

length:12 - body:['a', 'woman', 'with', 'a', 'large', 'purse', 'is', 'walking', 'by', 'a', 'gate', '.'] 


In [30]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [31]:
BATCH_SIZE = 32

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    device = device)

In [32]:
# print(len(train_data), len(train_iterator))
# print(len(train_data), len(train_iterator))
# print(len(test_data), len(test_iterator))

In [49]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        # Tell interpreter that this class inherits super class, which is nn.Module in this case
        super().__init__()
        
        # The dimension of hidden cells 
        self.hid_dim = hid_dim
        # The number of hidden layers, which is equal to the number of outputs of encoder
        self.n_layers = n_layers
        # The word embeddings layer, emb_dim specifies the dimension of word embeddings
        self.embedding = nn.Embedding(input_dim, emb_dim)
        # By passing an object to 'rnn', nn.Module knows it's a RNN model
        # LSTM takes emb_dim, hid_dim, n_layers, and dropout as argument
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)
        # Takes dropout rate as input
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
        
        # src = [src len, batch size]
        # First, it converts source sentence into a group of word embeddings
        # Then, dropout some of them
        embedded = self.dropout(self.embedding(src))
        
        # embedded = [src len, batch size, emb dim]
        # By passing embedded sentences, nn.LSTM automatically does the forward computation
        # Then, it outputs the output of the last hidden cell
        outputs, (hidden, cell) = self.rnn(embedded)
        
        #outputs = [src len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #outputs are always from the top hidden layer
        return hidden, cell

In [34]:
# PyTorch Tensor: [# of matrices, # of rows of a matrix, # of columns of a matrix]
a = torch.tensor([[[1,2,3],
                   [4,5,6]],
                  [[7,8,9],
                   [10,11,12]],
                  [[13,14,15],
                   [16,17,18]],
                  [[19,20,21],
                   [22,23,24]]
                 ])
a.size()

torch.Size([4, 2, 3])

In [77]:
"""

Note: 
as we always have a sequence length of 1, we could use nn.LSTMCell, instead of nn.LSTM, 
as it is designed to handle a batch of inputs that aren't necessarily in a sequence. 
nn.LSTMCell is just a single cell and nn.LSTM is a wrapper around potentially multiple cells. 
Using the nn.LSTMCell in this case would mean we don't have to unsqueeze to add a fake sequence length dimension, 
but we would need one nn.LSTMCell per layer in the decoder and to ensure each nn.LSTMCell receives the correct initial hidden state from the encoder. 
All of this makes the code less concise - hence the decision to stick with the regular nn.LSTM.

"""

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        
        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        
        # The word embeddings layer, emb_dim specifies the dimension of word embeddings
        self.embedding = nn.Embedding(output_dim, emb_dim)
        # By passing an object to 'rnn', nn.Module knows it's a RNN model
        # LSTM takes emb_dim, hid_dim, n_layers, and dropout as argument
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)
        # Output function, which generates conditional probability distributions over the target vocabulary 
        # output_dim is equal to the size of the target vocabulary
        # fc stands for fully connected
        self.fc_out = nn.Linear(hid_dim, output_dim)
        # Takes dropout rate as input
        self.dropout = nn.Dropout(dropout)
    
    """
    
    In this tutorial, hidden states as in nodes in the hidden layer and cell states as in output node
    So at the beginning of the decoder, it takes the last cell output from the encoder.
    
    """
    def forward(self, input, hidden, cell):
        
        #input = [batch size]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #n directions in the decoder will both always be 1, therefore:
        #hidden = [n layers, batch size, hid dim]
        #context = [n layers, batch size, hid dim]
        # As we are only decoding one token at a time, the input tokens will always have a sequence length of 1. 
        # We unsqueeze the input tokens to add a sentence length dimension of 1.
        # We have to squeeze/unsqueeze bc of PyTorch's architecture
        print(f'current input size: {input.size()}')
        input = input.unsqueeze(0)
        print(f'input size after unsqueezed: {input.size()}')
        
        #input = [1, batch size]
        # Then, we embed a generated token; this line outputs word embedding of token from the previuos time step
        embedded = self.dropout(self.embedding(input))
        
        #embedded = [1, batch size, emb dim]
        # Then, similar to the encoder, we pass through an embedding layer and apply dropout.        
        # Passing (hidden, cell) is a way to tell rnn that it takes hidden and cell states from the previous time step
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        
        #output = [seq len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #seq len and n directions will always be 1 in the decoder, therefore:
        #output = [1, batch size, hid dim]
        #hidden = [n layers, batch size, hid dim]
        #cell = [n layers, batch size, hid dim]
        
        prediction = self.fc_out(output.squeeze(0))
        print(f'output size: {output.size()}\n, output size after squeezed: {output.squeeze(0).size()}')
        
        #prediction = [batch size, output dim]
        
        # It retures, prediction (predicted token at the current time step)
        # And returns a predicted token
        
        print(f'Prediction size: {prediction.size()}\n')
        return prediction, hidden, cell

In [36]:
print(a.squeeze(0).size())
print(a.unsqueeze(0).size())
print(a.view(2,12))

torch.Size([4, 2, 3])
torch.Size([1, 4, 2, 3])
tensor([[ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12],
        [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]])


In [37]:
t = torch.ones(2,1,1,2)
print(t.size())
# Squeeze gets rid of all the empty dimensions
print(torch.squeeze(t).size())
# The second argument specifies which dimension to remove: 0, 1, ... , N
print(torch.squeeze(t,1).size())
print(a.size())
# Unsqueeze insert empty dimension at which the second argument specifies 
print(torch.unsqueeze(a,0).size())
print(torch.unsqueeze(a,1).size())
print(torch.unsqueeze(a,2).size())

torch.Size([2, 1, 1, 2])
torch.Size([2, 2])
torch.Size([2, 1, 2])
torch.Size([4, 2, 3])
torch.Size([1, 4, 2, 3])
torch.Size([4, 1, 2, 3])
torch.Size([4, 2, 1, 3])


In [59]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"
        
    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        
        #src = [src len, batch size]
        #trg = [trg len, batch size]
        #teacher_forcing_ratio is probability to use teacher forcing
        #e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time
        
        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        #tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        
        #last hidden state of the encoder is used as the initial hidden state of the decoder
        hidden, cell = self.encoder(src)
        
        #first input to the decoder is the <sos> tokens
        input = trg[0,:]
        
        for t in range(1, trg_len):
            
            #insert input token embedding, previous hidden and previous cell states
            #receive output tensor (predictions) and new hidden and cell states
            output, hidden, cell = self.decoder(input, hidden, cell)
            
            #place predictions in a tensor holding predictions for each token
            outputs[t] = output
            
            #decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            
            #get the highest predicted token from our predictions
            top1 = output.argmax(1) 
            
            # if teacher forcing, use actual next token (in the target sentence) as next input
            # if not, use predicted token as next input
            # teacher forcing means that we use tokens in supervised (target) sentences as input to the next hidden state
            # non teacher forcing means that we use predicted token as input to the next hidden state
            input = trg[t] if teacher_force else top1
        
        return outputs

In [78]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

model = Seq2Seq(enc, dec, device).to(device)

In [79]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7855, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5)
  )
  (decoder): Decoder(
    (embedding): Embedding(5893, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=5893, bias=True)
    (dropout): Dropout(p=0.5)
  )
)

In [80]:
set([p.numel() for p in model.parameters() if p.requires_grad])
# 2010880 = src_voab * end_emb_dim = W_{x}, mapping one-hot vector into embedding
# 1508608 = trg_vocab * dec_emb_dim = W_{y}, mapping embedding into one-hot vector
# 524288 = hid_dim * emb_dim * 4 (=n_layer * 2 (enc + dec)) = W_{u}, weights between the hidden state and embeddings for each layer
# 5893 = output dimension size (one-hot vector) = W_{o}, weight for softmax layer
# 2048 = hid_dim * 4 = bias_{h} for hidden state
# 1048576 = hid_dim * hid_dim * 4 (=n_layer * 2 (enc + dec)) = W_{h}, weight for the recurrent state
# 3017216 = trg_vocab * hid_dim = W_{y'}, mapping previous token (one-hot vector) into the hidden state at next time step 

{2048, 5893, 524288, 1048576, 1508608, 2010880, 3017216}

In [81]:
optimizer = optim.Adam(model.parameters(), lr=0.001)

TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [82]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    # Loop through each batch 
    for i, batch in enumerate(iterator):
        
        # Source and Target sentences in a batch 
        src = batch.src
        trg = batch.trg
        
        # Reset gradient in the optimizer
        # grad stands for 'gradient' in PyTorchese
        optimizer.zero_grad()
        # Sentence predictions
        output = model(src, trg)
        
        # trg = [trg len, batch size]
        # output = [trg len, batch size, output dim]
        output_dim = output.shape[-1]
        
        # Reshape output predictions and target
        # -1 is a wildcard, but it seems like just a convension of PyTorch lol
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        
        # trg = [(trg len - 1) * batch size]
        # output = [(trg len - 1) * batch size, output dim]
        # Calculate cross entropy loss between prediction and observation
        loss = criterion(output, trg)
        # Backpropagatet the loss
        loss.backward()
        # Clip the gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        # Update parameters by step
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [83]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src = batch.src
            trg = batch.trg

            output = model(src, trg, 0) #turn off teacher forcing

            #trg = [trg len, batch size]
            #output = [trg len, batch size, output dim]

            output_dim = output.shape[-1]
            
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)

            #trg = [(trg len - 1) * batch size]
            #output = [(trg len - 1) * batch size, output dim]

            loss = criterion(output, trg)
            
            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [84]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [1]:
N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

# Output
#### current input size: torch.Size([32]) 
#### input size after unsqueezed: torch.Size([1, 32]) 
#### output size: torch.Size([1, 32, 512]), output size after squeezed: torch.Size([32, 512]) 
#### Prediction size: torch.Size([32, 5893])