# Imports



In [69]:
import os
import numpy as np
import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter

import ssl

import matplotlib.pyplot as plt
from datetime import datetime

# check if CUDA is available
is_gpu_available = torch.cuda.is_available()

if not is_gpu_available:
    print('CUDA is not available.  Training on CPU ...')
else:
    print('CUDA is available!  Training on GPU ...')

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

DIR_PATH = '/content/drive/MyDrive/Deep learning 05107255/ex2_316168061_313471526'

CUDA is available!  Training on GPU ...


In [2]:
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
params = {'batch_size'    : 20,
          'seq_length'    : 20,
          'hidden_size'   : 200,
          'num_layers'    : 2,
          'embed_size'    : 200,
          'dropout'       : 0.2,
          'lr'            : 0.01,
          'lr_decay'      : 1.3,
          'max_grad_norm' : 2,
          'epochs'        : 5
          }

# Load Data

In [25]:
class Dictionary(object):
  def __init__(self):
    self.word2idx = {}
    self.idx2word = []

  def add_word(self, word):
    if word not in self.word2idx:
      self.idx2word.append(word)
      self.word2idx[word] = len(self.idx2word) - 1

  def __len__(self):
    return len(self.word2idx)


class Corpus(object):
  def __init__(self, data_dir_path):
    self.dictionary = Dictionary()
    self.train = self.tokenize(os.path.join(data_dir_path, 'ptb.train.txt'))
    self.valid = self.tokenize(os.path.join(data_dir_path, 'ptb.valid.txt'))
    self.test = self.tokenize(os.path.join(data_dir_path, 'ptb.test.txt'))

  def tokenize(self, fname):
    with open(fname, 'r') as f:
      data_str = f.read()
      words = data_str.replace('\n', '<eof>')
      words = words.split()
      ids = torch.LongTensor(len(words))
      for i, word in enumerate(words):
        self.dictionary.add_word(word)
        ids[i] = self.dictionary.word2idx[word]
    return ids

corpus = Corpus(DIR_PATH)
vocab_size = len(corpus.dictionary)

In [71]:
def batch_generator(data, batch_size, seq_length):
  mini_batch_size = batch_size * seq_length
  n_mini_batches = len(data) // mini_batch_size
  # Trucate data
  # TODO add possibility to pad
  data = data[:n_mini_batches * mini_batch_size]
  data = data.reshape((batch_size, -1))
  for i in range(0, data.size(1), seq_length):
    x = data[:, i:i + seq_length]
    y = torch.zeros_like(x)
    if i + seq_length < data.size(1):
      y[:, :-1], y[:, -1] = x[:, 1:], data[:, i + seq_length]
    else:
      y[:, :-1], y[:, -1] = x[:, 1:], data[:, 0].roll(-1, 0)
    yield x, y


In [4]:
# train_file_name = f'{DIR_PATH}/ptb.train.txt'
# val_file_name = f'{DIR_PATH}/ptb.valid.txt'
# test_file_name = f'{DIR_PATH}/ptb.test.txt'

# word2idx = {}

# def load_data(fname, batch_size, dictionary={}):
#   with open(fname, 'r') as f:
#     data_str = f.read()
#   data = data_str.replace('\n', '<eof>')
#   data = data.split()
#   print("Loading {}, size of data = {}".format(fname, len(data)))

#   x = torch.LongTensor(len(data))
#   vocab_idx = len(dictionary)
#   for i in range(len(data)):
#     if data[i] not in dictionary:
#       dictionary[data[i]] = vocab_idx
#       vocab_idx += 1
#     x[i] = dictionary[data[i]]

#   num_batches = x.size(0) // batch_size
#   x = x[:num_batches * batch_size]
#   return x.view(batch_size, -1), dictionary

# train_data, word2idx = load_data(train_file_name, params['batch_size'])
# val_data, word2idx = load_data(val_file_name, params['batch_size'], dictionary=word2idx)
# test_data, word2idx = load_data(test_file_name, params['batch_size'], dictionary=word2idx)


Loading /content/drive/MyDrive/Deep learning 05107255/ex2_316168061_313471526/ptb.train.txt, size of data = 929589
46479
torch.Size([929580])
Loading /content/drive/MyDrive/Deep learning 05107255/ex2_316168061_313471526/ptb.valid.txt, size of data = 73760
3688
torch.Size([73760])
Loading /content/drive/MyDrive/Deep learning 05107255/ex2_316168061_313471526/ptb.test.txt, size of data = 82430
4121
torch.Size([82420])


# Define The Network

In [66]:
from torch.nn.modules import dropout
class Net(nn.Module):
  def __init__(self, vocab_size, embed_size, hidden_size, num_layers, dropout_prob, use_lstm=True):
    super(Net, self).__init__()
    self.n_hidden = hidden_size
    self.n_layers = num_layers
    self.use_lstm = use_lstm

    self.embed = nn.Embedding(vocab_size, embed_size)
    if use_lstm:
      self.rnn = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True, dropout=dropout_prob)
    else:
      self.rnn = nn.GRU(embed_size, hidden_size, num_layers, batch_first=True, dropout=dropout_prob)
    self.dropout = nn.Dropout(dropout_prob)
    self.fc = nn.Linear(hidden_size, vocab_size)

  def init_hidden(self, batch_size):
    weight = next(self.parameters()).data
    if self.use_lstm:
      hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
                weight.new(self.n_layers, batch_size, self.n_hidden).zero_())
    else:
      hidden = weight.new(self.n_layers, batch_size, self.n_hidden).zero_()
    return hidden

  def forward(self, x, hidden):
    x = self.dropout(self.embed(x))
    out, hidden = self.rnn(x, hidden)
    out = self.dropout(out)
    out = out.reshape(-1, self.n_hidden)
    out = self.fc(out)
    return out, hidden

def repackage_hidden(hidden):
  if type(hidden) == torch.Tensor:
    hidden = hidden.data
  else:
    hidden = tuple([h.data for h in hidden])
  return hidden

model = Net(vocab_size, params['embed_size'], params['hidden_size'], params['num_layers'], params['dropout']).to(DEVICE)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=params['lr'])
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma = params['lr_decay'])

# Functions For Training the network

In [68]:
def evaluate(net, data, criterion, params, is_gpu_available):
  running_loss = 0.0
  batch_size = params['batch_size']
  hidden = net.init_hidden(batch_size)
  net.eval()
  with torch.no_grad():
    for x, y in batch_generator(data, params['batch_size'], params['seq_length']):
      if is_gpu_available:
        x, y = x.cuda(), y.cuda()

      # Creating new variables for the hidden state, otherwise
      # we'd backprop through the entire training history
      hidden = repackage_hidden(hidden)

      output, hidden = net(x, hidden)
      loss = criterion(output, y.reshape(batch_size*params['seq_length']))

      running_loss += loss.item()
  
  valid_loss = running_loss / (len(data) // (params['seq_length'] * params['batch_size']))
  return valid_loss

def train(net, data, criterion, opt, params, is_gpu_available):
  running_loss = 0.0
  batch_size = params['batch_size']
  hidden = net.init_hidden(batch_size)
  net.train()
  batch_gen = batch_generator(data, params['batch_size'], params['seq_length'])
  for x, y in batch_gen:
    if is_gpu_available:
      x, y = x.cuda(), y.cuda()

    # Creating new variables for the hidden state, otherwise
    # we'd backprop through the entire training history
    hidden = repackage_hidden(hidden)
    net.zero_grad()
    output, hidden = net(x, hidden)
    loss = criterion(output, y.reshape(batch_size*params['seq_length']))
    loss.backward()
    nn.utils.clip_grad_norm_(net.parameters(), params['max_grad_norm'])
    opt.step()

    running_loss += loss.item()
  
  epoch_loss = running_loss / (len(data) // (params['seq_length'] * params['batch_size']))
  return net, opt, epoch_loss

def training_loop(train_data, val_data, net, opt, criterion, scheduler, params, is_gpu_available):
  # set objects for storing metrics
    train_losses = []
    val_losses = []
    train_perp_vec = []
    val_perp_vec = []
    tb = SummaryWriter()

    print(f'{datetime.now().time().replace(microsecond=0)} START')

    # Train model
    for epoch in range(0, params['epochs']):
      net, optimizer, train_loss = train(net, train_data, criterion, opt, params, is_gpu_available)
      train_perp = np.exp(train_loss)
      train_losses.append(train_loss)
      train_perp_vec.append(train_perp)
      scheduler.step()

      # validation
      val_loss = evaluate(net, val_data, criterion, params, is_gpu_available)
      val_perp = np.exp(val_loss)
      val_losses.append(val_loss)
      val_perp_vec.append(val_perp)

      tb.add_scalar("Train Loss", train_loss, epoch)
      tb.add_scalar("Valid Loss", val_loss, epoch)
      tb.add_scalar("Train Perplexity", train_perp, epoch)
      tb.add_scalar("Valid Perplexity", val_perp, epoch)

      print(
          f'{datetime.now().time().replace(microsecond=0)} --- '
          f'Epoch: {epoch}\t'
          f'Train loss: {train_loss:.4f}\t'
          f'Val loss: {val_loss:.4f}\t'
          f'Train perplexity: {train_perp:.4f}\t'
          f'Test perplexity: {val_perp:.4f}'
        )
    
    tb.close()
    return net, opt, (train_losses, val_losses), (train_perp_vec, val_perp_vec)



# Plotting & Helper Functions

In [8]:
def plot_train_and_valid_losses(train_losses, test_losses, train_type=None):
  train_losses = np.array(train_losses) 
  test_losses = np.array(test_losses)

  fig, ax = plt.subplots(figsize = (8, 4.5))

  ax.plot(train_losses, color='blue', label='Training loss') 
  ax.plot(test_losses, color='red', label='Validation loss')
  title = "Loss over epochs"
  if train_type:
    title += "\n" + train_type
  ax.set(title=title, 
          xlabel='Epoch',
          ylabel='Loss') 
  ax.legend()
  plt.grid()
  fig.show()
    # --------------------------------------------------------------------------------------------------
def plot_train_and_valid_perp(train_perp, valid_perp, train_type=None):
  train_perp = np.array(train_perp) 
  valid_perp = np.array(valid_perp)

  fig, ax = plt.subplots(figsize = (8, 4.5))

  ax.plot(train_perp, color='blue', label='Training Perplexity') 
  ax.plot(valid_perp, color='red', label='Validation Perplexity')
  title = "Perplexity over epochs"
  if train_type:
    title += "\n" + train_type
  ax.set(title=title, 
          xlabel='Epoch',
          ylabel='Perplexity') 
  ax.legend()
  fig.show()
  
  return fig
# --------------------------------------------------------------------------------------------------
def save_model(model, optimizer, dir_path, train_type=None):
  # create checkpints folder
  checkpoint_folder = os.path.join(dir_path, 'checkpoints')
  os.makedirs(checkpoint_folder, exist_ok=True)
  # save model
  curr_datetime = datetime.now().strftime("%y_%m_%d_%H:%M:%S")
  model_name = f'{curr_datetime}'.replace(':', '_')
  if train_type:
    model_name = train_type + '_' + model_name
    # model_name = train_type
  ckpt_path = os.path.join(checkpoint_folder, f'{model_name}.ckpt')
  torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'use_dropout': model.use_dropout,
    'use_batchnorm': model.use_batchnorm,
    
  }, ckpt_path)

  print(f'model saved to "{ckpt_path}"')

# Train the Network

In [70]:
model = Net(vocab_size, params['embed_size'], params['hidden_size'], params['num_layers'], params['dropout'], use_lstm=True).to(DEVICE)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=params['lr'])
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=params['lr_decay'])
model = training_loop(corpus.train, corpus.valid, model, optimizer, criterion, lr_scheduler, params, is_gpu_available)

19:42:09 START
19:42:57 --- Epoch: 0	Train loss: 5.8985	Val loss: 5.4915	Train perplexity: 364.4724	Test perplexity: 242.6160
19:43:42 --- Epoch: 1	Train loss: 5.5780	Val loss: 5.4562	Train perplexity: 264.5393	Test perplexity: 234.2148
19:44:27 --- Epoch: 2	Train loss: 5.4478	Val loss: 5.4543	Train perplexity: 232.2532	Test perplexity: 233.7622
19:45:11 --- Epoch: 3	Train loss: 5.4435	Val loss: 5.5411	Train perplexity: 231.2542	Test perplexity: 254.9671
19:45:56 --- Epoch: 4	Train loss: 5.5174	Val loss: 5.6654	Train perplexity: 248.9944	Test perplexity: 288.6917
