# Imports



In [17]:
import torch
import torch.nn as nn

import ssl

import matplotlib.pyplot as plt
from datetime import datetime

# Query CUDA availability once and derive both the flag and the device string from it.
is_gpu_available = torch.cuda.is_available()
DEVICE = 'cuda' if is_gpu_available else 'cpu'

print('CUDA is available!  Training on GPU ...' if is_gpu_available
      else 'CUDA is not available.  Training on CPU ...')

# Folder (under the /content/drive mount point) that holds the ptb.*.txt data files.
DIR_PATH = '/content/drive/MyDrive/Deep learning 05107255/ex2_316168061_313471526'

CUDA is not available.  Training on CPU ...


In [5]:
# NOTE(review): google.colab is presumably only importable inside a Colab runtime - confirm
# before running this notebook elsewhere.
from google.colab import drive

# Mount Google Drive at /content/drive; DIR_PATH points to a folder below this mount point,
# so this cell has to run before any cell that reads the data files.
drive.mount('/content/drive')

Mounted at /content/drive


In [27]:
# Hyper-parameters shared by the data-loading, model-construction and training cells.
params = dict(
    batch_size=20,     # number of rows the token stream is reshaped into by load_data
    seq_length=20,     # number of consecutive tokens per row fed to the network in one step
    hidden_size=200,   # width of the LSTM hidden state
    num_layers=2,      # number of stacked LSTM layers
    embed_size=200,    # width of the word embeddings
    dropout=0.2,       # dropout probability used inside the network
    lr=1,              # initial learning rate handed to the optimizer
    lr_decay=1.2,      # learning-rate decay factor used to configure the StepLR scheduler
    max_grad_norm=3,   # gradient-norm clipping threshold used during training
    epochs=2,          # number of passes over the training data
)

# Load Data

In [8]:
# Paths of the three ptb.<split>.txt files stored under DIR_PATH.
train_file_name, val_file_name, test_file_name = (
    f'{DIR_PATH}/ptb.{split}.txt' for split in ('train', 'valid', 'test')
)

# Vocabulary mapping word -> integer id; empty until the splits are loaded.
word2idx = {}

def load_data(fname, batch_size, dictionary=None):
  """Read a whitespace-separated text file and return it as rows of word ids.

  Every newline is replaced by the marker token ``<eof>``. Words are mapped to
  integer ids through ``dictionary``; a word that is not yet present receives the
  next free id (the current size of the dictionary). The id sequence is truncated
  to a multiple of ``batch_size`` and reshaped to ``(batch_size, n)``.

  Args:
    fname: path of the text file to read.
    batch_size: number of rows of the returned tensor.
    dictionary: optional word -> id mapping to extend in place. When omitted, a
      new empty mapping is created for this call.

  Returns:
    A tuple ``(ids, dictionary)`` where ``ids`` is a ``torch.long`` tensor of shape
    ``(batch_size, len(tokens) // batch_size)`` and ``dictionary`` is the
    (possibly extended) word -> id mapping.
  """
  # A `{}` default would be created once and shared (and mutated) by every call
  # that omits the argument, so create the mapping per call instead.
  if dictionary is None:
    dictionary = {}

  with open(fname, 'r') as f:
    data_str = f.read()
  # Pad the marker with spaces so it always becomes its own token, even when a
  # line does not start/end with whitespace; split() collapses the extra spaces.
  data = data_str.replace('\n', ' <eof> ').split()
  print("Loading {}, size of data = {}".format(fname, len(data)))

  # setdefault() returns the existing id, or stores and returns the next free one.
  ids = [dictionary.setdefault(word, len(dictionary)) for word in data]
  x = torch.tensor(ids, dtype=torch.long)

  num_batches = x.size(0) // batch_size
  x = x[:num_batches * batch_size]
  # An explicit column count (instead of -1) keeps the reshape valid for empty input.
  return x.view(batch_size, num_batches), dictionary

# Build the vocabulary on the training split, starting from the empty `word2idx`
# created above (passed explicitly so re-running this cell never reuses a
# dictionary left over from a previous call), then keep extending the same
# mapping with the validation and test splits so all three share one id space.
train_data, word2idx = load_data(train_file_name, params['batch_size'], dictionary=word2idx)
val_data, word2idx = load_data(val_file_name, params['batch_size'], dictionary=word2idx)
test_data, word2idx = load_data(test_file_name, params['batch_size'], dictionary=word2idx)

# Number of distinct tokens seen across the three splits.
vocab_size = len(word2idx)
print(vocab_size)


Loading /content/drive/MyDrive/Deep learning 05107255/ex2_316168061_313471526/ptb.train.txt, size of data = 929589
10000
Loading /content/drive/MyDrive/Deep learning 05107255/ex2_316168061_313471526/ptb.valid.txt, size of data = 73760
10000
Loading /content/drive/MyDrive/Deep learning 05107255/ex2_316168061_313471526/ptb.test.txt, size of data = 82430
10000


# Define The Network

In [25]:
from torch.nn.modules import dropout
class Net(nn.Module):
  """Language model: embedding -> dropout -> multi-layer LSTM -> dropout -> linear layer."""

  def __init__(self, vocab_size, embed_size, hidden_size, num_layers, dropout_prob):
    """Create the sub-modules.

    Args:
      vocab_size: number of embedding rows and of output logits.
      embed_size: embedding width (LSTM input size).
      hidden_size: LSTM hidden state width.
      num_layers: number of stacked LSTM layers.
      dropout_prob: probability used by the LSTM's own dropout and by `self.dropout`.
    """
    super().__init__()
    self.n_hidden, self.n_layers = hidden_size, num_layers

    self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_size)
    self.lstm = nn.LSTM(
        input_size=embed_size,
        hidden_size=hidden_size,
        num_layers=num_layers,
        batch_first=True,
        dropout=dropout_prob,
    )
    self.dropout = nn.Dropout(p=dropout_prob)
    self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

  def init_hidden(self, batch_size):
    """Return a pair of all-zero tensors of shape (n_layers, batch_size, n_hidden).

    The tensors take their dtype and device from the module's first parameter.
    """
    reference = next(self.parameters()).data
    state_shape = (self.n_layers, batch_size, self.n_hidden)
    return reference.new_zeros(state_shape), reference.new_zeros(state_shape)

  def forward(self, x, hidden):
    """Run one step of the model.

    Args:
      x: integer ids that index the embedding table; the LSTM is batch-first.
      hidden: state tuple passed straight to the LSTM.

    Returns:
      `(logits, hidden)`, where `logits` has one row per input position
      (all leading dimensions flattened) and `vocab_size` columns.
    """
    embedded = self.dropout(self.embed(x))
    lstm_out, hidden = self.lstm(embedded, hidden)
    # Flatten every position into its own row before the output projection.
    flat = self.dropout(lstm_out).reshape(-1, self.n_hidden)
    return self.fc(flat), hidden

# Build the network and move it to DEVICE: the training/evaluation code moves every
# batch to DEVICE, so the parameters have to live there too. The optimizer is created
# afterwards so that it references the moved parameters.
model = Net(vocab_size, params['embed_size'], params['hidden_size'], params['num_layers'], params['dropout']).to(DEVICE)

criterion = nn.CrossEntropyLoss()
# NOTE(review): a learning rate of 1 is very large for Adam; confirm this is intended.
optimizer = torch.optim.Adam(model.parameters(), lr=params['lr'])
# StepLR multiplies the learning rate by `gamma` at every step. `lr_decay` is > 1, so its
# reciprocal is used to make the rate shrink (passing it directly would grow the rate).
# The scheduler only takes effect if `lr_scheduler.step()` is called once per epoch.
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=1 / params['lr_decay'])

# Functions for Training the Network

In [29]:
def evaluate(net, data, criterion, params, is_gpu_available):
  """Compute the mean per-batch loss of `net` over `data` without updating it.

  `data` is walked along its second dimension in steps of `params['seq_length']`.
  For each step the input is a window of `seq_length` columns and the target is
  the same window shifted one column to the right. The hidden state is carried
  from one window to the next.

  Args:
    net: model exposing `init_hidden(batch_size)`, `eval()` and `net(x, hidden)`.
    data: 2-D tensor of token ids; rows are independent streams.
    criterion: loss called as `criterion(output, flat_targets)`.
    params: mapping that provides `'seq_length'`.
    is_gpu_available: kept for backward compatibility; batches are placed on the
      device of the hidden state returned by `net.init_hidden`, so the flag is
      not needed to pick a device.

  Returns:
    The loss averaged over the batches that were actually processed, as a
    0-dim tensor.

  Raises:
    ValueError: if `data` has too few columns to form a single batch.
  """
  seq_length = params['seq_length']
  batch_size = data.size(0)
  hidden = net.init_hidden(batch_size)
  # Follow the model's device instead of a global, so inputs and weights always match.
  device = hidden[0].device
  net.eval()

  running_loss = 0.0
  num_batches = 0
  with torch.no_grad():
    for i in range(0, data.size(1) - seq_length, seq_length):
      x = data[:, i: i + seq_length].to(device)
      y = data[:, i + 1: i + 1 + seq_length].to(device)

      # Cut the hidden state loose from the previous window's history.
      hidden = tuple(each.detach() for each in hidden)

      output, hidden = net(x, hidden)
      running_loss += criterion(output, y.reshape(batch_size * seq_length))
      num_batches += 1

  if num_batches == 0:
    raise ValueError(
        'data has {} columns, which is too few for one batch of seq_length={}'.format(
            data.size(1), seq_length))

  # Divide by the number of processed batches: `data.size(1) // seq_length` is one
  # too many whenever the column count is an exact multiple of seq_length.
  return running_loss / num_batches

def train(net, data, criterion, opt, params, is_gpu_available):
  """Run one training pass of `net` over `data` and return the mean per-batch loss.

  `data` is walked along its second dimension in steps of `params['seq_length']`.
  For each step the input is a window of `seq_length` columns and the target is
  the same window shifted one column to the right. After each backward pass the
  gradient norm is clipped to `params['max_grad_norm']` and `opt.step()` is called.

  Args:
    net: model exposing `init_hidden(batch_size)`, `train()`, `zero_grad()`,
      `parameters()` and `net(x, hidden)`.
    data: 2-D tensor of token ids; rows are independent streams.
    criterion: loss called as `criterion(output, flat_targets)`.
    opt: optimizer whose `step()` updates `net`.
    params: mapping that provides `'seq_length'` and `'max_grad_norm'`.
    is_gpu_available: kept for backward compatibility; batches are placed on the
      device of the hidden state returned by `net.init_hidden`, so the flag is
      not needed to pick a device.

  Returns:
    `(net, opt, epoch_loss)` where `epoch_loss` is a Python float: the loss
    averaged over the batches that were actually processed.

  Raises:
    ValueError: if `data` has too few columns to form a single batch.
  """
  seq_length = params['seq_length']
  batch_size = data.size(0)
  hidden = net.init_hidden(batch_size)
  # Follow the model's device instead of a global, so inputs and weights always match.
  device = hidden[0].device
  net.train()

  running_loss = 0.0
  num_batches = 0
  for i in range(0, data.size(1) - seq_length, seq_length):
    x = data[:, i: i + seq_length].to(device)
    y = data[:, i + 1: i + 1 + seq_length].to(device)

    # Detach the hidden state, otherwise backward() would reach through
    # every window processed so far.
    hidden = tuple(each.detach() for each in hidden)
    net.zero_grad()
    output, hidden = net(x, hidden)
    loss = criterion(output, y.reshape(batch_size * seq_length))
    loss.backward()
    nn.utils.clip_grad_norm_(net.parameters(), params['max_grad_norm'])
    opt.step()

    running_loss += loss.item()
    num_batches += 1

  if num_batches == 0:
    raise ValueError(
        'data has {} columns, which is too few for one batch of seq_length={}'.format(
            data.size(1), seq_length))

  # Divide by the number of processed batches: `data.size(1) // seq_length` is one
  # too many whenever the column count is an exact multiple of seq_length.
  epoch_loss = running_loss / num_batches
  return net, opt, epoch_loss

def _perplexity(loss):
  """Return exp(loss) as a Python float (inf on overflow) for a float or 0-dim tensor loss."""
  return torch.exp(torch.tensor(float(loss), dtype=torch.float64)).item()


def training_loop(train_data, val_data, net, opt, criterion, params, is_gpu_available, scheduler=None):
  """Train `net` for `params['epochs']` epochs, validating after every epoch.

  Each epoch calls `train` on `train_data` and `evaluate` on `val_data`, records
  the two losses and their perplexities (exp of the loss) and prints one
  progress line.

  Args:
    train_data: token-id tensor forwarded to `train`.
    val_data: token-id tensor forwarded to `evaluate`.
    net: model to optimise.
    opt: optimizer forwarded to `train`.
    criterion: loss function forwarded to `train` and `evaluate`.
    params: mapping that provides `'epochs'` plus the keys `train`/`evaluate` read.
    is_gpu_available: forwarded unchanged to `train` and `evaluate`.
    scheduler: optional learning-rate scheduler; when given, `scheduler.step()`
      is called once at the end of every epoch.

  Returns:
    `(net, opt, (train_losses, val_losses), (train_perp_vec, val_perp_vec))`;
    the four lists hold one Python float per epoch.
  """
  # Per-epoch metric histories.
  train_losses, val_losses = [], []
  train_perp_vec, val_perp_vec = [], []

  print(f'{datetime.now().time().replace(microsecond=0)} START')

  for epoch in range(params['epochs']):
    # training
    net, opt, train_loss = train(net, train_data, criterion, opt, params, is_gpu_available)
    # float() accepts both a Python float and a 0-dim tensor, so the metrics are
    # stored uniformly regardless of what train/evaluate return.
    train_loss = float(train_loss)
    train_perp = _perplexity(train_loss)
    train_losses.append(train_loss)
    train_perp_vec.append(train_perp)

    # validation (evaluate returns a single loss value, not a tuple)
    val_loss = float(evaluate(net, val_data, criterion, params, is_gpu_available))
    val_perp = _perplexity(val_loss)
    val_losses.append(val_loss)
    val_perp_vec.append(val_perp)

    if scheduler is not None:
      scheduler.step()

    print(
        f'{datetime.now().time().replace(microsecond=0)} --- '
        f'Epoch: {epoch}\t'
        f'Train loss: {train_loss:.4f}\t'
        f'Val loss: {val_loss:.4f}\t'
        f'Train perplexity: {train_perp:.4f}\t'
        f'Val perplexity: {val_perp:.4f}'
    )

  return net, opt, (train_losses, val_losses), (train_perp_vec, val_perp_vec)



In [None]:
# Keep what the loop returns (trained model, optimizer and per-epoch metric histories)
# so later cells can plot or reuse it, instead of letting the result fall into the cell output.
model, optimizer, (train_losses, val_losses), (train_perps, val_perps) = training_loop(
    train_data, val_data, model, optimizer, criterion, params, is_gpu_available)

20:30:49 START
