
# 6.4 Exercises

To get better classification results from the RNN, you can explore the following architectures:

   - 1) Stack more ``nn.RNN`` layers
   - 2) Use ``nn.LSTM`` layers
   - 3) Use ``nn.GRU`` layers


In [None]:
%matplotlib inline

# import library
from __future__ import unicode_literals, print_function, division
from io import open
# import the operating system module
import os
# glob module finds all the pathnames matching a specified pattern
import glob
import unicodedata
import string
import random
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import torch
import torch.nn as nn
from torch.utils.data import Dataset
# Set device: use the GPU when CUDA is available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Vocabulary used for the one-hot encoding: ASCII letters plus a few
# punctuation characters; a character's position in this string is its index.
all_letters = string.ascii_letters + " .,;'"
# Width of every one-hot letter vector
n_letters = len(all_letters)

# Turn a Unicode string to plain ASCII
def unicodeToAscii(s):
    """Return ``s`` reduced to the characters of ``all_letters``.

    The string is NFD-normalized so accented letters split into a base
    character plus combining marks; the marks (category ``Mn``) are dropped,
    and so is every remaining character that is not in ``all_letters``.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = []
    for ch in decomposed:
        if unicodedata.category(ch) == 'Mn':
            # combining mark left over from the decomposition
            continue
        if ch in all_letters:
            kept.append(ch)
    return ''.join(kept)

# Find letter index from all_letters, index of n_letters
def letterToIndex(letter):
    """Return the position of ``letter`` in ``all_letters``.

    The lookup uses ``str.find``, so a letter that is not part of the
    vocabulary yields ``-1`` rather than raising.
    """
    position = all_letters.find(letter)
    return position

# Just for demonstration, turn a letter into a <1 x n_letters> Tensor
def letterToTensor(letter):
    """Return a ``<1 x n_letters>`` one-hot tensor for a single letter."""
    one_hot = torch.zeros(1, n_letters)
    # light up the column that belongs to this letter
    one_hot[0, letterToIndex(letter)] = 1
    return one_hot

# Turn a line into a <line_length x 1 x n_letters>,
# or an array of one-hot letter vectors
def lineToTensor(line):
    """Encode ``line`` as a ``<len(line) x 1 x n_letters>`` one-hot tensor.

    Row ``k`` holds the one-hot vector of the k-th character. The column comes
    from ``letterToIndex``; for a character outside the vocabulary that is
    ``-1``, which addresses the last column.
    """
    encoded = torch.zeros(len(line), 1, n_letters)
    for position, character in enumerate(line):
        encoded[position, 0, letterToIndex(character)] = 1
    return encoded

def label_from_output(output, output_labels):
    """Translate a model output into ``(label, label_index)``.

    The index of the largest entry of ``output`` (via ``topk(1)``) is looked
    up in ``output_labels``; both the label and the index are returned.
    """
    _, best_indices = output.topk(1)
    best = best_indices[0].item()
    predicted_label = output_labels[best]
    return predicted_label, best

class NamesDataset(Dataset):
    """Names labelled by the ``.txt`` file they were read from.

    Every ``<label>.txt`` file directly inside ``data_dir`` contributes one
    example per line; the file name without its extension is the label.

    Attributes filled by ``__init__``:
        data:            the names exactly as read from disk
        data_tensors:    one-hot encodings (``lineToTensor``) of the names
                         after ASCII folding with ``unicodeToAscii``
        labels:          the string label of every name
        labels_tensors:  the label of every name as a 1-element long tensor
        labels_uniq:     sorted list of distinct labels; a label's position
                         in this list is its numeric class id
    """

    def __init__(self, data_dir):
        self.data_dir = data_dir  # for provenance of the dataset

        self.data = []
        self.data_tensors = []
        self.labels = []
        self.labels_tensors = []

        # read all the ``.txt`` files in the specified directory; sorted so the
        # example order (and therefore any seeded split) does not depend on
        # the order in which the filesystem lists the files
        text_files = sorted(glob.glob(os.path.join(data_dir, '*.txt')))
        for filename in text_files:
            label = os.path.splitext(os.path.basename(filename))[0]
            # context manager so the file handle is closed deterministically
            with open(filename, encoding='utf-8') as handle:
                lines = handle.read().strip().split('\n')
            for name in lines:
                self.data.append(name)
                # Fold to the ASCII vocabulary before encoding: a character
                # outside ``all_letters`` would otherwise get index -1 and
                # silently set the last one-hot column.
                self.data_tensors.append(lineToTensor(unicodeToAscii(name)))
                self.labels.append(label)

        # Convert the labels from string to number. Sorting makes the class
        # ids stable between runs (plain set ordering is not), and the dict
        # replaces a linear ``list.index`` search per example.
        self.labels_uniq = sorted(set(self.labels))
        label_to_index = {label: idx for idx, label in enumerate(self.labels_uniq)}
        self.labels_tensors = [
            torch.tensor([label_to_index[label]], dtype=torch.long)
            for label in self.labels
        ]

    def __len__(self):
        """Return the number of names in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(label_tensor, data_tensor, label, name)`` for example ``idx``."""
        data_item = self.data[idx]
        data_label = self.labels[idx]
        data_tensor = self.data_tensors[idx]
        label_tensor = self.labels_tensors[idx]

        return label_tensor, data_tensor, data_label, data_item

alldata = NamesDataset("data/names")

# random_split draws its shuffling permutation on the default (CPU) device, so
# the seeding generator has to be a CPU generator. Building it on ``device``
# yields a CUDA generator on GPU machines, which that CPU call rejects.
split_generator = torch.Generator().manual_seed(2024)
# 85% of the names for training, the remaining 15% held out
train_set, test_set = torch.utils.data.random_split(alldata, [.85, .15], generator=split_generator)

print(f"train examples = {len(train_set)}, validation examples = {len(test_set)}")

## Exercise 1

Create and train an architecture with more ``nn.RNN`` layers.



In [None]:
class RNN(nn.Module):
    # Exercise placeholder for a classifier built from stacked ``nn.RNN`` layers.
    # What the rest of this notebook requires from the class:
    #   * it is constructed as RNN(n_letters, n_hidden, num_layers, n_categories)
    #   * forward() receives one encoded name (the output of lineToTensor)
    #   * its output is fed to nn.NLLLoss together with a 1-element label
    #     tensor, and to label_from_output, which reads ``output.topk(1)``
    # NOTE(review): NLLLoss expects log-probabilities, so forward() presumably
    # has to end in a log-softmax over the categories -- confirm when implementing.
    ## CODE HERE

# Derive both sizes instead of hard-coding 57 and 18, so the model always
# matches the one-hot vocabulary and the labels that were actually loaded.
n_letters = len(all_letters)
n_categories = len(alldata.labels_uniq)
print("number of letters: ", n_letters)
print("number of categories: ", n_categories)

# set the hidden neurons
n_hidden = 256
# number of RNN layers
num_layers = 2
# instantiate the RNN model
rnn = RNN(n_letters, n_hidden, num_layers, n_categories)

print(rnn)

In [None]:
# loss function: negative log-likelihood
# NOTE(review): NLLLoss expects log-probabilities as input -- confirm the
# model's forward() ends in a log-softmax.
criterion = nn.NLLLoss()

# If you set this too high, it might explode. If too low, it might not learn
learning_rate = 0.15

# Plain SGD over the parameters of the current ``rnn``. train() reads
# ``criterion`` and ``optimizer`` as globals, so this cell must be re-run
# whenever ``rnn`` is replaced by a new model.
optimizer = torch.optim.SGD(rnn.parameters(), lr=learning_rate)

In [None]:
def train(rnn, training_data, n_epoch=10, n_batch_size = 64):
    """Train ``rnn`` with mini-batch gradient descent and return the loss history.

    Relies on the module-level ``criterion`` (loss function) and ``optimizer``
    (already bound to ``rnn``'s parameters).

    Args:
        rnn: the model to train; called with one encoded name at a time.
        training_data: indexable collection whose items are
            ``(label_tensor, text_tensor, label, text)`` tuples.
        n_epoch: number of passes over ``training_data``.
        n_batch_size: approximate number of examples per optimizer step.

    Returns:
        A list with one entry per epoch: the mean over that epoch's batches
        of the per-example average loss.

    Raises:
        ValueError: if ``training_data`` is empty.
    """
    n_examples = len(training_data)
    if n_examples == 0:
        raise ValueError("training_data is empty")
    # The plain floor division is 0 when there are fewer examples than
    # n_batch_size, and np.array_split rejects 0 sections; keep at least one.
    n_batches = max(1, n_examples // n_batch_size)

    all_losses = []
    rnn.train()

    for epoch in range(1, n_epoch + 1):
        rnn.zero_grad() # clear the gradients

        # create some minibatches
        # we cannot use dataloaders because each of our names is a different length!!
        order = list(range(n_examples))
        random.shuffle(order)
        batches = np.array_split(order, n_batches)

        current_loss = 0
        for batch in batches:
            batch_loss = 0
            for i in batch: # for each example in this batch
                label_tensor, text_tensor, _, _ = training_data[i]
                # call the module itself (not .forward) so registered hooks run
                output = rnn(text_tensor)
                batch_loss += criterion(output, label_tensor)

            # optimize parameters on the summed loss of the batch
            batch_loss.backward()
            nn.utils.clip_grad_norm_(rnn.parameters(), 3)  # this is to facilitate convergence
            optimizer.step()
            optimizer.zero_grad()

            current_loss += batch_loss.item() / len(batch)

        all_losses.append(current_loss / len(batches))
        if epoch % 5 == 0:
            print(f"{epoch} ({epoch / n_epoch:.0%}): \t average batch loss = {all_losses[-1]}")

    return all_losses

def evaluate(rnn, testing_data, classes):
    """Plot the row-normalized confusion matrix of ``rnn`` on ``testing_data``.

    Rows are the true classes and columns the predicted ones, both in the
    order given by ``classes``. Each non-empty row is divided by its sum.
    The figure is shown with ``plt.show()``; nothing is returned.
    """
    n_classes = len(classes)
    confusion = torch.zeros(n_classes, n_classes)

    rnn.eval() # inference mode for the model
    with torch.no_grad(): # no gradient bookkeeping while evaluating
        for sample_idx in range(len(testing_data)):
            _, text_tensor, label, _ = testing_data[sample_idx]
            _, predicted_idx = label_from_output(rnn(text_tensor), classes)
            true_idx = classes.index(label)
            confusion[true_idx][predicted_idx] += 1

    # Turn counts into per-row fractions; rows without samples stay all-zero
    for row_idx, row_total in enumerate(confusion.sum(dim=1)):
        if row_total > 0:
            confusion[row_idx] = confusion[row_idx] / row_total

    # Draw the matrix with a colour scale
    fig = plt.figure()
    ax = fig.add_subplot(111)
    image = ax.matshow(confusion.cpu().numpy()) # numpy needs a CPU tensor
    fig.colorbar(image)

    # One labelled tick per class on both axes
    tick_positions = np.arange(n_classes)
    ax.set_xticks(tick_positions, labels=classes, rotation=90)
    ax.set_yticks(tick_positions, labels=classes)

    # Force label at every tick
    for axis in (ax.xaxis, ax.yaxis):
        axis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()

In [None]:
# 27 passes over the training split in mini-batches of about 64 names;
# all_losses receives one average loss per epoch
all_losses = train(rnn, train_set, n_epoch=27, n_batch_size=64)

In [None]:
# show the row-normalized confusion matrix on the held-out split
evaluate(rnn, test_set, classes=alldata.labels_uniq)

## Exercise 2

Create and train an architecture using [LSTM layers](https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html)



In [None]:
# Recurrent classifier built on nn.LSTM(): reads a whole encoded name and
# produces a single prediction for it
class LSTMRNN(nn.Module):
    # Exercise placeholder. What the rest of this notebook requires:
    #   * it is constructed as LSTMRNN(n_letters, n_hidden, num_layers, n_categories)
    #   * forward() receives one encoded name (the output of lineToTensor)
    #   * its output is fed to nn.NLLLoss together with a 1-element label
    #     tensor, and to label_from_output, which reads ``output.topk(1)``
    # NOTE(review): NLLLoss expects log-probabilities, so forward() presumably
    # has to end in a log-softmax over the categories -- confirm when implementing.
    ## CODE HERE

In [None]:
# Derive both sizes instead of hard-coding 57 and 18, so the model always
# matches the one-hot vocabulary and the labels that were actually loaded.
n_letters = len(all_letters)
n_categories = len(alldata.labels_uniq)
print("number of letters: ", n_letters)
print("number of categories: ", n_categories)

# set the hidden neurons
n_hidden = 256
# number of LSTM layers
num_layers = 2

# instantiate the LSTM model
rnn = LSTMRNN(n_letters, n_hidden, num_layers, n_categories)

print(rnn)

In [None]:
# loss function: negative log-likelihood
# NOTE(review): NLLLoss expects log-probabilities as input -- confirm the
# model's forward() ends in a log-softmax.
criterion = nn.NLLLoss()

# If you set this too high, it might explode. If too low, it might not learn
learning_rate = 0.15

# Fresh SGD optimizer bound to the parameters of the model currently held in
# ``rnn``; train() picks up ``criterion`` and ``optimizer`` as globals.
optimizer = torch.optim.SGD(rnn.parameters(), lr=learning_rate)

In [None]:
def train(rnn, training_data, n_epoch=10, n_batch_size = 64):
    """Train ``rnn`` with mini-batch gradient descent and return the loss history.

    Relies on the module-level ``criterion`` (loss function) and ``optimizer``
    (already bound to ``rnn``'s parameters).

    Args:
        rnn: the model to train; called with one encoded name at a time.
        training_data: indexable collection whose items are
            ``(label_tensor, text_tensor, label, text)`` tuples.
        n_epoch: number of passes over ``training_data``.
        n_batch_size: approximate number of examples per optimizer step.

    Returns:
        A list with one entry per epoch: the mean over that epoch's batches
        of the per-example average loss.

    Raises:
        ValueError: if ``training_data`` is empty.
    """
    n_examples = len(training_data)
    if n_examples == 0:
        raise ValueError("training_data is empty")
    # The plain floor division is 0 when there are fewer examples than
    # n_batch_size, and np.array_split rejects 0 sections; keep at least one.
    n_batches = max(1, n_examples // n_batch_size)

    all_losses = []
    rnn.train()

    for epoch in range(1, n_epoch + 1):
        rnn.zero_grad() # clear the gradients

        # create some minibatches
        # we cannot use dataloaders because each of our names is a different length!!
        order = list(range(n_examples))
        random.shuffle(order)
        batches = np.array_split(order, n_batches)

        current_loss = 0
        for batch in batches:
            batch_loss = 0
            for i in batch: # for each example in this batch
                label_tensor, text_tensor, _, _ = training_data[i]
                # call the module itself (not .forward) so registered hooks run
                output = rnn(text_tensor)
                batch_loss += criterion(output, label_tensor)

            # optimize parameters on the summed loss of the batch
            batch_loss.backward()
            nn.utils.clip_grad_norm_(rnn.parameters(), 3)  # this is to facilitate convergence
            optimizer.step()
            optimizer.zero_grad()

            current_loss += batch_loss.item() / len(batch)

        all_losses.append(current_loss / len(batches))
        if epoch % 5 == 0:
            print(f"{epoch} ({epoch / n_epoch:.0%}): \t average batch loss = {all_losses[-1]}")

    return all_losses

def evaluate(rnn, testing_data, classes):
    """Plot the row-normalized confusion matrix of ``rnn`` on ``testing_data``.

    Rows are the true classes and columns the predicted ones, both in the
    order given by ``classes``. Each non-empty row is divided by its sum.
    The figure is shown with ``plt.show()``; nothing is returned.
    """
    n_classes = len(classes)
    confusion = torch.zeros(n_classes, n_classes)

    rnn.eval() # inference mode for the model
    with torch.no_grad(): # no gradient bookkeeping while evaluating
        for sample_idx in range(len(testing_data)):
            _, text_tensor, label, _ = testing_data[sample_idx]
            _, predicted_idx = label_from_output(rnn(text_tensor), classes)
            true_idx = classes.index(label)
            confusion[true_idx][predicted_idx] += 1

    # Turn counts into per-row fractions; rows without samples stay all-zero
    for row_idx, row_total in enumerate(confusion.sum(dim=1)):
        if row_total > 0:
            confusion[row_idx] = confusion[row_idx] / row_total

    # Draw the matrix with a colour scale
    fig = plt.figure()
    ax = fig.add_subplot(111)
    image = ax.matshow(confusion.cpu().numpy()) # numpy needs a CPU tensor
    fig.colorbar(image)

    # One labelled tick per class on both axes
    tick_positions = np.arange(n_classes)
    ax.set_xticks(tick_positions, labels=classes, rotation=90)
    ax.set_yticks(tick_positions, labels=classes)

    # Force label at every tick
    for axis in (ax.xaxis, ax.yaxis):
        axis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()

In [None]:
# 27 passes over the training split in mini-batches of about 64 names;
# all_losses receives one average loss per epoch
all_losses = train(rnn, train_set, n_epoch=27, n_batch_size=64)

In [None]:
# show the row-normalized confusion matrix on the held-out split
evaluate(rnn, test_set, classes=alldata.labels_uniq)

## Exercise 3

Create and train an architecture using [GRU layers](https://pytorch.org/docs/stable/generated/torch.nn.GRU.html)

In [None]:
# Recurrent classifier built on nn.GRU(): reads a whole encoded name and
# produces a single prediction for it
class GRURNN(nn.Module):
    # Exercise placeholder. What the rest of this notebook requires:
    #   * it is constructed as GRURNN(n_letters, n_hidden, num_layers, n_categories)
    #   * forward() receives one encoded name (the output of lineToTensor)
    #   * its output is fed to nn.NLLLoss together with a 1-element label
    #     tensor, and to label_from_output, which reads ``output.topk(1)``
    # NOTE(review): NLLLoss expects log-probabilities, so forward() presumably
    # has to end in a log-softmax over the categories -- confirm when implementing.
    ## CODE HERE

In [None]:
# Derive both sizes instead of hard-coding 57 and 18, so the model always
# matches the one-hot vocabulary and the labels that were actually loaded.
n_letters = len(all_letters)
n_categories = len(alldata.labels_uniq)
print("number of letters: ", n_letters)
print("number of categories: ", n_categories)

# set the hidden neurons
n_hidden = 256
# number of GRU layers
num_layers = 2

# instantiate the GRU model
rnn = GRURNN(n_letters, n_hidden, num_layers, n_categories)
print(rnn)

In [None]:
# loss function: negative log-likelihood
# NOTE(review): NLLLoss expects log-probabilities as input -- confirm the
# model's forward() ends in a log-softmax.
criterion = nn.NLLLoss()

# If you set this too high, it might explode. If too low, it might not learn
learning_rate = 0.15

# Fresh SGD optimizer bound to the parameters of the model currently held in
# ``rnn``; train() picks up ``criterion`` and ``optimizer`` as globals.
optimizer = torch.optim.SGD(rnn.parameters(), lr=learning_rate)

In [None]:
def train(rnn, training_data, n_epoch=10, n_batch_size = 64):
    """Train ``rnn`` with mini-batch gradient descent and return the loss history.

    Relies on the module-level ``criterion`` (loss function) and ``optimizer``
    (already bound to ``rnn``'s parameters).

    Args:
        rnn: the model to train; called with one encoded name at a time.
        training_data: indexable collection whose items are
            ``(label_tensor, text_tensor, label, text)`` tuples.
        n_epoch: number of passes over ``training_data``.
        n_batch_size: approximate number of examples per optimizer step.

    Returns:
        A list with one entry per epoch: the mean over that epoch's batches
        of the per-example average loss.

    Raises:
        ValueError: if ``training_data`` is empty.
    """
    n_examples = len(training_data)
    if n_examples == 0:
        raise ValueError("training_data is empty")
    # The plain floor division is 0 when there are fewer examples than
    # n_batch_size, and np.array_split rejects 0 sections; keep at least one.
    n_batches = max(1, n_examples // n_batch_size)

    all_losses = []
    rnn.train()

    for epoch in range(1, n_epoch + 1):
        rnn.zero_grad() # clear the gradients

        # create some minibatches
        # we cannot use dataloaders because each of our names is a different length!!
        order = list(range(n_examples))
        random.shuffle(order)
        batches = np.array_split(order, n_batches)

        current_loss = 0
        for batch in batches:
            batch_loss = 0
            for i in batch: # for each example in this batch
                label_tensor, text_tensor, _, _ = training_data[i]
                # call the module itself (not .forward) so registered hooks run
                output = rnn(text_tensor)
                batch_loss += criterion(output, label_tensor)

            # optimize parameters on the summed loss of the batch
            batch_loss.backward()
            nn.utils.clip_grad_norm_(rnn.parameters(), 3)  # this is to facilitate convergence
            optimizer.step()
            optimizer.zero_grad()

            current_loss += batch_loss.item() / len(batch)

        all_losses.append(current_loss / len(batches))
        if epoch % 5 == 0:
            print(f"{epoch} ({epoch / n_epoch:.0%}): \t average batch loss = {all_losses[-1]}")

    return all_losses

def evaluate(rnn, testing_data, classes):
    """Plot the row-normalized confusion matrix of ``rnn`` on ``testing_data``.

    Rows are the true classes and columns the predicted ones, both in the
    order given by ``classes``. Each non-empty row is divided by its sum.
    The figure is shown with ``plt.show()``; nothing is returned.
    """
    n_classes = len(classes)
    confusion = torch.zeros(n_classes, n_classes)

    rnn.eval() # inference mode for the model
    with torch.no_grad(): # no gradient bookkeeping while evaluating
        for sample_idx in range(len(testing_data)):
            _, text_tensor, label, _ = testing_data[sample_idx]
            _, predicted_idx = label_from_output(rnn(text_tensor), classes)
            true_idx = classes.index(label)
            confusion[true_idx][predicted_idx] += 1

    # Turn counts into per-row fractions; rows without samples stay all-zero
    for row_idx, row_total in enumerate(confusion.sum(dim=1)):
        if row_total > 0:
            confusion[row_idx] = confusion[row_idx] / row_total

    # Draw the matrix with a colour scale
    fig = plt.figure()
    ax = fig.add_subplot(111)
    image = ax.matshow(confusion.cpu().numpy()) # numpy needs a CPU tensor
    fig.colorbar(image)

    # One labelled tick per class on both axes
    tick_positions = np.arange(n_classes)
    ax.set_xticks(tick_positions, labels=classes, rotation=90)
    ax.set_yticks(tick_positions, labels=classes)

    # Force label at every tick
    for axis in (ax.xaxis, ax.yaxis):
        axis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()

In [None]:
# 27 passes over the training split in mini-batches of about 64 names;
# all_losses receives one average loss per epoch
all_losses = train(rnn, train_set, n_epoch=27, n_batch_size=64)

In [None]:
# show the row-normalized confusion matrix on the held-out split
evaluate(rnn, test_set, classes=alldata.labels_uniq)