# Recurrent Neural Networks and LSTMs/GRUs

Following [Classifying Names with a Character-Level RNN](https://docs.pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html)

The RNN reads a name one character at a time. At each step it produces a prediction and a hidden state, and the hidden state from the *previous* step is fed into the next step. The prediction after the final character is taken as the output, i.e. the class the name belongs to.

The task is to train on thousands of last names from 18 different languages and predict which language a name comes from based on its spelling.

In [1]:
import string
import unicodedata

import torch

In [2]:
%load_ext watermark

In [2]:
# Check if CUDA is available
device = torch.device("cpu")
if torch.cuda.is_available():
    device = torch.device("cuda")

torch.set_default_device(device)
print(f"Using device = {torch.get_default_device()}")

Using device = cpu


# Data preparation

In [3]:
# "_" stands in for any out-of-vocabulary character, i.e. one the model does not handle
allowed_characters = string.ascii_letters + " .,;'" + "_"
n_letters = len(allowed_characters)


# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Return *s* reduced to the characters in ``allowed_characters``.

    The string is NFD-normalized so that accented letters decompose into a base
    letter followed by combining marks. Combining marks (category "Mn") and any
    character outside ``allowed_characters`` are then dropped.
    """
    kept = []
    for ch in unicodedata.normalize("NFD", s):
        if unicodedata.category(ch) == "Mn":
            # combining mark left over from the decomposition
            continue
        if ch in allowed_characters:
            kept.append(ch)
    return "".join(kept)

In [4]:
string.ascii_letters

'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'

In [5]:
# example of name to convert
print(f"converting 'Ślusàrski' to {unicodeToAscii('Ślusàrski')}")

converting 'Ślusàrski' to Slusarski


# Turn names into tensors

In [6]:
# Find letter index from allowed_characters, e.g. "a" = 0
def letterToIndex(letter):
    """Return the position of *letter* in ``allowed_characters``.

    A letter that is not found maps to the index of "_", the out-of-vocabulary
    placeholder.
    """
    position = allowed_characters.find(letter)
    # str.find reports "not present" as -1
    if position == -1:
        position = allowed_characters.find("_")
    return position


# Encode a line as a <line_length x 1 x n_letters> tensor,
# i.e. a sequence of one-hot letter vectors
def lineToTensor(line):
    """Build the one-hot tensor for *line*, one slice per character."""
    one_hot = torch.zeros(len(line), 1, n_letters)
    for position, char in enumerate(line):
        # exactly one entry per character is switched on
        one_hot[position, 0, letterToIndex(char)] = 1
    return one_hot

In [7]:
print(f"The letter 'a' becomes\n{lineToTensor('a')}")  # notice that the first position in the tensor = 1

The letter 'a' becomes
tensor([[[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.]]])


In [8]:
print(f"The name 'Ahn' becomes\n{lineToTensor('Ahn')}")  # notice 'A' sets index 26 (the 27th position) to 1

The name 'Ahn' becomes
tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.]]])


In [9]:
print(f"The name 'Abba' becomes\n{lineToTensor('Abba')}")

The name 'Abba' becomes
tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.]],

        [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.]],

        [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.]],

        [[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,

# Use `Dataset` and `DataLoader` classes

In [10]:
from io import open
import glob
import os
import time

import torch
from torch.utils.data import Dataset

Note: "Provenance of the dataset" means keeping track of the who, what, when, where, and how of the data. This metadata is critical for reproducibility.

In [None]:
class NamesDataset(Dataset):
    """In-memory dataset of names labelled by the file they came from.

    Every ``*.txt`` file directly inside ``data_dir`` is one class: the file name
    without its extension is the label and each non-empty line is one name.

    Indexing returns ``(label_tensor, data_tensor, data_label, data_item)``.
    """

    def __init__(self, data_dir):
        """Read all ``*.txt`` files in *data_dir* and cache their tensor encodings.

        Args:
            data_dir: Directory containing one ``<label>.txt`` file per class.
        """
        self.data_dir = data_dir  # for provenance of the dataset
        # Capture the moment of loading once, so that ``load_time()`` reports this
        # fixed moment instead of the time of whichever call asks for it.
        self._load_timestamp = time.time()  # for provenance of the dataset
        labels_set = set()  # set of all classes

        self.data = []
        self.data_tensors = []
        self.labels = []

        # read all the ``.txt`` files in the specified directory
        # NOTE: glob returns matches in arbitrary order, so the order of items
        # follows the order in which the files happen to be listed.
        text_files = glob.glob(os.path.join(data_dir, "*.txt"))
        for filename in text_files:
            label = os.path.splitext(os.path.basename(filename))[0]
            labels_set.add(label)
            # ``with`` closes the file handle as soon as its content has been read
            with open(filename, encoding="utf-8") as text_file:
                lines = text_file.read().strip().split("\n")
            for name in lines:
                if not name:
                    # an empty file (or a blank line) yields "", which is not a name
                    # and would produce a zero-length tensor
                    continue
                self.data.append(name)
                self.data_tensors.append(lineToTensor(name))
                self.labels.append(label)

        # Cache the tensor representation of the labels
        # BL added `sorted` here to maintain alphabetical order of languages
        self.labels_uniq = sorted(labels_set)
        # one dictionary lookup per item instead of a list scan per item
        label_to_index = {label: idx for idx, label in enumerate(self.labels_uniq)}
        self.labels_tensors = [
            torch.tensor([label_to_index[label]], dtype=torch.long) for label in self.labels
        ]

    def load_time(self):
        """Return the local time (``time.struct_time``) at which the dataset was loaded."""
        return time.localtime(self._load_timestamp)

    def __len__(self):
        """Return the number of names held by the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(label_tensor, data_tensor, data_label, data_item)`` for item *idx*.

        ``label_tensor`` is the index of the label within ``labels_uniq``,
        ``data_tensor`` is the output of ``lineToTensor`` for the name,
        ``data_label`` is the label string and ``data_item`` is the name itself.
        """
        data_item = self.data[idx]
        data_label = self.labels[idx]
        data_tensor = self.data_tensors[idx]
        label_tensor = self.labels_tensors[idx]

        return label_tensor, data_tensor, data_label, data_item

In [19]:
alldata = NamesDataset("../local_data/names")
print(f"loaded {len(alldata)} items of data")


loaded 20074 items of data


In [20]:
# provenance of the dataset
alldata.data_dir

'../local_data/names'

In [21]:
alldata.load_time()

time.struct_time(tm_year=2025, tm_mon=7, tm_mday=25, tm_hour=5, tm_min=2, tm_sec=25, tm_wday=4, tm_yday=206, tm_isdst=0)

In [22]:
# first dataset example
# tuple of
# label_tensor: tensor representation of the label (single integer value)
# data_tensor: output of name being converted using lineToTensor
# data_label: class (the language)
# data_item: the name being classified
print(f"example = {alldata[0]}")

example = (tensor([2]), tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.]],

        [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.]]]), 'Czech', 'Abl')


Use `torch.utils.data.random_split` with a seeded generator to create a train/test split.

In [None]:
# Seed the generator that drives the 85% / 15% split
split_generator = torch.Generator(device=device).manual_seed(2024)
train_set, test_set = torch.utils.data.random_split(alldata, [0.85, 0.15], generator=split_generator)

print(f"train examples = {len(train_set)}, validation examples = {len(test_set)}")

train examples = 17063, validation examples = 3011


In [None]:
import torch.nn as nn
import torch.nn.functional as F


class CharRNN(nn.Module):
    """Sequence classifier: an ``nn.RNN`` followed by a linear layer and log-softmax."""

    def __init__(self, input_size, hidden_size, output_size):
        """Create the layers.

        Args:
            input_size: Number of features per input step.
            hidden_size: Number of features in the RNN hidden state.
            output_size: Number of scores produced by the linear layer.
        """
        super().__init__()

        self.rnn = nn.RNN(input_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, output_size)  # hidden-to-output projection
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, line_tensor):
        """Return log-softmax scores computed from the RNN's final hidden state."""
        # Only the final hidden state is used; the per-step outputs are discarded
        _, final_hidden = self.rnn(line_tensor)
        # index 0 selects the first layer's state (the RNN above has a single layer)
        return self.softmax(self.h2o(final_hidden[0]))

In [26]:
# Hidden state size; the input width is the alphabet size (n_letters) and the
# output width is the number of distinct labels in the dataset
n_hidden = 128
rnn = CharRNN(n_letters, n_hidden, len(alldata.labels_uniq))
print(rnn)

CharRNN(
  (rnn): RNN(58, 128)
  (h2o): Linear(in_features=128, out_features=18, bias=True)
  (softmax): LogSoftmax(dim=1)
)


In [None]:
def label_from_output(output, output_labels):
    """Return ``(label, index)`` for the highest-scoring entry of *output*.

    Args:
        output: Tensor of scores; ``topk(1)`` is taken along its last dimension
            and the first resulting index is used.
        output_labels: Sequence mapping a class index to its label.
    """
    best_index = torch.topk(output, 1).indices[0].item()
    return output_labels[best_index], best_index


# Run a single name through the freshly constructed model and decode the prediction.
# NOTE(review): `input` shadows the Python builtin of the same name from here on.
input = lineToTensor("Albert")
output = rnn(input)  # this is equivalent to ``output = rnn.forward(input)``
print(output)
print(label_from_output(output, alldata.labels_uniq))

tensor([[-2.9539, -2.8097, -2.7657, -2.8274, -2.8341, -2.9097, -2.9939, -3.0721,
         -2.7528, -2.8996, -2.8607, -2.9514, -3.0111, -2.9556, -2.8843, -2.7442,
         -2.9206, -2.9529]], grad_fn=<LogSoftmaxBackward0>)
('Scottish', 15)


# Theoretical understanding and implementation with numpy

The description of RNNs from [Karpathy's blog](https://karpathy.github.io/2015/05/21/rnn-effectiveness/) seems intuitive:

> [The] output vector’s contents are influenced not only by the input you just fed in, but also on the entire history of inputs you’ve fed in in the past.

# RNN

In [None]:
import torch
import torch.nn as nn

class SimpleRNN(nn.Module):
    """Batch-first ``nn.RNN`` followed by a linear layer applied to the last time step."""

    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        """
        Initializes the SimpleRNN model.

        Args:
            input_size (int): The number of expected features in the input `x`.
                              For word embeddings, this would be the embedding dimension.
            hidden_size (int): The number of features in the hidden state `h`.
            output_size (int): The size of the output layer (e.g., number of classes for classification).
            num_layers (int, optional): Number of recurrent layers. Defaults to 1.
        """
        super().__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Define the RNN layer
        # batch_first=True means input/output tensors are (batch, seq, feature)
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)

        # Define a linear layer to map the RNN's output to the desired output_size
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """
        Defines the forward pass of the RNN model.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, sequence_length, input_size).

        Returns:
            torch.Tensor: Output tensor of shape (batch_size, output_size).
        """
        # Initialize hidden state with zeros
        # The hidden state tensor has shape (num_layers * num_directions, batch_size, hidden_size)
        # For a simple RNN, num_directions is 1.
        # Create it directly with x's device and dtype: this avoids allocating on the
        # default device and copying, and keeps h0 compatible with the input when the
        # model runs in a non-default precision (e.g. after ``.double()``).
        h0 = torch.zeros(
            self.num_layers,
            x.size(0),
            self.hidden_size,
            device=x.device,
            dtype=x.dtype,
        )

        # Pass input through the RNN layer
        # output: (batch_size, sequence_length, hidden_size * num_directions)
        # hn: (num_layers * num_directions, batch_size, hidden_size)
        output, _hn = self.rnn(x, h0)

        # We are interested in the output of the last time step for classification/prediction.
        # In batch_first=True mode, output[:, -1, :] gives the last time step's output for all batches.
        # `output` holds the last layer's hidden state at every step, so for this
        # unidirectional RNN the slice equals hn[-1], the final hidden state of the last layer.
        final_output = self.fc(output[:, -1, :])  # Taking the output of the last time step

        return final_output

In [3]:
%watermark

Last updated: 2025-08-06T23:14:29.951880+00:00

Python implementation: CPython
Python version       : 3.12.11
IPython version      : 9.4.0

Compiler    : GCC 12.2.0
OS          : Linux
Release     : 6.10.14-linuxkit
Machine     : aarch64
Processor   : 
CPU cores   : 7
Architecture: 64bit



In [4]:
%watermark -iv




In [None]:
# test