# RNN - Classifying Names by Language

Based on [NLP From Scratch Pytorch Tutorial](https://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html) but with lots of simplifications/improvements. Creates a RNN from scratch using loops and basic matrix operations. 

In [1]:
from os import path
from requests import get
from zipfile import ZipFile
import shutil
from __future__ import unicode_literals, print_function, division
from io import open
import glob
import os
import unicodedata
import string
import torch
import torch.nn as nn

## Data

First we download the data:

In [2]:
local_path_zip = path.join("data", "language.zip")
local_path_files = path.join("data", "language")

# Download the data if it doesn't exist localy
if not path.isfile(local_path_zip):
    data_url = "https://download.pytorch.org/tutorial/data.zip"
    r = get(data_url, allow_redirects=True)
    open(local_path_zip, 'wb').write(r.content)
    
    # Extract
    zip = ZipFile(local_path_zip,"r")
    zip.extractall(local_path_files, files)

Next we process. The tutorial does this in a very roundabout way so instead we create an array of data, which contains tuples of name, language

In [155]:
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)

# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

# Build the category_lines dictionary, a list of names per language
raw_data = []
languages = []

def readLines(filename):
    """ Read a file by line """
    names_unicode = open(filename, encoding='utf-8').read().strip().split('\n')
    return [unicodeToAscii(name) for name in names_unicode]

# Populate data with all the names from all files
for filename in glob.glob(local_path_files + '/data/names/*.txt'):
    language = os.path.splitext(os.path.basename(filename))[0]
    languages.append(language)
    names = readLines(filename)
    for name in names:
        raw_data.append([name, language])

len(raw_data), languages

(20074,
 ['Greek',
  'Japanese',
  'French',
  'Portuguese',
  'Russian',
  'German',
  'Chinese',
  'Italian',
  'Czech',
  'Korean',
  'Vietnamese',
  'English',
  'Dutch',
  'Arabic',
  'Spanish',
  'Polish',
  'Scottish',
  'Irish'])

Now we want to shuffle the data for training:

In [54]:
import random
random.shuffle(raw_data)
raw_data[:5]

[['Pylin', 'Russian'],
 ['Abdrakhmanov', 'Russian'],
 ['Totah', 'Arabic'],
 ['Teague', 'English'],
 ['Nahutin', 'Russian']]

### One-Hot Encoding

Now we want to convert the input (name) into a tensor. No need for the output as PyTorch does this at runtime.

In [26]:
letters = string.ascii_letters + " .,;'"
letters

"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .,;'"

In [132]:
def letter_to_index(letter: string) -> int :
    return all_letters.find(letter)

def name_to_tensor(name: string) -> torch.tensor:
    """
    Convert a name into a mxn matrix, where m = number of letters in alphabet
    and n = number of letters in the name
    """
    tensor = torch.zeros(len(name), n_letters)
    for li, letter in enumerate(name):
        tensor[li][letter_to_index(letter)] = 1
    return tensor

data = [[nameToTensor(name), languages.index(language)] for name, language in raw_data]
data[1]

[tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0.],
         [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0.],
         [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0.,

## Model

<img src="https://i.imgur.com/Z2xbySO.png"/>

In [125]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

n_hidden = 128
rnn = RNN(n_letters, n_hidden, n_categories)

In [133]:
input = lineToTensor('Albert')
hidden = torch.zeros(1, n_hidden)
output, next_hidden = rnn(input[0], hidden)
input.shape, hidden.shape, output

(torch.Size([6, 1, 57]),
 torch.Size([1, 128]),
 tensor([[-2.8870, -2.1749, -3.0054, -2.9577, -3.0302, -2.9240, -2.9862, -3.0015,
          -2.8840, -3.0046, -3.0320, -2.9114, -2.9140, -2.9943, -2.8992, -2.9538,
          -2.9546, -2.8960]], grad_fn=<LogSoftmaxBackward>))

## Training

In [136]:
criterion = nn.NLLLoss()
learning_rate = 0.005

def train(n_batches: int):
  
    # For each batch
    for i in range(n_batches):
       
        batch_loss = 0
    
        # For each training example
        for name, language in data:
            
            hidden = rnn.initHidden()
            rnn.zero_grad()

            # Go through one letter at a time
            for letter_number in range(name.shape[0]):
                output, hidden = rnn(name[letter_number].unsqueeze(0), hidden)
            
            loss = criterion(output, torch.tensor([language]))
            batch_loss += loss
            loss.backward()
    
            # Add parameters' gradients to their values, multiplied by learning rate
            for p in rnn.parameters():
                p.data.add_(p.grad.data, alpha=-learning_rate)
                
        # Print the batch loss
        print(batch_loss/len(data))
        
train(5)

tensor(1.3126, grad_fn=<DivBackward0>)
tensor(1.0595, grad_fn=<DivBackward0>)
tensor(0.9677, grad_fn=<DivBackward0>)
tensor(0.9187, grad_fn=<DivBackward0>)
tensor(0.8849, grad_fn=<DivBackward0>)


## Use

Testing with a few names:

In [161]:
def predict(name: string):
    input = name_to_tensor(name)
    hidden = rnn.initHidden()
    for letter_number in range(input.shape[0]):
        output, hidden = rnn(input[letter_number].unsqueeze(0), hidden)
    
    _pred, language_index = torch.topk(output, 1, 1)
    return languages[language_index[0][0]]

predict("Alan"), predict("Dostoevsky"), predict("Francisco"), predict("Siya")

('English', 'Russian', 'Italian', 'Japanese')