# RNN - Classifying Names by Language

Based on [NLP FROM SCRATCH: CLASSIFYING NAMES WITH A CHARACTER-LEVEL RNN](https://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html)

In [44]:
from os import path
from requests import get
from zipfile import ZipFile
import shutil
from __future__ import unicode_literals, print_function, division
from io import open
import glob
import os
import unicodedata
import string
import torch
import torch.nn as nn

## Data

First we download the data:

In [21]:
local_path_zip = path.join("data", "language.zip")
local_path_files = path.join("data", "language")

# Download the data if it doesn't exist locally
if not path.isfile(local_path_zip):
    data_url = "https://download.pytorch.org/tutorial/data.zip"
    r = get(data_url, allow_redirects=True)
    # Fail loudly on an HTTP error instead of saving an error page as a .zip.
    r.raise_for_status()
    # The target directory has to exist before the archive can be written.
    os.makedirs(path.dirname(local_path_zip), exist_ok=True)
    with open(local_path_zip, 'wb') as zip_file:
        zip_file.write(r.content)

# Extract whenever the extracted tree is missing, not only right after a
# download, so an interrupted earlier run does not leave the data unusable.
if not path.isdir(local_path_files):
    # No `members` argument: every entry of the archive is extracted.
    with ZipFile(local_path_zip, "r") as archive:
        archive.extractall(local_path_files)

Next we process the data by converting it into a dictionary, with languages as the keys and a list of names for each language as the values. We use the tutorial code for this:

In [32]:
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)


# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Return ``s`` reduced to the characters listed in ``all_letters``.

    The string is NFD-normalized first, which splits accented characters
    into a base character followed by combining marks. Combining marks
    (Unicode category ``Mn``) and any character not in ``all_letters`` are
    then dropped.
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        # Skip the combining marks produced by the decomposition.
        if unicodedata.category(ch) == 'Mn':
            continue
        # Keep only characters that belong to the model's alphabet.
        if ch in all_letters:
            kept.append(ch)
    return ''.join(kept)

# Build the category_lines dictionary, a list of names per language
category_lines = {}
all_categories = []


# Read a file and split into lines
def readLines(filename):
    """Read a UTF-8 text file and return its lines converted to ASCII.

    Leading and trailing whitespace of the whole file is stripped before
    splitting on newlines, and every line is passed through
    ``unicodeToAscii``.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one ASCII string per line of the file.
    """
    # The context manager closes the handle as soon as the content is read
    # instead of leaving that to garbage collection.
    with open(filename, encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]

# Every *.txt file matched below becomes one category: the file name without
# its extension is the category label, and the file's lines are its names.
names_pattern = local_path_files + '/data/names/*.txt'
for filename in glob.glob(names_pattern):
    base_name = os.path.basename(filename)
    category = os.path.splitext(base_name)[0]
    all_categories.append(category)
    lines = readLines(filename)
    category_lines[category] = lines

n_categories = len(all_categories)

category_lines.keys()

dict_keys(['Greek', 'Japanese', 'French', 'Portuguese', 'Russian', 'German', 'Chinese', 'Italian', 'Czech', 'Korean', 'Vietnamese', 'English', 'Dutch', 'Arabic', 'Spanish', 'Polish', 'Scottish', 'Irish'])

Note that each language then simply maps to a list of names:

In [35]:
# Peek at the first five names stored under the "Greek" key.
category_lines["Greek"][:5]

['Adamidis', 'Adamou', 'Agelakos', 'Akrivopoulos', 'Alexandropoulos']

### One-Hot Encoding

Now we create a function to one-hot encode each name by letter:

In [43]:
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)


def letterToIndex(letter: str) -> int:
    """Return the position of ``letter`` in ``all_letters``, or -1 if absent."""
    return all_letters.find(letter)


def lineToTensor(line: str) -> torch.Tensor:
    """One-hot encode ``line`` character by character.

    Args:
        line: The text to encode.

    Returns:
        A float tensor of shape ``(len(line), 1, n_letters)``. Row ``i`` has a
        single 1 at the index of ``line[i]`` in ``all_letters``. A character
        that is not in ``all_letters`` leaves its row all zeros.
    """
    tensor = torch.zeros(len(line), 1, n_letters)
    for li, letter in enumerate(line):
        index = letterToIndex(letter)
        # -1 means "not in the alphabet". Used as an index it would wrap
        # around and wrongly mark the last column (the apostrophe).
        if index >= 0:
            tensor[li][0][index] = 1
    return tensor


# Should be 5 (letters) x 1 (batch size) x 57 (number of letters in one hot encoding)
lineToTensor("hello").shape

torch.Size([5, 1, 57])

## Model

<img src="https://i.imgur.com/Z2xbySO.png"/>

In [45]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        
        self.hidden_size = hidden_size