### Purpose
Followng the steps from https://pytorch.org/tutorials/intermediate/char_rnn_generation_tutorial.html

In [1]:
from __future__ import unicode_literals, print_function, division
import torch
from pathlib import Path

from io import open
import glob 
import os
import unicodedata, string
import random
import math

In [5]:
# !wget https://download.pytorch.org/tutorial/data.zip
# !unzip data.zip

In [6]:
basedir = Path('data')

all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)

In [7]:
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD',s)
        if unicodedata.category(c) != 'Mn' and c in all_letters
    )

# print(unicodeToAscii('Ślusàrski'))

#build category lines dictionary, a list of names per language
category_lines = {}
all_categories = []


def readLines(filename):
    lines = open(filename, encoding='utf-8').read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]

filenames = basedir.glob('names/*.txt')
for filename in basedir.glob('names/*.txt'):
    category = filename.as_posix().split('/')[2].split('.')[0]
    all_categories.append(category)
    lines = readLines(str(filename))
    category_lines[category] = lines
    
n_categories = len(all_categories)
n_categories

18

In [8]:
filename = next(filenames)

In [9]:
def letterToIndex(letter):
    return all_letters.find(letter)

def letterToTensor(letter):
    tensor = torch.zeros(1,n_letters)
    tensor[0][letterToIndex(letter)] = 1
    return tensor

def lineToTensor(line):
    tensor = torch.zeros(len(line),1,n_letters)
    for li, letter in enumerate(line):
        tensor[li][0][letterToIndex(letter)] = 1
    return tensor

print(lineToTensor('Jones').size())

"""
One hot encoded tensor of first to last letters of line. Shape is (len(line),1,n_letters)
"""
def inputTensor(line):
    t = torch.zeros(len(line),1,n_letters)
    for idx in range(len(line)):
        letter = line[idx]
        t[idx][0][letterToIndex(letter)] = 1
    return t

"""
Long tensor of second to EOS letters of line. Shape is (len(line),1,n_letters)
"""
def targetTensor(line):
    letter_indexes = [all_letters.find(line[idx]) for idx in range(1,len(line))]
    letter_indexes = 
    

torch.Size([5, 1, 57])


In [6]:
category = 'English'
all_categories.index(category)
torch.tensor([all_categories.index(category)])

tensor([9])

This is a very basic 'rnn', where we feed each letter and also have a hidden_state.  The output after feeding all letters will output the language.

In [7]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN,self).__init__()
        self.i2h = nn.Linear(input_size + hidden_size,hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size,output_size)
        self.softmax = nn.LogSoftmax(1)
        self.hidden_size = hidden_size
    
    def forward(self, line, hidden):
        combined = torch.cat([line,hidden],1)
        hidden = self.i2h(combined)
        out = self.i2o(combined)
        out = self.softmax(out)
        return out, hidden
    
    def initHidden(self):
        return torch.zeros(1,self.hidden_size)

In [32]:
def getRandomTrainingElement():
    category = random.choice(all_categories)
    line = random.choice(category_lines[category])
    lineTensor = lineToTensor(line)
    catTensor = torch.tensor([all_categories.index(category)],dtype=torch.long)
    return line, category, lineTensor, catTensor

hidden_size = 128
import time
n_iter = 10000
lr = 0.001
print_every = 1000
plot_every = 5000

criterion = nn.NLLLoss()

def train(line_tensor, category_tensor):
    hidden = rnn.initHidden()
    rnn.zero_grad()
    
    for idx in range(line_tensor.size()[0]):
        out, hidden = rnn.forward(line_tensor[idx],hidden)
    loss = criterion(out,category_tensor)
    loss.backward()
        
    for p in rnn.parameters():
        p.data = p.data -lr * p.grad.data
            
    return out, loss.item()

def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def categoryFromOutput(output):
    _, top_i = output.topk(1)
    category_i = top_i[0].item()
    return all_categories[category_i],category_i

rnn = RNN(n_letters,hidden_size,n_categories)

In [33]:
start = time.time()
current_loss = 0
all_losses = []
for idx in range(1,n_iter + 1):
    line, category, lineTensor, catTensor = getRandomTrainingElement()
    output, loss = train(lineTensor,catTensor)
    current_loss += loss
    
    # print losses
    if idx%print_every == 0:
        guess, guess_i = categoryFromOutput(output)
        correct = '✓' if guess == category else '✗ (%s)' % category
        print('%d %d%% (%s) %.4f %s / %s %s' % (idx, idx / n_iter * 100, timeSince(start), loss, line, guess, correct))
        
    # Add current loss avg to list of losses
    if idx%plot_every == 0:
        all_losses.append(current_loss / plot_every)
        current_loss = 0

1000 1% (0m 1s) 2.7837 Creasey / Scottish ✗ (English)
2000 2% (0m 2s) 2.8741 Whyte / Russian ✗ (Scottish)
3000 3% (0m 3s) 2.6828 Agelakos / Greek ✓
4000 4% (0m 4s) 2.7579 Zapatero / Spanish ✓
5000 5% (0m 5s) 2.9166 Abel / Dutch ✗ (French)
6000 6% (0m 6s) 2.8925 Jordan / English ✗ (Polish)
7000 7% (0m 7s) 2.9356 Gajos / Greek ✗ (Polish)
8000 8% (0m 9s) 2.8144 Shikitei / German ✗ (Japanese)
9000 9% (0m 10s) 2.8031 Araujo / Spanish ✗ (Portuguese)
10000 10% (0m 11s) 2.8178 Awad / Arabic ✓
11000 11% (0m 13s) 2.8363 Lobo / Spanish ✗ (Portuguese)
12000 12% (0m 14s) 2.8048 Ramm / Korean ✗ (English)
13000 13% (0m 15s) 2.8079 Laver / German ✗ (English)
14000 14% (0m 16s) 2.8677 Gaber / German ✗ (Arabic)
15000 15% (0m 17s) 2.7230 Rios / Greek ✗ (Portuguese)
16000 16% (0m 18s) 2.6805 Kowalczyk / Polish ✓
17000 17% (0m 20s) 2.7195 Jelinek / German ✗ (Czech)
18000 18% (0m 21s) 2.5668 Gomolka / Japanese ✗ (Polish)
19000 19% (0m 22s) 2.7836 Tang / Korean ✗ (Chinese)
20000 20% (0m 23s) 2.7405 Espinoza 