#### Adapted from [CLASSIFYING NAMES WITH A CHARACTER-LEVEL RNN](https://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html#classifying-names-with-a-character-level-rnn)

In [1]:
import glob
import math
import os
import random
import string
import unicodedata

import torch
from torch import nn

Download data from [here](https://download.pytorch.org/tutorial/data.zip) into the current directory and extract it into `data` directory before proceeding

In [2]:
def find_files(path):
    return glob.glob(path)

In [3]:
find_files('data/names/*.txt')[:4]

['data/names/Czech.txt',
 'data/names/German.txt',
 'data/names/Arabic.txt',
 'data/names/Japanese.txt']

In [4]:
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)

def unicode_to_ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn' and c in all_letters
    )

In [5]:
print(unicode_to_ascii('Ślusàrski'))

Slusarski


In [6]:
category_lines = {}
all_categories = []

def readlines(filename):
    lines = open(filename, encoding='utf-8').read().strip().split('\n')
    return [unicode_to_ascii(line) for line in lines]

for filename in find_files('data/names/*.txt'):
    category = os.path.splitext(os.path.basename(filename))[0]
    all_categories.append(category)
    lines = readlines(filename)
    category_lines[category] = lines

In [7]:
print(all_categories[:4])
print(category_lines['Czech'][:4])

['Czech', 'German', 'Arabic', 'Japanese']
['Abl', 'Adsit', 'Ajdrna', 'Alt']


In [8]:
def letter_to_index(letter):
    return all_letters.find(letter)

def letter_to_tensor(letter):
    tensor = torch.zeros(1, n_letters)
    tensor[0][letter_to_index(letter)] = 1
    return tensor

def line_to_tensor(line):
    tensor = torch.zeros(len(line), 1, n_letters)
    for idx, letter in enumerate(line):
        tensor[idx][0][letter_to_index(letter)] = 1
    return tensor

In [9]:
from model import LstmCell

class Net(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.output_size = output_size
        self.lstm = LstmCell(input_size, hidden_size)
        self.linear = nn.Linear(hidden_size, output_size)
        self.reset_parameters()

    def reset_state(self):
        self.lstm.reset_state()
    
    def reset_parameters(self):
        std = 1.0 / math.sqrt(self.output_size)
        for w in self.parameters():
            w.data.uniform_(-std, std)

    def forward(self, x):
        x = self.lstm(x)
        x = self.linear(x)
        return x

In [10]:
hidden_size = 100
net = Net(n_letters, hidden_size, len(all_categories))

In [11]:
def random_training_example():
    cat = random.choice(all_categories)
    cat_t = torch.tensor([all_categories.index(cat)], dtype=torch.long)

    line = random.choice(category_lines[cat])
    line_t = line_to_tensor(line)
    
    return cat, line, cat_t, line_t

In [12]:
for i in range(4):
    category, line, category_tensor, line_tensor = random_training_example()
    print('category =', category, '/ line =', line)

category = Vietnamese / line = Han
category = Russian / line = Galena
category = Greek / line = Kouros
category = Japanese / line = Fuse


In [13]:
def category_from_output(output):
    top_n, top_i = output.topk(1)
    category_i = top_i[0].item()
    return all_categories[category_i], category_i

In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.RMSprop(net.parameters(), learning_rate,
                                momentum=0.9, alpha=0.95)

In [14]:
def train(net, criterion, optimizer):
    n_iters = 200000
    print_every = 5000
    learning_rate = 0.0001

    cur_loss = 0
    correct_count = 0
    losses = []
    
    for idx in range(1, n_iters+1):
        cat, line, cat_t, line_t = random_training_example()

        net.reset_state()
        optimizer.zero_grad()

        for char_t in line_t:
            output = net(char_t)

        loss = criterion(output, cat_t)
        loss.backward()
        losses.append(loss.item())
        optimizer.step()
        
        guess, guess_i = category_from_output(output)
        if guess == cat:
            correct_count += 1
            
        if idx == 1 or idx % print_every == 0:
            losses = losses[-100:]
            progress = idx / n_iters * 100
            precision = correct_count / print_every * 100
            correct = '✓' if guess == cat else '✗ (answer %s)' % cat
            print('%d(%d%%) loss %.4f(mean %.4f) precision %.4f%% %s / %s %s' % (
                idx, progress, loss, sum(losses)/len(losses), precision, line, guess, correct))
            correct_count = 0

    return output, loss.item()

In [15]:
train(net, criterion, optimizer)

1(0%) loss 3.0800(mean 3.0800) precision 0.0000% Hwang / French ✗ (answer Korean)
5000(2%) loss 1.2659(mean 1.6370) precision 32.8000% Dubanowski / Polish ✓
10000(5%) loss 1.5804(mean 1.3333) precision 51.8400% Vasyuk / Czech ✗ (answer Russian)
15000(7%) loss 2.1582(mean 1.2622) precision 57.9000% Del olmo / Italian ✗ (answer Spanish)
20000(10%) loss 0.2749(mean 1.0271) precision 63.6000% Gui / Chinese ✓
25000(12%) loss 3.1362(mean 0.9027) precision 68.0000% Albuquerque / Portuguese ✗ (answer Spanish)
30000(15%) loss 0.1172(mean 0.7063) precision 71.6800% Mustafa / Arabic ✓
35000(17%) loss 3.6725(mean 0.7841) precision 74.3200% Sappe / Dutch ✗ (answer Czech)
40000(20%) loss 1.1175(mean 0.6720) precision 76.8400% Ozimuk / Japanese ✗ (answer Czech)
45000(22%) loss 1.5211(mean 0.7740) precision 78.6200% Wilchek / Polish ✗ (answer Czech)
50000(25%) loss 0.5574(mean 0.6166) precision 80.7800% Rocco / Italian ✓
55000(27%) loss 0.0009(mean 0.4168) precision 82.1600% Shimazu / Japanese ✓
60000

(tensor([[-69.6351, -67.4072, -61.4861, -64.5850, -71.0838, -71.5544, -60.0541,
          -70.6511, -66.6004, -65.3142, -66.0218, -74.3977, -70.6490, -67.3909,
          -74.3360, -69.1282, -71.7948, -75.9072]], grad_fn=<AddmmBackward>),
 1.6634140014648438)