In [1]:
# https://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html
# https://jaketae.github.io/study/pytorch-rnn/

In [2]:
import os
import random
from string import ascii_letters

import torch
from torch import nn
import torch.nn.functional as F
from unidecode import unidecode

_ = torch.manual_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
data_dir = "./data/names"

lang2label = {
    file_name.split(".")[0]: torch.tensor([i], dtype=torch.long)
    for i, file_name in enumerate(os.listdir(data_dir))
}

num_langs = len(lang2label)

In [4]:
lang2label

{'English': tensor([0]),
 'Spanish': tensor([1]),
 'Arabic': tensor([2]),
 'Russian': tensor([3]),
 'Vietnamese': tensor([4]),
 'Korean': tensor([5]),
 'French': tensor([6]),
 'Portuguese': tensor([7]),
 'Italian': tensor([8]),
 'Polish': tensor([9]),
 'Greek': tensor([10]),
 'Irish': tensor([11]),
 'Japanese': tensor([12]),
 'Chinese': tensor([13]),
 'Scottish': tensor([14]),
 'German': tensor([15]),
 'Dutch': tensor([16]),
 'Czech': tensor([17])}

In [5]:
# os.listdir(data_dir)

In [6]:
unidecode("Ślusàrski")

'Slusarski'

In [7]:
char2idx = {letter: i for i, letter in enumerate(ascii_letters + " .,:;-'")}
num_letters = len(char2idx)
num_letters

59

In [8]:
# char2idx

In [9]:
def name2tensor(name):
    tensor = torch.zeros(len(name), 1, num_letters)
    for i, char in enumerate(name):
        tensor[i][0][char2idx[char]] = 1
    return tensor

In [10]:
name2tensor("abZ")

tensor([[[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          1., 0., 0., 0., 0., 0., 0., 0.]]])

In [11]:
tensor_names = []
target_langs = []

for file in os.listdir(data_dir):
    with open(os.path.join(data_dir, file)) as f:
        lang = file.split(".")[0]
        names = [unidecode(line.rstrip()) for line in f]
        for name in names:
            try:
                tensor_names.append(name2tensor(name))
                target_langs.append(lang2label[lang])
            except KeyError:
                pass

In [12]:
# target_langs

In [13]:
from sklearn.model_selection import train_test_split

train_idx, test_idx = train_test_split(
    range(len(target_langs)), 
    test_size=0.1, 
    shuffle=True, 
    stratify=target_langs,
    random_state=1
)

train_dataset = [
    (tensor_names[i], target_langs[i])
    for i in train_idx
]

test_dataset = [
    (tensor_names[i], target_langs[i])
    for i in test_idx
]

In [14]:
names, labels = next(iter(train_dataset))
names.shape

torch.Size([10, 1, 59])

In [15]:
print(f"Train: {len(train_dataset)}")
print(f"Test: {len(test_dataset)}")

Train: 18063
Test: 2007


In [16]:
class MyRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(MyRNN, self).__init__()
        self.hidden_size = hidden_size
        self.input2hidden = nn.Linear(input_size, hidden_size)
        self.hidden2hidden = nn.Linear(hidden_size, hidden_size)
        self.hidden2output = nn.Linear(hidden_size, output_size)
        
        torch.nn.init.normal(self.input2hidden.weight, mean=0., std=0.05)
        torch.nn.init.normal(self.hidden2hidden.weight, mean=0., std=0.05)
        torch.nn.init.normal(self.hidden2output.weight, mean=0., std=0.05)
        
        torch.nn.init.zeros_(self.input2hidden.bias)
        torch.nn.init.zeros_(self.hidden2hidden.bias)
        torch.nn.init.zeros_(self.hidden2output.bias)        
    
    def forward(self, x, hidden_state):
        embeding = self.input2hidden(x)
        hidden_state = self.hidden2hidden(hidden_state)
        combined = torch.mean(torch.stack([embeding, hidden_state]), dim=0)
        # hidden = torch.sigmoid(combined)
        hidden = torch.tanh(combined)
        output = self.hidden2output(combined)
        return output, hidden
    
    def init_hidden(self):
        # return nn.init.kaiming_uniform_(torch.empty(1, self.hidden_size))
        return torch.zeros(1, self.hidden_size)

In [17]:
hidden_size = 256
learning_rate = 0.001

print(num_letters, hidden_size, num_langs)

model = MyRNN(num_letters, hidden_size, num_langs)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

59 256 18


  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()


In [18]:
def get_acc() :    
    
    num_correct = 0
    num_samples = len(test_dataset)

    model.eval()

    with torch.no_grad():
        for name, label in test_dataset:
            hidden_state = model.init_hidden()
            for char in name:
                output, hidden_state = model(char, hidden_state)
            _, pred = torch.max(output, dim=1)
            num_correct += bool(pred == label)
            
    model.train()

    print(f"Accuracy: {num_correct / num_samples * 100:.4f}%")

In [19]:
num_epochs = 4
print_interval = 3000

for epoch in range(num_epochs):
    random.shuffle(train_dataset)
    for i, (name, label) in enumerate(train_dataset):
        hidden_state = model.init_hidden()
        for char in name:
            output, hidden_state = model(char, hidden_state)
            # print(output)
        loss = criterion(output, label)

        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()
        
        if (i + 1) % print_interval == 0:
            print(
                f"Epoch [{epoch + 1}/{num_epochs}], "
                f"Step [{i + 1}/{len(train_dataset)}], "
                f"Loss: {loss.item():.4f}"
            )
            get_acc()

Epoch [1/4], Step [3000/18063], Loss: 0.0002
Accuracy: 59.4918%
Epoch [1/4], Step [6000/18063], Loss: 0.0126
Accuracy: 63.7270%
Epoch [1/4], Step [9000/18063], Loss: 2.9937
Accuracy: 65.0224%
Epoch [1/4], Step [12000/18063], Loss: 1.2562
Accuracy: 67.7628%
Epoch [1/4], Step [15000/18063], Loss: 0.0025
Accuracy: 65.3214%
Epoch [1/4], Step [18000/18063], Loss: 0.0158
Accuracy: 69.0085%
Epoch [2/4], Step [3000/18063], Loss: 0.0026
Accuracy: 69.9552%
Epoch [2/4], Step [6000/18063], Loss: 0.1379
Accuracy: 67.8625%
Epoch [2/4], Step [9000/18063], Loss: 0.0000
Accuracy: 69.4569%
Epoch [2/4], Step [12000/18063], Loss: 0.0004
Accuracy: 71.0513%
Epoch [2/4], Step [15000/18063], Loss: 0.0086
Accuracy: 71.6492%
Epoch [2/4], Step [18000/18063], Loss: 2.8589
Accuracy: 71.4499%
Epoch [3/4], Step [3000/18063], Loss: 0.0000
Accuracy: 72.9447%
Epoch [3/4], Step [6000/18063], Loss: 0.0016
Accuracy: 71.9980%
Epoch [3/4], Step [9000/18063], Loss: 0.0082
Accuracy: 69.1579%
Epoch [3/4], Step [12000/18063], L

In [20]:
num_correct = 0
num_samples = len(test_dataset)

model.eval()

with torch.no_grad():
    for name, label in test_dataset:
        hidden_state = model.init_hidden()
        for char in name:
            output, hidden_state = model(char, hidden_state)
        _, pred = torch.max(output, dim=1)
        num_correct += bool(pred == label)

print(f"Accuracy: {num_correct / num_samples * 100:.4f}%")

Accuracy: 73.4928%
