In [0]:
#https://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
path = '/content/drive/My Drive/data/data/names/'

In [0]:
from __future__ import unicode_literals, print_function, division
from io import open
import glob
import os

In [5]:
def findFiles(path): return glob.glob(path)

print(findFiles(path + '*.txt'))

['/content/drive/My Drive/data/data/names/Chinese.txt', '/content/drive/My Drive/data/data/names/Arabic.txt', '/content/drive/My Drive/data/data/names/Czech.txt', '/content/drive/My Drive/data/data/names/Spanish.txt', '/content/drive/My Drive/data/data/names/Scottish.txt', '/content/drive/My Drive/data/data/names/French.txt', '/content/drive/My Drive/data/data/names/English.txt', '/content/drive/My Drive/data/data/names/Korean.txt', '/content/drive/My Drive/data/data/names/Dutch.txt', '/content/drive/My Drive/data/data/names/Polish.txt', '/content/drive/My Drive/data/data/names/Greek.txt', '/content/drive/My Drive/data/data/names/Japanese.txt', '/content/drive/My Drive/data/data/names/Italian.txt', '/content/drive/My Drive/data/data/names/Vietnamese.txt', '/content/drive/My Drive/data/data/names/German.txt', '/content/drive/My Drive/data/data/names/Irish.txt', '/content/drive/My Drive/data/data/names/Russian.txt', '/content/drive/My Drive/data/data/names/Portuguese.txt']


In [0]:
import unicodedata
import string

In [0]:
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)

In [8]:
# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

print(unicodeToAscii('Ślusàrski'))

Slusarski


In [0]:
vocab = string.ascii_letters + " .,;'"
vocab_len = len(vocab)

In [0]:
def letter_to_idx(letter): return vocab.find(letter)

In [0]:
def tokenizer(name): 
    return [w for w in name]

In [0]:
import torchtext
from torchtext import data

txt_field = data.Field(tokenize=tokenizer, include_lengths=True,pad_token=None,unk_token=None)
label_field = data.Field(sequential=False,use_vocab=False, pad_token=None,unk_token=None)

In [0]:
train_val_fields = [
    ('name', txt_field), # process it as text
    ('country', label_field) # process it as label
]

In [17]:
ex = data.Example.fromlist(['Amanda', 1], train_val_fields)

print(zip(train_val_fields[0], 'Amanda'))
for (name, country), val in zip(train_val_fields, ['Amanda', 1]):
  print(name, country)

<zip object at 0x7ff8e24e2c88>
name <torchtext.data.field.Field object at 0x7ff92bdbe358>
country <torchtext.data.field.Field object at 0x7ff92bdbe5c0>


In [18]:
# form dataset of name, country

name_ctry_pairs = []
countries = {}

cidx = 0

for filename in findFiles(path + '*.txt'):
  country = os.path.splitext(os.path.basename(filename))[0]
  lines = open(filename, encoding='utf-8').read().strip().split('\n')
  lines = [unicodeToAscii(line) for line in lines]
  countries[country] = cidx

  for l in lines:
    ex = data.Example.fromlist([l, cidx], train_val_fields)
    name_ctry_pairs.append(ex)

  cidx += 1

print(name_ctry_pairs[:10])

[<torchtext.data.example.Example object at 0x7ff8e24f9c88>, <torchtext.data.example.Example object at 0x7ff8e24ef3c8>, <torchtext.data.example.Example object at 0x7ff8e24f9c50>, <torchtext.data.example.Example object at 0x7ff8e24f9c18>, <torchtext.data.example.Example object at 0x7ff8e24f9b70>, <torchtext.data.example.Example object at 0x7ff8e24f9cf8>, <torchtext.data.example.Example object at 0x7ff8e24f9d30>, <torchtext.data.example.Example object at 0x7ff8e24f9d68>, <torchtext.data.example.Example object at 0x7ff8e24f9da0>, <torchtext.data.example.Example object at 0x7ff8e24f9dd8>]


In [20]:
import torch

cs = list(countries.keys())
cs.index('Italian')
torch.tensor([cs.index('Italian')], dtype=torch.long)

tensor([12])

In [0]:
ds = data.Dataset(examples=name_ctry_pairs, fields=train_val_fields, filter_pred=None)

In [0]:
trds, valds = ds.split(stratified=True, strata_field="country")

In [24]:
len(trds), len(valds)

(14053, 6021)

In [25]:
trds.examples[-1].country

17

In [0]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size

        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

n_hidden = 128
model = RNN(n_letters, n_hidden, len(countries))

In [0]:
import time
import torch.optim as optim

losses = []
loss_fn = nn.NLLLoss()

In [0]:
def lineToTensor(line):
    tensor = torch.zeros(len(line), 1, n_letters)
    for li, letter in enumerate(line):
        tensor[li][0][letter_to_idx(letter)] = 1
    return tensor

In [0]:
def generate_batch(batch):
  label = torch.tensor([[int(entry.country)] for entry in batch], dtype=torch.long)
  text = []
  for entry in batch:
    temp = lineToTensor(entry.name)
    text.append(temp)

  return text, label

In [0]:
from torch.utils.data import DataLoader
learning_rate = 0.01

def train_func(sub_train_):
  train_loss = 0
  train_acc = 0

  batch_data = DataLoader(sub_train_, batch_size=32, shuffle=True,
                      collate_fn=generate_batch)

  for i, (text, cls) in enumerate(batch_data):
    for t, c in zip(text, cls):
      #optimizer.zero_grad()
      model.zero_grad()

      hidden = model.initHidden()

      for i in range(t.size()[0]):
        output, hidden = model(t[i], hidden)

      loss = loss_fn(output, c)
      train_loss += loss.item()
      loss.backward()
      #optimizer.step()

      if output.argmax(1) == c:
        train_acc += 1

      for p in model.parameters():
        p.data.add_(p.grad.data, alpha=-learning_rate)

  return train_loss / len(sub_train_), train_acc / len(sub_train_)

In [0]:
import time
import torch.optim as optim

N_EPOCHS = 10
min_valid_loss = float('inf')

In [39]:
for epoch in range(N_EPOCHS):

  start_time = time.time()
  train_loss, train_acc = train_func(trds)

  secs = int(time.time() - start_time)
  mins = secs / 60
  secs = secs % 60

  print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
  print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')

Epoch: 1  | time in 0 minutes, 21 seconds
	Loss: 1.3421(train)	|	Acc: 60.3%(train)
Epoch: 2  | time in 0 minutes, 20 seconds
	Loss: 1.0627(train)	|	Acc: 69.0%(train)
Epoch: 3  | time in 0 minutes, 20 seconds
	Loss: 0.9844(train)	|	Acc: 70.9%(train)
Epoch: 4  | time in 0 minutes, 20 seconds
	Loss: 0.9371(train)	|	Acc: 71.9%(train)
Epoch: 5  | time in 0 minutes, 20 seconds
	Loss: 0.9121(train)	|	Acc: 72.6%(train)
Epoch: 6  | time in 0 minutes, 21 seconds
	Loss: 0.8963(train)	|	Acc: 73.1%(train)
Epoch: 7  | time in 0 minutes, 21 seconds
	Loss: 0.8893(train)	|	Acc: 72.9%(train)
Epoch: 8  | time in 0 minutes, 21 seconds
	Loss: 0.8789(train)	|	Acc: 73.1%(train)
Epoch: 9  | time in 0 minutes, 20 seconds
	Loss: 0.8680(train)	|	Acc: 73.4%(train)
Epoch: 10  | time in 0 minutes, 20 seconds
	Loss: 0.8558(train)	|	Acc: 73.7%(train)


In [0]:
a = lineToTensor('Toyama')

hidden = model.initHidden()
for i in range(a.size()[0]):
  output, hidden = model(a[i], hidden)

In [69]:
country_names = {v: k for (k, v) in countries.items()}

o = output.argmax(1).item()
print(country_names[o])

Japanese
