In [0]:
#https://pytorch.org/tutorials/intermediate/char_rnn_generation_tutorial.html

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
path = '/content/drive/My Drive/data/data/names/'

In [0]:
from __future__ import unicode_literals, print_function, division
from io import open
import glob
import os

In [0]:
def findFiles(path): return glob.glob(path)

print(findFiles(path + '*.txt'))

['/content/drive/My Drive/data/data/names/Chinese.txt', '/content/drive/My Drive/data/data/names/Arabic.txt', '/content/drive/My Drive/data/data/names/Czech.txt', '/content/drive/My Drive/data/data/names/Spanish.txt', '/content/drive/My Drive/data/data/names/Scottish.txt', '/content/drive/My Drive/data/data/names/French.txt', '/content/drive/My Drive/data/data/names/English.txt', '/content/drive/My Drive/data/data/names/Korean.txt', '/content/drive/My Drive/data/data/names/Dutch.txt', '/content/drive/My Drive/data/data/names/Polish.txt', '/content/drive/My Drive/data/data/names/Greek.txt', '/content/drive/My Drive/data/data/names/Japanese.txt', '/content/drive/My Drive/data/data/names/Italian.txt', '/content/drive/My Drive/data/data/names/Vietnamese.txt', '/content/drive/My Drive/data/data/names/German.txt', '/content/drive/My Drive/data/data/names/Irish.txt', '/content/drive/My Drive/data/data/names/Russian.txt', '/content/drive/My Drive/data/data/names/Portuguese.txt']


In [0]:
import unicodedata
import string

In [0]:
vocab = string.ascii_letters + " .,;'"
vocab_len = len(vocab)

In [0]:
# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in vocab
    )

print(unicodeToAscii('Ślusàrski'))

Slusarski


In [0]:
def letter_to_idx(letter): 
  if letter == 'EOS':
    return vocab_len
  else:
    return vocab.find(letter)

In [0]:
n_letters = vocab_len + 1

In [0]:
def tokenizer(name): 
  letters = [w for w in name]
  letters.append("EOS")
  return letters

In [0]:
import torchtext
from torchtext import data

txt_field = data.Field(tokenize=tokenizer, include_lengths=True,pad_token=None,unk_token=None)
label_field = data.Field(sequential=False,use_vocab=False, pad_token=None,unk_token=None)

In [0]:
train_val_fields = [
    ('name', txt_field), # process it as text
    ('country', label_field) # process it as label
]

In [0]:
ex = data.Example.fromlist(['Amanda', 1], train_val_fields)

print(zip(train_val_fields[0], 'Amanda'))
for (name, country), val in zip(train_val_fields, ['Amanda', 1]):
  print(name, country)

<zip object at 0x7f0b0dcb3808>
name <torchtext.data.field.Field object at 0x7f0b0dcc38d0>
country <torchtext.data.field.Field object at 0x7f0b0dcc3908>


In [0]:
# form dataset of name, country

name_ctry_pairs = []
countries = {}

cidx = 0

for filename in findFiles(path + '*.txt'):
  country = os.path.splitext(os.path.basename(filename))[0]
  lines = open(filename, encoding='utf-8').read().strip().split('\n')
  lines = [unicodeToAscii(line) for line in lines]
  countries[country] = cidx

  for l in lines:
    ex = data.Example.fromlist([l, cidx], train_val_fields)
    name_ctry_pairs.append(ex)

  cidx += 1

print(name_ctry_pairs[:10])

[<torchtext.data.example.Example object at 0x7f0b0dfb6240>, <torchtext.data.example.Example object at 0x7f0b0dcc3c88>, <torchtext.data.example.Example object at 0x7f0b0dcb5780>, <torchtext.data.example.Example object at 0x7f0b0dcb5da0>, <torchtext.data.example.Example object at 0x7f0b0dcb5710>, <torchtext.data.example.Example object at 0x7f0b0dcb5be0>, <torchtext.data.example.Example object at 0x7f0b0dcb5080>, <torchtext.data.example.Example object at 0x7f0b0dcb5c88>, <torchtext.data.example.Example object at 0x7f0b0dcb51d0>, <torchtext.data.example.Example object at 0x7f0b0dcb5ac8>]


In [0]:
import torch

cs = list(countries.keys())
cs.index('Italian')
torch.tensor([cs.index('Italian')], dtype=torch.long)

tensor([12])

In [0]:
ds = data.Dataset(examples=name_ctry_pairs, fields=train_val_fields, filter_pred=None)

In [0]:
trds, valds = ds.split(stratified=True, strata_field="country")

In [0]:
len(trds), len(valds)

(14053, 6021)

In [0]:
trds.examples[-1].country

17

In [0]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size

        self.i2h = nn.Linear(len(countries) + input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(len(countries) + input_size + hidden_size, output_size)
        self.o2o = nn.Linear(output_size + hidden_size, output_size)
        self.dropout = nn.Dropout(0.1)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, country, input, hidden):
        combined = torch.cat((country, input, hidden), 1)
        hidden, output = self.i2h(combined), self.i2o(combined)
        out_combined = torch.cat((hidden, output), 1)
        output = self.o2o(out_combined)
        output = self.dropout(output)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

n_hidden = 128
model = RNN(n_letters, n_hidden, n_letters)

In [0]:
import time
import torch.optim as optim

losses = []
loss_fn = nn.NLLLoss()

In [0]:
def lineToTensor(line):
    tensor = torch.zeros(len(line), 1, n_letters)
    for li, letter in enumerate(line):
      tensor[li][0][letter_to_idx(letter)] = 1
    return tensor

In [0]:
def categoryTensor(category):
    tensor = torch.zeros(1, len(countries))
    tensor[0][category] = 1
    return tensor

In [0]:
def generate_batch(batch):
  label_lst = []
  text_lst = []
  target_lst = []

  for entry in batch:
    label = categoryTensor(entry.country)
    label_lst.append(label)
    text = lineToTensor(entry.name)
    text_lst.append(text)
    target = [letter_to_idx(w) for w in entry.name]
    target_lst.append(target)

  return text_lst, label_lst, target_lst

In [0]:
from torch.utils.data import DataLoader
learning_rate = 0.01

def train_func(sub_train_):
  
  train_loss = 0

  batch_data = DataLoader(sub_train_, batch_size=32, shuffle=True,
                      collate_fn=generate_batch)

  # every batch
  for i, (text, cls, target) in enumerate(batch_data):

    # every example
    for t, c, tgt in zip(text, cls, target):
      model.zero_grad()

      hidden = model.initHidden()

      loss = 0

      for i in range(t.size()[0]-1):
        output, hidden = model(c, t[i], hidden)

        tgt_val = torch.tensor([tgt[i+1]])

        l = loss_fn(output, tgt_val)
        loss += l

      loss.backward()

      for p in model.parameters():
        p.data.add_(p.grad.data, alpha=-learning_rate)

      train_loss += loss.item() / len(t)

  train_loss = train_loss / len(sub_train_)

  return train_loss

In [0]:
import time
import torch.optim as optim

N_EPOCHS = 10
min_valid_loss = float('inf')

In [0]:
for epoch in range(N_EPOCHS):

  start_time = time.time()
  train_loss = train_func(trds)

  secs = int(time.time() - start_time)
  mins = secs / 60
  secs = secs % 60

  print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
  print(f'\tLoss: {train_loss:.4f}(train)')

Epoch: 1  | time in 0 minutes, 50 seconds
	Loss: 2.0336(train)
Epoch: 2  | time in 0 minutes, 50 seconds
	Loss: 2.0308(train)
Epoch: 3  | time in 0 minutes, 49 seconds
	Loss: 2.0311(train)
Epoch: 4  | time in 0 minutes, 49 seconds
	Loss: 2.0270(train)
Epoch: 5  | time in 0 minutes, 49 seconds
	Loss: 2.0297(train)
Epoch: 6  | time in 0 minutes, 54 seconds
	Loss: 2.0257(train)
Epoch: 7  | time in 0 minutes, 50 seconds
	Loss: 2.0276(train)
Epoch: 8  | time in 0 minutes, 50 seconds
	Loss: 2.0260(train)
Epoch: 9  | time in 0 minutes, 50 seconds
	Loss: 2.0273(train)
Epoch: 10  | time in 0 minutes, 50 seconds
	Loss: 2.0277(train)


In [0]:
start_char = 'B'
a = lineToTensor(start_char)
h = model.initHidden()
c = countries['Spanish']
c = categoryTensor(c)

full_name = [start_char]
while True:
  o, h = model(c, a[0], h)
  max_idx = o.argmax(1).item()
  if max_idx == len(vocab):
    break

  next_char = vocab[max_idx]
  full_name.append(next_char)
  
  a = lineToTensor(next_char)

print("".join(full_name))

Balos
