In [1]:
import glob
import os
import codecs
import string

categories = []
data_dict = {}

for country in glob.glob('train/*.txt'):
    categories += [os.path.splitext(os.path.basename(country))[0]]
    file = codecs.open(country, "r", encoding='utf-8', errors='ignore')
    cities = file.read().strip().split('\n')
    data_dict.update({city:categories[-1] for city in cities})

import numpy as np
letters = "".join(np.unique([a for c in data_dict.keys() for a in c]))
print(letters)
n_letters = len(letters)
print(n_letters)
n_categories = len(categories)
print(n_categories)

 "&'()-./0123456789`abcdefghijklmnopqrstuvwxyz
46
9


In [22]:
np.unique([a for a in "".join([c for c in data_dict.keys()])])


array([' ', '"', '&', "'", '(', ')', '-', '.', '/', '0', '1', '2', '3',
       '4', '5', '6', '7', '8', '9', '`', 'a', 'b', 'c', 'd', 'e', 'f',
       'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's',
       't', 'u', 'v', 'w', 'x', 'y', 'z'], dtype='<U1')

In [2]:
import torch

def letter2tensor(letter):
    tensor = torch.zeros(n_letters)
    tensor[letters.find(letter)]=1
    return tensor
# print(letter2tensor('j'))
def word2tensor(word):
    tensor = [letter2tensor(letter).reshape(1, -1) for letter in word]
    tensor = torch.cat(tensor, 0)
    return tensor
# word2tensor('abcd')

In [14]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size

        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=0)

    def forward(self, input, hidden):
        print(input, hidden)
        combined = torch.cat((input, hidden), 0)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

n_hidden = 128
rnn = RNN(n_letters, n_hidden, n_categories)

In [15]:
input = word2tensor('albert')
hidden = torch.zeros(n_hidden)

output, next_hidden = rnn(input[0], hidden)
print(output)

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]) tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.])
tensor([-2.1697, -2.2948, -2.1505, -2.1618, -2.2481, -2.1742, -2.1969, -2.1358,
        -2.2551], grad_fn=<LogSoftmaxBackward>)
