In [84]:
from __future__ import unicode_literals, division
from io import open
import glob
import os
import string
import unicodedata

*** Our Data ***
We are going to train a character-level RNN on lists of baby names so that it can generate new names in a given language

In [85]:
# Vocabulary the model works with: ASCII letters plus the punctuation found in names.
all_letters = "".join([string.ascii_letters, ".,;'-"])
# One extra slot (the last index) is reserved for the end-of-sequence marker.
n_letters = 1 + len(all_letters)

In [86]:
# gives you the list of files in a dir that match a shell-style glob pattern (not a regex)
def find_files(path):
    """Return every path matching the glob pattern `path`, sorted alphabetically.

    `glob.glob` yields matches in an arbitrary, platform-dependent order.
    Sorting makes the result reproducible, which matters here because the
    position of a file in this list decides the index of its category.

    Args:
        path: a glob pattern such as "data/names/*.txt".

    Returns:
        A sorted list of matching paths (empty when nothing matches).
    """
    return sorted(glob.glob(path))

filenames = find_files("data/names/*.txt")

In [87]:
# Turn a Unicode string to plain ASCII: http://stackoverflow.com/a/518232/2809427
def unicode_to_ascii(s):
    """Strip accents from `s` and drop every character that is not in `all_letters`.

    The string is decomposed (NFD) so that accents become separate combining
    marks (category 'Mn'); those marks are discarded along with any remaining
    character outside the vocabulary.
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) == 'Mn':
            # combining accent produced by the decomposition
            continue
        if ch not in all_letters:
            continue
        kept.append(ch)
    return ''.join(kept)

In [88]:
# get all lines of a file as a list of ascii_strings
def read_lines(filename):
    """Read `filename` and return its lines converted with `unicode_to_ascii`.

    The file is decoded as UTF-8 explicitly (instead of relying on the
    locale's default encoding) and is closed as soon as it has been read.
    Leading and trailing whitespace of the whole file is stripped before it is
    split on newlines.

    Args:
        filename: path of a text file with one name per line.

    Returns:
        A list with one converted string per line.
    """
    with open(filename, encoding='utf-8') as name_file:
        lines = name_file.read().strip().split('\n')
    return [unicode_to_ascii(line) for line in lines]

# Preview: display the converted names of the first file in `filenames`
read_lines(filenames[0])

[u'Khoury',
 u'Nahas',
 u'Daher',
 u'Gerges',
 u'Nazari',
 u'Maalouf',
 u'Gerges',
 u'Naifeh',
 u'Guirguis',
 u'Baba',
 u'Sabbagh',
 u'Attia',
 u'Tahan',
 u'Haddad',
 u'Aswad',
 u'Najjar',
 u'Dagher',
 u'Maloof',
 u'Isa',
 u'Asghar',
 u'Nader',
 u'Gaber',
 u'Abboud',
 u'Maalouf',
 u'Zogby',
 u'Srour',
 u'Bahar',
 u'Mustafa',
 u'Hanania',
 u'Daher',
 u'Tuma',
 u'Nahas',
 u'Saliba',
 u'Shamoon',
 u'Handal',
 u'Baba',
 u'Amari',
 u'Bahar',
 u'Atiyeh',
 u'Said',
 u'Khouri',
 u'Tahan',
 u'Baba',
 u'Mustafa',
 u'Guirguis',
 u'Sleiman',
 u'Seif',
 u'Dagher',
 u'Bahar',
 u'Gaber',
 u'Harb',
 u'Seif',
 u'Asker',
 u'Nader',
 u'Antar',
 u'Awad',
 u'Srour',
 u'Shadid',
 u'Hajjar',
 u'Hanania',
 u'Kalb',
 u'Shadid',
 u'Bazzi',
 u'Mustafa',
 u'Masih',
 u'Ghanem',
 u'Haddad',
 u'Isa',
 u'Antoun',
 u'Sarraf',
 u'Sleiman',
 u'Dagher',
 u'Najjar',
 u'Malouf',
 u'Nahas',
 u'Naser',
 u'Saliba',
 u'Shamon',
 u'Malouf',
 u'Kalb',
 u'Daher',
 u'Maalouf',
 u'Wasem',
 u'Kanaan',
 u'Naifeh',
 u'Boutros',
 u'Mog

Building categories so that we can train each language separately

In [89]:
# category_lines dictionary
# key: language
# value: list of names in that language
category_lines = {}
all_categories = []

for filename in find_files('data/names/*.txt'):
    # The category is the file name without its extension. basename() copes
    # with whichever path separator the platform's glob returns, whereas
    # splitting on '/' only works for POSIX-style paths.
    category = os.path.basename(filename).split('.')[0]
    all_categories.append(category)
    lines = read_lines(filename)
    category_lines[category] = lines

# number of languages; used as the width of the category one-hot vector
n_categories = len(all_categories)
all_categories[0]
category_lines['Czech'][0]

u'Abl'

*** Data Representation ***
We need a representation of characters that a network can understand, and one that lets it measure the error between a prediction and the correct answer.

How do you know how to update things when you are wrong? And if you are wrong, what is the difference between predicting q vs. s when you should have predicted t?

To do this we will use a "one-hot encoding" of letters. A one-hot vector is a 1 x n_letters vector that is filled with 0s except for a 1 at the index of the represented value. 

In [90]:
import torch

In [91]:
def get_index(letter):
    """Return the position of `letter` in `all_letters` (-1 when it is not present)."""
    position = all_letters.find(letter)
    return position

def letter_to_vec(letter):
    """Build the 1 x n_letters one-hot row vector for a single letter.

    A letter unknown to `get_index` yields index -1, i.e. the last slot is set.
    """
    one_hot = torch.zeros(1, n_letters)
    row = one_hot[0]  # view into the only row; writing to it updates one_hot
    row[get_index(letter)] = 1
    return one_hot

def sen_to_tensor(line):
    """Encode `line` as a stack of one-hot letter vectors.

    Args:
        line: the string to encode.

    Returns:
        A float tensor of shape (len(line), 1, n_letters) in which entry
        [pos][0][k] is 1 when the letter at position `pos` has index `k`
        according to `get_index`, and 0 otherwise.

    The tensor is indexed as [position][0][letter index], so the first axis
    has to span the positions and the last axis the vocabulary. Allocating it
    the other way round raises IndexError for any letter whose index is not
    smaller than the length of the line.
    """
    tensor = torch.zeros(len(line), 1, n_letters)
    for pos, letter in enumerate(line):
        # get_index returns -1 for unknown characters, which selects the last slot
        tensor[pos][0][get_index(letter)] = 1
    return tensor


# Preview: take the first 10 entries along the first and last axes and transpose them for display
sen_to_tensor("abcdefg hijk")[:10,0,:10].transpose(0,1)


    1     0     0     0     0     0     0     0     0     0
    0     1     0     0     0     0     0     0     0     0
    0     0     1     0     0     0     0     0     0     0
    0     0     0     1     0     0     0     0     0     0
    0     0     0     0     1     0     0     0     0     0
    0     0     0     0     0     1     0     0     0     0
    0     0     0     0     0     0     1     0     0     0
    0     0     0     0     0     0     0     0     1     0
    0     0     0     0     0     0     0     0     0     1
    0     0     0     0     0     0     0     0     0     0
[torch.FloatTensor of size 10x10]

*** So what are we doing anyway? ***

char-rnn's: our goal here is to generate text through a char-rnn, but what does that really mean? 

On a very high level we're just going to throw a huge mass of text (represented as vectors at each step) at a model and then ask it to predict the probability distribution of the next character given the characters it has predicted so far. This will let us predict words one char at a time.

So where do we start?


# Network Arch

Well, let's "mathematically" define the inputs and outputs.

## input
To predict the next character, conditional on the language we are in and based on the characters we've predicted so far, our true input needs three things:

Category: the language we are generating in (1-hot vector)

Input: the last char we predicted

Hidden: a vector that is some latent representation of our current state

## hidden layer(s)

From the input layer we apply essentially just two independent linear layers to the input vector.

One to determine the probabilities of the next char for the output, and a separate one to update the state.

## output layer
We next combine the hidden state and the probabilities of the next letter into a single vector, run that through another linear layer, apply dropout to regularize / prevent over-fitting, and then finally, use a softmax layer to pick the most likely next letter from the predicted probabilities.  

Here is the arch visually:

![alt text][logo]

[logo]: network_arch.png

### lets see what that looks like in code!

In [92]:
import torch
import torch.nn as nn
from torch.autograd import Variable

In [93]:
class RNN(nn.Module):
    """Single-step recurrent cell conditioned on a category vector.

    Each call to `forward` consumes one letter vector together with the
    category vector and the previous hidden state, and produces the
    log-softmax of the output scores plus the next hidden state.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size

        # Width of the concatenated input: category vector + letter vector + hidden state.
        # n_categories is read from module scope when the network is constructed.
        combined_width = n_categories + input_size + hidden_size
        self.input_layer_size = combined_width

        # combined input -> next hidden state
        self.i2h = nn.Linear(combined_width, hidden_size)

        # combined input -> intermediate output
        self.i2o = nn.Linear(combined_width, output_size)

        # [next hidden state, intermediate output] -> final output scores
        self.o2o = nn.Linear(hidden_size + output_size, output_size)

        self.dropout = nn.Dropout(0.1)
        self.softmax = nn.LogSoftmax()

    def forward(self, category, input_vec, hidden_vec):
        """Run one time step and return `(output, next_hidden)`."""
        # join the three inputs side by side (along dimension 1)
        joined = torch.cat((category, input_vec, hidden_vec), 1)

        next_hidden = self.i2h(joined)
        intermediate = self.i2o(joined)

        scores = self.o2o(torch.cat((next_hidden, intermediate), 1))
        log_probs = self.softmax(self.dropout(scores))

        return log_probs, next_hidden

    # when we initialize the network - the hidden state starts off blank
    def init_hidden(self):
        """Return an all-zero hidden state of shape (1, hidden_size)."""
        blank = torch.zeros(1, self.hidden_size)
        return Variable(blank)

## Some Model Assumptions

A common assumption in many machine learning models is that the training data is drawn iid from the population.

This is SELDOM true. For example, take a look at the first 10 french names we are training our name generator on:

1.  Abel
2.  Abraham
3.  Adam
4.  Albert
5.  Allard
6.  Archambault
7.  Armistead
8.  Arthur
9.  Augustin
10. Babineaux

Notice something? This isn't a situation unique to our problem: data is often stored in a format that maintains some sort of relationship between adjacent data points. This is exactly what we don't want, however. Therefore, we will write some data-loading functions that help us get a random line from a random category

In [94]:
import random

# pick random item from list l
def random_elem(l):
    """Return one element of `l`, chosen uniformly at random."""
    last = len(l) - 1
    chosen = random.randint(0, last)  # randint includes both end points
    return l[chosen]

def random_cat_and_line():
    """Pick a random category and a random line belonging to that category.

    Returns:
        A `(category, line)` tuple where `category` comes from
        `all_categories` and `line` from `category_lines[category]`.
    """
    cat = random_elem(all_categories)
    line = random_elem(category_lines[cat])
    # Return the category that was actually sampled. The module-level name
    # `category` is a leftover of the data-loading loop and always holds the
    # last category that was read.
    return cat, line

## converting to tensors
Recall that the network operates on tensors - not characters. Therefore we need to convert the category into its one-hot representation.

We will also convert the line (name) into a matrix where each element of the matrix is the one-hot vector of the corresponding letter in the sentence

In [141]:
def get_category_tensor(category):
    """Return the 1 x n_categories one-hot tensor for `category`.

    The position of the 1 is the index of `category` in `all_categories`;
    an unknown category raises ValueError (from `list.index`).
    """
    one_hot = torch.zeros(1, n_categories)
    one_hot[0][all_categories.index(category)] = 1
    return one_hot

def sample_tensor(word):
    """Encode `word` as a (len(word), 1, n_letters) tensor of one-hot letter vectors."""
    encoded = torch.zeros(len(word), 1, n_letters)
    for position, letter in enumerate(word):
        # NOTE(review): find() returns -1 for a letter that is not in
        # all_letters, which sets the last slot of the row - worth double-checking.
        encoded[position][0][all_letters.find(letter)] = 1
    return encoded

# LongTensor of second letter to end (EOS) for target
def get_target_tensor(line):
    """Return the target indices for `line`: every letter after the first, then EOS.

    Letters are looked up with `all_letters.find`, so a character outside the
    vocabulary contributes -1. The final entry is always `n_letters - 1`.
    """
    targets = [all_letters.find(letter) for letter in line[1:]]
    targets.append(n_letters - 1) # EOS
    return torch.LongTensor(targets)

# Preview: target indices for a sample string (the space is not in all_letters)
get_target_tensor("abcdefg hijk")


  1
  2
  3
  4
  5
  6
 -1
  7
  8
  9
 10
 57
[torch.LongTensor of size 12]

In [96]:
# Tying it all together, this function
# picks a random word from a random category,
# converts the word and its category to their tensor representations,
# and generates the targets to compare the generated word to at each step
def random_train_example():
    """Return `(category_tensor, name_tensor, name_target_tensor)` for one random name.

    Each of the three tensors is wrapped in a `Variable`.
    """
    category, word = random_cat_and_line()
    return (
        Variable(get_category_tensor(category)),
        Variable(sample_tensor(word)),
        Variable(get_target_tensor(word)),
    )

In [145]:
# using pytorch dataset / dataloader instead
from torch.utils.data import Dataset, DataLoader

class NamesDataset(Dataset):
    """Dataset that exposes every name of every category file as one item.

    Items are ordered category by category (in the order `find_files`
    returned the files) and, within a category, in file order.

    Attributes:
        path: the glob pattern the files were loaded from.
        all_categories: category names, one per file.
        category_lines: dict mapping a category name to its list of names.
        n_categories: number of categories.
        n_names_per_category: number of names per category, aligned with
            `all_categories`.
    """

    def __init__(self, path):
        self.path = path

        # init all_categories and category_lines
        self.all_categories = []
        self.category_lines = {}

        for filename in find_files(path):
            # basename() copes with the platform's path separator, whereas
            # splitting on '/' only works for POSIX-style paths
            category = os.path.basename(filename).split('.')[0]
            self.all_categories.append(category)
            self.category_lines[category] = read_lines(filename)

        self.n_categories = len(self.all_categories)
        self.n_names_per_category = [len(self.category_lines[category]) for category in self.all_categories]

    def __len__(self):
        """Total number of names across all categories."""
        return sum(self.n_names_per_category)

    def __getitem__(self, index):
        """Return `(category_tensor, name_tensor, name_target_tensor)` for item `index`.

        Negative indices count from the end.

        Raises:
            IndexError: if `index` is outside the dataset.
        """
        total = len(self)
        if index < 0:
            index += total
        if not 0 <= index < total:
            raise IndexError("NamesDataset index out of range")

        category_index = 0
        name_index = index

        # find which category the name belongs in, as well as the index of that name within that category
        while name_index >= self.n_names_per_category[category_index]:
            name_index -= self.n_names_per_category[category_index]
            category_index += 1

        category = self.all_categories[category_index]
        # Read from this dataset's own table; the module-level category_lines
        # may have been built from a different path.
        word = self.category_lines[category][name_index]

        # convert to tensors
        # NOTE: get_category_tensor encodes against the module-level
        # all_categories, so that list must contain this dataset's categories.
        category_tensor = get_category_tensor(category)
        name_tensor = sample_tensor(word)
        name_target_tensor = get_target_tensor(word)

        return category_tensor, name_tensor, name_target_tensor

Note: I'm getting tired of writing explanations so if the quality of this degrades as we move forward blame it on 233 being a thing

# Training the network

So we have a recurrent network and a way to load examples into that network, how do we train it?

Pulling from karpathy's blog recall what we're trying to do. As a working example, suppose we only had a vocabulary of four possible letters “helo”, and wanted to train the RNN on the training sequence “hello”. 

This training sequence actually ends up feeding 4 separate training examples: 

1. The probability of “e” should be likely given the context of “h”, 
2. “l” should be likely in the context of “he”, 
3. “l” should also be likely given the context of “hel”, and finally 
4. “o” should be likely given the context of “hell”.


The high-level version of training is to ask the network, at every timestep, to predict what it expects the next letter to be given the context so far. Then, if it mis-predicts, we can propagate the error between what it predicted and what it should have predicted back through the network using back-prop.

Through back-prop we can figure out in what direction we should adjust each weight in the network to increase the scores of the correct targets. Using that we perform a parameter update, which nudges every weight a tiny amount in this gradient direction. 

***Note: for a great explanation of backprop check out:*** http://neuralnetworksanddeeplearning.com/chap2.html

In [146]:
# negative log-likelihood loss, applied to the network's log-softmax output
criterion = nn.NLLLoss()

# step size of the manual gradient-descent update in train()
learning_rate = 0.0005
rnn = RNN(n_letters, 128, n_letters)
def train(category_tensor, sample_tensor, target_tensor):
    """Run one training step on a single name and update `rnn` in place.

    The name is fed through the network one letter at a time; the loss of
    every step is accumulated, back-propagated once, and each parameter is
    then moved against its gradient by `learning_rate`.

    Returns:
        `(output, loss)`: the network output of the last step and the
        accumulated loss divided by the number of targets.
    """
    hidden = rnn.init_hidden()
    rnn.zero_grad()

    loss = 0
    n_steps = sample_tensor.size()[0]
    for step in range(n_steps):
        output, hidden = rnn(category_tensor, sample_tensor[step], hidden)
        loss += criterion(output, target_tensor[step])

    loss.backward()

    # manual parameter update: p <- p - learning_rate * grad
    for param in rnn.parameters():
        param.data.add_(-learning_rate, param.grad.data)

    mean_loss = loss.data[0] / target_tensor.size()[0]
    return output, mean_loss

In [147]:
import time
import math

def timeSince(since):
    """Format the time elapsed since the timestamp `since` as '<minutes>m <seconds>s'."""
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return '%dm %ds' % (minutes, seconds)

In [149]:
# Start from a freshly initialised network (replaces any previously trained `rnn`)
rnn = RNN(n_letters, 128, n_letters)

names_dataset = NamesDataset(path="data/names/*.txt")

n_iters = 100000      # total number of training examples to process
print_every = 5000    # print a progress line every this many iterations
plot_every = 500      # record an averaged loss every this many iterations
all_losses = []       # averaged losses, one entry per plot_every iterations
total_loss = 0        # running sum of losses since the last recorded average

# NOTE: CANNOT HAVE A BATCH SIZE > 1 BECAUSE WORDS ARE OF DIFFERENT LENGTH, THEREFORE
# TENSORS WILL ALSO BE OF DIFFERENT DIMENSIONS AND CAN'T BE COMBINED
batch_size = 1

start = time.time()

# NOTE(review): `iter` shadows the Python builtin of the same name
iter = 0
while iter < n_iters:
    # A new DataLoader is created for every pass over the dataset.
    # will this crash / cause errors with batch size > 1 and workers > 1 once this gets hit for the second time?
    # don't know because can't test batching
    dataloader = DataLoader(names_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
    
    for i_batch, training_example in enumerate(dataloader):
        iter += 1
        
        # stop in the middle of a pass once n_iters examples have been processed
        if iter > n_iters:
            break
            
        category_tensor, name_tensor, name_target_tensor = training_example
        
        # dataloader adds an extra dimension because of batches
        for i in range(batch_size):
            output, loss = train(Variable(category_tensor[i]), Variable(name_tensor[i]), Variable(name_target_tensor[i]))
            total_loss += loss

        # the printed loss is that of the most recent example only
        if iter % print_every == 0:
            print('%s (%d %d%%) %.4f' % (timeSince(start), iter, iter / n_iters * 100, loss))

        # store the mean loss of the last plot_every iterations and reset the running sum
        if iter % plot_every == 0:
            all_losses.append(total_loss / plot_every)
            total_loss = 0

1m 17s (5000 5%) 3.3682
2m 32s (10000 10%) 2.1872
3m 46s (15000 15%) 2.8471
4m 57s (20000 20%) 2.0927
6m 3s (25000 25%) 2.7306
7m 11s (30000 30%) 2.3500
8m 18s (35000 35%) 2.0308
9m 26s (40000 40%) 2.6501
10m 36s (45000 45%) 2.5331
11m 45s (50000 50%) 3.7872
12m 54s (55000 55%) 2.0677
14m 6s (60000 60%) 2.1007
15m 18s (65000 65%) 1.6486
16m 21s (70000 70%) 2.6857
17m 32s (75000 75%) 3.7048
18m 42s (80000 80%) 1.6884
19m 54s (85000 85%) 2.2310
21m 4s (90000 90%) 1.6593
22m 8s (95000 95%) 1.8860
23m 11s (100000 100%) 2.1244


In [None]:
# upper bound on the number of letters generated after the starting letter
max_length = 20

# Sample from a category and starting letter
def sample(category, start_letter='A'):
    """Generate one name for `category` that begins with `start_letter`.

    Starting from `start_letter`, the network's highest-scoring next letter is
    appended and fed back in, until the end-of-sequence index
    (`n_letters - 1`) is predicted or `max_length` letters have been added.

    Dropout only makes sense while training, so the network is switched to
    evaluation mode for the duration of the generation and put back into
    training mode afterwards if that is the mode it was in.

    Args:
        category: a category name understood by `get_category_tensor`.
        start_letter: the first letter of the generated name.

    Returns:
        The generated name as a string (always starts with `start_letter`).
    """
    category_tensor = Variable(get_category_tensor(category))
    letter_input = Variable(sample_tensor(start_letter))
    hidden = rnn.init_hidden()

    output_name = start_letter

    was_training = rnn.training
    rnn.eval()
    try:
        for _ in range(max_length):
            output, hidden = rnn(category_tensor, letter_input[0], hidden)
            _, top_indices = output.data.topk(1)
            # plain int so it can be compared and used as a string index
            top_index = int(top_indices[0][0])
            if top_index == n_letters - 1:
                break
            letter = all_letters[top_index]
            output_name += letter
            letter_input = Variable(sample_tensor(letter))
    finally:
        if was_training:
            rnn.train()

    return output_name

# Get multiple samples from one category and multiple starting letters
def samples(category, start_letters='ABC'):
    """Print one generated name per character of `start_letters`."""
    # lazy generator: each name is generated right before it is printed
    generated = (sample(category, first) for first in start_letters)
    for name in generated:
        print(name)

# Print three generated names per language, one for each given starting letter
samples('Russian', 'RUS')

samples('German', 'GER')

samples('Spanish', 'SPA')

samples('English', 'ENG')