In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline



# Name generation using Recurrent neural networks

In this excersise we will be implementing a machine learning algorithm that generates english names. For this we will be using Recurrent neural networks

The framework that we are going to use is pytorch: [Pytorch](https://pytorch.org/docs/stable/nn.html)

We will be implementing character level model that predicts next character given first N characters. First, we will transform raw text names inot numeric data, then we will normalize it(add padding), then we will build our character-level model, and train it on the normalized input. After that we will try to generate names. 


Import libraries

In [2]:

# Import pytorch https://pytorch.org/docs/stable/index.html
import torch
import torch.nn.functional as F
# Import numpy https://docs.scipy.org/doc/numpy/dev/
import numpy as np
from random import sample

import time
import math

import pdb


In [3]:

# Util function to keep track of execution time
def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


In [4]:
print(torch.__version__)

1.0.1.post2


Download task data, it consists of a single file: names

In [7]:
import sys
sys.path.append('../')

import common.workshop

common.workshop.download_name_generation()

File names/names is already downloaded.


In [8]:
!ls ./

artistic_transfer.ipynb     names		  text_generation_pytorch.ipynb
charrnn_pytorch.ipynb	    NER-pytorch.ipynb
MNIST-classification.ipynb  TagsPrediction.ipynb


In [9]:

DATA_FILE = './names/names'


Implement a method that reads data from DATA_FILE into array of strings

In [10]:

# TODO: Implement

def read_names(filepath):
    names = []
    with open(DATA_FILE) as f:
        names = f.read().split('\n')[:-1]
    return names



In [11]:

names = read_names(DATA_FILE)

print(names[0:10])

assert(len(names) == 7944)


['Abagael', 'Abagail', 'Abbe', 'Abbey', 'Abbi', 'Abbie', 'Abby', 'Abigael', 'Abigail', 'Abigale']


### Explain overall approach

### Data preprocessing

in the following cells we will implement simple data processing, that will translate the text into the numeric values.

We will first construct the alphabet, then we will construct useful dictionaries and after that we will use one-hot-encoding technique to encode the input data


In [12]:
## define constants

MAX_VECTOR_LEN = 15
EOS_token = '#'


Implement function that constucts the alphabet. 
Don't forget to include EOS_token into the alphabet!

In [13]:

# TODO: Implement

def construct_alphabet(names):
    alphabet = set()
    for name in names:
        for token in name:
            alphabet.add(token)
    alphabet.add(EOS_token)
    return alphabet




In [14]:
# alphabet variable should be a set of all unique characters that are in the input data(names)
alphabet = construct_alphabet(names)
n_letters = len(alphabet)

assert(n_letters == 56)

The next step that we need to do is to construct two dictionaries:

    token2ixdx = token -> index - is a dictionary that maps character to the idividual integer
    
    idx2token = index -> token - is a dictionary that maps integer back into the character
    

In [15]:

# TODO: implement

def construct_data_dictionaries(alphabet):
    token2idx = {}
    idx2token = {}
    for ind,token in enumerate(alphabet):
        token2idx[token] = ind
        idx2token[ind] = token
    return token2idx, idx2token


In [16]:

token2idx, idx2token = construct_data_dictionaries(alphabet)

assert(len(token2idx) == len(idx2token))

assert(len(token2idx) == 56)

print(token2idx)



{'v': 0, 'P': 1, 'G': 2, 't': 3, 'A': 4, 'L': 5, 'k': 6, 'l': 7, 'o': 8, 'M': 9, 'w': 10, 'O': 11, 'U': 12, 'R': 13, 'b': 14, 'X': 15, 'S': 16, 'y': 17, 'z': 18, 'J': 19, 'c': 20, 'e': 21, 'f': 22, 'x': 23, 'E': 24, "'": 25, 'I': 26, 'W': 27, 'F': 28, 'u': 29, 'g': 30, 'N': 31, 'Q': 32, 'r': 33, 'h': 34, 'a': 35, 'D': 36, 'B': 37, 's': 38, 'i': 39, 'j': 40, ' ': 41, 'V': 42, '#': 43, 'q': 44, 'n': 45, 'd': 46, 'H': 47, 'm': 48, 'T': 49, 'Y': 50, '-': 51, 'p': 52, 'Z': 53, 'C': 54, 'K': 55}


### One hot encoding.

In the following section we will implement one hot encoding. The best way to explain one hot encoding is via example:

Input:

    dictionary = {'t': 0, 'e': 1, 's': 2}
    
    input = 'test'
    
    output = [
              [1, 0, 0], # t - has index 0
              [0, 1, 0], # e - has index 1
              [0, 0, 1], # s - has index 2
              [1, 0, 0]  # t - has index 0
             ]
             
As can be seen, one hot encoding accepts dictionary and input_string as input parameters and returns the matrix, where each row has size of the dictionary with only one index that has value 0 and all other indexes have value zero.


In the cell below you need to implement the one_hot_encode method, given line it contructs the matrix in the form above.

use token2idx as dictionary, 

use n_letters as size of your vector



In [17]:

# Implement

def one_hot_encode(line):
    matrix = np.zeros((len(line), n_letters))
    for li in range(len(line)):
        letter = line[li]
        matrix[li][token2idx[letter]] = 1
    return matrix



In [18]:

encoded = one_hot_encode('test')

print(encoded)

assert(encoded.shape == (4, n_letters))


[[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0.]]


The following functions are convenience functions for generating input data.

get_target_vector returns the target output, which is shifted by one the input name with EOS token at the end.

    EOS_TOKEN = '#'

    dictionary = {'m': 0, 'a': 1, 'c': 2, 'h': 3, 'i': 4, 'n': 5, 'e': 6 , '#': 7}

    input = machine
    
    output = [1,2,3,4,5,6,7]

In [19]:

def get_input_vector(line):
    return one_hot_encode(line)

def get_target_vector(line):
    letter_indexes = [token2idx[letter] for letter in line[1:]]
    letter_indexes.append(token2idx[EOS_token]) # EOS
    return np.array(letter_indexes)


In [20]:

get_target_vector('line')


array([39, 45, 21, 43])

Next thing that we need to do is to implement a function that given list of names, picks *batch_size* names at random

In [21]:

# Implement, use sample function from random package

def sample_names(batch_size=32):
    return sample(names, batch_size)


In [22]:

sample_names(10)


['Martino',
 'Errol',
 'Chloris',
 'Foster',
 'Waiter',
 'Agamemnon',
 'Lucy',
 'Auroora',
 'Flora',
 'Lilllie']

So far we defined out input and output parameters. Each train sample consists of a pair: Name, Name shifted by one. 
Using function above we will transform it to the format, understandable to neural networks. 

Example:

    dictionary: {'m': 0, 'a': 1, 'c': 2, 'h': 3, 'i': 4, 'n': 5, 'e': 6 , '#': 7, 'z': 8}
    
    input:
        x: machine - input 
        y: achine# - what we need to predict
    
    
    transformed input: (get_input_vector)
    
        x: [
            [1, 0, 0, 0, 0, 0, 0, 0],  m
            [0, 1, 0, 0, 0, 0, 0, 0],  a
            [0, 0, 1, 0, 0, 0, 0, 0],  c
            [0, 0, 0, 1, 0, 0, 0, 0],  h
            [0, 0, 0, 0, 1, 0, 0, 0],  i
            [0, 0, 0, 0, 0, 1, 0, 0],  n
            [0, 0, 0, 0, 0, 0, 1, 0]   e   
        ]
        
       (get_target_vector)
       
       y: [1,2,3,4,5,6,7] achine#

Helper functions that transform numpy arrays into pytorch tensors

In [23]:

def numpy_to_tensor(numpy_matrix):
    return torch.from_numpy(numpy_matrix)

def construct_input_tensor(line):
    input_matrix = get_input_vector(line)
    input_tensor = numpy_to_tensor(input_matrix)
    input_tensor = input_tensor.unsqueeze(dim = 1)
    input_tensor = input_tensor.type(torch.FloatTensor)
    return input_tensor

def construct_output_tensor(line):
    output_vector = get_target_vector(line)
    output_tensor = numpy_to_tensor(output_vector)
    output_tensor = output_tensor.type(torch.LongTensor)
    return output_tensor

def to_input_format(name):
    i_tensor = construct_input_tensor(name)
    o_tensor = construct_output_tensor(name)
    
    return i_tensor, o_tensor

    
def random_sample():
    name = sample_names(1)[0]
    return to_input_format(name)
    


## Brief Intro to RNNs

In the cell below is described the simple recursive neural network implemented in Pytorch.

The recurrent network is the extension of the linear network, developed to solve problems that involve sequential data(e.g. text). The brief schema is presented below:

![title](../images/rnn1.png)

In the picture above the red rectangle represents the input parameters, the green rectangle represents the interanls of RNN(hidden layer) and the blue rectangle represents the output parameters. 

Lest walk through the formulas to understand the RNN:

![title](../images/rnn1_ht.png)

The formula above combines the input data on timestamp(t) with the state of the hidden outputs on timestamp(t-1) and applies nonlinearity(RELU function). RELU is a [rectified linear unit](https://medium.com/tinymind/a-practical-guide-to-relu-b83ca804f1f7) function which makes sure that the RNN is not a linear combination of input vectors.

The second part is to produce outputs(y) on each timestamp. It is done via:

![title](../images/rnn1_y.png)

Since RNN is operates over time series data, during execution it unfolds into:

![title](../images/rnn_unfolded.png)


In our case, on each timestamp we output a vector of length(alphabet), which corresponds to the scores of each letter in alphabet. Note: In order to find out the probability distributions of the letters, we need to apply *softmax* operation.[Softmax](https://medium.com/data-science-bootcamp/understand-the-softmax-function-in-minutes-f3a59641e86d)

It is much clear with the example. 

    dictionary: {'t':0, 'e': 1, 's': 2, '#': 3}
    
    input:  test
    target: est#
    
    input_one_hot_encoded: [[1,0,0,0],[0,1,0,0],[0,0,1,0],[1,0,0,0]]
    target_encoded: [[1,2,0,3]]
    
    hidden_size: 4 , hidden_vector: [0,0,0,0]
    
    timestamp1:
    
    input: [input_one_hot_encoded, hidden_vector] 
    output: [0.1,-0.3, 0.8, -0.1]
    next_hidden: [0.3,-0.4, 0.1, -0.7]


Additional resources:

[IntroToRNN](https://medium.com/explore-artificial-intelligence/an-introduction-to-recurrent-neural-networks-72c97bf0912)


### Build simple RNN using pytorch


The cell below defines the simple RNN. This RNN has two linear layers and the nonlinearity function(RELU).

The schema is presented below:

![title](../images/char_rnn_model.png)

The single run of the forward function corresponds to a single pass through this schema. 

As a result, for name: 'Alex' we would need to run this function four times(each time with a new letter).


In [35]:

import torch.nn as nn

class SimpleRNN(torch.nn.Module):
    def __init__(self, n_input, n_hidden, n_output):
        super(SimpleRNN, self).__init__()
        # size of the hidden layer
        self.n_hidden = n_hidden
        # define linear layer, that will contain Weight matrix of ( n_input + n_hidden x n_hidden )
        self.input2hidden_layer = nn.Linear(n_input + n_hidden, n_hidden)
        # define linear layer that will produce outputs, the dimension is (h_hidden x n_output)
        self.hidden2output_layer = nn.Linear(n_hidden, n_output)
        # define nonlinearity function
        self.relu = nn.ReLU()

    def forward(self, input, hidden):
        # Combine input vector(step t) and hidden vector(step t-1)
        combined = torch.cat((input, hidden), 1)
        # execute W*combined + b 
        next_hidden = self.input2hidden_layer(combined)
        # apply nonlinearity
        next_hidden = self.relu(next_hidden)
        # produce output via W*next_hidden + b
        output = self.hidden2output_layer(next_hidden)
        # return hidden at time(t) and output at time(t)
        return next_hidden, output
    
    def initHidden(self):
        return torch.zeros(1, self.n_hidden)



You can try to play with the network defined above.

Try to specify different hidden_size and see what happens.

Print input and output parameters. Try to understand why do they look like that.

In [36]:
hidden_size = 128

rnn = SimpleRNN(n_letters, hidden_size, n_letters)

hidden = rnn.initHidden()

input_tensor, output_tensor = random_sample()

next_hidden, output = rnn(input_tensor[0], hidden)

print(next_hidden.shape)


torch.Size([1, 128])


Here, we define the loss function. [CrossEntropyLoss](https://pytorch.org/docs/stable/nn.html)

In general, loss functions correspond to the algorithm that define how your network is trained.


Try to play with the parameters and try to understand what are the input and output parameters of the loss_function

use pdf.set_trace for debugging

In [26]:


loss_function = nn.CrossEntropyLoss()

input_tensor, target_tensor = random_sample()

# 
target_tensor = target_tensor.unsqueeze_(-1)

next_hidden, output_layer = rnn(input_tensor[0], rnn.initHidden())

loss_function(output_layer, target_tensor[0])



tensor(3.9095, grad_fn=<NllLossBackward>)

Below is the initialisation of the rnn that we will be using for training.

In order to understand better, change the learning rate and hidden_size

In [27]:
hidden_size = 128 

loss_function = nn.CrossEntropyLoss()
rnn = SimpleRNN(n_letters, hidden_size, n_letters)
learning_rate = 0.005

In [28]:


def train_on_random_sample():
    # pick a random sample
    input_tensor, target_tensor = random_sample()
    # since the target_tensor is returned in the format [1,2,3,4], we transfor it to [[1,2,3,4]]
    target_tensor = target_tensor.unsqueeze_(-1)
    # init hidden layer
    hidden = rnn.initHidden()
    # the size of our sequence, e.g. if sample is Alex, the size is four
    nletters = input_tensor.size(0)

    # Since pytorch stores computed derivatives in local parameters, we need to manually reset them befor each train loop 
    rnn.zero_grad()
    # total loss

    loss = 0
    
    # iterate through each letter
    for i in range(nletters):
        # execute single rnn loop
        next_hidden, output = rnn(input_tensor[i], hidden)
        # compute loss function
        l = loss_function(output, target_tensor[i])
        # replace hidden parameters from step t-1 to step t
        hidden = next_hidden
        loss+=l
        
    loss.backward()
    
    for p in rnn.parameters():
        p.data.add_(-learning_rate, p.grad.data)
    
    return output, loss.item()


In [37]:

# we will execute this amount of iterations
n_iters = 10000
# every 5k iterations we will print the result
print_every = 5000
# we will modify our loss array every 500 iterations
plot_every = 500
# loss array
all_losses = []
total_loss = 0 # Reset every plot_every iters

start = time.time()

for ind in range(1, n_iters + 1):
    output, loss = train_on_random_sample()
    total_loss += loss

    if ind % print_every == 0:
        print('%s (%d %d%%) loss: %.4f, total_loss: %.4f' % (timeSince(start), ind, ind / n_iters * 100, loss, total_loss))

    if ind % plot_every == 0:
        all_losses.append(total_loss / plot_every)
        total_loss = 0



0m 12s (5000 50%) loss: 8.7410, total_loss: 6827.9772
0m 25s (10000 100%) loss: 10.2750, total_loss: 6635.5179


### Plot loss

In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

plt.figure()
plt.plot(all_losses)


Lest print some outputs!

Two functions below define the ability to generate names, try to run them!

In [None]:

max_length = 15
def sample_from_nn(start_letter='A'):
    with torch.no_grad():  # no need to track history in sampling
        input = construct_input_tensor(start_letter)
        hidden = rnn.initHidden()

        output_name = start_letter

        for i in range(max_length):
            hidden, output = rnn(input[0], hidden)
            topv, topi = output.topk(1)
            topi = topi[0][0]
            if topi == n_letters - 1:
                break
            else:
                letter = idx2token[topi.item()]
                output_name += letter
            input = construct_input_tensor(letter)
        return output_name

import pdb
    
def sample_from_nn_distr(start_letter='A'):
    with torch.no_grad():  # no need to track history in sampling
        input = construct_input_tensor(start_letter)
        hidden = rnn.initHidden()

        output_name = start_letter

        for i in range(max_length):
            hidden, output = rnn(input[0], hidden)
#             pdb.set_trace()
            output = F.softmax(output, dim = 1)
            token_id = torch.multinomial(output[-1], 1).item()
            letter = idx2token[token_id]
            output_name += letter    
            input = construct_input_tensor(start_letter)

        return output_name



In [None]:

sample_from_nn_distr('C')