#### Agenda - Given a sequence of previous characters, model the probability distribution of the next character in the sequence. 

Here we use the text of a Harry Potter book as the training corpus.

We will be using Keras for this notebook.

In [1]:
from __future__ import print_function

import numpy as np
import random
import sys

In [2]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file

Using TensorFlow backend.


# Get data

In [3]:
# Load the corpus as one lower-cased string.
path = "./../data/harry_potter_3.txt"

# Read inside a context manager so the file handle is closed as soon as the
# text is in memory instead of being left for the garbage collector.
with open(path) as corpus_file:
    # Lower-casing folds upper- and lower-case letters into a single symbol.
    text = corpus_file.read().lower()

print('corpus length:', len(text))

corpus length: 626260


In [4]:
# Vocabulary: every distinct character in the corpus, in sorted order so that
# the character <-> index mapping is stable between runs.
total_chars = sorted(set(text))
print('total chars:', len(total_chars))


total chars: 54


In [5]:
# Lookup tables between characters and their integer IDs (and vice versa).
# The ID of a character is its position in the sorted `total_chars` list.

char_indices = {c: i for i, c in enumerate(total_chars)}
indices_char = dict(enumerate(total_chars))


In [6]:
# Slice the text into overlapping (semi-redundant) windows of `maxlen` characters.

maxlen = 40 # length of data window
step = 1    # step by which to shift the data window

# Start offset of every window; the last window must leave one character
# after it to serve as the prediction target.
starts = range(0, len(text) - maxlen, step)

# text[s : s + maxlen] is the input (X); text[s + maxlen] is its target (y)
sentences = [text[s: s + maxlen] for s in starts]
next_chars = [text[s + maxlen] for s in starts]

num_of_sentences = len(sentences)

print('Number of datapoints/sequences:', num_of_sentences)

Number of datapoints/sequences: 626220


In [7]:
# Vectorize the input: one-hot encode every window and its target character.

# X is a 3D tensor of shape (num_of_sentences, maxlen, len(total_chars))
# y is a 2D tensor of shape (num_of_sentences, len(total_chars))

print('Vectorization...')
# Use the builtin `bool` as dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
X = np.zeros((num_of_sentences, maxlen, len(total_chars)), dtype=bool)
y = np.zeros((num_of_sentences, len(total_chars)), dtype=bool)

# populate the tensors
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1    # character at position t of window i
    y[i, char_indices[next_chars[i]]] = 1  # character that follows window i

Vectorization...


# Model

In [8]:
# Build the model: one LSTM layer followed by a softmax classifier over characters.
print('Build model...')

model = Sequential([
    # Each training sample is one (maxlen, len(total_chars)) slice of X;
    # the LSTM has 128 units.
    LSTM(128, input_shape=(maxlen, len(total_chars))),
    # Project the LSTM output to one score per character in the vocabulary...
    Dense(len(total_chars)),
    # ...and normalise the scores into a probability distribution.
    Activation('softmax'),
])

optimizer = RMSprop(lr=0.01)
model.compile(optimizer=optimizer, loss='categorical_crossentropy')

Build model...


In [9]:
# `preds` is the model's output over the character vocabulary: the confidence
# the RNN currently assigns to each character coming next in the sequence.

# We re-weight those scores with a temperature, renormalise them into a
# probability distribution, and draw the next character from it at random.

# Temperature. We can also play with the temperature of the Softmax during sampling. Decreasing the temperature
# from 1 to some lower number (e.g. 0.5) makes the RNN more confident, but also more conservative in its samples.
# Conversely, higher temperatures will give more diversity but at cost of more mistakes (e.g. spelling mistakes,
# etc). In particular, setting temperature very near zero will give the most likely thing

def sample(preds, temperature=1.0):
    """Sample an index from a probability array after temperature scaling.

    Args:
        preds: 1-D array-like of non-negative scores/probabilities, one per
            candidate index.
        temperature: scaling applied to the log-probabilities. Values below 1
            sharpen the distribution, values above 1 flatten it. A value of
            exactly 0 is treated as the limiting case and returns the argmax.

    Returns:
        The sampled index (position in `preds`).
    """
    preds = np.asarray(preds).astype('float64')

    # Limit of temperature -> 0: all mass on the most likely entry. Handling it
    # explicitly avoids a division by zero below.
    if temperature == 0:
        return np.argmax(preds)

    # log(0) = -inf is the intended result for impossible entries (they end up
    # with probability 0), so silence the divide-by-zero warning.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature

    # Subtract the max before exponentiating. Softmax is unchanged by a
    # constant shift, but without it a small temperature makes every exp()
    # underflow to 0, yielding 0/0 = NaN and a ValueError from multinomial.
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds) # convert scores to prob via softmax

    # One multinomial trial gives a one-hot row marking the drawn index.
    probas = np.random.multinomial(1, preds, 1)

    return np.argmax(probas) # position of the 1 in the one-hot draw

In [None]:
# Alternate between one epoch of training and a round of text generation so
# the quality of the samples can be followed as training progresses.
for iteration in range(1, 60):
    print()
    print('-' * 50)
    print('Iteration', iteration)

    # One pass over the full dataset per outer iteration.
    model.fit(X, y, batch_size=128, epochs=1)

    # Pick one random seed window and reuse it for every temperature below,
    # so the samples differ only because of the diversity setting.
    start_index = random.randint(0, len(text) - maxlen - 1)
    seed_text = text[start_index: start_index + maxlen]

    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print()
        print('----- diversity:', diversity)

        sentence = seed_text   # sliding window fed to the model
        generated = sentence   # seed plus everything generated so far

        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)
        sys.stdout.write("\n")

        # Generate 400 characters, one at a time.
        for _ in range(400):
            # One-hot encode the current window as a batch of size 1.
            x = np.zeros((1, maxlen, len(total_chars)))
            for pos, ch in enumerate(sentence):
                x[0, pos, char_indices[ch]] = 1.

            # Next-character distribution predicted for this window.
            preds = model.predict(x, verbose=0)[0]

            # Draw a character index at the current temperature and decode it.
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            generated += next_char
            # Slide the window: drop the oldest character, append the new one.
            sentence = sentence[1:] + next_char

            # Stream the character immediately instead of waiting for the line.
            sys.stdout.write(next_char)
            sys.stdout.flush()

        print()


--------------------------------------------------
Iteration 1
Epoch 1/1

----- diversity: 0.2
----- Generating with seed: "ered up all his presents and his birthda"
ered up all his presents and his birthda
y and harry and her had to the firebolt and startly.

"what had the truth, it was the truth, harry and her had a black to the starting the class to the firebolt of the firebolt of the starting to the firebolt and the firebolt to the truth to the firebolt with a starting to harry and the stand and her had the truth and the class with his back of the starting to the truth to the class of the fireb

----- diversity: 0.5
----- Generating with seed: "ered up all his presents and his birthda"
ered up all his presents and his birthda
y.

"it was it, some was the dementors made him of the firebolt to one was
black had the momin to the started at harry and starting his firebolt about looking the other had your hand and course growisting the sitch for his unter a striughter to large snape to