Create character embeddings

Borrowed from https://github.com/fchollet/keras/blob/master/examples/lstm_text_generation.py



Goal: given a sequence of previous characters, model the probability distribution of the next character in the sequence.

In [1]:
from __future__ import print_function

import numpy as np
import random
import sys

In [2]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file

Using TensorFlow backend.


# Get data

In [3]:
# Fetch the Nietzsche corpus through Keras' get_file helper, which returns a local path.
path = get_file('nietzsche.txt', origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt')

# Read the whole corpus inside a context manager so the file handle is closed
# deterministically, then lower-case it to shrink the character vocabulary.
with open(path) as corpus_file:
    text = corpus_file.read().lower()
print('corpus length:', len(text))

corpus length: 600901


In [4]:
# Vocabulary: every distinct character in the corpus, in sorted order so that
# the character <-> index mapping is stable between runs.
chars = sorted(set(text))
print('total chars:', len(chars))


total chars: 59


In [5]:
# Lookup tables between characters and integer IDs, in both directions.

# character -> index into `chars`
char_indices = {c: i for i, c in enumerate(chars)}
# index into `chars` -> character
indices_char = dict(enumerate(chars))


In [6]:
# Slice the corpus into overlapping windows of `maxlen` characters, taking a
# new window every `step` characters; the target for each window is the single
# character that immediately follows it.
maxlen = 40
step = 3

window_starts = range(0, len(text) - maxlen, step)
sentences = [text[start: start + maxlen] for start in window_starts]   # input windows
next_chars = [text[start + maxlen] for start in window_starts]         # character after each window

print('nb sequences:', len(sentences))

nb sequences: 200287


In [7]:
# Vectorize the input: one-hot encode every window and its target character.
#   X[i, t, k] is True when character t of window i is chars[k]
#   y[i, k]    is True when the character following window i is chars[k]

print('Vectorization...')
# Use the builtin `bool` dtype: the `np.bool` alias was deprecated in NumPy 1.20
# and removed in 1.24, where it raises AttributeError.
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

Vectorization...


# Model

In [9]:
# Build the model: one LSTM layer over the one-hot character windows, followed
# by a dense projection to vocabulary size and a softmax over next characters.
print('Build model...')
model = Sequential([
    LSTM(128, input_shape=(maxlen, len(chars))),
    Dense(len(chars)),
    Activation('softmax'),
])

# Categorical cross-entropy matches the one-hot targets in `y`.
model.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=0.01))

Build model...


In [10]:
# `preds` is the confidence the RNN currently assigns to each character coming next in the sequence.

# We re-scale those scores into a probability distribution and sample the next character from it.

# Temperature. We can also play with the temperature of the Softmax during sampling. Decreasing the temperature 
# from 1 to some lower number (e.g. 0.5) makes the RNN more confident, but also more conservative in its samples. 
# Conversely, higher temperatures will give more diversity but at cost of more mistakes (e.g. spelling mistakes, 
# etc). In particular, setting temperature very near zero will give the most likely thing

def sample(preds, temperature=1.0):
    """Sample an index from a probability array, re-weighted by a temperature.

    Args:
        preds: 1-D array-like of non-negative scores, one per candidate index
            (e.g. the softmax output of the model).
        temperature: divisor applied to the log-scores before re-normalizing.
            Values below 1 sharpen the distribution and values above 1 flatten
            it. A temperature of exactly 0 returns the index of the largest
            score (greedy decoding).

    Returns:
        The sampled index into `preds`.
    """
    # float64 keeps the normalized probabilities within the tolerance that
    # np.random.multinomial enforces on their sum.
    preds = np.asarray(preds).astype('float64')

    if temperature == 0:
        # Limit of the tempered distribution; also avoids dividing by zero.
        return np.argmax(preds)

    # Zero-probability entries legitimately map to -inf; silence that warning.
    with np.errstate(divide='ignore'):
        scaled = np.log(preds) / temperature

    # Subtract the max before exponentiating so the largest term is exp(0) = 1.
    # Without this, small temperatures underflow every term to 0 and the
    # normalization below becomes 0 / 0 = NaN.
    scaled = scaled - np.max(scaled)
    exp_preds = np.exp(scaled)
    preds = exp_preds / np.sum(exp_preds)

    # Single multinomial draw; argmax recovers the index of the drawn category.
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [11]:
# Alternate between one epoch of training and a round of text generation, so
# the quality of the samples can be watched as training progresses.
for iteration in range(1, 60):
    print()
    print('-' * 50)
    print('Iteration', iteration)
    model.fit(X, y, batch_size=128, epochs=1)

    # One random seed window per iteration, shared by every diversity setting
    # so that the samples are directly comparable.
    start_index = random.randint(0, len(text) - maxlen - 1)

    for diversity in (0.2, 0.5, 1.0, 1.2):
        print()
        print('----- diversity:', diversity)

        # The seed is both the initial model input and the start of the output.
        sentence = text[start_index: start_index + maxlen]
        generated = '' + sentence

        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)

        for i in range(400):
            # One-hot encode the current window as a batch of size one.
            x = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x[0, t, char_indices[char]] = 1.

            # Next-character distribution predicted for this window.
            preds = model.predict(x, verbose=0)[0]

            # Draw a character at the current temperature.
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            # Record the character and slide the window forward by one.
            generated += next_char
            sentence = sentence[1:] + next_char

            # Stream the character out immediately rather than waiting for the newline.
            sys.stdout.write(next_char)
            sys.stdout.flush()

        print()


--------------------------------------------------
Iteration 1
Epoch 1/1

----- diversity: 0.2
----- Generating with seed: ")
an idea of the fact that philosophizin"
)
an idea of the fact that philosophizing in the earther to the say to the east that the say to the east to the enterity to deer to deer to the proposition of the propectionally to the say to the east to desire and the extrance of the say to the solities in the east is the say of the pricial to the south is to the say to the say to the say to the prepen that the south that the pricial to the extrance of the say and the every the south o

----- diversity: 0.5
----- Generating with seed: ")
an idea of the fact that philosophizin"
)
an idea of the fact that philosophizing in the spirit for that it of the pricitionally to ak the ents of man which it is all and which as sund as interthes seet caltent and entertainess is the internt philosophers is art is to desiral exceptionally he her moral sympathy is to the prised and the p

KeyboardInterrupt: 

$h_t = \tanh(U x_t + W h_{t-1})$

$o_t = \mathrm{softmax}(V h_t)$









