In [23]:
import nltk
import numpy as np
import os
import random
import sys

from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.optimizers import RMSprop

In [None]:
# Download NLTK model data (you need to do this once)
nltk.download("book")

In [25]:
corpora_dir = "/Users/Ajay/nltk_data/corpora/state_union"

# Read all file paths in corpora directory
file_list = []
for root, _ , files in os.walk(corpora_dir):  
    for filename in files:
        file_list.append(os.path.join(root, filename))
        
print("Read ", len(file_list), " files..." )

# Extract text from all documents
docs = []

for files in file_list:
    with open(files, 'r') as fin:
        try:
            str_form = fin.read().lower().replace('\n', '')
            docs.append(str_form)
        except UnicodeDecodeError: 
            # Some sentences have wierd characters. Ignore them for now
            pass
# Combine them all into a string of text
text = ' '.join(docs)

print('corpus length:', len(text))

Read  66  files...
corpus length: 1915949


In [43]:
chars = sorted(list(set(text)))
print('Total Number of Unique Characters:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars)) # Character to index
indices_char = dict((i, c) for i, c in enumerate(chars)) # Index to Character

Total Number of Unique Characters: 57


In [45]:
"""
Recommended to run this script on GPU, as recurrent
networks are quite computationally intensive.
If you try this script on new data, make sure your corpus
has at least ~100k characters. ~1M is better.
"""

# cut the text in semi-redundant sequences of maxlen characters
maxlen = 40 # Number of characters considered
step = 3 # Stide of our window
sentences = []
next_chars = []

# Rading the text in terms of sequence of characters
# Extract only 'maxlen' characters every time
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    # The character just after the sequence is the label
    next_chars.append(text[i + maxlen]) 
print('nb sequences:', len(sentences))

print('Vectorization...')
# Initializing Tensor (training data)
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool) 
# Initializing Output that holds next character (label)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool) 
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        # Populate Tensor Input
        x[i, t, char_indices[char]] = 1 
    # Populate y with the character just after the sequence
    y[i, char_indices[next_chars[i]]] = 1


def sample(preds, temperature=1.0):
    """Perform Temperature Sampling"""
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature 
    exp_preds = np.exp(preds)
    # Softmax of predictions
    preds = exp_preds / np.sum(exp_preds) 
    # Sample a single characters, with probabilities defined in `preds`
    probas = np.random.multinomial(1, preds, 1) 
    return np.argmax(probas)


def on_epoch_end(epoch, _):
    """Function invoked at end of each epoch. Prints generated text"""
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    start_index = random.randint(0, len(text) - maxlen - 1)
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('----- Diversity:', diversity)

        generated = ''
        sentence = text[start_index: start_index + maxlen]
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)

        for i in range(400):
            x_pred = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_indices[char]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            # Generate next character
            next_index = sample(preds, diversity) 
            next_char = indices_char[next_index]
            
            # Append character to generated sequence
            generated += next_char 
            sentence = sentence[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()
    
    # Save model weights into file
    model.save_weights('saved_weights.hdf5', overwrite=True)
        

# After every single epoch, we are going to call the function on_epoch_end
# to generate some text.
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

In [41]:
print('Building model...')
# Size of vector in the hidden layer.
hidden_size = 128 
# Initialize Sequential Model
model = Sequential()
model.add(LSTM(hidden_size, input_shape=(maxlen, len(chars))))
# Add the output layer that is a softmax of the number of characters
model.add(Dense(len(chars), activation='softmax')) 
# Optimization through RMSprop
optimizer_new = RMSprop() 
# Consider cross Entropy loss. Why? MLE of P(D | theta)
model.compile(loss='categorical_crossentropy', optimizer=optimizer_new) 

# Train this for 30 epochs. Size of output from LSTM i.e. hidden layer vector shape=128
model.fit(x, y,
          batch_size=128,
          epochs=30,
          callbacks=[print_callback, checkpointer])

Building model...
Epoch 1/30
----- Generating text after Epoch: 0
----- Diversity: 0.2
----- Generating with seed: "with national security. we have held our"
with national security. we have held our commentical security and the streng and the were of the program of the state and the prost and the congress of the world the streng the for the wark to prople to the program the proples and the program of the sompersion of the program of the rest and the congress of the will the provice the streng the program in the with the proples and the will our a belical and the for the proples and the were 
----- Diversity: 0.5
----- Generating with seed: "with national security. we have held our"
with national security. we have held our cansion of the secolical seacly councress the force a for the will nater and the congress of cat of the faith of the state you start bection and to conting expenting the progras consicitions in which for is in congress the hand be to mist maring the free in the wars s



Epoch 2/30
----- Generating text after Epoch: 1
----- Diversity: 0.2
----- Generating with seed: "ruction and development for all the peop"
ruction and development for all the people and the enery of the states and to the states to the endory and the states of the insternst and the enery of the congress of the perion of the world states and the new to the enery of the states and the states of the enery and the states of the world and contrints to the congress of the congress of the security to the entorical people of the eneromment and the security of the congress of the un
----- Diversity: 0.5
----- Generating with seed: "ruction and development for all the peop"
ruction and development for all the people to worker past in the sount a states in to mest and the gongress to the nation of the assictation in the past the workers of our not the union and workers, the more community resourcest consurty to the this country being the resure to the tades of the congress of the entory possible 

ter consequences of more than three centual afeary people, butinesss.it presence--by mangers we heaping a world. keep vivole i values convicted, cally lost contrisement congress. we have will as a unsure our handlanst, from to thten our great processitiantay. eurape man and control commencaing thir beyenses are cavilo of our health can pled as a must we need, i cannatially. re modern confis and communities. is statue. in re-share ow as 
----- Diversity: 1.2
----- Generating with seed: "ter consequences of more than three cent"
ter consequences of more than three centuur on scalls thmich criditayly leos believe bewoldning who are solve taking stanhter thir de1bly against hulf-retere for, crutice from it. your cirits, eppolting our economy higher dollors as a0ghting tax credsts, in this dutuil people fanish, all world upder.for the presidents which will this progrems and prosperity.we cansold-bat buindal livess we have reductionaty greated nnext they will unnea
Epoch 6/30
----- Generatin

and meaningful opportunities and honest and the federal government and the people and the congress to the next for the world will be a community of the world and the congress to provide the federal program and state of the congress to propose and the world of the congress to depend to the president and the congress and the congress to strengthen the federal construction of the forming people of the world that we can be to the federal go
----- Diversity: 0.5
----- Generating with seed: "and meaningful opportunities and honest "
and meaningful opportunities and honest and today and the basic families and protection of americans we hard we shall continue the budget of federal aggression in the deficit of the prices of constitutional prosperity and the community for peace and responsibility to the best the congress to the solution of standards from the congress to end the post to state of support of the american people of the responsibility to fell america in the
----- Diversity: 1.0
-----

raduate from college.why, then, this reserve in defense independency.and they been seen. historing the new foreign american policy. and there we can come. it's eppiciess howing to prevent parking. they will in to most at them budget whowe me comiles, the world. thanking the building none campot and other asser complex already ad a year. becear and pass attend it now that a special, but our deficits essabulaving hardier. those federal ab
----- Diversity: 1.2
----- Generating with seed: "raduate from college.why, then, this res"
raduate from college.why, then, this responsibility, can alt keen commit owgencef,, and  these means to ahpaiance.. , yearso second crime, will not work howous half ownally leave surginesy longorw. but to amont first of posting no required bewer,. these 30011 necesser i conviet parents of avery chilf saventure, 1rom a supported the federal governmen, as, miner men, theit the first ancimizens and monthno0. menther to.nownait--the leaders
Epoch 13/30
----- Generati

: almost 6 million new jobs in the last contributions and the program of the family of the congress to deter the congress to determinate the state of the management and the families and the expection of the family of the congress to continue the program of the program of the program of the fiscal year 1947. the state of the people of the congress to the congress to strengthen the first and continued by the congress to be a the security 
----- Diversity: 0.5
----- Generating with seed: ": almost 6 million new jobs in the last "
: almost 6 million new jobs in the last test at the soviet union for the part of the businesses will be increased its civil range of the 'spaided to propose against a research to address the most important to make some in and its personal and every china, it down all americans that less than we will all fight will be accomplished at an economic growth to decent change is to continue the power and the budget and local strength in america
----- Diversity: 1.0
-----

merican people have responded, in the mose. we must your job, for confidence to this-line. two war estabuter that will huch the war for the use of approvemence than for our secretary in unifereen for the american investmenty. hope we have not moders entire. we can need you like our farger have imment the freelilities in the free newed it is we must acreates laids all applients and resourcks. pointsive and laster has passed a funday shou
----- Diversity: 1.2
----- Generating with seed: "merican people have responded, in the mo"
merican people have responded, in the most our time. we will preport our life, when - then 4or water. but we from that 190ce america with our hunders. howing is itself waste, and vetaned making planned the actistiens produce that thurbil aid person instead and lighted atomic independrapborger out to tonight, ines that an agdeeter, and te mahis in the fissedlappecoxe i will make they beaids in another promises i mminds. government and se
Epoch 20/30
----- Generati

 processes of growth are gradual - bearing the congress to act to the congress to act the congress to act the congress to the congress to deter a source of the congress on the congress to achieve the free world to all of the soviet union and the term not the congress on the state of the congress to the congress on the fiscal year 1947. the congress to the soviet union of the world which will be a state of the congress to all care of the
----- Diversity: 0.5
----- Generating with seed: " processes of growth are gradual - beari"
 processes of growth are gradual - bearing the security in the new time is a more conservation and production and areas of the congress can be to all americans to the problem to retell on a senate to active the world to the traditions of the world, and our free responsting economic programs in the new taken to federal budget in the material and the basic constitutions have harder the support of a some of the federal government republic 
----- Diversity: 1.0
-----

hical. i wanted to give you my judgment, and vooped to me evit need for commit to orea, of a concerted for produced abkens.our national enactment to work can be better competitive sinciver and real children, every elementury. for the world is long good tontging a passion when we do it take politics of the existing purposes to begun to believe the enemy. many labard her for a longs. this whoer allies. this submit sincementory. where need
----- Diversity: 1.2
----- Generating with seed: "hical. i wanted to give you my judgment,"
hical. i wanted to give you my judgment, and we facess a time by free men at a higher ciecies is us business. visinn of palos.we week and civilngexed by offeet.naver far, unmolot in spent and fawort and the war one-happines of . the world, whose directly, brotheed the innent diaborements to retrappine growth. since western follusing their chasic needs, a isneasomos and since to fou statuble 5,000 conscititions to orab, society to look h
Epoch 27/30
----- Generati



, and we enolsw, which who democre. whe island non-new programs can be trade and choice. we've seen for my tellinality conscault will concerned aid. campaign that in our places of america. on this grade. putlisitian--in courraig. some at the people of the world 
----- Diversity: 1.2
----- Generating with seed: " pending legislation, but the tentative "
 pending legislation, but the tentative low-secunity to the environmentaims here in thates reinforcome.o're in every rideshia knows.we have more than it and humanity. and does not oblow larget, by an people. by ending indepandly contributed mutuemen, what caset, and not only been responsibility our speete of its or january earners all of this commitity of good boastrow that be people senate. everyone its food any agricultural with difni
Epoch 29/30
----- Generating text after Epoch: 28
----- Diversity: 0.2
----- Generating with seed: "rn for those who have served this nation"
rn for those who have served this nation to the congress to th

<keras.callbacks.History at 0x1562316a0>

In [39]:
# To continue training...
model.load_weights("saved_weights.hdf5")

model.fit(x, y,
          batch_size=128,
          epochs=30,
          callbacks=[print_callback, checkpointer])