In [1]:
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

Using TensorFlow backend.


## 1. Reading and Preprocessing Data

In [58]:
raw_text = ""

### 1.1 Segregating files into .txt and .pdf files

In [66]:
import os
from os import listdir
from os.path import isfile, join
mypath = "../data/"
# pdf_files = [f for f in listdir(mypath) if isfile(join(mypath, f))]
pdf_files = []
txt_files = []

for file in os.listdir(mypath):
    if file.endswith(".txt"):
        txt_files.append(os.path.join(mypath, file))
    elif file.endswith(".pdf"):
        pdf_files.append(os.path.join(mypath, file))

### 1.2 Extracting text from PDF files

In [75]:
def extract_from_pdf(list_of_pdf_files):
    global raw_text
    for pdf_file in list_of_pdf_files:
        
        #Creating pdf file and reader objects from source PDF file
        pdfFileObj = open('../data/Love_Poems_of_Frederick_Harper.pdf', 'rb')
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj) 
        

        for page_number in range(10, pdfReader.numPages):
            pageObj = pdfReader.getPage(page_number) 

            # extracting text from page 
            raw_text+=pageObj.extractText().lower().replace("........................................................................"," ")

extract_from_pdf(pdf_files)

In [76]:
raw_text

'  vi \na love to walk with me.......................................................78 \nyou have and i have you.....................................................79 \n\nmathematical love...............................................................80 \n\nsitting in a chair with a teddy bear............................81 \n\ntrue love lasts........................................................................82 \n\na gift to you...............................................................................83 \n\ni will not promise....................................................................84 \n\ni love you.....................................................................................85 \n\n poems on love of family, children, friends, \nand nature.................................................................................87 \nlove your chain.......................................................................89 \nthanks mother..................................

In [22]:
chars = sorted(raw_text)
char_to_int = dict((c, i) for i, c in enumerate(chars))

In [23]:
char_to_int

{'\n': 1860,
 ' ': 10797,
 '!': 10798,
 ',': 11359,
 '-': 11375,
 '.': 14942,
 '/': 14944,
 '0': 14963,
 '1': 15063,
 '2': 15084,
 '3': 15108,
 '4': 15128,
 '5': 15188,
 '6': 15212,
 '7': 15244,
 '8': 15380,
 '9': 15473,
 ':': 15593,
 ';': 15882,
 '?': 15914,
 'a': 18677,
 'b': 19018,
 'c': 20013,
 'd': 21386,
 'e': 26110,
 'f': 27082,
 'g': 27776,
 'h': 29357,
 'i': 31655,
 'j': 31688,
 'k': 31950,
 'l': 33655,
 'm': 34684,
 'n': 37083,
 'o': 40580,
 'p': 41457,
 'q': 41470,
 'r': 43996,
 's': 46127,
 't': 48803,
 'u': 50027,
 'v': 50616,
 'w': 51347,
 'x': 51395,
 'y': 52266,
 'z': 52285,
 'š': 52329,
 '–': 52335,
 '‚': 52339,
 '™': 52589,
 'ﬁ': 52620,
 'ﬂ': 52651}

In [25]:
n_chars = len(raw_text)
n_vocab = len(chars)
print ("Total Characters: ", n_chars)
print ("Total Vocab: ", n_vocab)

Total Characters:  52652
Total Vocab:  52652


In [27]:
seq_length = 100
dataX = []
dataY = []
for i in range(0, n_chars - seq_length, 1):
	seq_in = raw_text[i:i + seq_length]
	seq_out = raw_text[i + seq_length]
	dataX.append([char_to_int[char] for char in seq_in])
	dataY.append(char_to_int[seq_out])
n_patterns = len(dataX)

print ("Total Patterns: ", n_patterns)

Total Patterns:  52552


In [28]:
# reshape X to be [samples, time steps, features]
X = numpy.reshape(dataX, (n_patterns, seq_length, 1))
# normalize
X = X / float(n_vocab)
# one hot encode the output variable
y = np_utils.to_categorical(dataY)

In [29]:
# define the LSTM model
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [30]:
# define the checkpoint
filepath="weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [51]:
model.fit(X, y, epochs=10, batch_size=256, callbacks=callbacks_list)

Epoch 1/10

Epoch 00001: loss improved from 3.09525 to 3.08867, saving model to weights-improvement-01-3.0887.hdf5
Epoch 2/10

Epoch 00002: loss improved from 3.08867 to 3.07637, saving model to weights-improvement-02-3.0764.hdf5
Epoch 3/10

Epoch 00003: loss improved from 3.07637 to 3.03870, saving model to weights-improvement-03-3.0387.hdf5
Epoch 4/10

Epoch 00004: loss did not improve from 3.03870
Epoch 5/10

Epoch 00005: loss improved from 3.03870 to 2.95075, saving model to weights-improvement-05-2.9508.hdf5
Epoch 6/10

Epoch 00006: loss improved from 2.95075 to 2.84155, saving model to weights-improvement-06-2.8416.hdf5
Epoch 7/10

Epoch 00007: loss improved from 2.84155 to 2.78277, saving model to weights-improvement-07-2.7828.hdf5
Epoch 8/10

KeyboardInterrupt: 

In [54]:
filename = "weights-improvement-07-2.7828.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [55]:
int_to_char = dict((i, c) for i, c in enumerate(chars))

In [57]:
import sys 
# pick a random seed
start = numpy.random.randint(0, len(dataX)-1)
pattern = dataX[start]
print ("Seed:")
print ("\"", ''.join([int_to_char[value] for value in pattern]), "\"")
# generate characters
for i in range(1000):
	x = numpy.reshape(pattern, (1, len(pattern), 1))
	x = x / float(n_vocab)
	prediction = model.predict(x, verbose=0)
	index = numpy.argmax(prediction)
	result = int_to_char[index]
	seq_in = [int_to_char[value] for value in pattern]
	sys.stdout.write(result)
	pattern.append(index)
	pattern = pattern[1:len(pattern)]

Seed:
" ™s remain.ﬂ 
 reprinted: harper™s 
romantica: on peace and romance
, 1988. 
love poems of frederick  "
      ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

KeyboardInterrupt: 