# Baseline Char RNN
***

## 1. Import Text Data

In [1]:
# Start with empty accumulators: `total` will hold the whole corpus,
# `data` the contents of the file most recently read.
total = data = ""

In [2]:
import os

directory = "texts/individual/"
files = []  # names of the .txt files that were read
texts = []  # contents of those files, in the same order as `files`
# NOTE: os.listdir returns entries in arbitrary order, so the order of the
# concatenated corpus follows whatever order the OS reports.
for filename in os.listdir(directory):
    if not filename.endswith(".txt"):
        # Skip everything that is not a text file. Previously the append to
        # `total` ran for these entries too, re-adding the previous file's text.
        continue
    with open(os.path.join(directory, filename), "r") as myfile:
        # Flatten each file onto a single line.
        data = myfile.read().replace('\n', ' ')
    files.append(filename)
    texts.append(data)

# Build the corpus from scratch so that re-running this cell does not append
# the same files to an already-populated `total`.
total = "".join(texts)

In [3]:
# Normalise the corpus in one pass:
#   1. "\xe2\x80\x99" is the UTF-8 byte sequence of the right single quotation
#      mark (U+2019); replace it with a plain apostrophe.
#   2. "\xe2\x80\xa6" is the UTF-8 byte sequence of the ellipsis character
#      (U+2026); replace it with three dots.
#   3. Drop backslashes.
#   4. Convert to lowercase.
# NOTE(review): steps 1 and 2 only match if `total` holds raw bytes
# (a Python 2 `str`) -- TODO confirm before running under Python 3.
total = (
    total
    .replace("\xe2\x80\x99", "\'")
    .replace("\xe2\x80\xa6", "...")
    .replace("\\", "")
    .lower()
)

In [4]:
# Show the names of the text files that were loaded.
files

['oh-the-places-youll-go.txt',
 'wocket-in-my-pocket.txt',
 'hop-on-pop.txt',
 'cat-in-a-hat.txt',
 'how-the-grinch-stole-christmas.txt',
 'one-fish-two-fish-red-fish-blue-fish.txt',
 'fox-in-socks.txt',
 'green-eggs-and-ham.txt']

In [5]:
# Inspect the concatenated corpus.
total

'congratulations! today is your day. you\'re off to great places! you\'re off and away!  you have brains in your head. you have feet in your shoes. you can steer yourself  any direction you choose. you\'re on your own. and you know what you know. and you are the guy who\'ll decide where to go.  you\'ll look up and down streets. look \'em over with care. about some you will say, "i don\'t choose to go there." with your head full of brains and your shoes full of feet,  you\'re too smart to go down any not-so-good street.  and you may not find any you\'ll want to go down. in that case, of course, you\'ll head straight out of town.  it\'s opener there in the wide open air.  out there things can happen and frequently do to people as brainy and footsy as you.  and then things start to happen, don\'t worry. don\'t stew. just go right along. you\'ll start happening too.  oh! the places you\'ll go!  you\'ll be on y our way up! you\'ll be seeing great sights! you\'ll join the high fliers who soa

## 2. Create Char Dictionary

In [6]:
def create_dict(texts):
    """Count how often each character occurs in ``texts``.

    Args:
        texts: An iterable of hashable items, e.g. a string iterated
            character by character.

    Returns:
        A dict mapping every distinct item to its number of occurrences.
    """
    counts = {}
    for symbol in texts:
        # A missing key counts as zero, so first sightings start at 1.
        counts[symbol] = counts.get(symbol, 0) + 1
    return counts

In [7]:
import operator

# Count every character in the corpus, then rank the (char, count) pairs
# from most to least frequent.
d = create_dict(total)
sorted_d = sorted(d.items(), key=operator.itemgetter(1), reverse=True)

In [8]:
# Number of distinct characters found in the corpus.
print('Dictionary Size: %d' % len(sorted_d))

Dictionary Size: 46


In [9]:
# Full list of (character, count) pairs, most frequent first.
print('Dictionary:\n %s' % sorted_d)

Dictionary:
 [(' ', 7706), ('e', 2920), ('t', 2509), ('o', 2487), ('a', 2039), ('h', 1949), ('i', 1825), ('n', 1737), ('s', 1608), ('l', 1304), ('r', 1105), ('d', 1089), ('w', 898), ('u', 877), ('.', 822), ('m', 694), ('y', 644), ('c', 585), ('g', 532), ('k', 523), ('b', 447), (',', 429), ('p', 415), ('f', 414), ('!', 328), ('"', 184), ("'", 173), ('v', 126), ('x', 90), ('?', 84), ('-', 61), ('z', 35), ('j', 30), ('q', 21), (':', 2), ('\xef', 1), (')', 1), ('(', 1), ('/', 1), ('3', 1), ('4', 1), ('9', 1), ('8', 1), (';', 1), ('\xbf', 1), ('\xbd', 1)]


## 3. Vectorize on Char Dict

In [10]:
# Number of consecutive characters in each input window; the character that
# follows the window is the prediction target.
SEQ_LENGTH = 100

In [11]:
# Keep only the characters from the ranked (char, count) pairs, preserving
# the most-frequent-first order.
characters = [char for char, _count in sorted_d]

In [12]:
# index -> character (position in the frequency-ranked list)
n_to_char = dict(enumerate(characters))
# character -> index (inverse of the table above)
char_to_n = dict((char, n) for n, char in enumerate(characters))

In [13]:
import numpy as np
from keras.utils import np_utils

# Slide a SEQ_LENGTH-character window over the corpus one step at a time.
# Each window (as character indices) is an input; the index of the character
# immediately after the window is its label.
length = len(total)
X = []
Y = []
for start in range(length - SEQ_LENGTH):
    end = start + SEQ_LENGTH
    X.append([char_to_n[ch] for ch in total[start:end]])
    Y.append(char_to_n[total[end]])

Using TensorFlow backend.


In [14]:
# Arrange the index windows as (samples, SEQ_LENGTH, 1): one feature per step.
X_mod = np.reshape(X, (len(X), SEQ_LENGTH, 1))
# Scale the character indices (0 .. len(characters) - 1) into [0, 1).
X_mod = X_mod / float(len(characters))
# One-hot encode the labels. The class count is passed explicitly so the
# output width always equals the vocabulary size; left to be inferred, it
# would be max(Y) + 1, which is smaller whenever the highest-indexed
# character never occurs as a label.
Y_mod = np_utils.to_categorical(Y, num_classes=len(characters))

In [15]:
# Sanity check: expected layout is (number of windows, SEQ_LENGTH, 1).
X_mod.shape

(36603, 100, 1)

## 4. LSTM Model

In [16]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.utils import np_utils

In [17]:
# Two stacked 400-unit LSTM layers, each followed by 20% dropout, feeding a
# softmax layer with one output per label class.
n_timesteps, n_features = X_mod.shape[1], X_mod.shape[2]
n_classes = Y_mod.shape[1]

model = Sequential([
    # The first LSTM returns the full sequence so the second LSTM can consume it.
    LSTM(400, input_shape=(n_timesteps, n_features), return_sequences=True),
    Dropout(0.2),
    LSTM(400),
    Dropout(0.2),
    Dense(n_classes, activation = 'softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam')

2 LSTM layers with 400 hidden units each, each followed by dropout with ratio 0.2

In [18]:
# Training is disabled here; the following cell loads weights from the same
# path instead. Uncomment both lines to retrain and overwrite the saved weights.
#model.fit(X_mod, Y_mod, epochs=1, batch_size=100)
#model.save_weights('models/dr-seuss-baseline-400-0.2-400-0.2.h5')

1 epoch, batch size of 100

In [19]:
# Load previously saved weights into the model defined above; the file must
# already exist at this path (it is the path the disabled training cell saves to).
model.load_weights('models/dr-seuss-baseline-400-0.2-400-0.2.h5')

## 5. Generate Text

In [22]:
# Seed the generator with the first training window. list(...) makes a copy,
# so the appends below do not modify X[0] itself.
string_mapped = list(X[0])
full_string = [n_to_char[value] for value in string_mapped]
# generating characters
for i in range(200):
    # Shape the current window as a single-sample batch and scale it the same
    # way the training inputs were scaled.
    x = np.reshape(string_mapped, (1, len(string_mapped), 1))
    x = x / float(len(characters))

    # Greedy decoding: always take the most probable next character.
    pred_index = np.argmax(model.predict(x, verbose=0))
    full_string.append(n_to_char[pred_index])

    # Slide the window forward: add the prediction, drop the oldest character.
    string_mapped.append(pred_index)
    string_mapped = string_mapped[1:]

In [23]:
#combining text
txt=""
for char in full_string:
    txt = txt+char
print(txt[:100])
print(txt)

congratulations! today is your day. you're off to great places! you're off and away!  you have brain
congratulations! today is your day. you're off to great places! you're off and away!  you have brain                                                                                                                                                                                                        
