# Char RNN Exploration
***

## 1. Import Text Data

In [1]:
total = ""
data = ""

In [2]:
import os
directory = "texts/individual/"
files = []
for filename in os.listdir(directory):
    if filename.endswith(".txt"):
        with open(directory+filename, "r") as myfile:
            data = myfile.read().replace('\n', ' ')
        files.append(filename)
    total = total + data

In [3]:
# Replace commas
total = total.replace("\xe2\x80\x99", "\'")
# Replace ...
total = total.replace("\xe2\x80\xa6", "...")
# Replace quotations
total = total.replace('\xe2\x80\x9c','"')
total = total.replace('\xe2\x80\x9d','"')
# Replace backslashes
total = total.replace("\\", "")
# Convert to lowercase
total = total.lower()

In [4]:
files

['fox-in-socks.txt',
 'green-eggs-and-ham.txt',
 'happy-birthday-to-you.txt',
 'hop-on-pop.txt',
 'horton-hears-a-who.txt',
 'how-the-grinch-stole-christmas.txt',
 'if-i-ran-the-zoo.txt',
 'marvin-k-mooney-will-you-please-go-now.txt',
 'mr-brown-can-moo-can-you.txt',
 'oh-the-places-youll-go.txt',
 'one-fish-two-fish-red-fish-blue-fish.txt',
 'ten-apples-up-on-top.txt',
 'the-butter-battle-book.txt',
 'the-cat-in-a-hat.txt',
 'the-foot-book.txt',
 'the-lorax.txt',
 'the-sneetches.txt',
 'wocket-in-my-pocket.txt']

In [5]:
len(total)

79576

## 2. Create Char Dictionary

In [6]:
def create_dict(texts):
    wl = {}
    for char in texts:
        if char not in wl:
            wl[char] = 1
        else:
            wl[char] += 1
    return wl

In [7]:
import operator
d = create_dict(total)
sorted_d = sorted(d.items(), key=lambda x: x[1], reverse=True)

In [8]:
# create a string composed of irrelevant characters (used less than 50 times)
unwanted_chars = ""
for c in sorted_d:
    if c[1] < 50:
        print(c)
        unwanted_chars = unwanted_chars+str(c[0])

('\xad', 10)
(':', 10)
('\xc2', 10)
('_', 10)
('\x80', 6)
('\xe2', 6)
('\x98', 5)
('1', 5)
('5', 4)
('\xef', 3)
(')', 3)
('0', 3)
('3', 3)
('2', 3)
('(', 3)
('6', 3)
('\xbf', 3)
('4', 3)
('\xbb', 2)
(';', 2)
('/', 1)
('9', 1)
('8', 1)
('\xbd', 1)
('\xbe', 1)
('\xcd', 1)
('\x94', 1)


In [9]:
# remove irrelevant characters from text data 
total=total.translate(None,unwanted_chars)

In [10]:
d = create_dict(total)
sorted_d = sorted(d.items(), key=lambda x: x[1], reverse=True)

In [11]:
print('Dictionary Size: %d' % len(sorted_d))

Dictionary Size: 34


In [12]:
print('Dictionary:\n %s' % sorted_d)

Dictionary:
 [(' ', 16037), ('e', 6664), ('o', 5335), ('t', 5265), ('a', 4504), ('h', 3973), ('n', 3686), ('i', 3671), ('s', 3402), ('r', 2852), ('l', 2819), ('d', 2325), ('u', 1963), ('w', 1708), ('.', 1648), ('m', 1489), ('y', 1487), ('c', 1330), ('g', 1330), ('p', 1102), ('f', 1078), ('k', 1044), ('b', 1016), (',', 916), ('!', 645), ('"', 480), ("'", 454), ('v', 422), ('-', 240), ('x', 163), ('?', 134), ('z', 128), ('j', 105), ('q', 57)]


In [23]:
import matploblib.pyplot as plt

ImportError: No module named matploblib.pyplot

## 3. Vectorize on Char Dict

In [13]:
SEQ_LENGTH = 100

In [14]:
characters = []
for pair in sorted_d:
    characters.append(pair[0])

In [15]:
n_to_char = {n:char for n, char in enumerate(characters)}
char_to_n = {char:n for n, char in enumerate(characters)}

In [16]:
import numpy as np
from keras.utils import np_utils
X = []
Y = []
length = len(total)
for i in range(0, length - SEQ_LENGTH, 1):
    sequence = total[i:i + SEQ_LENGTH]
    label = total[i + SEQ_LENGTH]
    X.append([char_to_n[char] for char in sequence])
    Y.append(char_to_n[label])

  from ._conv import register_converters as _register_converters
Using Theano backend.


In [17]:
X_mod = np.reshape(X, (len(X), SEQ_LENGTH, 1))
X_mod = X_mod / float(len(characters))
Y_mod = np_utils.to_categorical(Y)

In [18]:
X_mod.shape

(79372L, 100L, 1L)

## 4. LSTM Model

In [19]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.utils import np_utils

In [None]:
model = Sequential()
model.add(LSTM(700, input_shape=(X_mod.shape[1], X_mod.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(700, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(700))
model.add(Dropout(0.2))
model.add(Dense(Y_mod.shape[1], activation = 'softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

2 LSTM layers, 700 hidden states, dropout ratio = 0.2

In [None]:
model.fit(X_mod, Y_mod, epochs=10, batch_size=100)
model.save_weights('models/dr-seuss-700-0.2-700-0.2-700-0.2-10-epochs.h5')

1 epoch, batch size of 100

In [None]:
model.load_weights('models/dr-seuss-700-0.2-700-0.2-700-0.2-10-epochs.h5')

## 5. Generate Text

In [None]:
string_mapped = list(X[0])
full_string = [n_to_char[value] for value in string_mapped]
# Number of characters in generated passage
TEXT_LENGTH = 1000
for i in range(TEXT_LENGTH):
    x = np.reshape(string_mapped,(1,len(string_mapped), 1))
    x = x / float(len(characters))
    
    pred_index = np.argmax(model.predict(x, verbose=0))
    seq = [n_to_char[value] for value in string_mapped]
    full_string.append(n_to_char[pred_index])
    
    string_mapped.append(pred_index)
    string_mapped = string_mapped[1:len(string_mapped)]

In [None]:
#combining text
txt=""
for char in full_string:
    txt = txt+char
print(txt[:SEQ_LENGTH])

In [None]:
print(txt)