In [4]:
import sys
import re
import numpy as np
import json
import pickle
from string import ascii_letters

from keras.models import Sequential, model_from_json
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM

from ivanatrumpalot import clean_text, predict, sample
import os
import subprocess as subp

In [7]:

# Resolve the working directory natively instead of shelling out to `pwd`:
# subp.check_output() returns bytes on Python 3, so calling
# .replace("\n", "") on its result raises TypeError, and `pwd` is not
# available on every platform.
curr_dir = os.getcwd()
# Code directory: the relative corpus path below is resolved from here.
os.chdir(curr_dir)

# Read and clean corpus. The context manager closes the file handle as soon
# as the contents have been read instead of leaving it open.
with open("../data/trump_corpus") as corpus_file:
    text = clean_text(corpus_file.read())

In [9]:

# Corpus length (the sentence count is a rough estimate: number of "."-separated chunks).
print("Corpus : {} characters, approximately {} sentences.".format(len(text), len(text.split("."))))

# Build the alphabet: every character seen in the corpus, plus all ASCII
# letters and digits so that they can always be encoded.
alphabet = set(text).union(set(ascii_letters)).union(set("1234567890"))
alphabet_size = len(alphabet)

# Generate dictionaries mapping each character of the alphabet to an index,
# and the reverse. The indices are assigned over a sorted copy of the
# alphabet: iterating the set directly yields an order that can change from
# one interpreter run to the next (string hash randomisation), which would
# make the character <-> index mapping irreproducible across kernel restarts.
ordered_alphabet = sorted(alphabet)
alphabet_indices = {char: index for index, char in enumerate(ordered_alphabet)}
indices_alphabet = {index: char for index, char in enumerate(ordered_alphabet)}
print("Size of the alphabet : {} characters.".format(alphabet_size))


Corpus : 249546 characters, approximately 3806 sentences.
Size of the alphabet : 75 characters.


In [14]:

# Slide a window of primer_length characters over the corpus, advancing by
# `step` characters each time. Every window becomes one training sequence and
# the character immediately following it becomes the target the RNN must predict.
primer_length = 50
step = 3
window_starts = range(0, len(text) - primer_length, step)
sentences = [text[start : start + primer_length] for start in window_starts]
next_character = [text[start + primer_length] for start in window_starts]
print("Number of sequences generated from the corpus : {}.".format(len(sentences)))


Number of sequences generated from the corpus : 83166.


In [25]:
# Vectorise the text sequences : go from N sentences of length primer_length to
# a binary array of size (N, primer_length, alphabet_size). Do the same for the
# next_character array, giving a binary array of size (N, alphabet_size).
print("One-Hot Vectorising.")
# Use the builtin `bool` as dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
X = np.zeros((len(sentences), primer_length, alphabet_size), dtype=bool)
y = np.zeros((len(sentences), alphabet_size), dtype=bool)
for i, sentence in enumerate(sentences):
    # Flag, for every position t of the sequence, the index of its character.
    for t, char in enumerate(sentence):
        X[i, t, alphabet_indices[char]] = 1
    # Flag the index of the character that follows sequence i.
    y[i, alphabet_indices[next_character[i]]] = 1

One-Hot Vectorising.


In [None]:

# Persist everything needed to encode a primer and decode predictions later
# on: the alphabet, both character/index lookup tables and the primer length.
required_objects = dict(
    alphabet=alphabet,
    alphabet_indices=alphabet_indices,
    indices_alphabet=indices_alphabet,
    primer_length=primer_length,
)
with open("required_objects.pickle", "wb") as f:
    pickle.dump(required_objects, f)