In [37]:
import numpy 
import re
import sys

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

In [2]:
# Load text file
path_to_file = 'trump.txt'
text = open(path_to_file, 'rb').read().decode(encoding='utf-8')

In [3]:
# Clean the text
# Function to clean the text
def clean_text(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = " ".join(filter(lambda x:x[0]!="@", text.split()))
    return text


In [4]:
# Apply the function
text = clean_text(text)

In [5]:
# Vectorize
# Creating a mapping of unique char > integers
chars = sorted(list(set(text)))
char_to_int = dict((c, i) for i, c in enumerate(chars))

In [6]:
# Check the number of unique chars
n_chars = len(text)
n_vocab = len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

Total Characters:  1745674
Total Vocab:  163


In [7]:
# Define the training data
# Split the text into subsequences of 'seq_length'
# 100 time steps of one char of x followed by y
# A 'window' will slide across the text data one character at a time, allowing the character to be learned from the 100 characters preceding it
# E.g. seq_length = 3
# Oba > m
# bam > a

seq_length = 100

tx = []
ty = []

# Input-Output pairs
for i in range(0, n_chars - seq_length, 1):
	seq_in = text[i:i + seq_length]
	seq_out = text[i + seq_length]
	tx.append([char_to_int[char] for char in seq_in])
	ty.append(char_to_int[seq_out])
n_patterns = len(tx)

# When we print we should have total number of char - seq_length
print("Total Patterns: ", n_patterns)

Total Patterns:  1745574


In [9]:
# Reshape input sequences to [samples, time_steps, features]
x = numpy.reshape(tx, (n_patterns, seq_length, 1))
# Normalize to 0-1 (sigmoid)
x = x/float(n_vocab)
# One-hot encoding unique chars
y = np_utils.to_categorical(ty)

In [10]:
# LSTM Model
model = Sequential()
model.add(LSTM(256, dropout=0.2, return_sequences=True, input_shape=(x.shape[1], x.shape[2])))
model.add(LSTM(128, dropout=0.2))

# Softmax for multi-class classification | Probability for ea char
model.add(Dense(y.shape[1], activation='softmax'))

model.summary()

model.compile(loss='categorical_crossentropy', optimizer='adam')

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 100, 256)          264192    
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               197120    
_________________________________________________________________
dense (Dense)                (None, 163)               21027     
Total params: 482,339
Trainable params: 482,339
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Checkpoints
filepath="{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [12]:
# Fit model
model.fit(x, y, epochs=2, batch_size=128, callbacks=callbacks_list)

Epoch 1/2
Epoch 00001: loss improved from inf to 3.09004, saving model to 01-3.0900.hdf5
Epoch 2/2
Epoch 00002: loss improved from 3.09004 to 3.01486, saving model to 02-3.0149.hdf5


<tensorflow.python.keras.callbacks.History at 0x7fbcbd7e7cc0>

In [54]:
# Our results are in integers, so, 
# Create a reverse mapping from integers > char
int_to_char = dict((i, c) for i, c in enumerate(chars))

In [65]:
# Pick a random seed to start the sequence of predicted chars 
# Seed > generate1, generate1 > generate2, and so on..
# E.g. i > c > e > c > r >..
# Randomize seed
start = numpy.random.randint(0, len(tx)-1)
pattern = tx[start]
result = []

print("Seed: ", ''.join([int_to_char[value] for value in pattern]))

Seed:  er and person, of real estate weekly, for the wonderful story on me. very much appreciated! “winning


In [72]:
def sample_prediction(prediction):
  X = prediction[0]
  rnd_idx = numpy.random.choice(len(X), p=X)
  return rnd_idx

In [73]:
# Generate text of length _ 
for i in range(100):
  x = numpy.reshape(pattern, (1, len(pattern), 1))
  x = x / float(n_vocab)
  prediction = model.predict(x, verbose=0)
  index = sample_prediction(prediction)
  #print(index)
  result.append(int_to_char[index])
  
  pattern.append(index)
  pattern = pattern[1:len(pattern)]

print(result)
print("Done")

['s', 't', 'e', 'e', 'n', 't', ':', ' ', 'b', ' ', 'c', 'o', 'v', ')', ' ', 'y', 'n', 'p', 'a', 't', 't', 'd', 'f', 'u', 'n', 'n', ' ', 'w', 'i', 'r', 'y', ' ', 'f', 'r', 'e', ' ', 'd', 'o', 'a', '.', 'd', 'm', 't', ' ', 'o', 'l', 'o', 'v', 'n', ' ', 'a', 'n', 'u', 's', 'a', ' ', 'h', 'v', ' ', 'f', 'r', ' ', 's', 'a', ' ', 'a', '@', '.', 'd', 'k', ' ', ' ', 'y', '.', '#', 'w', 'h', 'e', ' ', 'a', ' ', 'i', 'u', 'a', 'y', 'g', 'e', ' ', ' ', 'a', 'm', 'r', 'b', 'm', 'y', 'o', 'a', ' ', 'g', 'v', 'o', 'm', 'l', 's', 'e', ' ', 'y', 'a', 'f', ' ', 'a', ' ', 'b', 'e', 't', 's', 's', 't', ':', 'y', 'i', 'w', ',', ' ', 't', 'n', ' ', 'm', 'n', ' ', 'n', 'i', ' ', 't', 'n', 'l', 'u', 'e', 'e', ' ', 'e', 'r', 'i', 'n', 't', ' ', ' ', '.', 'y', 't', "'", 's', 'n', 'r', 'd', ' ', 'i', ' ', ' ', 'd', 'o', 'o', 'f', ' ', 'r', 's', 'g', '.', 'o', ' ', 'l', 't', ' ', 'c', 'y', 'g', ' ', 'n', 'n', ' ', 'c', ' ', 'u', 'n', 'g', 'c', ' ', 'n', 't', 's', ' ', 'a', 'n', 'm', ' ', 't', 'y', 'u', ' ', 'n']

In [76]:
def toString(s):  
    str1 = ""  
    for ele in s:  
        str1 += ele   
    return str1  

In [77]:
toString(result)

"steent: b cov) ynpattdfunn wiry fre doa.dmt olovn anusa hv fr sa a@.dk  y.#whe a iuayge  amrbmyoa gvomlse yaf a betsst:yiw, tn mn ni tnluee erint  .yt'snrd i  doof rsg.o lt cyg nn c ungc nts anm tyu n"