In [2]:
####
# Original network adapted from karpathy
# minesh.mathew@gmail.com
# modified version of text generation example in keras;
# trained in a many-to-many fashion using a time distributed dense layer

####
from __future__ import print_function


from matplotlib import pyplot as plt
from matplotlib import pyplot
import matplotlib as mptl
import pylab

import keras
from keras.models import load_model
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, LeakyReLU, BatchNormalization
# from keras.layers import LSTM, TimeDistributedDense, SimpleRNN  #DEPRECATED TimeDistributedDense
from keras.layers import LSTM, TimeDistributed, SimpleRNN, CuDNNGRU, CuDNNLSTM
from keras.utils.data_utils import get_file
import numpy as np
from time import sleep
import random
import sys
import os
import pickle
import gzip
from sklearn.metrics import confusion_matrix
import itertools
import tensorflow as tf

gpu_options = tf.GPUOptions(allow_growth=True)
sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
keras.backend.tensorflow_backend.set_session(sess)
#import h5py

In [3]:
#-------------------- TEXT FILE PREPROCESSING -----------------

def preprocess_text_file(filename, maxlen=40):
    """load a file and split the text it contains into sequences of length = maxlen
    returns text, chars, char_indices, indices_char, sentences, next_chars
    text: the raw text (turned into lowercase)
    chars: a list of unique characters in the text
    char_indices: a dictionary of the character-to-index conversion
    indices_char: a dictionary of the index-to-character conversion
    sequences: a list of the sequences of max length extracted from the file (stride specified by the step variable below)
    next_chars: a list of the corresponding sequences of max_length next-characters following each of the sequence character members
      in other words, each member of next_chars is contains the last maxlen-1 chars of the correspondiong sequence ...
      and the next character from the text after the last character in that sequence """
    print('loading: ', filename)
    text = open(filename).read().lower()
    print('corpus length:', len(text))
    chars = sorted(list(set(text)- set(["\n"])))   #returns unique characters from the text
    print('total chars:', len(chars))
    char_indices = dict((c, i) for i, c in enumerate(chars))
    indices_char = dict((i, c) for i, c in enumerate(chars))
    text = None
    # split the corpus into sequences of length=maxlen
    # input is a sequence of 40 chars and target is also a sequence of 40 chars shifted by one position
    # for eg: if you maxlen=3 and the text corpus is abcdefghi, your input ---> target pairs will be
    # [a,b,c] --> [b,c,d], [b,c,d]--->[c,d,e]....and so on
    step = 1
    sequences = []
    next_chars = []
    tweets = open(filename).readlines()
    #first generate sentences of characters
    for text in tweets:
        text = text.lower().strip()
        for i in range(0, len(text) - maxlen + 1, step):
            sequences.append(text[i: i + maxlen])  # input seq is from i to i  + maxlen
            next_chars.append(text[i + 1:i + 1 + maxlen])  # output seq is from i+1 to i+1+maxlen
    print('number of sequences:', len(sequences))
    return text, chars, char_indices, indices_char, sequences, next_chars


def vectorize_text(chars, char_indices, sentences, next_chars, maxlen=40):  #UNUSED: "next_chars"
    """Accepts a list of sentences to convert to indices.  Used characters, their corresponding indeces to produce a set of sequences
    of X and corresponding labels y"""
    # now generate dummy variables (1-hot vectors) for the sequences of characters
    print('Vectorization processing... this could take a while...')
    X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
    y = np.zeros((len(sentences), maxlen, len(chars)),
                 dtype=np.bool)  # y is also a sequence , or  a seq of 1 hot vectors
    joblength = len(sentences)
    tenpercent = joblength/10
    nextpercent = tenpercent
    print(" part 1 of 2")
    for i, sentence in enumerate(sentences):
        if i>nextpercent:
            print(i, " of ", joblength, " completed")
            nextpercent += tenpercent
        for t, char in enumerate(sentence):
            X[i, t, char_indices[char]] = 1  # X has dimension [sentence_count, sentence_length, char_count]
    print(" part 2 of 2")
    nextpercent = tenpercent

    for i, sentence in enumerate(next_chars):
        if i>nextpercent:
            print(i, " of ", joblength, " completed")
            nextpercent += tenpercent
        for t, char in enumerate(sentence):
            y[i, t, char_indices[char]] = 1  # y has dimension [sentence_count, sentence_length, char_count]
    print('vetorization completed')
    return X, y


def generate_text(model, char_indices, indices_char, seed_string="brutus:", generate_character_count=320):
    """Generates text using a model"""
    print("seed string --> ", seed_string)
    print('The generated text is: ')
    sys.stdout.write(seed_string),
    # x=np.zeros((1, len(seed_string), len(chars)))
    for i in range(generate_character_count):
        x = np.zeros((1, len(seed_string), len(chars)))
        for t, char in enumerate(seed_string):
            x[0, t, char_indices[char]] = 1.
        preds = model.predict(x, verbose=0)[0]
        # print (np.argmax(preds[7]))
        next_index = np.argmax(preds[len(seed_string) - 1])

        # next_index=np.argmax(preds[len(seed_string)-11])
        # print (preds.shape)
        # print (preds)
        # next_index = sample(preds, 1) #diversity is 1
        next_char = indices_char[next_index]
        seed_string = seed_string + next_char

        # print (seed_string)
        # print ('##############')
        # if i==40:
        #    print ('####')
        sys.stdout.write(next_char)
    sys.stdout.flush()
    return seed_string

    

In [4]:
# ---------------- MODEL FILE I/O ---------------------------
def save_model(model, save_dir=os.path.join(os.getcwd(), 'saved_models'),
               model_file_name='keras_trumptweets_trained_model.h5'):
    """
    Save model and current weights
    :param model: Keras model
    :param save_dir: path name to save directory
    :param model_file_name: filename for saved model
    :return: nothing
    """
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
    model_path = os.path.join(save_dir, model_file_name)
    model.save(model_path)
    print('Saved trained model at %s ' % model_path)


def load_model(save_dir, model_file_name):
    # Load model and weights
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
    model_path = os.path.join(save_dir, model_file_name)
    model = keras.models.load_model(model_path)
    print('Loaded trained model from %s ' % model_path)
    return model


# ---------------- MODEL ARCHITECTURE ---------------------------
def build_model(characters):
    # build the model: 2 stacked LSTM
    print('Build model...')
    model = Sequential()
    model.add(CuDNNLSTM(1024, return_sequences=True, input_shape=(None,len(characters)) ))  # minesh witout specifying the input_length
    model.add(CuDNNLSTM(1024, return_sequences=True))
    model.add(Dropout(0.5))
    model.add(Dense(512, activation="relu", bias_initializer=keras.initializers.Constant(value=0.01))) 
    model.add(TimeDistributed(Dense(len(characters)))) 
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    print('model is made')
    # train the model, output generated text after each iteration
    print(model.summary())
    return model



# ---------------- MODEL TRAINING ---------------------------
def train_net(model, x, y, training_iterations=25, maxlen=40, save_all_model_iterations=True):
    for training_iteration in range(1, training_iterations+1):
        print()
        print('-' * 50)
        print('Training Iteration (epoch) #:', training_iteration)
        history = model.fit(x, y, batch_size=512, epochs=1, verbose=1)    #train 1 epoch at a time using previous weights
        sleep(0.1)  # https://github.com/fchollet/keras/issues/2110

        # saving models at the following iterations -- uncomment it if you want tos save weights and load it later
        # if training_iteration==1 or training_iteration==3 or training_iteration==5 or training_iteration==10 or training_iteration==20 or training_iteration==30 or training_iteration==50 or training_iteration==60 :

        # # save every training_iteration of weights
        # model.save_weights('Karpathy_LSTM_weights_' + str(training_iteration) + '.h5', overwrite=True)
        # start_index = random.randint(0, len(text) - maxlen - 1)
        save_dir = os.path.join(os.getcwd(), 'saved_models')
        current_model_file_name = 'Trump_LSTM_model_' + str(training_iteration) + '.h5'
        if save_all_model_iterations:
            save_model(model=model, save_dir=save_dir, model_file_name=current_model_file_name)
        sys.stdout.flush()
        print('loss is')
        print(history.history['loss'][0])
        print()
    return model

In [5]:
################################################################
#--------------------- Main Code -----------------------------


# pick the filename you want to use, and comment out the rest
# make sure you have this directory structure
raw_text_filename='./textdatasets/trumptweets.txt'
    

text, chars, char_indices, indices_char, sentences, next_chars = preprocess_text_file(raw_text_filename)

#vectorized form takes too much space to save... so process in real time
X, y = vectorize_text(chars, char_indices, sentences, next_chars)

loading:  ./textdatasets/trumptweets.txt
corpus length: 2264736
total chars: 86
number of sequences: 1496335
Vectorization processing... this could take a while...
 part 1 of 2
149634  of  1496335  completed
299268  of  1496335  completed
448901  of  1496335  completed
598535  of  1496335  completed
748168  of  1496335  completed
897802  of  1496335  completed
1047435  of  1496335  completed
1197069  of  1496335  completed
1346702  of  1496335  completed
 part 2 of 2
149634  of  1496335  completed
299268  of  1496335  completed
448901  of  1496335  completed
598535  of  1496335  completed
748168  of  1496335  completed
897802  of  1496335  completed
1047435  of  1496335  completed
1197069  of  1496335  completed
1346702  of  1496335  completed
vetorization completed


In [7]:
TRAIN_MODE = False
if TRAIN_MODE:
    model = build_model(characters=chars)
    model_epoch_training_iterations = 25    #the bigger your text corpus, the smaller you can make this
    model = train_net(model=model, x=X, y=y,
                      training_iterations=model_epoch_training_iterations,
                      save_all_model_iterations=True)
else:  # load a model from a file
    # decide which iteration of the trained model you want to explore
    model_training_iteration = 25
    save_dir = os.path.join(os.getcwd(), 'saved_models')
    current_model_file_name = 'Trump_LSTM_model_' + str(model_training_iteration) + '.h5'
    model = load_model(save_dir=save_dir, model_file_name=current_model_file_name)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Loaded trained model from C:\Users\Andrew\Desktop\trumpbot\saved_models\Trump_LSTM_model_25.h5 


In [8]:
GENERATE_TEXT_MODE = True
if GENERATE_TEXT_MODE:   # generate text mode
    #decide which saved model to load
    #make up a string of characters to start with
    seed_string = "covfefe" # pick something from the news
    # decide how many text characters you want to generate:
    gen_char_count = 140 - len(seed_string) # has to be a tweet!
    tweet = generate_text(model, char_indices, indices_char, seed_string, generate_character_count=gen_char_count)

seed string -->  covfefe
The generated text is: 
covfefering the democrat debate live on twitter! lets do this!!! @realdonaldtrump #1 • with a lead 2x's higher than #2 jeb bush. #trump2016?