In [6]:
import pandas as pd
import numpy as np
import spacy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils


In [137]:
df = pd.read_csv('/Users/brandonbryant/Desktop/fivethirtyeight-trump-twitter/data/realdonaldtrump_poll_tweets.csv')

In [112]:
tweets = df['text'].values
goodTweets = []
for tweet in tweets:
    if 'RT' in tweet:
        continue
    if '@' in tweet:
        continue
    goodTweets.append(tweet)

In [138]:
text = ""
for tweet in tweets:
    text = text + " " + tweet

In [139]:
text = text.lower()
text = text.replace('.', "")
len(text)

53773

In [140]:
nlp = spacy.load('en', tagger=False, entity=False)
doc = nlp(text.lower())

tokens = []
for token in doc:
    if not 'http' in str(token):
        tokens.append(str(token))
        

n_tokens = len(tokens)
tokens_to_int = dict((c, i) for i, c in enumerate(tokens))
n_vocab = len(set(tokens))

In [141]:
print("Total Tokens: ", n_tokens)
print("Total Vocab: ", n_vocab)

Total Tokens:  10535
Total Vocab:  1914


In [142]:
# prepare the dataset of input to output pairs encoded as integers
seq_length = 11
dataX = []
dataY = []
for i in range(0, n_tokens - seq_length, 1):
    seq_in = tokens[i:i + seq_length]
    seq_out = tokens[i + seq_length]
    gen = ([tokens_to_int[token]] for token in seq_in)
    dataX.append([next(gen)[0],next(gen)[0],next(gen)[0],next(gen)[0],next(gen)[0],next(gen)[0],next(gen)[0], next(gen)[0],next(gen)[0],next(gen)[0],next(gen)[0]])
    dataY.append(tokens_to_int[seq_out])
n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)

Total Patterns:  10524


In [143]:
# reshape X to be [samples, time steps, features]
X = np.reshape(dataX, (n_patterns, seq_length, 1))
# normalize
X = X / float(n_vocab)
# one hot encode the output variable
y = np_utils.to_categorical(dataY)

In [147]:
# define the LSTM model
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))
model.add(Dropout(0.4))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

# define the checkpoint
filepath="weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [148]:
model.fit(X, y, epochs=20, batch_size=128, callbacks=callbacks_list)

Epoch 1/20
Epoch 00001: loss improved from inf to 7.06318, saving model to weights-improvement-01-7.0632.hdf5
Epoch 2/20
Epoch 00002: loss improved from 7.06318 to 6.41479, saving model to weights-improvement-02-6.4148.hdf5
Epoch 3/20
Epoch 00003: loss improved from 6.41479 to 6.35135, saving model to weights-improvement-03-6.3513.hdf5
Epoch 4/20
Epoch 00004: loss improved from 6.35135 to 6.29867, saving model to weights-improvement-04-6.2987.hdf5
Epoch 5/20
Epoch 00005: loss improved from 6.29867 to 6.27511, saving model to weights-improvement-05-6.2751.hdf5
Epoch 6/20
Epoch 00006: loss improved from 6.27511 to 6.26308, saving model to weights-improvement-06-6.2631.hdf5
Epoch 7/20
Epoch 00007: loss did not improve
Epoch 8/20
Epoch 00008: loss did not improve
Epoch 9/20
Epoch 00009: loss did not improve
Epoch 10/20
Epoch 00010: loss did not improve
Epoch 11/20
Epoch 00011: loss did not improve
Epoch 12/20
Epoch 00012: loss did not improve
Epoch 13/20
Epoch 00013: loss improved from 6.2

<keras.callbacks.History at 0x1145e4400>

In [149]:
# load the network weights
filename = "weights-improvement-13-6.2619.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [150]:
int_to_token = dict((i, c) for i, c in enumerate(tokens))

In [151]:
generateTrumptweets = list()

In [160]:
for i in range(0,1):
    # pick a random seed
    start = np.random.randint(0, len(dataX)-1)
    pattern = dataX[start]
    tweet = ""
    tweet = tweet + ''.join([int_to_token[value] for value in pattern])
    # generate characters
    for i in range(40):
        x = np.reshape(pattern, (1, len(pattern), 1))
        x = x / float(n_vocab)
        prediction = model.predict(x, verbose=0)
        index = np.argmax(prediction)
        result = int_to_token[index]
        seq_in = [int_to_token[value] for value in pattern]
        #print(result)
        tweet = tweet + result
        pattern.append(index)
        pattern = pattern[1:len(pattern)]
        #print(pattern[1:len(pattern)])
    generateTrumptweets.append(tweet)

In [161]:
generateTrumptweets[0]

'keepingitrealhttp://tco/asrjp5acmz"i\'mleadingbybigmargins""""""""""""""""""""'