In [1]:
import re
import numpy as np
import pandas as pd

tweets = pd.read_csv('trump.csv')['Tweet_Text']
tweets

0       Today we express our deepest gratitude to all ...
1       Busy day planned in New York. Will soon be mak...
2       Love the fact that the small groups of protest...
3       Just had a very open and successful presidenti...
4       A fantastic day in D.C. Met with President Oba...
                              ...                        
7370    I loved firing goofball atheist Penn @pennjill...
7371    I hear @pennjillette show on Broadway is terri...
7372    Irrelevant clown @KarlRove sweats and shakes n...
7373    "@HoustonWelder: Donald Trump is one of the se...
7374    RT @marklevinshow: Trump: Rove is a clown and ...
Name: Tweet_Text, Length: 7375, dtype: object

In [2]:
raw_text = ''

for row in tweets:
    raw_text += row
    raw_text += ' '
    
raw_text = raw_text.replace('\n', ' ')
raw_text = re.sub(r'http\S+', '', raw_text)

In [3]:
raw_text = raw_text.lower()
# create mapping of unique chars to integers
chars = sorted(list(set(raw_text)))
char_to_int = dict((c, i) for i, c in enumerate(chars))
# summarize the loaded data
n_chars = len(raw_text)
n_vocab = len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

Total Characters:  779711
Total Vocab:  79


In [4]:
import pickle

with open('chars.txt', 'wb') as fp:
    pickle.dump(chars, fp)

In [5]:
# prepare the dataset of input to output pairs encoded as integers
seq_length = 100
dataX = []
dataY = []
for i in range(0, n_chars - seq_length, 1):
    seq_in = raw_text[i:i + seq_length]
    seq_out = raw_text[i + seq_length]
    dataX.append([char_to_int[char] for char in seq_in])
    dataY.append(char_to_int[seq_out])

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(dataX, dataY, test_size=0.1, random_state=42)

print("Total Patterns: ", len(X_train))

Total Patterns:  701649


In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
from tensorflow.keras.utils import to_categorical

# reshape X to be [samples, time steps, features]
X = np.reshape(X_train, (len(X_train), seq_length, 1))
# normalize
X = X / float(n_vocab)
# one hot encode the output variable
y = to_categorical(y_train)
# define the LSTM model
model = Sequential([
    LSTM(len(chars)*7, input_shape=(None, X.shape[2])),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax')
])

# compile the model
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 553)               1227660   
_________________________________________________________________
dropout_1 (Dropout)          (None, 553)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 79)                43766     
Total params: 1,271,426
Trainable params: 1,271,426
Non-trainable params: 0
_________________________________________________________________


In [10]:
from tensorflow.keras.callbacks import EarlyStopping

early_stop = [EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=0, mode='min', baseline=None)]

# fit the model
model.fit(X, y, epochs=10, batch_size=128, callbacks=early_stop, validation_split=0.1)

Train on 631484 samples, validate on 70165 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x25e38408048>

In [11]:
model.save('models/model02.h5')

# model validation

In [12]:
X = np.reshape(X_test, (len(X_test), seq_length, 1))

# normalize
X = X / float(n_vocab)
# one hot encode the output variable
y = to_categorical(y_test)

score = model.evaluate(X, y, verbose = 0) 

print('Test loss:', score[0]) 
print('Test accuracy:', score[1])

IndexError: invalid index to scalar variable.