In [None]:
import re
import numpy as np
import pandas as pd

# Read the tweet export and keep only the text column as a Series;
# the bare name on the last line renders it as the cell output.
tweets = pd.read_csv('trump.csv').loc[:, 'Tweet_Text']
tweets

In [None]:
# Concatenate every tweet into one corpus string, each tweet followed by a
# single space. str.join builds the result in one pass instead of repeatedly
# re-allocating a growing string. Missing values (NaN) are skipped and any
# non-string cell is coerced to str, so a sparse column cannot raise TypeError.
raw_text = ''.join(f'{text} ' for text in tweets.dropna().astype(str))

# Flatten embedded line breaks, then drop URLs (anything starting with
# "http" up to the next whitespace character).
raw_text = raw_text.replace('\n', ' ')
raw_text = re.sub(r'http\S+', '', raw_text)

In [None]:
# Case-fold the corpus so upper- and lower-case letters share one symbol.
raw_text = raw_text.lower()

# Vocabulary: every distinct character, sorted so the index assignment is
# deterministic, plus the character -> integer lookup table.
chars = sorted(set(raw_text))
char_to_int = {symbol: index for index, symbol in enumerate(chars)}

# Corpus size and vocabulary size, reported for a quick sanity check.
n_chars, n_vocab = len(raw_text), len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

In [None]:
import pickle

# Persist the sorted vocabulary so the same character ordering can be
# reloaded later. Note the file holds binary pickle data despite the
# .txt extension.
with open('chars.txt', 'wb') as vocab_file:
    pickle.dump(chars, vocab_file)

In [None]:
# Build (input window, next character) training pairs, encoded as integers.
# Each input is seq_length consecutive characters; the target is the
# character that immediately follows the window. Windows advance one
# character at a time.
seq_length = 100

# Encode the corpus once, then slice windows out of the integer sequence.
encoded_text = [char_to_int[symbol] for symbol in raw_text]

dataX = [
    encoded_text[start:start + seq_length]
    for start in range(n_chars - seq_length)
]
dataY = [encoded_text[target] for target in range(seq_length, n_chars)]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the windows for testing; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the windows are built with a stride of one character and the
# split shuffles by default, so held-out windows overlap heavily with training
# windows. Test metrics may be optimistic; confirm whether a contiguous
# (unshuffled) split is wanted instead.
split_parts = train_test_split(dataX, dataY, test_size=0.1, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print("Total Patterns: ", len(X_train))

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
from tensorflow.keras.utils import to_categorical

# Reshape the training windows to [samples, time steps, features]; each time
# step carries a single feature (the character index).
X = np.reshape(X_train, (len(X_train), seq_length, 1))
# Scale the integer character indices into [0, 1).
X = X / float(n_vocab)
# One-hot encode the targets. num_classes is pinned to the vocabulary size:
# left to inference it would be max(y_train) + 1, which shrinks the output
# layer whenever the highest-indexed character is absent from the training
# split and then disagrees with labels seen at evaluation time.
y = to_categorical(y_train, num_classes=n_vocab)

# Two stacked LSTM layers with dropout, followed by a softmax over the
# vocabulary. The first LSTM returns the full sequence so the second one
# receives one vector per time step.
model = Sequential([
    LSTM(512, input_shape=(None, X.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(512),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax')
])

# Track accuracy alongside the loss: without a metric, model.evaluate()
# returns a bare loss value and there is no accuracy figure to report.
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
model.summary()

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop once validation loss has not improved for 10 consecutive epochs.
# restore_best_weights=True rolls the model back to the epoch with the lowest
# validation loss; without it the model keeps the weights from the final
# epoch, i.e. after `patience` epochs without improvement.
early_stop = [
    EarlyStopping(
        monitor='val_loss',
        min_delta=0,
        patience=10,
        verbose=0,
        mode='min',
        baseline=None,
        restore_best_weights=True,
    )
]

# Fit the model; 10% of the training data is set aside as the validation set
# that drives early stopping.
model.fit(X, y, epochs=200, batch_size=128, callbacks=early_stop, validation_split=0.1)

In [None]:
import os

# Make sure the output folder exists first, so a long training run is not
# lost to a failed save because 'models/' is missing.
os.makedirs('models', exist_ok=True)
model.save('models/model02.h5')

# model validation

In [None]:
# Prepare the held-out windows exactly like the training data:
# [samples, time steps, features], scaled by the vocabulary size.
X = np.reshape(X_test, (len(X_test), seq_length, 1))
X = X / float(n_vocab)

# One-hot encode the targets to the width of the model's output layer.
# Inferring the width from y_test alone gives max(y_test) + 1 columns, which
# may not match the number of classes the model predicts.
y = to_categorical(y_test, num_classes=model.output_shape[-1])

score = model.evaluate(X, y, verbose=0)

# evaluate() returns [loss, metric, ...] when metrics were compiled in, and a
# bare scalar loss otherwise; indexing a scalar would raise. Handle both, and
# compute accuracy from the predictions when it was not tracked as a metric.
if isinstance(score, (list, tuple)):
    test_loss, test_accuracy = score[0], score[1]
else:
    test_loss = score
    predicted_classes = np.argmax(model.predict(X, verbose=0), axis=1)
    test_accuracy = float(np.mean(predicted_classes == np.asarray(y_test)))

print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)