In [None]:
# base
import pickle
import re

import numpy as np

# scikit-learn
from sklearn.model_selection import train_test_split

# tensorflow
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Dropout, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical

In [None]:
# Read the complete corpus file into memory as a single UTF-8 decoded string.
with open('raw_text.txt', 'r', encoding='utf8') as corpus_file:
    raw_text = corpus_file.read()

In [None]:
def clean_tweet(t):
    """Strip quoted tweets, links and layout noise from raw tweet text.

    Works on a single tweet as well as on a whole corpus that holds several
    lines of text.

    Args:
        t: Raw text (str).

    Returns:
        The cleaned text on a single line: quoted-retweet fragments, lines
        that are entirely wrapped in “curly quotes”, http(s) links and
        pic.twitter.com links are removed, every run of whitespace (line
        breaks included) is collapsed to one space, and '&amp;' is replaced
        by 'and'.
    """
    # Drop quoted retweets: everything from '"@' up to the end of that line.
    t = re.sub(r'"@.*', '', t)
    # Drop lines that consist solely of a curly-quoted citation. MULTILINE
    # makes ^ and $ anchor at every line; without it the pattern could only
    # match when the whole input was one single line.
    t = re.sub(r'^“.*”$', '', t, flags=re.MULTILINE)
    # Remove links.
    t = re.sub(r'https?://\S*', '', t)
    t = re.sub(r'pic\.twitter\.com/\S*', '', t)
    # Collapse whitespace. Line breaks are handled here too, so they turn into
    # a single space; deleting them outright would glue the last word of one
    # line to the first word of the next.
    t = re.sub(r'\s+', ' ', t)
    # Replace the HTML entity '&amp;' with the word 'and'.
    t = t.replace('&amp;', 'and')
    return t

# Clean the whole corpus in one pass; raw_text is overwritten with the cleaned text.
raw_text = clean_tweet(raw_text)

In [None]:
def prepare_tweet_clf(t):
    """Normalise text down to lower-case ASCII letters and spaces.

    Steps, in order: lower-case, expand a few English contractions, remove
    @mentions and #hashtags, then delete every character that is not an
    ASCII letter or a space. Whitespace is not re-collapsed, so removed
    tokens can leave consecutive spaces behind. Stop-word removal is not
    applied.

    Args:
        t: Text to normalise (str).

    Returns:
        The normalised text (str).
    """
    t = t.lower()
    # Fold the typographic apostrophe into the ASCII one so the contraction
    # rules below also fire on text such as "won’t".
    t = t.replace('’', "'")
    # Expand contractions. The irregular forms must run before the generic
    # "n't" rule, otherwise "can't" would come out as "ca not".
    t = re.sub("'ll", ' will', t)
    t = re.sub("won't", 'will not', t)
    t = re.sub("can't", 'can not', t)
    t = re.sub("n't", ' not', t)
    t = re.sub(r'@[A-Za-z0-9_]+', '', t)  # remove @mention
    t = re.sub(r'#[A-Za-z0-9_]+', '', t)  # remove #tag
    t = re.sub(r'[^a-zA-Z ]', '', t)  # keep only letters and spaces
    return t

# Apply the normalisation to the corpus; raw_text is overwritten with the result.
raw_text = prepare_tweet_clf(raw_text)

In [None]:
# Lower-case the corpus so that each letter is represented by a single symbol.
raw_text = raw_text.lower()

# Vocabulary: every distinct character in sorted order, numbered from 0.
chars = sorted(set(raw_text))
char_to_int = {symbol: index for index, symbol in enumerate(chars)}

# Corpus statistics: length in characters and number of distinct characters.
n_chars, n_vocab = len(raw_text), len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

In [None]:
# Persist the sorted character vocabulary with pickle (binary content, despite
# the .txt suffix).
with open('chars.txt', 'wb') as vocab_file:
    pickle.dump(chars, vocab_file)

In [None]:
# Build the integer-encoded (input, target) pairs with a sliding window:
# each run of `seq_length` characters is one input pattern and the character
# that immediately follows it is the target.
seq_length = 100

# Translate the text to integer codes once, then cut the windows out of it.
encoded_text = [char_to_int[symbol] for symbol in raw_text]
window_starts = range(n_chars - seq_length)

dataX = [encoded_text[start:start + seq_length] for start in window_starts]
dataY = [encoded_text[start + seq_length] for start in window_starts]

In [None]:
# Hold out 10% of the patterns; random_state pins the shuffle so the split is
# reproducible.
# NOTE(review): the patterns are windows taken with stride 1, so neighbouring
# patterns share seq_length - 1 characters. A shuffled split therefore puts
# near-duplicates of every held-out pattern into the training set — treat any
# score computed on X_test / y_test as optimistic.
X_train, X_test, y_train, y_test = train_test_split(dataX, dataY, test_size=0.1, random_state=42)

# Report the full pattern count and the training share separately; the
# training subset alone is not the "total".
print("Total Patterns: ", len(dataX))
print("Training Patterns: ", len(X_train))

In [None]:
# Shape the training inputs as [samples, time steps, features]; the single
# feature per time step is the character's integer code.
X = np.reshape(X_train, (len(X_train), seq_length, 1))
# Scale the integer codes (0 .. n_vocab - 1) into [0, 1).
X = X / float(n_vocab)
# One-hot encode the targets. num_classes is pinned to the vocabulary size:
# when left out, Keras infers it as max(y_train) + 1, which yields too few
# columns whenever the highest-indexed character is absent from the training
# split, and the output layer would then no longer line up with `chars`.
y = to_categorical(y_train, num_classes=n_vocab)

# Three stacked LSTM layers, each followed by dropout. The first two return
# the full sequence so the next LSTM receives one vector per time step; the
# last one returns only its final state, which feeds the softmax over the
# vocabulary. The time dimension is left as None (variable length).
model = Sequential([
    LSTM(256, input_shape=(None, X.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(256),
    Dropout(0.2),
    Dense(n_vocab, activation='softmax')
])

# compile the model
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
model.summary()

In [None]:
# Stop training once val_loss has failed to improve for 7 consecutive epochs.
# restore_best_weights=True rolls the model back to the epoch with the lowest
# val_loss when training is stopped early; without it the model keeps the
# weights of the last epoch run, i.e. `patience` epochs past its best point.
early_stop = [
    EarlyStopping(
        monitor='val_loss',
        min_delta=0,
        patience=7,
        verbose=0,
        mode='auto',
        restore_best_weights=True,
    )
]

# Fit the model. validation_split=0.1 sets aside the last 10% of X / y as the
# validation data that val_loss is computed on.
model.fit(X, y, epochs=200, batch_size=128, callbacks=early_stop, validation_split=0.1)

In [None]:
# Save the model to 'models/model02.h5' (the .h5 suffix selects Keras' HDF5 format).
model.save('models/model02.h5')