In [1]:
import pandas as pd

df = pd.read_csv('tweets.csv', encoding ='latin1').head(200000)
df

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
...,...,...,...,...,...,...
199995,0,1971570223,Sat May 30 07:24:14 PDT 2009,NO_QUERY,gogetphilled,work... again
199996,0,1971570317,Sat May 30 07:24:15 PDT 2009,NO_QUERY,CocaBeenSlinky,@damienfranco Its so common for it to crash no...
199997,0,1971570508,Sat May 30 07:24:16 PDT 2009,NO_QUERY,dwfavoritegirl,my baby boy is wearing big boy underwear
199998,0,1971570762,Sat May 30 07:24:18 PDT 2009,NO_QUERY,LuisIsLegend,Fml! I forgot my phone charger @home!


In [3]:
tweet_list = df.iloc[:,-1].astype(str).tolist()
tweets_string = ' '.join(tweet_list)

with open("tweets_string.txt", "w", encoding="utf-8") as text_file:
    text_file.write(tweets_string)

In [None]:
import numpy, sys
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
from tensorflow.keras.utils import to_categorical

In [None]:
# load ascii text and covert to lowercase
filename = "tweets_string.txt"
raw_text = open(filename, 'r', encoding='utf-8').read()
raw_text = raw_text.lower()
# create mapping of unique chars to integers
chars = sorted(list(set(raw_text)))
char_to_int = dict((c, i) for i, c in enumerate(chars))
# summarize the loaded data
n_chars = len(raw_text)
n_vocab = len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

In [None]:
# prepare the dataset of input to output pairs encoded as integers
seq_length = 100
dataX = []
dataY = []
for i in range(0, n_chars - seq_length, 1):
    seq_in = raw_text[i:i + seq_length]
    seq_out = raw_text[i + seq_length]
    dataX.append([char_to_int[char] for char in seq_in])
    dataY.append(char_to_int[seq_out])
n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)

In [None]:
# reshape X to be [samples, time steps, features]
X = numpy.reshape(dataX, (n_patterns, seq_length, 1))
# normalize
X = X / float(n_vocab)
# one hot encode the output variable
y = to_categorical(dataY)
# define the LSTM model
model = Sequential([
    LSTM(32, input_shape=(None, X.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(32),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax')
])

# compile the model
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
model.summary()

In [None]:
# fit the model
model.fit(X, y, epochs=5, batch_size=128)

In [None]:
model.save('models/model02.h5')

# generating test

In [None]:
from tensorflow.keras.models import load_model

model = load_model('models/model01.h5')
model.summary()

In [None]:
int_to_char = dict((i, c) for i, c in enumerate(chars))

In [None]:
# pick a random seed
start = numpy.random.randint(0, len(dataX)-1)
pattern = dataX[start]
print("Seed:")
print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")

# generate characters
for i in range(140):
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    index = numpy.argmax(prediction)
    result = int_to_char[index]
    seq_in = [int_to_char[value] for value in pattern]
    sys.stdout.write(result)
    pattern.append(index)
    pattern = pattern[1:len(pattern)]
print("\nDone.")