In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

import numpy as np
import pandas as pd
import re

In [2]:
# Location of the tweet archive CSV.
# NOTE(review): the path looks like a Colab Google Drive mount; the mount step
# is not shown in this notebook, so Drive presumably has to be mounted first.
file_path = '/content/drive/MyDrive/Colab Notebooks/tweets_11-06-2020.csv'
# Read the whole CSV into a DataFrame.
data = pd.read_csv(file_path)

In [3]:
# Preview the first rows of the loaded table to confirm the columns.
data.head()

Unnamed: 0,id,text,isRetweet,isDeleted,device,favorites,retweets,date
0,98454970654916608,Republicans and Democrats have both created ou...,f,f,TweetDeck,49,255,2011-08-02 18:07:48
1,1234653427789070336,I was thrilled to be back in the Great city of...,f,f,Twitter for iPhone,73748,17404,2020-03-03 01:34:50
2,1218010753434820614,RT @CBS_Herridge: READ: Letter to surveillance...,t,f,Twitter for iPhone,0,7396,2020-01-17 03:22:47
3,1304875170860015617,The Unsolicited Mail In Ballot Scam is a major...,f,f,Twitter for iPhone,80527,23502,2020-09-12 20:10:58
4,1218159531554897920,RT @MZHemingway: Very friendly telling of even...,t,f,Twitter for iPhone,0,9081,2020-01-17 13:13:59


In [4]:
# Clean text
def clean_text(text):
    '''Normalize one tweet for tokenization.

    Lowercases the text, removes text in square brackets, links, HTML-like
    tags, words containing digits and @mentions, and collapses all
    whitespace (including line breaks) to single spaces. Punctuation is
    left in place.

    Parameters
    ----------
    text : str
        Raw tweet text. Non-string values (e.g. NaN from an empty CSV cell)
        are treated as empty text.

    Returns
    -------
    str
        The cleaned, space-separated text.
    '''
    # Empty CSV cells load as float NaN, which has no .lower().
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # Raw strings keep the regex escapes from being read as (invalid)
    # Python string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # Replace line breaks with a space rather than deleting them, so the
    # words on either side are not glued together.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # split() drops the empty gaps left by the removals above.
    return " ".join(word for word in text.split() if not word.startswith("@"))

# Keep only the cleaned tweet text; from here on `data` is a Series, not the
# original DataFrame, so reload the CSV before re-running this cell.
data = data['text'].apply(clean_text)

In [5]:
# Spot-check the cleaned text (`data` is now the 'text' Series).
data.head()

0    republicans and democrats have both created ou...
1    i was thrilled to be back in the great city of...
2    rt read: letter to surveillance court obtained...
3    the unsolicited mail in ballot scam is a major...
4    rt very friendly telling of events here about ...
Name: text, dtype: object

In [6]:
# Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data)
# +1 because Keras word indices start at 1; index 0 is reserved for padding.
total_words = len(tokenizer.word_index) + 1

# Expand every tweet into all of its prefixes of length >= 2, so each prefix
# gives one training example: (all tokens but the last) -> (last token).
# The corpus is tokenized in a single call rather than once per tweet.
input_sequences = [
    token_list[:end]
    for token_list in tokenizer.texts_to_sequences(data)
    for end in range(2, len(token_list) + 1)
]


In [7]:
# Padding
# Left-pad every n-gram with zeros to the length of the longest one, so the
# target token is always the last column.
max_sequence_length = max(len(seq) for seq in input_sequences)
# pad_sequences already returns a NumPy array; wrapping it in np.array()
# would only make a second copy of a potentially very large matrix.
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')

In [None]:
# Predictors are every column but the last; the target is the last token.
x = input_sequences[:, :-1]
labels = input_sequences[:, -1]
# One-hot encode the targets over the vocabulary.
# NOTE(review): this is a dense (num_sequences, total_words) array and may not
# fit in memory for a large corpus - confirm, or consider integer labels with
# a sparse categorical loss.
y = tf.keras.utils.to_categorical(labels, num_classes=total_words)

In [None]:
# Model
# Next-word model: embedding -> three stacked LSTMs -> softmax over the vocabulary.
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_length-1))
model.add(LSTM(100, return_sequences=True))
# An LSTM that feeds another LSTM must emit the full sequence (3-D output);
# without return_sequences=True the next LSTM receives 2-D input and the
# model fails to build.
model.add(LSTM(50, return_sequences=True))
model.add(LSTM(25))
model.add(tf.keras.layers.Dropout(0.1))
# Layer width must be a positive integer: use floor division instead of `/`
# (which yields a float) and keep at least one unit for tiny vocabularies.
model.add(Dense(max(1, total_words // 20)))
model.add(Dense(total_words, activation='softmax'))
model.summary()

In [None]:
# Categorical cross-entropy matches the one-hot targets in `y`.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train on the n-gram prefixes; `history` keeps the per-epoch metrics.
history = model.fit(
    x=x,
    y=y,
    epochs=20,
    verbose=1,
)

In [2]:
# Generation
def generate_text(seed, limit):
    """Extend `seed` by repeatedly appending the model's most likely next word.

    Uses the notebook-level `tokenizer`, `model` and `max_sequence_length`.

    Parameters
    ----------
    seed : str
        Starting text.
    limit : int
        Number of prediction steps (words to append).

    Returns
    -------
    str
        The seed followed by the generated words, separated by spaces.
    """
    for _ in range(limit):
        token_list = tokenizer.texts_to_sequences([seed])[0]
        # Pad/truncate to the input length the model was trained on.
        padded = pad_sequences([token_list], maxlen=max_sequence_length - 1, padding='pre')
        # verbose=0 stops Keras printing a progress bar on every step.
        probabilities = model.predict(padded, verbose=0)
        # Batch of one: take the single predicted class as a plain int.
        predicted = int(np.argmax(probabilities, axis=-1)[0])
        # Direct index -> word lookup; index 0 (padding) has no word and
        # maps to an empty string.
        output_word = tokenizer.index_word.get(predicted, "")
        seed += " " + output_word
    # Return the generated text so callers can display or reuse it.
    return seed

In [None]:
# Run 50 generation steps from the seed "America"; the cell displays whatever
# generate_text returns.
generate_text("America", 50)