In [1]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

In [5]:
df = pd.read_csv('trained_model/preds/tweets_preds.csv')
# Keep only the rows whose `toxic` flag equals 1.
is_toxic = df["toxic"].eq(1)
df = df.loc[is_toxic]
print(df.shape)
df.head(2)

(2075, 9)


Unnamed: 0,created_at,text,lang,tweet_id,possibly_sensitive,type,clean_tweet,toxic,pred_scores
3,2021-11-21 20:17:03+00:00,An Israeli doctor says he believes he caught t...,en,1466139299263533060,False,,"['people', ' say', ' belief', ' doctor', ' lon...",1,0.89556
6,2021-11-21 20:17:03+00:00,meu deus eu preciso fazer processos de emenda ...,pt,1466139299137695747,False,,"['de', ' emenda', ' meu', ' pra', ' e', ' faze...",1,0.998193


# Raw Text

In [6]:
# Concatenate every tweet into one space-separated, lower-cased corpus string.
text = ' '.join(df["text"].values).lower()

In [7]:
# create mapping of unique chars to integers
chars = sorted(list(set(text)))
char_to_int = dict((c, i) for i, c in enumerate(chars))

In [8]:
# Peek at the first ten entries of the sorted character vocabulary.
chars[:10]

['\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(']

## Text Characteristics

In [9]:
# Corpus length in characters and number of distinct characters.
n_chars, n_vocab = len(text), len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

Total Characters:  303934
Total Vocab:  622


## Prepare the dataset of input to output pairs encoded as integers

In [10]:
# Slide a fixed-width window over the corpus one character at a time:
# each window of `seq_length` characters is an input, and the character
# immediately after it is the target. Both are stored as integer codes.
seq_length = 100
window_starts = range(n_chars - seq_length)
dataX = [
    [char_to_int[ch] for ch in text[begin:begin + seq_length]]
    for begin in window_starts
]
dataY = [char_to_int[text[begin + seq_length]] for begin in window_starts]
n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)

Total Patterns:  303834


## Reshape X to be [samples, time steps, features]

In [11]:
# Shape the integer patterns as [samples, time steps, features] and scale
# the codes into [0, 1) by dividing by the vocabulary size.
X = np.asarray(dataX).reshape(n_patterns, seq_length, 1) / float(n_vocab)
# One-hot encode the target characters.
y = np_utils.to_categorical(dataY)

## Define the LSTM model - Choose among single and deep, comment one

In [12]:
# Single Model: one LSTM layer, dropout, then a softmax over the output classes.
model = Sequential([
    LSTM(256, input_shape=X.shape[1:]),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')

## Define the checkpoint
Because training is slow, we use model checkpointing to save the network weights to a file each time the loss improves at the end of an epoch. We will use the best set of weights (lowest loss) to instantiate our generative model in the next section.

In [13]:
# Write a weights file named after the epoch and its loss whenever the
# training loss improves on the best value seen so far.
filepath = "trained_model/weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

## Fit model to data

In [18]:
# Fewer epochs: less training time, but worse results.
model.fit(
    X,
    y,
    epochs=5,
    batch_size=128,
    callbacks=callbacks_list,
)

# More epochs: more training time, but better results.
#model.fit(X, y, epochs=50, batch_size=64, callbacks = callbacks_list)

Epoch 1/5
Epoch 00001: loss improved from inf to 3.30275, saving model to trained_model\weights-improvement-01-3.3028.hdf5
Epoch 2/5
Epoch 00002: loss improved from 3.30275 to 3.27748, saving model to trained_model\weights-improvement-02-3.2775.hdf5
Epoch 3/5
Epoch 00003: loss improved from 3.27748 to 3.18497, saving model to trained_model\weights-improvement-03-3.1850.hdf5
Epoch 4/5
Epoch 00004: loss improved from 3.18497 to 3.03886, saving model to trained_model\weights-improvement-04-3.0389.hdf5
Epoch 5/5
Epoch 00005: loss improved from 3.03886 to 2.96611, saving model to trained_model\weights-improvement-05-2.9661.hdf5


<keras.callbacks.History at 0x2a90738f130>

After running the example, you should have a number of weight checkpoint files in the local directory.

You can delete them all except the one with the smallest loss value. For example, when I ran this example, below was the checkpoint with the smallest loss that I achieved.

In [None]:
# Best checkpoint name here

## Generating text

Generating text using the trained LSTM network is relatively straightforward.

Firstly, we load the data and define the network in exactly the same way, except the network weights are loaded from a checkpoint file and the network does not need to be trained.

In [20]:
# load the network weights
filename = "trained_model/weights-improvement-05-2.9661.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [21]:
# Reverse mapping
int_to_char = dict((i, c) for i, c in enumerate(chars))

## Make predictions

The simplest way to use the Keras LSTM model to make predictions is to first start off with a seed sequence as input, generate the next character then update the seed sequence to add the generated character on the end and trim off the first character. This process is repeated for as long as we want to predict new characters (e.g. a sequence of 1,000 characters in length).

We can pick a random input pattern as our seed sequence, then print generated characters as we generate them.

In [None]:
import sys

# Pick a random training pattern as the seed. np.random.randint excludes its
# upper bound, so passing len(dataX) makes every pattern (including the last)
# eligible.
start = np.random.randint(0, len(dataX))
# Work on a copy: the loop below appends to `pattern`, and appending to the
# list stored in dataX would silently lengthen that training sample and break
# any later reshape of dataX.
pattern = list(dataX[start])
print( "Seed:")
print( "\"", ''.join([int_to_char[value] for value in pattern]), "\"")
# Generate 1000 characters greedily: predict the most likely next character,
# emit it, then slide the window forward by one position.
for i in range(1000):
    # Same [samples, time steps, features] layout and scaling as the training input.
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    index = int(np.argmax(prediction))
    result = int_to_char[index]
    sys.stdout.write(result)
    # Append the prediction and drop the oldest character so the window
    # length stays constant.
    pattern.append(index)
    pattern = pattern[1:]
print( "\nDone.")

Seed:
" is it a myth? i have this comfort blanket that rhinovirus stops covid 😭😂 is it nuts !? gets me throu "
  oo poe  oo po  ho po  ho po  ho po  ho po  ho po  ho po  ho po  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  ho  oo to  

## 10 Extension Ideas to Improve the Model


Predict fewer than 1,000 characters as output for a given seed.

Remove all punctuation from the source text, and therefore from the model's vocabulary.

Try a one-hot encoding for the input sequences.

Train the model on padded sentences rather than random sequences of characters.

Increase the number of training epochs to 100 or many hundreds.

Add dropout to the visible input layer and consider tuning the dropout percentage.

Tune the batch size, try a batch size of 1 as a (very slow) baseline and larger sizes from there.

Add more memory units to the layers and/or more layers.

Experiment with scale factors (temperature) when interpreting the prediction probabilities.

Change the LSTM layers to be “stateful” to maintain state across batches.