In [2]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

In [3]:
# Load the tweet predictions and keep only the rows whose `toxic` flag equals 0.
# NOTE(review): the saved output of this cell shows rows with toxic == 1, so the
# output may predate this filter -- confirm which label is actually intended.
df = pd.read_csv('trained_model/preds/tweets_preds.csv')
non_toxic_mask = df["toxic"].eq(0)
df = df.loc[non_toxic_mask]
print(df.shape)
df.head(2)

(2075, 9)


Unnamed: 0,created_at,text,lang,tweet_id,possibly_sensitive,type,clean_tweet,toxic,pred_scores
3,2021-11-21 20:17:03+00:00,An Israeli doctor says he believes he caught t...,en,1466139299263533060,False,,"['people', ' say', ' belief', ' doctor', ' lon...",1,0.89556
6,2021-11-21 20:17:03+00:00,meu deus eu preciso fazer processos de emenda ...,pt,1466139299137695747,False,,"['de', ' emenda', ' meu', ' pra', ' e', ' faze...",1,0.998193


# Raw Text

In [4]:
# Concatenate every tweet into one lower-cased corpus string (single space between tweets).
text = ' '.join(df["text"].values).lower()

In [5]:
# Character vocabulary: every distinct character of the corpus, in sorted order,
# mapped to its position in that ordering.
chars = sorted(set(text))
char_to_int = {ch: idx for idx, ch in enumerate(chars)}

In [6]:
# Peek at the first ten entries of the sorted character vocabulary.
chars[:10]

['\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(']

## Text Characteristics

In [7]:
# Corpus statistics: total length in characters and number of distinct characters.
n_chars, n_vocab = len(text), len(chars)
for label, value in (("Total Characters: ", n_chars), ("Total Vocab: ", n_vocab)):
    print(label, value)

Total Characters:  303934
Total Vocab:  622


## Prepare the dataset of input to output pairs encoded as integers

In [8]:
seq_length = 100
# Encode the corpus to integer ids once, then slide a window of `seq_length`
# ids over it: each window is an input and the id right after it is the target.
encoded_text = [char_to_int[ch] for ch in text]
window_starts = range(0, n_chars - seq_length, 1)
dataX = [encoded_text[start:start + seq_length] for start in window_starts]
dataY = [encoded_text[start + seq_length] for start in window_starts]
n_patterns = len(dataX)
print( "Total Patterns: ", n_patterns)

Total Patterns:  303834


## Reshape X to be [samples, time steps, features]

In [9]:
# Arrange the integer windows as [samples, time steps, features] and scale the
# ids by the vocabulary size so that every input value lies in [0, 1).
X = np.asarray(dataX).reshape(n_patterns, seq_length, 1) / float(n_vocab)
# One-hot encode the next-character targets.
y = np_utils.to_categorical(dataY)

## Define the LSTM model - choose between the single and the deep variant, and comment out the other

In [10]:
# Single Model: one 256-unit LSTM -> dropout -> softmax over the target classes.
model = Sequential([
    LSTM(256, input_shape=X.shape[1:]),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam')

## Define the checkpoint
Because training is slow and because of our optimization requirements, we will use model checkpointing to record all of the network weights to file each time an improvement in loss is observed at the end of an epoch. We will use the best set of weights (lowest loss) to instantiate our generative model in the next section.

In [11]:
#filepath = "trained_model/weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
#checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
#callbacks_list = [checkpoint]

## Fit model to data

In [12]:
# Fewer epochs: less time, worse results
#model.fit(X, y, epochs=5, batch_size=128, callbacks = callbacks_list)

# More epochs, more time, best results
#model.fit(X, y, epochs=50, batch_size=64, callbacks = callbacks_list)

After running the example, you should have a number of weight checkpoint files in the local directory.

You can delete them all except the one with the smallest loss value. For example, when I ran this example, below was the checkpoint with the smallest loss that I achieved.

In [13]:
# Best checkpoint name here

## Generating text

Generating text using the trained LSTM network is relatively straightforward.

Firstly, we load the data and define the network in exactly the same way, except the network weights are loaded from a checkpoint file and the network does not need to be trained.

In [14]:
# load the network weights
# The file name matches the (commented-out) checkpoint pattern
# "weights-improvement-{epoch:02d}-{loss:.4f}.hdf5", i.e. presumably epoch 05
# with a training loss of 2.9661 -- the file must already exist on disk.
filename = "trained_model/weights-improvement-05-2.9661.hdf5"
model.load_weights(filename)
# Compile again with the same loss and optimizer as in the model definition.
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [15]:
# Inverse vocabulary lookup: integer id -> character.
int_to_char = dict(enumerate(chars))

## Make predictions

The simplest way to use the Keras LSTM model to make predictions is to first start off with a seed sequence as input, generate the next character then update the seed sequence to add the generated character on the end and trim off the first character. This process is repeated for as long as we want to predict new characters (e.g. a sequence of 1,000 characters in length).

We can pick a random input pattern as our seed sequence, then print generated characters as we generate them.

In [16]:
# NOTE: this whole cell is a single triple-quoted string expression, so none of
# the generation code inside it runs; the cell only evaluates to that string.
# NOTE(review): if this is ever re-enabled, `pattern = dataX[start]` aliases the
# list stored in dataX, so the first `pattern.append(index)` would also grow
# that dataX entry -- copy the list first (e.g. `list(dataX[start])`).
"""import sys
# pick a random seed
start = np.random.randint(0, len(dataX)-1)
pattern = dataX[start]
print( "Seed:")
print( "\"", ''.join([int_to_char[value] for value in pattern]), "\"")
# generate characters
for i in range(100):
	x = np.reshape(pattern, (1, len(pattern), 1))
	x = x / float(n_vocab)
	prediction = model.predict(x, verbose=0)
	index = np.argmax(prediction)
	result = int_to_char[index]
	seq_in = [int_to_char[value] for value in pattern]
	sys.stdout.write(result)
	pattern.append(index)
	pattern = pattern[1:len(pattern)]
print( "\nDone.")"""

'import sys\n# pick a random seed\nstart = np.random.randint(0, len(dataX)-1)\npattern = dataX[start]\nprint( "Seed:")\nprint( """, \'\'.join([int_to_char[value] for value in pattern]), """)\n# generate characters\nfor i in range(100):\n\tx = np.reshape(pattern, (1, len(pattern), 1))\n\tx = x / float(n_vocab)\n\tprediction = model.predict(x, verbose=0)\n\tindex = np.argmax(prediction)\n\tresult = int_to_char[index]\n\tseq_in = [int_to_char[value] for value in pattern]\n\tsys.stdout.write(result)\n\tpattern.append(index)\n\tpattern = pattern[1:len(pattern)]\nprint( "\nDone.")'

## 10 Extension Ideas to Improve the Model


Predict fewer than 1,000 characters as output for a given seed.

Remove all punctuation from the source text, and therefore from the model’s vocabulary.

Try a one-hot encoding for the input sequences.

Train the model on padded sentences rather than random sequences of characters.

Increase the number of training epochs to 100 or many hundreds.

Add dropout to the visible input layer and consider tuning the dropout percentage.

Tune the batch size, try a batch size of 1 as a (very slow) baseline and larger sizes from there.

Add more memory units to the layers and/or more layers.

Experiment with scale factors (temperature) when interpreting the prediction probabilities.

Change the LSTM layers to be “stateful” to maintain state across batches.

In [18]:
text = text.replace("\n", " ")  # We remove newlines chars for nicer display
print("Corpus length:", len(text))

# NOTE: `chars` is rebuilt from the newline-free text, so from here on it no
# longer matches the vocabulary (and integer ids) built earlier in the notebook.
chars = sorted(list(set(text)))
print("Total chars:", len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

# cut the text in semi-redundant sequences of maxlen characters
maxlen = 40
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i : i + maxlen])
    next_chars.append(text[i + maxlen])
print("Number of sequences:", len(sentences))

# One-hot encode inputs as (sequence, position, character) and targets as
# (sequence, character). The builtin `bool` is used as dtype because the
# `np.bool` alias is deprecated since NumPy 1.20 and removed in later releases.
# NOTE: `y` replaces the target array created for the first model.
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

Corpus length: 303934
Total chars: 621
Number of sequences: 101298


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y = np.zeros((len(sentences), len(chars)), dtype=np.bool)


In [19]:
from tensorflow import keras
from tensorflow.keras import layers

# Second model: a 128-unit LSTM over one-hot encoded windows of `maxlen`
# characters, followed by a softmax over the vocabulary.
# `keras.Sequential` is taken from the same `tensorflow.keras` namespace as
# `keras.Input` and `layers`, instead of the `Sequential` imported from the
# standalone `keras` package, so that model and layer classes cannot come from
# two different Keras installations.
# NOTE: this rebinds `model`, replacing the first LSTM model of the notebook.
model = keras.Sequential(
    [
        keras.Input(shape=(maxlen, len(chars))),
        layers.LSTM(128),
        layers.Dense(len(chars), activation="softmax"),
    ]
)
optimizer = keras.optimizers.RMSprop(learning_rate=0.01)
model.compile(loss="categorical_crossentropy", optimizer=optimizer)
In [20]:
def sample(preds, temperature=1.0):
    """Sample an index from a probability array after temperature scaling.

    Args:
        preds: 1-D array-like of non-negative weights (e.g. a softmax output).
            It is re-normalized after scaling, so it does not have to sum to 1.
        temperature: Divisor applied to the log-probabilities. Values below 1
            sharpen the distribution, values above 1 flatten it. ``0`` returns
            the most likely index deterministically.

    Returns:
        int: The sampled index into ``preds``.
    """
    preds = np.asarray(preds).astype("float64")
    if temperature == 0:
        # Limit of the tempered distribution as temperature -> 0: greedy pick
        # (dividing by zero below would only produce inf/NaN).
        return int(np.argmax(preds))
    # log(0) = -inf is intended: zero-probability entries stay at probability 0.
    with np.errstate(divide="ignore"):
        logits = np.log(preds) / temperature
    # Shift by the maximum before exponentiating; otherwise a low temperature
    # can underflow every term to 0 and the normalization becomes 0/0 = NaN.
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return int(np.argmax(probas))

In [21]:
from numpy import random
epochs = 10
batch_size = 128

# Train one epoch at a time so that sample text can be generated after each pass.
for epoch in range(epochs):
    model.fit(x, y, batch_size=batch_size, epochs=1)
    print()
    print("Generating text after epoch: %d" % epoch)

    # One seed window per epoch, shared by every diversity setting below.
    start_index = random.randint(0, len(text) - maxlen - 1)
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print("...Diversity:", diversity)

        sentence = text[start_index : start_index + maxlen]
        print('...Generating with seed: "' + sentence + '"')

        pieces = []
        for _ in range(400):
            # One-hot encode the current window of characters in a single assignment.
            x_pred = np.zeros((1, maxlen, len(chars)))
            positions = np.arange(len(sentence))
            x_pred[0, positions, [char_indices[c] for c in sentence]] = 1.0
            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]
            # Slide the window: drop the oldest character, append the new one.
            sentence = sentence[1:] + next_char
            pieces.append(next_char)
        generated = "".join(pieces)

        print("...Generated: ", generated)
        print()


Generating text after epoch: 0
...Diversity: 0.2
...Generating with seed: "hild for covid 🥴🚬 fail! kylie irving was"
...Generated:   the and covid a ported covid and the and covid a a covid a the covid covid and the covid a dick a de prople a the sore to have a on that covid a covid and the and the and covid sond the ast the proter the ont a proter covid covid a sont a proter a proter a the covid a manted and the and the covid a proter a deater the and covid are to a dick a a covid and covid and a so the and the and a proter a

...Diversity: 0.5
...Generating with seed: "hild for covid 🥴🚬 fail! kylie irving was"
...Generated:   a with a post a agest a terit covid hate sout a mident arous rawe in infiction a how us aringit a shat in that in covid have covid in ond anows a dide to a searted hare tran covid a ding anterely ase wores ale the covid to and covid concorvid the so https://t.co/spwsakzke @norcendored @repcapisvir @artontedton @gonithtarter @artisusshingon like a @covidic @hcro