In [None]:
from __future__ import print_function
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys
import io
from keras.callbacks import ModelCheckpoint


# Read the whole tweet corpus into memory as a single lower-cased string.
with io.open("Tweets_no-toxic_pre_all.txt", encoding='utf-8') as corpus_file:
    text = corpus_file.read().lower()
print('corpus length:', len(text))

# Character vocabulary in sorted order, plus the two lookup tables used to
# one-hot encode characters (char -> index) and decode predictions (index -> char).
chars = sorted(set(text))
print('total chars:', len(chars))
char_indices = {ch: idx for idx, ch in enumerate(chars)}
indices_char = dict(enumerate(chars))


corpus length: 140845
total chars: 28


In [None]:
# Cut the corpus into overlapping windows of `maxlen` characters, one window
# every `step` characters. The character that immediately follows each window
# is the target the model learns to predict.
maxlen = 40
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))

print('Vectorization...')
# One-hot encode the windows: x is (n_sequences, maxlen, n_chars) and
# y is (n_sequences, n_chars).
# The builtin `bool` is used as dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

# Model architecture: two stacked 128-unit LSTM layers, each followed by 20%
# dropout, feeding a softmax layer with one output per vocabulary character.
# The first LSTM returns the full sequence so the second one can consume it.
model = Sequential([
    LSTM(128, return_sequences=True, input_shape=(maxlen, len(chars))),
    Dropout(0.2),
    LSTM(128, return_sequences=False),
    Dropout(0.2),
    Dense(len(chars), activation='softmax'),
])
optimizer = RMSprop(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)


def sample(preds, temperature=1.0):
    """Draw one index from a probability vector after temperature scaling.

    The log-probabilities are divided by `temperature` and re-normalised:
    values below 1 sharpen the distribution (more conservative picks),
    values above 1 flatten it (more surprising picks).

    Args:
        preds: 1-D array-like of probabilities (e.g. a softmax output).
        temperature: positive scaling factor applied to the log-probabilities.

    Returns:
        The index selected by a single multinomial draw from NumPy's global RNG.
    """
    # float64 keeps the re-normalised vector close enough to 1.0 for multinomial().
    log_probs = np.log(np.asarray(preds).astype('float64'))
    weights = np.exp(log_probs / temperature)
    distribution = weights / np.sum(weights)
    draw = np.random.multinomial(1, distribution, 1)
    return np.argmax(draw)


def on_epoch_end(epoch, _):
    """Print text generated by the current model at several diversities.

    One random `maxlen`-character seed is taken from the corpus and reused for
    every diversity value, so the printed samples can be compared directly.

    Args:
        epoch: epoch index, used only in the printed header.
        _: second callback argument; ignored.
    """
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    seed_start = random.randint(0, len(text) - maxlen - 1)
    seed = text[seed_start: seed_start + maxlen]
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('----- diversity:', diversity)
        print('----- Generating with seed: "' + seed + '"')
        sys.stdout.write(seed)

        # Sliding window of the last `maxlen` characters fed to the model.
        window = seed
        for _step in range(400):
            x_pred = np.zeros((1, maxlen, len(chars)))
            for pos, ch in enumerate(window):
                x_pred[0, pos, char_indices[ch]] = 1.

            probs = model.predict(x_pred, verbose=0)[0]
            next_char = indices_char[sample(probs, diversity)]

            # Drop the oldest character and append the newly generated one.
            window = window[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

# Keras callback wrapper around on_epoch_end.
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

nb sequences: 46935
Vectorization...
Build model...


In [None]:
verbose = 1


def train_model(model, X, y, batch_size=128, nb_epoch=100, verbose=0,
                weights_path="weights_E.hdf5",
                model_path='GenerativeModel_compiled',
                extra_callbacks=None):
    """Fit `model`, checkpoint its best weights, and save the final model.

    Args:
        model: compiled Keras model to train.
        X: training inputs passed to `model.fit`.
        y: training targets passed to `model.fit`.
        batch_size: mini-batch size.
        nb_epoch: number of epochs to train for.
        verbose: verbosity forwarded to both `fit` and the checkpoint callback.
        weights_path: file the checkpoint callback writes to whenever the
            monitored loss improves.
        model_path: destination passed to `model.save` after training.
        extra_callbacks: optional iterable of additional Keras callbacks
            (e.g. the notebook's `print_callback`) run alongside the checkpointer.

    Note:
        The checkpoint monitors the *training* loss ('loss'), so `weights_path`
        holds the weights of the lowest-training-loss epoch, while `model_path`
        holds the model as it is after the last epoch.
    """
    checkpointer = ModelCheckpoint(filepath=weights_path, monitor='loss', verbose=verbose, save_best_only=True, mode='min')
    callbacks = [checkpointer] + list(extra_callbacks or [])
    model.fit(X, y, batch_size=batch_size, epochs=nb_epoch, verbose=verbose, callbacks=callbacks)
    model.save(model_path)


train_model(model, x, y, verbose=verbose)

Epoch 1/100
Epoch 00001: loss improved from inf to 1.87155, saving model to weights_E.hdf5
Epoch 2/100
Epoch 00002: loss improved from 1.87155 to 1.81495, saving model to weights_E.hdf5
Epoch 3/100
Epoch 00003: loss improved from 1.81495 to 1.76580, saving model to weights_E.hdf5
Epoch 4/100
Epoch 00004: loss improved from 1.76580 to 1.72559, saving model to weights_E.hdf5
Epoch 5/100
Epoch 00005: loss improved from 1.72559 to 1.70111, saving model to weights_E.hdf5
Epoch 6/100
Epoch 00006: loss did not improve from 1.70111
Epoch 7/100
Epoch 00007: loss improved from 1.70111 to 1.67675, saving model to weights_E.hdf5
Epoch 8/100
Epoch 00008: loss improved from 1.67675 to 1.64869, saving model to weights_E.hdf5
Epoch 9/100
Epoch 00009: loss improved from 1.64869 to 1.64250, saving model to weights_E.hdf5
Epoch 10/100
Epoch 00010: loss did not improve from 1.64250
Epoch 11/100
Epoch 00011: loss did not improve from 1.64250
Epoch 12/100
Epoch 00012: loss did not improve from 1.64250
Epoch

Epoch 00037: loss improved from 1.42117 to 1.41682, saving model to weights_E.hdf5
Epoch 38/100
Epoch 00038: loss improved from 1.41682 to 1.41060, saving model to weights_E.hdf5
Epoch 39/100
Epoch 00039: loss improved from 1.41060 to 1.40304, saving model to weights_E.hdf5
Epoch 40/100
Epoch 00040: loss improved from 1.40304 to 1.39462, saving model to weights_E.hdf5
Epoch 41/100
Epoch 00041: loss did not improve from 1.39462
Epoch 42/100
Epoch 00042: loss improved from 1.39462 to 1.38825, saving model to weights_E.hdf5
Epoch 43/100
Epoch 00043: loss did not improve from 1.38825
Epoch 44/100
Epoch 00044: loss improved from 1.38825 to 1.37711, saving model to weights_E.hdf5
Epoch 45/100
Epoch 00045: loss improved from 1.37711 to 1.37690, saving model to weights_E.hdf5
Epoch 46/100
Epoch 00046: loss improved from 1.37690 to 1.37016, saving model to weights_E.hdf5
Epoch 47/100
Epoch 00047: loss improved from 1.37016 to 1.36909, saving model to weights_E.hdf5
Epoch 48/100
Epoch 00048: los

Epoch 00073: loss did not improve from 1.27077
Epoch 74/100
Epoch 00074: loss improved from 1.27077 to 1.26928, saving model to weights_E.hdf5
Epoch 75/100
Epoch 00075: loss improved from 1.26928 to 1.26048, saving model to weights_E.hdf5
Epoch 76/100
Epoch 00076: loss did not improve from 1.26048
Epoch 77/100
Epoch 00077: loss improved from 1.26048 to 1.24211, saving model to weights_E.hdf5
Epoch 78/100
Epoch 00078: loss did not improve from 1.24211
Epoch 79/100
Epoch 00079: loss did not improve from 1.24211
Epoch 80/100
Epoch 00080: loss did not improve from 1.24211
Epoch 81/100
Epoch 00081: loss did not improve from 1.24211
Epoch 82/100
Epoch 00082: loss did not improve from 1.24211
Epoch 83/100
Epoch 00083: loss did not improve from 1.24211
Epoch 84/100
Epoch 00084: loss did not improve from 1.24211
Epoch 85/100
Epoch 00085: loss did not improve from 1.24211
Epoch 86/100
Epoch 00086: loss improved from 1.24211 to 1.23810, saving model to weights_E.hdf5
Epoch 87/100
Epoch 00087: los

In [None]:
# Seed NumPy's global RNG, which the np.random.choice / np.random.multinomial
# calls in the generation cells draw from.
np.random.seed(1337)

In [None]:
def sample(preds, temperature=0.2):
    """Draw one index from a probability vector after temperature scaling.

    This cell redefines `sample` for tweet generation. Temperature is a
    parameter (defaulting to the 0.2 used here) rather than a hard-coded
    constant, so two-argument calls such as `sample(preds, diversity)`
    keep working after this cell has run.

    Args:
        preds: 1-D array-like of probabilities (e.g. a softmax output).
        temperature: positive scaling factor applied to the log-probabilities;
            values below 1 make the draw more conservative.

    Returns:
        The index selected by a single multinomial draw from NumPy's global RNG.

    Raises:
        ValueError: if `temperature` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError('temperature must be > 0, got %r' % (temperature,))
    # float64 keeps the re-normalised vector close enough to 1.0 for multinomial().
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [None]:
# Vocabulary size; set as a side effect of create_index_char_map().
N_CHARS = None


def create_index_char_map(corpus, verbose=0):
    """Build the sorted character vocabulary of `corpus` and its lookup tables.

    Also stores the vocabulary size in the module-level `N_CHARS`.

    Args:
        corpus: string whose distinct characters form the vocabulary.
        verbose: when truthy, print the number of unique characters.

    Returns:
        A tuple `(chars, char_to_idx, idx_to_char)`: the sorted list of
        characters, a dict mapping each character to its position in that
        list, and the inverse dict.
    """
    global N_CHARS
    vocabulary = sorted(set(corpus))
    N_CHARS = len(vocabulary)
    if verbose:
        print('No. of unique characters:', N_CHARS)
    char_to_idx = {}
    idx_to_char = {}
    for position, symbol in enumerate(vocabulary):
        char_to_idx[symbol] = position
        idx_to_char[position] = symbol
    return vocabulary, char_to_idx, idx_to_char

# Build the vocabulary and lookup tables from the full corpus. Note that this
# rebinds the notebook-level `chars` that was first created when the corpus was loaded.
chars, char_to_idx, idx_to_char = create_index_char_map(text, verbose=verbose)

No. of unique characters: 28


In [None]:
def generate_tweets(model, corpus, char_to_idx, idx_to_char, n_tweets=10, verbose=0,
                    weights_path='weights_E.hdf5', tweet_length=100):
    """Generate tweets by sampling the model one character at a time.

    Each tweet is seeded with `maxlen` characters of `corpus` starting at a
    randomly chosen space, then extended by `tweet_length` sampled characters.

    Args:
        model: Keras model matching the architecture of the saved weights.
        corpus: text the seeds are taken from.
        char_to_idx: dict mapping each character to its one-hot index.
        idx_to_char: inverse of `char_to_idx`.
        n_tweets: number of tweets to generate.
        verbose: when truthy, print each seed and the finished tweet.
        weights_path: weights file loaded into `model` before generating.
        tweet_length: number of characters generated after the seed.

    Returns:
        List of `n_tweets` strings, each made of the seed followed by the
        generated characters.

    Raises:
        ValueError: if `corpus` has no space that leaves room for a full seed.
    """
    model.load_weights(weights_path)
    # Width of the one-hot axis comes from the mapping that was passed in,
    # not from a notebook-level global.
    n_chars = len(char_to_idx)
    # Only start at spaces that leave room for a full `maxlen`-character seed;
    # a start too close to the end of the corpus would give a truncated window.
    last_start = len(corpus) - maxlen
    seed_starts = np.array([idx for idx, ch in enumerate(corpus)
                            if ch == ' ' and idx <= last_start])
    if seed_starts.size == 0:
        raise ValueError('corpus has no space followed by enough characters for a %d-character seed' % maxlen)

    tweets = []
    for i in range(1, n_tweets + 1):
        begin = np.random.choice(seed_starts)
        sequence = corpus[begin:begin + maxlen]
        tweet = sequence
        if verbose:
            print('Tweet no. %03d' % i)
            print('=' * 13)
            print('Generating with seed:')
            print(sequence)
            print('_' * len(sequence))
        for _ in range(tweet_length):
            x_pred = np.zeros((1, maxlen, n_chars))
            for t, char in enumerate(sequence):
                x_pred[0, t, char_to_idx[char]] = 1.0

            preds = model.predict(x_pred, verbose=0)[0]
            next_char = idx_to_char[sample(preds)]

            tweet += next_char
            # Slide the window: drop the oldest character, append the new one.
            sequence = sequence[1:] + next_char
        if verbose:
            print(tweet)
            print()
        tweets.append(tweet)
    return tweets

tweets = generate_tweets(model, text, char_to_idx, idx_to_char, verbose=verbose)

Tweet no. 001
Generating with seed:
 enterprise
fred hollow inured hope gene
________________________________________


  This is separate from the ipykernel package so we can avoid doing imports until


 enterprise
fred hollow inured hope genefit claim state week adest wear mask best post service contact supplay said walk case study bussia c

Tweet no. 002
Generating with seed:
 karl
would wear mask advertise job like
________________________________________
 karl
would wear mask advertise job like carrer assist world first train senior student story person thank doman story policy state research

Tweet no. 003
Generating with seed:
 couture pm speak listen
lockdown restri
________________________________________
 couture pm speak listen
lockdown restrict world blook complete lockdown week adest plan go research surveyan study wear mask story world fi

Tweet no. 004
Generating with seed:
 mask advertise job like murder wear mas
________________________________________
 mask advertise job like murder wear mask story world fire complete lockdown moneth support continue study coronaviru response stay street f

Tweet no. 005
Generating with seed:
 mass billion made big company want mone
_

### Evaluating the model

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import pairwise_distances

In [None]:
# Fit TF-IDF on the training windows, project the generated tweets into the
# same space, and report the mean cosine distance from each tweet to its
# nearest training window.
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(sentences)
Xval = vectorizer.transform(tweets)
nearest_distance = pairwise_distances(Xval, Y=tfidf, metric='cosine').min(axis=1)
print(nearest_distance.mean())

0.4433820981217025
