In [1]:
# Set up IPython to show all outputs from a cell
import warnings
import numpy as np
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = 'all'

warnings.filterwarnings('ignore', category=RuntimeWarning)

RANDOM_STATE = 50
EPOCHS = 150
BATCH_SIZE = 2048
TRAINING_LENGTH = 15
TRAIN_FRACTION = 0.7
LSTM_CELLS = 64
VERBOSE = 0
SAVE_MODEL = True



from utils_2 import format_text, remove_spaces, make_sequences, create_train_valid

Using TensorFlow backend.


In [2]:
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Dropout, Embedding, Masking, Bidirectional
from keras.optimizers import Adam

from keras.utils import plot_model

def make_word_level_model(num_words,
                          embedding_matrix,
                          lstm_cells=64,
                          trainable=False,
                          lstm_layers=1,
                          bi_direc=False):
    """Make a word level recurrent neural network with option for pretrained embeddings
       and varying numbers of LSTM cell layers."""

    model = Sequential()

    # Map words to an embedding
    if not trainable:
        model.add(
            Embedding(
                input_dim=num_words,
                output_dim=embedding_matrix.shape[1],
                weights=[embedding_matrix],
                trainable=False,
                mask_zero=True))
        model.add(Masking())
    else:
        model.add(
            Embedding(
                input_dim=num_words,
                output_dim=embedding_matrix.shape[1],
                weights=[embedding_matrix],
                trainable=True))

    # If want to add multiple LSTM layers
    if lstm_layers > 1:
        for i in range(lstm_layers - 1):
            model.add(
                LSTM(
                    lstm_cells,
                    return_sequences=True,
                    dropout=0.1,
                    recurrent_dropout=0.1))

    # Add final LSTM cell layer
    if bi_direc:
        model.add(
            Bidirectional(
                LSTM(
                    lstm_cells,
                    return_sequences=False,
                    dropout=0.1,
                    recurrent_dropout=0.1)))
    else:
        model.add(
            LSTM(
                lstm_cells,
                return_sequences=False,
                dropout=0.1,
                recurrent_dropout=0.1))
    model.add(Dense(128, activation='relu'))
    # Dropout for regularization
    model.add(Dropout(0.5))

    # Output layer
    model.add(Dense(num_words, activation='softmax'))

    # Compile the model
    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'])
    return model

In [3]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

def make_callbacks(model_name, save=SAVE_MODEL):
    """Make list of callbacks for training"""

    callbacks = [EarlyStopping(monitor='val_loss', patience=40)]
        #print("callback")
    if save:
          callbacks.append(
             ModelCheckpoint('{}{}.h5'.format(model_dir,model_name),
                save_best_only=True,
                save_weights_only=False))
    return callbacks

In [4]:
def find_answer(index):
    """Find label corresponding to features for index in training data"""

    # Find features and label
    feats = ' '.join(idx_word[i] for i in features[index])
    answer = idx_word[labels[index]]

    print('Features:', feats)
    print('\nLabel: ', answer)
    
#find_answer(100000)

In [5]:
import os
formatted = []
titles=[]
print("reading data from: ")
for file in os.listdir("data"):
    if file.endswith(".txt"):
        titles.append(file[:-4])
        print(os.path.join("data", file))
        f = open(os.path.join("data", file), "r")
        raw=f.read()
        formatted.append(format_text(raw))
titles
len(formatted)
#formatted[0]

reading data from: 
data/dickinson.txt
data/shakespeare.txt
data/seuss.txt
data/frost.txt
data/whitman.txt


['dickinson', 'shakespeare', 'seuss', 'frost', 'whitman']

5

In [7]:
filters = '%[\\]^_`{|}~\t'
word_idx, idx_word, num_words, word_counts, abstracts, sequences, features, labels = make_sequences(formatted, TRAINING_LENGTH, lower=True)

from numpy import save
save('idx_word.npy', idx_word)
#word_idx

dictionary size: 
19113
trainingseqLength: 
38852


In [8]:
#import stanford pre-trained word association vectors
import os
from keras.utils import get_file
import numpy as np

glove_vectors = 'glove.6B.100d.txt'
glove = np.loadtxt(glove_vectors, dtype='str', comments=None)
#glove.shape


vectors = glove[:, 1:].astype('float')
words = glove[:, 0]

del glove

#vectors[100], words[100]

word_lookup = {word: vector for word, vector in zip(words, vectors)}

embedding_matrix = np.zeros((num_words, vectors.shape[1]))

not_found = 0

for i in range(1,len(idx_word)):
    # Look up the word embedding
    word=idx_word[i]
    vector = word_lookup.get(word, None)

    # Record in matrix
    if vector is not None:
        #print(word)
        embedding_matrix[i, :] = vector
    else:
        not_found += 1

print('There were {} words without pre-trained embeddings.'.format(not_found))


import gc
gc.enable()
del vectors
gc.collect()

# Normalize and convert nan to 0
embedding_matrix = embedding_matrix / \
    np.linalg.norm(embedding_matrix, axis=1).reshape((-1, 1))
embedding_matrix = np.nan_to_num(embedding_matrix)

def find_closest(query, embedding_matrix, word_idx, idx_word, n=10):
    """Find closest words to a query word in embeddings"""

    idx = word_idx.get(query, None)
    # Handle case where query is not in vocab
    if idx is None:
        print('{query} not found in vocab.'.format(query=query))
        return
    else:
        vec = embedding_matrix[idx]
        # Handle case where word doesn't have an embedding
        if np.all(vec == 0):
            print('{query} has no pre-trained embedding.'.format(query=query))
            return
        else:
            # Calculate distance between vector and all others
            dists = np.dot(embedding_matrix, vec)

            # Sort indexes in reverse order
            idxs = np.argsort(dists)[::-1][:n]
            sorted_dists = dists[idxs]
            closest = [idx_word[i] for i in idxs]

    print('Query: {query}\n'.format(query=query))
    max_len = max([len(i) for i in closest])
    # Print out the word and cosine distances
    for word, dist in zip(closest, sorted_dists):
        print('Word: {wor} Cosine Similarity: {sec}'.format(wor=word[:15], sec=round(dist, 4)))
        
#find_closest('swamp', embedding_matrix, word_idx, idx_word)

There were 4294 words without pre-trained embeddings.


0

In [9]:
for i in range(0,len(titles)):
    TRAINING_MODEL_FOR_AUTHOR = i

    print("generating training data for " + titles[TRAINING_MODEL_FOR_AUTHOR])
    #word_idx, idx_word, num_words, word_counts, features, labels = make_sequences(
    #    formatted, TRAINING_LENGTH, lower=True)
    filters = '%[\\]^_`{|}~\t'
    word_idx, idx_word, num_words, word_counts, abstracts, sequences, features, labels = make_sequences(formatted, TRAINING_LENGTH, lower=True, target=TRAINING_MODEL_FOR_AUTHOR)


    X_train, X_valid, y_train, y_valid = create_train_valid(
        features, labels, num_words)
    X_train.shape
    y_train.shape
    import sys
    def check_sizes(gb_min=1):
        for x in globals():
            size = sys.getsizeof(eval(x)) / 1e9
            if size > gb_min:
                print('Object:', x)
                print('\tSize (GB): ',size)
    check_sizes(gb_min=.05)




    model = make_word_level_model(
        num_words,
        embedding_matrix,
        lstm_cells=LSTM_CELLS,
        trainable=True,
        bi_direc=False,
        lstm_layers=1)
    model.summary()


    model_name = titles[TRAINING_MODEL_FOR_AUTHOR]
    model_dir = 'models/'

    callbacks = make_callbacks(model_name)
    model.compile(
        optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

    history = model.fit(
        X_train,
        y_train,
        batch_size=2048,
        verbose=1,
        epochs=1000,
        callbacks=callbacks,
        validation_data=(X_valid, y_valid))

generating training data for dickinson
dictionary size: 
19113
trainingseqLength: 
38852


(27196, 15)

(27196, 19113)

Object: y_valid
	Size (GB):  0.22278124
Object: y_train
	Size (GB):  0.51979726
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 100)         1911300   
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                42240     
_________________________________________________________________
dense_1 (Dense)              (None, 128)               8320      
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 19113)             2465577   
Total params: 4,427,437
Trainable params: 4,427,437
Non-trainable params: 0
_________________________________________________________________


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 27196 samples, validate on 11656 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
generating training data for shakespeare
dictionary size: 
19113
trainingseqLength: 
22366


(15656, 15)

(15656, 19113)

Object: y_valid
	Size (GB):  0.128248342
Object: y_train
	Size (GB):  0.29923324
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 100)         1911300   
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                42240     
_________________________________________________________________
dense_3 (Dense)              (None, 128)               8320      
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 19113)             2465577   
Total params: 4,427,437
Trainable params: 4,427,437
Non-trainable params: 0
_________________________________________________________________


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 15656 samples, validate on 6710 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
generating training data for seuss
dictionary size: 
19113
trainingseqLength: 
21051


(14735, 15)

(14735, 19113)

Object: y_valid
	Size (GB):  0.12071782
Object: y_train
	Size (GB):  0.281630167
Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 100)         1911300   
_________________________________________________________________
lstm_3 (LSTM)                (None, 64)                42240     
_________________________________________________________________
dense_5 (Dense)              (None, 128)               8320      
_________________________________________________________________
dropout_3 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 19113)             2465577   
Total params: 4,427,437
Trainable params: 4,427,437
Non-trainable params: 0
_________________________________________________________________


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 14735 samples, validate on 6316 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
generating training data for frost
dictionary size

(20238, 15)

(20238, 19113)

Object: y_valid
	Size (GB):  0.165786274
Object: y_train
	Size (GB):  0.386809006
Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, None, 100)         1911300   
_________________________________________________________________
lstm_4 (LSTM)                (None, 64)                42240     
_________________________________________________________________
dense_7 (Dense)              (None, 128)               8320      
_________________________________________________________________
dropout_4 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 19113)             2465577   
Total params: 4,427,437
Trainable params: 4,427,437
Non-trainable params: 0
_________________________________________________________________


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 20238 samples, validate on 8674 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
generating training data for whitman
dictionary size: 
19113
trainingseqLength: 
167005


(116903, 15)

(116903, 19113)

Object: y_valid
	Size (GB):  0.957599638
Object: y_train
	Size (GB):  2.234367151
Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, None, 100)         1911300   
_________________________________________________________________
lstm_5 (LSTM)                (None, 64)                42240     
_________________________________________________________________
dense_9 (Dense)              (None, 128)               8320      
_________________________________________________________________
dropout_5 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 19113)             2465577   
Total params: 4,427,437
Trainable params: 4,427,437
Non-trainable params: 0
_________________________________________________________________


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 116903 samples, validate on 50102 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000


In [37]:
def load_and_evaluate(model_name, return_model=False):
    """Load in a trained model and evaluate with log loss and accuracy"""

    model = load_model('{}{}.h5'.format(model_dir,model_name))
    r = model.evaluate(X_valid, y_valid, batch_size=2048, verbose=1)

    valid_crossentropy = r[0]
    valid_accuracy = r[1]

    print('Cross Entropy: {}'.format(round(valid_crossentropy, 4)))
    print('Accuracy: {}%'.format(round(100 * valid_accuracy, 2)))

    if return_model:
        return model

In [19]:
from keras.models import Sequential, load_model
model_name = 'testModel3'
model_dir=''
model = load_and_evaluate(model_name, return_model=True)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Cross Entropy: 5.8587
Accuracy: 22.26%


In [20]:
#np.random.seed(40)

# Number of all words
total_words = sum(word_counts.values())

topWords = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)[:15]
topWords
a = []
for i in topWords:  
    a.append(np.asarray(i))
for i in range(len(a)) :
    a[i][1]= 100* (topWords[i][1]/total_words);
a

# Compute frequency of each word in vocab
frequencies = [word_counts[word] / total_words for word in word_idx.keys()]
frequencies.insert(0, 0)

[(',', 18441),
 ('\n', 14409),
 ('the', 10461),
 ('and', 5546),
 ('of', 4458),
 ('i', 3002),
 ('to', 2290),
 ('.', 2025),
 ('in', 1946),
 ('you', 1680),
 ('a', 1390),
 ('with', 1300),
 ('!', 1294),
 ('is', 1196),
 ('all', 1100)]

[array([',', '11.01'], dtype='<U5'),
 array(['\n', '8.609'], dtype='<U5'),
 array(['the', '6.250'], dtype='<U5'),
 array(['and', '3.31'], dtype='<U4'),
 array(['of', '2.66'], dtype='<U4'),
 array(['i', '1.79'], dtype='<U4'),
 array(['to', '1.36'], dtype='<U4'),
 array(['.', '1.21'], dtype='<U4'),
 array(['in', '1.16'], dtype='<U4'),
 array(['you', '1.00'], dtype='<U4'),
 array(['a', '0.83'], dtype='<U4'),
 array(['with', '0.77'], dtype='<U4'),
 array(['!', '0.77'], dtype='<U4'),
 array(['is', '0.71'], dtype='<U4'),
 array(['all', '0.65'], dtype='<U4')]

In [21]:
from IPython.display import HTML


def header(text, color='black'):
    raw_html = '<h1 style="color: {};"><center>'.format(color) + \
        str(text) + '</center></h1>'
    return raw_html


def box(text):
    raw_html = '<div style="border:1px inset black;padding:1em;font-size: 20px;">' + \
        str(text)+'</div>'
    return raw_html


def addContent(old_html, raw_html):
    old_html += raw_html
    return old_html

In [31]:
import random


def generate_output(model,
                    sequences,
                    training_length=50,
                    new_words=50,
                    diversity=1,
                    return_output=False,
                    n_gen=1):
    """Generate `new_words` words of output from a trained model and format into HTML."""

    # Choose a random sequence
    seq = random.choice(sequences)

    # Choose a random starting point
    seed_idx = 3#random.randint(0, len(seq) - training_length - 1)
    # Ending index for seed
    end_idx = seed_idx + training_length

    gen_list = []

    for n in range(n_gen):
        # Extract the seed sequence
        seed = seq[seed_idx:end_idx]
        print(seq)
        original_sequence = [idx_word[i] for i in seed]
        generated = seed[:] + ['#']

        # Find the actual entire sequence
        actual = generated[:] + seq[end_idx:end_idx + new_words]

        # Keep adding new words
        for i in range(new_words):

            # Make a prediction from the seed
            preds = model.predict(np.array(seed).reshape(1, -1))[0].astype(
                np.float64)

            # Diversify
            preds = np.log(preds) / diversity
            exp_preds = np.exp(preds)

            # Softmax
            preds = exp_preds / sum(exp_preds)

            # Choose the next word
            probas = np.random.multinomial(1, preds, 1)[0]

            next_idx = np.argmax(probas)

            # New seed adds on old word
            seed = seed[1:] + [next_idx]
            generated.append(next_idx)

        # Showing generated and actual abstract
        n = []

        for i in generated:
            n.append(idx_word.get(i, '< --- >'))

        gen_list.append(n)

    a = []

    for i in actual:
        a.append(idx_word.get(i, '< --- >'))

    a = a[training_length:]

    gen_list = [
        gen[training_length:training_length + len(a)] for gen in gen_list
    ]

    if return_output:
        return original_sequence, gen_list, a

    # HTML formatting
    seed_html = ''
    seed_html = addContent(seed_html, header(
        'Seed Sequence', color='darkblue'))
    seed_html = addContent(seed_html,
                           box(remove_spaces(' '.join(original_sequence))))

    gen_html = ''
    gen_html = addContent(gen_html, header('RNN Generated', color='darkred'))
    gen_html = addContent(gen_html, box(remove_spaces(' '.join(gen_list[0]))))

    a_html = ''
    a_html = addContent(a_html, header('Actual', color='darkgreen'))
    a_html = addContent(a_html, box(remove_spaces(' '.join(a))))

    return seed_html, gen_html, a_html

In [32]:
seed_html, gen_html, a_html = generate_output(model, sequences, TRAINING_LENGTH,diversity=1)
HTML(seed_html)
HTML(gen_html)
HTML(a_html)

[3555, 4, 4681, 6, 179, 7, 3, 230, 326, 13, 2, 1589, 1, 247, 1, 3, 109, 134, 16, 1, 2, 3, 73, 681, 1426, 134, 16, 1, 1590, 4682, 6, 4683, 13, 2, 396, 6, 1179, 20, 2845, 1, 6, 84, 67, 2845, 1, 2, 396, 6, 4684, 49, 59, 1, 3556, 49, 59, 1, 447, 133, 1, 2, 265, 12, 3557, 2846, 1, 1427, 1, 4685, 2847, 1, 2, 169, 4, 397, 6, 600, 3, 230, 326, 8, 2, 3, 6976, 14, 1591, 1, 2, 6, 62, 20, 1284, 3, 2848, 77, 1285, 1, 2, 6, 88, 31, 28, 330, 120, 70, 31, 28, 1, 2, 6, 88, 31, 4686, 18, 85, 75, 1086, 7, 38, 8, 2, 190, 65, 6, 813, 19, 55, 645, 2849, 1, 2, 6, 813, 38, 1, 86, 4, 6977, 813, 38, 12, 16, 764, 6, 118, 1, 2, 6, 508, 23, 14, 3558, 18, 16, 7, 1286, 4687, 5, 38, 1, 2, 6, 67, 448, 12, 38, 1, 4, 6, 52, 449, 38, 9, 287, 8, 2, 10, 326, 6, 600, 4, 151, 177, 13, 6, 362, 10, 28, 20, 15, 21, 14, 65, 13, 2, 6, 362, 21, 213, 433, 14, 210, 65, 8, 2, 65, 14, 3, 2850, 601, 5, 4688, 1, 646, 4689, 17, 3559, 39, 2, 3, 323, 12, 41, 3560, 188, 1, 3, 2851, 1, 3, 6978, 1, 3, 2852, 363, 1, 28, 20, 2853, 1, 2, 3, 545,

In [48]:
def get_embeddings(model):
    embedding_layer = model.get_layer(index=0)
    embedding_matrix = embedding_layer.get_weights()[0]
    embedding_matrix = embedding_matrix / \
        np.linalg.norm(embedding_matrix, axis=1).reshape((-1, 1))
    embedding_matrix = np.nan_to_num(embedding_matrix)
    return embedding_matrix


embedding_matrix = get_embeddings(model)
embedding_matrix.shape

(14270, 100)

In [49]:
find_closest('a', embedding_matrix, word_idx, idx_word)

Query: a

Word: a Cosine Similarity: 1.0
Word: thy Cosine Similarity: 0.9610000252723694
Word: centenarian's Cosine Similarity: 0.9269999861717224
Word: his Cosine Similarity: 0.9204999804496765
Word: whose Cosine Similarity: 0.9182999730110168
Word: he-birds Cosine Similarity: 0.9180999994277954
Word: lesser Cosine Similarity: 0.9162999987602234
Word: magical Cosine Similarity: 0.9140999913215637
Word: surly Cosine Similarity: 0.9140999913215637
Word: soils Cosine Similarity: 0.9140999913215637
