In [1]:
import os
import numpy as np
np.random.seed(420)
from IPython.display import HTML

from tqdm import tqdm

from HMM import unsupervised_HMM, from_hmm
from HMM_helper import (
    parse_seqs,
    parse_text,
    update_syll_map,
    sample_sentence,
    visualize_sparsities,
    rhyme_dict_gen
)
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Read the whole sonnet corpus into a single string.
with open(os.path.join(os.getcwd(), 'data/shakespeare.txt'), 'r') as f:
    text = f.read()

# Build syll_map0: word -> numpy array of the syllable counts listed for it.
# Each non-blank dictionary line is "<word> <count> [<count> ...]".
syll_map0 = {}
with open(os.path.join(os.getcwd(), 'data/Syllable_dictionary.txt'), 'r') as f:
    for line in f:
        fields = line.split()
        if not fields:
            # Skip blank lines (e.g. a trailing newline) rather than failing on fields[0].
            continue
        word = fields[0]
        # Counts written with an 'E' prefix are kept distinguishable by swapping
        # the 'E' for a leading '1' (so 'E1' becomes 11, i.e. the count plus 10);
        # per the original note this marks an end-of-line syllable count.
        sylls = np.array([int(s.replace('E', '1')) for s in fields[1:]])
        # Every word must appear only once in the dictionary.
        assert word not in syll_map0, 'duplicate word in syllable dictionary: %s' % word
        syll_map0[word] = sylls

In [3]:
# Split the corpus by sonnet; the pieces are joined below, so they are strings.
sonnets = parse_text(text, by='sonnet')
# Character vocabulary: every distinct character in the sonnets, in sorted order.
chars = sorted(set("".join(sonnets)))
# Lookup tables in both directions between characters and their integer ids.
char_to_int = {ch: idx for idx, ch in enumerate(chars)}
int_to_char = {idx: ch for idx, ch in enumerate(chars)}
num_chars = len(char_to_int)

# source: https://blog.usejournal.com/how-to-develop-a-character-based-neural-language-model-99c18de1d4d2
# organize into sequences of characters
def make_data(step, length = 40):
    """Cut every sonnet into overlapping character windows for next-character prediction.

    For each sonnet in the module-level ``sonnets``, a window of ``length + 1``
    characters is taken ending at positions ``length, length + step, ...``.
    Characters are mapped to integers through the module-level ``char_to_int``.
    The number of windows is printed.

    Args:
        step: Stride, in characters, between the ends of consecutive windows.
        length: Number of input characters per window (the target is the
            character that immediately follows them).

    Returns:
        A pair ``(train_X, train_Y)``: ``train_X`` holds the first ``length``
        integer-encoded characters of each window, ``train_Y`` the final one.
    """
    # Each window carries `length` context characters plus the one to predict.
    windows = [
        sonnet[end - length:end + 1]
        for sonnet in sonnets
        for end in range(length, len(sonnet), step)
    ]
    print('Total Sequences: %d' % len(windows))

    # Integer-encode every window with the shared vocabulary mapping.
    encoded = np.array([[char_to_int[ch] for ch in window] for window in windows])

    # All but the last column are inputs; the last column is the target.
    return encoded[:, :-1], encoded[:, -1]

In [4]:
def generate_seq(model, seq_length, seed_text, n_chars, verbose = False):
    """Greedily extend ``seed_text`` one character at a time using ``model``.

    At every step the last ``seq_length`` characters of the running text are
    integer-encoded with the module-level ``char_to_int``, one-hot encoded,
    and fed to the model; the highest-scoring class is mapped back through
    ``int_to_char`` and appended.

    Args:
        model: Fitted Keras model whose ``predict`` returns one score per
            character class for a batch of one-hot encoded windows.
        seq_length: Number of trailing characters fed to the model per step.
        seed_text: Starting text; every character in the model's window must
            be a key of ``char_to_int``.
        n_chars: Number of characters to generate.
        verbose: When true, print the seed and the generated continuation.

    Returns:
        ``seed_text`` followed by the ``n_chars`` generated characters.

    Raises:
        KeyError: If a character in the current window is not in ``char_to_int``.
    """
    in_text = seed_text
    vocab_size = len(char_to_int)
    # generate a fixed number of characters
    for _ in tqdm(range(n_chars)):
        # Only the trailing seq_length characters reach the model, so encode
        # just that window instead of re-encoding the whole growing text.
        window = in_text[-seq_length:]
        encoded = np.array([[char_to_int[char] for char in window]])
        # one hot encode
        encoded = to_categorical(encoded, num_classes=vocab_size)
        # Sequential.predict_classes was removed in TensorFlow 2.6; taking the
        # argmax over predict() is the equivalent for a softmax output.
        scores = model.predict(encoded, verbose=0)
        pred = int(np.argmax(scores, axis=-1)[0])
        # reverse map integer to character
        in_text += int_to_char[pred]
    if verbose:
        # Split at the real seed length so the two lines stay accurate even
        # when the seed is not exactly seq_length characters long.
        print("Random seed: " + in_text[:len(seed_text)])
        print("Generated: " + in_text[len(seed_text):])
    return in_text

In [5]:
def train_rnn(step = 10, length = 40, epochs = 20):
    """Build and fit a single-layer LSTM that predicts the next character.

    Training windows come from ``make_data(step, length=length)``; inputs and
    targets are one-hot encoded over ``num_chars`` classes. The model summary
    is printed before training.

    Args:
        step: Stride between consecutive training windows.
        length: Number of input characters per window.
        epochs: Number of passes over the training data.

    Returns:
        The fitted Keras ``Sequential`` model.
    """
    train_X, train_Y = make_data(step, length = length)
    # to_categorical accepts a 2-D integer array directly and appends the class
    # axis, so no per-row Python loop is needed.
    ohe_X = to_categorical(train_X, num_classes = num_chars)
    ohe_Y = to_categorical(train_Y, num_classes = num_chars)

    model = Sequential()
    # Input is (timesteps, one-hot width), taken from the encoded data.
    model.add(LSTM(150, input_shape = (ohe_X.shape[1], ohe_X.shape[2])))
    model.add(Dense(num_chars, activation='softmax'))
    # summary() prints by itself and returns None; wrapping it in print()
    # emitted a stray "None" line after the table.
    model.summary()

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(ohe_X, ohe_Y, epochs = epochs)

    return model

In [6]:
# Fit the character model on 40-character windows taken every 3 characters, for 40 epochs.
model = train_rnn(step=3, length=40, epochs=40)

Total Sequences: 27921
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 150)               108000    
_________________________________________________________________
dense (Dense)                (None, 29)                4379      
Total params: 112,379
Trainable params: 112,379
Non-trainable params: 0
_________________________________________________________________
None
Train on 27921 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Ep

In [10]:
# Rebuild the training windows with the same step (3) and length (40) used to
# train `model`, then use a randomly chosen window as the generation seed.
train_X, train_Y = make_data(3, 40)
# np.random.randint excludes its upper bound, so pass len(train_X) itself;
# `len(train_X) - 1` made the last window impossible to select.
start = np.random.randint(0, len(train_X))
pattern = ''.join(int_to_char[value] for value in train_X[start])
# Trailing semicolon suppresses the notebook echo of the returned string.
generate_seq(model, 40, pattern, 40, verbose = True);

Total Sequences: 27921


100%|██████████████████████████████████████████████████████████████████████████████████| 40/40 [00:21<00:00,  1.90it/s]

Random seed: o remove o no it is an ever-fixed mark t
Generated: he sen my fair all thy hell of love mout



