In [None]:
#for google drive only
from google.colab import drive
drive.mount('/content/drive')
%cd ../content/drive/My\ Drive/Code/HaikuBot/notebooks/

# import nltk
# nltk.download('punkt')

Mounted at /content/drive
/content/drive/My Drive/Code/HaikuBot/notebooks


In [None]:
%load_ext autoreload
%autoreload 2
import sys, os
sys.path.append('../')
import utils
from utils.haiku_scrape import scrape_haiku, replace_all, detokenize, load_haiku, prepare_cl_data

import matplotlib.pyplot as plt
import numpy as np
from pickle import dump, load

#TF,nltk, scikit
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import ModelCheckpoint

from sklearn.model_selection import train_test_split
#from nltk import word_tokenize

# Character level Haiku RNN

Why both this model and the word-level model are "tough" tasks: haikus are very short, so the context (lookback) window has to be a fraction of a single haiku; otherwise it will "leak" into the neighbouring haikus of the dataset.

## Resources
- [Jason](https://machinelearningmastery.com/text-generation-lstm-recurrent-neural-networks-python-keras/)
- [Khanrad](https://github.com/KhanradCoder/LearnKeras/blob/master/4.RNNs/TextGeneration.ipynb)

In [None]:
# Load the haiku corpus, then keep only the first 60% of it
# to accommodate the RAM available on Colab.
h_set = load_haiku()
h_set = h_set[: int(0.6 * len(h_set))]

In [None]:
# Join every haiku into one long string. The replace() is a single pass that
# turns each non-overlapping pair of spaces into one space.
flattened = " ".join(h_set).replace('  ', ' ')

# Character vocabulary in sorted order; each character's id is its position.
chars = sorted(set(flattened))
char_to_id = dict(zip(chars, range(len(chars))))

n_chars, n_vocab = len(flattened), len(chars)
print(f'total # chars: {n_chars} \n # unique chars: {n_vocab}')

total # chars: 497483 
 # unique chars: 41


In [None]:
def prepare_cl_data(raw_text, n_chars, seq_length = 10):
    """Build sliding-window training data for the character-level RNN.

    Every window of ``seq_length`` consecutive characters becomes one input
    sample and the character right after it becomes the target. Windows that
    contain the end-of-haiku marker ``'$'`` anywhere except their last position
    are skipped, so a sample never continues past the end of a haiku.

    Relies on the notebook-level globals ``char_to_id`` (character -> id) and
    ``n_vocab`` (vocabulary size). Defining this function in the notebook
    shadows the ``prepare_cl_data`` imported from ``utils.haiku_scrape``.

    Args:
        raw_text: the flattened corpus.
        n_chars: number of characters of ``raw_text`` to use; values larger
            than ``len(raw_text)`` are clamped to the real length.
        seq_length: number of characters per input window.

    Returns:
        A tuple ``(X_pre, X, y)``:
        X_pre -- list of id sequences (not reshaped), kept for generation later.
        X     -- array of shape (samples, seq_length, 1), ids divided by n_vocab.
        y     -- one-hot targets with exactly ``n_vocab`` columns.
    """
    X_pre = [] # non reshaped id sequences
    y = []
    # Clamp so the target lookup raw_text[i + seq_length] can never run past
    # the end of the text, even if n_chars overstates its length.
    n_windows = max(min(n_chars, len(raw_text)) - seq_length, 0)
    for i in range(n_windows):
        window = raw_text[i : i+seq_length]
        # Skip windows that straddle a haiku boundary: '$' is only allowed as
        # the very last character of the window.
        if '$' in window and window.index('$') != len(window)-1:
            continue
        X_pre.append([char_to_id[c] for c in window])
        y.append(char_to_id[raw_text[i+seq_length]])
    train_size = len(X_pre)
    print(f'train size: {train_size}')

    # reshape X to [samples, time_steps, features]
    X = np.reshape(X_pre, (train_size, seq_length, 1))
    X = X / float(n_vocab) # normalize
    # Fix the one-hot width to the vocabulary size; without num_classes the
    # width is inferred from the largest target id actually present in y.
    y = to_categorical(y, num_classes=n_vocab)
    return X_pre, X, y

# Build the training tensors from the flattened corpus using 10-character windows.
X_pre, X, y = prepare_cl_data(raw_text=flattened, n_chars=n_chars, seq_length=10)

train size: 436633


In [None]:
# Sanity check: prepare_cl_data reshapes X to (samples, seq_length, 1).
X.shape

(436633, 10, 1)

In [None]:
# Two stacked 256-unit LSTMs with light dropout, then a small dense head that
# outputs a softmax over the one-hot target columns (y.shape[1]).
model = Sequential([
    # return_sequences=True so the second LSTM receives the full sequence
    LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True),
    Dropout(0.1),
    LSTM(256),
    Dropout(0.1),
    Dense(100, activation='relu'),
    Dense(y.shape[1], activation='softmax')
])

# summary() prints the table itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output.
model.summary()

Model: "sequential_17"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_49 (LSTM)               (None, 10, 256)           264192    
_________________________________________________________________
dropout_37 (Dropout)         (None, 10, 256)           0         
_________________________________________________________________
lstm_50 (LSTM)               (None, 256)               525312    
_________________________________________________________________
dropout_38 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_22 (Dense)             (None, 100)               25700     
_________________________________________________________________
dense_23 (Dense)             (None, 41)                4141      
Total params: 819,345
Trainable params: 819,345
Non-trainable params: 0
_______________________________________________

In [None]:
# Train with Adam on categorical cross-entropy.
model.compile(optimizer='adam', loss='categorical_crossentropy')

# Checkpoint on the training loss (no validation data is passed to fit).
# With save_best_only=True a file is written only when the loss improves,
# and the epoch number and loss are embedded in the file name.
filepath = '../models/cl_checkpoints/optimized-{epoch:02d}-{loss:.4f}.hdf5'
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    save_best_only=True,
    verbose=True,
)
model.fit(X, y, batch_size=80, epochs=50, callbacks=[checkpoint])

Epoch 1/50

Epoch 00001: loss improved from inf to 2.38447, saving model to ../models/cl_checkpoints/optimized-01-2.3845.hdf5
Epoch 2/50

Epoch 00002: loss improved from 2.38447 to 2.03043, saving model to ../models/cl_checkpoints/optimized-02-2.0304.hdf5
Epoch 3/50

Epoch 00003: loss improved from 2.03043 to 1.88840, saving model to ../models/cl_checkpoints/optimized-03-1.8884.hdf5
Epoch 4/50

Epoch 00004: loss improved from 1.88840 to 1.80493, saving model to ../models/cl_checkpoints/optimized-04-1.8049.hdf5
Epoch 5/50

Epoch 00005: loss improved from 1.80493 to 1.74818, saving model to ../models/cl_checkpoints/optimized-05-1.7482.hdf5
Epoch 6/50

Epoch 00006: loss improved from 1.74818 to 1.70325, saving model to ../models/cl_checkpoints/optimized-06-1.7033.hdf5
Epoch 7/50

Epoch 00007: loss improved from 1.70325 to 1.67106, saving model to ../models/cl_checkpoints/optimized-07-1.6711.hdf5
Epoch 8/50

Epoch 00008: loss improved from 1.67106 to 1.63941, saving model to ../models/cl_c

<tensorflow.python.keras.callbacks.History at 0x7f373bebc050>

In [None]:
# Restore the weights from a saved checkpoint (file name encodes epoch 50,
# loss 1.3583) and re-compile with the same loss and optimizer as in training.
model.load_weights("../models/cl_checkpoints/optimized-50-1.3583.hdf5")
model.compile(optimizer='adam', loss='categorical_crossentropy')

# Inverse vocabulary lookup (id -> character) used to decode predictions.
id_to_char = dict(enumerate(chars))

In [None]:
def generate_sequences(model, Xpre, seed='AUTO', max_chars=100, stop_at_end=False):
    """Greedily generate text one character at a time from a seed.

    At every step the current window of character ids is normalised by
    ``n_vocab``, fed to ``model.predict``, and the highest-scoring character
    is appended to the output. The window then slides forward by one
    character, so its length stays equal to the seed length.

    Relies on the notebook-level globals ``char_to_id``, ``id_to_char`` and
    ``n_vocab``.

    Args:
        model: object whose ``predict(x, verbose=...)`` returns per-character
            scores for an input of shape (1, window_length, 1).
        Xpre: list of id sequences; only used when ``seed == 'AUTO'``, in
            which case one of them is picked at random as the seed. The
            chosen row is copied and never modified.
        seed: ``'AUTO'`` or a seed string; every character of a seed string
            must be a key of ``char_to_id``.
        max_chars: maximum number of characters to generate.
        stop_at_end: when True, stop right after the end-of-haiku marker
            ``'$'`` has been generated (the marker is kept in the output).

    Returns:
        The generated characters as a string; the seed itself is not included.
    """
    if seed == 'AUTO':
        # randint's upper bound is exclusive, so len(Xpre) makes every row
        # (including the last one) eligible.
        start = np.random.randint(0, len(Xpre))
        # Copy the row: the window is updated below and the caller's training
        # data must stay untouched.
        window = list(Xpre[start])
        print(f'Auto seed: {"".join([id_to_char[v] for v in window])}')
    else:
        window = [char_to_id[c] for c in seed]

    out = ""
    for _ in range(max_chars):
        x = np.reshape(window, (1, len(window), 1))
        x = x/float(n_vocab)  # same normalisation as the training inputs
        prediction = model.predict(x, verbose=False)
        index = int(np.argmax(prediction))
        result = id_to_char[index]
        out += result
        # Slide the window: drop the oldest id, append the newly predicted one.
        window = window[1:] + [index]
        if stop_at_end and result == '$':
            break
    return out

In [None]:
# Generate from a hand-written seed; the returned string is the model's continuation only (the seed is not included).
generate_sequences(model, Xpre=X_pre, seed="this is a stanza / about somethin")

"g / movem i fall rowring all oakes meak / wornd is alome / i can't iosende / it alr sast instinn is "

`"The stars"` keeps reappearing (the output oscillates) when the input sequence is short, possibly because the phrase occurs in the training set and the model has overfit to it.

In [None]:
# List every training haiku that contains the phrase "the stars".
for h in h_set:
    if 'the stars' not in h:
        continue
    print(h)

i can float away / into the deep dark abyss / welcomed by the stars $
the shine of the moon / and the glimmer of the stars / damn mortality $
the stars are smeared red /  the old ones arrive /  our very last sunrise $
in limpid silver / she shines bright amongst the stars / down here a dog barks $
the cityscape shifts /  when i remove my glasses /  the stars stay the same $
i'd bleed myself dry / and rename the stars for you / just to kiss you once $
something like a dream /  lost in the stars of your eyes /  that brought be back to older days $
the stars are abode  / to sky deities aloft / pure imagination $
crackling fires glowing / embers rise to meet the stars / tin cups clink with joy $
sleeping mountain dreams /  cloaked by the whispering rain /  waiting for the stars $
the stars have aligned / the great heroes have returned / they are twilight force $
a conversation / we had between the daisies / what do the stars mean $
the bright sun sets far / further than the stars and skies