In [21]:
%load_ext autoreload
%autoreload 2
import sys, os
sys.path.append('../')
import utils
from utils.haiku_scrape import scrape_haiku, replace_all, detokenize

import matplotlib.pyplot as plt
import numpy as np
from pickle import dump, load

#TF,nltk
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

from nltk import word_tokenize


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Haiku bot

### How to enforce stanza structure?
- 3 different rnns, each feeding into the next?

### Several approaches to enforce syllable rule:
1. Let the RNN learn on its own and see if it figures it out automatically 
    - Though syllables, as a latent variable, leave very few hints in written text; this might work better with audio data of haikus
2. Restrict the number of output units for each stanza to the max number of words/stanza in the data
    - Then in output, iterate until the syllable count is satisfied (`while count_syl(stanza) != 5: ...`)
        - This might not be so computationally expensive, depending on the number of possible syllables (check this)
3. Break data down into phoneme/syllable components and feed that through RNN

### Data cleaning
- Some post titles have a preface, amend this by taking all text after a colon?

### Embeddings
- transfer learning: compare pre-trained vs. trained embeddings
    - maybe find a poem embedding
- visualize with UMAP

### Architectures 
- consider a bidirectional RNN
- consider a train/val split with scikit-learn

# Resources
- https://medium.com/analytics-vidhya/a-comprehensive-guide-to-build-your-own-language-model-in-python-5141b3917d6d
- https://github.com/KhanradCoder/LearnKeras/blob/master/4.RNNs/TextGeneration.ipynb
- [Jason word based 1](https://machinelearningmastery.com/how-to-develop-a-word-level-neural-language-model-in-keras/)
- [Jason word based 2](https://machinelearningmastery.com/develop-word-based-neural-language-models-python-keras/)
- [return seq](https://machinelearningmastery.com/return-sequences-and-return-states-for-lstms-in-keras/)

In [2]:
# Collect the haiku corpus with the project scraper (imported from utils.haiku_scrape).
# The result is used as a list of "line / line / line" strings below.
h_set = scrape_haiku()
# Manually add one extra haiku; per the note, its source text had a trailing '/'
# (presumably the reason it is not already in the scraped set — TODO confirm).
h_set.append('Keep this in your mind. / Only after the clouds cry, / will the rainbow come') #good haiku but had trailing '/'
# Display how many haikus are in the corpus.
len(h_set)

828

In [3]:
# Peek at the first four entries to sanity-check the text format.
h_set[:4]

['Money obsession / Unhealthy neglect of life / Better to live now',
 "Hanging out with friends / It is always a good time / Don't forget the mask",
 'Each time the phone rings / I think Grandma and forget / You left in the spring',
 "When I learned Morse code / I couldn't get restful sleep / The rain kept talking"]

In [4]:
def prepare_data(haikus,n=3):
    """Turn a collection of haikus into a flat list of word n-gram strings.

    Every haiku is tokenized with ``word_tokenize``; a window of ``n``
    consecutive tokens slides over it one token at a time and each full
    window is joined back into text with ``detokenize``. Windows never
    span two haikus.

    n: n-gram size
    """
    n_grams = []
    for haiku in haikus:
        words = word_tokenize(haiku)
        for start in range(len(words)):
            window = words[start:start + n]
            # The first short window means the end of the haiku was reached.
            if len(window) != n:
                break
            n_grams.append(detokenize(window))
    return n_grams
train = prepare_data(h_set)

In [5]:
# Fit a word-level tokenizer on the n-grams. The `+ 1` leaves room for index 0,
# which the Keras Tokenizer never assigns to a word.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train)
vocab_size = len(tokenizer.word_index) + 1

# Integer-encode every n-gram and keep only those that map to exactly 3 tokens
# (removing entries with apostrophe for now).
sequences = np.array(
    [seq for seq in tokenizer.texts_to_sequences(train) if len(seq) == 3]
)

# Separate into input (all but the last token) and one-hot output (last token).
X = sequences[:, :-1]
y = to_categorical(sequences[:, -1], num_classes=vocab_size)
seq_length = X.shape[1]

In [None]:
# Scratch notes: character-level alternatives to the TF/Keras Tokenizer used above.
# NOTE(review): `processed_text` is not defined anywhere in the visible notebook, so
# this cell raises NameError on Restart & Run All — define it or remove the cell.
chars = sorted(list(set(processed_text)))
print('total chars:', len(chars))
# Forward (char -> index) and reverse (index -> char) lookup tables.
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))


# or

# NOTE(review): `data_new` is likewise undefined in the visible notebook.
chars = sorted(list(set(data_new)))
# Single char -> index table; read by encode_seq below as a global.
mapping = dict((c, i) for i, c in enumerate(chars))

def encode_seq(seq, char_to_index=None):
    """Integer-encode every line of ``seq``, one element (character) at a time.

    Parameters
    ----------
    seq : iterable of iterables
        Lines to encode; each element of a line is looked up individually.
    char_to_index : mapping, optional
        Lookup table from character to integer. When omitted, the
        module-level ``mapping`` is used, so existing ``encode_seq(seq)``
        calls behave exactly as before.

    Returns
    -------
    list of list
        One list of integer codes per input line, in input order.

    Raises
    ------
    KeyError
        If a line contains an element missing from the lookup table.
    NameError
        If ``char_to_index`` is omitted and no global ``mapping`` exists.
    """
    # Prefer the explicit table so the function does not rely on hidden notebook state.
    lookup = mapping if char_to_index is None else char_to_index
    return [[lookup[char] for char in line] for line in seq]

# encode the sequences
# NOTE(review): this rebinds `sequences`. If the cells run top to bottom, `sequences`
# is the integer n-gram array built in the tokenizer cell, not text — confirm the
# intended input before running this scratch cell.
sequences = encode_seq(sequences)

# Model

Sequential LSTM

In [161]:
# Word-level next-word model: `seq_length` word indices in, a probability
# distribution over the `vocab_size` vocabulary indices out.
model = Sequential([
    # Learn a 50-dimensional vector for every vocabulary index.
    Embedding(input_dim=vocab_size, output_dim=50, input_length=seq_length),
    # return_sequences=True hands the second LSTM one vector per timestep.
    LSTM(100,return_sequences=True),
    LSTM(100),
    Dense(100,activation = 'relu'),
    # One output unit per vocabulary index, normalized with softmax.
    Dense(vocab_size, activation = 'softmax')
])
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the cell output.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 2, 50)             130900    
_________________________________________________________________
lstm (LSTM)                  (None, 2, 100)            60400     
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense (Dense)                (None, 100)               10100     
_________________________________________________________________
dense_1 (Dense)              (None, 2618)              264418    
Total params: 546,218
Trainable params: 546,218
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# Train the next-word classifier on the (context, one-hot target) pairs.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, batch_size=128, epochs=75)

# Toggle: persist the trained model together with the tokenizer it was trained with.
save = 1
if save:
    model.save('../models/model.h5')
    # Context manager guarantees the pickle file is flushed and closed,
    # instead of leaving the handle open until garbage collection.
    with open('../models/tokenizer.pkl', 'wb') as tokenizer_file:
        dump(tokenizer, tokenizer_file)

# Text generation

In [23]:
# Reload the saved artifacts so generation can run without retraining.
# compile=False: the model is only used for prediction here.
model = load_model('../models/model.h5',compile=False)
# SECURITY: pickle can execute arbitrary code while loading — only unpickle
# tokenizer files written by this notebook or another trusted source.
# The context manager closes the file handle instead of leaking it.
with open('../models/tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = load(tokenizer_file)

In [33]:
# Reverse vocabulary (index -> word), kept at module level for any other cell that uses it.
index_word = {v:k for k,v in tokenizer.word_index.items()}
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    """Greedily generate up to ``n_words`` words that continue ``seed_text``.

    At every step the running text is integer-encoded with ``tokenizer``,
    cut/padded to the last ``seq_length`` tokens, and fed to ``model``; the
    highest-probability vocabulary index is decoded and appended to the text.

    Parameters
    ----------
    model : trained Keras model whose output is a distribution over vocabulary indices.
    tokenizer : fitted Keras ``Tokenizer`` (the one the model was trained with).
    seq_length : int
        Number of context tokens the model expects as input.
    seed_text : str
        Text used to start generation; it is not included in the result.
    n_words : int
        Maximum number of words to generate.

    Returns
    -------
    str
        The generated words joined by single spaces (without the seed).
        May contain fewer than ``n_words`` words if the model predicts an
        index that has no word (index 0 is the padding value).
    """
    # Decode with the tokenizer that was passed in rather than a notebook-level
    # global, so the function stays correct for any tokenizer/model pair.
    index_to_word = {idx: word for word, idx in tokenizer.word_index.items()}
    output = []
    text = seed_text
    for _ in range(n_words):
        encoded = tokenizer.texts_to_sequences([text])[0] # encode the text as integers
        # Keep only the most recent `seq_length` tokens (left-padding short inputs),
        # since the growing text is re-fed on every step.
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # `predict_classes` was removed from Keras (TF >= 2.6); argmax over the
        # softmax output is the documented replacement for multi-class models.
        probabilities = model.predict(encoded, verbose=0)
        predicted_index = int(np.argmax(probabilities, axis=-1)[0])
        next_word = index_to_word.get(predicted_index)
        if next_word is None:
            # No word for this index (e.g. padding index 0): the text would not
            # change, so every later step would predict the same thing — stop.
            break
        text += ' ' + next_word
        output.append(next_word)
    return ' '.join(output)
        
# Context size = words per training n-gram minus the target word
# (also defined as X.shape[1] above).
seq_length = len(train[0].split()) - 1
# Seed generation with a randomly chosen training n-gram.
seed_text = train[np.random.randint(len(train))]
print(seed_text + '\n')
generated = generate_seq(
    model,
    tokenizer,
    seq_length=seq_length,
    seed_text=seed_text,
    n_words=100,
)
print(generated)

I called my

legs upon the leaves ns died with harsh broken coming ns i am stuck at work ns i am stuck at work ns i am stuck at work ns i am stuck at work ns i am stuck at work ns i am stuck at work ns i am stuck at work ns i am stuck at work ns i am stuck at work ns i am stuck at work ns i am stuck at work ns i am stuck at work ns i am stuck at work ns i am stuck at work ns i am stuck at work


# Evaluation

Some weird oscillating output (I am generating well beyond haiku length here, but wanted to see what the model produces)

```
I called my

legs upon the leaves ns died with harsh broken coming ns i am stuck at work ns i am stuck at work ns i am stuck at work ns i am stuck at work ns i am stuck at work ns i am stuck at work ns i am stuck at work ns i am stuck at work ns i am stuck at work ns i am stuck at work ns i am stuck at work ns i am stuck at work ns i am stuck at work ns i am stuck at work ns i am stuck at work
```

- Need to figure out better tokens
- Hyperparams
    - see [this](https://stackoverflow.com/questions/56849552/lstm-getting-caught-up-in-loop)