In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import re
import os
import requests
import csv
import TextAnalyzer as ta
import json

In [None]:
#### ONLY USED FOR GOOGLE COLAB TO DETECT GPU #####
# %tensorflow_version 2.x
# import tensorflow as tf
# device_name = tf.test.gpu_device_name()
# if device_name != '/device:GPU:0':
#     raise SystemError('GPU device not found')
# print('Found GPU at: {}'.format(device_name))

In [3]:
# Load the phoneme-encoded haikus; the frame's single column is named 'text'.
haiku_csv_path = 'Haikus/PhonemeHaikusStructured.csv'
df = pd.read_csv(haiku_csv_path)
df.columns = ['text']

# Combine all haikus into one text object with the project's TextFormat helper
# (presumably a single concatenated string -- confirm in TextAnalyzer).
formatter = ta.TextFormat()
haiku_text = formatter.arraytotext(df['text'])

# Transform Data

In [4]:
# Collect the distinct characters of the corpus. Their sorted order fixes the
# integer id that each character receives in the encoding step.
vocab = sorted(set(haiku_text))
print(vocab)
print('Unique Characters: {}'.format(len(vocab)))

['\n', ' ', '0', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
Unique Characters: 29


In [5]:
# Two-way lookup between characters and integer ids:
#   char2idx: character -> id (its position in the sorted vocabulary)
#   idx2char: id -> character (NumPy array, so it can be indexed with id arrays)
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)

def encodehaikus(haiku):
    """Translate a haiku string into a NumPy array of character ids.

    Each character is looked up in the notebook-level ``char2idx`` table, so a
    character that is not in the vocabulary raises ``KeyError``.
    """
    return np.array([char2idx[ch] for ch in haiku])
# Encode every haiku; keep the result both as a Series and as a frame column.
encoded_haikus = df['text'].apply(encodehaikus)
df['encoded'] = encoded_haikus

# Preview the first 20 entries of the character -> id table.
preview_chars = list(char2idx)[:20]
print('{')
for char in preview_chars:
    print('  {:4s}: {:3d},'.format(repr(char), char2idx[char]))
print('  ...\n}')

{
  '\n':   0,
  ' ' :   1,
  '0' :   2,
  'A' :   3,
  'B' :   4,
  'C' :   5,
  'D' :   6,
  'E' :   7,
  'F' :   8,
  'G' :   9,
  'H' :  10,
  'I' :  11,
  'J' :  12,
  'K' :  13,
  'L' :  14,
  'M' :  15,
  'N' :  16,
  'O' :  17,
  'P' :  18,
  'Q' :  19,
  ...
}


In [6]:
# Show how the first haiku in the frame is mapped to integers
first_text = df.loc[0, 'text']
first_encoded = df.loc[0, 'encoded']
print('{}\n v ---- characters mapped to int ---- v \n{}'.format(repr(first_text), first_encoded))

'SKEY0N AH0KRAO0S NAO0RTHBAW0ND KAO0RAH0S FAO0R NUW0 GRAW0S MIH0SIH0NG ER0AW0ND DEY0 DHAH0 FAO0R \n'
 v ---- characters mapped to int ---- v 
[21 13  7 27  2 16  1  3 10  2 13 20  3 17  2 21  1 16  3 17  2 20 22 10
  4  3 25  2 16  6  1 13  3 17  2 20  3 10  2 21  1  8  3 17  2 20  1 16
 23 25  2  1  9 20  3 25  2 21  1 15 11 10  2 21 11 10  2 16  9  1  7 20
  2  3 25  2 16  6  1  6  7 27  2  1  6 10  3 10  2  1  8  3 17  2 20  1
  0]


# Create training examples and targets

In [7]:
# Pad sequences with 0s so they are all the same length

# Record the number of encoded characters per haiku and the longest haiku
df['length'] = df['encoded'].apply(len)
max_length = df['length'].max()

def getpadded(row, target_length=None):
    """Right-pad one row's encoded haiku with zeros up to a fixed length.

    Args:
        row: Mapping-like row (for example a DataFrame row passed by
            ``df.apply(..., axis=1)``) providing ``'encoded'`` (the array of
            character ids) and ``'length'`` (its number of elements).
        target_length: Length of the returned array. When omitted, the
            notebook-level ``max_length`` is used, so the existing
            ``df.apply(getpadded, axis=1)`` call keeps working unchanged.

    Returns:
        A new array holding the encoded ids followed by int32 zeros, with
        ``target_length`` elements in total.

    Raises:
        ValueError: If the row is longer than ``target_length``.
    """
    if target_length is None:
        target_length = max_length

    pad_count = target_length - row['length']
    if pad_count < 0:
        raise ValueError(
            'sequence of length {} is longer than target length {}'.format(
                row['length'], target_length))

    # Pad value 0 is an ordinary id in this notebook's vocabulary (the first
    # entry of the sorted vocab), so padding decodes as that character.
    zeros = np.zeros(pad_count, dtype=np.int32)
    return np.append(row['encoded'], zeros)

# Pad every row, then derive the next-character training pairs: the input
# drops the final position and the target drops the first one.
df['padded'] = df.apply(getpadded, axis=1)
df['input_text'] = df['padded'].apply(lambda seq: seq[:-1])
df['target_text'] = df['padded'].apply(lambda seq: seq[1:])
print(max_length)

128


In [8]:
# Same value as printed at the end of the previous cell.
print(max_length)

128


In [9]:
# Pair each input sequence with its shifted target in a tf.data pipeline.
input_sequences = df['input_text'].tolist()
target_sequences = df['target_text'].tolist()
dataset = tf.data.Dataset.from_tensor_slices((input_sequences, target_sequences))
dataset

<TensorSliceDataset shapes: ((127,), (127,)), types: (tf.int32, tf.int32)>

In [10]:
# Decode the first (input, target) pair back to characters as a sanity check.
for input_example, target_example in dataset.take(1):
    input_chars = idx2char[input_example.numpy()]
    target_chars = idx2char[target_example.numpy()]
    print ('Input data: ', repr(''.join(input_chars)))
    print ('Target data:', repr(''.join(target_chars)))

Input data:  'SKEY0N AH0KRAO0S NAO0RTHBAW0ND KAO0RAH0S FAO0R NUW0 GRAW0S MIH0SIH0NG ER0AW0ND DEY0 DHAH0 FAO0R \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n'
Target data: 'KEY0N AH0KRAO0S NAO0RTHBAW0ND KAO0RAH0S FAO0R NUW0 GRAW0S MIH0SIH0NG ER0AW0ND DEY0 DHAH0 FAO0R \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n'


In [11]:
# Walk through the first five time steps of the example pair.
first_steps = zip(input_example[:5], target_example[:5])
for step, (input_idx, target_idx) in enumerate(first_steps):
    print("Step {:4d}".format(step))
    print("  input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
    print("  expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))

Step    0
  input: 21 ('S')
  expected output: 13 ('K')
Step    1
  input: 13 ('K')
  expected output: 7 ('E')
Step    2
  input: 7 ('E')
  expected output: 27 ('Y')
Step    3
  input: 27 ('Y')
  expected output: 2 ('0')
Step    4
  input: 2 ('0')
  expected output: 16 ('N')


In [12]:
# Number of haikus per training batch (alternatives: 64, 128, 256)
BATCH_SIZE = 32

# Size of the shuffle buffer. tf.data is designed for possibly infinite
# sequences, so it shuffles elements inside a bounded buffer instead of
# shuffling the entire sequence in memory.
BUFFER_SIZE = 10000

# NOTE: this rebinds `dataset`, so re-running the cell would shuffle and batch
# the already-batched dataset a second time.
shuffled_dataset = dataset.shuffle(BUFFER_SIZE)
dataset = shuffled_dataset.batch(BATCH_SIZE, drop_remainder=True)

dataset

<BatchDataset shapes: ((32, 127), (32, 127)), types: (tf.int32, tf.int32)>

In [13]:
# Number of distinct characters; sizes both the embedding table and the
# output (Dense) layer of the model
vocab_size = len(vocab)

# Width of each character embedding vector
embedding_dim = 256

# Number of units in the recurrent (GRU) layer
rnn_units = 1024

# Build the Model

In [14]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the character-level network: Embedding -> GRU -> Dense.

    The GRU is stateful, so the batch size is fixed at construction time via
    the embedding layer's ``batch_input_shape``; the sequence length is left
    unspecified. The Dense layer has no activation, so the model returns one
    raw score (logit) per vocabulary entry at every time step.
    """
    embedding = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None],
    )
    recurrent = tf.keras.layers.GRU(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    projection = tf.keras.layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding, recurrent, projection])

In [15]:
# Training model: the stateful GRU is built for the batch size of `dataset`.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=BATCH_SIZE)

In [16]:
# Run one batch through the untrained model to check the output shape.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    shape_note = "# (batch_size, sequence_length, vocab_size)"
    print(example_batch_predictions.shape, shape_note)

(32, 127, 29) # (batch_size, sequence_length, vocab_size)


In [17]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (32, None, 256)           7424      
_________________________________________________________________
gru (GRU)                    (32, None, 1024)          3938304   
_________________________________________________________________
dense (Dense)                (32, None, 29)            29725     
Total params: 3,975,453
Trainable params: 3,975,453
Non-trainable params: 0
_________________________________________________________________


In [18]:
# Sample one character id per time step from the first sequence's logits.
first_sequence_logits = example_batch_predictions[0]
sampled_indices = tf.squeeze(
    tf.random.categorical(first_sequence_logits, num_samples=1),
    axis=-1,
).numpy()

In [19]:
# Compare the first input sequence with what the untrained model samples.
input_chars = "".join(idx2char[input_example_batch[0]])
predicted_chars = "".join(idx2char[sampled_indices])
print("Input: \n", repr(input_chars))
print()
print("Next Char Predictions: \n", repr(predicted_chars))

Input: 
 'DHAH0 DHAH0 YIH0R DHAH0S MAY0 YUW0NAH0VER0S SHEY0DIY0 WUH0D FRAH0NT DHAH0 AE0T FUH0L KAA0M DHAH0 \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n'

Next Char Predictions: 
 'LK\nUQGJVTMCY0\nBSADIXPYLFIOVKFYPZU BN0ASOGYCMN JB\nRUVIMJVRBRTTLMAJFQGNF\nSYQSIXKHAJXBM IMZGPQGLQYOFMPFQXGLHZRKWZQQWLAXM0EBZZFRPVF'


In [20]:
def loss(labels, logits):
    """Sparse categorical cross-entropy computed on raw logits.

    ``labels`` holds the integer target ids and ``logits`` the model's
    un-normalised outputs; ``from_logits=True`` tells Keras that no softmax
    has been applied to them yet.
    """
    per_step_loss = tf.keras.losses.sparse_categorical_crossentropy(
        labels,
        logits,
        from_logits=True,
    )
    return per_step_loss

# Loss of the untrained model on the example batch, averaged over all positions.
example_batch_loss = loss(target_example_batch, example_batch_predictions)
mean_loss = example_batch_loss.numpy().mean()
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", mean_loss)

Prediction shape:  (32, 127, 29)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       3.3743708


In [21]:
# Configure training: Adam optimizer with the logits-based `loss` function.
model.compile(optimizer='adam', loss=loss)

In [None]:
#### ONLY RUN ON GOOGLE COLAB TO UTILIZE GPU #####

# # Directory where the checkpoints will be saved
# checkpoint_dir = './training_checkpoints'
# # Name of the checkpoint files
# checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
#     filepath=checkpoint_prefix,
#     save_weights_only=True)
# EPOCHS=50
# history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])
# model.save('haiku_v3.h5') 
# from google.colab import drive
# drive.mount('/content/gdrive')
# model.save("/content/gdrive/My Drive/haiku_v3.h5")

In [23]:
# NOTE(review): `load_model` is not used anywhere in the visible notebook.
from tensorflow.keras.models import load_model

# Rebuild the same architecture with batch size 1: the stateful GRU has a fixed
# batch size, and text generation feeds one sequence at a time.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

# NOTE(review): the commented-out training cell saves 'haiku_v3.h5' while this
# cell loads 'haiku_v2.h5' -- confirm which weights file is intended.
model.load_weights('haiku_v2.h5')

model.build(tf.TensorShape([1, None]))

model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (1, None, 256)            7424      
_________________________________________________________________
gru_2 (GRU)                  (1, None, 1024)           3938304   
_________________________________________________________________
dense_2 (Dense)              (1, None, 29)             29725     
Total params: 3,975,453
Trainable params: 3,975,453
Non-trainable params: 0
_________________________________________________________________


In [24]:
def generate_text(model, start_string, num_generate=None, temperature=1.0):
    """Generate text character by character from a trained model.

    The seed is encoded with ``char2idx`` and fed to the model. The model is
    then called repeatedly on its own last sampled character, relying on the
    stateful recurrent layer to carry context between calls.

    Args:
        model: Stateful Keras model built with batch size 1 that returns
            per-character logits.
        start_string: Non-empty seed text; every character must be a key of
            ``char2idx`` (otherwise ``KeyError`` is raised).
        num_generate: Number of characters to sample after the seed. Defaults
            to the notebook-level ``max_length``.
        temperature: Positive scaling applied to the logits before sampling.
            Low values give more predictable text, high values more
            surprising text.

    Returns:
        ``start_string`` followed by exactly ``num_generate`` sampled
        characters; sampling does not stop early on any character.

    Raises:
        ValueError: If ``start_string`` is empty or ``temperature`` is not
            positive.
    """
    # Validate before touching the model so bad arguments fail fast and clearly.
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")
    if num_generate is None:
        num_generate = max_length

    # Converting our start string to numbers (vectorizing), with a leading
    # batch dimension of 1.
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    text_generated = []

    # Here batch size == 1. Clear any state left over from a previous call.
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # Remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

        # Sample the next character from the temperature-scaled logits of the
        # last time step.
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # Feed only the sampled character back in; the stateful layer keeps
        # the previous hidden state.
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)

In [25]:
# Load the two lookup dictionaries that PhonemeReverse is constructed from.
with open("idict.json", "r") as idict_file:
    idict = json.load(idict_file)

with open("pdict.json", "r") as pdict_file:
    pdict = json.load(pdict_file)

rtransformer = ta.PhonemeReverse(pdict, idict)

In [32]:
# Turn the English seed word into its phoneme representation
# (two chained calls for now; they are meant to be refactored into one)
seed_phonemes = rtransformer.transform('cloudy')
inputs = rtransformer.convertsyllables(seed_phonemes)
# Use the phoneme seed to generate a phoneme haiku
output = generate_text(model, inputs)
# Translate the generated phonemes back into English
rtransformer.getenglish(output, runs=2)

['cloudy',
 'blue',
 'oak',
 'destroys',
 'if',
 'wouldst',
 'tide',
 'born',
 'the',
 'my',
 'may',
 'we']

Although the model is able to generate English, the output is mostly incoherent. The ability to produce actual English words is also partly due to unknown words being substituted with words generated by BERT, not to the effectiveness of the RNN model. Results are also inconsistent, and the model didn't seem to learn that each output should be 17 syllables long. 

Next Steps:
* Use all types of poems, not just haikus, and split them into 17-syllable chunks in order to have a larger corpus.
* Explore using other text generation tools such as GPT-2. 
