In [18]:
from __future__ import print_function
import tensorflow as tf
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import unidecode
import random
import sys
import io
import re
import os

In [2]:
# location of the raw lyrics corpus (UTF-8 text)
path = '../data/lyrics_rock.txt'

# read the whole file in one go and lowercase it immediately
with io.open(path, encoding='utf-8') as corpus_file:
    text = corpus_file.read().lower()

print('corpus length:', len(text))

corpus length: 557845755


In [3]:
# Normalise the corpus to a small ASCII vocabulary.

# transliterate non-ASCII characters to their closest ASCII form
text = unidecode.unidecode(text)
# NOTE(review): lower() is re-applied presumably because transliteration can
# emit uppercase letters for some characters — confirm against unidecode.
text = text.lower()
# drop the DEL control character
text = text.replace('\x7f', '')
# strip symbols that should not be part of the vocabulary
# (raw string: '\[' and '\/' are invalid escapes in a normal string literal)
text = re.sub(r'[$`_@*+~}{|=<>%&#\[\]^/]', '', text)
# strip backslashes
text = text.replace('\\', '')
# collapse runs of spaces LAST, so that gaps left behind by the removals and
# by the transliteration above are squeezed as well
text = re.sub(' +', ' ', text)

text[:2000]

" i could feel at the time. there was no way of knowing. fallen leaves in the night. who can say where they're blowing. as free as the wind. hopefully learning. why the sea on the tide. has no way of turning. more than this. you know there's nothing. more than this. tell me one thing. more than this. you know there's nothing. it was fun for a while. there was no way of knowing. like a dream in the night. who can say where we're going. no care in the world. maybe i'm learning. why the sea on the tide. has no way of turning. more than this. you know there's nothing. more than this. tell me one thing. more than this. you know there's nothing. more than this. you know there's nothing. more than this. tell me one thing. more than this. there's nothing.\n take me now, baby, here as i am. hold me close, and try and understand. desire is hunger is the fire i breathe. love is a banquet on which we feed. come on now, try and understand. the way i feel under your command. take my hand, as the sun

In [4]:
# the vocabulary: every distinct character of the corpus, in sorted order
chars = sorted(set(text))
print('total chars:', len(chars))
print(chars)

total chars: 49
['\n', ' ', '!', '"', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [5]:
# forward lookup: character -> integer id (its position in `chars`)
char_indices = {char: index for index, char in enumerate(chars)}
# reverse lookup: integer id -> character
indices_char = dict(enumerate(chars))

print(char_indices)
print(indices_char)

{'\n': 0, ' ': 1, '!': 2, '"': 3, "'": 4, '(': 5, ')': 6, ',': 7, '-': 8, '.': 9, '0': 10, '1': 11, '2': 12, '3': 13, '4': 14, '5': 15, '6': 16, '7': 17, '8': 18, '9': 19, ':': 20, ';': 21, '?': 22, 'a': 23, 'b': 24, 'c': 25, 'd': 26, 'e': 27, 'f': 28, 'g': 29, 'h': 30, 'i': 31, 'j': 32, 'k': 33, 'l': 34, 'm': 35, 'n': 36, 'o': 37, 'p': 38, 'q': 39, 'r': 40, 's': 41, 't': 42, 'u': 43, 'v': 44, 'w': 45, 'x': 46, 'y': 47, 'z': 48}
{0: '\n', 1: ' ', 2: '!', 3: '"', 4: "'", 5: '(', 6: ')', 7: ',', 8: '-', 9: '.', 10: '0', 11: '1', 12: '2', 13: '3', 14: '4', 15: '5', 16: '6', 17: '7', 18: '8', 19: '9', 20: ':', 21: ';', 22: '?', 23: 'a', 24: 'b', 25: 'c', 26: 'd', 27: 'e', 28: 'f', 29: 'g', 30: 'h', 31: 'i', 32: 'j', 33: 'k', 34: 'l', 35: 'm', 36: 'n', 37: 'o', 38: 'p', 39: 'q', 40: 'r', 41: 's', 42: 't', 43: 'u', 44: 'v', 45: 'w', 46: 'x', 47: 'y', 48: 'z'}


In [6]:
# Encode the whole corpus as vocabulary ids.
# np.fromiter streams the ids straight into a preallocated array instead of
# first materialising a Python list with one int object per character, and the
# explicit dtype keeps the ids int64 regardless of platform.
text_as_int = np.fromiter(
    (char_indices[c] for c in text),
    dtype=np.int64,
    count=len(text))

In [7]:
# tf.data.Dataset.from_tensor_slices turns the id vector into a dataset that
# yields one character id per element.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# preview: decode the first ten ids back to characters
for char_id in char_dataset.take(10):
    print(indices_char[char_id.numpy()])

 
i
 
c
o
u
l
d
 
f


In [8]:
# batch() groups the individual character ids into fixed-size chunks.

# maximum length, in characters, of a single input sequence
seq_length = 100

# every chunk holds seq_length + 1 ids; an incomplete trailing chunk is dropped
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

# preview: decode the first ten chunks
for item in sequences.take(10):
    decoded = ''.join(indices_char[e.numpy()] for e in item)
    print(repr(decoded))

# TODO: use windows = char_dataset.window(20, shift=3, drop_remainder=True) to create semi-redundant batches

' i could feel at the time. there was no way of knowing. fallen leaves in the night. who can say where'
" they're blowing. as free as the wind. hopefully learning. why the sea on the tide. has no way of tur"
"ning. more than this. you know there's nothing. more than this. tell me one thing. more than this. yo"
"u know there's nothing. it was fun for a while. there was no way of knowing. like a dream in the nigh"
"t. who can say where we're going. no care in the world. maybe i'm learning. why the sea on the tide. "
"has no way of turning. more than this. you know there's nothing. more than this. tell me one thing. m"
"ore than this. you know there's nothing. more than this. you know there's nothing. more than this. te"
"ll me one thing. more than this. there's nothing.\n take me now, baby, here as i am. hold me close, an"
'd try and understand. desire is hunger is the fire i breathe. love is a banquet on which we feed. com'
'e on now, try and understand. the way i feel under your comman

In [9]:
# each chunk is turned into an (input, target) pair: the target is the input
# shifted one position to the right; the function is applied with Dataset.map

def split_input_target(chunk):
    """Split a chunk into its input and its one-step-shifted target.

    Args:
        chunk: any sliceable sequence.

    Returns:
        A tuple ``(input_text, target_text)`` where ``input_text`` is the chunk
        without its last element and ``target_text`` is the chunk without its
        first element.
    """
    return chunk[:-1], chunk[1:]

# apply the split to every chunk
dataset = sequences.map(split_input_target)

# preview: for the first five pairs print the decoded input, then its target
for input_example, target_example in dataset.take(5):
    for example in (input_example, target_example):
        print(repr(''.join(indices_char[e.numpy()] for e in example)))

' i could feel at the time. there was no way of knowing. fallen leaves in the night. who can say wher'
'i could feel at the time. there was no way of knowing. fallen leaves in the night. who can say where'
" they're blowing. as free as the wind. hopefully learning. why the sea on the tide. has no way of tu"
"they're blowing. as free as the wind. hopefully learning. why the sea on the tide. has no way of tur"
"ning. more than this. you know there's nothing. more than this. tell me one thing. more than this. y"
"ing. more than this. you know there's nothing. more than this. tell me one thing. more than this. yo"
"u know there's nothing. it was fun for a while. there was no way of knowing. like a dream in the nig"
" know there's nothing. it was fun for a while. there was no way of knowing. like a dream in the nigh"
"t. who can say where we're going. no care in the world. maybe i'm learning. why the sea on the tide."
". who can say where we're going. no care in the world. maybe i'm learnin

In [10]:
# number of (input, target) pairs per training batch
BATCH_SIZE = 64

# buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements)
BUFFER_SIZE = 10000

# The pipeline is rebuilt from `sequences` rather than from `dataset` itself,
# so re-running this cell never shuffles/batches an already batched dataset.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

dataset

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

In [11]:
# hyperparameters of the recurrent character model

# vocabulary size: number of distinct characters in the corpus
vocab_size = len(chars)

# width of the character embedding vectors
embedding_dim = 256

# number of units in the recurrent layer
rnn_units = 1024

# TODO: best GRU or LSTM? improve model? --> LSTM
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the character model: Embedding -> GRU -> Dense.

    Args:
        vocab_size: size of the embedding input and of the final Dense output.
        embedding_dim: size of each embedding vector.
        rnn_units: number of units in the GRU layer.
        batch_size: fixed batch dimension of the model input; the sequence
            dimension is left as ``None``.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model. The last layer is a Dense
        layer without an activation argument, so it outputs ``vocab_size`` raw
        scores per time step.
    """
    embedding = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None])
    # the GRU returns an output for every time step and is created stateful
    recurrent = tf.keras.layers.GRU(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform')
    projection = tf.keras.layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding, recurrent, projection])

# TODO: check tf.keras.layers.Embedding

In [12]:
# instantiate the training model with the training batch size
training_model_options = dict(
    vocab_size=len(chars),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE,
)
model = build_model(**training_model_options)

In [13]:
# run one batch through the untrained model to check the output shape;
# the loop variables are kept around for the loss check further down
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    output_shape = example_batch_predictions.shape
    print(output_shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 49) # (batch_size, sequence_length, vocab_size)


In [14]:
# print the layer-by-layer overview (output shapes and parameter counts) of the training model
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           12544     
_________________________________________________________________
gru (GRU)                    (64, None, 1024)          3938304   
_________________________________________________________________
dense (Dense)                (64, None, 49)            50225     
Total params: 4,001,073
Trainable params: 4,001,073
Non-trainable params: 0
_________________________________________________________________


In [15]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw scores.

    Args:
        labels: integer class ids (here: the target character ids).
        logits: unnormalised model outputs; ``from_logits=True`` tells the loss
            that no softmax has been applied to them.

    Returns:
        Whatever ``tf.keras.losses.sparse_categorical_crossentropy`` returns
        for these arguments (not reduced to a scalar here).
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy
    return crossentropy(labels, logits, from_logits=True)

# loss of the untrained model on the example batch, averaged over all elements
example_batch_loss = loss(target_example_batch, example_batch_predictions)
mean_example_loss = example_batch_loss.numpy().mean()

print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", mean_example_loss)

Prediction shape:  (64, 100, 49)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       3.8926039


In [16]:
# configure training: Adam optimizer with the from-logits cross-entropy `loss` defined above
model.compile(optimizer='adam', loss=loss)

In [19]:
# checkpoints are written below this directory
checkpoint_dir = '../output/'
# checkpoint file name pattern; "{epoch}" is a placeholder left in the path
# handed to the callback
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# callback that saves only the model weights (not the full model)
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix, save_weights_only=True)

In [20]:
# number of passes over the training dataset
EPOCHS = 10

# train with the checkpoint callback attached; the returned History object is kept
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

# TODO: create classes

Train for 9371 steps
  11/9371 [..............................] - ETA: 9:25:00 - loss: 4.1113

KeyboardInterrupt: 

In [None]:
# generate text
# rebuild the model with batch size 1 and restore the weights from the latest checkpoint

# tf.train.latest_checkpoint returns None when the directory holds no
# checkpoint (e.g. training was interrupted before one was written). Check this
# BEFORE replacing `model`, so a failed restore leaves the trained model intact
# and the error message says what is actually wrong.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    raise FileNotFoundError(
        "no checkpoint found in {!r}; run the training cell until a "
        "checkpoint has been saved".format(checkpoint_dir))

model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)
model.load_weights(latest_checkpoint)
model.build(tf.TensorShape([1, None]))
model.summary()

In [None]:
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
    """Generate text with the trained model, one character at a time.

    Args:
        model: model that is called with a ``(1, length)`` tensor of character
            ids and returns per-step scores over the vocabulary — presumably
            the batch-size-1 model from ``build_model``; it must provide
            ``reset_states()``.
        start_string: non-empty seed text. It is lowercased for the vocabulary
            lookup only, so capital letters are accepted; the returned text
            starts with ``start_string`` exactly as given.
        num_generate: number of characters to generate (default 1000).
        temperature: positive scaling factor applied to the scores before
            sampling. Low temperatures give more predictable text, higher
            temperatures more surprising text (default 1.0).

    Returns:
        ``start_string`` followed by the ``num_generate`` generated characters.

    Raises:
        ValueError: if ``start_string`` is empty or ``temperature`` is not
            positive.
        KeyError: if ``start_string`` contains a character that is not in
            ``char_indices`` (after lowercasing).
    """
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # converting our start string to numbers (vectorizing); the vocabulary is
    # lowercase only, so the lookup is done on the lowercased seed
    input_eval = [char_indices[s] for s in start_string.lower()]
    # add the batch dimension -> shape (1, len(start_string))
    input_eval = tf.expand_dims(input_eval, 0)

    # generated characters, joined once at the end
    text_generated = []

    # here batch size == 1
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

        # sample the next character id from the categorical distribution given
        # by the temperature-scaled scores; [-1, 0] picks the sample drawn for
        # the last time step
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

        # the sampled character becomes the next (single-step) input
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(indices_char[predicted_id])

    return (start_string + ''.join(text_generated))

# NOTE: the hidden state is never passed explicitly. When `model` comes from
# build_model, its GRU layer is created with stateful=True, so the layer itself
# carries the state from one call to the next until reset_states() is called.

In [None]:
# TODO: accept capital letters in input
text_generated = generate_text(model, start_string="i miss you")

# A sentence start is a word that follows '.', '?' or '!' and at least one
# whitespace character. Group 1 keeps the punctuation and whitespace, group 2
# is the word. Using \s+ (instead of a one-character lookbehind) also covers
# words preceded by several whitespace characters, such as the ".\n " sequence
# visible in the corpus preview. The raw string avoids invalid escape sequences.
p = re.compile(r'([.?!]\s+)(\w+)')

def cap(match):
    """Return the matched text with the word after the punctuation capitalized."""
    return match.group(1) + match.group(2).capitalize()

# capitalize the very first character, then every sentence start
final_text = p.sub(cap, text_generated.capitalize())

In [None]:
# print() (rather than a bare expression) so the newlines in the generated text are rendered
print(final_text)