In [103]:
from abc import ABC

import tensorflow as tf

import os
import pandas as pd

In [104]:
# Field separator passed to pd.read_csv when loading dataset.csv.
DELIMITER = '|'
# Characters per training input; sequences are cut to seq_length + 1 so that
# input and target can each be seq_length long.
seq_length = 100

# Training pipeline settings: examples per batch, shuffle buffer size, and
# number of passes over the dataset given to model.fit.
BATCH_SIZE = 64
BUFFER_SIZE = 10_000
EPOCHS = 10

In [105]:
# Load the dataset and shuffle the rows (sample(frac=1) returns every row in
# random order). NOTE(review): no random_state is set, so the order - and
# everything derived from it - differs between runs.
dataset_df = pd.read_csv('dataset.csv', dtype=str, delimiter=DELIMITER).sample(frac=1)

# Keep only rows whose "lyrics" cell is an actual string (missing values are
# not str instances), lower-cased. Iterating the column directly avoids
# iterrows(), which builds a full Series for every row.
lyrics_list = [
    text.lower()
    for text in dataset_df["lyrics"]
    if isinstance(text, str)
]

In [106]:
# Merge all songs into one corpus string, separated by a single space.
lyrics = ' '.join(lyrics_list)
# The vocabulary is the sorted list of distinct characters in the corpus.
vocab = sorted(set(lyrics))
print(f'{len(vocab)} unique characters')

# Forward lookup layer: character -> integer id (no mask token reserved).
ids_from_chars = tf.keras.layers.StringLookup(vocabulary=vocab, mask_token=None)
# Inverse lookup layer: integer id -> character. It is built from the forward
# layer's own vocabulary so both layers agree on the id assignment.
chars_from_ids = tf.keras.layers.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(),
    mask_token=None,
    invert=True)

def text_from_ids(_ids):
    """Look up the character for each id and join them along the last axis."""
    chars = chars_from_ids(_ids)
    return tf.strings.reduce_join(chars, axis=-1)

65 unique characters


In [107]:
# Split the corpus into characters and map each one to its integer id.
all_ids = ids_from_chars(tf.strings.unicode_split(lyrics, 'UTF-8'))

# The ids are narrowed to int8 below to keep the corpus tensor small. tf.cast
# does not raise on out-of-range values, so fail loudly if the vocabulary ever
# grows past what int8 can represent instead of silently corrupting the data.
max_token_id = len(ids_from_chars.get_vocabulary()) - 1
if max_token_id > tf.int8.max:
    raise ValueError(
        f'vocabulary has ids up to {max_token_id}, which does not fit in int8; '
        'use a wider integer dtype for all_ids')
all_ids = tf.cast(all_ids, dtype=tf.int8)
print(all_ids)

ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

# Cut the id stream into chunks of seq_length + 1 so each chunk can later be
# split into an input and a target of seq_length ids; the trailing partial
# chunk is dropped.
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)
for seq in sequences.take(1):
    print(chars_from_ids(seq))

tf.Tensor([41 40 40 ... 39 44 40], shape=(12945659,), dtype=int8)
tf.Tensor(
[b'f' b'e' b'e' b'l' b's' b' ' b'l' b'i' b'k' b'e' b' ' b't' b'i' b'm'
 b'e' b"'" b's' b' ' b'm' b'o' b'v' b'i' b'n' b'g' b' ' b'i' b'n' b' '
 b's' b'l' b'o' b'w' b' ' b'm' b'o' b't' b'i' b'o' b'n' b'\n' b'j' b'u'
 b's' b't' b' ' b't' b'r' b'y' b'i' b'n' b'g' b' ' b't' b'o' b' ' b'o'
 b'c' b'c' b'u' b'p' b'y' b' ' b'm' b'y' b' ' b'm' b'i' b'n' b'd' b'\n'
 b's' b'o' b' ' b't' b'h' b'a' b't' b' ' b'i' b' ' b'd' b'o' b'n' b"'"
 b't' b'g' b'o' b'l' b'o' b'o' b'n' b'e' b'y' b' ' b'o' b'v' b'e' b'r'
 b' ' b'y' b'o'], shape=(101,), dtype=string)


In [108]:
# Preview the first five training chunks as decoded text.
for seq in sequences.take(5):
    decoded = text_from_ids(seq)
    print(decoded.numpy())

b"feels like time's moving in slow motion\njust trying to occupy my mind\nso that i don'tgolooney over yo"
b'u\njust trying to amplify the sound\ntodrown out all of this need for you\nbiting my nails, got me nervo'
b"us, so anxious\nsee it's one o'clock now\nnoon felt like three hours ago\n\ni just wanna know your e.t.a."
b", e.t.a.\nout the window, got me looking out the street\nwhat's your e.t.a.?\ndistance only made us grow"
b" fonder\nof one another\nbe honest, what's your e.t.a.?\nwhat's your e.t.a.?\nsay you almost right here n"


In [109]:
def split_input_target(sequence: "tf.Tensor") -> "tuple[tf.Tensor, tf.Tensor]":
    """Split one chunk into an (input, target) pair offset by one position.

    The input is every element except the last; the target is every element
    except the first, so target[i] is the element that follows input[i].

    Args:
        sequence: A sliceable sequence. In this notebook it is applied via
            `sequences.map(...)`, where each element is a 1-D tensor of
            character ids (not a list of strings).

    Returns:
        A tuple `(input, target)`, each one element shorter than `sequence`.
    """
    return sequence[:-1], sequence[1:]

In [110]:
# Training pipeline: turn each chunk into an (input, target) pair, shuffle
# within a BUFFER_SIZE-element buffer, group into fixed-size batches (the last
# partial batch is dropped), and prefetch so input preparation overlaps
# training. tf.data.AUTOTUNE replaces the legacy tf.data.experimental alias.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.AUTOTUNE))

dataset

<PrefetchDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int8, name=None), TensorSpec(shape=(64, 100), dtype=tf.int8, name=None))>

In [111]:
class MyModel(tf.keras.Model):
  """Character-level language model: Embedding -> LSTM -> Dense.

  Args:
    vocab_size: Number of distinct token ids; sizes both the embedding table
      and the output layer.
    embedding_dim: Size of each embedding vector.
    rnn_units: Number of units in the LSTM layer.
  """

  def __init__(self, vocab_size, embedding_dim, rnn_units):
    # A subclassed Keras model takes no positional constructor arguments.
    # The previous `super().__init__(self)` passed the instance a second time
    # as a stray positional argument, which is never intended and is rejected
    # by newer Keras releases.
    super().__init__()
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    # return_sequences: emit an output for every time step (needed to predict
    # the next character at each position). return_state: also hand back the
    # final LSTM states so generation can continue from them.
    self.lstm = tf.keras.layers.LSTM(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    # No activation: the layer outputs one raw score (logit) per vocabulary
    # entry, matching the from_logits=True loss used for training.
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Run the model on a batch of token ids.

    Args:
      inputs: Token ids to embed.
      states: Optional LSTM initial state; when None, the layer's default
        initial state for this input is used.
      return_state: If True, also return the LSTM's final states.
      training: Forwarded to each layer.

    Returns:
      The per-step logits, or `(logits, (state1, state2))` when
      `return_state` is True.
    """
    x = inputs
    x = self.embedding(x, training=training)
    if states is None:
      states = self.lstm.get_initial_state(x)
    x, state1, state2  = self.lstm(x, initial_state=states, training=training)
    states = (state1, state2)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else:
      return x

In [112]:
# Model hyperparameters.
# vocab_size comes from the StringLookup layer's vocabulary, so it also counts
# any entry the layer adds on top of the raw character set.
vocab_size = len(ids_from_chars.get_vocabulary())
embedding_dim = 256   # size of each character embedding vector
rnn_units = 1024      # number of LSTM units

model = MyModel(vocab_size=vocab_size, embedding_dim=embedding_dim, rnn_units=rnn_units)

In [113]:
# The model's Dense head emits raw logits and the targets are integer ids,
# hence sparse categorical cross-entropy with from_logits=True.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(loss=loss, optimizer='adam')

In [114]:
# Checkpoints are written under this folder; the "{epoch}" placeholder in the
# file name is filled in by the callback.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Persist only the model weights, not the full model.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    save_weights_only=True,
    filepath=checkpoint_prefix)

In [None]:
# Train for EPOCHS passes over the dataset, checkpointing through the callback.
history = model.fit(
    dataset,
    epochs=EPOCHS,
    callbacks=[checkpoint_callback],
)

In [116]:
class OneStep(tf.keras.Model, ABC):
  """Wraps a trained model to sample one next character per call.

  NOTE(review): ABC is listed as a base although this class defines no
  abstract methods and is instantiated directly.

  Args:
    model: The trained model; called with `inputs`, `states` and
      `return_state=True`, and expected to return `(logits, states)`.
    chars_from_ids: Lookup layer mapping token ids back to characters.
    ids_from_chars: Lookup layer mapping characters to token ids.
    temperature: Positive divisor applied to the logits before sampling.
      Values below 1 sharpen the distribution, values above 1 flatten it.

  Raises:
    ValueError: If `temperature` is not strictly positive.
  """

  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    super().__init__()
    # The logits are divided by the temperature, so zero (division by zero)
    # or a negative value (inverted ranking) would make sampling meaningless.
    if temperature <= 0:
      raise ValueError(f'temperature must be positive, got {temperature!r}')
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Create a mask to prevent "[UNK]" from being generated.
    skip_ids = self.ids_from_chars(['[UNK]'])[:, None]
    sparse_mask = tf.SparseTensor(
        # Put a -inf at each bad index.
        values=[-float('inf')]*len(skip_ids),
        indices=skip_ids,
        # Match the shape to the vocabulary
        dense_shape=[len(ids_from_chars.get_vocabulary())])
    # Dense vector: -inf at the "[UNK]" id, 0 everywhere else, so adding it
    # to the logits only rules out "[UNK]".
    self.prediction_mask = tf.sparse.to_dense(sparse_mask)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    """Sample the next character for each input string.

    Args:
      inputs: String tensor holding the text to feed to the model.
      states: Model state from the previous call, or None to start fresh.

    Returns:
      `(predicted_chars, states)`: the sampled next character for each input
      and the model state to pass to the next call.
    """
    # Convert strings to token IDs.
    input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
    input_ids = self.ids_from_chars(input_chars).to_tensor()

    # Run the model.
    # predicted_logits.shape is [batch, char, next_char_logits]
    predicted_logits, states = self.model(inputs=input_ids, states=states,
                                          return_state=True)
    # Only use the last prediction.
    predicted_logits = predicted_logits[:, -1, :]
    predicted_logits = predicted_logits/self.temperature
    # Apply the prediction mask: prevent "[UNK]" from being generated.
    predicted_logits = predicted_logits + self.prediction_mask

    # Sample the output logits to generate token IDs.
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)

    # Convert from token ids to characters
    predicted_chars = self.chars_from_ids(predicted_ids)

    # Return the characters and model state.
    return predicted_chars, states

In [117]:
# Sampler around the trained model; temperature=0.69 is the divisor applied
# to the logits before sampling.
one_step_model = OneStep(
    model=model,
    chars_from_ids=chars_from_ids,
    ids_from_chars=ids_from_chars,
    temperature=0.69,
)

In [118]:
# Seed text the model continues from (swap in input(...) for interactive use).
seed_text = "i am capable of "
num_generated_chars = 500

# An empty seed gives the model nothing to condition on, so skip generation.
if seed_text:
    states = None
    next_char = tf.constant([seed_text])
    result = [next_char]

    for _ in range(num_generated_chars):
        # Feed only the newest output back in. The carried `states` already
        # encode everything processed so far, so re-sending the whole
        # accumulated string would make the LSTM re-read old characters on
        # top of a state that has already seen them (and cost quadratic work).
        next_char, states = one_step_model.generate_one_step(next_char, states=states)
        result.append(next_char)

    generated_text = tf.strings.join(result)
    print(generated_text.numpy()[0].decode())


i am capable of light
making bad girls are bright
come on with me on your body
work the chanel to my body
how many nights i get the chance?
like a freak shit, and i still look like my nigga got a nigga go
what it feel a tool
he call me cream, we sharpened by him
do you like it when you say?
'cause i'm not the one for you
you don't want to say that i am everything
you know i want ya
i want you here right now
you want me but i won't be sad when you're with my mum

if this is all i want
i'll live my life of you
i'
