In [1]:
import tensorflow as tf
import tensorflow_text as text

import re
import pandas as pd

In [None]:
# Column separator passed to pd.read_csv when loading dataset.csv.
DELIMITER = '|'
# Token ids per training example. Sequences are batched to seq_length+1 ids so
# that input and target can be the same window shifted by one position.
seq_length = 128

# Number of (input, target) pairs per training batch.
BATCH_SIZE = 128
# Size of the buffer used by Dataset.shuffle.
BUFFER_SIZE = 10000
# Number of epochs passed to model.fit.
EPOCHS = 10

In [None]:
# Read every column as a string and shuffle the rows.
# NOTE(review): the shuffle is unseeded, so row order differs on every run.
dataset_df = (
    pd.read_csv('dataset.csv', delimiter=DELIMITER, dtype=str)
    .sample(frac=1)
)

# Join all lyrics into one string (str.cat() is called without a separator),
# then collapse every run of three or more newlines into a single newline.
# NOTE(review): with no separator, the end of one row runs straight into the
# start of the next unless the lyrics themselves end in a newline — confirm
# this is intended.
lyrics = re.sub(r'\n{3,}', '\n', dataset_df['lyrics'].str.cat())

In [2]:
# Load the serialized SentencePiece model. The context manager closes the file
# handle as soon as the bytes are read instead of leaving it to the garbage
# collector.
# The bytes get their own name so that re-running this cell can never overwrite
# the Keras network, which is stored in the global `model`.
with tf.io.gfile.GFile('sp_model.model', 'rb') as sp_model_file:
    sp_model_proto = sp_model_file.read()

# Tokenizer emitting int32 ids.
tokenizer = text.SentencepieceTokenizer(model=sp_model_proto, out_type=tf.int32)

In [None]:
# Tokenize the full corpus and make sure the ids are held as an int32 tensor.
tokens = tokenizer.tokenize(lyrics)
all_ids = tf.convert_to_tensor(tokens, dtype=tf.int32)
print(all_ids)

# Stream the ids one by one, then group them into windows of seq_length + 1
# ids. The extra id lets each window be split into an input and a target
# shifted by one. An incomplete trailing window is dropped.
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)
sequences = ids_dataset.batch(
    batch_size=seq_length + 1,
    drop_remainder=True,
    name="batched_lyrics_as_ids",
)
sequences

In [None]:
# Sanity check on the first window of ids: print it once through
# tokenizer.id_to_string and once through tokenizer.detokenize.
for seq in sequences.take(1):
  print(tokenizer.id_to_string(seq))
  print(tokenizer.detokenize(seq))

In [None]:
def split_input_target(sequence: "tf.Tensor") -> "tuple[tf.Tensor, tf.Tensor]":
  """Split one window of token ids into a next-token training pair.

  The function is mapped over `sequences`, whose elements are 1-D int32
  tensors of token ids (not lists of strings), hence the tensor annotations.
  They are written as strings so the definition does not depend on `tf` being
  importable at definition time.

  Args:
    sequence: A sliceable sequence of token ids.

  Returns:
    `(input_ids, target_ids)`: `input_ids` is `sequence` without its last
    element and `target_ids` is `sequence` without its first element, so
    `target_ids[i]` is the id that follows `input_ids[i]`.
  """
  input_ids = sequence[:-1]
  target_ids = sequence[1:]
  return input_ids, target_ids

In [None]:
# Training pipeline, built one stage at a time:
#   1. turn each window of ids into an (input, target) pair,
#   2. shuffle with a buffer of BUFFER_SIZE elements,
#   3. group into full batches of BATCH_SIZE (a short final batch is dropped),
#   4. prefetch so input preparation can overlap with training.
dataset = sequences.map(split_input_target)
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

dataset

In [3]:
from model import MyModel
# Build the network with an output vocabulary matching the tokenizer.
model = MyModel(
    vocab_size=tokenizer.vocab_size(),
    embedding_dim=256,
    rnn_units=1024,
)

# from_logits=True: the loss is told that the model outputs raw logits.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.optimizers.Adamax()

# NOTE(review): run_eagerly=True disables graph compilation of the train step
# and is usually much slower. Confirm it is still required and is not a
# debugging leftover.
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=['accuracy'],
    run_eagerly=True,
)

In [4]:
import os

model_dir = './model'

# tf.train.latest_checkpoint returns None when the directory contains no
# checkpoint (e.g. it was created but training never finished). Passing None
# to load_weights would raise, so weights are restored only when a checkpoint
# was actually found.
latest_checkpoint = (
    tf.train.latest_checkpoint(model_dir) if os.path.exists(model_dir) else None
)
if latest_checkpoint is not None:
    model.load_weights(latest_checkpoint)
    print('Loaded Weights')
else:
    print('No checkpoint found in', model_dir, '- using freshly initialised weights')

Loaded Weights


In [None]:
# Train for EPOCHS epochs and keep the returned History object for plotting.
history = model.fit(dataset, epochs=EPOCHS,)
# Save the trained weights under model_dir with the prefix "weights".
# `os` and `model_dir` are defined in the checkpoint-loading cell, so that cell
# must have been run first.
model.save_weights(os.path.join(model_dir, "weights"))

In [None]:
from matplotlib import pyplot as plt


def plot_graphs(_history):
    """Plot every recorded metric against the epoch index, one panel per metric.

    Args:
        _history: Mapping from metric name to its sequence of per-epoch values,
            e.g. the `history` attribute of the object returned by `model.fit`.

    Returns:
        None. The figure is shown with `plt.show()`. Nothing is drawn when
        `_history` is empty.
    """
    if not _history:
        # plt.subplots cannot build a grid with zero columns.
        return

    # squeeze=False makes subplots always return a 2-D array of Axes. Without
    # it a single metric yields a bare Axes object that cannot be indexed.
    fig, axs = plt.subplots(1, len(_history), figsize=(10, 5), squeeze=False)
    fig.suptitle("Metrics")
    for ax, (title, values) in zip(axs[0], _history.items()):
        ax.plot(values)
        ax.set_xlabel("Epochs")
        ax.set_ylabel(title.title())
    plt.show()


# Plot the per-epoch metrics stored on the History object returned by model.fit.
plot_graphs(history.history)

In [5]:
from abc import ABC

class OneStep(tf.keras.Model, ABC):
    """Samples text from a trained model one token at a time.

    Holds the model, the tokenizer used to encode prompts and decode results,
    and an additive logit mask that rules out token id 0.
    """

    def __init__(self, model, tokenizer: text.SentencepieceTokenizer, *args, **kwargs):
        """Store the model and tokenizer and pre-compute the logit mask.

        Args:
            model: Callable as `model(inputs=..., states=..., return_state=True)`
                and returning `(logits, states)`.
            tokenizer: Tokenizer providing `tokenize`, `detokenize` and
                `vocab_size`.
            *args, **kwargs: Forwarded to `tf.keras.Model.__init__`.
        """
        super().__init__(*args, **kwargs)
        self.model = model
        self.tokenizer = tokenizer

        # Create a mask to prevent "[UNK]" from being generated.
        # NOTE(review): assumes id 0 is the tokenizer's unknown token — confirm
        # against the SentencePiece model in use.
        skip_ids = tf.constant([[0]], dtype=tf.int64)
        sparse_mask = tf.SparseTensor(
            # Put an -inf at each bad index.
            values=[-float('inf')]*len(skip_ids),
            indices=skip_ids,
            # Match the shape to the vocabulary
            dense_shape=[tokenizer.vocab_size()])
        self.prediction_mask = tf.sparse.to_dense(sparse_mask)

    def generate(self, inputs: str, temperature: float = 1.0, steps: int = 200, states = None):
        """Continue `inputs` by sampling `steps` additional tokens.

        The prompt is fed to the model once. Every later step feeds only the
        token sampled in the previous step together with the state returned by
        the model. Feeding the whole sequence again on top of a carried-over
        state would make the model consume the same context repeatedly.
        NOTE(review): this assumes `states` is recurrent state that already
        summarises everything fed so far — confirm against MyModel.

        Args:
            inputs: Prompt text.
            temperature: Divisor applied to the logits before sampling. Values
                below 1 sharpen the distribution, values above 1 flatten it.
            steps: Number of tokens to sample.
            states: Optional initial model state passed to the first model call.

        Returns:
            A scalar string tensor holding the prompt followed by the sampled
            continuation.
        """
        tokens = self.tokenizer.tokenize(inputs)
        output_array = tf.TensorArray(dtype=tf.int32, dynamic_size=True, size=0)
        for i in tf.range(0, len(tokens)):
            output_array = output_array.write(output_array.size(), tokens[i])

        # First model call sees the full prompt, shaped (1, prompt_length).
        step_inputs = tf.convert_to_tensor([output_array.stack()])

        for _ in tf.range(steps):
            predicted_logits, states = self.model(inputs=step_inputs, states=states,
                                                  return_state=True)
            # Only use the last prediction.
            predicted_logits = predicted_logits[:, -1, :]
            predicted_logits = predicted_logits / temperature
            # Apply the prediction mask: prevent "[UNK]" from being generated.
            predicted_logits = predicted_logits + self.prediction_mask

            # Sample the output logits to generate token IDs; shape (1, 1).
            predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
            predicted_ids = tf.cast(predicted_ids, dtype=tf.int32)
            predicted_id = tf.squeeze(predicted_ids, axis=-1)[0]
            output_array = output_array.write(output_array.size(), predicted_id)

            # From now on the state carries the context, so feed only the
            # freshly sampled token.
            step_inputs = predicted_ids

        return self.tokenizer.detokenize(output_array.stack())

In [6]:
# Wrap the model and tokenizer in the sampling helper.
one_step = OneStep(model, tokenizer)

In [8]:
# Prompt that the generated text continues.
test = "We know each other"
# NOTE(review): `states` is assigned here but never passed to generate(), which
# therefore uses its own default of None.
states = None
# Sample 250 tokens after the prompt and print the decoded string. Sampling is
# random and unseeded, so the text differs between runs.
print(one_step.generate(test, steps = 250).numpy().decode())

We know each other thing I want.. They be screamin it out.
i know this one thing must go on till it stop me, feel it right, yo' work bow do i need some rappin? i know what i'm talkin is real, it's snaping there aint no star under the light, you're all loose im feelin blown. And dark straight, plus trip flight, momma, we get snow, grab them millionaires pocket, empty, hollow in the rain, im sick bumpin, i'm hot en clothin through the rock..
i got that top butt drop in God we got pass smoke to get us all the night waits till i love em to shine in these i.m. bottl
