In [1]:
import tensorflow as tf

import re
import pandas as pd
from tokenization import FullTokenizer

In [None]:
# Column separator of dataset.csv (passed to pd.read_csv when the corpus is loaded).
DELIMITER = '|'
# Token ids per training example. Sequences are cut at seq_length + 1 ids so that
# the input and the target can be offset from each other by one position.
seq_length = 128

# Number of sequences per training batch.
BATCH_SIZE = 128
# Size of the shuffle buffer used by the tf.data pipeline.
BUFFER_SIZE = 10000
# Number of epochs passed to model.fit.
EPOCHS = 10

In [None]:
# Read the corpus with every column kept as text, then shuffle all rows.
# sample(frac=1) is unseeded, so the row order differs from run to run.
dataset_df = pd.read_csv('dataset.csv', delimiter=DELIMITER, dtype=str)
dataset_df = dataset_df.sample(frac=1)

# Concatenate the 'lyrics' column into one string and collapse every run of
# three or more newlines into a single newline.
lyrics = re.sub(r'\n{3,}', '\n', dataset_df['lyrics'].str.cat())

In [2]:
# Build the tokenizer from the model file 'sp_model.model'.
tokenizer = FullTokenizer(spm_model_file='sp_model.model')
# Report the vocabulary size.
# NOTE(review): the message says "characters", but the vocabulary entries are
# presumably SentencePiece pieces rather than single characters — confirm.
print(f'{tokenizer.vocab_size} unique characters')

loading sentence piece model
512 unique characters


In [None]:
# Encode the whole corpus with the tokenizer; the result is turned into an id tensor below.
tokens = tokenizer.tokenize(lyrics)
# NOTE(review): int16 can only hold ids up to 32767. That covers the vocabulary
# size printed by the tokenizer cell, but revisit the dtype if the vocabulary grows.
all_ids = tf.convert_to_tensor(tokens, dtype=tf.int16)
print(all_ids)

# One dataset element per token id.
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)
# Group the ids into chunks of seq_length + 1 (one extra id so that input and
# target can be shifted by one position); drop_remainder discards a short final chunk.
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True, name="batched_lyrics_as_ids")
sequences

In [None]:
# Sanity check: show the first chunk both as tokens and as detokenized text.
for seq in sequences.take(1):
  print(tokenizer.convert_ids_to_tokens(seq))
  print(tokenizer.detokenize(seq))

In [None]:
def split_input_target(sequence: "tf.Tensor") -> "tuple[tf.Tensor, tf.Tensor]":
  """Split one chunk of token ids into an (input, target) training pair.

  The target is the input shifted by one position, so the element at index i of
  the target is the token that follows the element at index i of the input.

  Args:
    sequence: A sliceable sequence of token ids. In this notebook it is one
      element of `sequences`, i.e. a 1-D tensor of ids, not a list of strings.

  Returns:
    A pair `(sequence[:-1], sequence[1:])`.
  """
  # Everything except the last id is the model input; everything except the
  # first id is what the model should predict.
  return sequence[:-1], sequence[1:]

In [None]:
# Training pipeline: turn each chunk into an (input, target) pair, shuffle the
# pairs through a BUFFER_SIZE-element buffer, group them into full batches of
# BATCH_SIZE (a short final batch is dropped), and prefetch while training runs.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    # tf.data.AUTOTUNE is the stable name of the deprecated
    # tf.data.experimental.AUTOTUNE alias.
    .prefetch(tf.data.AUTOTUNE))

dataset

In [3]:
from model import MyModel
# Instantiate the network with an output size equal to the tokenizer vocabulary.
model = MyModel(
    vocab_size=tokenizer.vocab_size,
    embedding_dim=256,
    rnn_units=1024,
)

# The loss is configured with from_logits=True, i.e. it expects unnormalised
# scores from the model rather than probabilities.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.optimizers.Adamax()
model.compile(
    loss=loss,
    optimizer=optimizer,
    metrics=['accuracy'],
)

In [4]:
import os

# Directory that holds the weights saved by earlier training runs.
model_dir = './model'
# tf.train.latest_checkpoint returns None when it finds no checkpoint, which also
# happens for an existing but empty directory. Only restore when a checkpoint was
# actually found, instead of handing None to load_weights.
latest_checkpoint = (
    tf.train.latest_checkpoint(model_dir) if os.path.exists(model_dir) else None
)
if latest_checkpoint is not None:
    model.load_weights(latest_checkpoint)
    print('Loaded Weights')

Loaded Weights


In [None]:
# Train for EPOCHS epochs on the batched dataset; `history` is what the plotting cell reads.
history = model.fit(dataset, epochs=EPOCHS,)
# Save the trained weights under model_dir using the file prefix "weights".
model.save_weights(os.path.join(model_dir, "weights"))

In [None]:
from matplotlib import pyplot as plt


def plot_graphs(_history):
    """Plot every metric series in ``_history`` side by side, one subplot each.

    Args:
        _history: Mapping from metric name to a sequence of values, one value
            per epoch (this notebook passes ``history.history`` from
            ``model.fit``).

    Raises:
        ValueError: If ``_history`` contains no metrics.
    """
    if not _history:
        raise ValueError("_history contains no metrics to plot")

    # squeeze=False keeps `axs` two-dimensional even for a single metric; with
    # the default, one subplot comes back as a bare Axes that cannot be indexed.
    fig, axs = plt.subplots(1, len(_history), figsize=(10, 5), squeeze=False)
    fig.suptitle("Metrics")
    for ax, (title, values) in zip(axs[0], _history.items()):
        ax.plot(values)
        ax.set_xlabel("Epochs")
        ax.set_ylabel(title.title())
    plt.show()


plot_graphs(history.history)

In [5]:
from abc import ABC

class OneStep(tf.keras.Model, ABC):
    """Samples text from a trained model, one token per model call.

    Wraps the model together with the tokenizer and a logit mask that keeps the
    tokenizer's unknown-token id from ever being sampled.
    """

    def __init__(self, model, tokenizer: FullTokenizer, *args, **kwargs):
        """Store the model and tokenizer and precompute the sampling mask.

        Args:
            model: Callable used as ``model(inputs=..., states=..., return_state=True)``
                and expected to return ``(logits, states)``.
            tokenizer: Tokenizer used to encode the prompt and decode the result.
            *args: Forwarded to ``tf.keras.Model.__init__``.
            **kwargs: Forwarded to ``tf.keras.Model.__init__``.
        """
        super().__init__(*args, **kwargs)
        self.model = model
        self.tokenizer = tokenizer

        # Create a mask to prevent "[UNK]" from being generated.
        skip_ids = tf.constant([[tokenizer.sp_model.unk_id()]], dtype=tf.int64)
        sparse_mask = tf.SparseTensor(
            # Put an -inf at each bad index.
            values=[-float('inf')]*len(skip_ids),
            indices=skip_ids,
            # Match the shape to the vocabulary
            dense_shape=[tokenizer.vocab_size])
        # Dense vector over the vocabulary: -inf at the skipped id, 0 elsewhere.
        self.prediction_mask = tf.sparse.to_dense(sparse_mask)

    def generate(self, inputs: str, states=None, temperature: float = 1.0, steps: int = 200):
        """Continue the prompt ``inputs`` by sampling ``steps`` further tokens.

        Args:
            inputs: Prompt text; it is encoded with ``self.tokenizer.tokenize``.
            states: State handed to the model on the first call. ``None`` is
                passed through to the model unchanged.
            temperature: Value the logits are divided by before sampling.
            steps: Number of tokens to sample after the prompt.

        Returns:
            The result of ``self.tokenizer.detokenize`` applied to the prompt
            ids followed by the sampled ids.

        Raises:
            ValueError: If the prompt produces no tokens.
        """
        tokens = self.tokenizer.tokenize(inputs)
        if len(tokens) == 0:
            raise ValueError("inputs must contain at least one token")

        # Collect the prompt ids and, later, every sampled id.
        output_array = tf.TensorArray(dtype=tf.int16, dynamic_size=True, size=0)
        for token in tokens:
            output_array = output_array.write(output_array.size(), token)

        # The first call feeds the whole prompt as a batch of one sequence.
        # Later calls feed only the newest token together with the returned
        # states; re-feeding the full sequence on top of carried-over states
        # would process the same history again on every step.
        # NOTE(review): assumes the returned states summarise every token seen
        # so far (standard recurrent state) — confirm against MyModel.
        model_inputs = tf.convert_to_tensor([output_array.stack()])
        for _ in tf.range(steps):
            predicted_logits, states = self.model(inputs=model_inputs, states=states,
                                                  return_state=True)
            # Only use the last prediction.
            predicted_logits = predicted_logits[:, -1, :]
            predicted_logits = predicted_logits / temperature
            # Apply the prediction mask: prevent "[UNK]" from being generated.
            predicted_logits = predicted_logits + self.prediction_mask

            # Sample the output logits to generate token IDs.
            predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
            predicted_ids = tf.squeeze(predicted_ids, axis=-1)
            predicted_id = tf.cast(predicted_ids[0], dtype=tf.int16)
            output_array = output_array.write(output_array.size(), predicted_id)

            # Next call: a batch of one sequence holding just the new token.
            model_inputs = tf.convert_to_tensor([[predicted_id]])

        return self.tokenizer.detokenize(output_array.stack())

In [6]:
# Wrap the trained model and the tokenizer in the sampling helper defined above.
one_step = OneStep(model, tokenizer)

In [8]:
# Prompt text that the generated output starts with.
test = "We know each other"
# NOTE(review): `states` is not passed to generate() below, so the call uses the
# method's own default (None) — confirm whether this variable is still needed.
states = None
# Sample 250 tokens after the prompt with the default temperature.
print(one_step.generate(test, steps = 250))

We know each other tonight
Don't say good-bye

Someday you'll dream of God
We'll always be together
And our love is gettin' carted
Off together the water will crawl
Take in kisses underneath
We're livin' in a cheatar end

He is prangin'
And you'll be glad to say
Was ever there to remember
That is my pleasure
We are just me

This might he speaks to you
Brother be memory
Ha ha, gives can you wear
Just please take the end of us
The creator's head together

Some night will we find you better
Some say I'll come inside you
Cause day you gonna
We know each other is a place I want when I keep running around all in a place where ad behave me still You're all i ever needed heaven meet Thee Your better hands
the less of your kingdom suyside it's either pretending he's coming with you even where I need to be
there's no escape where no ones around me it can go on because even if U'm gonna give it up for Remain i'm on

No struggles come over but there's nothing you should do
like* you came with with