In [None]:
import pandas as pd
import numpy as np
import pickle
import random

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Dropout, LayerNormalization, Dense, Lambda
from tensorflow.keras.layers import MultiHeadAttention
from tensorflow.keras.models import Model

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Directory that the data-loading cell reads the pre-split .npy arrays from.
data_path = "./data"
# Directory that holds the pickled tokenizer and receives the saved models.
model_path = "./model"

In [None]:
class MusicTokenizer:
    """Whitespace tokenizer that maps tokens to integer ids and back.

    Ids are assigned in first-seen order, starting at 0. The two lookup
    tables are plain dicts stored on the instance.
    """

    def __init__(self):
        # Forward (token -> id) and reverse (id -> token) lookup tables.
        self.token_to_id = {}
        self.id_to_token = {}

    def build_vocab(self, sequences):
        """Register every token in `sequences` that is not in the vocabulary yet.

        Args:
            sequences: iterable of strings; each is stripped and split on whitespace.
        """
        for sequence in sequences:
            for symbol in sequence.strip().split():
                if symbol in self.token_to_id:
                    continue
                # The next free id is the current vocabulary size.
                next_id = len(self.token_to_id)
                self.token_to_id[symbol] = next_id
                self.id_to_token[next_id] = symbol

    def encode(self, sequences):
        """Convert an iterable of token strings into a list of id lists.

        Raises:
            KeyError: if a token is not in the vocabulary.
        """
        encoded = []
        for sequence in sequences:
            symbols = sequence.strip().split()
            encoded.append([self.token_to_id[symbol] for symbol in symbols])
        return encoded

    def decode(self, id_sequences):
        """Convert an iterable of id sequences into space-joined token strings.

        Raises:
            KeyError: if an id is not in the vocabulary.
        """
        decoded = []
        for ids in id_sequences:
            tokens = [self.id_to_token[token_id] for token_id in ids]
            decoded.append(" ".join(tokens))
        return decoded

    def decode2(self, id_sequences):
        """Decode ONE flat id sequence; returns a single-element list with the string.

        Raises:
            KeyError: if an id is not in the vocabulary.
        """
        tokens = [self.id_to_token[token_id] for token_id in id_sequences]
        return [" ".join(tokens)]

In [None]:
class PositionalEmbeddingAdder(tf.keras.layers.Layer):
    """Add a learned positional embedding to a sequence of token embeddings.

    Args:
        max_seq_length: number of rows in the position-embedding table, i.e.
            the largest sequence length that can be looked up.
        d_model: width of each position embedding.
        **kwargs: forwarded to `tf.keras.layers.Layer` (name, dtype, ...).
    """

    def __init__(self, max_seq_length, d_model, **kwargs):
        super().__init__(**kwargs)
        self.max_seq_length = max_seq_length
        self.d_model = d_model
        self.position_embeddings = Embedding(max_seq_length, d_model)

    def call(self, x):
        """Return `x` plus the embedding of each position index along axis 1.

        Assumes `x` is (batch, seq_len, d_model) with seq_len <= max_seq_length;
        the (seq_len, d_model) position embeddings broadcast over the batch axis.
        """
        # Axis 1 is taken as the sequence axis; look up one embedding per position.
        positions = tf.range(start=0, limit=tf.shape(x)[1], delta=1)
        pos_embeds = self.position_embeddings(positions)
        return x + pos_embeds

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created.

        Spelling them out keeps saving/loading of a model that contains this
        layer from depending on the framework inferring `__init__` arguments.
        """
        config = super().get_config()
        config.update({
            "max_seq_length": self.max_seq_length,
            "d_model": self.d_model,
        })
        return config

class LastToken(tf.keras.layers.Layer):
    """Select the final position along axis 1 of a rank-3 input."""

    def call(self, x):
        # Keep every batch row and every feature, but only the last step of axis 1.
        final_step = x[:, -1, :]
        return final_step

# Transformer model definition
def transformer_model(input_vocab_size, output_vocab_size, max_seq_length, d_model=128, num_heads=4, num_layers=2, dropout_rate=0.25):
    """Build a causal Transformer that predicts the token following a fixed-length window.

    Args:
        input_vocab_size: number of rows in the token-embedding table.
        output_vocab_size: number of classes in the final softmax.
        max_seq_length: length of every input sequence (fixed input shape).
        d_model: embedding / hidden width.
        num_heads: attention heads per block.
        num_layers: number of stacked Transformer blocks.
        dropout_rate: dropout applied after attention and after the feed-forward part.

    Returns:
        An uncompiled `Model` mapping int32 ids of shape (batch, max_seq_length)
        to softmax probabilities of shape (batch, output_vocab_size).
    """
    token_ids = Input(shape=(max_seq_length,), dtype=tf.int32)

    # Token embedding, then the learned positional embedding added on top.
    embedded = Embedding(input_vocab_size, d_model)(token_ids)
    hidden = PositionalEmbeddingAdder(max_seq_length, d_model)(embedded)

    # Lower-triangular (incl. diagonal) mask: position i may attend to positions <= i.
    # It is identical for every block, so it is built once.
    causal_mask = tf.linalg.band_part(tf.ones((max_seq_length, max_seq_length)), -1, 0)

    for _ in range(num_layers):
        # Self-attention sub-block with residual connection and post-layer-norm.
        # NOTE(review): key_dim is d_model per head, not d_model // num_heads — confirm intended.
        self_attention = MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
        attended = self_attention(hidden, hidden, attention_mask=causal_mask)
        attended = Dropout(dropout_rate)(attended)
        attended = LayerNormalization(epsilon=1e-7)(hidden + attended)

        # Position-wise feed-forward sub-block (4x expansion, then back to d_model).
        # NOTE(review): the projection back to d_model also applies GELU — confirm intended.
        expanded = Dense(d_model * 4, activation='gelu')(attended)
        projected = Dense(d_model, activation='gelu')(expanded)
        projected = Dropout(dropout_rate)(projected)

        hidden = LayerNormalization(epsilon=1e-7)(attended + projected)

    # Only the last position's representation is used to predict the next token.
    final_state = LastToken()(hidden)

    # Distribution over the output vocabulary.
    probabilities = Dense(output_vocab_size, activation='softmax')(final_state)

    return Model(inputs=token_ids, outputs=probabilities)

In [None]:
# Load the pre-split input (x_*) and target (y_*) arrays from `data_path`,
# in the fixed order train, test, val for x and then for y.
x_train, x_test, x_val, y_train, y_test, y_val = (
    np.load(f'{data_path}/{array_name}.npy')
    for array_name in ('x_train', 'x_test', 'x_val', 'y_train', 'y_test', 'y_val')
)

In [None]:
# Restore the fitted tokenizer; its `token_to_id` mapping sets the vocabulary size below.
# Presumably this is an instance of the MusicTokenizer class defined earlier in this
# notebook, in which case that cell must have been run before unpickling — TODO confirm.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open(f"{model_path}/tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)

In [None]:
# Training hyperparameters.
batch_size = 32
epochs = 5
seed = 42

# Model dimensions derived from the loaded tokenizer and data:
# one class per known token, and the window length of the first training sample.
vocab_size = len(tokenizer.token_to_id)
max_seq_length = len(x_train[0])

# Seed Python's `random`, NumPy and TensorFlow before any weights are created so that
# weight initialisation, dropout and shuffling start from the same state every time
# this cell is re-run. (Some GPU kernels may still be non-deterministic.)
tf.keras.utils.set_random_seed(seed)

model = transformer_model(
    input_vocab_size=vocab_size,
    output_vocab_size=vocab_size,
    max_seq_length=max_seq_length,
    d_model=256,
    num_heads=8,
    num_layers=4
)

# Targets are integer class ids, hence the sparse categorical cross-entropy loss.
model.compile(optimizer=Adam(learning_rate=1e-4), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [None]:
# First training run: `epochs` passes over the training split, validating on
# the validation split after each epoch.
model.fit(
    x=x_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_val, y_val),
)

Epoch 1/5
[1m17650/17650[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m948s[0m 52ms/step - accuracy: 0.6154 - loss: 1.4129 - val_accuracy: 0.7121 - val_loss: 0.9864
Epoch 2/5
[1m17650/17650[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m893s[0m 51ms/step - accuracy: 0.7226 - loss: 0.9459 - val_accuracy: 0.7473 - val_loss: 0.8679
Epoch 3/5
[1m17650/17650[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m891s[0m 50ms/step - accuracy: 0.7498 - loss: 0.8482 - val_accuracy: 0.7620 - val_loss: 0.8088
Epoch 4/5
[1m17650/17650[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m886s[0m 50ms/step - accuracy: 0.7657 - loss: 0.7908 - val_accuracy: 0.7709 - val_loss: 0.7909
Epoch 5/5
[1m17650/17650[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m881s[0m 50ms/step - accuracy: 0.7781 - loss: 0.7465 - val_accuracy: 0.7807 - val_loss: 0.7575


<keras.src.callbacks.history.History at 0x7a14105da910>

In [None]:
# Score the model on the test split. evaluate() yields the loss followed by the
# single 'accuracy' metric the model was compiled with.
loss, accuracy = model.evaluate(
    x=x_test,
    y=y_test,
    batch_size=batch_size,
)
print(f"Test Accuracy: {accuracy:.4f}")

[1m2207/2207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 17ms/step - accuracy: 0.7827 - loss: 0.7613
Test Accuracy: 0.7816


In [None]:
# Checkpoint the model after the first training run.
# NOTE(review): the model contains the custom layers PositionalEmbeddingAdder and
# LastToken, so loading it back will presumably need them passed via
# `custom_objects` (or registered as serializable) — confirm.
model.save(f"{model_path}/model_5epochs.keras")

In [None]:
# Continue training the same `model` object for another `epochs` epochs.
# `initial_epoch` makes Keras number these epochs epochs+1 .. 2*epochs instead of
# restarting at 1, so logs and any epoch-indexed callbacks see the true epoch.
# With `initial_epoch` set, `epochs` is the index of the FINAL epoch, so the
# number of additional epochs trained is unchanged.
model.fit(
    x_train, y_train,
    validation_data=(x_val, y_val),
    initial_epoch=epochs,
    epochs=2 * epochs,
    batch_size=batch_size
)

Epoch 1/5
[1m17650/17650[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m881s[0m 50ms/step - accuracy: 0.7888 - loss: 0.7112 - val_accuracy: 0.7868 - val_loss: 0.7378
Epoch 2/5
[1m17650/17650[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m882s[0m 50ms/step - accuracy: 0.7973 - loss: 0.6803 - val_accuracy: 0.7925 - val_loss: 0.7292
Epoch 3/5
[1m17650/17650[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m896s[0m 51ms/step - accuracy: 0.8056 - loss: 0.6502 - val_accuracy: 0.8004 - val_loss: 0.7052
Epoch 4/5
[1m17650/17650[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m898s[0m 51ms/step - accuracy: 0.8147 - loss: 0.6220 - val_accuracy: 0.8039 - val_loss: 0.6871
Epoch 5/5
[1m17650/17650[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m904s[0m 51ms/step - accuracy: 0.8215 - loss: 0.5964 - val_accuracy: 0.8091 - val_loss: 0.6779


<keras.src.callbacks.history.History at 0x7a12dc50af10>

In [None]:
# Re-score the model on the test split after the additional training run.
# evaluate() yields the loss followed by the single 'accuracy' metric.
loss, accuracy = model.evaluate(
    x=x_test,
    y=y_test,
    batch_size=batch_size,
)
print(f"Test Accuracy: {accuracy:.4f}")

[1m2207/2207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 18ms/step - accuracy: 0.8087 - loss: 0.6782
Test Accuracy: 0.8089


In [None]:
# Checkpoint the model after the second training run; the "10epochs" in the file
# name assumes two runs of 5 epochs each.
# NOTE(review): the model contains the custom layers PositionalEmbeddingAdder and
# LastToken, so loading it back will presumably need them passed via
# `custom_objects` (or registered as serializable) — confirm.
model.save(f"{model_path}/model_10epochs.keras")