# Mini Transformer - Poem Generation

Fadhlan Nazhif Azizy
18221128


## Import Library

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from transformers import GPT2Tokenizer, GPT2TokenizerFast
from sklearn.model_selection import train_test_split
from tokenizers import ByteLevelBPETokenizer
# import tensorflow_models as tfm

In [2]:
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_colwidth', None)
# pd.set_option('display.max_rows', None)


## Load dataset

In [3]:
df = pd.read_csv("hf://datasets/merve/poetry/poetry.csv")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
df.to_csv('merve-poetry.csv')

In [5]:
df['content'].iloc[0]

'Let the bird of loudest lay\r\nOn the sole Arabian tree\r\nHerald sad and trumpet be,\r\nTo whose sound chaste wings obey.\r\n\r\nBut thou shrieking harbinger,\r\nFoul precurrer of the fiend,\r\nAugur of the fever\'s end,\r\nTo this troop come thou not near.\r\n\r\nFrom this session interdict\r\nEvery fowl of tyrant wing,\r\nSave the eagle, feather\'d king;\r\nKeep the obsequy so strict.\r\n\r\nLet the priest in surplice white,\r\nThat defunctive music can,\r\nBe the death-divining swan,\r\nLest the requiem lack his right.\r\n\r\nAnd thou treble-dated crow,\r\nThat thy sable gender mak\'st\r\nWith the breath thou giv\'st and tak\'st,\r\n\'Mongst our mourners shalt thou go.\r\n\r\nHere the anthem doth commence:\r\nLove and constancy is dead;\r\nPhoenix and the Turtle fled\r\nIn a mutual flame from hence.\r\n\r\nSo they lov\'d, as love in twain\r\nHad the essence but in one;\r\nTwo distincts, division none:\r\nNumber there in love was slain.\r\n\r\nHearts remote, yet not asunder;\r\nDis

### Split train test dataset 0.7:0.3

In [6]:
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

### Reformat poem, adding, title, theme, and special token

In [7]:
def format_poem_with_title_theme(poem_data):
    theme = poem_data["type"]
    title = poem_data["poem name"]
    lines = poem_data["content"]

    content = "<BOS_LINE>" + lines.replace("\r\n","<EOS_LINE> <BOS_LINE>") + "<EOS_LINE> <END>"
    return f"<TITLE> {title} </TITLE> <THEME> {theme} </THEME> {content}"

In [8]:
train_df['training set'] = train_df.apply(format_poem_with_title_theme, axis=1)
test_df['training set'] = test_df.apply(format_poem_with_title_theme, axis=1)

In [9]:
train_df['training set'].iloc[0]

'<TITLE> [Fortune Hath Taken Thee Away, My Love] </TITLE> <THEME> Love </THEME> <BOS_LINE>Fortune hath taken thee away, my love,<EOS_LINE> <BOS_LINE>My lifes soul and my souls heaven above;<EOS_LINE> <BOS_LINE>Fortune hath taken thee away, my princess;<EOS_LINE> <BOS_LINE>My only light and my true fancys mistress.<EOS_LINE> <BOS_LINE><EOS_LINE> <BOS_LINE>Fortune hath taken all away from me,<EOS_LINE> <BOS_LINE>Fortune hath taken all by taking thee.<EOS_LINE> <BOS_LINE>Dead to all joy, I only live to woe,<EOS_LINE> <BOS_LINE>So fortune now becomes my mortal foe.<EOS_LINE> <BOS_LINE><EOS_LINE> <BOS_LINE>In vain you eyes, you eyes do waste your tears,<EOS_LINE> <BOS_LINE>In vain you sighs do smoke forth my despairs,<EOS_LINE> <BOS_LINE>In vain you search the earth and heaven above,<EOS_LINE> <BOS_LINE>In vain you search, for fortune rules in love.<EOS_LINE> <BOS_LINE><EOS_LINE> <BOS_LINE>Thus now I leave my love in fortunes hands,<EOS_LINE> <BOS_LINE>Thus now I leave my love in fortunes b

## Tokenizer

### Load and train tokenizer BPE

In [10]:
# tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# tokenizer.pad_token = tokenizer.eos_token

In [11]:
# special_tokens = {
#     "additional_special_tokens": ["<TITLE>", "</TITLE>" ,"<BOS_LINE>","<EOS_LINE>","<THEME>,","</THEME>"]
# }
# tokenizer.add_special_tokens(special_tokens)

# vocab_size = len(tokenizer)

In [12]:
# Buat tokenizer ulang
tokenizer = ByteLevelBPETokenizer()

tokenizer.train(
    files=["/content/merve-poetry.csv"],
    vocab_size=500,
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>","<TITLE>", "</TITLE>" ,"<BOS_LINE>","<EOS_LINE>","<THEME>","</THEME>","<END>"]
)

tokenizer.save_model("/content")

['/content/vocab.json', '/content/merges.txt']

In [13]:
tokenizer = GPT2TokenizerFast.from_pretrained("/content")
tokenizer.pad_token = tokenizer.eos_token

### Tokenize train and test dataset

In [14]:
train_texts = train_df['training set'].fillna("").astype(str).tolist()

train_encoding = tokenizer(
    train_texts,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors='pt'
)

test_texts = test_df['training set'].fillna("").astype(str).tolist()

test_encoding = tokenizer(
    test_texts,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors='pt'
)

In [15]:
train_ids = train_encoding['input_ids']
test_ids = test_encoding['input_ids']

## Transformer Model

### Embedding Layer & Positional Encoding

In [16]:
def positional_encoding(length, depth):
  depth = depth/2
  # get position and depth
  positions = np.arange(length)[:, np.newaxis]
  depths = np.arange(depth)[np.newaxis, :]/depth
  #get the radian
  angle_rates = 1 / (10000**depths)
  angle_rads = positions * angle_rates
  #compute encoding
  pos_encoding = np.concatenate(
      [np.sin(angle_rads), np.cos(angle_rads)],
      axis=-1)

  return tf.cast(pos_encoding, dtype=tf.float32)

In [17]:
class PositionalEmbedding(tf.keras.layers.Layer):
  def __init__(self, vocab_size, d_model):
    super().__init__()
    self.d_model = d_model
    self.embedding = tf.keras.layers.Embedding(vocab_size, d_model, mask_zero=True)
    self.pos_encoding = positional_encoding(length=2048, depth=d_model)

  def compute_mask(self, *args, **kwargs):
    return self.embedding.compute_mask(*args, **kwargs)

  def call(self, x):
    #get input length
    length = tf.shape(x)[1]
    #embed into vector
    x = self.embedding(x)
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    #add positional ecoding
    # x = x + self.pos_encoding[tf.newaxis, :length, :]
    return x

In [18]:
class RelativePositionBias(tf.keras.layers.Layer):
    def __init__(self, num_heads, max_distance=128):
        super().__init__()
        self.num_heads = num_heads
        self.max_distance = max_distance
        self.relative_attention_bias = self.add_weight(
            name="rel_pos_bias",
            shape=(2 * max_distance - 1, num_heads),
            initializer="random_normal",
            trainable=True
        )

    def call(self, qlen, klen):
        #get position in context and memory
        context_position = tf.range(qlen)[:, None]
        memory_position = tf.range(klen)[None, :]
        #get relative position
        relative_position = memory_position - context_position
        rp_bucket = tf.clip_by_value(relative_position + self.max_distance - 1, 0, 2 * self.max_distance - 2)
        #get bucket value
        values = tf.gather(self.relative_attention_bias, rp_bucket)
        return tf.transpose(values, [2, 0, 1])

class RelativeSelfAttention(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        assert embed_dim % num_heads == 0

        self.q = tf.keras.layers.Dense(embed_dim)
        self.k = tf.keras.layers.Dense(embed_dim)
        self.v = tf.keras.layers.Dense(embed_dim)
        self.out_proj = tf.keras.layers.Dense(embed_dim)
        self.rel_pos_bias = RelativePositionBias(num_heads)

    def split_heads(self, x):
        x = tf.reshape(x, (tf.shape(x)[0], tf.shape(x)[1], self.num_heads, self.head_dim))
        return tf.transpose(x, [0, 2, 1, 3])

    def call(self, x):
        B, T, _ = tf.shape(x)[0], tf.shape(x)[1], tf.shape(x)[2]
        #input to q, k, v
        q = self.q(x)
        k = self.k(x)
        v = self.v(x)
        #split head
        q = self.split_heads(q)
        k = self.split_heads(k)
        v = self.split_heads(v)

        #calculate attention score
        scores = tf.matmul(q, k, transpose_b=True)
        scores = scores / tf.math.sqrt(tf.cast(self.head_dim, tf.float32))

        #add position bias
        rel_bias = self.rel_pos_bias(T, T)
        scores += tf.expand_dims(rel_bias, axis=0)

        weights = tf.nn.softmax(scores, axis=-1)
        #calculate self attention value
        attn_output = tf.matmul(weights, v)
        attn_output = tf.transpose(attn_output, [0, 2, 1, 3])
        attn_output = tf.reshape(attn_output, (B, T, self.embed_dim))
        return self.out_proj(attn_output)


### Attention Layer

In [19]:
class BaseAttention(tf.keras.layers.Layer):
  def __init__(self, **kwargs):
    super().__init__()
    #initialize multihead, normalization, and add layer
    self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
    self.layernorm = tf.keras.layers.LayerNormalization()
    self.add = tf.keras.layers.Add()

In [20]:
class CausalSelfAttention(BaseAttention):
  def call(self, x):
    #get self attention value
    attn_output = self.mha(
        query=x,
        value=x,
        key=x,
        use_causal_mask = True)
    #add input value with self attention value
    x = self.add([x, attn_output])
    #normalize
    x = self.layernorm(x)
    return x

In [21]:
class GlobalSelfAttention(BaseAttention):
  def call(self, x):
    attn_output = self.mha(
        query=x,
        value=x,
        key=x)
    x = self.add([x, attn_output])
    x = self.layernorm(x)
    return x

### Feed Forward

In [22]:
class FeedForward(tf.keras.layers.Layer):
  def __init__(self, d_model, dff, dropout_rate=0.1):
    super().__init__()
    #initialize feedforward component with activation swish
    self.seq = tf.keras.Sequential([
      tf.keras.layers.Dense(dff, activation='swish'),
      tf.keras.layers.Dense(d_model),
      tf.keras.layers.Dropout(dropout_rate)
    ])
    self.add = tf.keras.layers.Add()
    self.layer_norm = tf.keras.layers.LayerNormalization()

  def call(self, x):
    #add input value with feedforward value
    x = self.add([x, self.seq(x)])
    #normalize
    x = self.layer_norm(x)
    return x

### Decoder

In [23]:
class DecoderLayer(tf.keras.layers.Layer):
  def __init__(self,
               *,
               d_model,
               num_heads,
               dff,
               dropout_rate=0.1):
    super(DecoderLayer, self).__init__()
    #initialize attention approach : absolute positional encoding + self attention | relative positional encoding + relative attention
    self.causal_self_attention = CausalSelfAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.relative_self_attention = RelativeSelfAttention(
        embed_dim=d_model,
        num_heads=num_heads
    )
    #add feedforward layer
    self.ffn = FeedForward(d_model, dff)

  def call(self, x):
    #get slf attention value
    # x = self.causal_self_attention(x=x)
    x = self.relative_self_attention(x)

    self.last_attn_scores = x
    #feed value to feedforward layer
    x = self.ffn(x)
    return x

In [24]:
class Decoder(tf.keras.layers.Layer):
  def __init__(self, *, num_layers, d_model, num_heads, dff, vocab_size,
               dropout_rate=0.1):
    super(Decoder, self).__init__()
    #initialize attribute
    self.d_model = d_model
    self.num_layers = num_layers
    #initialize positional embedding
    self.pos_embedding = PositionalEmbedding(vocab_size=vocab_size,
                                             d_model=d_model)
    self.dropout = tf.keras.layers.Dropout(dropout_rate)
    #initialize decoder component with num_layers of decoder layers
    self.dec_layers = [
        DecoderLayer(d_model=d_model, num_heads=num_heads,
                     dff=dff, dropout_rate=dropout_rate)
        for _ in range(num_layers)]

  def call(self, x):
    #feed to positional embedding
    x = self.pos_embedding(x)
    #dropout
    x = self.dropout(x)
    # feed to decoder layers
    for i in range(self.num_layers):
      x  = self.dec_layers[i](x)

    return x

### Transformer

In [25]:
class Transformer(tf.keras.Model):
  def __init__(self, *, num_layers, d_model, num_heads, dff,
               input_vocab_size, target_vocab_size, dropout_rate=0.1):
    super().__init__()
    #set decoder
    self.decoder = Decoder(num_layers=num_layers, d_model=d_model,
                           num_heads=num_heads, dff=dff,
                           vocab_size=target_vocab_size,
                           dropout_rate=dropout_rate)
    #set output layer with ReLU activation function
    self.final_layer = tf.keras.layers.Dense(target_vocab_size)

  def call(self, inputs):
    x  = inputs
    #feed to decoder
    x = self.decoder(x)
    #feed to output layer
    logits = self.final_layer(x)

    try:
      del logits._keras_mask
    except AttributeError:
      pass

    return logits

## Train

## Loss Function

In [26]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, d_model, warmup_steps=4000):
    super().__init__()
    # set attribute
    self.d_model = d_model
    self.d_model = tf.cast(self.d_model, tf.float32)

    self.warmup_steps = warmup_steps

  def __call__(self, step):
    step = tf.cast(step, dtype=tf.float32)
    #arg1 for reduce learning rate
    arg1 = tf.math.rsqrt(step)
    #arg2 for increase learning rate
    arg2 = step * (self.warmup_steps ** -1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [27]:
def masked_loss(label, pred):
  #cek label value
  mask = label != 0
  #initialize loss function call
  loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')
  #get loss
  loss = loss_object(label, pred)

  #cek is label empty then skip
  mask = tf.cast(mask, dtype=loss.dtype)
  loss *= mask
  #count loss average
  loss = tf.reduce_sum(loss)/tf.reduce_sum(mask)
  return loss


def masked_accuracy(label, pred):
  #get prediction
  pred = tf.argmax(pred, axis=2)
  label = tf.cast(label, pred.dtype)
  #compare to label
  match = label == pred

  mask = label != 0

  #cek is label empty then skip
  match = match & mask

  match = tf.cast(match, dtype=tf.float32)
  mask = tf.cast(mask, dtype=tf.float32)
  #count accuracy
  return tf.reduce_sum(match)/tf.reduce_sum(mask)

### Data Prep

In [28]:
#shift input to 1:n and label 0:n-1

train_inputs = train_ids[:, :-1]
train_labels = train_ids[:, 1:]
test_inputs = test_ids[:, :-1]
test_labels = test_ids[:, 1:]

In [29]:
print(train_inputs.shape)
print(train_labels.shape)

torch.Size([401, 127])
torch.Size([401, 127])


In [30]:
#make dataset in batch
dataset = tf.data.Dataset.from_tensor_slices((train_inputs, train_labels))
dataset = dataset.batch(32).shuffle(100).prefetch(tf.data.AUTOTUNE)
test_dataset = tf.data.Dataset.from_tensor_slices((test_inputs, test_labels))
test_dataset = test_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

In [31]:
vocab_size = len(tokenizer)

## Hyperparameter setup &  Training 1

### Train

In [32]:
num_layers = 2
d_model = 128
dff = 512
num_heads = 4
dropout_rate = 0.2

In [33]:
#set learning rate and optimizer
learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)

In [34]:
#Initialize transformer model with its param
transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=vocab_size,
    target_vocab_size=vocab_size,
    dropout_rate=dropout_rate)

In [35]:
#add loss function, optimizer, and metrics
transformer.compile(
    loss=masked_loss,
    optimizer=optimizer,
    metrics=[masked_accuracy])

In [36]:
transformer.summary()

In [37]:
#train model
transformer.fit(dataset, epochs=100, validation_data=test_dataset)

Epoch 1/100




[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 738ms/step - loss: 6.1242 - masked_accuracy: 0.0778 - val_loss: 6.0938 - val_masked_accuracy: 0.0803
Epoch 2/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 553ms/step - loss: 6.0565 - masked_accuracy: 0.0817 - val_loss: 5.9792 - val_masked_accuracy: 0.0833
Epoch 3/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 555ms/step - loss: 5.9229 - masked_accuracy: 0.0848 - val_loss: 5.8298 - val_masked_accuracy: 0.0837
Epoch 4/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 680ms/step - loss: 5.7783 - masked_accuracy: 0.0832 - val_loss: 5.6803 - val_masked_accuracy: 0.0837
Epoch 5/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 552ms/step - loss: 5.6244 - masked_accuracy: 0.0839 - val_loss: 5.5438 - val_masked_accuracy: 0.0837
Epoch 6/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 644ms/step - loss: 5.4879 - masked_accuracy: 0.08

<keras.src.callbacks.history.History at 0x7e815eb81ed0>

### Test

In [38]:

text = '<TITLE> Love is Magic </TITLE> <THEME> Love </THEME>'

#encode text
encoding = tokenizer(
    text,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors='pt'
)

text_ids = encoding['input_ids']

#get output
output = transformer(text_ids)

In [39]:
predicted_token_ids = tf.argmax(output, axis=-1)

In [40]:
decoded_text = tokenizer.decode(predicted_token_ids[0], skip_special_tokens=True)

In [41]:
#get decoded next text
decoded_text

'TITLE> Love  MagE </TITLE> <THEME> Love </THEME>D'

In [42]:
def generate_poem(model, tokenizer, prompt, max_length=100):
    # Tokenisasi input prompt
    input_ids = tokenizer(
        prompt,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors='tf'
    )['input_ids']

    for _ in range(max_length):
        logits = model(input_ids, training=False)

        next_token_logits = logits[:, -1, :]

        next_token_id = tf.argmax(next_token_logits, axis=-1, output_type=tf.int32)
        next_token_id = tf.expand_dims(next_token_id, axis=-1)

        input_ids = tf.concat([input_ids, next_token_id], axis=-1)

        token_val = next_token_id.numpy()[0][0]
        if token_val == tokenizer.eos_token_id or token_val == tokenizer.pad_token_id:
            break

    generated_text = tokenizer.decode(
        input_ids.numpy()[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True
    )
    return generated_text

In [44]:
poem = generate_poem(transformer, tokenizer, text)
poem

'<TITLE> Love is Magic </TITLE> <THEME> Love </THEME>Dry         ir ir ir. ir..........................................ineersersersersersersersomine)ineine)ineine)ineine)ine))ine)ineineineine)ineainineainineainainainain'

In [None]:
decoded

In [None]:
print("EOS token id:", tokenizer.eos_token_id)
print("PAD token id:", tokenizer.pad_token_id)

In [45]:
!pip install sacrebleu



In [46]:
import torch
import sacrebleu
from tqdm import tqdm

generated_poems = []
references = []

for i in tqdm(range(len(test_ids))):
    input_sample = test_ids[i:i+1]

    with torch.no_grad():
        output_logits = transformer(input_sample)

    predicted_ids = tf.argmax(output_logits, axis=-1)

    decoded = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    generated_poems.append(decoded)

    reference_ids = test_labels[i]
    reference_text = tokenizer.decode(reference_ids, skip_special_tokens=True)
    references.append([reference_text])
bleu = sacrebleu.corpus_bleu(generated_poems, references)
print(f"\nCorpus BLEU score: {bleu.score:.2f}")


100%|██████████| 172/172 [00:23<00:00,  7.45it/s]



Corpus BLEU score: 68.24


In [47]:
for i in range(len(generated_poems)):
    print(f"\n--- Poem {i+1} ---")
    print("Generated:\n", generated_poems[i])
    print("Reference:\n", test_ids[i])


--- Poem 1 ---
Generated:
 TITLE> Jas freedom is a breakfoofood] </TITLE> <THEME> Love </THEME> <BOS_LINE>E.E. Cummings, JE freedom is a breakfoofood] from Complete Poems 1994-1962, edited by Yeor..
Reference:
 tensor([ 39,  63,  52,  63,  55,  48,  41, 232,  70, 355, 282, 275, 305, 314,
        358, 274, 280, 275,  76,  86,  81, 457,  81,  90, 348,  72, 232,  39,
         26,  63,  52,  63,  55,  48,  41, 232,  39,  63,  51,  48,  56,  48,
         41, 232, 407, 232,  39,  26,  63,  51,  48,  56,  48,  41, 232,  39,
         45,  58,  62,  74,  55,  52,  57,  48,  41,  48,  25,  48,  25, 402,
         96,  88,  88, 294,  94,  23, 232,  70, 355, 282, 275, 305, 314, 358,
        274, 280, 275,  76,  86,  81, 457,  81,  90, 348,  72, 415, 402, 314,
         91,  87, 317,  80, 404,  90,  80,  88,  94, 232,  28,  36,  27,  31,
         24,  28,  36,  33,  29,  23, 232, 305, 291, 305, 394, 232,  50,  80,
        293, 430])

--- Poem 2 ---
Generated:
 TITLE> Sonnet Love7: How like a winter 

In [48]:
from sklearn.metrics import precision_recall_fscore_support
from tqdm import tqdm

all_preds = []
all_labels = []

for i in tqdm(range(len(test_ids))):
    input_sample = test_ids[i:i+1]
    output_logits = transformer(input_sample, training=False)
    predicted_ids = tf.argmax(output_logits, axis=-1).numpy()[0]

    label_ids = test_labels[i]

    pred_tokens = [p for p, l in zip(predicted_ids, label_ids) if l != -100]
    true_tokens = [l for l in label_ids if l != -100]

    all_preds.extend(pred_tokens)
    all_labels.extend(true_tokens)

precision, recall, f1, _ = precision_recall_fscore_support(
    all_labels, all_preds, average='macro'
)

100%|██████████| 172/172 [00:17<00:00,  9.72it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
print(f"Precision: {precision:.4f} | Recall: {recall:.4f} | F1: {f1:.4f}")

In [None]:
bleu = sacrebleu.corpus_bleu(generated_poems, references)
print(f"\nCorpus BLEU score: {bleu.score:.2f}")

## Hyperparameter setup & Train 2

### Train

In [None]:
num_layers = 4
d_model = 128
dff = 512
num_heads = 4
dropout_rate = 0.2

In [None]:
learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)

In [None]:
transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=vocab_size,
    target_vocab_size=vocab_size,
    dropout_rate=dropout_rate)

In [None]:
transformer.compile(
    loss=masked_loss,
    optimizer=optimizer,
    metrics=[masked_accuracy])

In [None]:
transformer.summary()

In [None]:
transformer.fit(dataset, epochs=100, validation_data=test_dataset)

### Test

In [None]:

text = '<TITLE> Love is Magic </TITLE> <THEME> Love </THEME>'

encoding = tokenizer(
    text,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors='pt'
)

text_ids = encoding['input_ids']

output = transformer(text_ids)

In [None]:
predicted_token_ids = tf.argmax(output, axis=-1)

In [None]:
decoded_text = tokenizer.decode(predicted_token_ids[0], skip_special_tokens=True)

In [None]:
decoded_text

In [None]:
def generate_poem(model, tokenizer, prompt, max_length=100, temperature=1.0):
    input_ids = tokenizer(
        prompt,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors='tf'
    )['input_ids']

    for _ in range(max_length):
        logits = model(input_ids, training=False)
        next_token_logits = logits[:, -1, :]
        next_token_logits = next_token_logits / temperature

        next_token_id = tf.random.categorical(next_token_logits, num_samples=1)
        next_token_id = tf.cast(next_token_id, input_ids.dtype)

        input_ids = tf.concat([input_ids, next_token_id], axis=-1)

        token_id_val = next_token_id.numpy()[0, 0]
        if token_id_val == tokenizer.eos_token_id or token_id_val == tokenizer.pad_token_id:
            break

    generated_text = tokenizer.decode(input_ids.numpy()[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
    return generated_text

In [None]:
poem = generate_poem(transformer, tokenizer, text, temperature=0.5)
poem

In [None]:
print("EOS token id:", tokenizer.eos_token_id)
print("PAD token id:", tokenizer.pad_token_id)

In [None]:
import torch
import sacrebleu
from tqdm import tqdm

generated_poems = []
references = []

for i in tqdm(range(len(test_ids))):
    input_sample = test_ids[i:i+1]

    with torch.no_grad():
        output_logits = transformer(input_sample)

    predicted_ids = tf.argmax(output_logits, axis=-1)

    decoded = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    generated_poems.append(decoded)

    reference_ids = test_labels[i]
    reference_text = tokenizer.decode(reference_ids, skip_special_tokens=True)
    references.append([reference_text])
bleu = sacrebleu.corpus_bleu(generated_poems, references)
print(f"\nCorpus BLEU score: {bleu.score:.2f}")


In [None]:
for i in range(len(generated_poems)):
    print(f"\n--- Poem {i+1} ---")
    print("Generated:\n", generated_poems[i])
    print("Reference:\n", test_ids[i])

In [None]:
from sklearn.metrics import precision_recall_fscore_support
from tqdm import tqdm

all_preds = []
all_labels = []

for i in tqdm(range(len(test_ids))):
    input_sample = test_ids[i:i+1]
    output_logits = transformer(input_sample, training=False)
    predicted_ids = tf.argmax(output_logits, axis=-1).numpy()[0]

    label_ids = test_labels[i]

    pred_tokens = [p for p, l in zip(predicted_ids, label_ids) if l != -100]
    true_tokens = [l for l in label_ids if l != -100]

    all_preds.extend(pred_tokens)
    all_labels.extend(true_tokens)

precision, recall, f1, _ = precision_recall_fscore_support(
    all_labels, all_preds, average='macro'
)

In [None]:
print(f"Precision: {precision:.4f} | Recall: {recall:.4f} | F1: {f1:.4f}")

In [None]:
bleu = sacrebleu.corpus_bleu(generated_poems, references)
print(f"\nCorpus BLEU score: {bleu.score:.2f}")

## Hyperparameter setup & Train 3

### Train

In [None]:
num_layers = 2
d_model = 256
dff = 512
num_heads = 4
dropout_rate = 0.2

In [None]:
learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)

In [None]:
transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=vocab_size,
    target_vocab_size=vocab_size,
    dropout_rate=dropout_rate)

In [None]:
transformer.compile(
    loss=masked_loss,
    optimizer=optimizer,
    metrics=[masked_accuracy])

In [None]:
transformer.summary()

In [None]:
transformer.fit(dataset, epochs=100, validation_data=test_dataset)

### Test

In [None]:

text = '<TITLE> Love is Magic </TITLE> <THEME> Love </THEME>'

encoding = tokenizer(
    text,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors='pt'
)

text_ids = encoding['input_ids']

output = transformer(text_ids)

In [None]:
predicted_token_ids = tf.argmax(output, axis=-1)

In [None]:
decoded_text = tokenizer.decode(predicted_token_ids[0], skip_special_tokens=True)

In [None]:
decoded_text

In [None]:
def generate_poem(model, tokenizer, prompt, max_length=100, temperature=1.0):
    input_ids = tokenizer(
        prompt,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors='tf'
    )['input_ids']  # e.g., int64 tensor

    for _ in range(max_length):
        logits = model(input_ids, training=False)
        next_token_logits = logits[:, -1, :]
        next_token_logits = next_token_logits / temperature

        next_token_id = tf.random.categorical(next_token_logits, num_samples=1)  # default int64
        next_token_id = tf.cast(next_token_id, input_ids.dtype)  # cast to same dtype as input_ids

        input_ids = tf.concat([input_ids, next_token_id], axis=-1)

        token_id_val = next_token_id.numpy()[0, 0]
        if token_id_val == tokenizer.eos_token_id or token_id_val == tokenizer.pad_token_id:
            break

    generated_text = tokenizer.decode(input_ids.numpy()[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
    return generated_text

In [None]:
poem = generate_poem(transformer, tokenizer, text, temperature=0.5)
poem

In [None]:
print("EOS token id:", tokenizer.eos_token_id)
print("PAD token id:", tokenizer.pad_token_id)

In [None]:
import torch
import sacrebleu
from tqdm import tqdm

generated_poems = []
references = []

for i in tqdm(range(len(test_ids))):
    input_sample = test_ids[i:i+1]

    with torch.no_grad():
        output_logits = transformer(input_sample)

    predicted_ids = tf.argmax(output_logits, axis=-1)

    decoded = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    generated_poems.append(decoded)

    reference_ids = test_labels[i]
    reference_text = tokenizer.decode(reference_ids, skip_special_tokens=True)
    references.append([reference_text])
bleu = sacrebleu.corpus_bleu(generated_poems, references)
print(f"\nCorpus BLEU score: {bleu.score:.2f}")


In [None]:
for i in range(len(generated_poems)):
    print(f"\n--- Poem {i+1} ---")
    print("Generated:\n", generated_poems[i])
    print("Reference:\n", test_ids[i])

In [None]:
from sklearn.metrics import precision_recall_fscore_support
from tqdm import tqdm

all_preds = []
all_labels = []

for i in tqdm(range(len(test_ids))):
    input_sample = test_ids[i:i+1]
    output_logits = transformer(input_sample, training=False)
    predicted_ids = tf.argmax(output_logits, axis=-1).numpy()[0]

    label_ids = test_labels[i]

    pred_tokens = [p for p, l in zip(predicted_ids, label_ids) if l != -100]
    true_tokens = [l for l in label_ids if l != -100]

    all_preds.extend(pred_tokens)
    all_labels.extend(true_tokens)

precision, recall, f1, _ = precision_recall_fscore_support(
    all_labels, all_preds, average='macro'
)

In [None]:
print(f"Precision: {precision:.4f} | Recall: {recall:.4f} | F1: {f1:.4f}")

In [None]:
bleu = sacrebleu.corpus_bleu(generated_poems, references)
print(f"\nCorpus BLEU score: {bleu.score:.2f}")