In [None]:
"""
The original notebook, including its cell outputs, is available here:
https://colab.research.google.com/drive/1S3Efwg1meS7gfFsPcWe9nQL5j3eObFWc?usp=sharing
"""

In [None]:
!pip install transformers datasets

In [None]:
import kagglehub

# Download the "paultimothymooney/poetry" Kaggle dataset and keep the value
# returned by kagglehub (the location of the dataset files) in `path`.
path = kagglehub.dataset_download("paultimothymooney/poetry")

# Show where the files ended up so the location can be checked by eye.
print("Path to dataset files:", path)

In [None]:
import os

from datasets import load_dataset

# Build the file location from the directory returned by
# kagglehub.dataset_download (`path`) instead of assuming the Kaggle-only
# "/kaggle/input/poetry" mount, so the notebook also runs where the download
# lands in a local cache directory.
data_files = {
    "train": [os.path.join(path, "bieber.txt")]
}

# The "text" loader is given a single file, exposed as the "train" split.
dataset = load_dataset("text", data_files=data_files)

In [None]:
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel

# Tokenizer and TensorFlow LM-head model are both loaded from the same
# pretrained checkpoint.
checkpoint = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(checkpoint)
model = TFGPT2LMHeadModel.from_pretrained(checkpoint)

# The end-of-sequence token doubles as the padding token, both on the
# tokenizer and in the model configuration.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id


In [None]:
def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Every sequence is truncated and padded to exactly 128 tokens.

    Args:
        examples: mapping with a "text" entry to be tokenized.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that text.
    """
    encoding_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(examples["text"], **encoding_options)

# Apply tokenize_function to the whole dataset; with batched=True the function
# is called on batches of examples rather than on one example at a time.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

In [None]:
import numpy as np
import tensorflow as tf


def collate_causal_lm_batch(examples):
    """Stack tokenized examples into one causal-LM training batch.

    Args:
        examples: list of per-example mappings, each holding the
            "input_ids" and "attention_mask" produced by the tokenizer.

    Returns:
        Dict of NumPy arrays with keys "input_ids", "attention_mask" and
        "labels". Labels are a copy of the input ids in which every padded
        position (attention mask 0) is replaced by -100 so that padding is
        excluded from the language-modeling loss.
    """
    input_ids = np.asarray([example["input_ids"] for example in examples], dtype=np.int64)
    attention_mask = np.asarray([example["attention_mask"] for example in examples], dtype=np.int64)
    # The pad token is the EOS token, so the attention mask (not the token id)
    # is the only reliable way to tell padding from real text.
    labels = np.where(attention_mask == 1, input_ids, -100)
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }


# A line with fewer than two real tokens offers no next-token target once the
# labels are shifted; dropping such lines (e.g. blank lines) guarantees that a
# batch can never consist solely of masked labels.
train_examples = tokenized_dataset["train"].filter(
    lambda example: sum(example["attention_mask"]) > 1
)

# `columns` / `label_cols` decide which collated keys actually reach the
# model, so the attention mask and the masked labels must be listed here.
tf_dataset = train_examples.to_tf_dataset(
    columns=["input_ids", "attention_mask"],
    label_cols=["labels"],
    shuffle=True,
    batch_size=4,
    collate_fn=collate_causal_lm_batch,
)


In [None]:
# Fine-tune the model on tf_dataset for 3 epochs using Adam with a 5e-5
# learning rate.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
# No loss is passed to compile().
# NOTE(review): presumably this relies on the model's built-in language-modeling
# loss computed from the labels in the dataset — confirm.
model.compile(optimizer=optimizer)
model.fit(tf_dataset, epochs=3)


In [None]:
# Write the fine-tuned model and its tokenizer into the same directory.
save_dir = "./fine_tuned_gpt2_poetry"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


In [None]:
def generate_lyrics(prompt, num_return_sequences=6, max_length=200):
    """Sample several lyric continuations of ``prompt`` from the model.

    Uses the module-level ``tokenizer`` and ``model``. Sampling is enabled
    (top-k 50, top-p 0.95, temperature 0.8, repetition penalty 1.2).

    Args:
        prompt: text the generated lyrics should start from.
        num_return_sequences: how many variants to sample (default 6).
        max_length: value forwarded to ``generate`` as ``max_length``
            (default 200).

    Returns:
        List of decoded strings, one per generated sequence, with special
        tokens stripped.
    """
    encoded = tokenizer(prompt, return_tensors="tf")
    outputs = model.generate(
        encoded["input_ids"],
        # Passed explicitly: pad and EOS share one id, so the mask cannot be
        # inferred reliably from the token ids alone.
        attention_mask=encoded["attention_mask"],
        max_length=max_length,
        num_return_sequences=num_return_sequences,
        top_k=50,
        top_p=0.95,
        temperature=0.8,
        repetition_penalty=1.2,  # Softer repetition control
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode and return all outputs
    return [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]


In [None]:
# Four-line seed verse handed to the model as the generation prompt.
prompt = (
    "In the moonlight, I feel\n"
    "My heart begins to heal\n"
    "The stars are shining bright\n"
    "I'm floating in the night\n"
)

# Sample the variants once, then print them numbered from 1.
lyrics_variants = generate_lyrics(prompt)

for version_number, lyrics in enumerate(lyrics_variants, start=1):
    print(f"\n🎵 Version {version_number}:\n{lyrics}")
