## Carga de librerías

In [None]:
import torch
from datasets import load_dataset
from transformers import GPT2TokenizerFast, GPT2LMHeadModel, TrainingArguments, Trainer 
from sklearn.model_selection import train_test_split

import os
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"

  _warn(("h5py is running against HDF5 {0} when it was built against {1}, "





In [2]:
# Comprobar funcionamiento de CUDA.
torch.cuda.empty_cache()
print("CUDA disponible:", torch.cuda.is_available())
print("Dispositivo actual:", torch.cuda.get_device_name(0))

CUDA disponible: True
Dispositivo actual: NVIDIA GeForce RTX 3060


## Carga del dataset

In [3]:
# Cargamos dataset desde HuggingFace.
dataset = load_dataset("sebastiandizon/genius-song-lyrics", split="train")

Loading dataset shards:   0%|          | 0/19 [00:00<?, ?it/s]

## Filtrado del dataset

En el modelo anterior entrenamos con todos los datos. En este caso, nos quedaremos los registros en inglés, y después dividiremos el dataset en 3 según los géneros rock, pop y rap.

In [4]:
# Filtramos letras de canciones en inglés.
dataset_en = dataset.filter(lambda x: x["language"] == "en")

In [5]:
# Con el dataset ya filtrado en inglés, dividimos en 3 para rock, pop y rap.
rock_dataset = dataset_en.filter(lambda x: x["tag"].lower() == "rock")
pop_dataset = dataset_en.filter(lambda x: x["tag"].lower() == "pop")
rap_dataset = dataset_en.filter(lambda x: x["tag"].lower() == "rap")

In [6]:
# Comprobamos que la filtración haya sido correcta.
print(f"Rock: {len(rock_dataset)} canciones, Pop: {len(pop_dataset)} canciones, Rap: {len(rap_dataset)} canciones")

Rock: 633308 canciones, Pop: 1393559 canciones, Rap: 964605 canciones


A continuación, dividimos en train y test. Para ello, creamos una función que la aplique a los 3 datasets.

In [None]:
# # Función para dividir en train (90%) y test (10%)
# def split_dataset(dataset):
#     dataset = list(dataset)  # Convertir a lista para dividir
#     train_data, test_data = train_test_split(dataset, test_size=0.1, random_state=42)
#     return train_data, test_data

In [None]:
# rock_train, rock_test = split_dataset(rock_dataset)

In [None]:
# pop_train, pop_test = split_dataset(pop_dataset)

In [None]:
# Aplicmaos la función.
# rap_train, rap_test = split_dataset(rap_dataset)

In [None]:
# Comprobamos que el tamaño de los conjuntos de entrenamiento y test sea correcto.
# print(f"Rock - Train: {len(rock_train)}, Test: {len(rock_test)}")
# print(f"Pop - Train: {len(pop_train)}, Test: {len(pop_test)}")
# print(f"Rap - Train: {len(rap_train)}, Test: {len(rap_test)}")

Rock - Train: 569977, Test: 63331
Pop - Train: 1254203, Test: 139356
Rap - Train: 868144, Test: 96461


## Tokenización y preprocesamiento

In [7]:
# Carga el tokenizador.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2-medium")

In [8]:
tokenizer.add_special_tokens({"additional_special_tokens": ["<|endoflyric|>"], "pad_token": "<|pad|>"})
tokenizer.pad_token = tokenizer.eos_token  # Usa <|endoftext|> como padding

Creamos las funciones de formateo y de tokenización.

In [11]:
# Función de formateo
def format_lyrics(example):
    """Formatea cada ejemplo sin incluir el género."""
    text = (
        f"[Artista: {example['artist']}]\n"
        f"[Canción: {example['title']}]\n"
        f"{example['lyrics']}\n<|endoflyric|>"
    )
    return {"text": text}

def tokenize_fn(examples):
    """Tokeniza el texto y crea labels para la pérdida."""
    tokens = tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,
        padding="max_length",
    )
    tokens["labels"] = tokens["input_ids"].copy()  # <-- Añadir etiquetas para la pérdida
    return tokens


In [None]:
# rock_train = [format_lyrics(x) for x in rock_train]
# rock_test = [format_lyrics(x) for x in rock_test]

In [None]:
# pop_train = [format_lyrics(x) for x in pop_train]
# pop_test = [format_lyrics(x) for x in pop_test]

In [None]:
# rap_train = [format_lyrics(x) for x in rap_train]
# rap_test = [format_lyrics(x) for x in rap_test]

Aplicamos el preprocesamiento a los datasets.

In [None]:
# rock_train = [tokenize_fn(x) for x in rock_train]
# rock_test = [tokenize_fn(x) for x in rock_test]

In [None]:
# pop_train = [tokenize_fn(x) for x in pop_train]
# pop_test = [tokenize_fn(x) for x in pop_test]

In [None]:
# rap_train = [tokenize_fn(x) for x in rap_train]
# rap_test = [tokenize_fn(x) for x in rap_test]

## Configuración del entrenamiento

Implementamos una función que contenga el entrenamiento completo para ejecutarla con los 3 datasets.

In [12]:
# --- 7. Aplicar `format_lyrics` y `tokenize_fn` ---
for dataset_name, dataset in zip(["rock", "pop", "rap"], [rock_dataset, pop_dataset, rap_dataset]):
    dataset = dataset.map(format_lyrics)
    dataset = dataset.map(tokenize_fn, batched=True, batch_size=1000)

    # --- 8. Dividir en train/test (90%-10%) ---
    total_samples = len(dataset)
    test_size = int(total_samples * 0.1)
    
    train_dataset = dataset.select(range(test_size, total_samples))
    test_dataset = dataset.select(range(test_size))

    # --- 9. Cargar el modelo GPT-2 ---
    model = GPT2LMHeadModel.from_pretrained("gpt2-medium")
    model.resize_token_embeddings(len(tokenizer))

    # --- 10. Definir los parámetros de entrenamiento ---
    training_args = TrainingArguments(
        output_dir=f"./lyrics_generator_{dataset_name}",
        num_train_epochs=2,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        learning_rate=3e-5,
        evaluation_strategy="steps",
        eval_steps=500,
        logging_steps=100,
        fp16=True,
        gradient_accumulation_steps=4,
        max_steps=1000,
        report_to="none",
    )

    # --- 11. Inicializar el Trainer ---
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
    )

    # --- 12. Entrenar el modelo ---
    print(f"Entrenando modelo para {dataset_name}...")
    trainer.train()

Map:   0%|          | 0/633308 [00:00<?, ? examples/s]

Entrenando modelo para rock...


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
500,1.4324,1.409617
1000,1.4091,1.398691


Map:   0%|          | 0/1393559 [00:00<?, ? examples/s]

Map:   0%|          | 0/1393559 [00:00<?, ? examples/s]



Entrenando modelo para pop...


Step,Training Loss,Validation Loss
500,1.4413,1.367713
1000,1.4419,1.358317


Map:   0%|          | 0/964605 [00:00<?, ? examples/s]

Map:   0%|          | 0/964605 [00:00<?, ? examples/s]



Entrenando modelo para rap...


Step,Training Loss,Validation Loss
500,2.6955,3.190799
1000,2.7266,3.171878
