# 🎬 Fine-Tuning GPT-2 en Español para Generación de Guiones
Este notebook ajusta (fine-tuning) el modelo `mrm8488/spanish-gpt2` con Hugging Face Transformers, usando un dataset `.jsonl` de guiones en el que cada línea es un objeto JSON con un campo `text`.

In [None]:
# ✅ 1. Install dependencies
# Notebook shell escape: installs transformers, datasets and accelerate with
# pip; --quiet trims pip's output. No versions are pinned, so the latest
# releases are installed.
!pip install transformers datasets accelerate --quiet

In [None]:
# ✅ 2. Upload the JSONL file with the scripts
# google.colab is only importable inside a Google Colab runtime.
from google.colab import files
# files.upload() shows Colab's upload widget; its return value is kept in
# `uploaded` (presumably a mapping of file name -> file contents — confirm
# against the Colab docs).
uploaded = files.upload()

In [None]:
# ✅ 3. Load and prepare the dataset
from datasets import load_dataset

# Use the .jsonl file uploaded in step 2 when there is one, so the upload does
# not have to be named exactly like the example (Colab also renames repeated
# uploads, e.g. "name (1).jsonl"). Fall back to the example filename when
# step 2 was skipped or no .jsonl file was uploaded.
try:
    jsonl_names = [name for name in uploaded if name.endswith('.jsonl')]
except NameError:
    jsonl_names = []
data_file = jsonl_names[0] if jsonl_names else 'guiones_ejemplo.jsonl'

# The 'json' loader reads JSON Lines files; split='train' returns a single
# Dataset instead of a DatasetDict.
dataset = load_dataset('json', data_files=data_file, split='train')
# Fixed seed keeps the shuffled order reproducible between runs.
dataset = dataset.shuffle(seed=42)
# Bare expression so the notebook displays the dataset summary.
dataset

In [None]:
# ✅ 4. Tokenize the data
from transformers import AutoTokenizer

model_name = 'mrm8488/spanish-gpt2'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# GPT-2 style tokenizers usually ship without a padding token, and the
# padding='max_length' call below raises a ValueError in that case. Reusing
# the end-of-text token is the customary workaround; the guard leaves alone
# any padding token the checkpoint already defines.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    """Tokenize one batch of examples for causal language modelling.

    Args:
        examples: Batch passed in by ``Dataset.map(batched=True)``; only its
            ``'text'`` entry is read.

    Returns:
        The tokenizer output for the batch, with every sequence truncated or
        padded to exactly 128 tokens.
    """
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)


tokenized_dataset = dataset.map(tokenize_function, batched=True)
# Have the dataset return PyTorch tensors when indexed.
tokenized_dataset.set_format('torch')

In [None]:
# ✅ 5. Load the model
from transformers import AutoModelForCausalLM

# Loads the pretrained causal-LM checkpoint named by `model_name`
# (defined in step 4 alongside the tokenizer).
model = AutoModelForCausalLM.from_pretrained(model_name)

In [None]:
# ✅ 6. Configure and run training
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# No evaluation setting is passed on purpose: evaluation is disabled by
# default, and the `evaluation_strategy` keyword was renamed to
# `eval_strategy` in newer transformers releases. Since step 1 installs an
# unpinned version, spelling out the old keyword can fail; omitting it works
# on old and new releases alike.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=500,
    logging_dir='./logs',
    logging_steps=50,
)

# mlm=False selects causal (next-token) language modelling instead of
# masked-token prediction when the collator builds the labels.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

# Runs the full training loop; checkpoints are written under ./results.
trainer.train()

In [None]:
# ✅ 7. Save the fine-tuned model
# Model weights and tokenizer files are written to the same directory.
FINETUNED_MODEL_DIR = './modelo-guion'

trainer.save_model(FINETUNED_MODEL_DIR)
tokenizer.save_pretrained(FINETUNED_MODEL_DIR)