In [None]:
# Instalar librerías necesarias
!pip install transformers datasets torch

from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments
from datasets import load_dataset, Dataset
import torch
import os
from google.colab import files

# Ask the user to upload the .txt corpus through the Colab file picker.
print("Por favor, sube el archivo .txt con las intervenciones del parlamentario.")
uploaded = files.upload()
# Nothing uploaded (e.g. the dialog was dismissed): fail with a clear message
# instead of an IndexError on the lookup below.
if not uploaded:
    raise RuntimeError("No se subió ningún archivo .txt; vuelve a ejecutar la celda.")
# Only the first uploaded file is used.
filename = next(iter(uploaded))

# Read the whole corpus into memory as UTF-8 text.
with open(filename, 'r', encoding='utf-8') as f:
    text = f.read()
# An empty corpus would produce a dataset with nothing useful to train on.
if not text.strip():
    raise ValueError(f"El archivo '{filename}' está vacío.")

# Wrap the corpus in a Hugging Face Dataset with a single "text" row.
dataset = Dataset.from_dict({"text": [text]})

# Load the GPT-2 tokenizer. GPT-2 ships without a padding token, so the
# end-of-sequence token is reused for padding.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples, block_size=512):
    """Tokenize a batch of texts into fixed-length causal-LM training blocks.

    Each text is encoded in full and then split into consecutive blocks of
    ``block_size`` tokens, so long texts contribute every token instead of
    being truncated to the first block. The final block of each text is
    right-padded with the tokenizer's pad token.

    Args:
        examples: Batch dict with a "text" key holding a list of strings.
        block_size: Number of tokens per training example.

    Returns:
        Dict with "input_ids", "attention_mask" and "labels", each a list of
        ``block_size``-long lists. Labels equal the input ids, except at
        padded positions where they are -100 so the loss ignores them.
        A text that encodes to zero tokens contributes no rows.
    """
    pad_id = tokenizer.pad_token_id
    input_ids, attention_mask, labels = [], [], []
    for text in examples["text"]:
        # Encode without truncation; verbose=False silences the
        # "sequence longer than the model maximum" warning, which is
        # expected here because the ids are chunked right below.
        token_ids = tokenizer(text, verbose=False)["input_ids"]
        for start in range(0, len(token_ids), block_size):
            chunk = token_ids[start:start + block_size]
            n_pad = block_size - len(chunk)
            input_ids.append(chunk + [pad_id] * n_pad)
            attention_mask.append([1] * len(chunk) + [0] * n_pad)
            # -100 is the index the loss function skips, so padding
            # (which shares its id with EOS) never contributes to the loss.
            labels.append(chunk + [-100] * n_pad)
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }


# The number of output rows differs from the number of input rows, so the
# original columns must be dropped for the mapped batch to be consistent.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names,
)

# Load the pretrained GPT-2 language-model weights to fine-tune.
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Training hyperparameters, collected in one mapping and expanded below.
training_config = {
    "output_dir": "./results",
    "overwrite_output_dir": True,
    "num_train_epochs": 10,  # Raise this if more training is needed
    "per_device_train_batch_size": 1,
    "save_steps": 500,
    "save_total_limit": 2,
    "prediction_loss_only": True,
    "logging_dir": "./logs",
    "logging_steps": 100,
    "learning_rate": 5e-5,
    "warmup_steps": 100,
    "weight_decay": 0.01,
    "report_to": "none",
}
training_args = TrainingArguments(**training_config)

# Build the trainer around the model and the tokenized dataset.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    tokenizer=tokenizer,
)

# Run fine-tuning.
print("Entrenando el modelo... Esto puede tardar varias horas.")
trainer.train()

# Persist the fine-tuned model first, then the tokenizer, to the same folder.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./fine_tuned_gpt2")

def generate_text(prompt, max_length=50, temperature=0.7):
    """Sample a continuation of ``prompt`` from the fine-tuned model.

    Args:
        prompt: Text the generation starts from. An empty string falls back
            to the tokenizer's BOS (or EOS) token so generation still has a
            starting token.
        max_length: Passed to ``model.generate`` as ``max_length``.
        temperature: Sampling temperature passed to ``model.generate``.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    # An empty prompt encodes to zero tokens, which generate() cannot
    # continue from; seed it with a special token instead (it is stripped
    # again when decoding with skip_special_tokens=True).
    if not prompt:
        prompt = tokenizer.bos_token or tokenizer.eos_token

    # Encode the prompt and move it to the same device as the model.
    inputs = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
    # Single unpadded sequence: every position is a real token.
    attention_mask = torch.ones_like(inputs)

    # The model may still be in training mode after fine-tuning; switch to
    # eval mode so dropout does not perturb the sampled tokens.
    model.eval()
    outputs = model.generate(
        inputs,
        attention_mask=attention_mask,
        max_length=max_length,
        temperature=temperature,
        num_return_sequences=1,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        pad_token_id=tokenizer.eos_token_id,  # GPT-2 has no dedicated pad token
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Interactive loop: read a prompt, print a generated continuation, repeat
# until the user types 'salir'.
print("¡Puedes comenzar a interactuar con el modelo!")
while True:
    # Strip surrounding whitespace so "salir " still exits and the prompt
    # handed to the model has no stray leading/trailing spaces.
    prompt = input("Introduce el inicio del texto (o escribe 'salir' para terminar): ").strip()
    if prompt.lower() == "salir":
        print("¡Hasta luego!")
        break
    # Pressing Enter on an empty line gives nothing to continue; ask again.
    if not prompt:
        continue
    print("Respondiendo...")
    print(generate_text(prompt, max_length=100, temperature=0.8))


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

Saving intervencionesCasado.txt to intervencionesCasado.txt


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

  trainer = Trainer(


Entrenando el modelo... Esto puede tardar varias horas.


Step,Training Loss


¡Puedes comenzar a interactuar con el modelo!
Introduce el inicio del texto (o escribe 'salir' para terminar): Sánchez es usted
Respondiendo...
Sánchez es ustedo es el ficiento, en eso, es a los, de los juego y lorías, y a los, a los, a los, a, a, a, y, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a,
Introduce el inicio del texto (o escribe 'salir' para terminar): Mi partido político
Respondiendo...
Mi partido político, to have all the a caustic water.

Vanduz-Jungen, a Chinese journalist and one of the best-known scientists in the world of science, is a strong man who says that was his idea of the origin of life is to try to say that if the world has always been created by the right thing, because life is so, and we all will make sure that the world is created by being able to understand the
