In [1]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
import os




## Obtener datos de entrenamiento

In [2]:
docs_folder = 'docs'

def load_files(directorio):
    contenido = []
    for root, _, files in os.walk(directorio):
        for file in files:
            if file.endswith(".es.md"):
                file_path = os.path.join(root, file)
                with open(file_path, 'r', encoding='utf-8') as f:
                    contenido.append(f.read())
    return "\n\n".join(contenido)

contenido_documentacion = load_files(docs_folder)

texts = contenido_documentacion.split("\n\n")

## Preparar datos

In [3]:
# Crear un objeto Dataset con el contenido dividido
dataset = Dataset.from_dict({"text": texts})

# Dividir el dataset en entrenamiento y test (o validación)
train_test_split = dataset.train_test_split(test_size=0.1)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

## Tokenizar los datos

In [4]:
model_checkpoint = "gpt2"  # Modelo base
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Establecer el token de EOS como token de padding
tokenizer.pad_token = tokenizer.eos_token

def tokenize_and_prepare_labels(examples):
    # Tokenizar el texto
    tokenized_inputs = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].copy()
    pad_token_id = tokenizer.pad_token_id
    tokenized_inputs["labels"] = [[pad_token_id] + label[:-1] for label in tokenized_inputs["labels"]]
    
    return tokenized_inputs

# Aplicar esta función al dataset
tokenized_train = train_dataset.map(tokenize_and_prepare_labels, batched=True)
tokenized_test = test_dataset.map(tokenize_and_prepare_labels, batched=True)



config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]



merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]



tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/1091 [00:00<?, ? examples/s]

Map:   0%|          | 0/122 [00:00<?, ? examples/s]

## Entrenar modelo

In [5]:
# Cargar el modelo preentrenado
model = AutoModelForCausalLM.from_pretrained(model_checkpoint)

# Configurar los argumentos de entrenamiento
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
)

# Inicializar el Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
)

# Entrenar el modelo
trainer.train()

 92%|█████████▏| 500/545 [1:33:04<09:12, 12.28s/it]  

{'loss': 0.1564, 'learning_rate': 1.6513761467889911e-06, 'epoch': 0.92}


                                                   
 92%|█████████▏| 500/545 [1:37:15<09:12, 12.28s/it]

{'eval_loss': 0.006177566479891539, 'eval_runtime': 250.6912, 'eval_samples_per_second': 0.483, 'eval_steps_per_second': 0.243, 'epoch': 0.92}


100%|██████████| 545/545 [1:44:32<00:00, 11.51s/it]  

{'train_runtime': 6272.7509, 'train_samples_per_second': 0.174, 'train_steps_per_second': 0.087, 'train_loss': 0.14406084303462177, 'epoch': 1.0}





TrainOutput(global_step=545, training_loss=0.14406084303462177, metrics={'train_runtime': 6272.7509, 'train_samples_per_second': 0.174, 'train_steps_per_second': 0.087, 'train_loss': 0.14406084303462177, 'epoch': 1.0})

In [6]:
# Guardar el modelo ajustado
trainer.save_model("/mnt/data/finetuned_model")