In [1]:
from datasets import Dataset
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import chardet

# Detectar la codificaci√≥n
with open('informacion.csv', 'rb') as f:
    result = chardet.detect(f.read())
    encoding = result['encoding']

In [3]:
# Intentar cargar el archivo CSV con la codificaci√≥n detectada y el delimitador correcto
try:
    df = pd.read_csv('informacion.csv', encoding=encoding, sep=';', on_bad_lines='skip')
    dataset1 = Dataset.from_pandas(df)
except pd.errors.ParserError as e:
    print(f"Error al parsear el archivo CSV: {e}")

In [4]:
# Leer el dataset
print(dataset1)

Dataset({
    features: ['pregunta', 'respuesta'],
    num_rows: 85
})


In [5]:
# Contar el numero de filas y colunas de dataset1
print(f"Numero de filas: {len(dataset1)}")
print(f"Numero de columnas: {len(dataset1.column_names)}")

Numero de filas: 85
Numero de columnas: 2


In [6]:
# Cargar el modelo preentrenado de Hugging Face
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("mrm8488/distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es")
model = AutoModelForQuestionAnswering.from_pretrained("mrm8488/distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es")

Some weights of the model checkpoint at mrm8488/distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Pruebas con el modelo
from transformers import pipeline

# Cargar el pipeline de pregunta-respuesta
nlp = pipeline('question-answering', model=model, tokenizer=tokenizer)

# Ejemplo de texto y pregunta
context = "El modelo de lenguaje de Transformers es desarrollado por Hugging Face."
question = "¬øQui√©n desarroll√≥ el modelo de lenguaje de Transformers?"

# Obtener la respuesta
result = nlp(question=question, context=context)
print(result)


{'score': 0.9272115230560303, 'start': 58, 'end': 70, 'answer': 'Hugging Face'}


In [8]:
# Tokenizar los datos
def tokenize_function(examples):
    return tokenizer(examples['pregunta'], examples['respuesta'], truncation=True)

tokenized_datasets = dataset1.map(tokenize_function, batched=True)

Map:   0%|          | 0/85 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 85/85 [00:00<00:00, 197.73 examples/s]


In [9]:
from transformers import TrainingArguments

# Configurar los argumentos de entrenamiento con early stopping y weight decay
training_args = TrainingArguments(
    output_dir="./results", # Directorio para guardar los resultados
    evaluation_strategy="epoch", # Evaluar al final de cada √©poca
    save_strategy="epoch", # Guardar al final de cada √©poca
    learning_rate=2e-5, # Tasa de aprendizaje
    per_device_train_batch_size=8, # Tama√±o del lote de entrenamiento por dispositivo
    per_device_eval_batch_size=8, # Tama√±o del lote de evaluaci√≥n por dispositivo
    num_train_epochs=3, # N√∫mero de √©pocas de entrenamiento
    weight_decay=0.01, # Peso de la regularizaci√≥n L2
    load_best_model_at_end=True,  # Cargar el mejor modelo al final del entrenamiento
    metric_for_best_model="eval_loss",  # M√©trica para seleccionar el mejor modelo
    greater_is_better=False,  # Indica que una menor p√©rdida es mejor
    save_total_limit=1,  # Limita el n√∫mero de modelos guardados
    logging_dir='./logs',  # Directorio para los logs
    logging_steps=10,  # Log cada 10 pasos
)



In [10]:
from transformers import Trainer

# Crear el objeto Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets,
)