In [6]:
# Input CSV that is loaded into a DataFrame with pd.read_csv further down.
CSV_PATH = "preguntas_respuestas.csv"
# Directory that receives the generated JSONL split files.
OUTPUT_DIR = "datos_fine_tuning"

In [31]:
import math
import json

def calculate_optimal_split(total_records, min_val_test=100):
    """Choose train/validation/test fractions from the dataset size.

    Base fractions (train / validation / test):
      * fewer than 1,000 records  -> 0.85 / 0.10 / 0.05
      * fewer than 10,000 records -> 0.80 / 0.10 / 0.10
      * otherwise                 -> 0.90 / 0.05 / 0.05

    If the validation or the test set would then hold fewer than
    ``min_val_test`` records, and the dataset has more than
    ``2 * min_val_test`` records, both sets are resized to
    ``min_val_test`` records each and training takes the remainder.
    Datasets too small to afford both minimums keep the base fractions.

    Note: when ``total_records`` is only slightly larger than
    ``2 * min_val_test`` the adjustment leaves very little for training
    (e.g. 201 records with the default minimum leaves a single training row).

    Args:
        total_records: Number of rows in the dataset.
        min_val_test: Minimum number of records wanted in each of the
            validation and test sets. Defaults to 100.

    Returns:
        Tuple ``(train_size, val_size, test_size)`` of fractions.
    """
    if total_records < 1000:
        train_size, val_size, test_size = 0.85, 0.10, 0.05
    elif total_records < 10000:
        train_size, val_size, test_size = 0.80, 0.10, 0.10
    else:
        train_size, val_size, test_size = 0.90, 0.05, 0.05

    # Record counts the base fractions would produce.
    val_records = math.floor(total_records * val_size)
    test_records = math.floor(total_records * test_size)
    too_small = val_records < min_val_test or test_records < min_val_test

    # Only enforce the minimum when the dataset can supply both held-out sets
    # and still have at least one record left over.
    if too_small and 2 * min_val_test < total_records:
        val_size = test_size = min_val_test / total_records
        train_size = 1 - (val_size + test_size)

    return train_size, val_size, test_size

def save_to_jsonl(df, filename, output_dir=None):
    """Write every row of ``df`` as one JSON object per line (JSONL).

    Args:
        df: DataFrame to export. Column labels become the JSON keys; the
            index is not written.
        filename: Name of the file to create inside the output directory.
        output_dir: Target directory. When omitted, the module-level
            ``OUTPUT_DIR`` is used. The directory is created if missing.

    Missing values (float NaN) are written as JSON ``null``.
    """
    if output_dir is None:
        output_dir = OUTPUT_DIR
    os.makedirs(output_dir, exist_ok=True)
    filepath = os.path.join(output_dir, filename)

    with open(filepath, 'w', encoding='utf-8') as f:
        # orient="records" keeps each column's own dtype; iterating row-wise
        # as Series would upcast ints to floats in mixed numeric frames.
        for record in df.to_dict(orient="records"):
            # json.dumps would emit NaN as the bare token NaN, which is not
            # valid JSON, so map it to None (serialized as null).
            cleaned = {
                key: None if isinstance(value, float) and math.isnan(value) else value
                for key, value in record.items()
            }
            f.write(json.dumps(cleaned, ensure_ascii=False) + '\n')
    print(f"✓ Generado: {filename} ({len(df)} registros)")

In [5]:
import os
# Make sure the output directory exists; exist_ok=True keeps re-runs of this cell from failing.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [7]:
import pandas as pd

# Load the full dataset and record its row count, which is passed to
# calculate_optimal_split to pick the split proportions.
df = pd.read_csv(CSV_PATH)
total_records = len(df)

In [9]:
# Fractions of the dataset assigned to training, validation and test.
train_size, val_size, test_size = calculate_optimal_split(total_records)

In [13]:
# Report the dataset size and the chosen split; the record counts shown are
# floor(total_records * fraction) for each subset.
print(
    "\nAnálisis del dataset:",
    f"Total de registros: {total_records}",
    "\nProporciones calculadas automáticamente:",
    f"Training:   {train_size*100:.1f}% ({math.floor(total_records*train_size)} registros)",
    f"Validation: {val_size*100:.1f}% ({math.floor(total_records*val_size)} registros)",
    f"Testing:    {test_size*100:.1f}% ({math.floor(total_records*test_size)} registros)",
    sep="\n",
)


Análisis del dataset:
Total de registros: 803

Proporciones calculadas automáticamente:
Training:   75.1% (603 registros)
Validation: 12.5% (100 registros)
Testing:    12.5% (100 registros)


In [20]:
# First split: carve the training set out of the full dataset. The remainder
# (temp_df) is divided into validation and test in the next step.
# random_state is fixed so the shuffle is reproducible.

from sklearn.model_selection import train_test_split

train_df, temp_df = train_test_split(
        df, 
        train_size=train_size,
        random_state=42,
        shuffle=True
    )

In [23]:
# Second split: divide the held-out remainder into validation and test.
# val_size and test_size are fractions of the whole dataset, so val_size is
# rescaled to a fraction of temp_df before splitting.
val_size_adjusted = val_size / (val_size + test_size)
val_df, test_df = train_test_split(
    temp_df,
    train_size=val_size_adjusted,
    random_state=42,
    shuffle=True
)

In [32]:
# Write each split to its own JSONL file using the save_to_jsonl helper.
print("\nGenerando archivos JSONL...")
save_to_jsonl(train_df, 'train.jsonl')
save_to_jsonl(val_df, 'valid.jsonl')
save_to_jsonl(test_df, 'test.jsonl')


Generando archivos JSONL...
✓ Generado: train.jsonl (603 registros)
✓ Generado: valid.jsonl (100 registros)
✓ Generado: test.jsonl (100 registros)
