# Entrenamiento de Modelos — Penguin Classifier

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

from utils.model_trainer import ModelTrainer

## 1. Cargar datos

In [None]:
# Read the penguin dataset from its absolute path, report its (rows, columns)
# shape, and leave the first rows as the cell's displayed value.
csv_path = '/app/data/penguins_v1.csv'
df = pd.read_csv(csv_path)
print(f'Dimensiones del dataset: {df.shape}')
df.head()

## 2. Limpieza y exploración

In [None]:
# Quick data-quality report: per-column null counts, number of fully
# duplicated rows, and the dtype/memory summary printed by DataFrame.info().
# NOTE(review): this cell only reports nulls and duplicates; no statement
# visible here removes them — confirm the CSV is already clean or that the
# downstream training code tolerates missing values.
null_counts = df.isnull().sum()
duplicate_count = df.duplicated().sum()

print('Valores nulos por columna:')
print(null_counts)
print(f'\nFilas duplicadas: {duplicate_count}')
df.info()

## 3. Preparación de datos

In [None]:
# Drop the 'id' column, then split the frame into the target (species) and
# the predictor matrix.
df_clean = df.drop(columns='id')
y = df_clean['species']

# Feature engineering: append two derived columns to the predictors.
#   bill_ratio   = bill length divided by bill depth
#   body_mass_kg = body mass converted from grams to kilograms
# NOTE(review): body_mass_kg is a pure rescaling of body_mass_g, which is
# still present in X — confirm that keeping both columns is intended.
X = df_clean.drop(columns='species').assign(
    bill_ratio=lambda frame: frame['bill_length_mm'] / frame['bill_depth_mm'],
    body_mass_kg=lambda frame: frame['body_mass_g'] / 1000,
)

# Show how many samples each class has before splitting.
class_counts = y.value_counts()
print(f'Distribución de clases:\n{class_counts}')

In [None]:
# Hold out 20% of the rows for testing. The split is seeded so it is
# reproducible, and stratified on y so both partitions keep the class
# proportions of the full dataset.
split_options = {
    'test_size': 0.2,
    'random_state': 42,
    'stratify': y,
}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
print(f'Train: {X_train.shape}, Test: {X_test.shape}')

## 4. Inicializar ModelTrainer

In [None]:
# Create the shared trainer that every model cell below goes through.
# NOTE(review): presumably models_dir receives the fitted artifacts and
# report_path the pickled metrics report — confirm against ModelTrainer.
trainer_config = {
    'models_dir': '/app/models',
    'report_path': '/app/report/model_metrics.pkl',
}
trainer = ModelTrainer(**trainer_config)

## 5. Entrenar Random Forest

In [None]:
# Random Forest: 100 trees, depth capped at 10, fixed seed, and n_jobs=-1 so
# scikit-learn may use every available processor.
rf_estimator = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    n_jobs=-1,
)

# A fresh StandardScaler is handed to the trainer together with the split.
# The returned value is kept for the summary table at the end of the notebook.
rf_metrics = trainer.train_and_save(
    name='randomforest',
    estimator=rf_estimator,
    scaler=StandardScaler(),
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

## 6. Entrenar SVM

In [None]:
# Support Vector Machine with an RBF kernel and regularization strength C=1.0.
svm_estimator = SVC(
    kernel='rbf',
    C=1.0,
    random_state=42,
)

# A fresh StandardScaler is handed to the trainer together with the split.
# The returned value is kept for the summary table at the end of the notebook.
svm_metrics = trainer.train_and_save(
    name='svm',
    estimator=svm_estimator,
    scaler=StandardScaler(),
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

## 7. Entrenar Gradient Boosting

In [None]:
# Gradient Boosting: 100 boosting stages, trees of depth 5, learning rate 0.1,
# fixed seed.
gb_estimator = GradientBoostingClassifier(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    random_state=42,
)

# A fresh StandardScaler is handed to the trainer together with the split.
# The returned value is kept for the summary table at the end of the notebook.
gb_metrics = trainer.train_and_save(
    name='gradientboosting',
    estimator=gb_estimator,
    scaler=StandardScaler(),
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

## 8. Resumen de métricas

In [None]:
# Side-by-side comparison: one row per trained model, indexed by its 'model'
# field.
# NOTE(review): assumes each metrics object is dict-like and carries a
# 'model' key — confirm against ModelTrainer.train_and_save.
metric_rows = [rf_metrics, svm_metrics, gb_metrics]
pd.DataFrame(metric_rows).set_index('model')