In [None]:
# train_model.py
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

import seaborn as sns

In [None]:
# 1. Load the dataset
# Start from seaborn's "titanic" example frame, keep only the target plus the
# modelling columns, discard rows without a target value, and renumber the
# index from zero. Missing feature values are left in place here.
df = (
    sns.load_dataset("titanic")[
        ['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']
    ]
    .dropna(subset=['survived'])
    .reset_index(drop=True)
)

In [None]:
# 2. Features and target
# `y` is the 'survived' column; `X` is every remaining column of `df`.
y = df['survived']
X = df.drop('survived', axis=1)

In [None]:
# 3. Columns grouped by type
# Columns routed through the numeric preprocessing branch.
num_features = [
    'age',
    'sibsp',
    'parch',
    'fare',
    'pclass',
]
# Columns routed through the categorical preprocessing branch.
cat_features = [
    'sex',
    'embarked',
]


In [None]:
# 4. Preprocessing
# Numeric branch: fill missing values with the column median, then scale.
num_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' means categories not seen
# during fit do not raise at transform time.
cat_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Apply each branch to its own list of columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features),
    ]
)

In [None]:
# 5. Full pipeline
# Preprocessing followed by a random forest; random_state is fixed so the
# fitted model is repeatable across runs.
clf = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('rf', RandomForestClassifier(random_state=42, n_estimators=100)),
    ]
)

In [None]:
# 6. Split and train
# Hold out 20% of the rows for testing; stratify on `y` and fix the seed so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
clf.fit(X_train, y_train)

In [None]:
# 7. Evaluation
# Hard class predictions feed the classification report; the second column
# of predict_proba (index 1) is used as the score for ROC AUC.
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]
print(classification_report(y_test, y_pred))
try:
    print("ROC AUC:", roc_auc_score(y_test, y_proba))
except Exception as exc:
    # Stay best-effort so a failing metric does not abort the notebook, but
    # report the reason instead of silently skipping the ROC AUC line.
    print(f"ROC AUC no disponible: {exc}")

In [None]:
# 8. Save the pipeline
# Persist the whole fitted pipeline (preprocessing + model) to a file in the
# current working directory, then confirm on stdout.
joblib.dump(value=clf, filename="model.joblib")
print("Modelo guardado en model.joblib")