# Titanic Survival Prediction - Feature Engineering
# =================================================

# ## 📋 Objetivo
# Crear nuevas características (features) para mejorar el poder predictivo de nuestros modelos:
# - Extraer información de variables existentes
# - Crear variables de interacción
# - Transformar variables categóricas
# - Escalar variables numéricas
# - Preparar datos para modelado

## 1. Importación de Librerías

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder, TargetEncoder
import re
import warnings

# Funciones helper
import sys
import os
sys.path.append('../src')
from utils.helpers import save_current_plot

In [None]:
# Notebook-wide configuration: suppress all warnings and set the plot style/palette.
warnings.filterwarnings("ignore")
plt.style.use("seaborn-v0_8")
sns.set_palette("husl")

print("✅ Librerías importadas correctamente")

## 2. Carga de Datos Limpios

In [None]:
# Load the cleaned dataset produced by the previous step.
df = pd.read_csv("../data/processed/train_processed.csv")

print("🚢 DATASET LIMPIO CARGADO")
print("=" * 30)
print(f"📊 Dimensiones: {df.shape}")
print(f"👥 Total de registros: {len(df)}")

print("\n📋 Información del dataset:")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() emitted a stray "None" line.
df.info()

print("\n🔍 Primeras filas:")
print(df.head())

# Count missing values per column and report both outcomes explicitly.
missing_check = df.isnull().sum()
total_missing = int(missing_check.sum())
print("\n❌ Verificación de valores faltantes:")
print(missing_check[missing_check > 0])

if total_missing == 0:
    print("✅ Confirmado: No hay valores faltantes")
else:
    # Later steps compare and encode these columns, so make the problem visible
    # instead of continuing silently.
    affected_columns = int((missing_check > 0).sum())
    print(
        f"⚠️ Atención: {total_missing} valores faltantes en "
        f"{affected_columns} columnas"
    )

## 3. Feature Engineering: Variables Derivadas

In [None]:
print("\n🔧 CREACIÓN DE NUEVAS CARACTERÍSTICAS")
print("=" * 40)

# Work on a copy so the loaded frame `df` is left unmodified by feature engineering.
df_features = df.copy()

### 3.1 Family Size (Tamaño de Familia)

In [None]:
print("\n👨‍👩‍👧‍👦 FEATURE 1: FAMILY SIZE")
print("=" * 25)

# FamilySize = SibSp + Parch + 1 (the +1 counts the passenger themselves)
df_features["FamilySize"] = df_features["SibSp"] + df_features["Parch"] + 1

print("Distribución de FamilySize:")
family_size_dist = df_features["FamilySize"].value_counts().sort_index()
print(family_size_dist)

# Per family size: row count, sum of Survived and mean of Survived (rounded to 3 decimals)
family_survival = (
    df_features.groupby("FamilySize")["Survived"].agg(["count", "sum", "mean"]).round(3)
)
family_survival.columns = ["Total", "Supervivientes", "Tasa_Supervivencia"]
print("\nSupervivencia por tamaño de familia:")
print(family_survival)

# Visualization: three side-by-side panels on one figure
plt.figure(figsize=(15, 5))

# Panel 1: histogram with one bin per integer family size
plt.subplot(1, 3, 1)
df_features["FamilySize"].hist(
    bins=range(1, df_features["FamilySize"].max() + 2), alpha=0.7, color="skyblue"
)
plt.title("Distribución de Family Size")
plt.xlabel("Tamaño de Familia")
plt.ylabel("Frecuencia")

# Panel 2: mean of Survived per family size
plt.subplot(1, 3, 2)
family_survival["Tasa_Supervivencia"].plot(kind="bar", color="lightgreen")
plt.title("Tasa de Supervivencia por Family Size")
plt.xlabel("Tamaño de Familia")
plt.ylabel("Tasa de Supervivencia")
plt.xticks(rotation=0)

# Panel 3: boxplot of Survived per family size
# NOTE(review): Survived is presumably a 0/1 flag, in which case a boxplot of it
# carries little information — confirm whether a bar/point plot was intended.
plt.subplot(1, 3, 3)
sns.boxplot(data=df_features, x="FamilySize", y="Survived", palette="Set2")
plt.title("Supervivencia vs Family Size")
plt.xlabel("Tamaño de Familia")

plt.tight_layout()
# save_current_plot is a project helper; presumably it writes the current figure
# under the given directory — verify in utils.helpers.
save_current_plot("family_size_analysis", "../results/figures/feature_engineering/")
plt.show()

### 3.2 Is Alone (Viaja Solo)

In [None]:
print("\n🚶 FEATURE 2: IS ALONE")
print("=" * 20)

# IsAlone = 1 when FamilySize == 1, otherwise 0
df_features["IsAlone"] = (df_features["FamilySize"] == 1).astype(int)

# Contingency table IsAlone x Survived, with row/column totals
alone_analysis = pd.crosstab(
    df_features["IsAlone"], df_features["Survived"], margins=True
)
print("Supervivencia por viajar solo:")
print(alone_analysis)

# Same table normalized per IsAlone row and expressed as percentages
alone_survival_pct = (
    pd.crosstab(df_features["IsAlone"], df_features["Survived"], normalize="index")
    * 100
)
print("\nPorcentajes:")
print(alone_survival_pct.round(1))

# Visualization: two side-by-side panels on one figure
plt.figure(figsize=(10, 4))

# Panel 1: Survived aggregated per IsAlone value (seaborn barplot)
plt.subplot(1, 2, 1)
sns.barplot(
    data=df_features, x="IsAlone", y="Survived", palette=["coral", "lightgreen"]
)
plt.title("Supervivencia: Solo vs Acompañado")
plt.xlabel("Viaja Solo (0=No, 1=Sí)")
plt.ylabel("Tasa de Supervivencia")

# Panel 2: row counts per IsAlone value, split by Survived
plt.subplot(1, 2, 2)
sns.countplot(
    data=df_features, x="IsAlone", hue="Survived", palette=["crimson", "forestgreen"]
)
plt.title("Conteo: Solo vs Acompañado")
plt.xlabel("Viaja Solo (0=No, 1=Sí)")

plt.tight_layout()
# Project helper; presumably saves the current figure — verify in utils.helpers.
save_current_plot("is_alone_analysis", "../results/figures/feature_engineering/")
plt.show()

### 3.3 Age Groups (Categorías de Edad)

In [None]:
print("\n👶👨👴 FEATURE 3: AGE GROUPS")
print("=" * 25)


# Exclusive upper age bound of each group, in ascending order. Ages at or above
# the last bound are labelled "Senior".
_AGE_GROUP_BOUNDS = (
    (12, "Child"),
    (18, "Teen"),
    (35, "Young_Adult"),
    (60, "Middle_Age"),
)


def categorize_age(age):
    """Map an age in years to a coarse age-group label.

    Args:
        age: Age in years (int or float). May be missing (NaN/None).

    Returns:
        "Child" (< 12), "Teen" (< 18), "Young_Adult" (< 35),
        "Middle_Age" (< 60) or "Senior" (>= 60). A missing age returns NaN so
        that missingness propagates instead of being counted as a group.
    """
    # NaN fails every "<" comparison, so without this guard a missing age would
    # fall through all bounds and be silently labelled "Senior".
    if pd.isna(age):
        return np.nan
    for upper_bound, label in _AGE_GROUP_BOUNDS:
        if age < upper_bound:
            return label
    return "Senior"


# Label every passenger with an age group derived from the Age column.
df_features["AgeGroup"] = df_features["Age"].apply(categorize_age)

# Per age group: row count, sum of Survived and mean of Survived (rounded to 3 decimals)
age_group_analysis = (
    df_features.groupby("AgeGroup")["Survived"].agg(["count", "sum", "mean"]).round(3)
)
age_group_analysis.columns = ["Total", "Supervivientes", "Tasa_Supervivencia"]
print("Supervivencia por grupo etario:")
print(age_group_analysis)

# Visualization: three side-by-side panels on one figure
plt.figure(figsize=(15, 5))

# Panel 1: number of passengers per age group
plt.subplot(1, 3, 1)
df_features["AgeGroup"].value_counts().plot(kind="bar", color="lightblue")
plt.title("Distribución por Grupo Etario")
plt.xlabel("Grupo de Edad")
plt.ylabel("Cantidad")
plt.xticks(rotation=45)

# Panel 2: mean of Survived per age group
plt.subplot(1, 3, 2)
age_group_analysis["Tasa_Supervivencia"].plot(kind="bar", color="orange")
plt.title("Tasa de Supervivencia por Edad")
plt.xlabel("Grupo de Edad")
plt.ylabel("Tasa de Supervivencia")
plt.xticks(rotation=45)

# Panel 3: Age distribution inside each group
plt.subplot(1, 3, 3)
sns.boxplot(data=df_features, x="AgeGroup", y="Age", palette="Set3")
plt.title("Distribución de Edad por Grupo")
plt.xlabel("Grupo de Edad")
plt.xticks(rotation=45)

plt.tight_layout()
# Project helper; presumably saves the current figure — verify in utils.helpers.
save_current_plot("age_groups_analysis", "../results/figures/feature_engineering/")
plt.show()

### 3.4 Fare Bins (Categorías de Tarifa)

In [None]:
print("\n💰 FEATURE 4: FARE BINS")
print("=" * 20)

# Split Fare into four quantile-based bins (quartiles) with ordered labels
df_features["FareBin"] = pd.qcut(
    df_features["Fare"], q=4, labels=["Low", "Medium", "High", "Premium"]
)

# Per fare bin: row count, sum of Survived and mean of Survived (rounded to 3 decimals)
fare_bin_analysis = (
    df_features.groupby("FareBin")["Survived"].agg(["count", "sum", "mean"]).round(3)
)
fare_bin_analysis.columns = ["Total", "Supervivientes", "Tasa_Supervivencia"]
print("Supervivencia por categoría de tarifa:")
print(fare_bin_analysis)

# Show the observed min/max Fare inside each bin
fare_ranges = df_features.groupby("FareBin")["Fare"].agg(["min", "max"]).round(2)
print("\nRangos de tarifa por categoría:")
print(fare_ranges)

# Visualization: three side-by-side panels on one figure
plt.figure(figsize=(15, 5))

# Panel 1: number of passengers per fare bin
plt.subplot(1, 3, 1)
df_features["FareBin"].value_counts().plot(kind="bar", color="gold")
plt.title("Distribución por Categoría de Tarifa")
plt.xlabel("Categoría de Tarifa")
plt.ylabel("Cantidad")
plt.xticks(rotation=0)

# Panel 2: mean of Survived per fare bin
plt.subplot(1, 3, 2)
fare_bin_analysis["Tasa_Supervivencia"].plot(kind="bar", color="green")
plt.title("Supervivencia por Categoría de Tarifa")
plt.xlabel("Categoría de Tarifa")
plt.ylabel("Tasa de Supervivencia")
plt.xticks(rotation=0)

# Panel 3: Fare distribution inside each bin
plt.subplot(1, 3, 3)
sns.boxplot(data=df_features, x="FareBin", y="Fare", palette="viridis")
plt.title("Distribución de Tarifa por Categoría")
plt.xlabel("Categoría de Tarifa")

plt.tight_layout()
# Project helper; presumably saves the current figure — verify in utils.helpers.
save_current_plot("fare_bins_analysis", "../results/figures/feature_engineering/")
plt.show()

### 3.5 Títulos Extraídos del Nombre (usando datos originales)

In [None]:
print("\n🎩 FEATURE 5: TITLES FROM NAMES")
print("=" * 30)

# Load the raw file to get the original passenger names
# (the Name column is read from this frame, not from the processed one).
df_original = pd.read_csv("../data/raw/titanic.csv")


# Honorific inside a name such as "Surname, Title. Given names": a run of ASCII
# letters preceded by a space and followed by a period. Written as a raw string
# so that "\." reaches the regex engine instead of being an invalid string escape.
_TITLE_PATTERN = re.compile(r" ([A-Za-z]+)\.")


def extract_title(name):
    """Extract the honorific (e.g. "Mr", "Miss") from a passenger name.

    Args:
        name: Full name string. Non-string values (e.g. NaN or None for a
            missing name) are tolerated.

    Returns:
        The first run of letters that follows a space and precedes a period,
        or "" when there is no such run or `name` is not a string.
    """
    if not isinstance(name, str):
        return ""
    match = _TITLE_PATTERN.search(name)
    return match.group(1) if match else ""


# Attach each passenger's title. The names come from a different CSV than
# df_features, so plain index alignment would silently pair the wrong rows if the
# processed file dropped or reordered passengers. Match on PassengerId when both
# frames carry it; otherwise only pair row by row when the row counts agree.
if "PassengerId" in df_features.columns and "PassengerId" in df_original.columns:
    titles_by_id = (
        df_original.drop_duplicates("PassengerId")
        .set_index("PassengerId")["Name"]
        .apply(extract_title)
    )
    # Passengers without a match get "", the same value extract_title uses for
    # "no title found".
    df_features["Title"] = df_features["PassengerId"].map(titles_by_id).fillna("")
elif len(df_original) == len(df_features):
    df_features["Title"] = df_original["Name"].apply(extract_title).to_numpy()
else:
    raise ValueError(
        "Cannot align raw names with processed rows: no shared PassengerId column "
        f"and row counts differ ({len(df_original)} raw vs {len(df_features)} processed)"
    )

# Inspect the distribution of raw titles
title_counts = df_features["Title"].value_counts()
print("Distribución de títulos:")
print(title_counts)


# Raw honorific -> simplified category. Anything not listed collapses to "Rare".
# "Ms" is grouped with "Miss" (like the French "Mlle") so it does not end up as a
# separate "Rare" category.
_TITLE_GROUPS = {
    "Mr": "Mr",
    "Miss": "Miss",
    "Mlle": "Miss",
    "Ms": "Miss",
    "Mrs": "Mrs",
    "Mme": "Mrs",
    "Master": "Master",
    "Dr": "Officer",
    "Rev": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Capt": "Officer",
    "Countess": "Royalty",
    "Lady": "Royalty",
    "Sir": "Royalty",
    "Don": "Royalty",
    "Dona": "Royalty",
    "Jonkheer": "Royalty",
}


def simplify_title(title):
    """Collapse a raw honorific into one of a few broad categories.

    Args:
        title: Raw title such as "Mr", "Mlle" or "Col".

    Returns:
        One of "Mr", "Miss", "Mrs", "Master", "Officer", "Royalty", or "Rare"
        for any title that is not in the lookup table (including "").
    """
    return _TITLE_GROUPS.get(title, "Rare")


# Collapse raw titles into the simplified categories.
df_features["Title_Simplified"] = df_features["Title"].apply(simplify_title)

# Per simplified title: row count, sum of Survived and mean of Survived (rounded to 3 decimals)
title_analysis = (
    df_features.groupby("Title_Simplified")["Survived"]
    .agg(["count", "sum", "mean"])
    .round(3)
)
title_analysis.columns = ["Total", "Supervivientes", "Tasa_Supervivencia"]
print("\nSupervivencia por título:")
print(title_analysis)

# Visualization: three side-by-side panels on one figure
plt.figure(figsize=(15, 5))

# Panel 1: number of passengers per simplified title
plt.subplot(1, 3, 1)
df_features["Title_Simplified"].value_counts().plot(kind="bar", color="purple")
plt.title("Distribución de Títulos Simplificados")
plt.xlabel("Título")
plt.ylabel("Cantidad")
plt.xticks(rotation=45)

# Panel 2: mean of Survived per simplified title
plt.subplot(1, 3, 2)
title_analysis["Tasa_Supervivencia"].plot(kind="bar", color="pink")
plt.title("Supervivencia por Título")
plt.xlabel("Título")
plt.ylabel("Tasa de Supervivencia")
plt.xticks(rotation=45)

# Panel 3: row counts per simplified title, split by Survived
plt.subplot(1, 3, 3)
sns.countplot(
    data=df_features,
    x="Title_Simplified",
    hue="Survived",
    palette=["crimson", "forestgreen"],
)
plt.title("Conteo de Supervivencia por Título")
plt.xlabel("Título")
plt.xticks(rotation=45)

plt.tight_layout()

# Project helper; presumably saves the current figure — verify in utils.helpers.
save_current_plot("titles_analysis", "../results/figures/feature_engineering/")
plt.show()

### 3.6 Variables de Interacción

In [None]:
print("\n🔗 FEATURE 6: INTERACTION VARIABLES")
print("=" * 35)

# Interaction features chosen from the EDA.
# Sex_Pclass: the most predictive combination according to the EDA.
# Built by string concatenation: "<Sex>_Class<Pclass>".
df_features["Sex_Pclass"] = (
    df_features["Sex"] + "_Class" + df_features["Pclass"].astype(str)
)


# Age_Sex: age/gender interaction
def age_sex_category(row):
    """Combine age group and sex into a three-way category.

    Args:
        row: Mapping-like record exposing "AgeGroup" and "Sex" by key
            (e.g. a DataFrame row).

    Returns:
        "Young" for the "Child" and "Teen" age groups regardless of sex;
        otherwise "Adult_Female" when Sex is "female" and "Adult_Male" for
        any other Sex value.
    """
    is_minor = row["AgeGroup"] in ("Child", "Teen")
    if is_minor:
        return "Young"
    return "Adult_Female" if row["Sex"] == "female" else "Adult_Male"


# Row-wise application: one Age_Sex label per passenger.
df_features["Age_Sex"] = df_features.apply(age_sex_category, axis=1)

# Row count and mean of Survived for each interaction category
print("Supervivencia por Sex_Pclass:")
sex_pclass_analysis = (
    df_features.groupby("Sex_Pclass")["Survived"].agg(["count", "mean"]).round(3)
)
print(sex_pclass_analysis)

print("\nSupervivencia por Age_Sex:")
age_sex_analysis = (
    df_features.groupby("Age_Sex")["Survived"].agg(["count", "mean"]).round(3)
)
print(age_sex_analysis)

# Visualization: two side-by-side panels on one figure
plt.figure(figsize=(15, 5))

# Panel 1: mean of Survived per Sex_Pclass category
plt.subplot(1, 2, 1)
sex_pclass_analysis["mean"].plot(kind="bar", color="teal")
plt.title("Supervivencia: Género × Clase")
plt.xlabel("Género_Clase")
plt.ylabel("Tasa de Supervivencia")
plt.xticks(rotation=45)

# Panel 2: mean of Survived per Age_Sex category
plt.subplot(1, 2, 2)
age_sex_analysis["mean"].plot(kind="bar", color="navy")
plt.title("Supervivencia: Edad × Género")
plt.xlabel("Categoría Edad_Género")
plt.ylabel("Tasa de Supervivencia")
plt.xticks(rotation=45)

plt.tight_layout()
# Project helper; presumably saves the current figure — verify in utils.helpers.
save_current_plot("interaction_variables", "../results/figures/feature_engineering/")
plt.show()

## 4. Encoding de Variables Categóricas

In [11]:
print("\n🔢 ENCODING DE VARIABLES CATEGÓRICAS")
print("=" * 40)

# Encode on a copy so df_features keeps its readable categorical columns.
df_encoded = df_features.copy()


🔢 ENCODING DE VARIABLES CATEGÓRICAS


### 4.1 Label Encoding para variables binarias

In [None]:
print("\n📊 LABEL ENCODING (Variables Binarias):")

# Sex: LabelEncoder numbers the sorted class labels starting at 0, so for the
# values "female"/"male" this yields female=0, male=1 (not male=0). The mapping
# actually learned is printed below.
le_sex = LabelEncoder()
df_encoded["Sex_Encoded"] = le_sex.fit_transform(df_encoded["Sex"])
print(f"Sex: {dict(zip(le_sex.classes_, le_sex.transform(le_sex.classes_)))}")

# Embarked: the integer codes follow sorted label order, not frequency, so the
# numbers carry no meaningful ranking. The mapping actually learned is printed below.
# NOTE(review): Embarked is presumably a nominal variable with more than two
# values — confirm whether one-hot encoding was intended instead.
le_embarked = LabelEncoder()
df_encoded["Embarked_Encoded"] = le_embarked.fit_transform(df_encoded["Embarked"])
print(
    f"Embarked: {dict(zip(le_embarked.classes_, le_embarked.transform(le_embarked.classes_)))}"
)

### 4.2 Ordinal Encoding para variables ordenadas

In [None]:
print("\n📈 ORDINAL ENCODING (Variables Ordenadas):")

# AgeGroup: codes follow the natural youngest-to-oldest order given here.
age_order = ["Child", "Teen", "Young_Adult", "Middle_Age", "Senior"]
oe_age = OrdinalEncoder(categories=[age_order])
age_group_codes = oe_age.fit_transform(df_encoded[["AgeGroup"]])
df_encoded["AgeGroup_Encoded"] = age_group_codes.astype(int)
print(f"AgeGroup: {dict(zip(age_order, range(len(age_order))))}")

# FareBin: codes follow the natural cheapest-to-most-expensive order given here.
fare_order = ["Low", "Medium", "High", "Premium"]
oe_fare = OrdinalEncoder(categories=[fare_order])
fare_bin_codes = oe_fare.fit_transform(df_encoded[["FareBin"]])
df_encoded["FareBin_Encoded"] = fare_bin_codes.astype(int)
print(f"FareBin: {dict(zip(fare_order, range(len(fare_order))))}")

### 4.3 One-Hot Encoding para variables nominales

In [None]:
print("\n🔥 ONE-HOT ENCODING (Variables Nominales):")

# Build one indicator frame per nominal column first ...
title_dummies = pd.get_dummies(df_encoded["Title_Simplified"], prefix="Title")
sex_pclass_dummies = pd.get_dummies(df_encoded["Sex_Pclass"], prefix="SexPclass")
age_sex_dummies = pd.get_dummies(df_encoded["Age_Sex"], prefix="AgeSex")

# ... then append them to the encoded frame in a single column-wise concat
# (same column order as appending them one at a time).
df_encoded = pd.concat(
    [df_encoded, title_dummies, sex_pclass_dummies, age_sex_dummies], axis=1
)

print(f"Title_Simplified → {title_dummies.columns.tolist()}")
print(f"Sex_Pclass → {sex_pclass_dummies.columns.tolist()}")
print(f"Age_Sex → {age_sex_dummies.columns.tolist()}")

## 5. Selección de Features Finales

In [None]:
print("\n🎯 SELECCIÓN DE FEATURES FINALES")
print("=" * 35)

# Columns kept from the loaded dataset (the target "Survived" comes first).
original_features = ["Survived", "Pclass", "Age", "SibSp", "Parch", "Fare", "Cabin_Known"]

# Numeric features created in this notebook.
new_numeric_features = ["FamilySize", "IsAlone"]

# Integer-coded categorical features.
encoded_features = ["Sex_Encoded", "Embarked_Encoded", "AgeGroup_Encoded", "FareBin_Encoded"]

# Indicator columns produced by the one-hot step.
onehot_features = [
    *title_dummies.columns.tolist(),
    *sex_pclass_dummies.columns.tolist(),
    *age_sex_dummies.columns.tolist(),
]

# Final column list, in the order: original, new numeric, encoded, one-hot.
final_features = [
    *original_features,
    *new_numeric_features,
    *encoded_features,
    *onehot_features,
]

df_final = df_encoded[final_features].copy()

print(f"📊 Features finales: {len(final_features)}")
print(f"📈 Dimensiones finales: {df_final.shape}")

print("\nCategorías de features:")
print(f"  - Originales: {len(original_features)}")
print(f"  - Nuevas numéricas: {len(new_numeric_features)}")
print(f"  - Encoded: {len(encoded_features)}")
print(f"  - One-hot: {len(onehot_features)}")

## 6. Scaling de Variables Numéricas

In [None]:
print("\n📏 SCALING DE VARIABLES NUMÉRICAS")
print("=" * 35)

# Columns to standardize: the target, IsAlone, Cabin_Known and the one-hot
# indicator columns are deliberately not listed.
numeric_features_to_scale = [
    "Pclass", "Age", "SibSp", "Parch", "Fare",
    "FamilySize",
    "Sex_Encoded", "Embarked_Encoded", "AgeGroup_Encoded", "FareBin_Encoded",
]

# Standardize into a copy so df_final keeps the unscaled values.
# NOTE(review): the scaler is fitted on every row of df_final; if a train/validation
# split happens later, fit it on the training part only — confirm in the modeling step.
df_scaled = df_final.copy()
scaler = StandardScaler()
standardized_values = scaler.fit_transform(df_final[numeric_features_to_scale])
df_scaled[numeric_features_to_scale] = standardized_values

print("✅ Variables escaladas con StandardScaler:")
print(f"  - {len(numeric_features_to_scale)} variables numéricas")
print(f"  - Media ≈ 0, Desviación estándar ≈ 1")

# Sanity check: summary statistics of the first five scaled columns.
print("\nVerificación del scaling (primeras 5 variables):")
first_five_columns = numeric_features_to_scale[:5]
scaling_check = df_scaled[first_five_columns].describe()
print(scaling_check.round(3))

## 7. Análisis de Correlación Final

In [None]:
print("\n🔗 ANÁLISIS DE CORRELACIÓN FINAL")
print("=" * 35)

# Correlation of every column with the target, sorted by signed value
# (most positive first, most negative last)
correlation_with_target = df_scaled.corr()["Survived"].sort_values(ascending=False)
print("Top 10 correlaciones con Survived:")
print(correlation_with_target.head(10).round(3))

print("\nBottom 5 correlaciones con Survived:")
print(correlation_with_target.tail(5).round(3))

# Plot the strongest correlations
plt.figure(figsize=(12, 8))

# Pick the 15 features with the largest absolute correlation (target excluded),
# then look up their signed values so the bars keep their direction
top_correlations = (
    correlation_with_target.drop("Survived").abs().sort_values(ascending=False).head(15)
)
top_corr_values = correlation_with_target[top_correlations.index]

sns.barplot(x=top_corr_values.values, y=top_corr_values.index, palette="viridis")
plt.title("Top 15 Correlaciones con Supervivencia")
plt.xlabel("Correlación con Survived")
plt.ylabel("Features")

plt.tight_layout()
# Project helper; presumably saves the current figure — verify in utils.helpers.
save_current_plot("final_correlations", "../results/figures/feature_engineering/")
plt.show()

## 8. Resumen de Feature Engineering

In [None]:
print("\n📝 RESUMEN DE FEATURE ENGINEERING")
print("=" * 40)

# Rank features by correlation magnitude. correlation_with_target is sorted by
# signed value, so taking head(3) of its absolute values without re-sorting would
# only ever report the most positive correlations and miss strong negative ones.
top_three_features = (
    correlation_with_target.drop("Survived")
    .abs()
    .sort_values(ascending=False)
    .head(3)
    .index.tolist()
)

# Key facts about this notebook's output, printed one per line below.
feature_summary = {
    "Dataset original": f"{df.shape[0]} filas × {df.shape[1]} columnas",
    "Dataset final": f"{df_final.shape[0]} filas × {df_final.shape[1]} columnas",
    "Features añadidas": len(final_features) - len(original_features),
    "Nuevas variables creadas": [
        "FamilySize (SibSp + Parch + 1)",
        "IsAlone (FamilySize == 1)",
        "AgeGroup (5 categorías)",
        "FareBin (4 cuartiles)",
        "Title_Simplified (6 categorías)",
        "Variables de interacción (Sex_Pclass, Age_Sex)",
    ],
    "Encoding aplicado": [
        "Label Encoding: Sex, Embarked",
        "Ordinal Encoding: AgeGroup, FareBin",
        "One-Hot Encoding: Title, Sex_Pclass, Age_Sex",
    ],
    "Scaling": "StandardScaler en variables numéricas",
    "Top 3 features": top_three_features,
}

for key, value in feature_summary.items():
    print(f"✅ {key}: {value}")

## 9. Guardar Dataset con Features

In [None]:
print("\n💾 GUARDANDO DATASET CON FEATURES")
print("=" * 35)

# Save the unscaled dataset (for analysis)
output_path_features = "../data/processed/features_engineered.csv"
df_final.to_csv(output_path_features, index=False)
print(f"✅ Dataset con features guardado: {output_path_features}")

# Save the scaled dataset (for modeling)
output_path_scaled = "../data/processed/features_scaled.csv"
df_scaled.to_csv(output_path_scaled, index=False)
print(f"✅ Dataset escalado guardado: {output_path_scaled}")

# Verification: read both files back and print their shapes
df_verify_features = pd.read_csv(output_path_features)
df_verify_scaled = pd.read_csv(output_path_scaled)

print(f"🔍 Verificación features: {df_verify_features.shape}")
print(f"🔍 Verificación scaled: {df_verify_scaled.shape}")

## 10. Próximos Pasos

In [None]:
def format_next_steps(steps):
    """Turn a flat outline into display lines with numbered top-level steps.

    Args:
        steps: Iterable of strings. Entries starting with "-" or a space are
            sub-items of the preceding top-level entry.

    Returns:
        A list of lines: top-level entries are prefixed with "<k>. " where k
        counts top-level entries only; sub-items are indented by two spaces.
    """
    lines = []
    step_number = 0
    for step in steps:
        if step.startswith(("-", " ")):
            lines.append(f"  {step}")
        else:
            # Number only the top-level steps; counting every list entry made
            # the second step print as "10." instead of "2.".
            step_number += 1
            lines.append(f"{step_number}. {step}")
    return lines


print("\n🚀 PRÓXIMOS PASOS")
print("=" * 18)

next_steps = [
    "Modelado (04_modeling.ipynb)",
    "- Split estratificado train/validation/test",
    "- Implementar algoritmos base:",
    "  * Logistic Regression",
    "  * Random Forest",
    "  * Support Vector Machine",
    "  * Naive Bayes",
    "- Cross-validation para robustez",
    "- Hyperparameter tuning",
    "Evaluación (05_model_evaluation.ipynb)",
    "- Métricas de performance",
    "- Análisis de feature importance",
    "- Interpretación de resultados",
    "- Comparación de modelos",
]

for line in format_next_steps(next_steps):
    print(line)

# Closing banner: report completion and how many columns were selected
# (final_features also contains the target "Survived").
print("\n✅ ¡Feature Engineering completado exitosamente!")
print("📊 Datasets listos para modelado")
print(f"🎯 {len(final_features)} features disponibles para entrenamiento")