In [None]:
# ============================================================================
# TRABAJO PRACTICO FINAL - ARBOL DE DECISION - TITANIC
# Sistemas de Soporte para la Toma de Decisiones
# ============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, confusion_matrix, classification_report)
import warnings
warnings.filterwarnings('ignore')

# Plot configuration: apply the matplotlib style sheet, then the seaborn color palette.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably require matplotlib >= 3.6 —
# confirm against the environment this notebook targets.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")


In [None]:
# ============================================================================
# 1. DATA LOADING
# ============================================================================

print("="*80)
print("1. CARGANDO DATOS DEL TITANIC")
print("="*80)

# File locations. Forward slashes are used instead of backslashes so the
# relative paths resolve on Windows as well as on Linux/macOS.
ruta_train = "../data/train.csv"
ruta_test = "../data/test.csv"

# Load both datasets
df_train = pd.read_csv(ruta_train)
df_test = pd.read_csv(ruta_test)

print(f"\nDimensiones del conjunto de entrenamiento: {df_train.shape}")
print(f"Dimensiones del conjunto de prueba: {df_test.shape}")

print("\nInformacion general del dataset:")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" line.
df_train.info()


In [None]:
# ============================================================================
# 2. EXPLORATORY DATA ANALYSIS (EDA)
# ============================================================================

print("\n" + "=" * 80, "2. ANALISIS EXPLORATORIO DE DATOS", "=" * 80, sep="\n")

# Per-column null counts; only columns that actually have gaps are reported
missing_train = df_train.isna().sum()
missing_test = df_test.isna().sum()

print("\nValores faltantes en train:")
print(missing_train.loc[missing_train.gt(0)])

print("\nValores faltantes en test:")
print(missing_test.loc[missing_test.gt(0)])

# Summary statistics of the numeric columns
print("\nEstadisticas descriptivas:")
print(df_train.describe())

# Target distribution: class counts and overall survival rate
print("\nDistribucion de Survived:")
print(df_train['Survived'].value_counts())
print(f"Tasa de supervivencia: {100 * df_train['Survived'].mean():.2f}%")

# EDA figure: 2x3 grid — three pie charts on the top row; two histograms and
# a grouped bar chart on the bottom row.


def _plot_pie(counts, ax, labels, colors, title):
    """Draw a pie chart of `counts` on `ax`, annotating each wedge with its percentage and count.

    `labels` and `colors` are positional, so `counts` must already be in the
    order the labels describe (callers fix the order with `reindex`).
    """
    total = float(counts.sum())

    def _wedge_text(pct):
        # pct is a float, so pct * total / 100 can land just below the true
        # integer count (e.g. 548.9999 for 549). round() recovers the count;
        # int() would truncate it to one less.
        return f"{pct:.1f}%\n({int(round(float(pct) * total / 100))})"

    counts.plot(
        kind='pie',
        ax=ax,
        autopct=_wedge_text,
        startangle=90,
        colors=colors,
        labels=labels,
    )
    ax.set_ylabel('')
    ax.set_title(title)


fig, axes = plt.subplots(2, 3, figsize=(18, 10))
fig.suptitle('Analisis Exploratorio - Titanic Dataset', fontsize=16, fontweight='bold')

# Survived — order pinned to [0, 1] so the labels cannot be swapped if the
# class frequencies change (value_counts() sorts by frequency).
surv_values = df_train['Survived'].value_counts().reindex([0, 1], fill_value=0)
_plot_pie(surv_values, axes[0,0], ['No sobrevivió', 'Sobrevivió'],
          ['#e74c3c', '#2ecc71'], 'Distribución de Supervivencia')

# Pclass — order pinned to [1, 2, 3]
pclass_values = df_train['Pclass'].value_counts().reindex([1, 2, 3], fill_value=0)
_plot_pie(pclass_values, axes[0,1], ['1ra', '2da', '3ra'],
          ['#F74071', '#7948F7', '#F0E73E'], 'Distribución por Clase')

# Sex — order pinned to ['male', 'female'] to match the labels
sex_values = df_train['Sex'].value_counts().reindex(['male', 'female'], fill_value=0)
_plot_pie(sex_values, axes[0,2], ['Masculino', 'Femenino'],
          ['#329EF0', '#F07432'], 'Distribución por Sexo')

# Age histogram
df_train['Age'].hist(bins=30, ax=axes[1,0], color='#84FF54', edgecolor='black')
axes[1,0].set_title('Distribucion de Edad')
axes[1,0].set_xlabel('Edad')
axes[1,0].set_ylabel('Frecuencia')

# Fare histogram
df_train['Fare'].hist(bins=30, ax=axes[1,1], color='#4248FC', edgecolor='black')
axes[1,1].set_title('Distribucion de Tarifa')
axes[1,1].set_xlabel('Tarifa')
axes[1,1].set_ylabel('Frecuencia')

# Survival by sex — rows pinned to ['female', 'male'] and columns to [0, 1]
# so the hard-coded tick labels and legend entries below always match the bars.
survival_by_sex = (
    pd.crosstab(df_train['Sex'], df_train['Survived'])
    .reindex(index=['female', 'male'], columns=[0, 1], fill_value=0)
)
survival_by_sex.plot(kind='bar', ax=axes[1,2], color=['#329EF0', '#F07432'])
axes[1,2].set_title('Supervivencia por Sexo')
axes[1,2].set_xticklabels(['Femenino', 'Masculino'], rotation=0)
axes[1,2].set_ylabel('Cantidad')
axes[1,2].set_xlabel('Sexo')
axes[1,2].legend(['No sobrevivio', 'Sobrevivio'])
# Write the count on top of every bar
for container in axes[1,2].containers:
    axes[1,2].bar_label(container, fmt='%d', fontsize=10, fontweight='bold')

plt.tight_layout()
plt.savefig('01_eda_titanic.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# ============================================================================
# 3. DATA PREPROCESSING
# ============================================================================

# Section banner: blank line, rule, title, rule
print("\n" + "=" * 80, "3. PREPROCESAMIENTO DE DATOS", "=" * 80, sep="\n")

def preprocesar_datos(df, es_train=True):
    """
    Preprocess a Titanic passenger DataFrame for modelling.

    Works on a copy, so the caller's frame is left untouched. Steps:
    - Feature engineering: adds ``FamilySize`` (SibSp + Parch + 1) and
      ``IsAlone`` (1 when FamilySize == 1, else 0).
    - Imputation: a missing ``Age`` gets the median age of its (Pclass, Sex)
      group, falling back to the overall median when the group has no known
      age; a missing ``Fare`` gets the overall median fare.
    - Encoding: ``Sex`` is mapped to male=1, female=0 (any other value
      becomes NaN).
    - Drops PassengerId, Name, Ticket, Cabin and Embarked when present.

    All medians are computed from ``df`` itself, so separate calls for train
    and test impute each frame independently.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns SibSp, Parch, Pclass, Sex, Age and Fare.
    es_train : bool, default True
        Kept for backward compatibility. It does not change the result:
        columns absent from ``df`` are ignored when dropping either way.

    Returns
    -------
    pandas.DataFrame
        The preprocessed copy of ``df``.
    """
    df = df.copy()

    # ----- FEATURE ENGINEERING -----
    print("\nFeature Engineering...")

    # FamilySize = siblings/spouses + parents/children + the passenger
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

    # IsAlone = 1 when the passenger travels without family
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

    # ----- MISSING-VALUE IMPUTATION -----
    print("\nImputando valores faltantes...")

    # Age: median of the passenger's (Pclass, Sex) group. The overall median is
    # taken from the observed ages first and used as a fallback, because a
    # group whose ages are all missing has a NaN median and would otherwise
    # leave NaNs behind.
    overall_age_median = df['Age'].median()
    group_age_median = df.groupby(['Pclass', 'Sex'])['Age'].transform('median')
    df['Age'] = df['Age'].fillna(group_age_median).fillna(overall_age_median)

    # Fare: overall median. Assigned back instead of the chained
    # `df['Fare'].fillna(..., inplace=True)`, which is deprecated and does not
    # modify `df` under pandas Copy-on-Write.
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())

    # ----- CATEGORICAL ENCODING -----
    print("\nCodificando variables categoricas...")

    # Sex: label encoding (male=1, female=0)
    df['Sex'] = df['Sex'].map({'male': 1, 'female': 0})

    # ----- DROP UNNEEDED COLUMNS -----
    print("\nBorrando columnas innecesarias...")
    columns_to_drop = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked']
    # errors='ignore' skips any listed column the frame does not have
    df = df.drop(columns=columns_to_drop, errors='ignore')

    print(f"\nShape despues del preprocesamiento: {df.shape}")
    print(f"Columnas finales: {list(df.columns)}")

    return df

# Apply the preprocessing to both datasets
df_train_prep = preprocesar_datos(df_train, es_train=True)
df_test_prep = preprocesar_datos(df_test, es_train=False)

# Align test with the training feature columns (every column except the target):
# any feature missing from test is created filled with 0, then the columns are
# put in the same order as in train.
feature_cols = [col for col in df_train_prep.columns if col != 'Survived']
missing_cols = set(feature_cols) - set(df_test_prep.columns)
for col in missing_cols:
    df_test_prep[col] = 0

df_test_prep = df_test_prep[feature_cols]

print("\nPreprocesamiento completado")
print(f"Train shape: {df_train_prep.shape}")
print(f"Test shape: {df_test_prep.shape}")

# Forward slashes: the previous "..\data\..." literals relied on the invalid
# escape sequence "\d" (a SyntaxWarning on recent Python versions) and only
# resolved on Windows.
df_train_prep.to_csv("../data/dataset_train.csv", index=False)
df_test_prep.to_csv("../data/dataset_test.csv", index=False)
print("Archivos guardados correctamente.")
