In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split,cross_val_score,KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform 
import numpy as np

ruta = "insurance.csv"
df = pd.read_csv(ruta)

# Mostrar información básica del dataset
print("Información del dataset:")
print(f"Forma del dataset: {df.shape}")
print(f"Columnas: {list(df.columns)}")
print("\nPrimeras 5 filas:")
df

Información del dataset:
Forma del dataset: (1338, 7)
Columnas: ['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges']

Primeras 5 filas:


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [4]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [5]:
df.duplicated().sum()

np.int64(1)

In [13]:

X = df.drop(columns='charges')
y = df['charges']


In [14]:
X_encoded = pd.get_dummies(X, columns=['sex', 'smoker', 'region'], drop_first=True)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

In [None]:


# Identificar columnas numéricas (asegúrate de que estas columnas existan)
numeric_features = ['age', 'bmi', 'children']

# Verificar que las columnas existen
print("Columnas disponibles:", list(X_train.columns))
print("Columnas numéricas a escalar:", numeric_features)

# Crear copias para evitar warnings
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

# Inicializar el escalador
scaler = StandardScaler()

# Ajustar solo con los datos de entrenamiento y transformar ambos sets
X_train_scaled[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test_scaled[numeric_features] = scaler.transform(X_test[numeric_features])

Columnas disponibles: ['age', 'bmi', 'children', 'sex_male', 'smoker_yes', 'region_northwest', 'region_southeast', 'region_southwest']
Columnas numéricas a escalar: ['age', 'bmi', 'children']


In [18]:
models = {
    'Regresión Lineal': LinearRegression(),
    'K-Nearest Neighbors': KNeighborsRegressor(n_neighbors=5),
    'Árbol de Decisión': DecisionTreeRegressor(random_state=42)
}

# Diccionario para almacenar resultados
results = {}

print("=== ENTRENAMIENTO Y EVALUACIÓN DE MODELOS ===")

# Entrenar y evaluar cada modelo
for name, model in models.items():
    print(f"\n--- {name} ---")
    
    # Entrenar el modelo
    model.fit(X_train_scaled, y_train)
    
    # Hacer predicciones
    y_pred = model.predict(X_test_scaled)
    
    # Calcular métricas
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # Almacenar resultados
    results[name] = {
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R²': r2,
        'model': model
    }
    
    # Mostrar resultados
    print(f"MSE: ${mse:,.2f}")
    print(f"RMSE: ${rmse:,.2f}")
    print(f"MAE: ${mae:,.2f}")
    print(f"R²: {r2:.4f}")

# Comparar modelos
print("\n=== COMPARACIÓN DE MODELOS ===")
comparison_df = pd.DataFrame(results).T
print(comparison_df[['MSE', 'RMSE', 'MAE', 'R²']].round(4))

=== ENTRENAMIENTO Y EVALUACIÓN DE MODELOS ===

--- Regresión Lineal ---
MSE: $33,596,915.85
RMSE: $5,796.28
MAE: $4,181.19
R²: 0.7836

--- K-Nearest Neighbors ---
MSE: $46,285,808.78
RMSE: $6,803.37
MAE: $3,891.05
R²: 0.7019

--- Árbol de Decisión ---
MSE: $40,795,333.45
RMSE: $6,387.12
MAE: $3,114.15
R²: 0.7372

=== COMPARACIÓN DE MODELOS ===
                                 MSE         RMSE          MAE        R²
Regresión Lineal     33596915.851361  5796.284659  4181.194474  0.783593
K-Nearest Neighbors  46285808.777673  6803.367459  3891.052489   0.70186
Árbol de Decisión    40795333.454086  6387.122471  3114.152909  0.737226


In [21]:
cv = KFold(n_splits=5, shuffle=True, random_state=42)

print("=== VALIDACIÓN CRUZADA DE MODELOS ===")

# Diccionario para almacenar resultados de CV
cv_results = {}

for name, model in models.items():
    print(f"\n--- {name} ---")
    
    # Realizar validación cruzada
    cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=cv, scoring='neg_mean_squared_error')
    
    # Convertir MSE negativo a positivo
    mse_scores = -cv_scores
    rmse_scores = np.sqrt(mse_scores)
    
    # Calcular estadísticas
    mean_rmse = np.mean(rmse_scores)
    std_rmse = np.std(rmse_scores)
    
    # Almacenar resultados
    cv_results[name] = {
        'Mean RMSE': mean_rmse,
        'Std RMSE': std_rmse,
        'CV Scores (RMSE)': rmse_scores
    }
    
    print(f"RMSE promedio: ${mean_rmse:,.2f}")
    print(f"Desviación estándar: ${std_rmse:,.2f}")
    print(f"RMSE por fold: {rmse_scores.round(2)}")

=== VALIDACIÓN CRUZADA DE MODELOS ===

--- Regresión Lineal ---
RMSE promedio: $6,123.35
Desviación estándar: $491.65
RMSE por fold: [6602.04 6528.25 5437.13 6427.86 5621.49]

--- K-Nearest Neighbors ---
RMSE promedio: $6,976.01
Desviación estándar: $666.78
RMSE por fold: [7715.39 7847.19 6278.13 6436.52 6602.83]

--- Árbol de Decisión ---
RMSE promedio: $6,683.42
Desviación estándar: $742.93
RMSE por fold: [7277.43 6836.31 5233.38 6905.27 7164.73]


In [26]:
print("=== GRIDSEARCHCV SIMPLIFICADO ===")

# Solo los modelos que tienen hiperparámetros importantes
models_grid = {
    'KNN': {
        'model': KNeighborsRegressor(),
        'params': {'n_neighbors': [3, 5, 7, 9]}
    },
    'Árbol': {
        'model': DecisionTreeRegressor(random_state=42),
        'params': {'max_depth': [3, 5, 7, 10]}
    }
}

best_models = {}

for name, config in models_grid.items():
    print(f"\nOptimizando {name}...")
    
    grid = GridSearchCV(
        config['model'], 
        config['params'], 
        cv=5, 
        scoring='neg_mean_squared_error'
    )
    
    grid.fit(X_train_scaled, y_train)
    
    best_models[name] = {
        'model': grid.best_estimator_,
        'params': grid.best_params_,
        'rmse': np.sqrt(-grid.best_score_)
    }
    
    print(f"Mejores parámetros: {grid.best_params_}")
    print(f"RMSE: ${np.sqrt(-grid.best_score_):,.2f}")

# Mostrar el mejor
best = min(best_models.items(), key=lambda x: x[1]['rmse'])
print(f"\nMejor modelo: {best[0]} (RMSE: ${best[1]['rmse']:,.2f})")

=== GRIDSEARCHCV SIMPLIFICADO ===

Optimizando KNN...
Mejores parámetros: {'n_neighbors': 3}
RMSE: $6,616.07

Optimizando Árbol...
Mejores parámetros: {'max_depth': 3}
RMSE: $4,826.90

Mejor modelo: Árbol (RMSE: $4,826.90)


In [27]:
print("=== RANDOMIZEDSEARCHCV ===")

# Distribuciones de parámetros
param_distributions = {
    'KNN': {
        'model': KNeighborsRegressor(),
        'params': {
            'n_neighbors': randint(3, 15),
            'weights': ['uniform', 'distance']
        }
    },
    'Árbol': {
        'model': DecisionTreeRegressor(random_state=42),
        'params': {
            'max_depth': randint(3, 15),
            'min_samples_split': randint(2, 10)
        }
    }
}

best_models = {}

for name, config in param_distributions.items():
    print(f"\nOptimizando {name}...")
    
    random_search = RandomizedSearchCV(
        config['model'],
        config['params'],
        n_iter=20,  # Solo 20 iteraciones
        cv=5,
        scoring='neg_mean_squared_error',
        random_state=42
    )
    
    random_search.fit(X_train_scaled, y_train)
    
    best_models[name] = {
        'model': random_search.best_estimator_,
        'params': random_search.best_params_,
        'rmse': np.sqrt(-random_search.best_score_)
    }
    
    print(f"Mejores parámetros: {random_search.best_params_}")
    print(f"RMSE: ${np.sqrt(-random_search.best_score_):,.2f}")

# Mejor modelo
best = min(best_models.items(), key=lambda x: x[1]['rmse'])
print(f"\nMejor modelo: {best[0]} (RMSE: ${best[1]['rmse']:,.2f})")

=== RANDOMIZEDSEARCHCV ===

Optimizando KNN...
Mejores parámetros: {'n_neighbors': 3, 'weights': 'distance'}
RMSE: $6,518.32

Optimizando Árbol...
Mejores parámetros: {'max_depth': 3, 'min_samples_split': 5}
RMSE: $4,826.90

Mejor modelo: Árbol (RMSE: $4,826.90)
