### Dependencias

In [44]:
# imports
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Problema

Predecir el coste del seguro

## Instrucciones

 Utilizar el dataset (insurance.csv) para entrenar un modelo de regresión capaz de predecir el valor del seguro en función de las características del cliente. Realizar limpieza, preprocesado modelado y testeo del modelo aportando conclusiones de todos estos pasos.

## El set de datos

* age: age of primary beneficiary

* sex: insurance contractor gender, female, male

* bmi: Body mass index, providing an understanding of body, weights that are relatively high or low relative to height,
objective index of body weight (kg / m ^ 2) using the ratio of height to weight, ideally 18.5 to 24.9

* children: Number of children covered by health insurance / Number of dependents

* smoker: Smoking

* region: the beneficiary's residential area in the US, northeast, southeast, southwest, northwest.

* charges: Individual medical costs billed by health insurance



In [45]:
ruta = "insurance.csv"
data = pd.read_csv(ruta)

In [46]:
print(data.shape)
print(data.head())

(1338, 7)
   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520


## Objetivo

Generar un model de regresión capaz de predecir el valor del seguro en base a las características del cliente.

* Aplicar las técnicas oportunas de procesamiento de datos (lipieza, nans, escalado...)

* Valorar diferentes modelos de regresión (linear regressor, polynomial, ridge, lasso, elastic, decission tree y random forest)

* Comparación entre modelos (dividir el dataset en train y test, entrenar con el train y evaluar con el test)

* Métricas (todas, y que aporta cada una)

* Conclusiones finales

# Implementación

## Procesamiento de Datos

### Valores Nulos

In [47]:
# Verificar valores nulos
missing_values = data.isnull().sum()
print(missing_values)

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64


### Variables Categoricas

In [48]:
# Codificar variables categóricas (sex, smoker, region)
categorical_features = ["sex", "smoker", "region"]

encoder = OneHotEncoder(drop="first", sparse_output=False)
encoded_features = encoder.fit_transform(data[categorical_features])
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out())

# Concatenar con el dataset original, eliminando las categóricas originales
df = pd.concat([data.drop(columns=categorical_features), encoded_df], axis=1)

In [49]:
print(df)

      age     bmi  children      charges  sex_male  smoker_yes  \
0      19  27.900         0  16884.92400       0.0         1.0   
1      18  33.770         1   1725.55230       1.0         0.0   
2      28  33.000         3   4449.46200       1.0         0.0   
3      33  22.705         0  21984.47061       1.0         0.0   
4      32  28.880         0   3866.85520       1.0         0.0   
...   ...     ...       ...          ...       ...         ...   
1333   50  30.970         3  10600.54830       1.0         0.0   
1334   18  31.920         0   2205.98080       0.0         0.0   
1335   18  36.850         0   1629.83350       0.0         0.0   
1336   21  25.800         0   2007.94500       0.0         0.0   
1337   61  29.070         0  29141.36030       0.0         1.0   

      region_northwest  region_southeast  region_southwest  
0                  0.0               0.0               1.0  
1                  0.0               1.0               0.0  
2                  0.0  

### Split & Escalado

In [50]:
# Separar variables predictoras (X) y variable objetivo (y)
X = df.drop(columns=["charges"])
y = df["charges"]

# Escalado de las variables numéricas
scaler = StandardScaler()
X = scaler.fit_transform(X)

# División en conjunto de entrenamiento y prueba
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)

# Verificar dimensiones finales
X_train.shape[0] + X_val.shape[0] + X_test.shape[0], data.shape[0]

(1338, 1338)

In [51]:
X_train

array([[-1.08275831, -0.66657783, -0.90861367, ..., -0.56641788,
        -0.61132367, -0.56641788],
       [-0.86915474, -1.74188939,  1.58092576, ...,  1.76548098,
        -0.61132367, -0.56641788],
       [ 0.55486907, -0.40164599,  2.41077224, ..., -0.56641788,
        -0.61132367, -0.56641788],
       ...,
       [ 0.62607026,  1.73339292,  2.41077224, ...,  1.76548098,
        -0.61132367, -0.56641788],
       [-1.43876426, -1.46137333, -0.90861367, ...,  1.76548098,
        -0.61132367, -0.56641788],
       [-1.22516069,  0.67858691,  1.58092576, ..., -0.56641788,
        -0.61132367,  1.76548098]])

### Distribución del Target en Train

In [52]:
# Crear histograma con Plotly Express
fig = px.histogram(
    x=y_train,
    nbins=30,
    title='Distribution of Target Values in Training Set',
)
fig.update_layout(
    title_x=0.5,
    xaxis_title=None,
    yaxis_title=None,
    plot_bgcolor='white',
    paper_bgcolor='white',
    font=dict(color='black'),
    yaxis=dict(showgrid=True, gridcolor='lightgray', gridwidth=1, 
            zeroline=False, range=[-5, 110]),  # Límite fijo en el eje Y
    xaxis=dict(showgrid=True, gridcolor='lightgray', gridwidth=1,
            zeroline=False, range=[-5000, 65000]),
)
fig.update_traces(marker=dict(
    color='lightgrey',
    line=dict(color='black', width=1)
))
fig.show()

## Entrenamiento

### Funciones

In [53]:
def train_model(model):
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)
    train_error = mean_squared_error(y_train, y_train_pred)
    val_error = mean_squared_error(y_val, y_val_pred)
    return model, train_error, val_error

In [54]:
def min_mse(model, min_mse=10000000000000000000000, alpha_target=None):

    for alpha in np.logspace(-3, 4, 2000):
        
        reg = model(alpha=alpha)
        reg.fit(X_train, y_train)
        
        predictions = reg.predict(X_val)

        mse = mean_squared_error(y_val, predictions)

        if mse < min_mse:
            min_mse = mse
            alpha_target = alpha

    print(f"Min. Validation RMSE {reg.__class__.__name__.capitalize()} Regularization: {int(np.sqrt(min_mse))} USD")
    print(f'Alpha target: {alpha_target}')
    
    return reg, min_mse, alpha_target
            

### Regresión Lineal

In [55]:
# Entrenaminento
linear_model, train_mse_linear, val_mse_linear = train_model(LinearRegression())
linear_model

In [56]:
# Evaluación
test_predictions_linear_model = linear_model.predict(X_test)
test_mse_linear = mean_squared_error(y_test, test_predictions_linear_model)

In [57]:
# Métricas
print("Linear Regression RMSE")
print(f"Train: {int(np.sqrt(train_mse_linear))} USD")
print(f"Val: {int(np.sqrt(val_mse_linear))} USD")
print(f"Test: {int(np.sqrt(test_mse_linear))} USD")

Linear Regression RMSE
Train: 5878 USD
Val: 6766 USD
Test: 5809 USD


In [58]:
# Crear un DataFrame con los valores RMSE para cada modelo
df_linear_rmse = pd.DataFrame({
    'Set': ['Train', 'Val', 'Test'],
    'RMSE': [np.sqrt(train_mse_linear), np.sqrt(val_mse_linear), np.sqrt(test_mse_linear)]
})

# Crear gráfico de barras interactivo
fig = px.bar(df_linear_rmse,
            x='Set',
            y='RMSE',
            color='Set',
            color_discrete_sequence=['#222222', '#999999', '#dddddd'],
            title='Linear Regression RMSE',
            template='plotly_white')

fig.update_layout(yaxis_title='Root Mean Squared Error',
            xaxis_title=None,
            title_x=0.5,
            legend_title=None,
)
fig.show()

### L1 (Lasso)

In [59]:
# Entrenaminento
lasso_model, train_mse_lasso, val_mse_lasso = train_model(Lasso(alpha=0.1))
lasso_model

In [60]:
# Evaluación
test_predictions_lasso_model = lasso_model.predict(X_test)
test_mse_lasso = mean_squared_error(y_test, test_predictions_lasso_model)

In [61]:
# Métricas
print("Lasso RMSE")
print(f"Train: {int(np.sqrt(train_mse_lasso))} USD")
print(f"Val: {int(np.sqrt(val_mse_lasso))} USD")
print(f"Test: {int(np.sqrt(test_mse_lasso))} USD")

Lasso RMSE
Train: 5878 USD
Val: 6766 USD
Test: 5809 USD


In [62]:
# Crear un DataFrame con los valores RMSE para cada modelo
df_lasso_rmse = pd.DataFrame({
    'Set': ['Train', 'Val', 'Test'],
    'RMSE': [np.sqrt(train_mse_lasso), np.sqrt(val_mse_lasso), np.sqrt(test_mse_lasso)]
})

# Crear gráfico de barras interactivo
fig = px.bar(df_lasso_rmse,
            x='Set',
            y='RMSE',
            color='Set',
            color_discrete_sequence=['#222222', '#999999', '#dddddd'],
            title='Lasso (L1) RMSE',
            template='plotly_white')

fig.update_layout(yaxis_title='Root Mean Squared Error',
            xaxis_title=None,
            title_x=0.5,
            legend_title=None,
)
fig.show()

In [63]:
# Entrenaminento con error mínimo 
lasso_model_min, val_min_mse_lasso, alpha_target = min_mse(Lasso)
lasso_model_min, train_min_mse_lasso, val_min_mse_lasso = train_model(Lasso(alpha=alpha_target))
lasso_model_min

Min. Validation RMSE Lasso Regularization: 6766 USD
Alpha target: 0.001


In [64]:
# Evaluación con error mínimo
test_predictions_lasso_model_min = lasso_model_min.predict(X_test)
test_min_mse_lasso = mean_squared_error(y_test, test_predictions_lasso_model_min)

In [65]:
# Métricas con error mínimo
print("Lasso Min RMSE")
print(f"Train: {int(np.sqrt(train_min_mse_lasso))} USD")
print(f"Val: {int(np.sqrt(val_min_mse_lasso))} USD")
print(f"Test: {int(np.sqrt(test_min_mse_lasso))} USD")

Lasso Min RMSE
Train: 5878 USD
Val: 6766 USD
Test: 5809 USD


In [66]:
# Crear un DataFrame con los valores RMSE para cada modelo
df_lasso_min_rmse = pd.DataFrame({
    'Set': ['Train', 'Val', 'Test'],
    'RMSE': [np.sqrt(train_min_mse_lasso), np.sqrt(val_min_mse_lasso), np.sqrt(test_min_mse_lasso)]
})

# Crear gráfico de barras interactivo
fig = px.bar(df_lasso_min_rmse,
            x='Set',
            y='RMSE',
            color='Set',
            color_discrete_sequence=['#222222', '#999999', '#dddddd'],
            title='Lasso (L1) Min RMSE',
            template='plotly_white')

fig.update_layout(yaxis_title='Min Root Mean Squared Error',
            xaxis_title=None,
            title_x=0.5,
            legend_title=None,
)
fig.show()

### L2 (Ridge)

In [67]:
# Entrenaminento
ridge_model, train_mse_ridge, val_mse_ridge = train_model(Ridge(alpha=0.1))
ridge_model

In [68]:
# Evaluación
test_predictions_ridge_model = ridge_model.predict(X_test)
test_mse_ridge = mean_squared_error(y_test, test_predictions_ridge_model)

In [69]:
# Métricas
print("Ridge RMSE")
print(f"Train: {int(np.sqrt(train_mse_ridge))} USD")
print(f"Val: {int(np.sqrt(val_mse_ridge))} USD")
print(f"Test: {int(np.sqrt(test_mse_ridge))} USD")

Ridge RMSE
Train: 5878 USD
Val: 6767 USD
Test: 5809 USD


In [70]:
# Crear un DataFrame con los valores RMSE para cada modelo
df_ridge_rmse = pd.DataFrame({
    'Set': ['Train', 'Val', 'Test'],
    'RMSE': [np.sqrt(train_mse_ridge), np.sqrt(val_mse_ridge), np.sqrt(test_mse_ridge)]
})

# Crear gráfico de barras interactivo
fig = px.bar(df_ridge_rmse,
            x='Set',
            y='RMSE',
            color='Set',
            color_discrete_sequence=['#222222', '#999999', '#dddddd'],
            title='Ridge (L2) RMSE',
            template='plotly_white')

fig.update_layout(yaxis_title='Root Mean Squared Error',
            xaxis_title=None,
            title_x=0.5,
            legend_title=None,
)
fig.show()

In [71]:
# Entrenaminento con error mínimo 
ridge_model_min, val_min_mse_ridge, alpha_target = min_mse(Ridge)
ridge_model_min, train_min_mse_ridge, val_min_mse_ridge = train_model(Ridge(alpha=alpha_target))
ridge_model_min

Min. Validation RMSE Ridge Regularization: 6766 USD
Alpha target: 0.001


In [72]:
# Evaluación con error mínimo
test_predictions_ridge_model_min = ridge_model_min.predict(X_test)
test_min_mse_ridge = mean_squared_error(y_test, test_predictions_ridge_model_min)

In [73]:
# Métricas con error mínimo
print("Ridge Min RMSE")
print(f"Train: {int(np.sqrt(train_min_mse_ridge))} USD")
print(f"Val: {int(np.sqrt(val_min_mse_ridge))} USD")
print(f"Test: {int(np.sqrt(test_min_mse_ridge))} USD")

Ridge Min RMSE
Train: 5878 USD
Val: 6766 USD
Test: 5809 USD


In [74]:
# Crear un DataFrame con los valores RMSE para cada modelo
df_ridge_min_rmse = pd.DataFrame({
    'Set': ['Train', 'Val', 'Test'],
    'RMSE': [np.sqrt(train_min_mse_ridge), np.sqrt(val_min_mse_ridge), np.sqrt(test_min_mse_ridge)]
})

# Crear gráfico de barras interactivo
fig = px.bar(df_ridge_min_rmse,
            x='Set',
            y='RMSE',
            color='Set',
            color_discrete_sequence=['#222222', '#999999', '#dddddd'],
            title='Ridge (L2) Min RMSE',
            template='plotly_white')

fig.update_layout(yaxis_title='Min Root Mean Squared Error',
            xaxis_title=None,
            title_x=0.5,
            legend_title=None,
)
fig.show()

### ElasticNet

In [75]:
# Entrenaminento
alpha=0.001
l1_ratio=0.01
max_iter=100000
elasticnet_model, train_mse_elasticnet, val_mse_elasticnet = train_model(ElasticNet(alpha=alpha, l1_ratio=l1_ratio, max_iter=max_iter))
elasticnet_model

In [76]:
# Evaluación
test_predictions_elasticnet_model = elasticnet_model.predict(X_test)
test_mse_elasticnet = mean_squared_error(y_test, test_predictions_elasticnet_model)

In [77]:
# Métricas
print("ElasticNet RMSE")
print(f"Train: {int(np.sqrt(train_mse_elasticnet))} USD")
print(f"Val: {int(np.sqrt(val_mse_elasticnet))} USD")
print(f"Test: {int(np.sqrt(test_mse_elasticnet))} USD")

ElasticNet RMSE
Train: 5878 USD
Val: 6767 USD
Test: 5810 USD


In [78]:
# Crear un DataFrame con los valores RMSE para cada modelo
df_elasticnet_rmse = pd.DataFrame({
    'Set': ['Train', 'Val', 'Test'],
    'RMSE': [np.sqrt(train_mse_elasticnet), np.sqrt(val_mse_elasticnet), np.sqrt(test_mse_elasticnet)]
})

# Crear gráfico de barras interactivo
fig = px.bar(df_elasticnet_rmse,
            x='Set',
            y='RMSE',
            color='Set',
            color_discrete_sequence=['#222222', '#999999', '#dddddd'],
            title='ElasticNet RMSE',
            template='plotly_white')

fig.update_layout(yaxis_title='Root Mean Squared Error',
            xaxis_title=None,
            title_x=0.5,
            legend_title=None,
)
fig.show()

In [79]:
# Define the parameter grid for alpha and l1_ratio
param_grid = {
    'alpha': np.logspace(-5, 3, 50),
    'l1_ratio': np.logspace(-5, -1, 50)
}

# Create the ElasticNet instance with a high max_iter
elasticnet = ElasticNet(max_iter=100000)

# Set up the grid search with 5-fold cross-validation; note that scoring is negative MSE
grid_search = GridSearchCV(
    estimator=elasticnet,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1
)

# Fit grid search on training data
grid_search.fit(X_train, y_train)

alpha = grid_search.best_params_['alpha']
l1_ratio = grid_search.best_params_['l1_ratio']

print("Best alpha:", alpha)
print("Best l1_ratio:", l1_ratio)

Fitting 5 folds for each of 2500 candidates, totalling 12500 fits
Best alpha: 0.00868511373751352
Best l1_ratio: 0.1


In [80]:
# Entrenaminento
best_alpha=alpha
best_l1_ratio=l1_ratio
max_iter=100000
elasticnet_model_min, train_min_mse_elasticnet, val_min_mse_elasticnet = train_model(ElasticNet(alpha=best_alpha, l1_ratio=best_l1_ratio, max_iter=max_iter))
elasticnet_model_min

In [81]:
# Evaluación
test_predictions_elasticnet_model_min = elasticnet_model_min.predict(X_test)
test_min_mse_elasticnet = mean_squared_error(y_test, test_predictions_elasticnet_model_min)

In [82]:
# Métricas
print("ElasticNet RMSE")
print(f"Train: {int(np.sqrt(train_min_mse_elasticnet))} USD")
print(f"Val: {int(np.sqrt(val_min_mse_elasticnet))} USD")
print(f"Test: {int(np.sqrt(test_min_mse_elasticnet))} USD")

ElasticNet RMSE
Train: 5879 USD
Val: 6775 USD
Test: 5818 USD


In [83]:
# Crear un DataFrame con los valores RMSE para cada modelo
df_elasticnet_min_rmse = pd.DataFrame({
    'Set': ['Train', 'Val', 'Test'],
    'RMSE': [np.sqrt(train_min_mse_elasticnet), np.sqrt(val_min_mse_elasticnet), np.sqrt(test_min_mse_elasticnet)]
})

# Crear gráfico de barras interactivo
fig = px.bar(df_elasticnet_min_rmse,
            x='Set',
            y='RMSE',
            color='Set',
            color_discrete_sequence=['#222222', '#999999', '#dddddd'],
            title='ElasticNet Min RMSE',
            template='plotly_white')

fig.update_layout(yaxis_title='Min Root Mean Squared Error',
            xaxis_title=None,
            title_x=0.5,
            legend_title=None,
)
fig.show()

## Comparación de Modelos

In [84]:
# Crear un DataFrame con los valores RMSE para cada modelo
df_rmse = pd.DataFrame({
    'Modelo': ['Linear', 'Lasso', 'Ridge', 'ElasticNet'],
    'RMSE': [np.sqrt(test_mse_linear), np.sqrt(test_min_mse_lasso), np.sqrt(test_min_mse_ridge), np.sqrt(test_min_mse_elasticnet)]
})

# Crear gráfico de barras interactivo
fig = px.bar(df_rmse,
            x='Modelo',
            y='RMSE',
            color='Modelo',
            color_discrete_sequence=['#222222', '#666666', '#aaaaaa', '#dddddd'],
            title='Model RMSE Comparison',
            template='plotly_white')

fig.update_layout(yaxis_title='Root Mean Squared Error',
            xaxis_title=None,
            title_x=0.5,
            legend_title=None,
            yaxis=dict(showgrid=True, gridcolor='lightgray', gridwidth=1, 
                        zeroline=False, range=[5799, 5824]), # Límite fijo en el eje Y
)

fig.show()

# Conclusiones

- Los valores próximos de RMSE para Regresión Linear, Lasso y Ridge indican un comportamiento similar de los modelos.
- Esto sugiere que el dataset no presenta una complejidad o multicolinealidad que requiera un gran ajuste mediante regularización, por lo que la penalización en Lasso (L1) o Ridge (L2) deja prácticamente sin modificar los coeficientes finales.
- El valor más elevado de RMSE para ElasticNet puede ser debido a no haber encontrado valores óptimos para `apha` y `l1_ratio`.
- En este caso la Regresión Lineal parece suficiente sin necesidad de regularización.