In [102]:
import pandas as pd


In [103]:
# Load the insurance dataset; the relative path is resolved against the current working directory.
data = pd.read_csv('insurance.csv')

In [104]:
# (rows, columns) of the frame as loaded.
data.shape


(1338, 7)

In [105]:
# Preview the first rows to check column names and value formats.
data.head()


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [106]:
# Column dtypes and non-null counts; object columns will need encoding before modelling.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [107]:
# Summary statistics of the numeric columns.
data.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [108]:
# Drop every row that contains at least one missing value.
data = data.dropna()

In [109]:
# Re-check the shape: an unchanged row count means dropna() removed nothing.
data.shape

(1338, 7)

No hay ningún valor nulo, como se puede ver: tras `dropna()` se mantienen las 1338 filas.

In [110]:
# Remove fully duplicated rows (the first occurrence is kept), then display the new shape.
data = data.drop_duplicates()
data.shape

(1337, 7)

Solo había una fila duplicada, que se ha eliminado (de 1338 a 1337 filas).

In [111]:
# One-hot encode the object columns (sex, smoker, region per data.info()).
# drop_first=True drops one level per category so the dummies are not perfectly collinear.
data = pd.get_dummies(data, drop_first=True)


In [112]:
# Separate the regression target from the predictors.
y = data['charges']
x = data.drop(columns=['charges'])

In [113]:
import sklearn as sk
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_test, y_train, y_test = split_parts

In [114]:
from sklearn.preprocessing import StandardScaler

# Learn mean/std from the training split only, so no test-set statistics leak into the model,
# then apply the same transformation to both splits.
# NOTE: x_train / x_test are overwritten, so re-running this cell re-scales already scaled data.
scaler = StandardScaler().fit(x_train)
x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)

In [115]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# MODELOS ML #

Linear Regression 

In [116]:
# Ordinary least squares baseline; fit() returns the estimator, so it can be chained.
lin_reg = LinearRegression().fit(x_train, y_train)
y_pred_lin = lin_reg.predict(x_test)


Polynomial Regression

In [117]:
poly_degree = 2

# Pipeline stages, applied in order:
#   1. expand the features with all polynomial/interaction terms up to poly_degree (no bias column),
#   2. standardize the newly created polynomial features,
#   3. fit a linear regression on the expanded, standardized features.
poly_steps = [
    ('poly_features', PolynomialFeatures(degree=poly_degree, include_bias=False)),
    ('scaler', StandardScaler()),
    ('lin_reg', LinearRegression()),
]
polynomial_pipeline = Pipeline(poly_steps)

# fit() returns the fitted pipeline, so prediction can be chained onto it.
y_pred_poly = polynomial_pipeline.fit(x_train, y_train).predict(x_test)


Ridge

In [118]:
# Linear model with an L2 penalty of strength alpha=1.0.
ridge_reg = Ridge(alpha=1.0).fit(x_train, y_train)
y_pred_ridge = ridge_reg.predict(x_test)


Lasso

In [119]:
# Linear model with an L1 penalty of strength alpha=0.001.
# NOTE(review): the penalty looks negligible at this alpha; the reported metrics
# match plain linear regression. Consider tuning alpha.
lasso_reg = Lasso(alpha=0.001).fit(x_train, y_train)
y_pred_lasso = lasso_reg.predict(x_test)


Elastic Net

In [120]:
# Linear model with a combined L1/L2 penalty; l1_ratio=0.5 weights both parts equally.
elastic_net = ElasticNet(alpha=0.001, l1_ratio=0.5).fit(x_train, y_train)
y_pred_elastic = elastic_net.predict(x_test)


Decision Tree Regressor

In [121]:
# Single regression tree with no depth limit (max_depth=None); seeded for reproducibility.
tree_reg = DecisionTreeRegressor(max_depth=None, random_state=42).fit(x_train, y_train)
y_pred_tree = tree_reg.predict(x_test)


Random Forest Regressor

In [122]:
# Ensemble of 100 regression trees; seeded for reproducibility.
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42).fit(x_train, y_train)
y_pred_rf = rf_reg.predict(x_test)

In [123]:
def evaluate_model(y_true, y_pred, model_name="Modelo"):
    """Print MSE, RMSE and R2 for one set of predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values, aligned with ``y_true``.
        model_name: Label printed as the header of the report.

    Returns:
        None. The report is written to stdout, followed by a separator line.
    """
    # All metrics are computed before anything is printed.
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)

    report_lines = [
        f"{model_name}:",
        f"  MSE: {mse:.3f}",
        f"  RMSE: {rmse:.3f}",
        f"  R2: {r2:.3f}",
        "---------------",
    ]
    print("\n".join(report_lines))

# Evaluate every model against the same held-out targets, in the order they were trained.
model_predictions = [
    ("Linear Regression", y_pred_lin),
    (f"Polynomial (grado={poly_degree})", y_pred_poly),
    ("Ridge", y_pred_ridge),
    ("Lasso", y_pred_lasso),
    ("Elastic Net", y_pred_elastic),
    ("Decision Tree", y_pred_tree),
    ("Random Forest", y_pred_rf),
]
for model_label, model_pred in model_predictions:
    evaluate_model(y_test, model_pred, model_label)


Linear Regression:
  MSE: 35478020.675
  RMSE: 5956.343
  R2: 0.807
---------------
Polynomial (grado=2):
  MSE: 21585843.724
  RMSE: 4646.057
  R2: 0.883
---------------
Ridge:
  MSE: 35512474.828
  RMSE: 5959.234
  R2: 0.807
---------------
Lasso:
  MSE: 35478028.000
  RMSE: 5956.344
  R2: 0.807
---------------
Elastic Net:
  MSE: 35496421.992
  RMSE: 5957.887
  R2: 0.807
---------------
Decision Tree:
  MSE: 34953028.963
  RMSE: 5912.109
  R2: 0.810
---------------
Random Forest:
  MSE: 22378218.813
  RMSE: 4730.562
  R2: 0.878
---------------


In [124]:
from sklearn.model_selection import GridSearchCV

# Candidate Ridge regularization strengths, from 1e-4 up to 10.
param_grid = {
    'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10]
}

ridge = Ridge()
# 5-fold cross-validation on the training split only. scikit-learn maximizes scores,
# so MSE is negated ('neg_mean_squared_error'): values closer to 0 are better.
grid_search = GridSearchCV(ridge, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(x_train, y_train)

print("Mejores parámetros Ridge:", grid_search.best_params_)
# Also report the mean cross-validated score of the best candidate. The saved output
# of this cell contains this line, so it must be printed for the cell to be reproducible.
print("Mejor puntuación (MSE negativo):", grid_search.best_score_)



Mejores parámetros Ridge: {'alpha': 10}
Mejor puntuación (MSE negativo): -37523749.25297037


# CONCLUSIONES 

- Si el objetivo es prevenir el sobreajuste y mejorar la estabilidad del modelo en datos nuevos, Ridge es la mejor opción.
- Si se busca maximizar la precisión, el modelo polinómico es el más adecuado, aunque puede ser más propenso al sobreajuste y difícil de interpretar.
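- Lasso y Elastic Net no han mostrado una ventaja significativa en este caso: con `alpha=0.001` la penalización es prácticamente nula, por lo que sus métricas coinciden con las de la regresión lineal (R2 = 0.807).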