In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
seguro = pd.read_excel("insurance.xlsx")
seguro.head()

Unnamed: 0,idade,sexo,imc,quantidade_filhos,fumante,regiao,custos_seguro
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.56,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
seguro.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1341 entries, 0 to 1340
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   idade              1341 non-null   int64  
 1   sexo               1338 non-null   object 
 2   imc                1341 non-null   float64
 3   quantidade_filhos  1341 non-null   int64  
 4   fumante            1341 non-null   object 
 5   regiao             1341 non-null   object 
 6   custos_seguro      1341 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.5+ KB


In [4]:
seguro.dropna(inplace=True)

In [5]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

le.fit(seguro.sexo)
seguro.sexo = le.transform(seguro.sexo)

le.fit(seguro.fumante)
seguro.fumante = le.transform(seguro.fumante)

le.fit(seguro.regiao)
seguro.regiao = le.transform(seguro.regiao)

In [6]:
seguro = pd.DataFrame(seguro)

In [7]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

seguro_norm = pd.DataFrame(scaler.fit_transform(seguro), index=seguro.index, columns=seguro.columns)
seguro_norm.head()

Unnamed: 0,idade,sexo,imc,quantidade_filhos,fumante,regiao,custos_seguro
0,0.021739,0.0,0.321227,0.0,1.0,1.0,0.251611
1,0.0,1.0,0.47915,0.2,0.0,0.666667,0.009636
2,0.217391,1.0,0.4735,0.6,0.0,0.666667,0.053115
3,0.326087,1.0,0.181464,0.0,0.0,0.333333,0.33301
4,0.304348,1.0,0.347592,0.0,0.0,0.333333,0.043816


In [8]:
import statsmodels.formula.api as smf
function = "custos_seguro~idade+imc+quantidade_filhos+fumante+regiao"
model = smf.ols(formula=function, data=seguro_norm).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:          custos_seguro   R-squared:                       0.751
Model:                            OLS   Adj. R-squared:                  0.750
Method:                 Least Squares   F-statistic:                     802.2
Date:                Fri, 03 Mar 2023   Prob (F-statistic):               0.00
Time:                        02:31:32   Log-Likelihood:                 1230.3
No. Observations:                1338   AIC:                            -2449.
Df Residuals:                    1332   BIC:                            -2417.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept            -0.0488      0.00

In [9]:
x = seguro_norm[["idade", "imc", "quantidade_filhos", "fumante", "regiao"]]

y = seguro_norm[["custos_seguro"]]

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2, random_state=0)

lr = LinearRegression()

lr.fit(x_train, y_train)

In [11]:
from sklearn import metrics
r_sq = lr.score(x,y)
print(r_sq)

0.7505629738411864


In [12]:
y_pred_train = lr.predict(x_train)
print("MAE:", metrics.mean_absolute_error(y_train, y_pred_train))

MAE: 0.06760649849946623


In [13]:
y_pred_test = lr.predict(x_test)
print("MAE:", metrics.mean_absolute_error(y_test, y_pred_test))

MAE: 0.06275502946015028


In [14]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
rf.fit(x_train, y_train)

  rf.fit(x_train, y_train)


In [15]:
from sklearn import metrics
r_sq = rf.score(x,y)
print(r_sq)
y_pred_train = rf.predict(x_train)
print("MAE:", metrics.mean_absolute_error(y_train, y_pred_train))

y_pred_test = rf.predict(x_test)
print("MAE:", metrics.mean_absolute_error(y_test, y_pred_test))

0.9515878614174957
MAE: 0.016336819507585677
MAE: 0.04383461054195164


In [16]:
from sklearn.ensemble import AdaBoostRegressor
ada = AdaBoostRegressor()
ada.fit(x_train, y_train)



  y = column_or_1d(y, warn=True)


In [17]:


r_sq = ada.score(x,y)
print(r_sq)
y_pred_train = ada.predict(x_train)
print("MAE:", metrics.mean_absolute_error(y_train, y_pred_train))

y_pred_test = ada.predict(x_test)
print("MAE:", metrics.mean_absolute_error(y_test, y_pred_test))

0.8341671509087921
MAE: 0.062188444214383236
MAE: 0.061328230789572794


In [18]:
from sklearn.ensemble import GradientBoostingRegressor
grb = GradientBoostingRegressor()
grb.fit(x_train, y_train)


  y = column_or_1d(y, warn=True)


In [19]:
r_sq = grb.score(x,y)
print(r_sq)
y_pred_train = grb.predict(x_train)
print("MAE:", metrics.mean_absolute_error(y_train, y_pred_train))

y_pred_test = grb.predict(x_test)
print("MAE:", metrics.mean_absolute_error(y_test, y_pred_test))

0.8972856272390293
MAE: 0.03346236453828395
MAE: 0.03922240530874838


In [20]:
from sklearn.model_selection import GridSearchCV

parameters = { "max_depth": [5],
              "min_samples_leaf": [4],
              "min_samples_split": [2],
              "n_estimators": [200]}

grid_search = GridSearchCV(grb, parameters, scoring="r2", cv=2, n_jobs=-1)

In [21]:
grid_search.fit(x_train, y_train)

  y = column_or_1d(y, warn=True)


In [22]:
print(grid_search.best_estimator_)
print(grid_search.best_params_)

GradientBoostingRegressor(max_depth=5, min_samples_leaf=4, n_estimators=200)
{'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}


In [23]:
best_model = grid_search.best_estimator_

In [24]:
best_model.get_params()

{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'squared_error',
 'max_depth': 5,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 4,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 200,
 'n_iter_no_change': None,
 'random_state': None,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [25]:
grb_tunned = GradientBoostingRegressor(alpha = 0.09,
 ccp_alpha = 0.0,
 criterion = 'friedman_mse',
 init = None,
 learning_rate = 0.1,
 loss = 'squared_error',
 max_depth = 5,
 max_features = None,
 max_leaf_nodes = None,
 min_impurity_decrease = 0.0,
 min_samples_leaf = 4,
 min_samples_split = 2,
 min_weight_fraction_leaf = 0.0,
 n_estimators = 200,
 n_iter_no_change = None,
 random_state = None,
 subsample = 1.0,
 tol = 0.0001,
 validation_fraction = 0.1,
 verbose = 0,
 warm_start = False)

In [26]:
grb_tunned.fit(x_train, y_train)


  y = column_or_1d(y, warn=True)


In [27]:
r_sq = grb_tunned.score(x,y)
r_sq

0.9325722766215536

In [28]:
y_pred_train = grb_tunned.predict(x_train)
print("MAE:", metrics.mean_absolute_error(y_train, y_pred_train))

MAE: 0.023830866069655104


In [29]:
y_pred_test = grb_tunned.predict(x_test)
print("MAE:", metrics.mean_absolute_error(y_test, y_pred_test))

MAE: 0.045718048492786795
