Medical Cost personal  Dataset - Insurance Forecast by using Regression algorithms of machine learning, Predicting the insurance costs.

### Model Building and Tuning

#### Import the Libraries

In [None]:
import pandas as pd
import numpy as np

#### Load the dataset

In [None]:
insurance = pd.read_csv('drive/MyDrive/Datasets/insurance_clean.csv')
insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


#### Features and Target variables

In [None]:
X = insurance.iloc[:,0:6].values     # features
y = insurance.iloc[:, 6:7].values    # Target

#### Splitting the data into train and test



In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
print('X_train: ', X_train.shape)
print('X_test: ', X_test.shape)
print('y_train: ', y_train.shape)
print('y_test: ', y_test.shape)

X_train:  (1070, 6)
X_test:  (268, 6)
y_train:  (1070, 1)
y_test:  (268, 1)


#### Linear Regression Model Building


##### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression(n_jobs= 2,normalize= True)
lin_reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=2, normalize=True)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
print('Score on training set: ',lin_reg.score(X_train, y_train))

Score on training set:  0.7368306228430945


In [None]:
y_pred = lin_reg.predict(X_test)                     # y_pred = test set predicted profits and they are predicted on x_test = features of test set to get y_pred(label of test set)
np.set_printoptions(precision = 2)                        
df = np.concatenate((y_pred.reshape(len(y_pred),1),y_test.reshape(len(y_test),1)),axis = 1)
df = pd.DataFrame(df)
df

Unnamed: 0,0,1
0,11017.230479,9724.53000
1,9787.530168,8547.69130
2,37994.782118,45702.02235
3,16122.778407,12950.07120
4,6930.759230,9644.25250
...,...,...
263,14699.098767,15019.76005
264,8210.854549,6664.68595
265,16019.284542,20709.02034
266,33104.944084,40932.42950


In [None]:
from sklearn.metrics import r2_score
print('r2 score: ' , r2_score(y_test, y_pred))
print('Linear model RMSE: {}'.format(np.sqrt(mean_squared_error(y_test, y_pred))))

r2 score:  0.7998747145449959
Linear model RMSE: 5643.219748880902


Cross Validation of Linear Regression model

In [None]:
from sklearn.model_selection import cross_val_predict, cross_val_score
cv_lin_reg = -cross_val_score(lin_reg, X_train, y_train, cv = 3, scoring = 'neg_mean_squared_error').mean()
print('RMSE of tuned model - Training: {}'.format(np.sqrt(cv_lin_reg)))

y_pred_cv = cross_val_predict(lin_reg, X_test, y_test, cv = 5)
print('RMSE of tuned model - Testing: {}'.format(np.sqrt(mean_squared_error(y_test, y_pred_cv))))

RMSE of tuned model - Training: 6175.696747119696
RMSE of tuned model - Testing: 5720.416660643799


##### Ridge Regression

In [None]:
from sklearn.linear_model import Ridge
ridge_reg = Ridge()
ridge_reg.fit(X_train, y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
print('Score on training set: ',ridge_reg.score(X_train, y_train))

Score on training set:  0.7368099378903962


In [None]:
y_pred_ridge = ridge_reg.predict(X_test)
print('r2 score: ' , r2_score(y_test, y_pred_ridge))
print('Ridge model RMSE: {}'.format(np.sqrt(mean_squared_error(y_test, y_pred_ridge))))

r2 score:  0.7995147965249456
Ridge model RMSE: 5648.292031679053


Model Tuning for Ridge Regression

In [None]:
from sklearn.linear_model import RidgeCV
lambdas = 10**np.linspace(-5,5,500)*0.5
ridge_cv = RidgeCV(alphas = lambdas, scoring = 'neg_mean_squared_error', normalize = True,)
ridge_cv.fit(X_train, y_train)

RidgeCV(alphas=array([5.00e-06, 5.24e-06, 5.48e-06, 5.74e-06, 6.01e-06, 6.30e-06,
       6.59e-06, 6.91e-06, 7.23e-06, 7.57e-06, 7.93e-06, 8.31e-06,
       8.70e-06, 9.11e-06, 9.54e-06, 9.99e-06, 1.05e-05, 1.10e-05,
       1.15e-05, 1.20e-05, 1.26e-05, 1.32e-05, 1.38e-05, 1.45e-05,
       1.51e-05, 1.58e-05, 1.66e-05, 1.74e-05, 1.82e-05, 1.91e-05,
       2.00e-05, 2.09e-05, 2.19e-05, 2.29e-05, 2.40e-05, 2.51e-05,
       2.63...
       1.20e+04, 1.25e+04, 1.31e+04, 1.37e+04, 1.44e+04, 1.51e+04,
       1.58e+04, 1.65e+04, 1.73e+04, 1.81e+04, 1.90e+04, 1.99e+04,
       2.08e+04, 2.18e+04, 2.28e+04, 2.39e+04, 2.50e+04, 2.62e+04,
       2.74e+04, 2.87e+04, 3.01e+04, 3.15e+04, 3.30e+04, 3.46e+04,
       3.62e+04, 3.79e+04, 3.97e+04, 4.16e+04, 4.35e+04, 4.56e+04,
       4.77e+04, 5.00e+04]),
        cv=None, fit_intercept=True, gcv_mode=None, normalize=True,
        scoring='neg_mean_squared_error', store_cv_values=False)

In [None]:
ridge_cv.alpha_

0.002109699040402512

In [None]:
ridge_reg_tuned = Ridge(alpha= ridge_cv.alpha_, normalize= True,random_state= 0)
ridge_reg_tuned.fit(X_train, y_train)

Ridge(alpha=0.002109699040402512, copy_X=True, fit_intercept=True,
      max_iter=None, normalize=True, random_state=0, solver='auto', tol=0.001)

In [None]:
y_pred_ridge_tune = ridge_reg_tuned.predict(X_test)
print('r2 score: ' , r2_score(y_test, y_pred_ridge_tune))
print('Ridge Tuned model RMSE: {}'.format(np.sqrt(mean_squared_error(y_test, y_pred_ridge_tune))))

r2 score:  0.7997114575379172
Ridge Tuned model RMSE: 5645.521075644364


##### Lasso Regression

In [None]:
from sklearn.linear_model import Lasso
lasso_reg = Lasso(max_iter=1000)
lasso_reg.fit(X_train, y_train)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
print('Score on training set: ',lasso_reg.score(X_train, y_train))

y_pred_lasso = lasso_reg.predict(X_test)
print('r2 score: ' , r2_score(y_test, y_pred_lasso))
print('Lasso model RMSE: {}'.format(np.sqrt(mean_squared_error(y_test, y_pred_lasso))))

Score on training set:  0.7368305371730299
r2 score:  0.7998461817644503
Lasso model RMSE: 5643.622024414162


Model Tuning for Lasso Regression

In [None]:
from sklearn.linear_model import LassoCV
lasso_cv_reg = LassoCV(cv = 15, max_iter = 1000, normalize= True)
lasso_cv_reg.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


LassoCV(alphas=None, copy_X=True, cv=15, eps=0.001, fit_intercept=True,
        max_iter=1000, n_alphas=100, n_jobs=None, normalize=True,
        positive=False, precompute='auto', random_state=None,
        selection='cyclic', tol=0.0001, verbose=False)

In [None]:
lasso_cv_reg.alpha_

0.8689876273541082

In [None]:
lasso_tuned = Lasso(alpha= lasso_cv_reg.alpha_, max_iter= 5000, random_state= 0)
lasso_tuned.fit(X_train, y_train)

Lasso(alpha=0.8689876273541082, copy_X=True, fit_intercept=True, max_iter=5000,
      normalize=False, positive=False, precompute=False, random_state=0,
      selection='cyclic', tol=0.0001, warm_start=False)

In [None]:
y_pred_lasso_tune = lasso_tuned.predict(X_test)
print('r2 score: ' , r2_score(y_test, y_pred_lasso_tune))
print('Lasso Tuned model RMSE: {}'.format(np.sqrt(mean_squared_error(y_test, y_pred_lasso_tune))))

r2 score:  0.799849928252933
Lasso Tuned model RMSE: 5643.569205377171


#### Random Forest Regression Model Building

In [None]:
from sklearn.ensemble import RandomForestRegressor

rf_reg = RandomForestRegressor(n_estimators= 100, criterion= 'mae')
rf_reg.fit(X_train , y_train)

  after removing the cwd from sys.path.


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mae',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [None]:
y_pred_rf = rf_reg.predict(X_test)                     # y_pred = test set predicted profits and they are predicted on x_test = features of test set to get y_pred(label of test set)
np.set_printoptions(precision = 2)                        
df = np.concatenate((y_pred_rf.reshape(len(y_pred_rf),1),y_test.reshape(len(y_test),1)),axis = 1)
df = pd.DataFrame(df)
df

Unnamed: 0,0,1
0,10152.990353,9724.53000
1,9723.004387,8547.69130
2,44504.452045,45702.02235
3,13121.199389,12950.07120
4,10486.559448,9644.25250
...,...,...
263,16734.814117,15019.76005
264,6783.442690,6664.68595
265,11649.881877,20709.02034
266,43544.418502,40932.42950


In [None]:
from sklearn.metrics import mean_squared_error, r2_score
print('Score on training set: ',rf_reg.score(X_train, y_train))

print('r2 score: ' , r2_score(y_test, y_pred_rf))
print('Random Forest Regression model RMSE: {}'.format(np.sqrt(mean_squared_error(y_test, y_pred_rf))))

Score on training set:  0.9720198635588667
r2 score:  0.8798528859351975
Random Forest Regression model RMSE: 4372.528528694141


##### Hyperparameter Tuning for Random Forest

In [None]:
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 220, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 7, 10]
min_samples_leaf = [1, 2, 4,5]
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [None]:
from sklearn.model_selection import RandomizedSearchCV
rf_tune = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator = rf_tune, param_distributions = random_grid, n_iter = 100, cv = 3, verbose = 2,
                               random_state = 42, n_jobs = -1)
rf_random.fit(X_train , y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  7.2min finished
  self.best_estimator_.fit(X, y, **fit_params)


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100,
                              

In [None]:
rf_random.best_params_

{'bootstrap': True,
 'max_depth': 94,
 'max_features': 'auto',
 'min_samples_leaf': 5,
 'min_samples_split': 2,
 'n_estimators': 1155}

In [None]:
rf_tuned = RandomForestRegressor(bootstrap= True,
 max_depth= 94,
 max_features= 'auto',
 min_samples_leaf = 5,
 min_samples_split = 2,
 n_estimators= 1155)
rf_tuned.fit(X_train, y_train)

  import sys


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=94, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=5,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=1155, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
y_pred_rf_tune = rf_reg.predict(X_test) 

print('Score on training set: ',rf_tuned.score(X_train, y_train))
print('r2 score: ' , r2_score(y_test, y_pred_rf_tune))
print('Random Forest Regression Tuned model RMSE: {}'.format(np.sqrt(mean_squared_error(y_test, y_pred_rf_tune))))

Score on training set:  0.9026518367011778
r2 score:  0.8798528859351975
Random Forest Regression Tuned model RMSE: 4372.528528694141


##### Grid Search with Cross Validation

In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 100, 110,120, 150],
    'max_features': [ 3, 4, 5],
    'min_samples_leaf': [3, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200,500, 1000]
}
rf_grid = RandomForestRegressor()

grid_search = GridSearchCV(estimator = rf_grid, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [None]:
grid_search.fit(X_train, y_train)
grid_search.best_params_

Fitting 3 folds for each of 540 candidates, totalling 1620 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   19.8s
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 361 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 644 tasks      | elapsed:  6.6min
[Parallel(n_jobs=-1)]: Done 1009 tasks      | elapsed: 10.4min
[Parallel(n_jobs=-1)]: Done 1454 tasks      | elapsed: 15.0min
[Parallel(n_jobs=-1)]: Done 1620 out of 1620 | elapsed: 16.7min finished
  self.best_estimator_.fit(X, y, **fit_params)


{'bootstrap': True,
 'max_depth': 150,
 'max_features': 4,
 'min_samples_leaf': 5,
 'min_samples_split': 12,
 'n_estimators': 200}

In [None]:
rf_grid_tuned = RandomForestRegressor(bootstrap = True,
 max_depth = 150,
 max_features = 4,
 min_samples_leaf = 5,
 min_samples_split = 12,
 n_estimators = 200)
rf_grid_tuned.fit(X_train, y_train)

  import sys


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=150, max_features=4, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=5,
                      min_samples_split=12, min_weight_fraction_leaf=0.0,
                      n_estimators=200, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
y_pred_rf_tune_grid = rf_grid_tuned.predict(X_test) 

print('Score on training set: ',rf_grid_tuned.score(X_train, y_train))
print('r2 score: ' , r2_score(y_test, y_pred_rf_tune_grid))
print('Random Forest Regression Grid Cross CV Tuned model RMSE: {}'.format(np.sqrt(mean_squared_error(y_test, y_pred_rf_tune_grid))))

Score on training set:  0.8978015740276342
r2 score:  0.9009491834551663
Random Forest Regression Grid Cross CV Tuned model RMSE: 3970.1325802490733
