In [1]:
from matplotlib.pylab import rcParams
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import os
import opendatasets as od
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import tree
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
import xgboost as xbg
import scipy.stats as stats

In [2]:
# Assign the Kaggle data set URL into variable
dataset = 'https://www.kaggle.com/datasets/rahulvyasm/medical-insurance-cost-prediction'
# Using opendatasets let's download the data sets
od.download(dataset )

Skipping, found downloaded files in ".\medical-insurance-cost-prediction" (use force=True to force download)


In [3]:
df = pd.read_csv("D:/DataAnalysis/medical-insurance-cost-prediction/medical_insurance.csv")
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2772 entries, 0 to 2771
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       2772 non-null   int64  
 1   sex       2772 non-null   object 
 2   bmi       2772 non-null   float64
 3   children  2772 non-null   int64  
 4   smoker    2772 non-null   object 
 5   region    2772 non-null   object 
 6   charges   2772 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 151.7+ KB


In [5]:
label_encoder = preprocessing.LabelEncoder()
df['sex']= label_encoder.fit_transform(df['sex']) 
df['smoker']= label_encoder.fit_transform(df['smoker']) 
df['region']= label_encoder.fit_transform(df['region']) 
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


In [6]:
X = df[['age','sex','bmi','children','smoker','region']]
y= df['charges']
# using the train test split function
X_train, X_test, y_train, y_test = train_test_split(X,y ,  random_state=10,  test_size=0.25, shuffle=True)

In [None]:
clf = tree.DecisionTreeRegressor()
clf = clf.fit(X_train, y_train)

In [None]:
#  Plot Decision Tree 
# rcParams['figure.figsize'] = 160,100
# tree.plot_tree(clf)

In [None]:
yPred= clf.predict(X_test)

In [None]:
print(np.sqrt(metrics.mean_squared_error(y_test, yPred)))
print(metrics.mean_absolute_error(y_test, yPred))

In [None]:
metrics.r2_score(y_test, yPred)

In [None]:
# Applying grid search for DT
parameters = {
              'max_depth':[10,12,None],
              'min_samples_leaf':[10,15,20,30 ],
              'min_samples_split':[ 90, 100,110],
            'criterion' : ['squared_error', 'friedman_mse', 'absolute_error', 'poisson']
             }
clfN = tree.DecisionTreeRegressor()
# model_DT =RandomizedSearchCV( clf, parameters,cv = 5,n_jobs = -1, verbose=True, scoring= 'r2',error_score='raise',return_train_score=True)
model_DT = GridSearchCV( estimator = clfN,param_grid= parameters,cv = 5,n_jobs = -1, verbose=True, scoring= 'r2')
model_DT.fit(X_train, y_train)

In [None]:
# model_DT.cv_results_

In [None]:
model_DT.best_estimator_

In [None]:
model_DT.best_score_

In [None]:
yPredGD= model_DT.predict(X_test)
print(np.sqrt(metrics.mean_squared_error(y_test, yPredGD)))
print(metrics.mean_absolute_error(y_test, yPredGD))

In [None]:
metrics.r2_score(y_test, yPredGD)

In [None]:
# Applying grid search for Random Search

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Use the random grid to search for best hyperparameters
# First create the base model to tune
clf_rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
model_RF = RandomizedSearchCV(estimator = clf_rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1,scoring ='r2' )
# Fit the random search model
model_RF.fit(X_train, y_train)

In [None]:
model_RF.best_score_

In [None]:
model_RF.best_estimator_

In [None]:
yPredRF= model_RF.predict(X_test)
print(np.sqrt(metrics.mean_squared_error(y_test, yPredRF)))
print(metrics.mean_absolute_error(y_test, yPredRF))

In [None]:
metrics.r2_score(y_test, yPredRF)

In [7]:
# Number of trees in xgb
n_estimators = [int(x) for x in np.linspace(start = 300, stop = 3000, num = 10)]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
learning_rate = [0.1,0.2,0.3,0.4,0.5,0.6]
booster = ['gbtree','gblinear']
max_bin = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
validate_parameters = True
# Min sample split
gamma = [2, 5, 10,15,20]
# Sampling of data
subsample = [0.1,0.2,0.3,0.4,0.5,0.6]
lambdaReg = [int(x) for x in np.linspace(0, 50, num = 10)]
alpha = [int(x) for x in np.linspace(0, 50, num = 10)]
tree_method = ['auto','exact','approx','hist']
# Create the  random search cv
parameter_grid = {
               'max_bin': max_bin,
               'n_estimators':n_estimators,
               'max_depth': max_depth,
               'learning_rate': learning_rate,
               'booster': booster,
               'gamma':gamma,
               'subsample':subsample,
               'lambda':lambdaReg,
               'alpha': alpha,
               'tree_method':tree_method
             }
clf_XGB = xbg.XGBRegressor(verbosity =1,validate_parameters = True,n_jobs=-1)
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
model_xbg = RandomizedSearchCV(estimator = clf_XGB, param_distributions = parameter_grid, n_iter=100, cv = 3, verbose=2, random_state=42, n_jobs = -1,scoring ='r2' )
# model_xbg = GridSearchCV( estimator = clf_XGB, param_grid= parameter_grid, cv = 3, n_jobs = -1, verbose=True, scoring= 'r2')
# Fit the random search model
model_xbg.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [9]:
model_xbg.best_score_

0.9132887196554272

In [8]:
model_xbg.best_estimator_