In [1]:
#Import Libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
#Importing The Dataset
dataset = pd.read_csv('insurance charges.csv')
dataset.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
#Missing Data ?
dataset.isna().any().any()

False

In [4]:
#Get Statistical data
dataset.describe

<bound method NDFrame.describe of       age     sex     bmi  children smoker     region      charges
0      19  female  27.900         0    yes  southwest  16884.92400
1      18    male  33.770         1     no  southeast   1725.55230
2      28    male  33.000         3     no  southeast   4449.46200
3      33    male  22.705         0     no  northwest  21984.47061
4      32    male  28.880         0     no  northwest   3866.85520
...   ...     ...     ...       ...    ...        ...          ...
1333   50    male  30.970         3     no  northwest  10600.54830
1334   18  female  31.920         0     no  northeast   2205.98080
1335   18  female  36.850         0     no  southeast   1629.83350
1336   21  female  25.800         0     no  southwest   2007.94500
1337   61  female  29.070         0    yes  northwest  29141.36030

[1338 rows x 7 columns]>

In [5]:
#Create X features and y targets
X = dataset.iloc[:, 0:6].values
y = dataset.iloc[:, 6].values

In [6]:
print(X)

[[19 'female' 27.9 0 'yes' 'southwest']
 [18 'male' 33.77 1 'no' 'southeast']
 [28 'male' 33.0 3 'no' 'southeast']
 ...
 [18 'female' 36.85 0 'no' 'southeast']
 [21 'female' 25.8 0 'no' 'southwest']
 [61 'female' 29.07 0 'yes' 'northwest']]


In [7]:
print(y)

[16884.924   1725.5523  4449.462  ...  1629.8335  2007.945  29141.3603]


In [8]:
print('X Shape: ', X.shape)
print('y Shape: ', y.shape)

X Shape:  (1338, 6)
y Shape:  (1338,)


In [9]:
#Encoding Categorical Data (Gender Column)
le = LabelEncoder()
X[:, 1] = le.fit_transform(X[:, 1])
print(X)

[[19 0 27.9 0 'yes' 'southwest']
 [18 1 33.77 1 'no' 'southeast']
 [28 1 33.0 3 'no' 'southeast']
 ...
 [18 0 36.85 0 'no' 'southeast']
 [21 0 25.8 0 'no' 'southwest']
 [61 0 29.07 0 'yes' 'northwest']]


In [10]:
#Encoding categorical Data (Smoker Column)
X[:, 4] = le.fit_transform(X[:, 4])
print(X)

[[19 0 27.9 0 1 'southwest']
 [18 1 33.77 1 0 'southeast']
 [28 1 33.0 3 0 'southeast']
 ...
 [18 0 36.85 0 0 'southeast']
 [21 0 25.8 0 0 'southwest']
 [61 0 29.07 0 1 'northwest']]


In [11]:
#One Hot Encode region Column
ct_region = ColumnTransformer(transformers=[('region', OneHotEncoder(), [5])], remainder='passthrough')
X = ct_region.fit_transform(X)
print(X)

[[0.0 0.0 0.0 ... 27.9 0 1]
 [0.0 0.0 1.0 ... 33.77 1 0]
 [0.0 0.0 1.0 ... 33.0 3 0]
 ...
 [0.0 0.0 1.0 ... 36.85 0 0]
 [0.0 0.0 0.0 ... 25.8 0 0]
 [0.0 1.0 0.0 ... 29.07 0 1]]


In [12]:
#Splitting dataset Into Training & Testing Data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.2,
                                                    random_state=0)

In [13]:
from xgboost import XGBRegressor
regressor = XGBRegressor(random_state=42, silent=True)
regressor.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=True, subsample=1, verbosity=1)

In [14]:
#Predicting The Test Results
y_pred = regressor.predict(X_test)
results_comparison = pd.DataFrame({"Actual Costs": y_test.flatten(), "Predicted Costs": y_pred.flatten().round(2)})
results_comparison

Unnamed: 0,Actual Costs,Predicted Costs
0,9724.53000,12687.620117
1,8547.69130,10116.120117
2,45702.02235,46235.839844
3,12950.07120,14842.809570
4,9644.25250,12036.009766
...,...,...
263,15019.76005,17838.000000
264,6664.68595,7342.319824
265,20709.02034,11739.879883
266,40932.42950,40649.710938


In [15]:
from sklearn import metrics 
print('Train Score: {:.2f} %'.format(regressor.score(X_train, y_train) * 100))  
print('Test Score: {:.2f} %'.format(regressor.score(X_test, y_test) *100))

Train Score: 89.06 %
Test Score: 89.91 %


In [16]:
print("Mean Absolute Error", metrics.mean_absolute_error(y_test, y_pred))
print("Mean Squared Error", metrics.mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error", np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error 2417.2510406013002
Mean Squared Error 16059852.73093341
Root Mean Squared Error 4007.4746076467422


In [17]:
#Applying K-fold Cross validation 
from sklearn.model_selection import cross_val_score
accs = cross_val_score(estimator = regressor, X = X_train, y = y_train, cv = 10)
print('Accuracy: {:.2f} %'.format(accs.mean()*100))
print('Standard Deviation: {:.2f} %'.format(accs.std()*100))

Accuracy: 84.41 %
Standard Deviation: 4.07 %
