## Insurance Premium Prediction

Given *data about insurance customers*, let's try to predict the **premium charges** a given customer will incur.

We will use a linear regression model within a scikit-learn pipeline to make our predictions.

We will design an interactive widget that will allow us to make predictions.

Data source: https://www.kaggle.com/datasets/noordeen/insurance-premium-prediction

### Getting Started

In [37]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestRegressor

import ipywidgets as widgets

from IPython.display import display

In [38]:
data = pd.read_csv('insurance.csv')
data

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86
...,...,...,...,...,...,...,...
1333,50,male,31.0,3,no,northwest,10600.55
1334,18,female,31.9,0,no,northeast,2205.98
1335,18,female,36.9,0,no,southeast,1629.83
1336,21,female,25.8,0,no,southwest,2007.95


In [39]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   expenses  1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


### Preprocessing

In [40]:
df = data.copy()

In [41]:
# Split df into X and y
y = df['expenses'].copy()
X = df.drop('expenses', axis=1).copy()

In [42]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle = True, random_state=1)

### Building the Pipeline and Training

In [43]:
nominal_features = ['sex', 'smoker', 'region']

nominal_transformer = Pipeline(
    steps=
    [
        ('onehot', OneHotEncoder(sparse_output=False, drop='if_binary'))
         ])

preprocessor = ColumnTransformer(
    transformers=[
        ('nominal', nominal_transformer, nominal_features)
    ], remainder='passthrough'
)

In [44]:
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('regressor', RandomForestRegressor())
])

In [45]:
model.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('scaler', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('nominal', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,'if_binary'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


### Results

In [46]:
y_pred = model.predict(X_test)

In [47]:
y_pred

array([ 1774.8419, 12971.9655,  9117.7519, 10461.3783,  2332.1505,
       39166.3069,  9669.5383, 11663.4237,  4342.5523, 20049.3883,
       16576.8029, 15138.4419,  7026.5251,  7020.0039,  2334.978 ,
       10794.8044,  5489.9176,  6661.5394, 15732.5396, 13641.0872,
        9971.8123, 39388.1509, 12904.2983,  9894.4197, 14971.2686,
        6583.0569,  9281.7742,  9818.2108,  7887.1163,  4400.9444,
       13207.2783,  6475.2435, 25630.7441, 34944.6941, 25470.2792,
       14572.7568, 37860.2714, 16921.1881, 14760.0971, 45321.637 ,
        7231.2973, 11778.6571, 10760.6756, 18475.6816,  7064.1055,
       13070.2961,  1704.1055, 34797.1537,  9723.8052, 17832.0301,
       15691.0489, 16661.6592,  5855.9388, 10547.1654, 17829.085 ,
        5819.5885, 40607.0585, 13803.4135,  9234.9936,  1459.112 ,
        4244.0982, 13232.9654, 23810.3118,  6243.7167, 15083.1329,
        9516.1235, 10523.489 , 13226.972 ,  2542.3391, 16937.926 ,
       44262.5643, 39405.3327,  2216.3747, 14873.3279, 13517.3

In [48]:
y_test

559      1646.43
1087    11353.23
1020     8798.59
460     10381.48
802      2103.08
          ...   
323     11566.30
1268     1880.49
134      2457.21
1274    17043.34
876     26140.36
Name: expenses, Length: 402, dtype: float64

In [49]:
rmse = np.sqrt(np.mean((y_test - y_pred)**2))
rmse

np.float64(4917.27841967216)

In [50]:
y_test.describe()

count      402.000000
mean     13255.809154
std      11919.315632
min       1131.510000
25%       4936.177500
50%       9830.960000
75%      16694.947500
max      60021.400000
Name: expenses, dtype: float64

In [51]:
np.sum((y_test - y_test.mean())**2)   # Sum of Squared Errors

np.float64(56970104140.256516)

In [52]:
r2 = 1 - np.sum((y_test - y_pred)**2) / np.sum((y_test - y_test.mean())**2)  # r2 score

In [53]:
print("RMSE: {:.2f}".format(rmse))
print("R^2 Score: {:.5f}".format(r2))

RMSE: 4917.28
R^2 Score: 0.82938


#### Interactive Widget 

In [54]:
widgets.IntSlider()

IntSlider(value=0)

In [55]:
widgets.FloatSlider()

FloatSlider(value=0.0)

In [56]:
widgets.ToggleButtons()

ToggleButtons(options=(), value=None)

In [57]:
X_train.describe()

Unnamed: 0,age,bmi,children
count,936.0,936.0,936.0
mean,38.82265,30.684829,1.104701
std,14.029097,6.087874,1.222664
min,18.0,16.0,0.0
25%,26.0,26.275,0.0
50%,38.0,30.5,1.0
75%,51.0,34.725,2.0
max,64.0,53.1,5.0


In [58]:
{column: list(X_train[column].unique()) for column in X_train.select_dtypes('object').columns}

{'sex': ['male', 'female'],
 'smoker': ['no', 'yes'],
 'region': ['northwest', 'southwest', 'northeast', 'southeast']}

In [65]:
age_widget = widgets.IntSlider(
    value=38,
    min=18,
    max=64,
    step=1,
    description='Age'
)

In [67]:
children_widget = widgets.IntSlider(
    value=1,
    min=0,
    max=5,
    step=1,
    description='Children'
)

In [68]:
bmi_widget = widgets.FloatSlider(
    value=30,
    min=15,
    max=54,
    step=0.01,
    description='BMI'
)

In [69]:
sex_widget = widgets.ToggleButtons(
    options=['female', 'male'],
    description='Sex'
)

In [72]:
smoker_widget = widgets.ToggleButtons(
    options=['no', 'yes'],
    description='Smoker'
)

In [73]:
region_widget = widgets.Dropdown(
    options=['northeast', 'northwest', 'southeast', 'southwest'],
    description='Region'
)

In [109]:
def make_prediction(btn):
    x = pd.DataFrame({
        'age': age_widget.value,
        'sex': sex_widget.value,
        'bmi': bmi_widget.value,
        'smoker': smoker_widget.value,
        'children': children_widget.value,
        'region': region_widget.value
    }, index=[0])

    prediction = model.predict(x)

    with prediction_out:
        prediction_out.clear_output()
        print("Prediction: {:.4f}".format(prediction[0]))

predict_btn = widgets.Button(
    description='Predict'
)

predict_btn.on_click(make_prediction)

prediction_out = widgets.Output()

display(age_widget, bmi_widget, children_widget, sex_widget, smoker_widget, region_widget, predict_btn, prediction_out)

IntSlider(value=47, description='Age', max=64, min=18)

FloatSlider(value=37.96, description='BMI', max=54.0, min=15.0, step=0.01)

IntSlider(value=2, description='Children', max=5)

ToggleButtons(description='Sex', options=('female', 'male'), value='female')

ToggleButtons(description='Smoker', options=('no', 'yes'), value='no')

Dropdown(description='Region', options=('northeast', 'northwest', 'southeast', 'southwest'), value='northeast'…

Button(description='Predict', style=ButtonStyle())

Output()

<div class='alert alert-block alert-info'><strong>Note:</strong> This Widget can only be accessed in an interactive session of the notebook.</div>