## Insurance Premium Prediction

Given data about insurance customers, let's try to predict the **premium charges** a given customer will incur.

We will use a linear regression model within a scikit-learn pipeline to make our predictions.
We will design an interactive widget that will allow us to make predictions. 

Data Source: https://www.kaggle.com/datasets/simranjain17/insurance

### Importing Libraries

In [25]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestRegressor

import ipywidgets as widgets
from IPython.display import display

In [2]:
data = pd.read_csv("insurance.csv")
data

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


### Preprocessing

In [4]:
def preprocess_inputs(df):
    df = df.copy()

    # Split df into X and y
    y = df['charges']
    X = df.drop('charges', axis=1)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)

    return X_train, X_test, y_train, y_test

In [5]:
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [6]:
X_train

Unnamed: 0,age,sex,bmi,children,smoker,region
744,50,male,26.410,0,no,northwest
363,21,female,26.400,1,no,southwest
10,25,male,26.220,0,no,northeast
970,50,female,28.160,3,no,southeast
634,51,male,39.700,1,no,southwest
...,...,...,...,...,...,...
715,60,male,28.900,0,no,southwest
905,26,female,29.355,2,no,northeast
1096,51,female,34.960,2,yes,northeast
235,40,female,22.220,2,yes,southeast


In [7]:
y_train

744      8827.20990
363      2597.77900
10       2721.32080
970     10702.64240
634      9391.34600
           ...     
715     12146.97100
905      4564.19145
1096    44641.19740
235     19444.26580
1061    11554.22360
Name: charges, Length: 936, dtype: float64

### Building the Pipeline and Training

In [19]:
nominal_features = ["sex", "smoker", "region"]

nominal_transformer = Pipeline(steps = [
    ('onehot', OneHotEncoder(sparse_output=False, drop='if_binary'))
])

preprocessor = ColumnTransformer(transformers=[
    ('nominal', nominal_transformer, nominal_features)
], remainder="passthrough")

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('regressor', RandomForestRegressor())
])

In [20]:
model.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



### Results

In [21]:
y_pred = model.predict(X_test)
y_pred

array([ 1832.3983213 , 11815.074678  ,  9027.064192  , 10750.7012258 ,
        2490.0637676 , 39958.4063322 , 10029.5477922 , 11652.0751305 ,
        3579.7651563 , 20763.5365675 , 16433.4349007 , 13222.7575798 ,
        6974.9444824 ,  7152.2345558 ,  1885.6603205 , 11349.3423004 ,
        6259.5942232 ,  6829.80135   , 15895.5983642 , 13462.940167  ,
        9922.5059448 , 40206.252239  , 10805.3034281 ,  9381.9674178 ,
       15731.7294414 ,  6743.1316105 ,  8531.9991381 ,  9110.5582046 ,
        6989.3431867 ,  4506.7914651 , 13730.8015653 ,  7260.1946791 ,
       26591.8156779 , 34791.3157463 , 25013.384792  , 15333.1569154 ,
       38812.4077343 , 17007.8921162 , 14247.6852912 , 47481.0380584 ,
        7756.4612745 , 11720.702314  , 11331.9798703 , 18062.5263089 ,
        7758.7849335 , 12719.5866085 ,  1868.9337145 , 34843.550163  ,
       10347.1956598 , 17723.5733326 , 16701.9447455 , 15817.378262  ,
        7397.9298803 , 11003.9492884 , 17627.2671525 ,  6045.398765  ,
      

In [22]:
y_test

559      1646.42970
1087    11353.22760
1020     8798.59300
460     10381.47870
802      2103.08000
           ...     
323     11566.30055
1268     1880.48700
134      2457.21115
1274    17043.34140
876     26140.36030
Name: charges, Length: 402, dtype: float64

In [35]:
rmse = np.sqrt(np.mean((y_test - y_pred)**2))
r2 = 1 - (np.sum((y_test - y_pred)**2) / np.sum((y_test - y_test.mean())**2))

print("RMSE: {:.2f}".format(rmse))
print(" R^2: {:.5f}".format(r2))

RMSE: 4864.02
 R^2: 0.83306


### Interactive Widget

In [39]:
X_train.describe()

Unnamed: 0,age,bmi,children
count,936.0,936.0,936.0
mean,38.82265,30.682185,1.104701
std,14.029097,6.087997,1.222664
min,18.0,15.96,0.0
25%,26.0,26.2725,0.0
50%,38.0,30.495,1.0
75%,51.0,34.7175,2.0
max,64.0,53.13,5.0


In [40]:
{column: list(X_train[column].unique()) for column in X_train.select_dtypes('object').columns}

{'sex': ['male', 'female'],
 'smoker': ['no', 'yes'],
 'region': ['northwest', 'southwest', 'northeast', 'southeast']}

In [83]:
age_widget = widgets.IntSlider(
    value=38,
    min=18,
    max=64,
    step=1,
    description='Age:'
)

children_widget = widgets.IntSlider(
    value=38,
    min=0,
    max=5,
    step=1,
    description='Children:'
)

bmi_widget = widgets.FloatSlider(
    value=38,
    min=15,
    max=54,
    step=0.01,
    description='BMI:'
)

sex_widget = widgets.ToggleButtons(
    options=['female', 'male'],
    description="Sex:"
)

smoker_widget = widgets.ToggleButtons(
    options=['no', 'yes'],
    description="Smoker:"
)

region_widget = widgets.Dropdown(
    options=['northeast', 'northwest', 'southeast', 'southwest'],
    description='Region:'
)

predict_btn = widgets.Button(
    description="Predict"
)

prediction_out = widgets.Output()

def make_prediction(btn):
    x = pd.DataFrame({
        'age':      age_widget.value,
        'sex':      sex_widget.value,
        'bmi':      bmi_widget.value,
        'children': children_widget.value,
        'smoker':   smoker_widget.value,
        'region':   region_widget.value
    }, index=[0])
    
    prediction = model.predict(x)

    with prediction_out:
        prediction_out.clear_output()
        print("Prediction: {:.4f}".format(prediction[0]))

predict_btn.on_click(make_prediction)

display(predict_btn, prediction_out, age_widget, bmi_widget, children_widget, sex_widget, smoker_widget, region_widget)

Button(description='Predict', style=ButtonStyle())

Output()

IntSlider(value=38, description='Age:', max=64, min=18)

FloatSlider(value=38.0, description='BMI:', max=54.0, min=15.0, step=0.01)

IntSlider(value=5, description='Children:', max=5)

ToggleButtons(description='Sex:', options=('female', 'male'), value='female')

ToggleButtons(description='Smoker:', options=('no', 'yes'), value='no')

Dropdown(description='Region:', options=('northeast', 'northwest', 'southeast', 'southwest'), value='northeast…

<div class="alert alert-block alert-info"><strong>Note:</strong>This widget can only be accessed in an interactive session of the notebook. </div>