## <center>Health Insurance: Data vs. Reality - A Story</center>



In [19]:
import os
import numpy
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.metrics import mean_absolute_error, r2_score
from pathlib import Path
# Directory holding the dataset; Path() with no arguments is the current directory.
path = Path()

# Join with the `/` operator so the value really is a Path, as annotated.
# (An f-string here would silently produce a plain str instead.)
filepath: Path = path / "insurance.csv"

In [27]:
def load_data(path: Path) -> pd.DataFrame:
    """Read the CSV at ``path`` and return its contents as a DataFrame.

    ``path`` is passed to ``pandas.read_csv`` unchanged.
    """
    frame = pd.read_csv(path)
    return frame

def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with two derived columns appended.

    * ``obese``: 1 where ``bmi`` is at least 30, otherwise 0.
    * ``age_group``: ``age`` bucketed into "18–30", "31–45", "46–60" and "61+".

    The input frame itself is not modified.
    """
    # bucket edges are right-inclusive: [0, 30], (30, 45], (45, 60], (60, 200]
    age_edges = [0, 30, 45, 60, 200]
    age_labels = ["18–30", "31–45", "46–60", "61+"]

    is_obese = df["bmi"] >= 30
    age_bucket = pd.cut(
        df["age"],
        bins=age_edges,
        labels=age_labels,
        right=True,
        include_lowest=True,
    )

    # attach the derived columns to a copy so the caller's frame stays intact
    enriched = df.copy()
    enriched["obese"] = is_obese.astype(int)  # 1: obese, 0: not obese
    enriched["age_group"] = age_bucket
    return enriched

In [28]:
def main():
    """Load the CSV at ``filepath``, add the engineered columns, and print the frame."""
    # load the data, then run feature engineering on it
    features = engineer_features(load_data(filepath))

    print(features)

if __name__ == "__main__":
    main()

      age     sex     bmi  children smoker     region      charges  obese  \
0      19  female  27.900         0    yes  southwest  16884.92400      0   
1      18    male  33.770         1     no  southeast   1725.55230      1   
2      28    male  33.000         3     no  southeast   4449.46200      1   
3      33    male  22.705         0     no  northwest  21984.47061      0   
4      32    male  28.880         0     no  northwest   3866.85520      0   
...   ...     ...     ...       ...    ...        ...          ...    ...   
1333   50    male  30.970         3     no  northwest  10600.54830      1   
1334   18  female  31.920         0     no  northeast   2205.98080      1   
1335   18  female  36.850         0     no  southeast   1629.83350      1   
1336   21  female  25.800         0     no  southwest   2007.94500      0   
1337   61  female  29.070         0    yes  northwest  29141.36030      0   

     age_group  
0        18–30  
1        18–30  
2        18–30  
3      