In [2]:
import numpy as np
import pandas as pd

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

import matplotlib.pyplot as plt

### <font color="orange">  **Imports, Data Loading and Preprocessing** </font>

In [24]:
# Render floats with two decimals whenever a DataFrame is displayed
pd.set_option('display.float_format', '{:.2f}'.format)

# Load the training file and the autograde file from the working directory
df = pd.read_csv('health_insurance_train.csv')
df_autograder = pd.read_csv('health_insurance_autograde.csv')

# Define a function to convert education ranges to numerical values
def convert_education(education):
    """Convert an education label into a numeric number of years.

    Parameters
    ----------
    education : str or any
        Raw label such as ``'13-15years'``, ``'>16years'`` or ``'<9years'``.
        Anything that is not a string (for example the float NaN pandas uses
        for a missing cell, or an already-numeric value) is returned
        unchanged instead of raising ``AttributeError``.

    Returns
    -------
    int or float
        ``20`` if the label contains ``'>16'``, ``4.5`` if it contains
        ``'<9'``, otherwise the midpoint of the ``low-high`` range (a label
        holding a single number maps to that number). Non-string input is
        passed through as-is.

    Raises
    ------
    ValueError
        If a string label cannot be parsed as a number or ``low-high`` range.
    """
    # Non-strings have no ``.replace``; pass them through so missing values
    # survive this step and can be imputed afterwards.
    if not isinstance(education, str):
        return education

    label = education.replace('years', '')

    # Open-ended ranges get a fixed representative value.
    if '>16' in label:
        return 20
    if '<9' in label:
        return 4.5

    # Closed range "low-high": use the midpoint. With no '-' present the
    # first and last parts are the same element, so the value maps to itself.
    bounds = label.split('-')
    return (float(bounds[0]) + float(bounds[-1])) / 2

# Define a function to convert yes/no values to 1/-1 (other values are left unchanged)
def yes_no_to_binary(value):
    """Map ``'yes'`` to 1 and ``'no'`` to -1; return anything else unchanged."""
    if value == 'yes':
        return 1
    if value == 'no':
        return -1
    # Not a yes/no answer: hand the value back untouched.
    return value

# Define a function to change NaN values to the mean of the column
def nan_to_mean(value, mean):
    """Return ``mean`` when ``value`` is missing according to ``pd.isna``,
    otherwise return ``value`` itself."""
    return mean if pd.isna(value) else value

# Replace the textual education labels with numeric year values
df['education'] = df['education'].apply(convert_education)

# Run the yes/no mapping over every column; cells that hold neither 'yes'
# nor 'no' pass through unchanged
for column in df.columns:
    df[column] = df[column].apply(yes_no_to_binary)


# Apply one-hot encoding to the race and region columns.
#
# One-hot encoding converts a categorical variable into one True/False
# column per category. The ML algorithms read True as 1 and False as 0, so
# each category gets its own column that can be weighted separately.
#
# Both columns are encoded in a single call. The encoded columns are appended
# in the order given by `columns` (race_* first, then reg_*), which matches
# the result of encoding them one after the other.
df = pd.get_dummies(
    df,
    columns=['race', 'region'],
    prefix={'race': 'race', 'region': 'reg'},
)


# Replace NaN values with the mean of their column.
#
# By this point the education, yes/no, race and region columns have been
# converted, so every column is expected to be numeric or boolean and a
# per-column mean is defined. `df.mean()` yields one mean per column and
# `fillna` aligns those means to the columns by name, so each NaN is filled
# with its own column's mean in one vectorised step.
df = df.fillna(df.mean())

df.head()



Unnamed: 0,whrswk,hhi,whi,hhi2,education,hispanic,experience,kidslt6,kids618,husby,race_black,race_other,race_white,reg_northcentral,reg_other,reg_south,reg_west
0,40.0,-1,1,1,14.0,-1,17.0,0.0,1.0,22.0,False,False,True,False,False,True,False
1,40.0,-1,1,1,14.0,-1,4.0,1.0,0.0,15.0,False,False,True,False,False,True,False
2,0.0,1,-1,1,16.0,-1,21.0,0.0,1.0,100.0,False,False,True,False,True,False,False
3,40.0,-1,-1,1,14.0,-1,22.0,0.34,0.7,60.0,False,False,True,True,False,False,False
4,35.0,-1,1,-1,12.0,-1,15.0,0.0,2.0,0.0,False,False,True,False,False,True,False


In [None]:
########### Pipeline Configuration ##############

# Boolean switch for the pipeline; presumably enables feature scaling in the
# cells that consume it — TODO confirm against its usage.
scale_data = True


# NOTE(review): each preset below lists five boolean values, but this cell
# defines only one flag (scale_data). Which setting each position refers to
# cannot be determined from this cell — confirm and document.
#Pipeline 1 --> True,False,False,True,False
#Pipeline 2 --> True,False,False,True,True
#Pipeline 3 --> False,True,False,True,False

#################################################