## Model Building

In [1]:
import pickle

import pandas as pd

from sklearn.base import clone
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [2]:
# Raw string: '\i' is not a valid escape sequence, so the plain literal relied
# on Python passing the backslash through unchanged and triggers an
# invalid-escape warning on newer interpreters. The path value is unchanged.
# NOTE(review): the leading backslash looks like the remnant of a longer
# Windows path -- confirm where insurance.csv actually lives.
data = pd.read_csv(r'\insurance.csv')
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


In [3]:
def preprocess_inputs(df, test_size=0.2, random_state=100):
    """Split the data into train/test sets and build an unfitted preprocessor.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. Must contain the target column 'expenses'; the returned
        preprocessor refers to the columns 'age', 'bmi', 'children', 'sex',
        'smoker' and 'region'. ``df`` is only read, never modified.
    test_size : float, default 0.2
        Passed to ``train_test_split``; the default matches the value that
        was previously hard-coded.
    random_state : int, default 100
        Passed to ``train_test_split``; the default matches the value that
        was previously hard-coded.

    Returns
    -------
    x_train, x_test, y_train, y_test
        The feature/target split produced by ``train_test_split``.
    preprocessor : ColumnTransformer
        Unfitted transformer that applies ``StandardScaler`` to the numerical
        columns and ``OneHotEncoder(handle_unknown='ignore')`` to the
        categorical columns. It is not fitted here, so no statistics from
        the test split are used.
    """
    # Separate the target from the features (drop returns a new frame).
    features = df.drop('expenses', axis=1)
    target = df['expenses']

    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )

    # Column groups handled by the two transformers.
    cat_cols = ['sex', 'smoker', 'region']
    num_cols = ['age', 'bmi', 'children']

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), num_cols),
            # handle_unknown='ignore': categories unseen during fit are
            # encoded as all zeros instead of raising at predict time.
            ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
        ]
    )

    return x_train, x_test, y_train, y_test, preprocessor

In [4]:
# Unpack the train/test split and the (still unfitted) ColumnTransformer
# returned by preprocess_inputs.
x_train, x_test, y_train, y_test, preprocessor = preprocess_inputs(data)

# Creating Pipelines for Models

In [5]:
# Seed for the stochastic estimators (tree, forest, boosting) so that a
# Restart & Run All reproduces the same scores.
MODEL_SEED = 100

# Each pipeline gets its own clone of the preprocessor. Pipeline.fit fits its
# steps in place, so sharing one ColumnTransformer instance would make every
# pipeline depend on whichever pipeline was fitted last.
pipeline_lr = Pipeline([('preprocessor', clone(preprocessor)),
                        ('linear_regressor', LinearRegression())])
pipeline_dt = Pipeline([('preprocessor', clone(preprocessor)),
                        ('decision_tree_regressor', DecisionTreeRegressor(random_state=MODEL_SEED))])
pipeline_knn = Pipeline([('preprocessor', clone(preprocessor)),
                         ('knn', KNeighborsRegressor(n_neighbors=7))])
pipeline_rf = Pipeline([('preprocessor', clone(preprocessor)),
                        ('random_forest', RandomForestRegressor(random_state=MODEL_SEED))])
pipeline_gb = Pipeline([('preprocessor', clone(preprocessor)),
                        ('gradient_boosting', GradientBoostingRegressor(random_state=MODEL_SEED))])

In [6]:
# The list order must match the index -> name mapping in pipe_dict, so that
# pipelines[i] is the model that pipe_dict[i] names. (Previously the list was
# ordered dt, gb, knn, lr, rf, which mislabelled every entry except KNN.)
pipelines = [pipeline_lr, pipeline_dt, pipeline_knn, pipeline_rf, pipeline_gb]
pipe_dict = {0: 'Linear Regression', 1: 'Decision Tree', 2: 'KNN', 3: 'Random forest', 4: 'Gradient boosting'}

In [7]:
# Fit every pipeline in the list on the training split.
for pipe in pipelines:
    pipe.fit(x_train,y_train)

# Checking Model Performance using evaluation metrics

**1. Linear Regression** 

In [8]:
pred_lr = pipeline_lr.predict(x_test)
linear_reg_mse = mean_squared_error(y_test, pred_lr)
# RMSE is the square root of the MSE. Computing it directly avoids the
# `squared=False` argument, which was deprecated in scikit-learn 1.4 and is
# rejected by later releases.
linear_reg_rmse = linear_reg_mse ** 0.5
linear_reg_r2_score = r2_score(y_test, pred_lr)

# Evaluation Metrics
print("The Mean Squared Error using Linear Regression : {}".format(linear_reg_mse))
print('The Root Mean Squared Error using Linear Regression : {}'.format(linear_reg_rmse))
print('The r2_score using Linear Regression : {}'.format(linear_reg_r2_score))

The Mean Squared Error using Linear Regression : 32156084.838352986
The Root Mean Squared Error using Linear Regression : 5670.633548233653
The r2_score using Linear Regression : 0.794933500510842


**2. Decision Tree**

In [9]:
pred_dt = pipeline_dt.predict(x_test)
decision_tree_mse = mean_squared_error(y_test, pred_dt)
# RMSE is the square root of the MSE. Computing it directly avoids the
# `squared=False` argument, which was deprecated in scikit-learn 1.4 and is
# rejected by later releases.
decision_tree_rmse = decision_tree_mse ** 0.5
decision_tree_r2_score = r2_score(y_test, pred_dt)

# Evaluation Metrics
print("The Mean Squared Error using Decision Tree Regressor : {}".format(decision_tree_mse))
print("The Root Mean Squared Error using Decision Tree Regressor : {}".format(decision_tree_rmse))
print("The r2_score using Decision Tree Regressor : {}".format(decision_tree_r2_score))

The Mean Squared Error using Decision Tree Regressor : 43710038.82394553
The Root Mean Squared Error using Decision Tree Regressor : 6611.356806582558
The r2_score using Decision Tree Regressor : 0.7212513681556513


**3. KNN**

In [10]:
pred_knn = pipeline_knn.predict(x_test)
knn_mse = mean_squared_error(y_test, pred_knn)
# RMSE is the square root of the MSE. Computing it directly avoids the
# `squared=False` argument, which was deprecated in scikit-learn 1.4 and is
# rejected by later releases.
knn_rmse = knn_mse ** 0.5
knn_r2_score = r2_score(y_test, pred_knn)

# Evaluation Metrics
print("The mean squared error using KNN is {}".format(knn_mse))
print("The root mean squared error using KNN is {}".format(knn_rmse))
print("The r2_score using KNN is {}".format(knn_r2_score))

The mean squared error using KNN is 34683548.07230548
The root mean squared error using KNN is 5889.2739851619635
The r2_score using KNN is 0.7788153057561124


**4. Random Forest**

In [11]:
pred_rf = pipeline_rf.predict(x_test)
random_forest_mse = mean_squared_error(y_test, pred_rf)
# RMSE is the square root of the MSE. Computing it directly avoids the
# `squared=False` argument, which was deprecated in scikit-learn 1.4 and is
# rejected by later releases.
random_forest_rmse = random_forest_mse ** 0.5
random_forest_r2_score = r2_score(y_test, pred_rf)

# Evaluation Metrics
print("The Mean Squared Error using Random Forest Regressor : {}".format(random_forest_mse))
print("The Root Mean Squared Error using Random Forest Regressor : {}".format(random_forest_rmse))
# r2 is a goodness-of-fit score, not an error, so the label no longer says "Error".
print("The r2_score using Random Forest Regressor : {}".format(random_forest_r2_score))

The Mean Squared Error using Random Forest Regressor : 19686049.44276266
The Root Mean Squared Error using Random Forest Regressor : 4436.896375030936
The r2_score Error using Random Forest Regressor : 0.8744576876105605


**5. Gradient Boosting**

In [12]:
pred_gb = pipeline_gb.predict(x_test)
gradient_boosting_mse = mean_squared_error(y_test, pred_gb)
# RMSE is the square root of the MSE. Computing it directly avoids the
# `squared=False` argument, which was deprecated in scikit-learn 1.4 and is
# rejected by later releases.
gradient_boosting_rmse = gradient_boosting_mse ** 0.5
gradient_boosting_r2_score = r2_score(y_test, pred_gb)

# Evaluation Metrics
print("The Mean Squared Error using Gradient Boosting Regressor : {}".format(gradient_boosting_mse))
print("The Root Mean Squared Error using Gradient Boosting Regressor : {}".format(gradient_boosting_rmse))
print("The r2_score using Gradient Boosting Regressor : {}".format(gradient_boosting_r2_score))

The Mean Squared Error using Gradient Boosting Regressor : 15647810.508128764
The Root Mean Squared Error using Gradient Boosting Regressor : 3955.731349337157
The r2_sccore using Gradient Boosting Regressor : 0.9002104347683397


In [13]:
# One row per model: (name, RMSE on the test split, r2 on the test split).
models = pd.DataFrame(
    [
        ('Linear Regression', linear_reg_rmse, linear_reg_r2_score),
        ('Decision Tree', decision_tree_rmse, decision_tree_r2_score),
        ('Random Forest', random_forest_rmse, random_forest_r2_score),
        ('Gradient Boosting', gradient_boosting_rmse, gradient_boosting_r2_score),
        ('KNN', knn_rmse, knn_r2_score),
    ],
    columns=['Model', 'RMSE', 'r2_score'],
)

# Show the comparison with the lowest RMSE first; `models` itself stays unsorted.
models.sort_values(by='RMSE', ascending=True, ignore_index=True)

Unnamed: 0,Model,RMSE,r2_score
0,Gradient Boosting,3955.731349,0.90021
1,Random Forest,4436.896375,0.874458
2,Linear Regression,5670.633548,0.794934
3,KNN,5889.273985,0.778815
4,Decision Tree,6611.356807,0.721251


As we can see from the evaluation metrics, both RMSE and r2_score are quite good for the Gradient Boosting and Random Forest algorithms.<br>
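Among all the regressors, the Decision Tree performs the worst. This is probably because a single, unconstrained decision tree tends to overfit the training data and therefore generalizes poorly when predicting continuous values; the tree ensembles (Random Forest and Gradient Boosting) combine many trees and largely avoid this problem, which is why they score best.<br>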
The RMSE and r2 score of Gradient Boosting make it evident that it is the best-performing regressor, so we shall use Gradient Boosting as our final algorithm.

## Pickle the model

In [14]:
# Persist the Gradient Boosting pipeline to model.pkl.
# (`pickle` is imported in the notebook's top import cell.)
with open('model.pkl', 'wb') as f:
    pickle.dump(pipeline_gb, f)

In [15]:
# Load the pickled object from model.pkl back into `m`.
# NOTE: pickle.load can execute arbitrary code, so only load files you trust.
with open('model.pkl', 'rb') as f:
    m = pickle.load(f)