# Hyperparameter-Tuned Best Model Selection

In [2]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# train test split the data
from sklearn.model_selection import train_test_split

#import grid search cv for cross validation
from sklearn.model_selection import GridSearchCV

# import regression algorithms
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# import preprocessors
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [3]:
#load the data
data = sns.load_dataset('tips')
data.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
# check for missing values
data.isnull().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [5]:
#columns
data.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

In [6]:
# select features and variables
X = data.drop('tip', axis=1)
y = data['tip']

#encode with label encoder all the X columns using for loop
label = LabelEncoder()
for i in X.columns:
    X[i] = label.fit_transform(X[i])

In [8]:
%%time
# split the data into train and test data with 80% training dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a dictionaries of list of models to evaluate performance with hyperparameters
models = { 
          'LinearRegression' : (LinearRegression(), {}),
          'SVR' : (SVR(), {'kernel': ['rbf', 'poly', 'sigmoid']}),
          'DecisionTreeRegressor' : (DecisionTreeRegressor(), {'max_depth': [None, 5, 10]}),
          'RandomForestRegressor' : (RandomForestRegressor(), {'n_estimators': [10, 100]}),
          'KNeighborsRegressor' : (KNeighborsRegressor(), {'n_neighbors': np.arange(3, 100, 2)}),
          'GradientBoostingRegressor' : (GradientBoostingRegressor(), {'n_estimators': [10, 100]}),
          'XGBRegressor' : (XGBRegressor(), {'n_estimators': [10, 100]}),          
          }

# train and predict each model with evaluation metrics as well making a for loop to iterate over the models

for name, (model, params) in models.items():
    # create a pipline
    pipeline = GridSearchCV(model, params, cv=5)
    
    # fit the pipeline
    pipeline.fit(X_train, y_train)
    
    # make prediction from each model
    y_pred = pipeline.predict(X_test)
    
      
    # print the performing metric
    print(name, 'MSE: ', mean_squared_error(y_test, y_pred))
    print(name, 'R2: ', r2_score(y_test, y_pred))
    print(name, 'MAE: ', mean_absolute_error(y_test, y_pred))
    print('\n')

LinearRegression MSE:  0.6565138933097732
LinearRegression R2:  0.474776845251208
LinearRegression MAE:  0.6160553405209654


SVR MSE:  0.635053061597389
SVR R2:  0.4919459042008757
SVR MAE:  0.669092672341374


DecisionTreeRegressor MSE:  0.8235850163311136
DecisionTreeRegressor R2:  0.34111688284232766
DecisionTreeRegressor MAE:  0.6958053058053058


RandomForestRegressor MSE:  0.8677597187755113
RandomForestRegressor R2:  0.30577631074724754
RandomForestRegressor MAE:  0.7426897959183677


KNeighborsRegressor MSE:  0.6021555034084395
KNeighborsRegressor R2:  0.5182645540750211
KNeighborsRegressor MAE:  0.6237327188940092


GradientBoostingRegressor MSE:  0.8185226615381059
GradientBoostingRegressor R2:  0.34516685951751525
GradientBoostingRegressor MAE:  0.771784517472228


XGBRegressor MSE:  0.6624107100882575
XGBRegressor R2:  0.4700592836840687
XGBRegressor MAE:  0.6549163442728472


CPU times: total: 4.11 s
Wall time: 5.35 s
