## Grid Search and Model Evaluation <a class="anchor" id="grid-search"></a> 

In [3]:
import pickle
import sys

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder

# Make ./ml_housing_lib importable before pulling in the project modules below.
sys.path.append('./ml_housing_lib')
from basic_data_prep import prep_data_fin
from model_evaluation import calc_scores
from pipeline import KMeansTransformer

In [6]:
# Grid-search setup: feature groups -> preprocessing -> pipeline -> search object -> data split

# Feature groups (see Feature Engineering Notebook)
cat_features = [
    'geo_cluster', 'yr_built_bin', 'yr_renovated_bin', 'grade',
    'bedrooms', 'view', 'floors', 'month',
]
num_features = [
    'condition', 'bathrooms', 'sqft_above',
    'sqft_basement', 'sqft_living', 'sqft_lot',
]

# (name, estimator, columns) triples consumed by the ColumnTransformer below.
cat_transformer = ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
num_transformer = ('num', RobustScaler(), num_features)

# Columns that appear in neither feature list are forwarded unchanged.
preprocessor = ColumnTransformer(
    transformers=[cat_transformer, num_transformer],
    remainder='passthrough',
)

# Full pipeline: clustering step, then column preprocessing, then the regressor.
# NOTE(review): KMeansTransformer presumably creates the 'geo_cluster' column that
# cat_features relies on -- confirm in the project's pipeline module.
pipeline = Pipeline(
    steps=[
        ('kmeans', KMeansTransformer(n_clusters=500)),
        ('preprocessor', preprocessor),
        ('model', Lasso()),
    ]
)

# Hyper-parameter grid. Only the Lasso alpha sweep is active; the commented
# entries are alternative scaler/model grids kept for reference.
param_grid = [
    # {'preprocessor__num': [None, StandardScaler(), MinMaxScaler(), RobustScaler()],
    #  'model': [LinearRegression()]},
    {
        # 'preprocessor__num': [RobustScaler(),],
        # 'model': [Lasso()],
        'model__alpha': [0.1, 1, 3, 5],
    },
    # {'preprocessor__num': [None, StandardScaler(), MinMaxScaler(), RobustScaler()],
    #  'model': [ElasticNet()],
    #  'model__alpha': [0.01, 0.1, 1, 2, 5, 10],
    #  'model__l1_ratio': [0.1, 0.5, 0.9]},
]

# 5-fold cross-validated search, ranked by R2.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
)

# Load the prepared data and separate the target from the predictors.
df = prep_data_fin()
y = df['price']
X = df.drop(columns='price')

# Hold out 30% of the rows for the final evaluation; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
%% time

# Run grid search
grid_search.fit(X_train, y_train)

# Get best model, params, score
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best params: {best_params}\nBest score: {best_score}\nBest model: {best_model}")

Best params: {'model__alpha': 5}
Best score: 0.8488353472366441
Best model: Pipeline(steps=[('kmeans', KMeansTransformer(n_clusters=500)),
                ('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['geo_cluster',
                                                   'yr_built_bin',
                                                   'yr_renovated_bin', 'grade',
                                                   'bedrooms', 'view', 'floors',
                                                   'month']),
                                                 ('num', RobustScaler(),
                                                  ['condition', 'bathrooms',
                                                   'sqft_above',
                                                   'sqft_basement',
                                                   'sqft_living',
                                                   'sqft_lot'])])),
                ('model', Lasso(alpha=5))])

In [22]:
# Persist the best pipeline found by the grid search so it can be reloaded
# later without repeating the search.
with open('best_linear_model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)

#### Notes:
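- Grid search results:
    - Best params: {'model': Lasso(alpha=5), 'preprocessor__num': RobustScaler()} (reported by the broader grid that also varied the model and the numerical scaler; the reduced grid above reports the same optimum as {'model__alpha': 5})
    - Best R2 score (mean 5-fold cross-validation score on the training set): 0.849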

In [4]:
# Restore the pipeline saved to 'best_linear_model.pkl'.
# Unpickling executes code stored in the file, so only load a pickle you trust.
with open('best_linear_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [7]:
# Predict on the held-out test split with the reloaded pipeline.
y_pred = loaded_model.predict(X_test)

# Report the test-set metrics; the result is the cell output
# (a dict with 'mae', 'rmse', 'rsquare' and 'adj_rsquare').
# NOTE(review): X_test is passed as well, presumably so the adjusted R2 can use
# the number of features -- confirm in model_evaluation.calc_scores.
calc_scores(y_test, y_pred, X_test)

{'mae': 71126.09, 'rmse': 113431.285, 'rsquare': 0.849, 'adj_rsquare': 0.849}

## Best Model <a class="anchor" id="best-model"></a>

- Across multiple grid searches, the following model and (hyper)parameters gave the best linear-model performance:
    - Lasso (alpha 5)
    - RobustScaler
    - KMeans with 500 clusters
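    - cat_features = ['geo_cluster', 'yr_built_bin', 'yr_renovated_bin', 'grade', 'bedrooms','view', 'floors', 'month'] ('waterfront' is not in this list for the fitted pipeline shown above; if the column is present in the data, it is passed through unchanged via `remainder='passthrough'` rather than one-hot encoded)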
    - num_features = [ 'condition', 'bathrooms', 'sqft_above', 'sqft_basement','sqft_living','sqft_lot']
    - Score on test set:  {'mae': 71126.09, 'rmse': 113431.285, 'rsquare': 0.849, 'adj_rsquare': 0.849}