## Grid Search to find best linear model and hyperparameters <a class="anchor" id="grid-search"></a> 

In [16]:
import pickle
import pandas as pd
import numpy as np

import sys
sys.path.append('./ml_housing_lib')
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from basic_data_prep import prep_data_fin
from pipeline import KMeansTransformer

In [21]:
# Grid search over the Lasso regularisation strength.
# Flow: declare feature groups -> assemble preprocessing + model pipeline ->
# define the search space -> load and split the data -> fit -> report the winner.

# Feature groups (see Feature Engineering Notebook).
# NOTE(review): 'geo_cluster' is presumably the column added by the 'kmeans'
# pipeline step below - confirm against KMeansTransformer in pipeline.py.
cat_features = [
    'geo_cluster', 'yr_built_bin', 'yr_renovated_bin', 'grade',
    'bedrooms', 'view', 'floors', 'month',
]
num_features = [
    'condition', 'bathrooms', 'sqft_above',
    'sqft_basement', 'sqft_living', 'sqft_lot',
]

# (name, transformer, columns) triples for the ColumnTransformer:
# categorical columns are one-hot encoded, numerical columns are robust-scaled.
cat_transformer = ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
num_transformer = ('num', RobustScaler(), num_features)

# Columns not named in either group are passed through untouched.
preprocessor = ColumnTransformer(
    transformers=[cat_transformer, num_transformer],
    remainder='passthrough',
)

# Full pipeline: clustering step, then preprocessing, then the linear model.
pipeline = Pipeline(
    steps=[
        ('kmeans', KMeansTransformer(n_clusters=500)),
        ('preprocessor', preprocessor),
        ('model', Lasso()),
    ]
)

# Search space. Only the Lasso alpha sweep is active; the scaler stays the
# RobustScaler and the model stays the Lasso configured in the pipeline above.
param_grid = [
    {'model__alpha': [0.1, 1, 3, 5]},
    # Disabled alternative grids, kept for reference:
    # {'preprocessor__num': [None, StandardScaler(), MinMaxScaler(), RobustScaler()],
    #  'model': [LinearRegression()]},
    # {'preprocessor__num': [None, StandardScaler(), MinMaxScaler(), RobustScaler()],
    #  'model': [ElasticNet()],
    #  'model__alpha': [0.01, 0.1, 1, 2, 5, 10],
    #  'model__l1_ratio': [0.1, 0.5, 0.9]},
    # Disabled overrides for the active grid:
    #  'preprocessor__num': [RobustScaler(),], 'model': [Lasso()]
]

# 5-fold cross-validated search, scored by R^2.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
)

# Load the prepared data and separate the target column from the features.
df = prep_data_fin()
X = df.drop(columns=['price'])
y = df['price']

# Hold out 30% of the rows; the search itself only ever sees the training part.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

grid_search.fit(X_train, y_train)

# Winning estimator, its parameters and its mean cross-validated score.
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(
    f"Best params: {best_params}\n"
    f"Best score: {best_score}\n"
    f"Best model: {best_model}"
)

Best params: {'model__alpha': 5}
Best score: 0.8488353472366441
Best model: Pipeline(steps=[('kmeans', KMeansTransformer(n_clusters=500)),
                ('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['geo_cluster',
                                                   'yr_built_bin',
                                                   'yr_renovated_bin', 'grade',
                                                   'bedrooms', 'view', 'floors',
                                                   'month']),
                                                 ('num', RobustScaler(),
                                                  ['condition', 'bathrooms',
                                                   'sqft_above',
                                                  

Best params: {'model__alpha': 5}
Best score: 0.8488353472366441
Best model: Pipeline(steps=[('kmeans', KMeansTransformer(n_clusters=500)),
                ('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['geo_cluster',
                                                   'yr_built_bin',
                                                   'yr_renovated_bin', 'grade',
                                                   'bedrooms', 'view', 'floors',
                                                   'month']),
                                                 ('num', RobustScaler(),
                                                  ['condition', 'bathrooms',
                                                   'sqft_above',
                                                   'sqft_basement',
                                                   'sqft_living',
                                                   'sqft_lot'])])),
                ('model', Lasso(alpha=5))])

In [22]:
# Persist the best estimator found by the grid search so it can be reused
# without re-running the search.
with open('best_linear_model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)

#### Notes:
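- Grid search results (5-fold cross-validation on the training split, scored by $R^2$):
    - Best configuration: `Lasso(alpha=5)` with `RobustScaler()` applied to the numerical features
    - Best mean cross-validated $R^2$: 0.849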

In [None]:
# Restore a previously pickled model from disk.
# NOTE(review): this reads 'linear_regression_model.pkl', not the
# 'best_linear_model.pkl' written by the save cell of this notebook -
# presumably that file is produced elsewhere; confirm before running.
# pickle.load can execute arbitrary code: only load files you created yourself.
with open('linear_regression_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

## Conclusion and Best Model <a class="anchor" id="best-model"></a>

- The best performing model on the test set:  
    - LinearRegression
    - MinMaxScaler
    - KMeans with 500 clusters
    - cat_features = ['geo_cluster', 'yr_built_bin', 'yr_renovated_bin', 'grade', 'bedrooms', 'bathrooms','view', 'floors']
    - num_features = [ 'condition', 'sqft_basement','sqft_living','sqft_lot']
    - Scores:  
        - 5-fold Cross Validation using only training data: {'mean_mae': 70609.794, 'mean_rmse': 110086.056, 'mean_rsquare': 0.852, 'mean_adj_rsquare': 0.852}
        - Performance on validation set: {'mae': 75217.855, 'rmse': 122555.544, 'rsquare': 0.827, 'adj_rsquare': 0.826}
        - Performance on test set: {'mae': 69745.756, 'rmse': 111124.99, 'rsquare': 0.854, 'adj_rsquare': 0.853}
- Even though grid search found a Lasso model that performed slightly better on the validation set, this tiny improvement does not justify the added complexity and computational cost.