In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.append('..') #get root directory

Now that we know what we know which features we want to use and the format we want it to be in we can try out some different models, hoping to pick out some promising ones. The first thing that needs to be done is get the data pipeline going.

# Transformation Pipeline 
Now that the preprocessing steps are defined we can wrap all of this neatly into a Pipeline, allowing us to train and test various different models more efficiently.

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from src.features import logTransformer
from src.features import cbrtTransformer
standard_scaler = StandardScaler()
one_hot_encoder = OneHotEncoder(sparse_output = False, handle_unknown = 'ignore') # Instead of a SciPy sparse matrix, we get a NumPy array

In [3]:
preprocess = ColumnTransformer([("onehot_cities", one_hot_encoder, ['city']),("log", logTransformer(), ['sqft_lot']),('cbrt', cbrtTransformer(), ['sqft_living', 'sqft_basement', 'sqft_above'])], remainder = 'passthrough')
pipeline = Pipeline([("preprocess", preprocess),('stdscaler', standard_scaler)])

In [4]:
X_train = pd.read_csv("../data/interim/X_train.csv", index_col = 0)
X_test = pd.read_csv("../data/interim/X_test.csv", index_col = 0)
y_train = pd.read_csv("../data/interim/y_train.csv", index_col = 0)
y_test = pd.read_csv("../data/interim/y_test.csv", index_col = 0)

In [5]:
X_train_transformed_data = pipeline.fit_transform(X_train) 
X_train_transformed = pd.DataFrame(data=X_train_transformed_data, index = X_train.index, columns= pipeline.get_feature_names_out())
X_train_transformed.head()

Unnamed: 0,onehot_cities__city_Algona,onehot_cities__city_Auburn,onehot_cities__city_Beaux Arts Village,onehot_cities__city_Bellevue,onehot_cities__city_Black Diamond,onehot_cities__city_Bothell,onehot_cities__city_Burien,onehot_cities__city_Carnation,onehot_cities__city_Clyde Hill,onehot_cities__city_Covington,...,log__sqft_lot,cbrt__sqft_living,cbrt__sqft_basement,cbrt__sqft_above,remainder__bedrooms,remainder__bathrooms,remainder__floors,remainder__waterfront,remainder__view,remainder__condition
1992,-0.037088,-0.199256,-0.016577,-0.256682,-0.043895,-0.077979,-0.114372,-0.066446,-0.049786,-0.101337,...,-0.316153,0.001884,1.199345,-0.586298,0.682752,-0.519269,-0.951048,-0.084819,2.240026,-0.674857
871,-0.037088,-0.199256,-0.016577,-0.256682,-0.043895,-0.077979,-0.114372,-0.066446,-0.049786,-0.101337,...,0.200884,-0.98234,-0.784361,-0.634607,-0.43823,0.445744,0.903661,-0.084819,-0.312391,-0.674857
843,-0.037088,-0.199256,-0.016577,-0.256682,-0.043895,-0.077979,-0.114372,-0.066446,-0.049786,-0.101337,...,-1.726154,-0.667192,0.711249,-0.784264,-0.43823,0.445744,0.903661,-0.084819,-0.312391,-0.674857
2708,-0.037088,-0.199256,-0.016577,-0.256682,-0.043895,-0.077979,-0.114372,-0.066446,-0.049786,-0.101337,...,-0.077051,0.671513,-0.784361,1.063531,0.682752,0.767416,0.903661,-0.084819,-0.312391,-0.674857
380,-0.037088,-0.199256,-0.016577,-0.256682,-0.043895,-0.077979,-0.114372,-0.066446,-0.049786,-0.101337,...,-0.159552,-0.104828,-0.784361,0.266403,0.682752,-0.197598,-0.023693,-0.084819,-0.312391,2.29909


In [6]:
X_test_transformed_data = pipeline.transform(X_test)
X_test_transformed = pd.DataFrame(data = X_test_transformed_data, index = X_test.index, columns = pipeline.get_feature_names_out())

y_train_transformed = np.log(y_train) # doesn't need to go through a pipeline so no need to use logTransformer class
y_test_transformed = np.log(y_test)

In [7]:
X_train_transformed.to_csv("../data/processed/X_train_transformed.csv")
X_test_transformed.to_csv("../data/processed/X_test_transformed.csv")
y_train_transformed.to_csv("../data/processed/y_train_transformed.csv")
y_test_transformed.to_csv("../data/processed/y_test_transformed.csv")

# Model Selection

Since this is a supervised learning problem and we want to predict a house price we will use a regression model. We'll try out a few: lasso, suppor vector regression, and random forest regressor. 

In [8]:
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline


In [56]:
ls_param_grid = [{'lasso__alpha' : [1.0,1.5,2.5,100], 'lasso__max_iter': [100,1000,10000,100000] , 'lasso__tol': [0.0001, 0.001, 0.00001, 1, 10]}] 

svr_params_grid = [{'svr__kernel': ['linear'], 'svr__C': [1,20,40,60,80], 'svr__epsilon': [0.1,0.01,1.0, 0.001] },
          {'svr__kernel': ['poly'], 'svr__degree': [2,4,5], 'svr__C': [1,20,40,60,80]},
          {'svr__kernel': ['rbf', 'sigmoid'], 'svr__gamma': ['auto'], 'svr__C': [0.1,1,10,30,60,100], 'svr__epsilon': [0.1,0.01,1.0, 0.001]}]

rfr_params_grid = [{
    'randomforestregressor__n_estimators': [200, 500, 800],
    'randomforestregressor__max_depth': [10, 20, 30, None],
    'randomforestregressor__min_samples_split': [2, 5, 10],
    'randomforestregressor__min_samples_leaf': [1, 2, 4],
    'randomforestregressor__max_features': ['sqrt', 'log2', 0.5],
    'randomforestregressor__criterion' : ['squared_error', 'absolute_error']}]


In [57]:
ls_pipeline = make_pipeline(pipeline, Lasso())
ls_randomized_search = RandomizedSearchCV(ls_pipeline, ls_param_grid, scoring = 'neg_root_mean_squared_error', cv = 5, n_iter = 30, random_state = 0)
y_train_flattened = np.ravel(y_train_transformed)
ls_randomized_search.fit(X_train, y_train_flattened)


The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [58]:
print(ls_randomized_search.best_params_)
print(-ls_randomized_search.best_score_)
ls_randomized_search_cv_results_df = pd.DataFrame(ls_randomized_search.cv_results_)
ls_randomized_search_cv_results_df.sort_values(by = 'rank_test_score')

{'lasso__tol': 0.0001, 'lasso__max_iter': 10000, 'lasso__alpha': 2.5}
0.5463295432257984


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_lasso__tol,param_lasso__max_iter,param_lasso__alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.012181,0.003665,0.005442,0.003079,0.0001,10000,2.5,"{'lasso__tol': 0.0001, 'lasso__max_iter': 1000...",-0.546185,-0.547515,-0.55503,-0.537503,-0.545415,-0.54633,0.005584,1
27,0.004529,0.000216,0.001969,9.6e-05,1e-05,100,100.0,"{'lasso__tol': 1e-05, 'lasso__max_iter': 100, ...",-0.546185,-0.547515,-0.55503,-0.537503,-0.545415,-0.54633,0.005584,1
26,0.004373,0.000433,0.002075,0.00013,0.0001,100,100.0,"{'lasso__tol': 0.0001, 'lasso__max_iter': 100,...",-0.546185,-0.547515,-0.55503,-0.537503,-0.545415,-0.54633,0.005584,1
25,0.004605,0.000513,0.002286,0.000323,1e-05,100000,1.0,"{'lasso__tol': 1e-05, 'lasso__max_iter': 10000...",-0.546185,-0.547515,-0.55503,-0.537503,-0.545415,-0.54633,0.005584,1
24,0.00477,0.0006,0.002371,0.000387,0.0001,100000,2.5,"{'lasso__tol': 0.0001, 'lasso__max_iter': 1000...",-0.546185,-0.547515,-0.55503,-0.537503,-0.545415,-0.54633,0.005584,1
23,0.004142,0.000354,0.002078,0.000223,10.0,100000,100.0,"{'lasso__tol': 10, 'lasso__max_iter': 100000, ...",-0.546185,-0.547515,-0.55503,-0.537503,-0.545415,-0.54633,0.005584,1
22,0.004304,0.000514,0.002239,0.000128,0.001,100,2.5,"{'lasso__tol': 0.001, 'lasso__max_iter': 100, ...",-0.546185,-0.547515,-0.55503,-0.537503,-0.545415,-0.54633,0.005584,1
21,0.004465,0.000517,0.001923,0.000205,1.0,100000,1.5,"{'lasso__tol': 1, 'lasso__max_iter': 100000, '...",-0.546185,-0.547515,-0.55503,-0.537503,-0.545415,-0.54633,0.005584,1
20,0.004625,0.000424,0.002193,0.000166,1e-05,100,2.5,"{'lasso__tol': 1e-05, 'lasso__max_iter': 100, ...",-0.546185,-0.547515,-0.55503,-0.537503,-0.545415,-0.54633,0.005584,1
19,0.004526,0.000476,0.00243,0.000323,0.001,1000,1.5,"{'lasso__tol': 0.001, 'lasso__max_iter': 1000,...",-0.546185,-0.547515,-0.55503,-0.537503,-0.545415,-0.54633,0.005584,1


In [12]:
svr_pipeline = make_pipeline(pipeline, SVR())
svr_randomized_search = RandomizedSearchCV(svr_pipeline, svr_params_grid, scoring = 'neg_root_mean_squared_error', cv = 5, n_iter = 30, random_state = 0)
y_train_flattened = np.ravel(y_train_transformed) # svr excepts a 1d array not a column vector
svr_randomized_search.fit(X_train, y_train_flattened)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [13]:
print(svr_randomized_search.best_params_)
print(-svr_randomized_search.best_score_)
svr_randomized_search_cv_results = pd.DataFrame(svr_randomized_search.cv_results_)
svr_randomized_search_cv_results.sort_values(by = 'rank_test_score')

{'svr__kernel': 'linear', 'svr__epsilon': 0.1, 'svr__C': 80}
0.2902156356612463


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_svr__kernel,param_svr__degree,param_svr__C,param_svr__gamma,param_svr__epsilon,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
14,46.891016,1.021841,0.043049,0.007746,linear,,80.0,,0.1,"{'svr__kernel': 'linear', 'svr__epsilon': 0.1,...",-0.277722,-0.271391,-0.309711,-0.267428,-0.324827,-0.290216,0.022837,1
9,23.925449,1.184528,0.039142,0.006796,linear,,40.0,,0.1,"{'svr__kernel': 'linear', 'svr__epsilon': 0.1,...",-0.277695,-0.27147,-0.309672,-0.267457,-0.324861,-0.290231,0.022825,2
11,53.861173,1.999617,0.068918,0.013964,linear,,60.0,,0.01,"{'svr__kernel': 'linear', 'svr__epsilon': 0.01...",-0.279339,-0.272174,-0.309507,-0.267725,-0.327202,-0.291189,0.023173,3
28,1.393889,0.063468,0.052605,0.000571,linear,,1.0,,0.001,"{'svr__kernel': 'linear', 'svr__epsilon': 0.00...",-0.27941,-0.272031,-0.309861,-0.267822,-0.327687,-0.291362,0.023377,4
20,0.20079,0.0023,0.09928,0.02013,rbf,,1.0,auto,0.1,"{'svr__kernel': 'rbf', 'svr__gamma': 'auto', '...",-0.277892,-0.27105,-0.314083,-0.282929,-0.334911,-0.296173,0.02434,5
16,0.319761,0.001955,0.144625,0.002088,rbf,,1.0,auto,0.01,"{'svr__kernel': 'rbf', 'svr__gamma': 'auto', '...",-0.279012,-0.272658,-0.314138,-0.283148,-0.336478,-0.297087,0.024332,6
2,0.336124,0.001661,0.06856,0.000249,sigmoid,,0.1,auto,0.001,"{'svr__kernel': 'sigmoid', 'svr__gamma': 'auto...",-0.287229,-0.283469,-0.309475,-0.278888,-0.331944,-0.298201,0.01988,7
5,0.349517,0.025196,0.067405,0.000759,sigmoid,,0.1,auto,0.01,"{'svr__kernel': 'sigmoid', 'svr__gamma': 'auto...",-0.288536,-0.285866,-0.30951,-0.276234,-0.331817,-0.298392,0.019929,8
1,0.514281,0.042758,0.164555,0.005221,rbf,,10.0,auto,0.01,"{'svr__kernel': 'rbf', 'svr__gamma': 'auto', '...",-0.283378,-0.292243,-0.324707,-0.296275,-0.340283,-0.307377,0.021501,9
17,11.468631,21.901781,0.173733,0.027473,rbf,,10.0,auto,0.001,"{'svr__kernel': 'rbf', 'svr__gamma': 'auto', '...",-0.284092,-0.293043,-0.3254,-0.296545,-0.341254,-0.308067,0.021611,10


In [14]:
rfr_pipeline = make_pipeline(pipeline, RandomForestRegressor())
rfr_randomized_search = RandomizedSearchCV(rfr_pipeline, rfr_params_grid, scoring = 'neg_root_mean_squared_error', cv = 5, n_iter = 30, random_state = 0)
y_train_flattened = np.ravel(y_train_transformed)
rfr_randomized_search.fit(X_train, y_train_flattened)



The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [15]:
print(-rfr_randomized_search.best_score_)
print(rfr_randomized_search.best_params_)
rfr_randomized_search_cv_results = pd.DataFrame(rfr_randomized_search.cv_results_)
rfr_randomized_search_cv_results.sort_values(by = 'rank_test_score')

0.30049721039775135
{'randomforestregressor__n_estimators': 500, 'randomforestregressor__min_samples_split': 10, 'randomforestregressor__min_samples_leaf': 1, 'randomforestregressor__max_features': 'sqrt', 'randomforestregressor__max_depth': 30, 'randomforestregressor__criterion': 'absolute_error'}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_randomforestregressor__n_estimators,param_randomforestregressor__min_samples_split,param_randomforestregressor__min_samples_leaf,param_randomforestregressor__max_features,param_randomforestregressor__max_depth,param_randomforestregressor__criterion,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1,334.230443,392.878456,0.085564,0.054251,500,10,1,sqrt,30.0,absolute_error,"{'randomforestregressor__n_estimators': 500, '...",-0.282999,-0.286636,-0.312858,-0.286495,-0.333499,-0.300497,0.019679,1
17,33.577083,0.566663,0.079199,0.008998,800,10,1,log2,,absolute_error,"{'randomforestregressor__n_estimators': 800, '...",-0.283572,-0.28661,-0.313232,-0.285595,-0.333586,-0.300519,0.019791,2
28,1.131955,0.012524,0.061645,0.001968,500,2,1,log2,,squared_error,"{'randomforestregressor__n_estimators': 500, '...",-0.286259,-0.285933,-0.311932,-0.290446,-0.329195,-0.300753,0.017146,3
12,0.407571,0.001988,0.020968,0.000223,200,5,1,sqrt,30.0,squared_error,"{'randomforestregressor__n_estimators': 200, '...",-0.285494,-0.288428,-0.31209,-0.291193,-0.333709,-0.302183,0.018333,4
21,1.511283,0.089597,0.078288,0.021447,800,10,1,sqrt,,squared_error,"{'randomforestregressor__n_estimators': 800, '...",-0.286795,-0.289549,-0.313627,-0.290294,-0.334338,-0.30292,0.018439,5
19,23.499165,0.222664,0.045613,0.00317,500,5,1,sqrt,20.0,absolute_error,"{'randomforestregressor__n_estimators': 500, '...",-0.285947,-0.289995,-0.313697,-0.289478,-0.335895,-0.303002,0.01918,6
10,85.394257,1.082694,0.078652,0.004462,800,10,2,0.5,20.0,absolute_error,"{'randomforestregressor__n_estimators': 800, '...",-0.287225,-0.291694,-0.312802,-0.291268,-0.333521,-0.303302,0.017558,7
15,0.362365,0.001956,0.018496,0.000143,200,10,1,sqrt,,squared_error,"{'randomforestregressor__n_estimators': 200, '...",-0.286182,-0.290903,-0.313773,-0.291191,-0.334535,-0.303317,0.018323,8
14,23.806582,0.342894,0.055212,0.004487,500,2,1,sqrt,20.0,absolute_error,"{'randomforestregressor__n_estimators': 500, '...",-0.286699,-0.289865,-0.31503,-0.291668,-0.334553,-0.303563,0.018469,9
3,23.963353,0.259548,0.021389,0.000611,200,5,1,0.5,,absolute_error,"{'randomforestregressor__n_estimators': 200, '...",-0.284753,-0.287983,-0.313031,-0.295671,-0.337765,-0.303841,0.019585,10


After running the randomized search we can an idea of which hyperparmeters we should have set and which ones we can still play around with. For Lasso it's rather hard to tell what hyperparameters might be best to go with since they all ranked 1st in the cv results. However svr's and rfr's scoring are closer to 0 by a considerdable margin since our scoring is in log-space and there is a better sense of what hyperparameters to go with. Running an exhaustive grid search on the promising hyperparameters can hopefully squeeze out some better scoring and translate into better predictions.

In [16]:
from sklearn.model_selection import GridSearchCV

svr_grid_search_params = [{
    'svr__kernel': ['linear'], 
    'svr__epsilon': [0.1,0.01,1.0,0.001],
    'svr__C': [40,60,80,100,120] }]

rfr_grid_search_params = [{
    'randomforestregressor__n_estimators': [650,800,950],
    'randomforestregressor__max_depth': [10, None],
    'randomforestregressor__min_samples_split': [10, 15, 20],
    'randomforestregressor__min_samples_leaf': [1],
    'randomforestregressor__max_features': ['sqrt', 'log2'],
    'randomforestregressor__criterion' : ['absolute_error']}]


In [17]:
svr_grid_search = GridSearchCV(svr_pipeline, svr_grid_search_params, scoring = 'neg_root_mean_squared_error', cv = 5)
svr_grid_search.fit(X_train, y_train_flattened)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [18]:
print(-svr_grid_search.best_score_)
print(svr_grid_search.best_params_)

0.29019480244638035
{'svr__C': 100, 'svr__epsilon': 0.1, 'svr__kernel': 'linear'}


In [19]:
rfr_grid_search = GridSearchCV(rfr_pipeline, rfr_grid_search_params, scoring = 'neg_root_mean_squared_error', cv = 5)
rfr_grid_search.fit(X_train, y_train_flattened)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [20]:
print(-rfr_grid_search.best_score_)

0.3001411167400498


A DummyRegressor will be trained to serve as a baseline against the models from the gridsearch if we peform once the the baseline dummy then something is very wrong. 

In [None]:
from sklearn.dummy import DummyRegressor
dummy_regressor = DummyRegressor(strategy='mean')
dummy_regressor.fit(X_train_transformed, y_train_transformed)


The Joblib library will be used to store the results and fitted models to avoid having to rerun the code cells, helping to save time.

In [21]:
import joblib
import json 

In [None]:
with open("../models/svr_grid_model_score.json", "w") as f:
    json.dump(svr_grid_search.best_score_, f, indent=4)
    
with open("../models/rfr_grid_model_score.json", "w") as f:
    json.dump(rfr_grid_search.best_score_, f, indent=4)
    


In [60]:
joblib.dump(svr_grid_search.best_estimator_, '../models/svr_model.joblib')
joblib.dump(rfr_grid_search.best_estimator_, '../models/rfr_model.joblib')
joblib.dump(dummy_regressor, '../models/dummy_regressor.joblib')

['../models/dummy_regressor.joblib']