In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
import sys
sys.path.append('..') #get root directory

Now that we know what we know which features we want to use and the format we want it to be in we can try out some different models, hoping to pick out some promising ones. The first thing that needs to be done is get the data pipeline going.

# Transformation Pipeline 
Now that the preprocessing steps are defined we can wrap all of this neatly into a Pipeline, allowing us to train and test various different models more efficiently.

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from src.features import logTransformer
from src.features import cbrtTransformer
standard_scaler = StandardScaler()
one_hot_encoder = OneHotEncoder(sparse_output = False, handle_unknown = 'ignore') # Instead of a SciPy sparse matrix, we get a NumPy array

In [3]:
preprocess = ColumnTransformer([("onehot_cities", one_hot_encoder, ['city']),("log", logTransformer(), ['sqft_lot']),('cbrt', cbrtTransformer(), ['sqft_living', 'sqft_basement', 'sqft_above'])], remainder = 'passthrough')
pipeline = Pipeline([("preprocess", preprocess),('stdscaler', standard_scaler)])

In [4]:
X_train = pd.read_csv("../data/interim/X_train.csv", index_col = 0)
X_test = pd.read_csv("../data/interim/X_test.csv", index_col = 0)
y_train = pd.read_csv("../data/interim/y_train.csv", index_col = 0)
y_test = pd.read_csv("../data/interim/y_test.csv", index_col = 0)

In [5]:
X_train

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,city
2156,2.0,2.00,1030,21712,1.0,0,0,4,1030,0,Seattle
2862,3.0,2.00,1300,11374,1.5,0,0,5,1300,0,SeaTac
3554,3.0,1.75,1010,9600,1.0,0,0,5,1010,0,Covington
3647,3.0,2.75,1300,14197,1.0,0,0,3,860,440,Kent
171,4.0,2.50,2290,12047,2.0,0,0,4,2290,0,Bellevue
...,...,...,...,...,...,...,...,...,...,...,...
1033,3.0,1.50,1270,1443,3.0,0,0,3,1270,0,Seattle
3264,2.0,1.00,970,5500,1.0,0,0,3,970,0,Seattle
1653,5.0,2.75,2080,13189,2.0,0,0,3,2080,0,Sammamish
2607,4.0,2.50,3070,34412,1.0,0,3,4,2070,1000,Medina


In [6]:
X_train_transformed_data = pipeline.fit_transform(X_train) 
X_train_transformed = pd.DataFrame(data=X_train_transformed_data, index = X_train.index, columns= pipeline.get_feature_names_out())
X_train_transformed.head()

Unnamed: 0,onehot_cities__city_Algona,onehot_cities__city_Auburn,onehot_cities__city_Bellevue,onehot_cities__city_Black Diamond,onehot_cities__city_Bothell,onehot_cities__city_Burien,onehot_cities__city_Carnation,onehot_cities__city_Clyde Hill,onehot_cities__city_Covington,onehot_cities__city_Des Moines,...,log__sqft_lot,cbrt__sqft_living,cbrt__sqft_basement,cbrt__sqft_above,remainder__bedrooms,remainder__bathrooms,remainder__floors,remainder__waterfront,remainder__view,remainder__condition
2156,-0.051691,-0.20757,-0.268594,-0.02582,-0.077667,-0.110171,-0.063351,-0.051691,-0.113228,-0.124746,...,1.089377,-1.397531,-0.796909,-1.066643,-1.604735,-0.216542,-0.938962,-0.06845,-0.311955,0.822946
2862,-0.051691,-0.20757,-0.268594,-0.02582,-0.077667,-0.110171,-0.063351,-0.051691,-0.113228,-0.124746,...,0.378045,-0.947752,-0.796909,-0.600687,-0.477063,-0.216542,-0.021993,-0.06845,-0.311955,2.320209
3554,-0.051691,-0.20757,-0.268594,-0.02582,-0.077667,-0.110171,-0.063351,-0.051691,8.831761,-0.124746,...,0.191484,-1.433844,-0.796909,-1.104263,-0.477063,-0.534574,-0.938962,-0.06845,-0.311955,2.320209
3647,-0.051691,-0.20757,-0.268594,-0.02582,-0.077667,-0.110171,-0.063351,-0.051691,-0.113228,-0.124746,...,0.621965,-0.947752,0.890399,-1.403615,-0.477063,0.737555,-0.938962,-0.06845,-0.311955,-0.674317
171,-0.051691,-0.20757,3.723088,-0.02582,-0.077667,-0.110171,-0.063351,-0.051691,-0.113228,-0.124746,...,0.441292,0.303469,-0.796909,0.695539,0.650609,0.419523,0.894976,-0.06845,-0.311955,0.822946


In [7]:
X_test_transformed_data = pipeline.transform(X_test)
X_test_transformed = pd.DataFrame(data = X_test_transformed_data, index = X_test.index, columns = pipeline.get_feature_names_out())

y_train_transformed = np.log(y_train) # doesn't need to go through a pipeline so no need to use logTransformer class
y_test_transformed = np.log(y_test)

In [8]:
X_train_transformed.to_csv("../data/processed/X_train_transformed.csv")
X_test_transformed.to_csv("../data/processed/X_test_transformed.csv")
y_train_transformed.to_csv("../data/processed/y_train_transformed.csv")
y_test_transformed.to_csv("../data/processed/y_test_transformed.csv")

# Model Selection

Since this is a supervised learning problem and we want to predict a house price we will use a regression model. We'll try out a few: linear regression, suppor vector regression, and random forest regressor. 

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline


In [10]:
from scipy.stats import randint
linear_regression_param_grid = [{'linearregression__fit_intercept': [True, False], 'linearregression__positive': [True, False]}] 
#setting positive to True since we want are coefficents to be positive since prices are a non-negative quantity

svr_params_grid = [{'svr__kernel': ['poly', 'linear'], 'svr__C': [40, 50, 60]},
          {'svr__kernel': ['poly'], 'svr__degree': [2,4,5]},
          {'svr__kernel': ['rbf, sigmoid'], 'svr__gamma': ['auto']},
           {'svr__kernel': ['linear'], 'svr__C' : [70,80,100]}]

random_forest_params_grid = [{'randomforestregressor__n_estimators': randint(100, 201), 'randomforestregressor__criterion': ['squared_error','absolute_error', 'friedman_mse', 'poisson']}]


In [11]:
linear_regression_pipeline = make_pipeline(pipeline, LinearRegression())
linear_regression_randomized_search = RandomizedSearchCV(linear_regression_pipeline, linear_regression_param_grid, scoring = 'neg_root_mean_squared_error', cv = 5, n_iter = 4)
linear_regression_randomized_search.fit(X_train, y_train_transformed)


The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [12]:
linear_regression_randomized_search_bestscore = -linear_regression_randomized_search.best_score_
linear_regression_randomized_search_bestparams = linear_regression_randomized_search.best_params_
print(linear_regression_randomized_search_bestparams)
print(linear_regression_randomized_search_bestscore)

{'linearregression__positive': False, 'linearregression__fit_intercept': True}
0.30195774000920617


In [13]:
svr_pipeline = make_pipeline(pipeline, SVR())
svr_randomized_search = RandomizedSearchCV(svr_pipeline, svr_params_grid, scoring = 'neg_root_mean_squared_error', cv = 5, n_iter = 10)
y_train_flattened = np.ravel(y_train_transformed) # svr excepts a 1d array not a column vector
svr_randomized_search.fit(X_train, y_train_flattened)

5 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.13/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/opt/anaconda3/lib/python3.13/site-packages/sklearn/pipeline.py", line 662, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
    ~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File

In [14]:
svr_randomized_search_bestscore = -svr_randomized_search.best_score_
svr_randomized_search_bestparams = svr_randomized_search.best_params_
print(svr_randomized_search_bestparams)
print(svr_randomized_search_bestscore)

{'svr__kernel': 'linear', 'svr__C': 60}
0.2982328119618905


In [15]:
random_forest_pipeline = make_pipeline(pipeline, RandomForestRegressor())
random_forest_randomized_search = RandomizedSearchCV(random_forest_pipeline, random_forest_params_grid, scoring = 'neg_root_mean_squared_error', cv = 5, n_iter = 10)
y_train_flattened = np.ravel(y_train_transformed)
random_forest_randomized_search.fit(X_train, y_train_flattened)



The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [16]:
random_forest_randomized_search_best_score = -random_forest_randomized_search.best_score_
random_forest_randomized_search_best_params = random_forest_randomized_search.best_params_
random_forest_randomized_search_best_estimator = random_forest_randomized_search.best_estimator_
print(random_forest_randomized_search_best_score)
print(random_forest_randomized_search_best_params)
print(random_forest_randomized_search_best_estimator)

0.32683377728104207
{'randomforestregressor__criterion': 'absolute_error', 'randomforestregressor__n_estimators': 183}
Pipeline(steps=[('pipeline',
                 Pipeline(steps=[('preprocess',
                                  ColumnTransformer(remainder='passthrough',
                                                    transformers=[('onehot_cities',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse_output=False),
                                                                   ['city']),
                                                                  ('log',
                                                                   logTransformer(),
                                                                   ['sqft_lot']),
                                                                  ('cbrt',
                                        

In [17]:
import joblib
import json 


The Joblib library will be used to store the results and fitted models to avoid having to rerun the code cells, helping to save time.

In [18]:
svr_info = {"svr_best_score" : svr_randomized_search_bestscore , "svr_best_params": svr_randomized_search_bestparams }
lr_info = {"lr_best_score" : linear_regression_randomized_search_bestscore, "lr_best_params": linear_regression_randomized_search_bestparams}
rfr_info = {"rfr_best_score": random_forest_randomized_search_best_score, "rfr_best_params" : random_forest_randomized_search_best_params}

In [19]:
with open("../models/svr_info.json", "w") as f:
  json.dump(svr_info, f, indent= 2)

with open("../models/lr_info.json", "w") as f:
  json.dump(lr_info, f, indent= 2)

with open("../models/rfr_info.json", "w") as f:
  json.dump(rfr_info, f, indent= 2)

In [20]:
joblib.dump(svr_randomized_search.best_estimator_, '../models/svr_model.joblib')
joblib.dump(linear_regression_randomized_search.best_estimator_, '../models/lr_model.joblib')
joblib.dump(random_forest_randomized_search_best_estimator, '../models/rfr_model.joblib')

['../models/rfr_model.joblib']