In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
import sys
# Put the parent directory on the import search path so that the project's
# `src` package can be imported from this notebook. The membership check keeps
# the cell idempotent: re-running it does not pile up duplicate '..' entries.
if '..' not in sys.path:
    sys.path.append('..')



Now that we know which features we want to use and the format we want them in, we can try out some different models, hoping to pick out some promising ones. The first thing that needs to be done is to get the data pipeline going.

# Transformation Pipeline 
Now that the preprocessing steps are defined, we can wrap all of this neatly into a Pipeline, allowing us to train and test different models more efficiently.

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from src.features import logTransformer
from src.features import cbrtTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
# Shared preprocessing building blocks, reused when the pipeline is assembled.

# Encoder for categorical columns:
#   * sparse_output=False   -> returns a dense NumPy array rather than a SciPy
#                              sparse matrix
#   * handle_unknown='ignore' -> categories not seen during fit do not raise at
#                              transform time
one_hot_encoder = OneHotEncoder(
    handle_unknown='ignore',
    sparse_output=False,
)

# Scaler that standardises features using statistics learned during fit.
standard_scaler = StandardScaler()

In [4]:
# Column-wise preprocessing. Each entry is (step name, transformer, columns);
# the step name becomes the prefix of the generated feature names.
# NOTE(review): logTransformer / cbrtTransformer live in src.features; going by
# their names they apply a log and a cube-root transform respectively — confirm.
preprocess = ColumnTransformer(
    [
        ("onehot_cities", one_hot_encoder, ['city']),
        ("log", logTransformer(), ['sqft_lot']),
        ('cbrt', cbrtTransformer(), ['sqft_living', 'sqft_basement', 'sqft_above']),
    ],
    # Columns not listed above are forwarded unchanged to the next step.
    remainder = 'passthrough',
)

# Full pipeline: column-wise preprocessing, then standard scaling applied to
# every column the preprocessing step outputs.
pipeline = Pipeline(
    [
        ("preprocess", preprocess),
        ('stdscaler', standard_scaler),
    ]
)


In [5]:
from src.dataset import split_training_val_test

# Read the interim dataset and discard the 'Unnamed: 0' column that comes along
# with the CSV.
housing_data = (
    pd.read_csv('../data/interim/housing_data_with_dropped_features')
    .drop(columns='Unnamed: 0')
)

# The first remaining column is used as the target (kept as a one-column
# DataFrame); every other column is a feature.
X = housing_data.iloc[:, 1:]
y = housing_data.iloc[:, :1]

# Split into train / validation / test sets. The helper also returns a
# `train_df`, which is kept here under the same name.
(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    train_df,
) = split_training_val_test(X=X, y=y)

X_train

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,city
3204,3.0,1.75,1720,4080,1.0,0,0,4,960,760,1924,0,Seattle
407,3.0,2.25,1445,1606,2.0,0,0,3,1300,145,2003,0,Issaquah
42,2.0,1.00,850,6174,1.0,0,0,4,850,0,1950,1983,Shoreline
27,4.0,2.25,2200,11250,1.5,0,0,5,1300,900,1920,0,Kirkland
3517,4.0,3.25,3060,3898,2.0,0,0,3,2300,760,2014,0,Issaquah
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3797,4.0,1.75,1420,5000,1.5,0,0,4,1420,0,1945,0,Seattle
4025,3.0,2.50,2040,10086,2.0,0,0,3,2040,0,1987,2000,Redmond
955,4.0,2.75,2050,3960,1.0,0,0,4,1180,870,1986,0,Seattle
3716,4.0,1.75,2130,5080,1.5,0,0,3,2130,0,1914,1993,Seattle


In [None]:
# Fit the pipeline on the training features and transform them in one pass.
X_train_transformed_data = pipeline.fit_transform(X_train)

# Wrap the transformed values in a DataFrame, reusing the original row index
# and taking the column labels from the fitted pipeline.
X_train_transformed = pd.DataFrame(
    X_train_transformed_data,
    columns=pipeline.get_feature_names_out(),
    index=X_train.index,
)

X_train_transformed.head()

Unnamed: 0,onehot_cities__city_Algona,onehot_cities__city_Auburn,onehot_cities__city_Bellevue,onehot_cities__city_Black Diamond,onehot_cities__city_Bothell,onehot_cities__city_Burien,onehot_cities__city_Carnation,onehot_cities__city_Clyde Hill,onehot_cities__city_Covington,onehot_cities__city_Des Moines,...,cbrt__sqft_basement,cbrt__sqft_above,remainder__bedrooms,remainder__bathrooms,remainder__floors,remainder__waterfront,remainder__view,remainder__condition,remainder__yr_built,remainder__yr_renovated
3204,-0.022135,-0.188406,-0.24987,-0.031311,-0.073594,-0.122109,-0.049544,-0.038358,-0.086024,-0.122109,...,1.265745,-1.162915,-0.428522,-0.499473,-0.956399,-0.076885,-0.298047,0.765143,-1.494857,-0.818251
407,-0.022135,-0.188406,-0.24987,-0.031311,-0.073594,-0.122109,-0.049544,-0.038358,-0.086024,-0.122109,...,0.395872,-0.57013,-0.428522,0.154115,0.902684,-0.076885,-0.298047,-0.672228,1.117649,-0.818251
42,-0.022135,-0.188406,-0.24987,-0.031311,-0.073594,-0.122109,-0.049544,-0.038358,-0.086024,-0.122109,...,-0.784317,-1.384509,-1.53477,-1.479856,-0.956399,-0.076885,-0.298047,0.765143,-0.635045,1.210714
27,-0.022135,-0.188406,-0.24987,-0.031311,-0.073594,-0.122109,-0.049544,-0.038358,-0.086024,-0.122109,...,1.384602,-0.57013,0.677726,0.154115,-0.026857,-0.076885,-0.298047,2.202514,-1.627136,-0.818251
3517,-0.022135,-0.188406,-0.24987,-0.031311,-0.073594,-0.122109,-0.049544,-0.038358,-0.086024,-0.122109,...,1.265745,0.721655,0.677726,1.461291,0.902684,-0.076885,-0.298047,-0.672228,1.481416,-0.818251


In [7]:
# Validation and test features go through `transform` only, so they reuse
# whatever the pipeline already learned when it was fitted.
X_val_transformed_data = pipeline.transform(X_val)
X_test_transformed_data = pipeline.transform(X_test)

# Re-attach the original row index and the pipeline's output column names.
X_val_transformed = pd.DataFrame(
    X_val_transformed_data,
    columns=pipeline.get_feature_names_out(),
    index=X_val.index,
)
X_test_transformed = pd.DataFrame(
    X_test_transformed_data,
    columns=pipeline.get_feature_names_out(),
    index=X_test.index,
)

# Targets are log-transformed directly with NumPy; they do not pass through the
# feature pipeline, so the logTransformer class is not needed here.
# NOTE(review): np.log gives -inf / NaN for zero or negative targets — confirm
# the interim data contains only strictly positive values.
y_train_transformed, y_val_transformed, y_test_transformed = (
    np.log(target) for target in (y_train, y_val, y_test)
)

# Model Selection