In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
import sys
# Put the parent directory on the import path so that the `src.*` modules
# imported further down can be resolved.
# NOTE(review): assumes the notebook is run from a folder directly below the project root — confirm.
sys.path.append('..')



Now that we know which features we want to use and the format we want them in, we can try out some different models, hoping to pick out some promising ones. The first thing that needs to be done is to get the data pipeline going.

# Transformation Pipeline 
Now that the preprocessing steps are defined, we can wrap all of this neatly into a Pipeline, allowing us to train and test different models more efficiently.

In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from src.features import logTransformer
from src.features import cbrtTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
# Scaler instance, created with scikit-learn's default settings.
standard_scaler = StandardScaler()

# One-hot encoder configured to:
#   - return a dense NumPy array instead of a SciPy sparse matrix, and
#   - ignore categories that were not seen during fit rather than raising.
one_hot_encoder = OneHotEncoder(
    handle_unknown='ignore',
    sparse_output=False,
)

In [13]:
# Column-wise preprocessing:
#   - 'city' is one-hot encoded,
#   - 'sqft_lot' is sent through logTransformer,
#   - 'sqft_living', 'sqft_basement' and 'sqft_above' are sent through cbrtTransformer,
#   - every column not listed here is passed through unchanged.
preprocess = ColumnTransformer(
    transformers=[
        ("onehot_cities", one_hot_encoder, ['city']),
        ("log", logTransformer(), ['sqft_lot']),
        ('cbrt', cbrtTransformer(), ['sqft_living', 'sqft_basement', 'sqft_above']),
    ],
    remainder='passthrough',
)

# Full pipeline: run the column-wise preprocessing, then apply the scaler to
# every resulting column (the one-hot indicator columns included).
pipeline = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ('stdscaler', standard_scaler),
    ]
)


In [17]:
from src.dataset import split_training_val_test
# Read the interim dataset and remove the 'Unnamed: 0' column
# (presumably an index that was written out with the CSV — verify).
housing_data = pd.read_csv(
    '../data/interim/housing_data_with_dropped_features'
).drop(columns='Unnamed: 0')

# y is the first column (kept as a single-column DataFrame by the `:1` slice);
# X is every remaining column.
y = housing_data.iloc[:, :1]
X = housing_data.iloc[:, 1:]

# Project helper returns the three feature splits, the three target splits and
# a seventh value bound to train_df.
(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    train_df,
) = split_training_val_test(X=X, y=y)
X_train

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,city
4159,4.0,2.50,2610,5140,2.0,0,0,3,2610,0,2006,0,Seattle
666,3.0,2.50,3100,20553,1.0,0,0,3,3100,0,1954,2005,Shoreline
3337,3.0,1.50,2680,4775,2.0,0,2,5,1880,800,1913,0,Seattle
3794,4.0,1.50,2550,5055,2.0,0,0,4,2550,0,1910,0,Seattle
4206,3.0,1.00,1450,7200,1.0,0,0,3,1010,440,1969,2010,Kirkland
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4414,4.0,2.25,1720,8300,1.0,0,0,4,1720,0,1973,0,Federal Way
2003,4.0,2.50,2260,3713,2.0,0,0,3,2260,0,2003,0,Kirkland
3647,3.0,2.75,1300,14197,1.0,0,0,3,860,440,1996,0,Kent
4170,3.0,2.50,2500,7394,2.0,0,0,3,2500,0,1990,2009,Woodinville


In [18]:
# Fit the pipeline on the training features and, in the same call, get back
# the transformed training data.
pipeline_data = pipeline.fit_transform(X_train) 
# Bare expression so the notebook displays the result as the cell output.
pipeline_data

array([[-0.19011728, -0.25819889, -0.04927843, ..., -0.68818703,
         1.20909842, -0.82170389],
       [-0.19011728, -0.25819889, -0.04927843, ..., -0.68818703,
        -0.53627307,  1.22812265],
       [-0.19011728, -0.25819889, -0.04927843, ...,  2.28962855,
        -1.91243136, -0.82170389],
       ...,
       [-0.19011728, -0.25819889, -0.04927843, ..., -0.68818703,
         0.87345006, -0.82170389],
       [-0.19011728, -0.25819889, -0.04927843, ..., -0.68818703,
         0.67206104,  1.23221208],
       [-0.19011728, -0.25819889, -0.04927843, ..., -0.68818703,
        -1.40895881,  1.17598242]])

In [20]:
# Re-attach labels to the transformed data: rows reuse X_train's index and
# columns take the output feature names reported by the pipeline.
pipeline_data_df = pd.DataFrame(
    pipeline_data,
    index=X_train.index,
    columns=pipeline.get_feature_names_out(),
)
pipeline_data_df.head()

Unnamed: 0,onehot_cities__city_Auburn,onehot_cities__city_Bellevue,onehot_cities__city_Black Diamond,onehot_cities__city_Bothell,onehot_cities__city_Burien,onehot_cities__city_Carnation,onehot_cities__city_Clyde Hill,onehot_cities__city_Covington,onehot_cities__city_Des Moines,onehot_cities__city_Duvall,...,cbrt__sqft_basement,cbrt__sqft_above,remainder__bedrooms,remainder__bathrooms,remainder__floors,remainder__waterfront,remainder__view,remainder__condition,remainder__yr_built,remainder__yr_renovated
4159,-0.190117,-0.258199,-0.049278,-0.076472,-0.115129,-0.062378,-0.053995,-0.076472,-0.106155,-0.101385,...,-0.81273,1.028149,0.653373,0.425783,0.922382,-0.079614,-0.315334,-0.688187,1.209098,-0.821704
666,-0.190117,-0.258199,-0.049278,-0.076472,-0.115129,-0.062378,-0.053995,-0.076472,-0.106155,-0.101385,...,-0.81273,1.487456,-0.435934,0.425783,-0.954204,-0.079614,-0.315334,-0.688187,-0.536273,1.228123
3337,-0.190117,-0.258199,-0.049278,-0.076472,-0.115129,-0.062378,-0.053995,-0.076472,-0.106155,-0.101385,...,1.232885,0.222069,-0.435934,-0.837338,0.922382,-0.079614,2.25212,2.289629,-1.912431,-0.821704
3794,-0.190117,-0.258199,-0.049278,-0.076472,-0.115129,-0.062378,-0.053995,-0.076472,-0.106155,-0.101385,...,-0.81273,0.968059,0.653373,-0.837338,0.922382,-0.079614,-0.315334,0.800721,-2.013126,-0.821704
4206,-0.190117,-0.258199,-0.049278,-0.076472,-0.115129,-0.062378,-0.053995,-0.076472,-0.106155,-0.101385,...,0.863286,-1.082769,-0.435934,-1.468899,-0.954204,-0.079614,-0.315334,-0.688187,-0.032801,1.233234
