# Lab 04: Ames Iowa Home Prices

---
author: Brady Brooks
date: October 4, 2024
embed-resources: true
---

## Introduction

## Methods

In [204]:
# imports
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
from joblib import dump
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

### Data

In [205]:
# Read the train and test splits from the course server (train first, then
# test) and display the training frame.
ames_train, ames_test = map(
    pd.read_csv,
    (
        "https://cs307.org/lab-04/data/ames-train.csv",
        "https://cs307.org/lab-04/data/ames-test.csv",
    ),
)
ames_train

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,2832,908188140,160,RM,24.0,2522,Pave,,Reg,Lvl,...,0,,,,0,4,2006,WD,Normal,137500
1,2736,905426150,80,RL,,19690,Pave,,IR1,Lvl,...,738,Gd,GdPrv,,0,8,2006,WD,Alloca,274970
2,2135,907200130,20,RL,97.0,11800,Pave,,IR1,Bnk,...,0,,,,0,8,2007,WD,Family,131000
3,2424,528228415,120,RM,,3072,Pave,,Reg,Lvl,...,0,,,,0,5,2006,WD,Normal,178740
4,1967,535457020,20,RL,80.0,8000,Pave,,Reg,Lvl,...,0,,MnPrv,,0,11,2007,WD,Normal,156500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1870,1020,527302070,20,RL,,10825,Pave,,IR1,Lvl,...,0,,,,0,7,2008,WD,Normal,181900
1871,237,905426200,20,RL,65.0,11479,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,144500
1872,1547,910202050,30,RM,40.0,3636,Pave,,Reg,Lvl,...,0,,MnPrv,,0,1,2008,WD,Normal,55000
1873,1855,533251110,20,RL,80.0,12000,Pave,,Reg,Lvl,...,0,,,,0,3,2007,WD,Normal,255000


In [206]:
# summary statistics
# dimensions of the training frame as (n_rows, n_columns)
ames_train.shape

(1875, 82)

In [207]:
# TODO: visualizations (no plots have been added to this cell yet)


In [208]:
# Screen for weakly-informative ("garbage") numeric features by looking at
# each numeric column's correlation with SalePrice.
# Note: a histogram-based gradient boosted regressor is the suggested model.
num_feat_temp = ames_train.select_dtypes(include=["int64", "float64"])
# keep only the SalePrice column of the correlation matrix (as a DataFrame)
corr_price = num_feat_temp.corr().loc[:, ["SalePrice"]]
corr_price

Unnamed: 0,SalePrice
Order,-0.006905
PID,-0.227301
MS SubClass,-0.06785
Lot Frontage,0.328198
Lot Area,0.274394
Overall Qual,0.791695
Overall Cond,-0.103323
Year Built,0.536898
Year Remod/Add,0.511713
Mas Vnr Area,0.491531


In [209]:
# Count missing values per categorical (object/category dtype) column.
cat_feat_temp = ames_train.select_dtypes(include=["object", "category"])
cat_feat_temp.isnull().sum()

MS Zoning            0
Street               0
Alley             1738
Lot Shape            0
Land Contour         0
Utilities            0
Lot Config           0
Land Slope           0
Neighborhood         0
Condition 1          0
Condition 2          0
Bldg Type            0
House Style          0
Roof Style           0
Roof Matl            0
Exterior 1st         0
Exterior 2nd         0
Mas Vnr Type      1121
Exter Qual           0
Exter Cond           0
Foundation           0
Bsmt Qual           47
Bsmt Cond           47
Bsmt Exposure       49
BsmtFin Type 1      47
BsmtFin Type 2      47
Heating              0
Heating QC           0
Central Air          0
Electrical           0
Kitchen Qual         0
Functional           0
Fireplace Qu       905
Garage Type        100
Garage Finish      102
Garage Qual        102
Garage Cond        102
Paved Drive          0
Pool QC           1864
Fence             1502
Misc Feature      1807
Sale Type            0
Sale Condition       0
dtype: int64

In [210]:
# Columns removed before modeling; one shared list keeps the train and test
# splits in sync.
dropped_columns = [
    "Order",
    "PID",
    "MS SubClass",
    "Low Qual Fin SF",
    "Bsmt Half Bath",
    "Misc Val",
    "Alley",
    "Misc Feature",
]
ames_train = ames_train.drop(columns=dropped_columns)
ames_test = ames_test.drop(columns=dropped_columns)

### Models

In [211]:
# Separate the predictors (X) from the response (y) for each split.
target = "SalePrice"

# training split
y_train = ames_train[target]
X_train = ames_train.drop(columns=target)

# test split
y_test = ames_test[target]
X_test = ames_test.drop(columns=target)

In [212]:
# Split the predictors by dtype so each group gets its own preprocessing.
numeric_features = X_train.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_features = X_train.select_dtypes(include=["object", "category"]).columns.tolist()

# pipeline for numeric: median-impute missing values, then standardize
numeric_preprocessor = Pipeline(steps=[
    ("MedianImputer", SimpleImputer(strategy="median")),
    ("Standardize", StandardScaler()),
])
# pipeline for categorical: impute with the most frequent level, then one-hot
# encode (max_categories=5 groups rarer levels into an "infrequent" column)
categorical_preprocessor = Pipeline(steps=[
    ("ModelImputer", SimpleImputer(strategy="most_frequent")),
    ("OneHotEncoder", OneHotEncoder(handle_unknown="infrequent_if_exist", max_categories=5)),
])
# column transformer
# sparse_threshold=0 forces a dense output. By default ColumnTransformer
# returns a sparse matrix whenever the stacked result's density falls below
# 0.3, and HistGradientBoostingRegressor does not accept sparse input, so
# those candidates would fail depending on how sparse the encoding happens
# to be.
preprocessor = ColumnTransformer(
    [
        ("NumericProcessing", numeric_preprocessor, numeric_features),
        ("CategoricalProcessing", categorical_preprocessor, categorical_features),
    ],
    sparse_threshold=0,
)
# full pipeline; the "Regressor" step is a placeholder that the grid search
# replaces with each candidate estimator
pipeline = Pipeline(steps=[
    ("Preprocessor", preprocessor),
    ("Regressor", KNeighborsRegressor()),
])

In [213]:
# Hyperparameter search space: one sub-grid per candidate regressor. Keys use
# the "<step>__<param>" convention to address parameters inside the pipeline.
param_grid = [
    # k-nearest neighbors
    {
        "Regressor": [KNeighborsRegressor()],
        "Preprocessor__CategoricalProcessing__OneHotEncoder__drop": [None, "first"],
        "Regressor__n_neighbors": range(5, 20, 1),
        "Regressor__metric": ["euclidean", "manhattan"],
    },
    # histogram-based gradient boosting
    {
        "Regressor": [HistGradientBoostingRegressor()],
        "Regressor__learning_rate": [0.1, 0.01],
        "Regressor__max_iter": [1000],
        "Regressor__max_leaf_nodes": [19, 20, 21, 22, 23],
        "Regressor__max_depth": [None, 3],
        # was [0.5, 0.6, 0, 7]: the "0,7" typo searched l2=0 and l2=7
        # instead of the intended 0.7
        "Regressor__l2_regularization": [0.5, 0.6, 0.7],
    },
]

In [None]:
# train models
# Exhaustive 5-fold cross-validated search over param_grid, scored by
# negative mean absolute percentage error (higher is better).
# Warnings are silenced only for the duration of the fit; a bare
# warnings.filterwarnings("ignore") would hide warnings in every later cell
# for the rest of the kernel session.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    mod = GridSearchCV(
        pipeline,
        param_grid=param_grid,
        n_jobs=-1,
        cv=5,
        verbose=3,
        scoring="neg_mean_absolute_percentage_error",
    )
    mod.fit(X_train, y_train)
# display the fitted search object, as the original cell's last expression did
mod

## Results

In [216]:
# hyperparameter combination with the best mean cross-validated score
mod.best_params_

{'Regressor': HistGradientBoostingRegressor(),
 'Regressor__l2_regularization': 0.6,
 'Regressor__learning_rate': 0.01,
 'Regressor__max_depth': None,
 'Regressor__max_iter': 1000,
 'Regressor__max_leaf_nodes': 21}

In [217]:
# report model metrics
# best_score_ is the mean cross-validated score of the best candidate; the
# scorer is *negative* MAPE, so flip the sign to report MAPE itself.
# NOTE(review): this is a cross-validation estimate on the training data;
# X_test / y_test are not evaluated anywhere in the visible cells.
-mod.best_score_

np.float64(0.090955937647953)

## Discussion

### Conclusion

In [None]:
# Persist the fitted grid-search object to disk (`dump` is imported with the
# other dependencies in the imports cell at the top of the notebook).
dump(mod, "ames-housing.joblib")

['ames-housing.joblib']