# Model Training and Evaluation

This notebook is for experimenting with, benchmarking and documenting the accuracy of various models.

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import numpy as np
import pandas as pd
from joblib import dump
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import KNeighborsRegressor

In [3]:
data = pd.read_csv("data.csv")
data = data[data.price > 10]
data.info()

categorical_features = data.dtypes[
    data.dtypes == "object"
].index.values  # ["housing_type", "laundry", "parking"]

data.describe()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27833 entries, 0 to 27832
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  27833 non-null  int64  
 1   price               27833 non-null  float64
 2   cats_ok             27833 non-null  bool   
 3   dogs_ok             27833 non-null  bool   
 4   housing_type        27833 non-null  object 
 5   laundry             27833 non-null  object 
 6   parking             27833 non-null  object 
 7   bedrooms            27833 non-null  float64
 8   bathrooms           27833 non-null  float64
 9   no_smoking          27833 non-null  bool   
 10  is_furnished        27833 non-null  bool   
 11  ev_charging         27833 non-null  bool   
 12  wheelchair_acccess  27833 non-null  bool   
 13  latitude            27833 non-null  float64
 14  longitude           27833 non-null  float64
dtypes: bool(6), float64(5), int64(1), object(3)
memory us

Unnamed: 0,id,price,bedrooms,bathrooms,latitude,longitude
count,27833.0,27833.0,27833.0,27833.0,27833.0,27833.0
mean,7256984000.0,2166.11375,1.700895,1.15708,40.739576,-73.948984
std,8396103.0,774.96046,1.050926,0.39069,0.267998,0.989485
min,7241938000.0,16.0,0.0,0.0,20.9174,-156.6772
25%,7249436000.0,1700.0,1.0,1.0,40.6836,-73.972518
50%,7257756000.0,2000.0,2.0,1.0,40.712702,-73.942769
75%,7264252000.0,2499.0,2.0,1.0,40.765152,-73.907604
max,7271335000.0,10000.0,8.0,5.0,44.3227,-71.0325


## Trial 1: Sklearn Random Forest Regressor

In [18]:
# 1. Split the data
X = data.drop(["price", "id"], axis=1, inplace=False)
y = data["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# 2. preprocess the features for training
ct = make_column_transformer(
    (
        OneHotEncoder(handle_unknown = "ignore"),
        categorical_features,
    ),
    remainder="passthrough",
)

# 3. Fit and score the model
rf = RandomForestRegressor()
model = make_pipeline(ct, rf)
model = model.fit(X_train, y_train)

predictions = model.predict(X_test)
error = abs(predictions - y_test)
print(error.describe())
model.score(X_test, y_test)

# 4. Spot check worst errors
spotcheck = data.copy(deep = True)
spotcheck["predictions"] = model.predict(X)
spotcheck["error"] = abs(spotcheck.predictions - spotcheck.price)
spotcheck.sort_values(by = "error", ascending = False).head(10)

count    5567.000000
mean      483.518139
std       510.198484
min         1.306751
25%       181.806751
50%       355.306751
75%       605.306751
max      6052.319777
Name: price, dtype: float64


0.19109741621410914

Unnamed: 0,id,price,cats_ok,dogs_ok,housing_type,laundry,parking,bedrooms,bathrooms,no_smoking,is_furnished,ev_charging,wheelchair_acccess,latitude,longitude,predictions,error
7362,7250359896,10000.0,False,False,loft,w/d in unit,attached garage,2.0,3.0,False,True,False,False,40.743934,-73.926171,2943.79612,7056.20388
26867,7270316139,10000.0,True,True,apartment,w/d in unit,street parking,3.0,3.0,False,False,False,False,40.710823,-74.00565,2947.680223,7052.319777
18051,7261439030,9995.0,True,True,apartment,laundry in bldg,street parking,3.0,3.0,False,False,False,False,40.717075,-74.00483,2947.680223,7047.319777
22930,7266212958,8718.0,False,False,apartment,w/d in unit,street parking,3.0,0.0,False,False,False,False,40.710884,-74.00561,2009.190855,6708.809145
20005,7263688258,9000.0,False,False,loft,w/d in unit,attached garage,2.0,3.0,False,True,False,False,40.743934,-73.926171,2943.79612,6056.20388
13930,7258118719,9000.0,False,False,apartment,laundry in bldg,no parking,3.0,2.0,False,False,False,False,40.777003,-73.961628,2947.680223,6052.319777
3024,7245127913,8800.0,False,False,apartment,w/d in unit,street parking,3.0,2.0,False,False,False,False,40.72998,-73.98207,2947.680223,5852.319777
15747,7259654551,8718.0,True,True,apartment,w/d in unit,detached garage,3.0,2.5,False,False,False,False,40.710397,-74.005599,2947.680223,5770.319777
3025,7245127220,8700.0,False,False,condo,w/d in unit,street parking,2.0,2.0,False,False,False,False,40.716488,-74.01445,2943.79612,5756.20388
13598,7257365911,8500.0,True,True,apartment,w/d in unit,attached garage,3.0,2.0,False,False,False,False,40.709904,-73.995667,2947.680223,5552.319777


In [None]:
from .config import options
model_columns = options["column order"]
raw_features = dict(zip(model.steps[0][1].get_feature_names(),model.steps[1][1].feature_importances_))
from web.app import clean_features
cleaned_features = clean_features(raw_features, model_columns)
import json
json.dump (cleaned_features, open("feature_importances.json", "wb"))

## Trial 1.1 - SHAP

In [17]:
import shap

from web.app import form_data_to_dataframe

explainer = shap.TreeExplainer (model.steps[1][1])

test_case = {
    "address": "50 Eldridge Street New York NY",
    "cats_ok": True,
    "dogs_ok": False,
    "housing_type": "apartment",
    "laundry": "laundry in bldg",
    "bedrooms": 3,
    "bathrooms": 1,
    "parking": "street parking",
    "no_smoking": False,
    "is_furnished": True,
    "wheelchair_acccess": True,
    "ev_charging": False
}

test_dataframe = form_data_to_dataframe(test_case)
test_vector = model.steps[0][1].transform(test_dataframe)

explainer.shap_values(test_vector)
model.steps[0][1].get_feature_names()

array([[-2.47282564e+00, -3.77158754e-02, -1.91803463e-02,
        -9.27437646e-01, -5.47930724e-03,  5.61004090e-01,
        -4.85093852e-03, -1.11600223e-02, -1.40015397e+01,
        -3.38415320e-02,  1.58666985e+01,  1.50162682e+00,
         2.09539081e+01,  7.12996387e-01, -2.75409479e+01,
        -4.65363966e+00,  7.26057437e-01, -7.19232686e+00,
         4.28834769e-01,  9.84963462e+00, -1.98421437e+00,
        -1.08693085e+00,  2.84791277e-01, -1.11942223e+01,
         2.93316795e+02, -1.32805494e+02,  1.29448992e+00,
        -7.69719251e+01, -2.60916226e+00, -1.42894480e+01,
         1.30075124e+01,  2.68914199e+02]])

['onehotencoder__x0_apartment',
 'onehotencoder__x0_condo',
 'onehotencoder__x0_cottage/cabin',
 'onehotencoder__x0_duplex',
 'onehotencoder__x0_flat',
 'onehotencoder__x0_house',
 'onehotencoder__x0_in-law',
 'onehotencoder__x0_land',
 'onehotencoder__x0_loft',
 'onehotencoder__x0_townhouse',
 'onehotencoder__x1_laundry in bldg',
 'onehotencoder__x1_laundry on site',
 'onehotencoder__x1_no laundry on site',
 'onehotencoder__x1_w/d hookups',
 'onehotencoder__x1_w/d in unit',
 'onehotencoder__x2_attached garage',
 'onehotencoder__x2_carport',
 'onehotencoder__x2_detached garage',
 'onehotencoder__x2_no parking',
 'onehotencoder__x2_off-street parking',
 'onehotencoder__x2_street parking',
 'onehotencoder__x2_valet parking',
 'cats_ok',
 'dogs_ok',
 'bedrooms',
 'bathrooms',
 'no_smoking',
 'is_furnished',
 'ev_charging',
 'wheelchair_acccess',
 'latitude',
 'longitude']

## Trial 2: KNN Regressor

KNN is inherently more explainable than Random Forest

In [None]:
# 1. Split the data
X = data.drop(["price", "id"], axis=1, inplace=False)
y = data["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# 2. preprocess the features for training
ct = make_column_transformer(
    (
        OneHotEncoder(handle_unknown = "ignore"),
        categorical_features,
    ),
    remainder="passthrough",
)

# 3. Fit and score the model
knn = KNeighborsRegressor()
model = make_pipeline(ct, knn)
model = model.fit(X_train, y_train)

predictions = model.predict(X_test)
error = abs(predictions - y_test)
print(error.describe())
model.score(X_test, y_test)

# 4. Spot check worst errors
spotcheck = data.copy(deep = True)
spotcheck["predictions"] = model.predict(X)
spotcheck["error"] = abs(spotcheck.predictions - spotcheck.price)
spotcheck.sort_values(by = "error", ascending = False).head(10)

## Trial 3: Linear Regressor

In [None]:
from sklearn.linear_model import LinearRegression

# 1. Split the data
X = data.drop(["price", "id"], axis=1, inplace=False)
y = data["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# 2. preprocess the features for training
ct = make_column_transformer(
    (
        OneHotEncoder(handle_unknown = "ignore"),
        categorical_features,
    ),
    remainder="passthrough",
)

# 3. Fit and score the model
lr = LinearRegression()
model = make_pipeline(ct, lr)
model = model.fit(X_train, y_train)

predictions = model.predict(X_test)
error = abs(predictions - y_test)
print(error.describe())
model.score(X_test, y_test)

# 4. Spot check worst errors
spotcheck = data.copy(deep = True)
spotcheck["predictions"] = model.predict(X)
spotcheck["error"] = abs(spotcheck.predictions - spotcheck.price)
spotcheck.sort_values(by = "error", ascending = False).head(10)

In [None]:
[round(i, 2) for i in model.steps[1][1].coef_]
model.steps[1][1].intercept_
model.steps[0][1].get_feature_names()