# Model Training and Evaluation

This notebook is for experimenting with, benchmarking and documenting the accuracy of various models.

In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import numpy as np
import pandas as pd
from joblib import dump
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import KNeighborsRegressor

In [2]:
data = pd.read_csv("data/data.csv")
data = data[data.price > 10]
data.info()

categorical_features = data.dtypes[
    data.dtypes == "object"
].index.values  # ["housing_type", "laundry", "parking"]

data.describe()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28339 entries, 0 to 28338
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  28339 non-null  int64  
 1   price               28339 non-null  float64
 2   cats_ok             28339 non-null  bool   
 3   dogs_ok             28339 non-null  bool   
 4   housing_type        28339 non-null  object 
 5   laundry             28339 non-null  object 
 6   parking             28339 non-null  object 
 7   bedrooms            28339 non-null  float64
 8   bathrooms           28339 non-null  float64
 9   no_smoking          28339 non-null  bool   
 10  is_furnished        28339 non-null  bool   
 11  ev_charging         28339 non-null  bool   
 12  wheelchair_acccess  28339 non-null  bool   
 13  latitude            28339 non-null  float64
 14  longitude           28339 non-null  float64
dtypes: bool(6), float64(5), int64(1), object(3)
memory us

Unnamed: 0,id,price,bedrooms,bathrooms,latitude,longitude
count,28339.0,28339.0,28339.0,28339.0,28339.0,28339.0
mean,7257245000.0,2165.772716,1.699919,1.156675,40.739434,-73.948593
std,8542605.0,774.67224,1.050495,0.391407,0.266018,0.980717
min,7241938000.0,16.0,0.0,0.0,20.9174,-156.6772
25%,7249549000.0,1700.0,1.0,1.0,40.68351,-73.972518
50%,7257926000.0,2000.0,2.0,1.0,40.712265,-73.943129
75%,7264566000.0,2499.0,2.0,1.0,40.76515,-73.907905
max,7271850000.0,10000.0,8.0,5.0,44.3227,-71.0325


In [8]:
import plotly.express as px
fig = px.violin(data, y = "price", x= "bedrooms", width = 800, height = 800)
fig.show()

## Trial 1: Sklearn Random Forest Regressor

In [10]:
# 1. Split the data
X = data.drop(["price", "id"], axis=1, inplace=False)
y = data["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# 2. preprocess the features for training
ct = make_column_transformer(
    (
        OneHotEncoder(handle_unknown = "ignore"),
        categorical_features,
    ),
    remainder="passthrough",
)

# 3. Fit and score the model
rf = DecisionTreeRegressor()
model = make_pipeline(ct, rf)
model = model.fit(X_train, y_train)

predictions = model.predict(X_test)
error = abs(predictions - y_test)
print(error.describe())
model.score(X_test, y_test)

# 4. Spot check worst errors
spotcheck = data.copy(deep = True)
spotcheck["predictions"] = model.predict(X)
spotcheck["error"] = abs(spotcheck.predictions - spotcheck.price)
spotcheck.sort_values(by = "error", ascending = False).head(10)

count    5567.000000
mean      223.454704
std       455.112597
min         0.000000
25%         0.000000
50%        55.000000
75%       249.250000
max      7800.000000
Name: price, dtype: float64


0.5840020987041878

Unnamed: 0,id,price,cats_ok,dogs_ok,housing_type,laundry,parking,bedrooms,bathrooms,no_smoking,is_furnished,ev_charging,wheelchair_acccess,latitude,longitude,predictions,error
16827,7260419304,700.0,False,False,apartment,laundry on site,attached garage,3.0,2.0,False,False,False,False,40.7152,-73.9877,8500.0,7800.0
13930,7258118719,9000.0,False,False,apartment,laundry in bldg,no parking,3.0,2.0,False,False,False,False,40.777003,-73.961628,1550.0,7450.0
12861,7256880366,1346.0,True,True,townhouse,w/d in unit,attached garage,3.0,3.0,False,False,False,True,40.791,-74.0634,6818.0,5472.0
6430,7249400042,2699.0,True,True,apartment,w/d in unit,street parking,3.0,2.0,False,False,False,False,40.69321,-73.971872,7900.0,5201.0
19720,7262999306,3986.0,True,True,apartment,w/d in unit,detached garage,3.0,2.5,False,False,False,True,40.7859,-74.2568,8718.0,4732.0
20697,7263944924,350.0,True,True,apartment,no laundry on site,no parking,3.0,2.0,True,False,False,False,40.7852,-73.9615,4950.0,4600.0
26867,7270316139,10000.0,True,True,apartment,w/d in unit,street parking,3.0,3.0,False,False,False,False,40.710823,-74.00565,5436.5,4563.5
22864,7266288698,1200.0,False,True,duplex,w/d hookups,street parking,4.0,2.5,False,False,False,False,40.6789,-73.9664,5650.0,4450.0
13581,7257373866,6943.0,True,True,apartment,w/d in unit,attached garage,2.0,1.0,False,False,False,False,40.709106,-74.005516,2590.0,4353.0
22980,7266171826,675.0,True,True,apartment,w/d in unit,attached garage,4.0,2.0,False,True,False,False,40.696986,-73.935222,4811.0,4136.0


In [None]:
from .config import options
model_columns = options["column order"]
raw_features = dict(zip(model.steps[0][1].get_feature_names(),model.steps[1][1].feature_importances_))
from web.app import clean_features
cleaned_features = clean_features(raw_features, model_columns)
import json
json.dump (cleaned_features, open("feature_importances.json", "wb"))

## Trial 1.1 - SHAP

In [15]:
import shap

from web.app import form_data_to_dataframe

explainer = shap.TreeExplainer (model.steps[1][1])

test_case = {
    "address": "50 Eldridge Street New York NY",
    "cats_ok": True,
    "dogs_ok": True,
    "housing_type": "apartment",
    "laundry": "laundry in bldg",
    "bedrooms": 3,
    "bathrooms": 1,
    "parking": "street parking",
    "no_smoking": False,
    "is_furnished": True,
    "wheelchair_acccess": True,
    "ev_charging": True
}

test_dataframe = form_data_to_dataframe(test_case)
test_vector = model.steps[0][1].transform(test_dataframe)
explainer.expected_value
model.steps[0][1].get_feature_names()

array([[-6.14387583e+00,  6.95824321e-02, -7.79417799e-03,
        -2.45298836e+00,  3.06865673e-03,  8.80173969e-01,
         0.00000000e+00,  0.00000000e+00, -2.37387749e+01,
        -2.08677344e-01,  3.72397314e+01,  1.90031368e+00,
         6.22063821e-01,  4.71036196e-01, -1.73983219e+01,
        -7.56887753e+00,  1.31973739e+00, -4.98988248e+00,
         7.08418162e+00,  1.34848265e+01,  9.70213909e+00,
        -2.03265691e+00, -7.75532234e+00, -6.54604540e-01,
         1.10619157e+02, -1.57103975e+02, -3.57132867e+00,
         2.52075786e+01, -1.67394227e+02, -1.40368381e+01,
         3.79503916e+01,  4.01054464e+02]])

['onehotencoder__x0_apartment',
 'onehotencoder__x0_condo',
 'onehotencoder__x0_cottage/cabin',
 'onehotencoder__x0_duplex',
 'onehotencoder__x0_flat',
 'onehotencoder__x0_house',
 'onehotencoder__x0_in-law',
 'onehotencoder__x0_land',
 'onehotencoder__x0_loft',
 'onehotencoder__x0_townhouse',
 'onehotencoder__x1_laundry in bldg',
 'onehotencoder__x1_laundry on site',
 'onehotencoder__x1_no laundry on site',
 'onehotencoder__x1_w/d hookups',
 'onehotencoder__x1_w/d in unit',
 'onehotencoder__x2_attached garage',
 'onehotencoder__x2_carport',
 'onehotencoder__x2_detached garage',
 'onehotencoder__x2_no parking',
 'onehotencoder__x2_off-street parking',
 'onehotencoder__x2_street parking',
 'onehotencoder__x2_valet parking',
 'cats_ok',
 'dogs_ok',
 'bedrooms',
 'bathrooms',
 'no_smoking',
 'is_furnished',
 'ev_charging',
 'wheelchair_acccess',
 'latitude',
 'longitude']

## Trial 2: KNN Regressor

KNN is inherently more explainable than Random Forest

In [None]:
# 1. Split the data
X = data.drop(["price", "id"], axis=1, inplace=False)
y = data["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# 2. preprocess the features for training
ct = make_column_transformer(
    (
        OneHotEncoder(handle_unknown = "ignore"),
        categorical_features,
    ),
    remainder="passthrough",
)

# 3. Fit and score the model
knn = KNeighborsRegressor()
model = make_pipeline(ct, knn)
model = model.fit(X_train, y_train)

predictions = model.predict(X_test)
error = abs(predictions - y_test)
print(error.describe())
model.score(X_test, y_test)

# 4. Spot check worst errors
spotcheck = data.copy(deep = True)
spotcheck["predictions"] = model.predict(X)
spotcheck["error"] = abs(spotcheck.predictions - spotcheck.price)
spotcheck.sort_values(by = "error", ascending = False).head(10)

## Trial 3: Linear Regressor

In [None]:
from sklearn.linear_model import LinearRegression

# 1. Split the data
X = data.drop(["price", "id"], axis=1, inplace=False)
y = data["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# 2. preprocess the features for training
ct = make_column_transformer(
    (
        OneHotEncoder(handle_unknown = "ignore"),
        categorical_features,
    ),
    remainder="passthrough",
)

# 3. Fit and score the model
lr = LinearRegression()
model = make_pipeline(ct, lr)
model = model.fit(X_train, y_train)

predictions = model.predict(X_test)
error = abs(predictions - y_test)
print(error.describe())
model.score(X_test, y_test)

# 4. Spot check worst errors
spotcheck = data.copy(deep = True)
spotcheck["predictions"] = model.predict(X)
spotcheck["error"] = abs(spotcheck.predictions - spotcheck.price)
spotcheck.sort_values(by = "error", ascending = False).head(10)

In [None]:
[round(i, 2) for i in model.steps[1][1].coef_]
model.steps[1][1].intercept_
model.steps[0][1].get_feature_names()