# Model Training and Evaluation

This notebook is for experimenting with, benchmarking and documenting the accuracy of various models.

In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Display the result of every top-level expression statement in a cell, not
# only the last one. Cells below rely on this to show several outputs at once.
InteractiveShell.ast_node_interactivity = "all"

import numpy as np
import pandas as pd
from joblib import dump
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Single seed shared by the train/test split and the forest, so that
# Restart & Run All reproduces the same partition, model and scores.
SEED = 42

# Load the listings and keep only rows with price > 10.
# .copy() makes the filtered frame independent of the one returned by
# read_csv, so the column assignments further down write to a real frame
# instead of a slice (which can trigger SettingWithCopyWarning).
data = pd.read_csv("data.csv")
data = data[data.price > 10].copy()

# 1. Split the data
X = data.drop(columns=["price", "id"])
y = data["price"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# Object-typed columns are one-hot encoded. They are looked up on X (not on
# data) so that a dropped column can never be handed to the transformer.
categorical_features = X.dtypes[
    X.dtypes == "object"
].index.values  # ["housing_type", "laundry", "parking"]

# 2. preprocess the features for training
ct = make_column_transformer(
    (OneHotEncoder(handle_unknown="ignore"), categorical_features),
    remainder="passthrough",  # every other column is fed to the model unchanged
)
rf = RandomForestRegressor(random_state=SEED)

# 3. Fit and score the model
model = make_pipeline(ct, rf)
model.fit(X_train, y_train)

# Per-row prediction and absolute error for every listing; stored on `data`
# so that later cells can inspect the worst cases.
data["predictions"] = model.predict(X)
data["error"] = (data["predictions"] - data["price"]).abs()

# Error over ALL rows. This is optimistic: 80% of these rows were used to fit
# the model.
data["error"].describe()

# Error over the held-out rows only (X_test keeps the index labels of data).
data.loc[X_test.index, "error"].describe()

# R^2 of the fitted pipeline on the held-out rows.
model.score(X_test, y_test)

In [25]:
# Show the ten rows with the largest absolute prediction error, worst first.
(
    data
    .sort_values("error", ascending=False)
    .iloc[:10]
)

Unnamed: 0,id,price,cats_ok,dogs_ok,housing_type,laundry,bedrooms,bathrooms,parking,no_smoking,is_furnished,wheelchair_acccess,ev_charging,latitude,longitude,predictions,error
1830,7242059570,5975.0,True,True,apartment,w/d in unit,3.0,1.0,detached garage,True,False,False,False,40.680488,-74.000344,2982.58,2992.42
1276,7242887976,750.0,False,False,condo,w/d in unit,2.0,2.0,street parking,True,True,False,False,36.776,-76.0766,3545.691667,2795.691667
2812,7241056352,11750.0,True,True,apartment,w/d in unit,2.0,2.0,street parking,True,False,False,False,40.723882,-74.003466,9100.046,2649.954
859,7243291968,7100.0,True,True,house,laundry in bldg,6.0,4.5,attached garage,False,False,False,False,40.790235,-73.742666,4620.88,2479.12
552,7243680274,1000.0,False,False,apartment,w/d in unit,3.0,0.5,street parking,True,False,False,False,40.6883,-74.0007,3371.29,2371.29
464,7243721252,2500.0,False,False,apartment,w/d hookups,3.0,2.0,no parking,False,False,False,False,40.762396,-73.978601,4808.34,2308.34
3301,7244515400,4500.0,False,False,apartment,laundry in bldg,3.0,1.0,street parking,False,False,False,False,40.6451,-73.945,2376.35,2123.65
1551,7242457718,1615.0,True,True,apartment,laundry in bldg,0.0,1.0,street parking,False,False,False,False,40.7012,-73.9436,3692.76,2077.76
2502,7241353528,10000.0,False,False,loft,laundry in bldg,2.0,3.0,attached garage,False,True,False,False,40.743934,-73.926171,8087.44,1912.56
2741,7241094494,1300.0,True,True,apartment,w/d in unit,1.0,1.5,street parking,False,False,False,False,40.708151,-73.942481,3170.24,1870.24
