# Model Training and Evaluation

This notebook is for experimenting with, benchmarking and documenting the accuracy of various models.

In [2]:
import numpy as np
import pandas as pd
from joblib import dump
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

In [6]:
data = pd.read_csv("data.csv")

categorical_features = data.dtypes[
    data.dtypes == "object"
].index.values  # ["housing_type", "laundry", "parking"]

# 1. Split the data
X = data.drop(["price", "id"], axis=1, inplace=False)
y = data["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# 2. preprocess the features for training
ct = make_column_transformer(
    (
        OneHotEncoder(handle_unknown = "ignore"),
        categorical_features,
    ),
    remainder="passthrough",
)
rf = RandomForestRegressor()

# 3. Fit and score the model
model = make_pipeline(ct, rf)
model.fit(X_train, y_train)

# predictions = model.predict(X_test)
# error = abs(predictions - y_test)
# print(error.describe())

data["predictions"] = model.predict(X)
data["error"] = abs(data.predictions - data.price)
data.error.describe()
data.sort_values(by = "error", ascending = False).head(10)

Unnamed: 0,id,price,cats_ok,dogs_ok,housing_type,laundry,bedrooms,bathrooms,parking,no_smoking,is_furnished,wheelchair_acccess,ev_charging,latitude,longitude,predictions,error
400,7243739917,5995.0,True,True,apartment,laundry in bldg,2.0,1.0,street parking,False,False,False,False,40.72664,-73.994129,2406.187619,3588.812381
981,7243204343,1199.0,True,False,apartment,laundry in bldg,2.0,2.0,street parking,False,False,False,False,40.719348,-74.00923,4699.225,3500.225
2594,7241233839,7000.0,True,True,apartment,laundry in bldg,7.0,2.0,attached garage,False,False,False,False,40.718864,-73.946798,3694.194167,3305.805833
1448,7242612783,1295.0,False,False,loft,laundry in bldg,0.0,4.0,street parking,True,False,False,False,40.679,-73.9644,4194.1,2899.1
2502,7241353528,10000.0,False,False,loft,laundry in bldg,2.0,3.0,attached garage,False,True,False,False,40.743934,-73.926171,7471.13,2528.87
884,7243270829,5000.0,False,False,apartment,laundry in bldg,4.0,1.0,street parking,True,False,False,False,40.675146,-73.981326,2505.0,2495.0
2812,7241056352,11750.0,True,True,apartment,w/d in unit,2.0,2.0,street parking,True,False,False,False,40.723882,-74.003466,9384.566667,2365.433333
2214,7241674966,4585.0,True,True,apartment,w/d in unit,1.0,1.0,street parking,False,False,False,False,40.7101,-74.0013,2629.255,1955.745
3325,7244489860,5600.0,False,False,apartment,w/d in unit,2.0,2.0,street parking,False,False,False,False,40.774294,-73.978812,3758.48,1841.52
3431,7244424237,1200.0,True,True,apartment,laundry in bldg,4.0,1.0,street parking,True,True,False,False,40.713177,-73.950725,2928.1,1728.1


In [5]:
model.score(X_test, y_test)

0.7144159543626989