[Cook County Assessor's Office Code](https://gitlab.com/ccao-data-science---modeling)

Data Sources:

* [Cook County GIS Open Data](https://hub-cookcountyil.opendata.arcgis.com/)

In [1]:
# import packages
import csv, datetime, glob, joblib, math, pickle, pydot, time, os, sklearn

from dask.distributed import Client, progress
from datetime import datetime
from IPython.display import display
import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sn
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.tree import export_graphviz
from tqdm import tqdm

In [2]:
# Let the user pick which processed pickle to analyse.
# Options are sorted so the dropdown order is stable across runs/filesystems.
DEFAULT_DATASET = "./data/processed/Cleaned_Chicago_Sales.pkl"
dataset_options = sorted(glob.glob("./data/processed/*.pkl"))

# Only pre-select the default when it actually exists among the options;
# otherwise fall back to the first file (or no selection at all) instead of
# letting the widget raise on an invalid `value`.
if DEFAULT_DATASET in dataset_options:
    default_choice = DEFAULT_DATASET
elif dataset_options:
    default_choice = dataset_options[0]
else:
    default_choice = None

dataset_dropdown = widgets.Dropdown(
    options=dataset_options,
    value=default_choice,
    description="Data: ",
)
display(dataset_dropdown)

Dropdown(description='Data: ', index=1, options=('./data/processed/Cleaned_Chicago_Sales_PC208.pkl', './data/p…

In [3]:
# Source of the underlying data:
# https://datacatalog.cookcountyil.gov/Property-Taxation/Cook-County-Assessor-s-Residential-Sales-Data/5pge-nu6u
#
# Read the pickle currently selected in the dropdown and report its size.
# NOTE(review): pd.read_pickle can execute arbitrary code; only load trusted files.
dataset = dataset_dropdown.value
print("Loading data from {}".format(dataset))
ml_df = pd.read_pickle(dataset)
n_rows, n_cols = ml_df.shape
print("Data frame has {} rows and {} columns".format(n_rows, n_cols))
ml_df.head()

Loading data from ./data/processed/Cleaned_Chicago_Sales.pkl
Data frame has 326484 rows and 133 columns


Unnamed: 0,PIN,Property Class,Neighborhood Code,Land Square Feet,Town Code,Type of Residence,Apartments,Wall Material,Roof Material,Rooms,...,Bool-Type of Residence-1,Bool-Type of Residence-2,Bool-Type of Residence-4,Bool-Type of Residence-8,Bool-Type of Residence-7,Bool-Type of Residence-6,Bool-Wall Material-2,Bool-Wall Material-3,Bool-Wall Material-1,Bool-Wall Material-4
2,16094150130000,211,13,-0.195903,77,3.0,6.0,2.0,2.0,5.090833,...,0,0,0,0,0,0,1,0,0,0
5,4252000820000,204,100,2.10298,25,5.0,0.0,2.0,4.0,1.182544,...,0,0,0,0,0,0,1,0,0,0
8,14322110150000,208,12,-0.256898,74,3.0,0.0,2.0,6.0,0.581268,...,0,0,0,0,0,0,1,0,0,0
9,27021200080000,204,34,0.709559,28,1.0,0.0,3.0,1.0,-0.020007,...,1,0,0,0,0,0,0,1,0,0
11,13121080620000,204,42,0.043385,71,1.0,0.0,2.0,1.0,-0.320645,...,1,0,0,0,0,0,1,0,0,0


In [4]:
# Reload from disk so this cell still works when re-run after the filtering
# cell has dropped "Sale Date" from ml_df.
ml_df = pd.read_pickle(dataset)

# Columns removed before modelling.
# NOTE(review): presumably identifiers / assessor estimates that should not be
# used as predictors — confirm with the data dictionary.
drop_these = ["PIN", "Sale Date",
              "Estimate (Land)", "Estimate (Building)",
              "Deed No.", "Pure Market Filter"]

# Number of trees for the forest: powers of two from 2 to 512.
tree_slider = widgets.SelectionSlider(
    options=[2**i for i in range(1, 10)],
    value=64,
    description="N Trees",
)

# Vectorised min/max instead of the Python builtins: much faster on a large
# Series, and missing dates (NaT) are skipped rather than making the result
# depend on row order.
sale_dates = ml_df["Sale Date"]
start_date, end_date = sale_dates.min(), sale_dates.max()

# One selectable option per calendar day across the observed sale-date range;
# the slider starts with the full range selected.
dates = pd.date_range(start_date, end_date, freq="D")
dates_slider = widgets.SelectionRangeSlider(
    options=[(date.strftime("%d %b %Y"), date) for date in dates],
    index=(0, len(dates) - 1),
    layout={'width': '500px'}
)
display(tree_slider, dates_slider)

SelectionSlider(description='N Trees', index=5, options=(2, 4, 8, 16, 32, 64, 128, 256, 512), value=64)

SelectionRangeSlider(index=(0, 2554), layout=Layout(width='500px'), options=(('02 Jan 2013', Timestamp('2013-0…

In [5]:
# Start from a fresh copy of the data so this cell can be re-run whenever the
# date slider is moved.
ml_df = pd.read_pickle(dataset)

# Translate the slider's (low, high) positions into the matching timestamps.
date_slider_i = dates_slider.index
start = dates[date_slider_i[0]]
end = dates[date_slider_i[1]]

# Keep only sales inside the selected window (both ends inclusive), then
# remove the columns listed in drop_these.
in_window = (ml_df["Sale Date"] >= start) & (ml_df["Sale Date"] <= end)
ml_df = ml_df[in_window].drop(columns=drop_these)

n_rows, n_cols = ml_df.shape
print("Data frame has {} rows and {} columns".format(n_rows, n_cols))
ml_df.head()

Data frame has 326484 rows and 127 columns


Unnamed: 0,Property Class,Neighborhood Code,Land Square Feet,Town Code,Type of Residence,Apartments,Wall Material,Roof Material,Rooms,Bedrooms,...,Bool-Type of Residence-1,Bool-Type of Residence-2,Bool-Type of Residence-4,Bool-Type of Residence-8,Bool-Type of Residence-7,Bool-Type of Residence-6,Bool-Wall Material-2,Bool-Wall Material-3,Bool-Wall Material-1,Bool-Wall Material-4
2,211,13,-0.195903,77,3.0,6.0,2.0,2.0,5.090833,5.583668,...,0,0,0,0,0,0,1,0,0,0
5,204,100,2.10298,25,5.0,0.0,2.0,4.0,1.182544,0.305487,...,0,0,0,0,0,0,1,0,0,0
8,208,12,-0.256898,74,3.0,0.0,2.0,6.0,0.581268,0.965259,...,0,0,0,0,0,0,1,0,0,0
9,204,34,0.709559,28,1.0,0.0,3.0,1.0,-0.020007,-0.354286,...,1,0,0,0,0,0,0,1,0,0
11,204,42,0.043385,71,1.0,0.0,2.0,1.0,-0.320645,0.965259,...,1,0,0,0,0,0,1,0,0,0


# Random Forest

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of the filtered frame.
ml_df.describe()

Unnamed: 0,Property Class,Neighborhood Code,Land Square Feet,Town Code,Type of Residence,Apartments,Wall Material,Roof Material,Rooms,Bedrooms,...,Bool-Type of Residence-1,Bool-Type of Residence-2,Bool-Type of Residence-4,Bool-Type of Residence-8,Bool-Type of Residence-7,Bool-Type of Residence-6,Bool-Wall Material-2,Bool-Wall Material-3,Bool-Wall Material-1,Bool-Wall Material-4
count,326484.0,326484.0,326484.0,326484.0,326484.0,326484.0,326484.0,326484.0,326484.0,326484.0,...,326484.0,326484.0,326484.0,326484.0,326484.0,326484.0,326484.0,326484.0,326484.0,326484.0
mean,220.004202,108.797384,1.635785e-10,44.663628,2.117905,0.387976,1.947195,1.167034,-3.703885e-09,2.979466e-09,...,0.387909,0.388273,0.083863,3e-06,6e-06,6e-06,0.435786,0.230321,0.316971,0.016923
std,29.478777,100.022183,1.0,23.764622,1.28143,1.042638,0.78243,0.611666,1.0,1.0,...,0.487274,0.487358,0.277183,0.00175,0.002475,0.002475,0.49586,0.421039,0.465297,0.128982
min,202.0,10.0,-0.5361447,10.0,1.0,-5.0,1.0,1.0,-1.523195,-1.673831,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,203.0,34.0,-0.2568976,24.0,1.0,0.0,1.0,1.0,-0.6212823,-0.354286,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,205.0,80.0,-0.1470285,37.0,2.0,0.0,2.0,1.0,-0.3206446,-0.354286,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,211.0,150.0,0.07779244,71.0,2.0,0.0,2.0,1.0,0.2806307,0.3054867,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
max,295.0,600.0,232.544,77.0,8.0,6.0,4.0,6.0,70.3292,57.04593,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
# Split the frame into the prediction target and the predictor matrix.
label_cols = ["Sale Price"]

# Target values as a 2-D array (one column per entry in label_cols).
labels = np.array(ml_df[label_cols])

# Everything that is not a label is a feature; remember the column names so
# array columns can be mapped back to feature names later on.
feature_frame = ml_df.drop(columns=label_cols)
feature_list = feature_frame.columns.tolist()
features = np.array(feature_frame)

In [8]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split
# reproducible. (train_test_split is imported in the notebook's import cell.)
#
# NOTE(review): np.nan_to_num replaces NaN with 0 in the features AND in the
# labels, so a row with a missing sale price is kept with a target of 0 —
# confirm this is intended rather than dropping such rows.
train_features, test_features, train_labels, test_labels = train_test_split(
    np.nan_to_num(features),
    np.nan_to_num(labels),
    test_size = 0.25,
    random_state = 42,
)
# Flatten the (n, 1) label arrays to 1-D.
train_labels, test_labels = train_labels.ravel(), test_labels.ravel()
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (244863, 126)
Training Labels Shape: (244863,)
Testing Features Shape: (81621, 126)
Testing Labels Shape: (81621,)


In [None]:
# Fit a random forest regressor on the training split and persist it.
# (RandomForestRegressor is imported in the notebook's import cell.)

# The number of trees comes from the "N Trees" slider, not a fixed constant.
ntrees = tree_slider.value
print("Running RFR with {} trees".format(ntrees))
rf = RandomForestRegressor(n_estimators = ntrees, random_state = 42)

# Train inside a local, thread-based dask cluster, with joblib routed to it.
with Client(processes=False, threads_per_worker=5, n_workers=1, memory_limit='10GB') as client:
    with joblib.parallel_backend("dask"):
        rf.fit(train_features, train_labels)

# Save the fitted model; the context manager guarantees the file is flushed
# and closed even if pickling fails.
with open("rfr.pkl", "wb") as model_file:
    pickle.dump(rf, model_file)

Running RFR with 64 trees


In [None]:
# Distribution of tree depths across the fitted forest.
tree_depths = [rf.estimators_[tree_i].get_depth() for tree_i in range(ntrees)]

fig, ax = plt.subplots(figsize=(18.5, 10.5))
ax.hist(tree_depths, density=True)
ax.set_xlabel("Depth/Height of Tree")
ax.set_ylabel("Density")
plt.show()

In [None]:
# Use the forest's predict method on the test data
predictions = rf.predict(test_features)
print(label_transform)
predictions = np.exp(predictions)
train_labels = np.exp(train_labels)
test_labels = np.exp(test_labels)
# Calculate the absolute errors
errors = predictions-test_labels
print("Mean Sale Price in Training Set: ${:7.2f}".format(np.mean(train_labels)))
print("Mean Sale Price in Test Set: ${:7.2f}".format(np.mean(test_labels)))
print('Mean Error: ${:7.2f}'.format(np.mean(errors)))
plt.boxplot(errors)
plt.show()

In [None]:
# Mean absolute error (MAE) over the test set, plus the spread of the
# individual absolute errors.
absolute_errors = np.abs(errors)
mae = np.mean(absolute_errors)
print('Mean Absolute Error: ${:7.2f}'.format(mae))
plt.boxplot(absolute_errors)
plt.show()

In [None]:
# Mean absolute percentage error (MAPE): absolute error relative to the
# actual price, averaged over the test set and expressed in percent.
relative_errors = np.abs(errors) / test_labels
mape = np.mean(100 * relative_errors)
print('MAPE: {:3.2f}%'.format(mape))

# Spread of the per-sale relative errors (as fractions, not percent).
plt.boxplot(relative_errors)
plt.show()

In [None]:
# Signed prediction error plotted against the actual sale price.
fig, ax = plt.subplots(figsize=(18.5, 10.5))
ax.scatter(test_labels, errors)
ax.set_title("Price vs. Error")
ax.set_xlabel("Price")
ax.set_ylabel("Error")
plt.show()

In [None]:
# Forest-level importances, and their spread across the individual trees.
importances = rf.feature_importances_
per_tree_importances = [estimator.feature_importances_ for estimator in rf.estimators_]
std = np.std(per_tree_importances, axis=0)

# Column positions of the 20 most important features, most important first.
indices = np.argsort(importances)[::-1][:20]

# Print the feature ranking
print("Feature ranking:")
for rank, feat_i in enumerate(indices, start=1):
    print("{:2d}. feature {:2d} = {:<30s} ({:1.4f})".format(rank, feat_i, feature_list[feat_i], importances[feat_i]))

# Plot the impurity-based feature importances of the forest; error bars show
# the standard deviation across trees.
bar_positions = range(len(indices))
top_feature_names = [feature_list[feat_i] for feat_i in indices]

fig = plt.figure()
fig.set_size_inches(18.5, 10.5)
plt.title("Feature importances")
plt.bar(bar_positions, importances[indices], color="r", yerr=std[indices], align="center")
plt.xticks(bar_positions, top_feature_names, rotation=45, ha="right")
plt.xlim([-1, len(indices)])
plt.show()

In [None]:
# Break the test-set errors down by property class.

# Column of the test matrix holding the property class, truncated to int.
index = feature_list.index('Property Class')
test_classes = test_features[:, index].astype(int)

# Distinct classes present in the test set, in ascending order.
classes = sorted(set(test_classes.tolist()))

# pcerrors[class] -> {"error": signed errors, "abs_err": absolute errors},
# each a list in test-set row order. Boolean masks replace the per-row loop.
abs_error = np.abs(errors)
pcerrors = {}
for elem in classes:
    in_class = test_classes == elem
    pcerrors[elem] = {
        "error": list(errors[in_class]),
        "abs_err": list(abs_error[in_class]),
    }

for elem in classes:
    print("Mean error for class {} is ${:7.2f} and mean absolute error is ${:7.2f}".format(elem, np.mean(pcerrors[elem]["error"]), np.mean(pcerrors[elem]["abs_err"]) ))

# Mean signed error per class; error bars are 1% of the per-class standard
# deviation so that they stay readable on the same axis.
class_names = [str(elem) for elem in classes]
mean_errors = [np.mean(pcerrors[elem]["error"]) for elem in classes]
scaled_stds = [np.std(pcerrors[elem]["error"])/100 for elem in classes]
plt.bar(class_names, mean_errors, yerr=scaled_stds)
plt.xlabel("Property Classes", fontsize=20)
# Raw string: "\s" is not a valid Python escape sequence, and the backslash
# must reach matplotlib's mathtext untouched.
plt.ylabel(r"Mean Error (error bars = 1% $\sigma$)", fontsize=20)
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.show()

In [None]:
# Mean absolute error per property class.
# NOTE(review): the error bars use the standard deviation of the SIGNED
# errors ("error"), not of the absolute errors ("abs_err") — confirm which
# one is intended for this chart.
plt.bar([str(i) for i in classes], [np.mean(pcerrors[elem]["abs_err"]) for elem in classes], yerr=[np.std(pcerrors[elem]["error"]) for elem in classes])
plt.xlabel("Property Classes", fontsize=20)
# Raw string: "\s" is not a valid Python escape sequence, and the backslash
# must reach matplotlib's mathtext untouched.
plt.ylabel(r"Mean Abs Error (error bars = $\sigma$)", fontsize=20)
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.show()

In [None]:
# One box per property class showing the distribution of signed errors.
to_plt = []
for pc in classes:
    to_plt.append(pcerrors[pc]["error"])

# Tick 0 gets an empty label so that tick k lines up with the k-th class.
tick_positions = range(len(classes) + 1)
tick_labels = [""] + classes

fig = plt.gcf()
plt.boxplot(to_plt)
plt.xticks(tick_positions, tick_labels)
fig.set_size_inches(18.5, 10.5)
plt.show()

In [None]:
# Render the first tree of the forest to tree.png, but only when it is
# shallow enough for the picture to be legible.
MAX_PLOTTABLE_DEPTH = 8

# The original check averaged the depth of estimator 0 once per tree, which
# is simply the depth of estimator 0 — the only tree that is exported.
tree = rf.estimators_[0]
if tree.get_depth() < MAX_PLOTTABLE_DEPTH:
    export_graphviz(tree, out_file = 'tree.dot', feature_names = feature_list, rounded = True, precision = 1)
    # graph_from_dot_file returns a list; exactly one graph is expected.
    (graph, ) = pydot.graph_from_dot_file('tree.dot')
    graph.write_png('tree.png')
else:
    print("Tree height is too large to realistically plot.")