In [1]:
from esda.moran import Moran
import geopandas as gpd
import numpy as np
import pandas as pd
from libpysal.weights import DistanceBand, KNN
from matplotlib import colors
import matplotlib.pyplot as plt
from splot.esda import plot_moran
import spreg

from statshelper import q_q_plot

In [2]:
# Load the processed sales table (named "Cleaned_Chicago_Sales" on disk)
# and preview its first rows.
sales_csv = "./data/processed/Cleaned_Chicago_Sales.csv"
df = pd.read_csv(sales_csv)
df.head()

Unnamed: 0.1,Unnamed: 0,PIN,Property Class,Neighborhood Code,Land Square Feet,Town Code,Type of Residence,Apartments,Wall Material,Roof Material,...,Bool-Type of Residence-1,Bool-Type of Residence-2,Bool-Type of Residence-4,Bool-Type of Residence-8,Bool-Type of Residence-7,Bool-Type of Residence-6,Bool-Wall Material-2,Bool-Wall Material-3,Bool-Wall Material-1,Bool-Wall Material-4
0,2,16094150130000,211,13,-0.29758,77,3.0,6.0,2.0,2.0,...,0,0,0,0,0,0,1,0,0,0
1,5,4252000820000,204,100,2.86818,25,5.0,0.0,2.0,4.0,...,0,0,0,0,0,0,1,0,0,0
2,8,14322110150000,208,12,-0.59601,74,3.0,0.0,2.0,6.0,...,0,0,0,0,0,0,1,0,0,0
3,9,27021200080000,204,34,1.698873,28,1.0,0.0,3.0,1.0,...,1,0,0,0,0,0,0,1,0,0
4,11,13121080620000,204,42,0.515768,71,1.0,0.0,2.0,1.0,...,1,0,0,0,0,0,1,0,0,0


In [3]:
# Build one point geometry per row from the coordinate columns, then wrap the
# table in a GeoDataFrame so the spatial tools below can use it.
# NOTE(review): no CRS is assigned here — confirm whether one should be set.
point_geometry = gpd.points_from_xy(df["Longitude"], df["Latitude"])
gdf = gpd.GeoDataFrame(df, geometry=point_geometry)

In [7]:
# Explanatory variables: these columns form the design matrix X.
i_vars = [
    "Age",
    "Bedrooms",
    "Building Square Feet",
    "Estimate (Building)",
    "Estimate (Land)",
    "Fireplaces",
    "Full Baths",
    "Garage 1 Area",
    "Garage 2 Area",
    "Half Baths",
    "Land Square Feet",
    "Rooms",
    "Sale Quarter",
]
# Response variable, held in a one-element list so it can be spliced into the
# column list below.
dep_var = ["Sale Price"]
# Columns carried along for identification/mapping; they are not part of X.
other_to_keep = ["PIN", "Neighborhood Code", "Town Code", "geometry"]
# Despite its name, this is the list of *columns* to retain.
my_rows = [*i_vars, *dep_var, *other_to_keep]

# Narrow the frame to the retained columns, then extract the model arrays.
gdf = gdf[my_rows]
X = gdf[i_vars].to_numpy()           # 2-D: one column per entry of i_vars
y = gdf[dep_var].to_numpy().ravel()  # flattened to 1-D

In [8]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns
# retained in gdf; a quick sanity check of the inputs to the model.
gdf.describe()

Unnamed: 0,Age,Bedrooms,Building Square Feet,Estimate (Building),Estimate (Land),Fireplaces,Full Baths,Garage 1 Area,Garage 2 Area,Half Baths,Land Square Feet,Rooms,Sale Quarter,Sale Price,PIN,Neighborhood Code,Town Code
count,324261.0,324261.0,324261.0,324261.0,324261.0,324261.0,324261.0,324261.0,324261.0,324261.0,324261.0,324261.0,324261.0,324261.0,324261.0,324261.0,324261.0
mean,-7.120576e-10,-1.092022e-08,7.167054e-10,1.43786e-10,5.046344e-10,1.617216e-09,-3.436738e-09,1.68352e-09,1.066499e-09,-2.931867e-08,-3.191524e-10,-1.353879e-08,78.852471,-5.396458e-10,16516280000000.0,108.992253,44.675265
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.9999999,1.0,1.0,1.0,1.0,7.98735,1.0,8069302000000.0,100.150467,23.754169
min,-2.077472,-3.513837,-3.056954,-10.16233,-9.972469,-0.5779852,-0.8912787,-2.100215,-0.04176153,-0.8080311,-5.640353,-3.44403,65.0,-3.193263,1011000000000.0,10.0,10.0
25%,-0.6126325,-0.292681,-0.7569215,-0.6368033,-0.6296315,-0.5779852,-0.8912787,-0.7751074,-0.04176153,-0.8080311,-0.5930628,-0.7957876,72.0,-0.4957421,10362080000000.0,34.0,24.0
50%,-0.166812,-0.292681,-0.1910355,-0.01657273,-0.09333429,-0.5779852,-0.8912787,0.55,-0.04176153,-0.8080311,-0.09124129,-0.268846,79.0,0.104553,16083160000000.0,80.0,37.0
75%,0.7885178,0.5508094,0.6588384,0.5936153,0.5069814,1.270738,0.6356368,0.55,-0.04176153,0.9666556,0.6044575,0.5626062,86.0,0.6221981,23114010000000.0,150.0,71.0
max,3.654507,9.679707,5.458316,5.511245,7.154818,3.119462,7.342334,0.55,25.98733,15.16415,9.886416,10.40469,92.0,4.165429,33323020000000.0,600.0,77.0


In [9]:
# Spatial weights: k-nearest-neighbour weights built from the geometries in gdf.
# Author's note: k had to be this large for the resulting graph to be connected.
K_NEIGHBORS = 128
w = KNN.from_dataframe(gdf, k=K_NEIGHBORS)
# Alternative that was tried: distance-band weights with a fixed threshold.
# w = DistanceBand.from_dataframe(gdf, threshold=.0075)

In [10]:
# Switch the weights to libpysal's 'R' (row-standardised) transformation
# before they are passed to the model and to Moran's I below.
w.transform = 'R'

In [None]:
# Fit the spatial error model (spreg.ML_Error) of Sale Price on the
# explanatory variables, using the weights w and the "LU" method option.
sem = spreg.ML_Error(
    y,
    X,
    w=w,
    name_x=i_vars,
    name_y="Sale Price",
    method="LU",
)

  warn("Method 'bounded' does not support relative tolerance in x; "


In [None]:
# sem.summary is a pre-formatted text report; print() keeps its line breaks
# (a bare expression would show the escaped repr instead).
print(sem.summary)

In [None]:
# Moran's I on the SEM's filtered residuals, to check for remaining spatial
# autocorrelation after fitting the model.
#
# The inference is permutation-based (199 random permutations), so NumPy's
# global RNG is seeded first to make the pseudo p-value and the reference
# distribution in the plot reproducible under Restart & Run All.
# NOTE(review): assumes esda draws its permutations from NumPy's global RNG —
# confirm against the installed esda version.
np.random.seed(12345)
sem_moran = Moran(sem.e_filtered, w, permutations=199)
plot_moran(sem_moran, zstandard=True, figsize=(10, 4))
# Render explicitly so the cell does not echo plot_moran's return value.
plt.show()

In [None]:
# Map the point geometries coloured by the SEM's filtered residuals,
# using 7 equal-width colour bins.
data = sem.e_filtered.reshape(-1)  # flatten to 1-D for use as the colour column
# Vectorised min/max; the Python built-ins iterate element by element.
_min, _max = data.min(), data.max()
print(_min, _max)
_width = _max - _min  # full residual range (name kept from the original cell)

# 8 evenly spaced edges -> 7 bins spanning exactly [_min, _max].
# The earlier np.arange(_min, int(_max), _width / 7) truncated the upper bound
# with int() and excluded the stop value, so the largest residuals fell outside
# every bin boundary.
n_bins = 7
bin_edges = np.linspace(_min, _max, n_bins + 1)
norm = colors.BoundaryNorm(bin_edges, ncolors=256)

ax = gdf.plot(
    column=data,
    legend=True,
    figsize=(15, 8),
    norm=norm,
    linewidth=0.0,
)
ax.set_title("Map of residuals of the SEM model", fontsize=15)
plt.show()