In [1]:
# General python libraries
import json
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import pickle
import re

# Geo mapping libraries
from branca.colormap import linear
import geopandas as gpd
from shapely.geometry import Point, Polygon
from ipyleaflet import Map, GeoData, basemaps, LayersControl, Choropleth
from ipyleaflet import WidgetControl, GeoJSON 
from ipywidgets import Text, HTML
import geopandas

# Statistics and regression libraries
from scipy.stats import zscore
from sklearn.preprocessing import scale
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
import statsmodels.stats.api as sms
import statsmodels.api as sm

In [4]:
# Load the raw house-sales table.
df = pd.read_csv('kc_house_data.csv')

# Missing values
# Assign the filled column back rather than calling fillna(inplace=True) on an
# attribute-accessed column: the chained in-place form works on an intermediate
# Series and is not guaranteed to write through to `df`.
for col in ['view', 'yr_renovated', 'waterfront']:
    df[col] = df[col].fillna(0)

# sqft_basement contains '?' placeholders. Mask them, convert the remaining
# entries to float, and impute the masked entries with the mean of the rest.
basement_sqft = df['sqft_basement'].where(df['sqft_basement'] != '?').astype('float')
sqft_basement_mean = basement_sqft.mean()
df['sqft_basement'] = basement_sqft.fillna(sqft_basement_mean)

# Incorrect dtypes
df['date'] = pd.to_datetime(df['date'])
# `view` is included here: the previous `df.view.astype('int64')` discarded its
# result, so the column was never actually converted.
df = df.astype({
    'price': 'int64',
    'waterfront': 'int64',
    'sqft_basement': 'int64',
    'view': 'int64',
})

# Convert to binary column (1 when the source value is positive, else 0)
df['has_basement'] = (df['sqft_basement'] > 0).astype('int64')
df['renovated'] = (df['yr_renovated'] > 0).astype('int64')

# Drop unused or already transformed columns
df = df.drop(columns=['id', 'date', 'sqft_basement', 'yr_renovated'])

outlier_cols = [
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'sqft_above',
    'sqft_living15',
    'sqft_lot15'
]

# Remove rows whose z-score in any listed column exceeds +3. The filter is
# one-sided (low-side outliers are kept), and each z-score is recomputed on the
# rows that survived the previous columns.
for col in outlier_cols:
    too_high = np.asarray(zscore(df[col])) > 3
    df = df.drop(index=df.index[too_high])

# `map_df` is a second name for the same DataFrame object, not a copy.
map_df = df

In [5]:
def create_geobins(df_lat_ser, df_long_ser, n_lats, n_longs):
    """Build a rectangular grid of polygons covering a lat/long extent.

    The extent between the minimum and maximum of each series is divided into
    ``n_lats`` equal steps along latitude and ``n_longs`` equal steps along
    longitude; one rectangular ``Polygon`` is created per grid cell.

    Parameters
    ----------
    df_lat_ser : object exposing ``.min()`` / ``.max()`` (e.g. a pandas Series)
        Latitude values.
    df_long_ser : object exposing ``.min()`` / ``.max()``
        Longitude values.
    n_lats : int
        Number of bins along latitude.
    n_longs : int
        Number of bins along longitude.

    Returns
    -------
    geopandas.GeoDataFrame
        One row per cell in a single ``geometry`` column, ordered with latitude
        as the outer loop and longitude as the inner loop. No CRS is set.

    Raises
    ------
    ValueError
        If a bin count is below 1, or if either extent is zero-width or NaN.
    """
    if n_lats < 1 or n_longs < 1:
        raise ValueError("n_lats and n_longs must both be at least 1")

    min_lat, max_lat = df_lat_ser.min(), df_lat_ser.max()
    min_long, max_long = df_long_ser.min(), df_long_ser.max()
    lat_step = (max_lat - min_lat) / n_lats
    long_step = (max_long - min_long) / n_longs

    # Written as a negated conjunction so NaN steps (e.g. empty input) are
    # rejected along with zero-width extents.
    if not (lat_step > 0 and long_step > 0):
        raise ValueError("latitude and longitude ranges must be non-empty and non-degenerate")

    # Derive cell origins from an integer index range. np.arange(start, stop,
    # float_step) can emit an extra element through floating-point rounding,
    # which would add a spurious row/column of bins beyond the data extent.
    lat_origins = min_lat + lat_step * np.arange(n_lats)
    long_origins = min_long + long_step * np.arange(n_longs)

    # NOTE(review): vertices are passed as (lat, long), whereas shapely reads
    # coordinate tuples as (x, y). The original order is kept because other
    # cells may rely on it -- confirm before rendering these on a map.
    polygons = [
        Polygon([
            (lat, lon),
            (lat + lat_step, lon),
            (lat + lat_step, lon + long_step),
            (lat, lon + long_step),
            (lat, lon),
        ])
        for lat in lat_origins
        for lon in long_origins
    ]
    return gpd.GeoDataFrame(geometry=polygons)

# 10 latitude bins x 6 longitude bins over the extent of the cleaned data.
geobins = create_geobins(map_df.lat, map_df.long, n_lats=10, n_longs=6)

In [6]:
# Build the base map, then wrap the `geobins` GeoDataFrame as a map layer.
m = Map(center=(47.5391, -122.070), zoom=9)
geobins_layer = GeoData(geo_dataframe=geobins)

# Attach the grid layer and a layers control to the map.
m.add_layer(geobins_layer)
m.add_control(LayersControl())

# Bare trailing expression so the notebook displays the map widget.
m

Map(center=[47.5391, -122.07], controls=(ZoomControl(options=['position', 'zoom_in_text', 'zoom_in_title', 'zo…