In [114]:
import pandas as pd
import numpy as np
import geopandas as gpd
from shapely.geometry import Point
from shapely.ops import nearest_points
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection

# Predicting building demolition risk in Philadelphia, 2015-2021

## Introduction and Literature Review


## Presentation of Data
### Data collection
#### Property characteristics

In [2]:
#Get data on private demolitions post-2015 from API
demolitions = pd.read_csv("https://phl.carto.com/api/v2/sql?format=CSV&q=SELECT%20address,%20typeofwork%20FROM%20demolitions%20WHERE%20city_demo%20=%20%27NO%27%20AND%20start_date%20>=%20%272018-01-01%27")

In [3]:
demolitions.drop(index=demolitions[demolitions['typeofwork'] == 'TANKRI'].index, inplace=True) #drop tank removals

In [4]:
demolitions.shape

(1333, 2)

In [5]:
#Get selected features of all properties in Philadelphia
chunks = []
dtypes = {
    'lng': 'float64',
    'lat': 'float64',
    'location': 'object',
    'category_code_description': 'category',
    'interior_condition': 'category', #ordinal scale, therefore encoding this feature as a category
    'exterior_condition': 'category',
    'total_area': 'float64',
    'year_built': 'object',
    'parcel_number': 'object'
}

chunked_df = pd.read_csv('https://phl.carto.com/api/v2/sql?format=CSV&q=SELECT%20ST_X(the_geom)%20AS%20lng,%20ST_Y(the_geom)%20AS%20lat,%20location,%20category_code_description,%20interior_condition,%20exterior_condition,%20total_area,%20year_built,%20parcel_number%20FROM%20opa_properties_public', dtype=dtypes, chunksize=40000)

for chunk in chunked_df:
    chunk['demolition'] = chunk['location'].isin(demolitions['address']).astype(np.int8) #create binary field encoding whether or not an address is associated with a demolition permit
    chunks.append(chunk)

In [6]:
properties = pd.concat(chunks)

In [7]:
properties.shape

(581456, 10)

In [8]:
properties.sample(5)

Unnamed: 0,lng,lat,location,category_code_description,interior_condition,exterior_condition,total_area,year_built,parcel_number,demolition
382903,-75.224617,39.937923,5531 REGENT ST,Single Family,4,4,780.0,1925,514142800,0
110670,-75.223799,40.027466,4417 MANSION ST,Single Family,4,4,815.09,1925,211233610,0
271240,-75.160004,39.923753,2017 S MILDRED ST,Single Family,4,4,588.0,1920,393357100,0
24813,-75.228826,39.961687,51 N YEWDALL ST,Single Family,4,4,810.0,1925,41113800,0
502032,-74.964116,40.106312,3123 MAUREEN DR,Single Family,4,4,13252.0,1996,663062580,0


The most comprehensive property dataset is the most recent one. However, for 86.3% of properties with an associated demolition permit, the property assessment on file is for prior to their demolition, or the building has not yet been demolished. The remaining properties have been re-assessed since their demolition and are now listed as vacant land. Since the model includes building characteristics, and these features are not available for properties classed as 'Vacant Land', demolished properties that have been re-assessed since their demolition were dropped from the dataset. 

In [9]:
properties.groupby(['category_code_description', 'demolition']).size()

category_code_description  demolition
Commercial                 0              13921
                           1                138
Industrial                 0               4275
                           1                 79
Mixed Use                  0              14324
                           1                 72
Multi Family               0              42316
                           1                146
Single Family              0             460990
                           1                583
Vacant Land                0              44450
                           1                162
dtype: int64

In [10]:
properties.drop(index=properties[properties['category_code_description'] == 'Vacant Land'].index, inplace=True)

In [11]:
properties.shape

(536844, 10)

In [12]:
#Load in market value for properties in 2018
value18 = pd.read_csv('https://phl.carto.com/api/v2/sql?format=CSV&q=SELECT%20parcel_number,%20market_value%20FROM%20assessments%20WHERE%20year%20=%202018',
                     dtype={
                         'parcel_number': 'object',
                         'market_value': 'float64'
                     })

In [14]:
properties = pd.merge(properties, value18, how='left', on='parcel_number')

In [17]:
#Drop properties with text or null values in year_built field
properties.drop(index=properties[properties['year_built'].str.contains(r'[A-Za-z]+', na=True)].index, inplace=True)
properties['year_built'] = properties['year_built'].astype('int64')

In [20]:
#Drop properties constructed after 2018
properties.drop(index=properties[properties['year_built'] > 2018].index, inplace=True)

In [21]:
#Add field encoding age of properties in 2018
properties['age'] = 2018 - properties['year_built']

In [24]:
#Convert data frame to georeferenced dataframe to 
geo_properties = gpd.GeoDataFrame(properties,
                                  geometry=gpd.points_from_xy(properties.lng, properties.lat))

In [49]:
geo_properties.set_crs('epsg:4326', inplace=True) #set projection to WGS84
geo_properties.to_crs('epsg:2272', inplace=True) #reproject to NAD 1983 for southern PA

In [63]:
#Add field encoding distance from each property to City Hall

#1. Get Easting and Northing coordinates for City Hall
city_hall = gpd.GeoSeries(Point(-75.1635112, 39.952335), crs=4326)
city_hall.to_crs('epsg:2272') #reproject to NAD 1983

0    POINT (2693536.305 236112.283)
dtype: geometry

In [69]:
#Turn City Hall into point object
city_hall = Point(2693536.305, 236112.283)

#Add field with distance to City Hall
geo_properties['dist_city_hall'] = geo_properties.distance(city_hall)

#Convert from feet to miles
geo_properties['dist_city_hall'] = geo_properties['dist_city_hall'] * 0.000189394

In [111]:
#Import public transportation shapefiles

##Trolley stops (within Philadelphia county)
trolley = gpd.read_file('https://services2.arcgis.com/9U43PSoL47wawX5S/arcgis/rest/services/Trolley_Stops1/FeatureServer/0/query?where=1%3D1&outFields=Route&outSR=4326&f=json')

##MFL stops
MFL = gpd.read_file('https://services2.arcgis.com/9U43PSoL47wawX5S/arcgis/rest/services/Market_Frankford_Line_Stations/FeatureServer/0/query?where=1%3D1&outFields=Route&outSR=4326&f=json')

##BSL stops
BSL = gpd.read_file('https://services2.arcgis.com/9U43PSoL47wawX5S/arcgis/rest/services/Broad_Street_Line_Stations/FeatureServer/0/query?where=1%3D1&outFields=Route&outSR=4326&f=json')

##Regional Rail stops (within Philadelphia County)
RR = gpd.read_file('https://services2.arcgis.com/9U43PSoL47wawX5S/arcgis/rest/services/Regional_Rail_Stations/FeatureServer/0/query?where=1%3D1&outFields=Line_Name,County&outSR=4326&f=json')

In [112]:
#Drop Regional Rail stations that aren't in Philadelphia
RR.drop(index=RR[RR['County'] != 'Philadelphia'].index, inplace=True)
RR.drop(columns='County', inplace=True)
RR.rename(columns={'Line_Name': 'Route'}, inplace=True)

In [115]:
##Concatenate transportation stops into one gdf of all transportation stops
transport = pd.concat([trolley, MFL, BSL, RR])
transport.to_crs('epsg:2272', inplace=True)

In [125]:
#Find distance of each property to closest transport stop
transport_mp = transport.unary_union #create multi-point object from transportation stops

def dist_to_transport(row):
    ng = nearest_points(row['geometry'], transport_mp) #returns tuple of points, with second point as nearest point in unary union
    dist_transport = row['geometry'].distance(ng[1]) * 0.000189394
    return dist_transport

properties['dist_transport'] = properties.apply(dist_to_transport, axis=1)

ValueError: The second input geometry is empty

In [None]:
#Get Census data from API (ACS 5-year estimates from 2010 to 2015)

In [None]:
#Join Census tract data to each property

### Data cleaning

In [None]:
#Check for columns where most values are null, then drop

In [None]:
#Check rows where most values are null, then drop

In [None]:
#Convert categorical features into dummies using sklearn DictVectorizer

### Summary statistics

In [None]:
#Correlation matrix

In [2]:
#Histograms of continuous variables

## Methodology

In [4]:
#Write function to tune hyperparameters for each classifier with GridSearchCV

### Random Forest classifier

In [None]:
#Tune hyperparameters using GridSearchCV

### k-Nearest Neighbours classifier

In [None]:
#Tune hyperparameters using GridSearchCV

## Results and Discussion

In [3]:
#Metrics for each classifier (table)

In [None]:
#Confusion matrix for each classifier (figure)

In [None]:
#Map of false positives for each classifier

In [None]:
#Map of false negatives for each classifier

## Conclusion

## References