# Process Permits and Violations

In [1]:
# Data processing
import numpy as np
import pandas as pd
from datetime import datetime

# Set every locale category (LC_ALL) to US English / UTF-8 for this process.
import locale
locale.setlocale(locale.LC_ALL, 'en_US.UTF8')

# Common DGLIM utilities
# The parent of sys.path[0] is inserted into the module search path (at
# position 1) before `import dglim` runs.
# NOTE(review): presumably the dglim module lives in that parent directory - confirm.
import os, sys
sys.path.insert(1, os.path.join(sys.path[0], '..'))
import dglim
# NOTE(review): setProjectPath is defined in dglim; it looks like this points
# the helpers at the project root two levels up - confirm what the path is
# resolved against.
dglim.setProjectPath('../../')

## Count Number of Permits for each Parcel

**Note:** Matching business names between the permits dataset and the active businesses dataset was extremely unreliable, even when allowing imperfect (fuzzy) matches, so we match records by parcel instead.

***Load permits data***

In [2]:
# Load the raw building-permit records.
# NOTE: dropna() is called without a subset, so a row is discarded when ANY
# of its columns is missing, not only the parcel or issue date.
raw_permits_df = pd.read_csv(dglim.datasets_path + 'Building_Permits.csv')
raw_permits_df = raw_permits_df.dropna()
# Expose the parcel identifier under the 'Parcel' column name that the other
# per-parcel tables in this notebook are grouped and merged on.
raw_permits_df = raw_permits_df.rename(columns={'Parcel Number': 'Parcel'})

# Normalise parcel ids: strip dashes, then left-pad with zeros to 11 characters.
raw_permits_df['Parcel'] = raw_permits_df['Parcel'].apply(
    lambda parcel_id: str(parcel_id).replace('-', '').rjust(11, '0')
)

# Parse the issue date text (month/day/four-digit year) into datetime values.
raw_permits_df['Issue'] = raw_permits_df['Issue'].apply(
    lambda issued_on: datetime.strptime(issued_on, '%m/%d/%Y')
)

***Count number of permits for each parcel***

In [3]:
# Keep only permits issued strictly after the cutoff date
# (a permit dated exactly 2016-01-01 00:00 is excluded by the `>` comparison).
recent_date = datetime(2016, 1, 1)
temp_df = raw_permits_df.loc[raw_permits_df['Issue'].gt(recent_date)]

# One row per parcel, holding how many recent permits it has
parcel_permit_counts_df = (
    temp_df
    .groupby('Parcel')
    .size()
    .reset_index(name='Number of Permits')
)

# Preview the five parcels with the most permits
parcel_permit_counts_df.sort_values(by='Number of Permits', ascending=False).head(5)

Unnamed: 0,Parcel,Number of Permits
666,9276000000,46
433,6810001002,36
444,7176014000,34
424,6795002000,33
219,6108002000,25


## Count Number of Zoning Violations for each Parcel

***Load zoning violations data***

In [4]:
# Load the commercial zoning violations workbook.
raw_zoning_df = pd.read_excel(dglim.datasets_path + 'Commercial Zoning Violations.xls')

# Render each parcel id as a string, left-padded with zeros to 11 characters,
# the same width used for the 'Parcel' key in the other tables of this notebook.
raw_zoning_df['Parcel'] = raw_zoning_df['Parcel'].apply(
    lambda parcel_id: str(parcel_id).rjust(11, '0')
)

# Preview the first rows only instead of rendering the entire frame.
raw_zoning_df.head(10)

Unnamed: 0,Number,Violation,Case Type,Infraction,Parcel,Address
0,CE-15-02841,Sanitation - Commercial,ZONING_C - Commercial Zoning Violations,2015-11-23,16107150000,01515 SE 8TH PL
1,CE-15-02842,Sanitation - Commercial,ZONING_C - Commercial Zoning Violations,2015-11-23,16106129000,01021 SE 15TH ST
2,CE-11-01679,Permitted Use,ZONING_C - Commercial Zoning Violations,2011-06-11,16106020000,01521 SE 12TH AVE
3,CE-11-01610,,ZONING_C - Commercial Zoning Violations,2011-06-03,15975001000,00922 SE WILLISTON RD
4,CE-11-01541,,ZONING_C - Commercial Zoning Violations,2011-05-25,15954024000,01038 SE 3RD ST
5,CE-11-00818,,ZONING_C - Commercial Zoning Violations,2011-03-17,15954022000,01014 SE 3RD ST
6,CE-17-00494,,ZONING_C - Commercial Zoning Violations,2017-03-01,15954009000,01124 SE 4TH ST
7,CE-13-02316,,ZONING_C - Commercial Zoning Violations,2013-08-13,15703001000,01001 SW 17TH LN
8,CE-10-03110,,ZONING_C - Commercial Zoning Violations,2010-09-01,15703000000,00718 SW 16TH AVE APT 101
9,CE-17-01111,Dead or Hazardous Trees,ZONING_C - Commercial Zoning Violations,2017-05-02,15701055000,00205 SE 16TH AVE


***Count number of zoning violations for each parcel***

In [5]:
# Keep only infractions dated strictly after the cutoff date
# (an infraction dated exactly 2016-01-01 00:00 is excluded by the `>` comparison).
recent_date = datetime(2016, 1, 1)
temp_df = raw_zoning_df.loc[raw_zoning_df['Infraction'].gt(recent_date)]

# One row per parcel, holding how many recent zoning violations it has
parcel_zoning_counts_df = (
    temp_df
    .groupby('Parcel')
    .size()
    .reset_index(name='Number of Zoning Violations')
)

# Preview the five parcels with the most zoning violations
parcel_zoning_counts_df.sort_values(by='Number of Zoning Violations', ascending=False).head(5)

Unnamed: 0,Parcel,Number of Zoning Violations
48,8251000000,4
59,9876001000,3
26,7966007000,3
47,8244001014,2
99,14975000000,2


## Count Number of Building Code Violations for each Parcel

In [6]:
# Load the commercial building code violations workbook.
raw_code_df = pd.read_excel(dglim.datasets_path + 'Commercial Building Code Violations.xls')

# Render each parcel id as a string, left-padded with zeros to 11 characters,
# the same width used for the 'Parcel' key in the other tables of this notebook.
raw_code_df['Parcel'] = raw_code_df['Parcel'].apply(
    lambda parcel_id: str(parcel_id).rjust(11, '0')
)

# Preview the first rows only instead of rendering the entire frame.
raw_code_df.head(10)

Unnamed: 0,Number,Violation,Case Type,Infraction,Parcel,Address
0,CE-16-00109,,COM_BLD - Commercial Building Code,2016-01-12,05977357000,08620 NW 13TH ST
1,CE-17-00662,,COM_BLD - Commercial Building Code,2017-03-27,06013005009,02623 NW 74TH PL
2,CE-16-00577,,COM_BLD - Commercial Building Code,2016-03-03,06013006000,07615 NW 13TH BLVD
3,CE-14-01675,Fence Ordinance - Maintenance,COM_BLD - Commercial Building Code,2014-06-26,06014001090,05620 NW 23RD TER
4,CE-11-03125,,COM_BLD - Commercial Building Code,2011-10-26,06014001091,05620 NW 23RD TER
5,CE-11-02403,Odor,COM_BLD - Commercial Building Code,2011-08-24,06014030000,06250 NW 23RD ST
6,CE-11-02403,"Fire Hazard, Unsanitary Conditions, Or Dangero...",COM_BLD - Commercial Building Code,2011-08-24,06014030000,06250 NW 23RD ST
7,CE-11-02403,Hazardous Condition - Commercial,COM_BLD - Commercial Building Code,2011-08-24,06014030000,06250 NW 23RD ST
8,CE-11-02403,Sanitation - Commercial,COM_BLD - Commercial Building Code,2011-08-24,06014030000,06250 NW 23RD ST
9,CE-11-02403,Weed/Plant Growth in Excess of 10 Inches - Com...,COM_BLD - Commercial Building Code,2011-08-24,06014030000,06250 NW 23RD ST


***Count number of building code violations for each parcel***

In [7]:
# Keep only infractions dated strictly after the cutoff date
# (an infraction dated exactly 2016-01-01 00:00 is excluded by the `>` comparison).
recent_date = datetime(2016, 1, 1)
temp_df = raw_code_df.loc[raw_code_df['Infraction'].gt(recent_date)]

# One row per parcel, holding how many recent building code violations it has.
# Every row of the violations table counts once, so a single case number that
# lists several violations contributes several to its parcel's total.
parcel_code_counts_df = (
    temp_df
    .groupby('Parcel')
    .size()
    .reset_index(name='Number of Building Code Violations')
)

# Preview the five parcels with the most building code violations
parcel_code_counts_df.sort_values(by='Number of Building Code Violations', ascending=False).head(5)

Unnamed: 0,Parcel,Number of Building Code Violations
49,11626000000,4
48,11624000000,4
12,6780001000,2
43,10859010003,2
52,11698000000,2


## Aggregate and Save Data

In [16]:
# Load the active business parcels, discard the stored index and remove exact
# duplicate rows. Note that de-duplication happens BEFORE the parcel ids are
# padded below.
business_parcels_df = (
    dglim.loadData('Active Business Parcels')
    .reset_index(drop=True)
    .drop_duplicates()
)

# Render each parcel id as a string, left-padded with zeros to 11 characters,
# so it lines up with the 'Parcel' key of the permit and violation counts.
business_parcels_df['Parcel'] = business_parcels_df['Parcel'].apply(
    lambda parcel_id: str(parcel_id).rjust(11, '0')
)

In [19]:
# Collect counts in a single dataset.
# Outer merges keep every parcel that appears in ANY input: the active
# business parcels as well as parcels that only occur in one of the
# permit / violation count tables.
data_df = business_parcels_df.copy()
for counts_df in (parcel_permit_counts_df,
                  parcel_zoning_counts_df,
                  parcel_code_counts_df):
    data_df = data_df.merge(counts_df, on='Parcel', how='outer')

# Remove fake parcel values
bad_parcel_values = [
    '00000000000'
]
is_real_parcel = ~data_df['Parcel'].isin(bad_parcel_values)
# A parcel absent from a counts table gets NaN from the outer merge;
# fillna(0) turns those into zero counts (it fills every column of the frame).
data_df = data_df[is_real_parcel].sort_values('Parcel').fillna(0)

# Save dataset
dglim.saveData(data_df, 'Parcel Permit and Violation Counts')

# Print some numbers.
# print() is called with one pre-formatted string so the cell runs, and emits
# the same text, on both Python 2 and Python 3.
has_permit = data_df['Number of Permits'] > 0
has_zoning_violation = data_df['Number of Zoning Violations'] > 0
has_code_violation = data_df['Number of Building Code Violations'] > 0

summary_rows = [
    ('Number of parcels with at least one permit',
     has_permit),
    ('Number of parcels with at least one zoning violation',
     has_zoning_violation),
    ('Number of parcels with at least one building code violation',
     has_code_violation),
    ('Number of parcels with at least one of each count',
     has_permit & has_zoning_violation & has_code_violation),
]
for label, mask in summary_rows:
    print('{}: {}'.format(label, mask.sum()))

# Preview the first rows of the combined dataset
data_df.head(5)

Number of parcels with at least one permit: 889
Number of parcels with at least one zoning violation: 108
Number of parcels with at least one building code violation: 89
Number of parcels with at least one of each count: 1


Unnamed: 0,Parcel,Number of Permits,Number of Zoning Violations,Number of Building Code Violations
618,5975005000,0.0,0.0,0.0
2339,5976000000,0.0,0.0,0.0
2483,5977205000,1.0,0.0,0.0
2484,5977236000,1.0,0.0,0.0
2485,5977239000,1.0,0.0,0.0
