In [109]:
import pandas as pd

import geopandas as gpd

from shapely.geometry import Point, Polygon

In [110]:
data = pd.read_csv('DOB_Violations.csv')
print(f"Length with Nans: {len(data)}")



Length with Nans: 2035211


# Removing missing or malformed Building Ids

In [111]:
data = data.dropna(subset=['BORO'])
print(f"Length without BORO Nans: {len(data)}")

data = data.dropna(subset=['BLOCK'])
print(f"Length without BORO Nans: {len(data)}")

data = data.dropna(subset=['LOT'])
print(f"Length without BORO Nans: {len(data)}")

Length without BORO Nans: 2035209
Length without BORO Nans: 2022372
Length without BORO Nans: 2022333


In [112]:
print(f"Canaility before cleaning: {len(data)}")
data.BORO = data.BORO.astype('str')
data = data[data.BORO.apply(lambda row: row.isnumeric())]
print(f"Canaility after cleaning: {len(data)}")

print(f"Canaility before cleaning: {len(data)}")
data.BLOCK = data.BLOCK.astype('str')
data = data[data.BLOCK.apply(lambda row: row.isnumeric())]
print(f"Canaility after cleaning: {len(data)}")

print(f"Canaility before cleaning: {len(data)}")
data.LOT = data.LOT.astype('str')
data = data[data.LOT.apply(lambda row: row.isnumeric())]
print(f"Canaility after cleaning: {len(data)}")

Canaility before cleaning: 2022333
Canaility after cleaning: 1988139
Canaility before cleaning: 1988139
Canaility after cleaning: 1500904
Canaility before cleaning: 1500904
Canaility after cleaning: 1499942


# Number of DOB violations

In [113]:
data.groupby('VIOLATION_TYPE_CODE').count().sort_values('BORO')

Unnamed: 0_level_0,ISN_DOB_BIS_VIOL,BORO,BIN,BLOCK,LOT,ISSUE_DATE,VIOLATION_NUMBER,HOUSE_NUMBER,STREET,DISPOSITION_DATE,DISPOSITION_COMMENTS,DEVICE_NUMBER,DESCRIPTION,ECB_NUMBER,NUMBER,VIOLATION_CATEGORY,VIOLATION_TYPE
VIOLATION_TYPE_CODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
7S,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,1,0
V*,1,1,1,1,1,1,1,1,1,1,0,0,0,0,1,1,0
UB*,1,1,1,1,1,1,1,1,1,1,0,0,0,0,1,1,0
LL41,1,1,1,1,1,1,1,1,1,0,1,0,0,0,1,1,0
COMPBLD,1,1,1,1,1,1,1,1,1,0,0,0,0,0,1,1,1
EB,4,4,4,4,4,4,4,4,4,4,3,0,0,0,4,4,0
IMD,5,5,5,5,5,5,5,5,5,3,0,0,1,0,5,5,5
A,83,83,83,83,83,83,83,83,83,38,32,0,47,0,83,83,83
HVCAT5,123,123,123,123,123,123,123,123,123,40,40,123,123,0,123,123,123
CLOS,285,285,285,285,285,285,285,285,285,180,25,0,21,1,285,285,285


# Format BBLs for violations dataset

In [114]:
def format_bbl(parcel):
    borough = str(parcel['BORO'])
    block = str(parcel['BLOCK']).zfill(5)
    lot = str(parcel['LOT'])[-4:]
    
    return int(borough + block + lot)

data['BBL'] = data.apply(format_bbl, axis=1)

In [149]:
data = data.set_index('BBL')
data.head()

Unnamed: 0_level_0,ISN_DOB_BIS_VIOL,BORO,BIN,BLOCK,LOT,ISSUE_DATE,VIOLATION_TYPE_CODE,VIOLATION_NUMBER,HOUSE_NUMBER,STREET,DISPOSITION_DATE,DISPOSITION_COMMENTS,DEVICE_NUMBER,DESCRIPTION,ECB_NUMBER,NUMBER,VIOLATION_CATEGORY,VIOLATION_TYPE
BBL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1007900009,493111,1,1014517.0,790,9,19970916,LL6291,171810,147R,WEST 14 STREET,,,00913773,,,V091697LL6291171810,V-DOB VIOLATION - ACTIVE,LL6291-LOCAL LAW 62/91 - BOILERS
2034980017,1503278,2,2123698.0,3498,17,20100521,C,0904AS,426,LELAND AVENUE,20120719.0,RESC LETTER DATED 8/6/10 ISSD TO REINST.PRMT #...,,,,V*052110C0904AS,V*-DOB VIOLATION - DISMISSED,C-CONSTRUCTION
1003000001,199556,1,1079648.0,300,1,19850816,LL10/81,SS0881,85,CANAL STREET,20090108.0,LL1081 WAIVER,,,,V*081685LL10/81SS0881,V*-DOB VIOLATION - DISMISSED,LL10/81-LOCAL LAW 10/81 - ELEVATOR
3086890097,83534,3,3245102.0,8689,97,19900612,E,1456D/04,501,BRIGHTWATER CT,19900911.0,253-81,3P1833,,,V*061290E1456D/04,V*-DOB VIOLATION - Resolved,E-ELEVATOR
2038230056,410729,2,2026956.0,3823,56,19950804,IMEGNCY,414/95,2253,NEWBOLD AVENUE,,,,,,V080495IMEGNCY414/95,V-DOB VIOLATION - ACTIVE,IMEGNCY-IMMEDIATE EMERGENCY


In [150]:
# Filter out elevator violations
elevator_data = data[data['VIOLATION_TYPE_CODE'] == 'E']

In [151]:
print("Elevators are {}% of violations".format(len(elevator_data) / len(data) * 100))

Elevators are 31.681158338122405% of violations


In [152]:
elevator_data.head()

Unnamed: 0_level_0,ISN_DOB_BIS_VIOL,BORO,BIN,BLOCK,LOT,ISSUE_DATE,VIOLATION_TYPE_CODE,VIOLATION_NUMBER,HOUSE_NUMBER,STREET,DISPOSITION_DATE,DISPOSITION_COMMENTS,DEVICE_NUMBER,DESCRIPTION,ECB_NUMBER,NUMBER,VIOLATION_CATEGORY,VIOLATION_TYPE
BBL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
3086890097,83534,3,3245102.0,8689,97,19900612,E,1456D/04,501,BRIGHTWATER CT,19900911.0,253-81,3P1833,,,V*061290E1456D/04,V*-DOB VIOLATION - Resolved,E-ELEVATOR
1004740022,1392427,1,1007044.0,474,22,20090414,E,9444/292997,100,GRAND STREET,20100511.0,PPN203 AOC SUB ON 042810 BY METROPOLITAN ELEV ...,001P2176,,,V*041409E9444/292997,V*-DOB VIOLATION - Resolved,E-ELEVATOR
4014430019,1958386,4,4437128.0,1443,19,20150513,E,9027/542548,34-25,82 STREET,20171030.0,JMA CAT 5 PERFORMED 8-23-17,04P10310,,,V*051315E9027/542548,V*-DOB VIOLATION - Resolved,E-ELEVATOR
1010350001,1079930,1,1024982.0,1035,1,20060413,E,9011/158224,622,9 AVENUE,20060914.0,PPN203 2YR TEST FILED ON 081606 BY TRANSEL ...,01P17122,,,V*041306E9011/158224,V*-DOB VIOLATION - Resolved,E-ELEVATOR
2031340014,1201300,2,2013274.0,3134,14,20070501,E,9444H199363,984,BRONX PARK SOUTH,20070621.0,PPN203 AOC SUBMITTED ON 6/18/07 BY SERVI-TEK ...,002P5192,,,V*050107E9444H199363,V*-DOB VIOLATION - Resolved,E-ELEVATOR


# Question: Do buildings have multiple violations?

In [153]:
print("{}% of building have multiple violations. ".format(len(data.groupby('BBL').count()) / len(data) * 100))

12.996302523697583% of building have multiple violations. 


# Question: Where are the elevator violations occuring?

In [119]:
# Load up PLUTO for location info!
pluto = gpd.read_file('pluto.geojson')

In [148]:
pluto = pluto.rename(columns={'bbl':'BBL'})
pluto = pluto.set_index('BBL')
pluto.head()

Unnamed: 0_level_0,geometry
BBL,Unnamed: 1_level_1
2038090031,"(POLYGON ((-73.851883 40.829962, -73.851977000..."
2038090032,"(POLYGON ((-73.851794 40.829974, -73.851883 40..."
2038090033,"(POLYGON ((-73.85168899999999 40.829988, -73.8..."
2038090035,"(POLYGON ((-73.851603 40.83, -73.851618 40.829..."
2038090037,"(POLYGON ((-73.851563 40.830271, -73.851504000..."


In [156]:
# Join PLUTO and elevator data to get location of elevators
elevator_data_geo = elevator_data.join(pluto, how='inner', lsuffix='_left', rsuffix='_right')

In [159]:
print(len(elevator_data_geo))
print(len(elevator_data))
# I guess this means some elevator bbls were not in pluto? We should validate that we formated the bbls correctly... using address?

445613
475199


In [None]:
import folium
import geopandas

m = folium.Map(location=[ 40.7128, -74.0060],zoom_start=11,tiles="Stamen Toner")
m

# Question: When are the elevator violations occuring?