In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import pickle

### Initial Cleaning
* Read in Data, get Manhattan & Bronx Info
* Only initial inspections
* Turn house number into address
* Datetime cleaning & grouping as necessary

In [2]:
# Raw rodent-inspection export; every later cell filters this frame down.
df = pd.read_csv(filepath_or_buffer="Rodent_Inspection.csv")

In [3]:
# Keep only initial inspections in Manhattan or the Bronx that have a house number.
keep_rows = (
    (df['INSPECTION_TYPE'] == 'INITIAL')
    & df['BOROUGH'].isin(['Manhattan', 'Bronx'])
    & df['HOUSE_NUMBER'].notna()
)
# .copy() so the new ADDRESS column is written to an independent frame rather
# than to a slice of the raw read (avoids SettingWithCopyWarning).
df = df[keep_rows].copy()

# ADDRESS = "<house number> <street name>", each part stringified and stripped.
# Built column-wise instead of with a row-by-row DataFrame.apply.
# A missing street name becomes the literal text 'nan', as str() renders it.
df['ADDRESS'] = (
    df['HOUSE_NUMBER'].map(str).str.strip()
    + " "
    + df['STREET_NAME'].map(str).str.strip()
)

In [4]:
# Parse the inspection timestamp in one vectorised call.
# The format expects an AM/PM marker (%p), so the hour has to be read with %I
# (12-hour clock): when the hour is parsed with %H, strptime ignores %p and a
# PM inspection is stored with its AM hour.
df['INSPECTION_DATE_TS'] = pd.to_datetime(
    df['INSPECTION_DATE'], format='%m/%d/%Y %I:%M:%S %p'
)
# Calendar parts used for grouping / filtering in later cells.
df['INSPECTION_MONTH'] = df['INSPECTION_DATE_TS'].dt.month
df['INSPECTION_YEAR'] = df['INSPECTION_DATE_TS'].dt.year

### More cleanup
* Only keep pertinent columns; only keep data from 2010-2017
* Get one record for each address to merge with tax lot data

In [5]:
col_to_keep = ['ADDRESS', 'INSPECTION_YEAR']

# Inspections dated after 2009 and before 2018.
in_study_window = (df['INSPECTION_YEAR'] > 2009) & (df['INSPECTION_YEAR'] < 2018)

# One row per distinct ADDRESS: group, count, then discard the count so only
# the address key remains for the merge with the tax-lot data.
df2 = (
    df.loc[in_study_window, col_to_keep]
    .groupby('ADDRESS', as_index=False)
    .INSPECTION_YEAR.count()
    .drop(columns='INSPECTION_YEAR')
)

### Tax Lot Data
* Read in Bronx and Manhattan data & concat
* Drop records missing X coordinates
* Keep one row per address, taking the first non-null value of each column (some duplicates exist?)

In [13]:
# Borough-level PLUTO tax-lot extracts (Bronx, Manhattan).
df3 = pd.read_csv('PLUTO17v1.1/BX2017V11.csv')
df4 = pd.read_csv('PLUTO17v1.1/MN2017V11.csv')

# Stack both boroughs, drop lots whose XCoord is NaN (NaN != NaN), and
# collapse to a single row per Address.
# NOTE(review): groupby().first() takes the first non-null value of each column
# separately, so a collapsed row may combine values from different duplicate
# lots - confirm this is intended.
df5 = (
    pd.concat([df3, df4])
    .loc[lambda lots: lots.XCoord == lots.XCoord]
    .groupby('Address', as_index=False)
    .first()
)


  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


### Merging Lot & Inspection Data to be used in Calculating_Distances.ipynb
* Don't want to calculate nearest neighbors if they aren't in the dataset

In [7]:
# Inner join: keep only tax lots whose Address matches an inspected ADDRESS.
df6 = pd.merge(
    df5,
    df2,
    how='inner',
    left_on='Address',
    right_on='ADDRESS',
).copy()

In [8]:
import pickle

# Persist the merged lot/inspection frame to df_address.pkl.
with open('df_address.pkl', mode='wb') as address_pkl:
    pickle.dump(df6, address_pkl)

### More initial lot data cleaning

In [14]:
def convert_zoning(x):
    """Collapse a raw zoning-district code into a broad category label.

    Rules are checked in order and the first match wins:

    * contains ``'/'``            -> ``'Mixed Man/Residential'``
    * starts with ``'R'``         -> ``'Residential'``
    * starts with ``'C'``         -> ``'Commericial'``
    * starts with ``'M'``         -> ``'Manufacturing'``
    * equals ``'BPC'``            -> ``'Battery Park City'``
    * starts with ``'PARK'`` or ``'ZNA'`` -> the code itself, unchanged
    * starts with ``'ZR'``        -> ``'Special'``

    Parameters
    ----------
    x : str
        Zoning code, already converted to a string by the caller.

    Returns
    -------
    str or None
        The category label, or ``None`` when no rule matches (for example
        the text ``'nan'`` or an empty string).
    """
    if '/' in x:
        return 'Mixed Man/Residential'
    # startswith() instead of x[0] so an empty string falls through to None
    # rather than raising IndexError.
    if x.startswith('R'):
        return 'Residential'
    if x.startswith('C'):
        # NOTE(review): label is misspelled, but it is written into the saved
        # data; kept byte-for-byte so existing consumers still match it.
        return 'Commericial'
    if x.startswith('M'):
        return 'Manufacturing'
    if x == 'BPC':
        return 'Battery Park City'
    if x.startswith('PARK') or x.startswith('ZNA'):
        return x
    if x.startswith('ZR'):
        return 'Special'
    # No rule matched.
    return None
# Derive the feature columns on a fresh frame, column-wise, instead of with
# row-by-row DataFrame.apply calls.
# NOTE: Landmark and HistDist are overwritten with 0/1 flags, so running this
# cell a second time without rebuilding df5 would set both to 1 everywhere.
df5 = df5.assign(
    # str() renders a missing code as 'nan', which matches no zoning rule.
    Zoning=df5['ZoneDist1'].map(lambda code: convert_zoning(str(code))),
    # 1 when the source column holds a value, 0 when it is missing. notna()
    # treats both NaN and None as missing; an `x == x` test only rejects NaN.
    Commercial_Overlay=df5['Overlay1'].notna().astype(int),
    Height_Limited=df5['LtdHeight'].notna().astype(int),
    Landmark=df5['Landmark'].notna().astype(int),
    HistDist=df5['HistDist'].notna().astype(int),
)

# Fill gaps with sentinel values through the frame itself. Calling
# `df5.<column>.fillna(..., inplace=True)` operates on an extracted Series and
# is not guaranteed to write back to df5 (it does not under copy-on-write).
df5 = df5.fillna({
    'PFIRM15_FLAG': 0,
    'Ext': 0,
    'SPDist1': 0,
    'OwnerType': 0,
    'OwnerName': 0,
    'SanitSub': 0,
    'CB2010': 0,
    'SanitBoro': 0,
    'LandUse': 999,
})

# Drop lots whose XCoord is missing.
df5 = df5[df5['XCoord'].notna()]

In [15]:
# Columns of df5 that are kept before the lot data is pickled.
# Besides raw tax-lot fields this includes the derived columns Zoning,
# Commercial_Overlay and Height_Limited, and the Landmark / HistDist columns
# that were recoded to 0/1 flags.
col_list = [
 'Address',
 'Block',
 'SanitBoro',
 'SanitDistrict',
 'SanitSub',
 'BldgClass',
 'LandUse',
 'Easements',
 'OwnerType',
 'LotArea',
 'BldgArea',
 'ComArea',
 'ResArea',
 'OfficeArea',
 'RetailArea',
 'GarageArea',
 'StrgeArea',
 'FactryArea',
 'OtherArea',
 'NumBldgs',
 'NumFloors',
 'UnitsRes',
 'UnitsTotal',
 'LotFront',
 'LotDepth',
 'BldgFront',
 'BldgDepth',
 'Ext',
 'ProxCode',
 'IrrLotCode',
 'LotType',
 'BsmtCode',
 'AssessLand',
 'AssessTot',
 'ExemptLand',
 'ExemptTot',
 'YearBuilt',
 'YearAlter1',
 'YearAlter2',
 'HistDist',
 'Landmark',
 'BuiltFAR',
 'ResidFAR',
 'CommFAR',
 'FacilFAR',
 'XCoord',
 'YCoord',
 'PFIRM15_FLAG',
 'Zoning',
 'Commercial_Overlay',
 'Height_Limited']

In [16]:
# Keep only the columns named in col_list, as an independent copy.
df5 = df5.loc[:, col_list].copy()

In [19]:
# Persist the trimmed tax-lot frame to lot_data.pkl.
with open('lot_data.pkl', mode='wb') as lot_pkl:
    pickle.dump(df5, lot_pkl)