# NYC PLUTO Data Processing

### Author: Bartosz Bonczak

In [1]:
import pandas as pd
import numpy as np

from IPython.display import clear_output

import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

# Get PLUTO data

Obtain PLUTO from NYC Open Data: https://data.cityofnewyork.us/resource/64uk-42ks.json

In [None]:
# Set up the Socrata client. The app token is optional: it is read from the
# SOCRATA_APP_TOKEN environment variable, and when that variable is unset the
# client is created with no token, i.e. generic, unauthenticated access.
import os

from sodapy import Socrata

app_token = os.environ.get("SOCRATA_APP_TOKEN")  # None -> unauthenticated
client = Socrata("data.cityofnewyork.us", app_token)
dataset_id = "64uk-42ks"

# Obtain the dataset as JSON from the API via sodapy (at most 1,000,000 rows).
# Close the HTTP session even when the request fails.
try:
    data = client.get(dataset_id, limit=1000000)
finally:
    client.close()


# Convert results to pandas DataFrame and cache them on disk so the download
# does not have to be repeated.
pluto = pd.DataFrame.from_records(data)
pluto.to_csv("../data/big_data/pluto_nycod.csv", index=False)


In [4]:

# Load the raw PLUTO extract from its CSV file on disk.
raw_pluto_csv = "../data/big_data/pluto_nycod.csv"
pluto = pd.read_csv(raw_pluto_csv)

# Process PLUTO data

In [5]:
# Lookup table from a single-character key to a building category label.
# Presumably keyed by the first character of `bldgclass` (see `main_bldg_class`)
# -- TODO confirm, the table is not applied anywhere in this notebook.
# NOTE(review): the lowercase 'n' key looks like it is meant to catch the
# leading character of 'nan' for missing classes -- confirm.
bldg_class_dict = dict(
    A='ONE FAMILY DWELLINGS',
    B='TWO FAMILY DWELLINGS',
    C='WALK UP APARTMENTS',
    D='ELEVATOR APARTMENTS',
    E='WAREHOUSES',
    F='FACTORIES AND INDUSTRIAL BUILDINGS',
    G='GARAGES',
    H='HOTELS',
    I='HOSPITALS AND HEALTH FACILITIES',
    J='THEATRES',
    K='STORE BUILDINGS',
    L='LOFTS',
    M='RELIGIOUS FACILITIES',
    N='ASYLUMS AND HOMES',
    O='OFFICE BUILDINGS',
    P='INDOOR PUBLIC ASSEMBLY & CULT. FACILITIES',
    Q='OUTDOOR RECREATIONAL FACILITIES',
    R='CONDOMINIUMS',
    S='PRIMARILY RES. - MIXED USE',
    T='TRANSPORTATION FACILITIES',
    U='UTILITY BUREAU PROPERTIES',
    V='VACANT LAND',
    W='EDUCATIONAL FACILITIES',
    Y='GOVERNMENT/CITY DEPARTMENTS',
    Z='MISC. BUILDING CLASSIFICATIONS',
    n='OTHER',
)

In [6]:
# Columns kept for processing; every other PLUTO column is dropped.
cols_pluto = [
    'bbl', 'latitude', 'longitude',
    'yearbuilt', 'yearalter1', 'yearalter2',
    'bldgclass', 'landuse', 'bldgarea', 'resarea',
    'numbldgs', 'numfloors', 'unitsres', 'unitstotal',
    'bldgfront', 'bldgdepth', 'proxcode', 'bsmtcode',
    'assessland', 'assesstot',
]

# Take an explicit copy: later cells assign columns on this frame, and doing
# that on a bare column selection triggers pandas' SettingWithCopyWarning
# (currently hidden by the global warnings filter).
pluto = pluto[cols_pluto].copy()

# Feature engineering


In [7]:

# bbl: normalise to a zero-padded, 10-character string.
# Cast through an explicit 64-bit integer: a bare `int` maps to a 32-bit
# integer on some platforms (Windows with NumPy < 2), which cannot hold a
# ten-digit value.
pluto['bbl'] = pluto['bbl'].astype('int64').astype(str).str.zfill(10)

# Year columns -> datetime; values that do not parse as "%Y" are coerced to NaT.
# NOTE(review): if a year column is loaded as float, its text form may look
# like "1920.0" and fail the "%Y" format -- confirm the loaded dtype.
for col in ['yearbuilt', 'yearalter1', 'yearalter2']:
    pluto[col] = pd.to_datetime(pluto[col], errors='coerce', format="%Y")

# convert to numeric
numeric_cols = [
    'latitude', 'longitude', 'bldgarea', 'resarea', 'numbldgs', 'numfloors',
    'unitsres', 'unitstotal', 'bldgfront', 'bldgdepth', 'assessland', 'assesstot',
]
pluto[numeric_cols] = pluto[numeric_cols].astype(float)

# convert categorical to strings: keep the part before any '.', then left-pad
# with zeros to at least two characters so codes line up with keys such as '01'.
for col in ['bldgclass', 'landuse', 'proxcode', 'bsmtcode']:
    pluto[col] = pluto[col].astype(str).apply(lambda x: x.split('.')[0].zfill(2))


In [8]:
# Derived features.
# `pd.datetime` was removed in pandas 2.0; take the current year from
# `pd.Timestamp` instead, and compute it once so both age columns agree.
current_year = pd.Timestamp.today().year

# Later of the two alteration years (row-wise max).
pluto['most_recent_alt'] = pluto[['yearalter1', 'yearalter2']].max(axis=1)

# Years elapsed since construction / since the most recent alteration.
pluto['age'] = current_year - pluto['yearbuilt'].dt.year
pluto['age_since_alt'] = current_year - pluto['most_recent_alt'].dt.year

# Ratios. A zero denominator is not guarded here, so such rows yield
# inf or NaN rather than raising.
pluto['pct_res'] = pluto['resarea'] / pluto['bldgarea']
pluto['avg_unit_size'] = pluto['resarea'] / pluto['unitsres']
pluto['bldg_shape_ratio'] = pluto['bldgfront'] / pluto['bldgdepth']

# Total assessed value minus the assessed land value.
pluto['bldg_value'] = pluto['assesstot'] - pluto['assessland']

# First character of the building class code.
pluto['main_bldg_class'] = pluto['bldgclass'].str[:1]

In [9]:
# Two-character proximity codes and the readable labels they translate to.
prox_code_dict = dict(zip(
    ['00', '01', '02', '03'],
    ['not_available', 'detached', 'semi_attached', 'attached'],
))

# Replace each code with its label; codes missing from the table become NaN.
proxcode_labels = pluto['proxcode'].map(prox_code_dict)
pluto['proxcode'] = proxcode_labels

# Append one indicator column per label (named `proxcode_<label>`).
proxcode_dummies = pd.get_dummies(pluto[['proxcode']])
pluto = pluto.join(proxcode_dummies)

In [10]:
# Two-character basement codes and the readable labels they translate to.
# Codes '01'/'03' share the 'bsmt_above' label and '02'/'04' share 'bsmt_below'.
bsmt_code_dict = dict(zip(
    ['00', '01', '02', '03', '04', '05'],
    ['no_basement', 'bsmt_above', 'bsmt_below', 'bsmt_above', 'bsmt_below', 'not_available'],
))

# Replace each code with its label; codes missing from the table become NaN.
bsmtcode_labels = pluto['bsmtcode'].map(bsmt_code_dict)
pluto['bsmtcode'] = bsmtcode_labels

# Append one indicator column per label (named `bsmtcode_<label>`).
bsmtcode_dummies = pd.get_dummies(pluto[['bsmtcode']])
pluto = pluto.join(bsmtcode_dummies)

In [11]:
# Write the processed table to disk, leaving the row index out of the file.
processed_csv_path = "../output/pluto_processed.csv"
pluto.to_csv(processed_csv_path, index=False)