# Merging Pluto and Census data with HPD data.
In this notebook, we merge the processed HPD data with the PLUTO and census data. We merge the HPD and PLUTO data first, using BBL as the merge key. Then we merge the result with the census data, using BoroughID and census tract (CT2010) as keys.

In [26]:
import pandas as pd
from get_clean_pluto_building_data import *
import re

# Load the cleaned PLUTO building data and the processed HPD records.
pluto = get_clean_pluto_data()
hpd = pd.read_csv('../../data/merged_complaints_problems_violations.csv')

# Inner join on BBL: only rows whose BBL appears in both sources are kept.
merged_hpd_pluto = pluto.merge(hpd, how='inner', on='BBL')
merged_hpd_pluto.head(6)

Unnamed: 0.1,UnitsRes,AssessTot,BBL,CT2010,Avg_value_per_res_unit,BuildingAge,YearSinceLastAlter,Unnamed: 0,UnitTypeID,SpaceTypeID,...,MajorCategoryID,MinorCategoryID,CodeID,Probs_in_complaint,ViolationIssued,BoroughID,ReceivedDate,Tot_A_violations,Tot_B_violations,Tot_C_violations
0,8,104219,2022600018,1900,13027.375,95,6,571341,92,550,...,59,349,2716,1,0,2,2015-04-28,1,0,4
1,8,104219,2022600018,1900,13027.375,95,6,571342,91,543,...,59,348,2713,1,0,2,2015-04-28,1,0,4
2,12,347400,2022610045,1900,28950.0,90,90,175068,92,550,...,59,349,2715,1,0,2,2014-11-05,1,1,0
3,12,347400,2022610045,1900,28950.0,90,90,175069,91,543,...,59,348,2713,1,0,2,2014-11-13,1,1,0
4,12,347400,2022610045,1900,28950.0,90,90,175070,92,550,...,59,349,2715,1,0,2,2014-11-13,1,1,0
5,12,347400,2022610045,1900,28950.0,90,90,175071,92,550,...,59,349,2715,1,0,2,2014-11-13,1,1,0


In [27]:
# Dimensions (rows, columns) of the HPD/PLUTO inner join.
merged_hpd_pluto.shape

(609157, 21)

In [28]:
# List every column label of the merged HPD/PLUTO frame.
merged_hpd_pluto.columns

Index([u'UnitsRes', u'AssessTot', u'BBL', u'CT2010', u'Avg_value_per_res_unit',
       u'BuildingAge', u'YearSinceLastAlter', u'Unnamed: 0', u'UnitTypeID',
       u'SpaceTypeID', u'TypeID', u'MajorCategoryID', u'MinorCategoryID',
       u'CodeID', u'Probs_in_complaint', u'ViolationIssued', u'BoroughID',
       u'ReceivedDate', u'Tot_A_violations', u'Tot_B_violations',
       u'Tot_C_violations'],
      dtype='object')

Note that the row index from the HPD file has been added as a feature to the merged dataset (as evidenced by the fact that it has 609157 unique values in a dataframe with 609157 rows), so we'll drop it before proceeding.

In [29]:
# Count the distinct values in 'Unnamed: 0' to compare against the row count.
len(merged_hpd_pluto['Unnamed: 0'].unique())

609157

In [30]:
# Remove the 'Unnamed: 0' column; it carries no information beyond row identity.
merged_hpd_pluto = merged_hpd_pluto.drop(labels=['Unnamed: 0'], axis=1)

In [31]:
# Re-check the dimensions of the HPD/PLUTO frame.
merged_hpd_pluto.shape

(609157, 20)

In [32]:
from get_income_data_from_census import *
# Load the cleaned census income table.
income = get_clean_income_data()

# Inner join on census tract and borough together.
merged_pluto_hpd_census = pd.merge(
    left=income,
    right=merged_hpd_pluto,
    how='inner',
    on=['CT2010', 'BoroughID'],
)

In [33]:
# Preview the first six rows of the census/HPD/PLUTO merge.
merged_pluto_hpd_census.head(6)

Unnamed: 0,Median_income,State,CT2010,BoroughID,UnitsRes,AssessTot,BBL,Avg_value_per_res_unit,BuildingAge,YearSinceLastAlter,...,TypeID,MajorCategoryID,MinorCategoryID,CodeID,Probs_in_complaint,ViolationIssued,ReceivedDate,Tot_A_violations,Tot_B_violations,Tot_C_violations
0,69514,36,200,2,2,7363,2034410083,3681.5,75,75,...,4,10,341,2678,1,0,2015-04-17,0,0,0
1,69514,36,200,2,1,8060,2034420123,8060.0,70,70,...,1,61,365,2767,1,0,2015-03-28,0,0,0
2,69514,36,200,2,1,8060,2034420123,8060.0,70,70,...,1,56,333,2664,5,0,2015-03-28,0,0,0
3,69514,36,200,2,1,8060,2034420123,8060.0,70,70,...,3,9,65,2536,5,0,2015-03-28,0,0,0
4,69514,36,200,2,1,8060,2034420123,8060.0,70,70,...,3,11,73,679,5,0,2015-03-28,0,0,0
5,69514,36,200,2,1,8060,2034420123,8060.0,70,70,...,3,63,376,2822,5,0,2015-03-28,0,0,0


In [34]:
# Dimensions of the three-way merge. A bare expression (rather than the
# Python 2 print statement) runs on both Python 2 and 3 and matches the
# other shape checks in this notebook.
merged_pluto_hpd_census.shape

(609157, 22)


In [35]:
# List every column label of the three-way merge.
merged_pluto_hpd_census.columns

Index([u'Median_income', u'State', u'CT2010', u'BoroughID', u'UnitsRes',
       u'AssessTot', u'BBL', u'Avg_value_per_res_unit', u'BuildingAge',
       u'YearSinceLastAlter', u'UnitTypeID', u'SpaceTypeID', u'TypeID',
       u'MajorCategoryID', u'MinorCategoryID', u'CodeID',
       u'Probs_in_complaint', u'ViolationIssued', u'ReceivedDate',
       u'Tot_A_violations', u'Tot_B_violations', u'Tot_C_violations'],
      dtype='object')

Now, we drop features that we anticipate are non-informative.

In [36]:
# State, CT2010 and BBL are removed here; the shape is shown to confirm
# that three columns are gone.
merged_pluto_hpd_census = merged_pluto_hpd_census.drop(
    labels=['State', 'CT2010', 'BBL'],
    axis=1,
)
merged_pluto_hpd_census.shape

(609157, 19)

In [37]:
# List the column labels that remain in the merged frame.
merged_pluto_hpd_census.columns

Index([u'Median_income', u'BoroughID', u'UnitsRes', u'AssessTot',
       u'Avg_value_per_res_unit', u'BuildingAge', u'YearSinceLastAlter',
       u'UnitTypeID', u'SpaceTypeID', u'TypeID', u'MajorCategoryID',
       u'MinorCategoryID', u'CodeID', u'Probs_in_complaint',
       u'ViolationIssued', u'ReceivedDate', u'Tot_A_violations',
       u'Tot_B_violations', u'Tot_C_violations'],
      dtype='object')

Now, let's see if it has any missing values:

In [38]:
# Number of rows that contain at least one missing value.
merged_pluto_hpd_census.isnull().any(axis=1).sum()

35

It looks like we still have 35 records with missing values. Since they represent an insignificant fraction of our total dataset, we drop them.

In [39]:
# Keep only the rows in which every column is populated.
merged_pluto_hpd_census = merged_pluto_hpd_census[
    merged_pluto_hpd_census.notnull().all(axis=1)
]

In [40]:
# Dimensions after removing the rows with missing values.
merged_pluto_hpd_census.shape

(609122, 19)

In [41]:
# Preview the first five rows of the merged frame.
merged_pluto_hpd_census.head(5)

Unnamed: 0,Median_income,BoroughID,UnitsRes,AssessTot,Avg_value_per_res_unit,BuildingAge,YearSinceLastAlter,UnitTypeID,SpaceTypeID,TypeID,MajorCategoryID,MinorCategoryID,CodeID,Probs_in_complaint,ViolationIssued,ReceivedDate,Tot_A_violations,Tot_B_violations,Tot_C_violations
0,69514,2,2,7363,3681.5,75,75,91,543,4,10,341,2678,1,0,2015-04-17,0,0,0
1,69514,2,1,8060,8060.0,70,70,93,573,1,61,365,2767,1,0,2015-03-28,0,0,0
2,69514,2,1,8060,8060.0,70,70,91,545,1,56,333,2664,5,0,2015-03-28,0,0,0
3,69514,2,1,8060,8060.0,70,70,91,546,3,9,65,2536,5,0,2015-03-28,0,0,0
4,69514,2,1,8060,8060.0,70,70,91,546,3,11,73,679,5,0,2015-03-28,0,0,0


Now we convert ReceivedDate to a datetime feature, then extract the month (to use as a categorical variable approximating season).

In [42]:
# Parse the complaint date once, outside the frame.
received_dates = pd.to_datetime(merged_pluto_hpd_census['ReceivedDate'])

# Drop the raw date first: `drop` returns a new frame, so the Month column
# below is added to that new object rather than to the row-filtered frame.
merged_pluto_hpd_census = merged_pluto_hpd_census.drop('ReceivedDate', axis=1)

# `.dt.month` is the vectorized equivalent of mapping `lambda x: x.month`
# over every row; the result is aligned on the shared index.
merged_pluto_hpd_census['Month'] = received_dates.dt.month

Now, before we finish, we need to make the following dummy variables:
  - Month
  - BoroughID
  - UnitTypeID
  - SpaceTypeID
  - TypeID
  - MajorCategoryID
  - MinorCategoryID
  - CodeID
Before we do, we describe the dataset: adding the dummies will dramatically increase the number of features, so we summarize it first.

In [43]:
# Summary statistics for the merged frame.
merged_pluto_hpd_census.describe()

Unnamed: 0,BoroughID,UnitsRes,AssessTot,Avg_value_per_res_unit,BuildingAge,YearSinceLastAlter,UnitTypeID,SpaceTypeID,TypeID,MajorCategoryID,MinorCategoryID,CodeID,Probs_in_complaint,ViolationIssued,Tot_A_violations,Tot_B_violations,Tot_C_violations,Month
count,609122.0,609122.0,609122.0,609122.0,609122.0,609122.0,609122.0,609122.0,609122.0,609122.0,609122.0,609122.0,609122.0,609122.0,609122.0,609122.0,609122.0,609122.0
mean,2.419904,84.783628,2220525.0,23228.780804,83.888305,65.596491,91.354267,546.264729,1.785552,45.71983,293.525647,2565.203797,3.701505,0.191119,7.025087,22.061372,5.286616,6.216817
std,1.001496,417.443972,15380700.0,58894.912807,22.943957,32.934967,0.600159,6.046985,1.033644,21.369265,107.187189,495.660288,3.356061,0.393183,16.584919,48.557439,11.001849,3.832106
min,1.0,1.0,0.0,0.0,1.0,0.0,91.0,541.0,1.0,8.0,59.0,617.0,1.0,0.0,0.0,0.0,0.0,1.0
25%,2.0,8.0,111016.0,12515.625,80.0,30.0,91.0,543.0,1.0,28.0,198.0,2617.0,1.0,0.0,0.0,0.0,0.0,2.0
50%,2.0,31.0,619650.0,17318.181818,88.0,84.0,91.0,543.0,1.0,59.0,348.0,2713.0,2.0,0.0,1.0,4.0,1.0,6.0
75%,3.0,64.0,1327050.0,24347.727273,95.0,90.0,92.0,550.0,3.0,59.0,349.0,2806.0,6.0,0.0,7.0,23.0,6.0,10.0
max,5.0,10914.0,569465100.0,14307300.0,215.0,215.0,93.0,577.0,4.0,65.0,381.0,2834.0,42.0,1.0,303.0,896.0,151.0,12.0


In [44]:
# Dimensions of the merged frame before dummy encoding.
merged_pluto_hpd_census.shape

(609122, 19)

Now we make dummy variables for our categorical features.

In [61]:
def make_dummy_variables(dataframe, feature):
    """Build indicator (dummy) columns for one categorical column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing ``feature``. It is not modified.
    feature : str
        Name of the categorical column; its values must be convertible
        with ``int()`` because they are embedded in the column names.

    Returns
    -------
    pandas.DataFrame
        One column per category, named ``<feature>_<int(category)>``, with
        the last category (in ``pd.get_dummies`` column order) left out so
        it acts as the reference level.
    """
    dummies = pd.get_dummies(dataframe[feature])
    # Name each column from the category it actually encodes. Taking names
    # from `Series.unique()` is wrong: that is order-of-appearance, while
    # get_dummies emits its columns in sorted order, so labels would be
    # attached to the wrong indicators.
    dummies.columns = ['{}_{}'.format(str(feature), str(int(category)))
                       for category in dummies.columns]
    # Positional slice: drops the final level and is a no-op on an empty frame.
    return dummies.iloc[:, :-1]

def drop_minor_classes_for_feature(dataframe, feature, min_num_obs=500):
    """Collapse rare classes of a categorical column into the value 0.

    A class is kept when it occurs strictly more than ``min_num_obs`` times
    in ``dataframe[feature]``; every other non-missing value is replaced by
    0. Missing values are left as they are.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to update. NOTE: the column is overwritten in place on this
        object; the same object is also returned.
    feature : str
        Name of the categorical column.
    min_num_obs : int, optional
        Occurrence threshold (exclusive). Defaults to 500.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object, with ``feature`` rewritten.
    """
    column = dataframe[feature]
    counts = column.value_counts()
    frequent_classes = counts[counts > min_num_obs].index
    # Vectorized membership test instead of a per-row lambda; missing values
    # are kept rather than raising in int().
    keep = column.isin(frequent_classes) | column.isnull()
    dataframe[feature] = column.where(keep, 0)
    return dataframe

def clean_categorical_vars(dataframe, list_of_cat_vars):
    """Replace categorical columns with dummy (indicator) columns.

    For each name in ``list_of_cat_vars`` the column is first passed through
    ``drop_minor_classes_for_feature`` and then expanded with
    ``make_dummy_variables``; the original categorical columns are removed.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input frame. It is copied up front and is not modified.
    list_of_cat_vars : list of str
        Names of the categorical columns to encode.

    Returns
    -------
    pandas.DataFrame
        The non-categorical columns in their original order, followed by
        the dummy columns of each variable in ``list_of_cat_vars`` order.
    """
    # Work on a private copy: the rare-class helper rewrites columns in
    # place, and the caller's frame should not end up partially modified.
    working = dataframe.copy()
    dummy_frames = []
    for var in list_of_cat_vars:
        # Use the returned frame instead of relying on in-place mutation.
        working = drop_minor_classes_for_feature(working, var)
        dummy_frames.append(make_dummy_variables(working, var))
    # Drop all encoded columns and concatenate once, rather than rebuilding
    # the whole frame on every loop iteration.
    remaining = working.drop(list(list_of_cat_vars), axis=1)
    return pd.concat([remaining] + dummy_frames, axis=1)

In [62]:
# Categorical columns to be expanded into dummy variables.
cat_vars = [
    'BoroughID',
    'UnitTypeID',
    'SpaceTypeID',
    'TypeID',
    'MajorCategoryID',
    'MinorCategoryID',
    'CodeID',
    'Month',
]
cleaned_df = clean_categorical_vars(
    dataframe=merged_pluto_hpd_census,
    list_of_cat_vars=cat_vars,
)

In [63]:
# Dimensions of the dummy-encoded frame.
cleaned_df.shape

(609122, 214)

In [66]:
# Sanity check: number of rows in the encoded frame with any missing value.
cleaned_df.isnull().any(axis=1).sum()

0

In [None]:
# Write the final encoded dataset to disk.
# NOTE(review): with the default settings the row index is written as an
# unnamed first column, which a plain read_csv will load as 'Unnamed: 0';
# consider index=False once downstream readers are confirmed not to need it.
cleaned_df.to_csv('../../data/merged_hpd_census_pluto.csv')