# Generate tax features

### Introduction

This notebook documents the process of generating feature data from the file matched_EAS_Tax_Data.csv. These features will be used as predictor variables in modeling.

### Load libraries and CSV

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime

# NOTE(review): machine-specific absolute directory; the export cell at the
# end of the notebook writes its CSV here.
path = 'C:\\Users\\Kevin\\Desktop\\Fire Risk\\Model_matched_to_EAS'

# Columns kept from the matched tax roll, in the order they should appear.
tax_columns = [
    'EAS BaseID',
    'Neighborhoods - Analysis Boundaries',
    'Property Class Code',
    'Property_Class_Code_Desc',
    'Location_y',
    'Address',
    'Year Property Built',
    'Number of Bathrooms',
    'Number of Bedrooms',
    'Number of Rooms',
    'Number of Stories',
    'Number of Units',
    'Percent of Ownership',
    'Closed Roll Assessed Land Value',
    'Property Area in Square Feet',
    'Closed Roll Assessed Improvement Value',
]

# This will take a while to load.  Very large file...
# usecols keeps pandas from parsing columns that would be thrown away right
# after loading.  read_csv returns usecols in file order, so the trailing
# [tax_columns] restores the order listed above.  Rows with a missing value in
# any kept column are dropped.
tax_df = pd.read_csv('data/Model_matched_to_EAS/matched_EAS_Tax_Data.csv',
                     usecols=tax_columns,
                     low_memory=False)[tax_columns].dropna()

# Create land value per square foot var
tax_df['landval_psqft'] = (tax_df['Closed Roll Assessed Land Value']
                           / tax_df['Property Area in Square Feet'])

# Shorter names for the join key and the neighborhood column
tax_df = tax_df.rename(columns={
    'EAS BaseID': 'EAS',
    'Neighborhoods - Analysis Boundaries': 'Neighborhood',
})

In [2]:
# Preview the first rows of the loaded frame (selected columns, renamed keys, landval_psqft).
tax_df.head()

Unnamed: 0,EAS,Neighborhood,Property Class Code,Property_Class_Code_Desc,Location_y,Address,Year Property Built,Number of Bathrooms,Number of Bedrooms,Number of Rooms,Number of Stories,Number of Units,Percent of Ownership,Closed Roll Assessed Land Value,Property Area in Square Feet,Closed Roll Assessed Improvement Value,landval_psqft
0,467876,Financial District/South Beach,Z,Condominium,"(37.7862904935788, -122.401375196262)",188 MINNA ST,2005.0,2.0,2,5,0,0,1.0,1140725.0,1670,760483.0,683.068862
1,467876,Financial District/South Beach,Z,Condominium,"(37.7862904935788, -122.401375196262)",188 MINNA ST,2005.0,2.0,2,5,0,0,1.0,1168821.0,1670,779213.0,699.892814
2,467876,Financial District/South Beach,Z,Condominium,"(37.7862904935788, -122.401375196262)",188 MINNA ST,2005.0,2.0,2,5,0,0,1.0,1118358.0,1670,745572.0,669.675449
3,467876,Financial District/South Beach,Z,Condominium,"(37.7862904935788, -122.401375196262)",188 MINNA ST,2005.0,2.0,2,5,0,0,1.0,1110000.0,1670,740000.0,664.670659
4,467876,Financial District/South Beach,Z,Condominium,"(37.7862904935788, -122.401375196262)",188 MINNA ST,2005.0,2.0,2,5,0,0,0.75,837521.0,1670,837521.0,501.509581


### Remove outlier observations, then collapse by EAS

In [13]:
def removal(var, low, high, df=None):
    """Return the rows whose ``var`` value lies in the inclusive range [low, high].

    Parameters
    ----------
    var : str
        Name of the column to test.
    low, high : scalar
        Inclusive lower and upper bounds; rows with ``var`` below ``low`` or
        above ``high`` are removed.
    df : pandas.DataFrame, optional
        Frame to filter.  Defaults to the notebook-level ``tax_df`` so the
        existing ``tax_df = removal(col, low, high)`` calls keep working.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the in-range rows; the input is not modified.
    """
    if df is None:
        df = tax_df
    # The previous version computed a mask with `<= low` (wrong direction) and
    # then discarded the filtered frame, so nothing was ever removed.
    in_range = df[var].between(low, high)
    # Explicit copy so later column assignments on the result do not trigger
    # pandas' SettingWithCopyWarning.
    return df[in_range].copy()

# (column, low, high) triples handed to removal(), in the order they are applied.
outlier_bounds = [
    ('Number of Stories', 1, 30),         # remove if 0 stories or > 30 stories
    ('landval_psqft', 1, 1000),           # remove if land value / sq ft < 1 or > 1000
    ('Number of Bathrooms', 0, 100),      # remove if bathrooms, bedrooms or
    ('Number of Bedrooms', 0, 100),       #   extra rooms > 100
    ('Number of Rooms', 0, 100),
    ('Year Property Built', 1880, 2017),  # remove if built < 1880 or > 2017
    ('Number of Units', 0, 250),          # remove if units > 250
    ('Percent of Ownership', 0, 1),       # remove if ownership share < 0 or > 1
]
for column, lower, upper in outlier_bounds:
    tax_df = removal(column, lower, upper)

# Tot_Rooms = bathrooms + bedrooms + other rooms
tax_df['Tot_Rooms'] = (
    tax_df['Number of Bathrooms']
    + tax_df['Number of Bedrooms']
    + tax_df['Number of Rooms']
)

# Numeric columns only, averaged over all rows sharing an EAS
numeric_columns = [
    'EAS',
    'Year Property Built',
    'Number of Bathrooms',
    'Number of Bedrooms',
    'Number of Rooms',
    'Number of Stories',
    'Number of Units',
    'Percent of Ownership',
    'Closed Roll Assessed Land Value',
    'Property Area in Square Feet',
    'Closed Roll Assessed Improvement Value',
    'Tot_Rooms',
    'landval_psqft',
]
eas_groups = tax_df[numeric_columns].groupby(by='EAS')
tax_df_num = eas_groups.mean().reset_index()

In [14]:
# Render floats with two decimals, then show summary statistics of the per-EAS averages.
pd.set_option('display.float_format', '{:.2f}'.format)
tax_df_num.describe()

Unnamed: 0,EAS,Year Property Built,Number of Bathrooms,Number of Bedrooms,Number of Rooms,Number of Stories,Number of Units,Percent of Ownership,Closed Roll Assessed Land Value,Property Area in Square Feet,Closed Roll Assessed Improvement Value,Tot_Rooms,landval_psqft
count,182644.0,182644.0,182644.0,182644.0,182644.0,182644.0,182644.0,182644.0,182644.0,182644.0,182644.0,182644.0,182644.0
mean,374906.74,1929.35,1.0,1.0,10.71,1.79,2.47,0.87,435003.43,4226.29,419898.65,12.72,143.15
std,58990.87,25.0,2.42,2.42,10.7,1.05,5.2,0.23,1865870.6,16384.31,2805331.13,12.43,140.32
min,274493.0,1880.0,0.0,0.0,0.0,1.0,0.0,0.0,1902.0,100.0,0.0,0.0,1.03
25%,324300.75,1908.0,0.0,0.0,5.0,1.0,1.0,0.78,78906.33,1325.0,101625.12,6.0,30.07
50%,373653.5,1925.0,0.0,0.0,7.0,2.0,1.0,1.0,224573.61,2000.0,198820.14,9.0,98.6
75%,421535.25,1945.0,1.33,1.33,12.0,2.0,1.89,1.0,444255.47,3484.0,354569.88,15.0,212.3
max,490644.0,2016.0,80.0,80.0,99.0,30.0,193.0,1.0,140883380.88,1320000.0,323993523.44,205.0,994.22


### Create subset of string vars

In [16]:
# Text columns carried along with the EAS key
string_columns = [
    'EAS',
    'Neighborhood',
    'Property Class Code',
    'Property_Class_Code_Desc',
    'Location_y',
    'Address',
]

# One row per EAS.
# NOTE(review): max() is evaluated column by column within each EAS, so when an
# EAS has rows with differing text values the result can combine values that
# come from different source rows -- confirm this is acceptable.
tax_df_str = tax_df[string_columns].groupby(by='EAS').max().reset_index()

# Upper-case the free-text labels so they group consistently
for text_column in ('Property_Class_Code_Desc', 'Neighborhood'):
    tax_df_str[text_column] = tax_df_str[text_column].apply(lambda value: value.upper())

In [18]:
# Preview the first rows of the one-row-per-EAS text columns.
tax_df_str.head()

Unnamed: 0,EAS,Neighborhood,Property Class Code,Property_Class_Code_Desc,Location_y,Address
0,274493,RUSSIAN HILL,A,APARTMENT,"(37.8064516469645, -122.420784953602)",2761 HYDE ST
1,274494,RUSSIAN HILL,AC,APARTMNT & COMMERCIAL STORE,"(37.8050093143699, -122.420019737303)",2606 HYDE ST
2,274503,RUSSIAN HILL,A,APARTMENT,"(37.8041055728047, -122.416472710727)",2436 JONES ST
3,274504,NORTH BEACH,F2,FLAT & STORE,"(37.8045548007939, -122.41310730589)",2262 MASON ST
4,274505,NORTH BEACH,A,APARTMENT,"(37.8042828119518, -122.410214408196)",404 CHESTNUT ST


### Create more generalized grouping for Property Class

In [21]:
# Let long tables render (up to 999 rows), then count EAS rows for every
# (class code, description) pair to see which codes are worth their own group.
pd.set_option("display.max_rows", 999)
class_keys = ['Property Class Code', 'Property_Class_Code_Desc']
tax_df_str.groupby(class_keys).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,EAS,Neighborhood,Location_y,Address
Property Class Code,Property_Class_Code_Desc,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,APARTMENT,22085,22085,22085,22085
AC,APARTMNT & COMMERCIAL STORE,1051,1051,1051,1051
B,BANK,126,126,126,126
C,COMMERCIAL STORES,4402,4402,4402,4402
C1,SHOPPING CENTER,14,14,14,14
CD,COMMERCIAL STORES,10,10,10,10
CM,COMMERCIAL/MIXED USE,27,27,27,27
CO,COOP UNITS UNSEGREGATED,27,27,27,27
COS,COOP UNITS SEGREGATED,4,4,4,4
COS,COOP UNITS UNSEGREGATED,23,23,23,23


Somewhat difficult to group.  I think we should separate some of the large categories, and roll all of the smaller categories into "Other".  For example:

APARTMENTS -  A, AC, DA, TIA  
DWELLING - D  
FLATS AND DUPLEX - F, F2, FA, TIF  
CONDO - Z Condominium  
COMMERCIAL - C, CD, B, C1, CM, CZ  
INDUSTRIAL - I, IDC, IW, IX, IZ  
OFFICE - O, OA, OAH, OBH, OBM, OC, OCH, OCL, OCM, OMD, OZ  
OTHER - All other codes

In [27]:
# Broad building category -> property class codes assigned to it.
# ('CD' was previously listed twice under COMMERCIAL USE; once is enough.)
category_codes = {
    'APARTMENT': ['A', 'AC', 'DA', 'TIA'],
    'DWELLING': ['D'],
    'FLATS AND DUPLEX': ['F', 'F2', 'FA', 'TIF'],
    'CONDO, ETC.': ['Z'],
    'COMMERCIAL USE': ['C', 'CD', 'B', 'C1', 'CM', 'CZ'],
    'INDUSTRIAL USE': ['I', 'IDC', 'IW', 'IX', 'IZ'],
    'OFFICE': ['O', 'OA', 'OAH', 'OBH', 'OBM', 'OC', 'OCH', 'OCL', 'OCM', 'OMD', 'OZ'],
}

# Reverse the mapping: property class code -> broad category
di = {code: category
      for category, codes in category_codes.items()
      for code in codes}

# Map to 'Building_Cat' groupings var.  map() yields NaN for every code that
# is not a key of di; those remainders are placed in the "OTHER" category.
# Deriving OTHER from the NaNs (rather than from a separate hand-written list
# of category names) means a category added to category_codes can never be
# overwritten by mistake.
tax_df_str['Building_Cat'] = tax_df_str['Property Class Code'].map(di).fillna('OTHER')

In [29]:
# Number of EAS rows in each Building_Cat group, largest first.
tax_df_str['Building_Cat'].value_counts()

DWELLING            95387
FLATS AND DUPLEX    40349
APARTMENT           24213
CONDO, ETC.          9674
COMMERCIAL USE       4842
OTHER                3977
INDUSTRIAL USE       2158
OFFICE               2044
Name: Building_Cat, dtype: int64

### Merge DF back, clean up, export 

In [31]:
# Attach the per-EAS numeric averages to the per-EAS text columns (left join on
# EAS keeps every row of tax_df_str), then discard the raw class code columns.
raw_class_columns = ['Property Class Code', 'Property_Class_Code_Desc']
exp_df = tax_df_str.merge(tax_df_num, how='left', on='EAS')
exp_df.drop(raw_class_columns, axis=1, inplace=True)

In [39]:
# Rename the verbose assessor column names to short, underscore-style names
export_names = {
    'Year Property Built': 'Yr_Property_Built',
    'Number of Bathrooms': 'Num_Bathrooms',
    'Number of Bedrooms': 'Num_Bedrooms',
    'Number of Rooms': 'Num_Rooms',
    'Number of Stories': 'Num_Stories',
    'Number of Units': 'Num_Units',
    'Percent of Ownership': 'Perc_Ownership',
    'Closed Roll Assessed Land Value': 'Land_Value',
    'Property Area in Square Feet': 'Property_Area',
    'Closed Roll Assessed Improvement Value': 'Assessed_Improvement_Val',
}
exp_df.rename(columns=export_names, inplace=True)

In [40]:
# Check column names, non-null counts and dtypes of the frame before exporting it.
exp_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 182644 entries, 0 to 182643
Data columns (total 17 columns):
EAS                         182644 non-null int64
Neighborhood                182644 non-null object
Location_y                  182644 non-null object
Address                     182644 non-null object
Building_Cat                182644 non-null object
Yr_Property_Built           182644 non-null float64
Num_Bathrooms               182644 non-null float64
Num_Bedrooms                182644 non-null float64
Num_Rooms                   182644 non-null float64
Num_Stories                 182644 non-null float64
Num_Units                   182644 non-null float64
Perc_Ownership              182644 non-null float64
Land_Value                  182644 non-null float64
Property_Area               182644 non-null float64
Assessed_Improvement_Val    182644 non-null float64
Tot_Rooms                   182644 non-null float64
landval_psqft               182644 non-null float64
dtypes: flo

In [41]:
# Export data: write the merged frame, without the index column, into the
# directory held in `path`.
export_file = path + '\\' + 'tax_data_formerge_20170917.csv'
exp_df.to_csv(export_file, index=False)