In [134]:
import pandas as pd
import numpy as np
import sklearn
import re

### Import Data

In [234]:
# Load every raw source table from the local data/ directory.
# The reads are independent of one another; they are grouped by theme only.

# Public safety
crime_2015_data = pd.read_csv('data/Annual_Crime_Dataset_2015.csv')
crime_2016_data = pd.read_csv('data/2016_Annual_Crime_Data.csv')
traffic_camera_data = pd.read_csv('data/Traffic_Cameras.csv')

# Housing and utilities
housing_data = pd.read_csv('data/2014_Housing_Market_Analysis_Data_by_Zip_Code.csv')
water_consumption_data = pd.read_csv('data/Austin_Water_-_Residential_Water_Consumption.csv')
ev_charging_data = pd.read_csv('data/Electric_Vehicle_Charging_Network.csv')

# Amenities and culture
library_data = pd.read_csv('data/Austin_Public_Library_Locations.csv')
park_data = pd.read_csv('data/City_of_Austin_Parks_data.csv')
public_art_data = pd.read_csv('data/City_of_Austin_Public_Art_Collection.csv')
public_venue_data = pd.read_csv('data/Creative_Workspaces__Performance_Venues__Galleries___Museums.csv')
restaurant_inspection_data = pd.read_csv('data/Restaurant_Inspection_Scores.csv')

# Civic
campaign_finance_data = pd.read_csv('data/Campaign_Finance_Data_-_Report_Detail_Dataset.csv')

### Extract Zip Code from Address column in library data

In [239]:
# Pull the ZIP code out of the multi-line Address text: the first run of
# exactly five digits that is preceded by whitespace and ends its line.
#
# A raw string is used so that \s and \d reach the regex engine instead of
# being treated as (invalid) Python string escapes. Capturing only the digits
# avoids the earlier slice-by-position approach, which kept the leading
# whitespace and truncated the ZIP when more than one whitespace character
# preceded it. Rows with no match get NaN.
# NOTE(review): assumes ZIP codes in this column are always 5 digits - confirm.
library_data['Zip_Code'] = library_data['Address'].str.extract(r'\s(\d{5})\n', expand=False)

### Clean Restaurant Inspections (Multiple Dates per Restaurant)

In [240]:
# Reduce the inspection table to each restaurant's latest inspection.
#
# Step 1: find the maximum 'Inspection Date' per restaurant name.
# NOTE(review): if 'Inspection Date' is stored as text, 'max' compares the
# strings lexicographically, which is only chronological for ISO-style
# (YYYY-MM-DD) values - confirm the column format or parse it with
# pd.to_datetime before aggregating.
restaurant_max_inspection = (
    restaurant_inspection_data
    .groupby('Restaurant Name', as_index=False)['Inspection Date']
    .agg('max')
)

# Step 2: inner-join back onto the full table so that only rows matching the
# (name, latest date) pairs survive, carrying all their other columns along.
restaurant_inspection = pd.merge(
    left=restaurant_max_inspection,
    right=restaurant_inspection_data,
    how='inner',
    on=['Restaurant Name', 'Inspection Date'],
)

# Step 3: drop exact duplicate rows. The explicit .copy() makes the result an
# independent frame, so the column assignment below does not trigger pandas'
# SettingWithCopyWarning.
restaurant_inspection_data = restaurant_inspection.drop_duplicates().copy()

# Keep only the last five characters of the 'Zip Code' text.
restaurant_inspection_data['Zip Code'] = restaurant_inspection_data['Zip Code'].str[-5:]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


### Aggregate Files to Zip Code level

In [271]:
def _count_rows_by(frame, key, count_name):
    """Count the rows of ``frame`` for each distinct value of ``key``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table to aggregate.
    key : str
        Name of the column to group by.
    count_name : str
        Name given to the resulting count column.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``key`` and ``count_name``, one row per distinct key.

    Notes
    -----
    ``Series.reset_index(name=...)`` names the count column explicitly, so the
    result does not depend on whether ``GroupBy.size()`` labels its output
    ``0`` or ``'size'`` (this differs between pandas versions when
    ``as_index=False`` is used).
    """
    return frame.groupby(key).size().reset_index(name=count_name)


# Crime incidents per ZIP, one table per reporting year.
crime_2015 = _count_rows_by(crime_2015_data, 'GO Location Zip', 'crimes_2015')
crime_2016 = _count_rows_by(crime_2016_data, 'GO Location Zip', 'crimes_2016')

# Electric-vehicle charging stations per ZIP.
ev_charging = _count_rows_by(ev_charging_data, 'Postal Code', 'ev_charging_stations')

# Restaurant inspections per ZIP: median score and number of rows.
restaurant_inspection = (
    restaurant_inspection_data
    .groupby('Zip Code')['Score']
    .agg(['median', 'size'])
    .reset_index()
    .rename(columns={'median': 'median_rest_insp_score', 'size': 'number_of_inspections'})
)
# The ZIP values are text at this point; convert them to numbers.
restaurant_inspection['Zip Code'] = pd.to_numeric(restaurant_inspection['Zip Code'])

# Public art installations, public venues and parks per ZIP.
public_art = _count_rows_by(public_art_data, 'Location Zip Code', 'public_art_installations')
public_venue = _count_rows_by(public_venue_data, 'ZIP', 'public_venues')
park = _count_rows_by(park_data, 'ZIP_CODE', 'parks')

# Median of 'Total Gallons' per ZIP.
water_consumption = (
    water_consumption_data
    .groupby('Postal Code')['Total Gallons']
    .median()
    .reset_index()
    .rename(columns={'Total Gallons': 'median_water_used_gal'})
)

# Libraries per ZIP. The extracted ZIP text may carry surrounding whitespace,
# so strip it before converting to a number.
library = _count_rows_by(library_data, 'Zip_Code', 'libraries')
library['Zip_Code'] = pd.to_numeric(library['Zip_Code'].str.strip())

### Combine Files

In [270]:
# Build one ZIP-level table, starting from the 2015 crime counts.
combined_data = crime_2015.copy()
print(combined_data.shape)

# Outer join with the 2016 counts so ZIPs present in only one year are kept.
combined_data = combined_data.merge(crime_2016, how='outer', on='GO Location Zip')

# Remaining tables are left-joined in this order; each entry pairs a table
# with the name of its own ZIP column.
zip_keyed_tables = [
    (ev_charging, 'Postal Code'),
    (housing_data, 'Zip Code'),
    (park, 'ZIP_CODE'),
    (public_art, 'Location Zip Code'),
    (water_consumption, 'Postal Code'),
    (library, 'Zip_Code'),
]

for zip_table, zip_column in zip_keyed_tables:
    combined_data = combined_data.merge(
        zip_table,
        how='left',
        left_on='GO Location Zip',
        right_on=zip_column,
    )
    # Remove the right-hand key column; 'GO Location Zip' stays as the key.
    del combined_data[zip_column]

print(combined_data.shape)

combined_data.head(10)

(47, 2)
(48, 37)


Unnamed: 0,GO Location Zip,crimes_2015,crimes_2016,ev_charging_stations,Population below poverty level,Median household income,"Non-White, Non-Hispanic or Latino","Hispanic or Latino, of any race",Population with disability,Unemployment,...,"Change in percentage of population below poverty, 2000-2012","Change in median rent, 2000-2012","Change in median home value, 2000-2012",Percentage of homes within 1/4-mi of transit stop,Average monthly transportation cost,Percentage of housing and transportation costs that is transportation-related,parks,public_art_installations,median_water_used_gal,libraries
0,78613,390.0,341.0,1.0,,,,,,,...,,,,,,,1.0,,636000.0,
1,78617,276.0,285.0,,18%,$43957,12%,67%,10%,15%,...,101%,74%,21%,16%,$865,42%,5.0,,3367450.0,
2,78652,13.0,16.0,,,,,,,,...,,,,,,,2.0,,69300.0,
3,78653,27.0,48.0,,,,,,,,...,,,,,,,,,311300.0,
4,78660,114.0,179.0,,,,,,,,...,,,,,,,1.0,,3306000.0,
5,78701,2103.0,2076.0,50.0,20%,$68152,16%,14%,10%,9%,...,12%,115%,59%,97%,$433,23%,13.0,114.0,942600.0,3.0
6,78702,1668.0,1582.0,18.0,33%,$34734,18%,56%,14%,11%,...,3%,73%,207%,96%,$590,39%,22.0,85.0,2884950.0,4.0
7,78703,738.0,660.0,10.0,10%,$92606,9%,9%,6%,4%,...,7%,65%,104%,67%,$629,25%,18.0,8.0,4255050.0,1.0
8,78704,2571.0,2557.0,29.0,21%,$50248,7%,30%,9%,7%,...,33%,40%,126%,76%,$629,33%,26.0,14.0,13373950.0,1.0
9,78705,1346.0,1134.0,4.0,66%,$11917,21%,17%,4%,14%,...,49%,70%,40%,100%,$511,30%,7.0,3.0,4309450.0,
