In [39]:
import pandas as pd
import numpy as np
import sklearn
import re

# import pip
# pip.main(['install', 'xgboost'])
import xgboost as xgb
import statsmodels.formula.api as sm

pd.set_option('display.max_columns', 500)

### Import Markdown

In [10]:
housing_data = pd.read_csv('data/2014_Housing_Market_Analysis_Data_by_Zip_Code.csv')
crime_2016_data = pd.read_csv('data/2016_Annual_Crime_Data.csv')
crime_2015_data = pd.read_csv('data/Annual_Crime_Dataset_2015.csv')
library_data = pd.read_csv('data/Austin_Public_Library_Locations.csv')
water_consumption_data = pd.read_csv('data/Austin_Water_-_Residential_Water_Consumption.csv')
campaign_finance_data = pd.read_csv('data/Campaign_Finance_Data_-_Report_Detail_Dataset.csv')
park_data = pd.read_csv('data/City_of_Austin_Parks_data.csv')
public_art_data = pd.read_csv('data/City_of_Austin_Public_Art_Collection.csv')
public_venue_data = pd.read_csv('data/Creative_Workspaces__Performance_Venues__Galleries___Museums.csv')
ev_charging_data = pd.read_csv('data/Electric_Vehicle_Charging_Network.csv')
restaurant_inspection_data = pd.read_csv('data/Restaurant_Inspection_Scores.csv')
traffic_camera_data = pd.read_csv('data/Traffic_Cameras.csv')

### Extract Zip Code from Address column in library data

In [11]:
library_data['Zip_Code'] = library_data['Address'].str.findall('\s+\d+\n')
library_data['Zip_Code'] = library_data['Zip_Code'].str[0].str[:6]

### Clean Restaurant Inspections (Multiple Dates per Restaurant)

In [12]:
restaurant_max_inspection = restaurant_inspection_data.groupby('Restaurant Name', as_index = False)['Inspection Date'].agg('max')

restaurant_inspection = pd.merge(left = restaurant_max_inspection, right = restaurant_inspection_data
                      , how = 'inner'
                      , left_on = ['Restaurant Name', 'Inspection Date']
                      , right_on = ['Restaurant Name', 'Inspection Date'])

restaurant_inspection_data = restaurant_inspection.drop_duplicates()

restaurant_inspection_data['Zip Code'] = restaurant_inspection_data['Zip Code'].str[-5:]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


### Aggregate Files to Zip Code level

In [13]:
crime_2015 = crime_2015_data.groupby('GO Location Zip', as_index = False).size()
crime_2015 = crime_2015.reset_index()
crime_2015 = crime_2015.rename(columns = {0: 'crimes_2015'})
# print(crime_2015.head())

crime_2016 = crime_2016_data.groupby('GO Location Zip', as_index = False).size()
crime_2016 = crime_2016.reset_index()
crime_2016 = crime_2016.rename(columns = {0: 'crimes_2016'})
# crime_2016.head()

ev_charging = pd.DataFrame(ev_charging_data.groupby('Postal Code', as_index = False).size())
ev_charging = ev_charging.reset_index()
ev_charging = ev_charging.rename(columns = {0: 'ev_charging_stations'})
# ev_charging.head()

restaurant_inspection = restaurant_inspection_data.groupby('Zip Code', as_index = False)['Score'].agg(['median','size'])
restaurant_inspection = restaurant_inspection.reset_index()
restaurant_inspection = restaurant_inspection.rename(columns = {'median': 'median_rest_insp_score', 'size':'number_of_inspections'})
restaurant_inspection['Zip Code'] = pd.to_numeric(restaurant_inspection['Zip Code'])
# print(restaurant_inspection.head())

public_art = public_art_data.groupby('Location Zip Code', as_index = False).size()
public_art = public_art.reset_index()
public_art = public_art.rename(columns = {0: 'public_art_installations'})
# public_art.head()

public_venue = public_venue_data.groupby('ZIP', as_index = False).size()
public_venue = public_venue.reset_index()
public_venue = public_venue.rename(columns = {0: 'public_venues'})
# public_venue.head()

park = park_data.groupby('ZIP_CODE', as_index = False).size()
park = park.reset_index()
park = park.rename(columns = {0: 'parks'})
# park.head()

water_consumption = water_consumption_data.groupby('Postal Code', as_index = False)['Total Gallons'].median()
water_consumption = water_consumption.reset_index()
water_consumption = water_consumption.rename(columns = {'Total Gallons': 'median_water_used_gal'})
del water_consumption['index']
# water_consumption.head()

library = library_data.groupby('Zip_Code', as_index = False).size()
library = library.reset_index()
library = library.rename(columns = {0: 'libraries'})
library['Zip_Code'] = pd.to_numeric(library['Zip_Code'].str.strip())

### Combine Files

In [56]:
combined_data = crime_2015.copy()
# print(crime_2015.shape)
print(combined_data.shape)

combined_data = pd.merge(left = combined_data, right = crime_2016
                      , how = 'inner'
                      , on = 'GO Location Zip')
# print(combined_data.shape)

combined_data = pd.merge(left = combined_data, right = ev_charging
                      , how = 'left'
                      , left_on = 'GO Location Zip'
                      , right_on = 'Postal Code')
del combined_data['Postal Code']
# print(combined_data.shape)

combined_data = pd.merge(left = combined_data, right = housing_data
                      , how = 'inner'
                      , left_on = 'GO Location Zip'
                      , right_on = 'Zip Code')
del combined_data['Zip Code']
# print(combined_data.shape)

combined_data = pd.merge(left = combined_data, right = park
                      , how = 'left'
                      , left_on = 'GO Location Zip'
                      , right_on = 'ZIP_CODE')
del combined_data['ZIP_CODE']
# print(combined_data.shape)

combined_data = pd.merge(left = combined_data, right = public_art
                      , how = 'left'
                      , left_on = 'GO Location Zip'
                      , right_on = 'Location Zip Code')
del combined_data['Location Zip Code']
# print(combined_data.shape)

combined_data = pd.merge(left = combined_data, right = water_consumption
                      , how = 'left'
                      , left_on = 'GO Location Zip'
                      , right_on = 'Postal Code')
del combined_data['Postal Code']
# print(combined_data.shape)

combined_data = pd.merge(left = combined_data, right = library
                      , how = 'left'
                      , left_on = 'GO Location Zip'
                      , right_on = 'Zip_Code')
del combined_data['Zip_Code']
# print(combined_data.shape)
# print(combined_data)

data_study = pd.DataFrame(combined_data)
data_study
data_study.to_csv("study.csv")


(47, 2)


### Remove String Characters

In [35]:
combined_data['Population below poverty level'] = pd.to_numeric(combined_data['Population below poverty level'].str.replace('\%', ''))
combined_data['Median household income'] = pd.to_numeric(combined_data['Median household income'].str.replace('\$', ''))
combined_data['Non-White, Non-Hispanic or Latino'] = pd.to_numeric(combined_data['Non-White, Non-Hispanic or Latino'].str.replace('\%', ''))
combined_data['Hispanic or Latino, of any race'] = pd.to_numeric(combined_data['Hispanic or Latino, of any race'].str.replace('\%', ''))
combined_data['Population with disability'] = pd.to_numeric(combined_data['Population with disability'].str.replace('\%', ''))
combined_data['Unemployment'] = pd.to_numeric(combined_data['Unemployment'].str.replace('\%', ''))
combined_data['Change in percentage of population below poverty, 2000-2012'] = pd.to_numeric(combined_data['Change in percentage of population below poverty, 2000-2012'].str.replace('\%', ''))
combined_data['Change in median rent, 2000-2012'] = pd.to_numeric(combined_data['Change in median rent, 2000-2012'].str.replace('\%', ''))
combined_data['Change in median home value, 2000-2012'] = pd.to_numeric(combined_data['Change in median home value, 2000-2012'].str.replace('\%', ''))
combined_data['Percentage of homes within 1/4-mi of transit stop'] = pd.to_numeric(combined_data['Percentage of homes within 1/4-mi of transit stop'].str.replace('\%', ''))
combined_data['Average monthly transportation cost'] = pd.to_numeric(combined_data['Average monthly transportation cost'].str.replace('\$', ''))
combined_data['Percentage of housing and transportation costs that is transportation-related'] = pd.to_numeric(combined_data['Percentage of housing and transportation costs that is transportation-related'].str.replace('\%', ''))
combined_data['Large households (5+ members)'] = pd.to_numeric(combined_data['Large households (5+ members)'].str.replace('\%', ''))
combined_data['Homes affordable to people earning less than $50,000'] = pd.to_numeric(combined_data['Homes affordable to people earning less than $50,000'].str.replace('\%', ''))
combined_data['Rentals affordable to people earning less than $25,000'] = pd.to_numeric(combined_data['Rentals affordable to people earning less than $25,000'].str.replace('\%', ''))
combined_data['Rent-restricted units'] = pd.to_numeric(combined_data['Rent-restricted units'].str.replace('\%', ''))
combined_data['Median rent'] = pd.to_numeric(combined_data['Median rent'].str.replace('\$', ''))
combined_data['Median home value'] = pd.to_numeric(combined_data['Median home value'].str.replace('\$', ''))
combined_data['Percentage of rental units in poor condition'] = pd.to_numeric(combined_data['Percentage of rental units in poor condition'].str.replace('\%', ''))
combined_data['Housing Choice Voucher holders'] = pd.to_numeric(combined_data['Housing Choice Voucher holders'].str.replace('\%', ''))
combined_data['Percent change in number of housing units, 2000-2012'] = pd.to_numeric(combined_data['Percent change in number of housing units, 2000-2012'].str.replace('\%', ''))
combined_data['Owner units affordable to average retail/service worker'] = pd.to_numeric(combined_data['Owner units affordable to average retail/service worker'].str.replace('\%', ''))
combined_data['Rental units affordable to average retail/service worker'] = pd.to_numeric(combined_data['Rental units affordable to average retail/service worker'].str.replace('\%', ''))
combined_data['Rental units affordable to average artist'] = pd.to_numeric(combined_data['Rental units affordable to average artist'].str.replace('\%', ''))
combined_data['Owner units affordable to average artist'] = pd.to_numeric(combined_data['Owner units affordable to average artist'].str.replace('\%', ''))
combined_data['Rental units affordable to average teacher'] = pd.to_numeric(combined_data['Rental units affordable to average teacher'].str.replace('\%', ''))
combined_data['Owner units affordable to average teacher'] = pd.to_numeric(combined_data['Owner units affordable to average teacher'].str.replace('\%', ''))
combined_data['Rental units affordable to average tech worker'] = pd.to_numeric(combined_data['Rental units affordable to average tech worker'].str.replace('\%', ''))
combined_data['Owner units affordable to average tech worker'] = pd.to_numeric(combined_data['Owner units affordable to average tech worker'].str.replace('\%', ''))

In [16]:
# def forward_selected(data, response):
#     """Linear model designed by forward selection.

#     Parameters:
#     -----------
#     data : pandas DataFrame with all possible predictors and response

#     response: string, name of response column in data

#     Returns:
#     --------
#     model: an "optimal" fitted statsmodels linear model
#            with an intercept
#            selected by forward selection
#            evaluated by adjusted R-squared
#     """
#     remaining = set(data.columns)
#     remaining.remove(response)
#     selected = []
#     current_score, best_new_score = 0.0, 0.0
#     while remaining and current_score == best_new_score:
#         scores_with_candidates = []
#         for candidate in remaining:
#             formula = "{} ~ {} + 1".format(response,
#                                            ' + '.join(selected + [candidate]))
#             score = smf.ols(formula, data).fit().rsquared_adj
#             scores_with_candidates.append((score, candidate))
#         scores_with_candidates.sort()
#         best_new_score, best_candidate = scores_with_candidates.pop()
#         if current_score < best_new_score:
#             remaining.remove(best_candidate)
#             selected.append(best_candidate)
#             current_score = best_new_score
#     formula = "{} ~ {} + 1".format(response,
#                                    ' + '.join(selected))
#     model = smf.ols(formula, data).fit()
#     return model

In [37]:
# model = forward_selected(combined_data,'')

# print (model.model.formula)
# # sl ~ rk + yr + 1

# print (model.rsquared_adj)
# # 0.835190760538

In [41]:
# data = combined_data
# X = pd.DataFrame(data.data, columns=data.feature_names)
# y = data.target


# def stepwise_selection(X, y, 
#                        initial_list=[], 
#                        threshold_in=0.01, 
#                        threshold_out = 0.05, 
#                        verbose=True):
#     """ Perform a forward-backward feature selection 
#     based on p-value from statsmodels.api.OLS
#     Arguments:
#         X - pandas.DataFrame with candidate features
#         y - list-like with the target
#         initial_list - list of features to start with (column names of X)
#         threshold_in - include a feature if its p-value < threshold_in
#         threshold_out - exclude a feature if its p-value > threshold_out
#         verbose - whether to print the sequence of inclusions and exclusions
#     Returns: list of selected features 
#     Always set threshold_in < threshold_out to avoid infinite looping.
#     See https://en.wikipedia.org/wiki/Stepwise_regression for the details
#     """
#     included = list(initial_list)
#     while True:
#         changed=False
#         # forward step
#         excluded = list(set(X.columns)-set(included))
#         new_pval = pd.Series(index=excluded)
#         for new_column in excluded:
#             model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included+[new_column]]))).fit()
#             new_pval[new_column] = model.pvalues[new_column]
#         best_pval = new_pval.min()
#         if best_pval < threshold_in:
#             best_feature = new_pval.argmin()
#             included.append(best_feature)
#             changed=True
#             if verbose:
#                 print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))

#         # backward step
#         model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
#         # use all coefs except intercept
#         pvalues = model.pvalues.iloc[1:]
#         worst_pval = pvalues.max() # null if pvalues is empty
#         if worst_pval > threshold_out:
#             changed=True
#             worst_feature = pvalues.argmax()
#             included.remove(worst_feature)
#             if verbose:
#                 print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))
#         if not changed:
#             break
#     return included

# result = stepwise_selection(X, y)

# print('resulting features:')
# print(result)