# Relevant imports

In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date as dt

import folium
from folium import plugins
from folium.plugins import HeatMap
#http://geopandas.org/install.html
import geopandas as gpd

In [35]:
inspections = pd.read_pickle('./datasets/cleaned_inspections.pickle')

In [36]:
inspections.head(2)

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,Location,Community Area
0,2316058,CHICAGO TAICHI BUBBLE TEA,CHICAGO TAICHI BUBBLE TEA,2694548,restaurant,1,6800 N SHERIDAN RD,2019-10-18,License Re-Inspection,Pass,47. FOOD & NON-FOOD CONTACT SURFACES CLEANABLE...,42.005587,-87.661077,"42.00558686485114, -87.66107732040031",Belmont Cragin
1,2315877,CHICAGO TAICHI BUBBLE TEA,CHICAGO TAICHI BUBBLE TEA,2694548,restaurant,1,6800 N SHERIDAN RD,2019-10-15,License,Fail,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",42.005587,-87.661077,"42.00558686485114, -87.66107732040031",Belmont Cragin


# Exploratory data analysis

Let's display on a map the different restaurants in our dataset

In [37]:
#We only display restaurant in our map
restaurant_locations = inspections[inspections['Facility Type']=='restaurant']
#We reduce the data size & display only restaurants that still exist in year 2018
restaurant_locations = restaurant_locations[restaurant_locations['Inspection Date'].dt.year==2018]
restaurant_locations = restaurant_locations[['DBA Name', 'Latitude','Longitude']].drop_duplicates()
restaurant_locations_array = np.array(restaurant_locations)

In [46]:
locations_map = folium.Map(location=[41.86087, -87.608945], zoom_start=10)
for i, info in enumerate(restaurant_locations_array):
    folium.Marker(
        location=[info[1], info[2]],
        popup=info[0],
        icon=folium.Icon(color='red', icon='info-sign')).add_to(locations_map)
locations_map.save('restaurants_map.html')
locations_map

In [38]:
len(inspections['Location'].unique())

16244

In [39]:
len(inspections['DBA Name'].unique())

26555

In [40]:
len(inspections['Community Area'].unique())

75

* From the map displayed we see that the icons are stacked and condensed into the same regions, hence this time of displaying does not meet our needs.
* We also see above that apparently the locations of the restaurants we have are not very precise because we have less locations that establishments.Maybe this is also due to the fact that some restaurants which get closed are replaced by others at the same location.

We decide to show a heatmap of the restaurants which will display the restaurants "concentration" (areas with high concentration of restaurants)

In [51]:
locations_heatmap = folium.Map([41.86087, -87.608945], zoom_start=11)

# List comprehension to make out list of lists
heat_data = [[row['Latitude'],row['Longitude']] for index, row in restaurant_locations.iterrows()]

# Plot it on the map
HeatMap(heat_data, radius=14).add_to(locations_heatmap)

# Display the map
#locations_heatmap.save('restaurants_heatmap.html')
locations_heatmap

<folium.plugins.heat_map.HeatMap at 0x1a2c860160>

Now we want to display the community areas as a heatmap of the number of restaurants they have.

Get community areas boundaries from:
https://www.chicago.gov/city/en/depts/doit/dataset/boundaries_-_communityareas.html

In [180]:
# set the filepath and load in a shapefile
fp = "./datasets/boundaries.geojson"
boundaries_community_areas = gpd.read_file(fp)
# check data type so we can see that this is not a normal dataframe, but a GEOdataframe
boundaries_community_areas=boundaries_community_areas[['community','geometry']]
boundaries_community_areas.loc[74,'community']='O\'HARE'
boundaries_community_areas

Unnamed: 0,community,geometry
0,DOUGLAS,"MULTIPOLYGON (((-87.60914 41.84469, -87.60915 ..."
1,OAKLAND,"MULTIPOLYGON (((-87.59215 41.81693, -87.59231 ..."
2,FULLER PARK,"MULTIPOLYGON (((-87.62880 41.80189, -87.62879 ..."
3,GRAND BOULEVARD,"MULTIPOLYGON (((-87.60671 41.81681, -87.60670 ..."
4,KENWOOD,"MULTIPOLYGON (((-87.59215 41.81693, -87.59215 ..."
...,...,...
72,MOUNT GREENWOOD,"MULTIPOLYGON (((-87.69646 41.70714, -87.69644 ..."
73,MORGAN PARK,"MULTIPOLYGON (((-87.64215 41.68508, -87.64249 ..."
74,O'HARE,"MULTIPOLYGON (((-87.83658 41.98640, -87.83658 ..."
75,EDGEWATER,"MULTIPOLYGON (((-87.65456 41.99817, -87.65456 ..."


In [181]:
boundaries_community_areas.to_file("boundaries.geojson", driver='GeoJSON')

DriverIOError: GeoJSON driver doesn't support creating a layer on a read-only datasource

In [None]:
fp = "./boundaries.geojson"
boundaries_community_areas = gpd.read_file(fp)
boundaries_community_areas

We do a sanity check to see if the community areas in the inspection data we got from geopy correspond to the community areas we get from Chicago Government portal.

In [182]:
#We care about restaurants only
restaurants_intensity = inspections[inspections['Facility Type']=='restaurant']
#We focus on year 2018
restaurants_intensity = restaurants_intensity[restaurants_intensity['Inspection Date'].dt.year==2018]
restaurants_intensity = restaurants_intensity[['Community Area','DBA Name']].drop_duplicates().groupby('Community Area')['DBA Name'].agg({'nbr_restaurants' : len})
restaurants_intensity.reset_index(inplace=True)
restaurants_intensity['Community Area'] = restaurants_intensity['Community Area'].str.upper().str.strip()
restaurants_intensity

is deprecated and will be removed in a future version. Use                 named aggregation instead.

    >>> grouper.agg(name_1=func_1, name_2=func_2)

  """


Unnamed: 0,Community Area,nbr_restaurants
0,ALBANY PARK,135
1,ARCHER HEIGHTS,49
2,ARMOUR SQUARE,18
3,AUBURN GRESHAM,74
4,AUSTIN,178
...,...,...
68,WEST LAWN,79
69,WEST PULLMAN,26
70,WEST RIDGE,258
71,WEST TOWN,363


There are 4 community areas that we don't have any restaurants in our dataset, let's add them to our data with nbr_restaurants = 0

In [183]:
missing_1 = set(boundaries_community_areas['community'])-set(restaurants_intensity['Community Area'])
missing_1 = pd.DataFrame(missing_1, columns=['Community Area'])
missing_1['nbr_restaurants'] = 0
print("Community areas missing in the inspection dataset: ")
missing_1

Community areas missing in the inspection dataset: 


Unnamed: 0,Community Area,nbr_restaurants
0,EAST SIDE,0
1,JEFFERSON PARK,0
2,ASHBURN,0
3,WEST ELSDON,0
4,BRIGHTON PARK,0


In [184]:
missing_2 = set(restaurants_intensity['Community Area'])-set(boundaries_community_areas['community'])
missing_2 = pd.DataFrame(missing_2, columns=['Community Area'])
print("Community areas missing in the government dataset: ")
missing_2

Community areas missing in the government dataset: 


Unnamed: 0,Community Area
0,CHINATOWN


Also CHINATOWN is mentioned in our dataset but not on the gov data.

After an easy Google search [source](https://www.google.com/search?client=safari&rls=en&q=chinatown+chicago&ie=UTF-8&oe=UTF-8), we see that CHINATOWN is in reality ARMOUR SQUARE

In [185]:
restaurants_intensity.loc[14,'Community Area']='ARMOUR SQUARE'
restaurants_intensity = restaurants_intensity.groupby('Community Area')['nbr_restaurants'].agg({'nbr_restaurants' : sum})
restaurants_intensity.reset_index(inplace=True)

is deprecated and will be removed in a future version. Use                 named aggregation instead.

    >>> grouper.agg(name_1=func_1, name_2=func_2)

  


We can now merge the community areas and display the resulting nbr_restaurants

In [186]:
restaurants_intensity = restaurants_intensity.append(missing_1)
restaurants_intensity.reset_index(inplace=True, drop=True)
restaurants_intensity

Unnamed: 0,Community Area,nbr_restaurants
0,ALBANY PARK,135
1,ARCHER HEIGHTS,49
2,ARMOUR SQUARE,72
3,AUBURN GRESHAM,74
4,AUSTIN,178
...,...,...
72,EAST SIDE,0
73,JEFFERSON PARK,0
74,ASHBURN,0
75,WEST ELSDON,0


In [190]:
# Initialize the map:
restaurants_by_community = folium.Map([41.86087, -87.608945], zoom_start=11)
 
# Add the color for the chloropleth:
folium.Choropleth(
 geo_data=boundaries_community_areas,
 name='choropleth',
 data=restaurants_intensity,
 columns=['Community Area', 'nbr_restaurants'],
 key_on='columns.community',
 fill_color='YlGn',
 fill_opacity=0.7,
 line_opacity=0.2,
 legend_name='Number of restaurants by community'
)
folium.LayerControl().add_to(restaurants_by_community)
 
# Save to html
restaurants_by_community.save('restaurants_by_community.html')
restaurants_by_community

AttributeError: 'NoneType' object has no attribute 'get'

In [None]:
import pandas as pd


url = 'https://raw.githubusercontent.com/python-visualization/folium/master/examples/data'
state_geo = f'{url}/us-states.json'
state_unemployment = f'{url}/US_Unemployment_Oct2012.csv'
state_data = pd.read_csv(state_unemployment)

m = folium.Map(location=[48, -102], zoom_start=3)

folium.Choropleth(
    geo_data=state_geo,
    name='choropleth',
    data=state_data,
    columns=['State', 'Unemployment'],
    key_on='feature.id',
    fill_color='YlGn',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Unemployment Rate (%)'
).add_to(m)

folium.LayerControl().add_to(m)

m

In [150]:
gpd.read_file(state_geo)

Unnamed: 0,id,name,geometry
0,AL,Alabama,"POLYGON ((-87.35930 35.00118, -85.60667 34.984..."
1,AK,Alaska,"MULTIPOLYGON (((-131.60202 55.11798, -131.5691..."
2,AZ,Arizona,"POLYGON ((-109.04250 37.00026, -109.04798 31.3..."
3,AR,Arkansas,"POLYGON ((-94.47384 36.50186, -90.15254 36.496..."
4,CA,California,"POLYGON ((-123.23326 42.00619, -122.37885 42.0..."
5,CO,Colorado,"POLYGON ((-107.91973 41.00391, -105.72895 40.9..."
6,CT,Connecticut,"POLYGON ((-73.05353 42.03905, -71.79931 42.022..."
7,DE,Delaware,"POLYGON ((-75.41409 39.80446, -75.50720 39.683..."
8,FL,Florida,"POLYGON ((-85.49714 30.99754, -85.00421 31.003..."
9,GA,Georgia,"POLYGON ((-83.10919 35.00118, -83.32279 34.787..."


In [151]:
state_data

Unnamed: 0,State,Unemployment
0,AL,7.1
1,AK,6.8
2,AZ,8.1
3,AR,7.2
4,CA,10.1
5,CO,7.7
6,CT,8.4
7,DE,7.1
8,FL,8.2
9,GA,8.8
