# Relevant imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date as dt

import folium
from folium import plugins
from folium.plugins import HeatMap
#http://geopandas.org/install.html
import geopandas as gpd

In [2]:
inspections = pd.read_pickle('./datasets/cleaned_inspections.pickle')

In [3]:
inspections.head(2)

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,Location,Community Area
0,2316058,CHICAGO TAICHI BUBBLE TEA,CHICAGO TAICHI BUBBLE TEA,2694548,restaurant,1,6800 N SHERIDAN RD,2019-10-18,License Re-Inspection,Pass,47. FOOD & NON-FOOD CONTACT SURFACES CLEANABLE...,42.005587,-87.661077,"42.00558686485114, -87.66107732040031",Belmont Cragin
1,2315877,CHICAGO TAICHI BUBBLE TEA,CHICAGO TAICHI BUBBLE TEA,2694548,restaurant,1,6800 N SHERIDAN RD,2019-10-15,License,Fail,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",42.005587,-87.661077,"42.00558686485114, -87.66107732040031",Belmont Cragin


# Exploratory data analysis

In [None]:
#TODO: Plot piecharts, barplots, histograms for our different features to see if there are any emerging patterns

# Data Visualization using folium

## 1. Restaurants on a map

Let's display on a map the different restaurants in our dataset

In [4]:
#We only display restaurant in our map
restaurant_locations = inspections[inspections['Facility Type']=='restaurant']
#We reduce the data size & display only restaurants that still exist in year 2018
restaurant_locations = restaurant_locations[restaurant_locations['Inspection Date'].dt.year==2018]
restaurant_locations = restaurant_locations[['DBA Name', 'Latitude','Longitude']].drop_duplicates()
restaurant_locations_array = np.array(restaurant_locations)

In [None]:
locations_map = folium.Map(location=[41.86087, -87.608945], zoom_start=10)
for i, info in enumerate(restaurant_locations_array):
    folium.Marker(
        location=[info[1], info[2]],
        popup=info[0],
        icon=folium.Icon(color='red', icon='info-sign')).add_to(locations_map)
#locations_map.save('restaurants_map.html')
locations_map

In [8]:
len(inspections['Location'].unique())

16244

In [9]:
len(inspections['DBA Name'].unique())

26555

In [10]:
len(inspections['Community Area'].unique())

75

* From the map displayed we see that the icons are stacked and condensed into the same regions, hence this time of displaying does not meet our needs.
* We also see above that apparently the locations of the restaurants we have are not very precise because we have less locations that establishments.Maybe this is also due to the fact that some restaurants which get closed are replaced by others at the same location.

We decide to show a heatmap of the restaurants which will display the restaurants "concentration" (areas with high concentration of restaurants)

## 2. Heatmap showing restaurants locations concentration

In [None]:
locations_heatmap = folium.Map([41.86087, -87.608945], zoom_start=11)

# List comprehension to make out list of lists
heat_data = [[row['Latitude'],row['Longitude']] for index, row in restaurant_locations.iterrows()]

# Plot it on the map
HeatMap(heat_data, radius=14).add_to(locations_heatmap)

# Display the map
#locations_heatmap.save('restaurants_heatmap.html')
locations_heatmap

As expected, there are a lot more restaurants in the city center of chicago.

Now we want to display the community areas as a heatmap of the number of restaurants they have.

Get community areas boundaries from:
https://www.chicago.gov/city/en/depts/doit/dataset/boundaries_-_communityareas.html

In [173]:
# set the filepath and load in a shapefile
#fp = "./datasets/boundaries.geojson"
#boundaries_community_areas = gpd.read_file(fp)
# check data type so we can see that this is not a normal dataframe, but a GEOdataframe
#boundaries_community_areas.head()

Unnamed: 0,community,area,shape_area,perimeter,area_num_1,area_numbe,comarea_id,comarea,shape_len,geometry
0,DOUGLAS,0,46004621.1581,0,35,35,0,0,31027.0545098,"MULTIPOLYGON (((-87.60914 41.84469, -87.60915 ..."
1,OAKLAND,0,16913961.0408,0,36,36,0,0,19565.5061533,"MULTIPOLYGON (((-87.59215 41.81693, -87.59231 ..."
2,FULLER PARK,0,19916704.8692,0,37,37,0,0,25339.0897503,"MULTIPOLYGON (((-87.62880 41.80189, -87.62879 ..."
3,GRAND BOULEVARD,0,48492503.1554,0,38,38,0,0,28196.8371573,"MULTIPOLYGON (((-87.60671 41.81681, -87.60670 ..."
4,KENWOOD,0,29071741.9283,0,39,39,0,0,23325.1679062,"MULTIPOLYGON (((-87.59215 41.81693, -87.59215 ..."


In [174]:
#We process the GEOdataframe in a way that is useful to our analysis and then export it as a new .geojson
#boundaries_community_areas=boundaries_community_areas[['community','geometry']]
#boundaries_community_areas.loc[74,'community']='O\'HARE'
#boundaries_community_areas.sort_values(by=['community'], inplace=True)
#boundaries_community_areas.reset_index(inplace=True,drop=True)
#boundaries_community_areas.reset_index(inplace=True,drop=False)
#boundaries_community_areas.rename(columns={"index": "area_number"},inplace=True)
#boundaries_community_areas['community'] = boundaries_community_areas['community'].astype('str')
#boundaries_community_areas.to_file("boundaries_processed.geojson", driver="GeoJSON")

In [183]:
fp = "./boundaries_processed.geojson"
boundaries_community_areas = gpd.read_file(fp)
boundaries_community_areas.head()

Unnamed: 0,area_number,community,geometry
0,0,ALBANY PARK,"MULTIPOLYGON (((-87.70404 41.97355, -87.70403 ..."
1,1,ARCHER HEIGHTS,"MULTIPOLYGON (((-87.71437 41.82604, -87.71436 ..."
2,2,ARMOUR SQUARE,"MULTIPOLYGON (((-87.62917 41.84556, -87.62947 ..."
3,3,ASHBURN,"MULTIPOLYGON (((-87.71255 41.75734, -87.71252 ..."
4,4,AUBURN GRESHAM,"MULTIPOLYGON (((-87.63990 41.75615, -87.63990 ..."


We do a sanity check to see if the community areas in the inspection data we got from geopy correspond to the community areas we get from Chicago Government portal.

In [184]:
#We care about restaurants only
restaurants_intensity = inspections[inspections['Facility Type']=='restaurant']
#We focus on year 2018
restaurants_intensity = restaurants_intensity[restaurants_intensity['Inspection Date'].dt.year==2018]
restaurants_intensity = restaurants_intensity[['Community Area','DBA Name']].drop_duplicates().groupby('Community Area')['DBA Name'].agg({'nbr_restaurants' : len})
restaurants_intensity.reset_index(inplace=True)
restaurants_intensity['Community Area'] = restaurants_intensity['Community Area'].str.upper().str.strip()
restaurants_intensity

is deprecated and will be removed in a future version. Use                 named aggregation instead.

    >>> grouper.agg(name_1=func_1, name_2=func_2)

  """


Unnamed: 0,Community Area,nbr_restaurants
0,ALBANY PARK,135
1,ARCHER HEIGHTS,49
2,ARMOUR SQUARE,18
3,AUBURN GRESHAM,74
4,AUSTIN,178
...,...,...
68,WEST LAWN,79
69,WEST PULLMAN,26
70,WEST RIDGE,258
71,WEST TOWN,363


There are 4 community areas that we don't have any restaurants in our dataset, let's add them to our data with nbr_restaurants = 0

In [185]:
missing_1 = set(boundaries_community_areas['community'])-set(restaurants_intensity['Community Area'])
missing_1 = pd.DataFrame(missing_1, columns=['Community Area'])
missing_1['nbr_restaurants'] = 0
print("Community areas missing in the inspection dataset: ")
missing_1

Community areas missing in the inspection dataset: 


Unnamed: 0,Community Area,nbr_restaurants
0,BRIGHTON PARK,0
1,EAST SIDE,0
2,ASHBURN,0
3,JEFFERSON PARK,0
4,WEST ELSDON,0


In [186]:
missing_2 = set(restaurants_intensity['Community Area'])-set(boundaries_community_areas['community'])
missing_2 = pd.DataFrame(missing_2, columns=['Community Area'])
print("Community areas missing in the government dataset: ")
missing_2

Community areas missing in the government dataset: 


Unnamed: 0,Community Area
0,CHINATOWN


Also CHINATOWN is mentioned in our dataset but not on the gov data.

After an easy Google search [source](https://www.google.com/search?client=safari&rls=en&q=chinatown+chicago&ie=UTF-8&oe=UTF-8), we see that CHINATOWN is in reality ARMOUR SQUARE

In [187]:
restaurants_intensity.loc[14,'Community Area']='ARMOUR SQUARE'
restaurants_intensity = restaurants_intensity.groupby('Community Area')['nbr_restaurants'].agg({'nbr_restaurants' : sum})
restaurants_intensity.reset_index(inplace=True)
restaurants_intensity

is deprecated and will be removed in a future version. Use                 named aggregation instead.

    >>> grouper.agg(name_1=func_1, name_2=func_2)

  


Unnamed: 0,Community Area,nbr_restaurants
0,ALBANY PARK,135
1,ARCHER HEIGHTS,49
2,ARMOUR SQUARE,72
3,AUBURN GRESHAM,74
4,AUSTIN,178
...,...,...
67,WEST LAWN,79
68,WEST PULLMAN,26
69,WEST RIDGE,258
70,WEST TOWN,363


We can now merge the community areas and display the resulting nbr_restaurants

In [188]:
restaurants_intensity = restaurants_intensity.append(missing_1)
restaurants_intensity.sort_values(by=['Community Area'], inplace=True)
restaurants_intensity.reset_index(inplace=True,drop=True)
restaurants_intensity['Community Area'] = restaurants_intensity['Community Area'].astype('str')
restaurants_intensity

Unnamed: 0,Community Area,nbr_restaurants
0,ALBANY PARK,135
1,ARCHER HEIGHTS,49
2,ARMOUR SQUARE,72
3,ASHBURN,0
4,AUBURN GRESHAM,74
...,...,...
72,WEST LAWN,79
73,WEST PULLMAN,26
74,WEST RIDGE,258
75,WEST TOWN,363


In [189]:
import os
import webbrowser
# Initialize the map:
restaurants_by_community = folium.Map([41.86087, -87.608945], zoom_start=11, tiles = "cartodbpositron")
 
# Add the color for the chloropleth:
restaurants_by_community.choropleth(
 geo_data=fp,
 data=restaurants_intensity,
 columns=['Community Area', 'nbr_restaurants'],
 key_on='feature.properties.community',
 fill_color='YlGn',
 fill_opacity=0.7,
 line_opacity=0.2,
 legend_name='Number of restaurants by community'
)
folium.LayerControl().add_to(restaurants_by_community)
 
# Save to html
restaurants_by_community.save('restaurants_by_community.html')
restaurants_by_community

As expected, there are a lot more restaurants in the city center of chicago than the rest of the 