In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import math

from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

import findspark
findspark.init()

import pyspark
import pyspark.sql
from pyspark.sql import *
from pyspark.sql.functions import *

# Dataset(s) preparation and cleaning

Before we proceed to tackle each of our research questions, some data cleaning is in order.

## Load the data and explore its structure

In [61]:
inspections = pd.read_csv('datasets/food-inspections.csv')

In [62]:
inspections.size

4306192

In [63]:
#Display first rows of data
inspections.head(50)

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,...,Results,Violations,Latitude,Longitude,Location,Historical Wards 2003-2015,Zip Codes,Community Areas,Census Tracts,Wards
0,2345318,SUBWAY,SUBWAY,2529116.0,Restaurant,Risk 1 (High),2620 N NARRAGANSETT AVE,CHICAGO,IL,60639.0,...,Pass w/ Conditions,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",41.927995,-87.785752,"{'longitude': '41.92799528871574', 'latitude':...",,,,,
1,2345334,LA MICHOACANA ICE CREAM SHOP,LA MICHOACANA ICE CREAM SHOP,2698396.0,Restaurant,Risk 1 (High),3591-3597 N MILWAUKEE AVE,CHICAGO,IL,60641.0,...,Pass w/ Conditions,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",41.94614,-87.735183,"{'longitude': '41.94614005344282', 'latitude':...",,,,,
2,2345339,THE CREPE SHOP,THE CREPE SHOP,2699005.0,Restaurant,Risk 1 (High),2934 N BROADWAY,CHICAGO,IL,60657.0,...,Fail,10. ADEQUATE HANDWASHING SINKS PROPERLY SUPPLI...,41.93593,-87.644407,"{'longitude': '41.93592957402078', 'latitude':...",,,,,
3,2345321,GOPUFF,GOPUFF,2684560.0,Grocery Store,Risk 3 (Low),1801 W WARNER AVE,CHICAGO,IL,60613.0,...,Pass,,41.956846,-87.674395,"{'longitude': '41.956845683288854', 'latitude'...",,,,,
4,2345319,GOPUFF,GOPUFF,2684558.0,Grocery Store,Risk 3 (Low),1801 W WARNER AVE,CHICAGO,IL,60613.0,...,Pass,39. CONTAMINATION PREVENTED DURING FOOD PREPAR...,41.956846,-87.674395,"{'longitude': '41.956845683288854', 'latitude'...",,,,,
5,2345302,Pizza Hut,Pizza Hut,2476781.0,Restaurant,Risk 1 (High),5035 N LINCOLN AVE,CHICAGO,IL,60625.0,...,Out of Business,,41.972793,-87.691092,"{'longitude': '41.972793193319774', 'latitude'...",,,,,
6,2345217,"LUNA PARK, INC.",LUNA PARK DAY CARE,2215485.0,Children's Services Facility,Risk 1 (High),3250 W FOSTER AVE,CHICAGO,IL,60625.0,...,Pass,49. NON-FOOD/FOOD CONTACT SURFACES CLEAN - Com...,41.975834,-87.710746,"{'longitude': '41.97583445690982', 'latitude':...",,,,,
7,2345244,OSITO'S TAP,OSITO'S TAP,2658134.0,Restaurant,Risk 1 (High),2553 S RIDGEWAY AVE,CHICAGO,IL,60623.0,...,Pass,"55. PHYSICAL FACILITIES INSTALLED, MAINTAINED ...",41.844597,-87.718313,"{'longitude': '41.84459718759861', 'latitude':...",,,,,
8,2345247,STOCKTON,STOCKTON,2684366.0,Restaurant,Risk 3 (Low),1009-1011 N RUSH ST,CHICAGO,IL,60611.0,...,Pass w/ Conditions,,41.901042,-87.62749,"{'longitude': '41.901042374044735', 'latitude'...",,,,,
9,2345242,L & M FINE FOODS,L & M FINE FOODS,2631147.0,Grocery Store,Risk 1 (High),4361-4365 N LINCOLN AVE,CHICAGO,IL,60618.0,...,Pass,,41.96127,-87.683589,"{'longitude': '41.96127036208352', 'latitude':...",,,,,


The dataset has 22 columns. Let's examine what each of them is.

In [64]:
#Display columns
inspections.columns

Index(['Inspection ID', 'DBA Name', 'AKA Name', 'License #', 'Facility Type',
       'Risk', 'Address', 'City', 'State', 'Zip', 'Inspection Date',
       'Inspection Type', 'Results', 'Violations', 'Latitude', 'Longitude',
       'Location', 'Historical Wards 2003-2015', 'Zip Codes',
       'Community Areas', 'Census Tracts', 'Wards'],
      dtype='object')

## Clean data

The 'Location' column contains the latitude and longitude of the establishment. However, there are separate 'Latitude' and 'Longitude' columns. We can hence safely drop the 'Location' column.

In [65]:
inspections = inspections.drop(columns=['Location'])

The head of the dataset only contains NaN entries for the 'Historical Wards 2003-2015', 'Zip Codes', 'Community Areas', 'Census Tracts', 'Wards' columns. Let's see if this is true for the whole dataset.

In [66]:
# make sure that our assumption is correct
print('Values taken by \'Historical Wards 2003-2015\': ', inspections['Zip Codes'].unique())
print('Values taken by \'Zip Codes\': ', inspections['Zip Codes'].unique())
print('Values taken by \'Community Areas\': ', inspections['Zip Codes'].unique())
print('Values taken by \'Census Tracts\': ', inspections['Zip Codes'].unique())
print('Values taken by \'Wards\': ', inspections['Zip Codes'].unique())


Values taken by 'Historical Wards 2003-2015':  [nan]
Values taken by 'Zip Codes':  [nan]
Values taken by 'Community Areas':  [nan]
Values taken by 'Census Tracts':  [nan]
Values taken by 'Wards':  [nan]


We drop all columns apart from the 'Community Areas' because we will be needing it in our study. We will fill later.

In [67]:
inspections = inspections.drop(columns=['Historical Wards 2003-2015'])
inspections = inspections.drop(columns=['Zip Codes'])
inspections = inspections.drop(columns=['Census Tracts'])
inspections = inspections.drop(columns=['Wards'])

Let's examine if the whole dataset is relevent to the study we are conducting by seeing which entries correspond to facilities in Chicago.

First, we check if there are any missing values for the column 'City' or 'State'

In [68]:
#Investigate the state=nan and city=nan restaurants
inspections[pd.isnull(inspections.State) | pd.isnull(inspections.City)]

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,Community Areas
1669,2312774,CHICAGO COLLEGIATE CHARTER,CHICAGO COLLEGIATE CHARTER,3846104.0,School,Risk 1 (High),10909 S COTTAGE GROVE AVE,,IL,,2019-09-24T00:00:00.000,Canvass Re-Inspection,Pass w/ Conditions,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",41.696087,-87.608945,
1879,2312540,CHICAGO COLLEGIATE CHARTER,CHICAGO COLLEGIATE CHARTER,3846104.0,School,Risk 1 (High),10909 S COTTAGE GROVE AVE,,IL,,2019-09-19T00:00:00.000,Canvass Re-Inspection,Fail,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",41.696087,-87.608945,
1903,2312545,JCYS IRIS & STEVEN PODOLSKY FAMILY CENTER,JCYS IRIS & STEVEN PODOLSKY FAMILY CENTER,2671297.0,Children's Services Facility,Risk 1 (High),2112 W LAWRENCE AVE,,IL,60625.0,2019-09-19T00:00:00.000,License Re-Inspection,Pass,"38. INSECTS, RODENTS, & ANIMALS NOT PRESENT - ...",41.968821,-87.682201,
3073,2305166,"AMY BECK CAKE DESIGN, LLC","AMY BECK CAKE DESIGN, LLC",2079264.0,Bakery,Risk 1 (High),636 N RACINE AVE,,,60642.0,2019-08-23T00:00:00.000,Canvass,Pass,"55. PHYSICAL FACILITIES INSTALLED, MAINTAINED ...",41.893380,-87.657588,
3617,2304583,JCYS IRIS & STEVEN PODOLSKY FAMILY CENTER,JCYS IRIS & STEVEN PODOLSKY FAMILY CENTER,2671297.0,Children's Services Facility,Risk 1 (High),2112 W LAWRENCE AVE,,IL,60625.0,2019-08-13T00:00:00.000,License,Fail,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",41.968821,-87.682201,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
194253,60291,"CLOVERHILL PASTRY-VEND,LLC","CLOVERHILL PASTRY-VEND,LLC",2004357.0,Wholesale,Risk 3 (Low),4464 W 44TH ST,,IL,60632.0,2010-02-03T00:00:00.000,License Re-Inspection,Pass,,41.814266,-87.736013,
194489,60282,"CLOVERHILL PASTRY-VEND,LLC","CLOVERHILL PASTRY-VEND,LLC",2004357.0,Wholesale,Risk 3 (Low),4464 W 44TH ST,,IL,60632.0,2010-01-28T00:00:00.000,License,Fail,32. FOOD AND NON-FOOD CONTACT SURFACES PROPERL...,41.814266,-87.736013,
194610,60279,"CLOVERHILL PASTRY-VEND,LLC","CLOVERHILL PASTRY-VEND,LLC",2004357.0,Wholesale,Risk 3 (Low),4464 W 44TH ST,,IL,60632.0,2010-01-27T00:00:00.000,License,Fail,,41.814266,-87.736013,
195141,67912,THREE CHEFS RESTURANT,THREE CHEFS RESTURANT,2009471.0,Restaurant,Risk 1 (High),8125 S HALSTED ST,,IL,60620.0,2010-01-15T00:00:00.000,License Re-Inspection,Pass,,41.746236,-87.643766,


Looking at the coordinates of these places, all of them seem to also be in chicago, so we will fill their City and State columns

In [69]:
inspections['City'] = inspections['City'].fillna('Chicago')
inspections['State'] = inspections['State'].fillna('IL')

Next, we check if there are any facilities which are not located in Chicago.

In [70]:
# make sure that our assumption is correct
print('Values taken by \'City\': ', inspections['City'].unique())

Values taken by 'City':  ['CHICAGO' 'Chicago' 'chicago' 'GRIFFITH' 'NEW YORK' 'SCHAUMBURG'
 'ELMHURST' 'ALGONQUIN' 'NEW HOLSTEIN' 'CCHICAGO' 'NILES NILES' 'EVANSTON'
 'CHICAGO.' 'CHESTNUT STREET' 'LANSING' 'CHICAGOCHICAGO' 'WADSWORTH'
 'WILMETTE' 'WHEATON' 'CHICAGOHICAGO' 'ROSEMONT' 'CHicago' 'CALUMET CITY'
 'PLAINFIELD' 'HIGHLAND PARK' 'PALOS PARK' 'ELK GROVE VILLAGE' 'CICERO'
 'BRIDGEVIEW' 'OAK PARK' 'MAYWOOD' 'LAKE BLUFF' '312CHICAGO'
 'SCHILLER PARK' 'SKOKIE' 'BEDFORD PARK' 'BANNOCKBURNDEERFIELD' 'CHCICAGO'
 'BLOOMINGDALE' 'Norridge' 'CHARLES A HAYES' 'CHCHICAGO' 'CHICAGOI'
 'SUMMIT' 'OOLYMPIA FIELDS' 'WESTMONT' 'CHICAGO HEIGHTS' 'JUSTICE'
 'TINLEY PARK' 'LOMBARD' 'EAST HAZEL CREST' 'COUNTRY CLUB HILLS'
 'STREAMWOOD' 'BOLINGBROOK' 'INACTIVE' 'BERWYN' 'BURNHAM' 'DES PLAINES'
 'LAKE ZURICH' 'OLYMPIA FIELDS' 'alsip' 'OAK LAWN' 'BLUE ISLAND' 'GLENCOE'
 'FRANKFORT' 'NAPERVILLE' 'BROADVIEW' 'WORTH' 'Maywood' 'ALSIP'
 'EVERGREEN PARK']


We can see that this column takes values which are not Chicago. The rows where the 'City' is not Chicago are hence irrelevent to our study and should be dropped. Let's first make sure tha the bulk of the data is for Chicago before proceeding

In [71]:
chicago_inspections = inspections.groupby('City')['Inspection ID'].nunique().filter(regex='(?i)chicago', axis=0)
print('{}% of the inpections in the dataframe come from Chicago.'.format(100 * chicago_inspections.values.sum()/len(inspections)))

99.7884906200188% of the inpections in the dataframe come from Chicago.


We can safely drop the rows which come from cities that are not Chicago.

In [72]:
# list of ways Chicago has been written in the dataset
chicago_variations = chicago_inspections.index.tolist()
inspections = inspections[inspections['City'].isin(chicago_variations)]
# drop the 'City' and 'State' columns since they have each only one value, 'Chicago' and 'IL' respectively
inspections = inspections.drop(columns=['City', 'State'])


Now that we only have facilities in Chicago in our dataset, let us fill the 'Community Areas' column. To that end, we use the geopy library.

Let's see if there are still any missing values in our dataset.

In [49]:
inspections.size

3519612

In [50]:
inspections.isnull().sum()

Inspection ID                      0
DBA Name                           0
AKA Name                        2443
License #                         17
Facility Type                   4776
Risk                              73
Address                            0
Zip                               50
Inspection Date                    0
Inspection Type                    1
Results                            0
Violations                     51810
Latitude                         519
Longitude                        519
Historical Wards 2003-2015    195534
Community Areas               195534
Census Tracts                 195534
Wards                         195534
dtype: int64

array([-87.78575236, -87.73518302, -87.64440716, ..., -87.65396483,
       -87.67381938, -87.69542346])

**Explore the difference between DBA and AKA names**

In [20]:
print ('There are {0} unique DBA (‘Doing business as.’) names in the dataset.'.format(len(inspections['DBA Name'].unique())))

There are 27546 unique DBA (‘Doing business as.’) names in the dataset.


In [21]:
# Display the number of restaurants (we display the unique DBA names)
print ('There are {0} AKA (‘Also known as.’) names in the dataset.'.format(len(inspections['AKA Name'].unique())))

There are 26275 AKA (‘Also known as.’) names in the dataset.


In [22]:
# Explore how DBA and AKA names differ
print ('There are {0} rows where the DBA names and the AKA names differ.'\
       .format((len(inspections[inspections['DBA Name'] != inspections['AKA Name']]))))

There are 50895 rows where the DBA names and the AKA names differ.


In [23]:
print('Examples of different DBA and AKA names : ')
inspections[inspections['DBA Name'] != inspections['AKA Name']].head(2)

Examples of different DBA and AKA names : 


Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,Historical Wards 2003-2015,Community Areas,Census Tracts,Wards
1,2345334,LA MICHOACANA ICE CREAM SHOP,LA MICHOACANA ICE CREAM SHOP,2698396.0,Restaurant,Risk 1 (High),3591-3597 N MILWAUKEE AVE,CHICAGO,IL,60641.0,2019-11-08T00:00:00.000,License,Pass w/ Conditions,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",41.94614,-87.735183,,,,
6,2345217,"LUNA PARK, INC.",LUNA PARK DAY CARE,2215485.0,Children's Services Facility,Risk 1 (High),3250 W FOSTER AVE,CHICAGO,IL,60625.0,2019-11-07T00:00:00.000,License,Pass,49. NON-FOOD/FOOD CONTACT SURFACES CLEAN - Com...,41.975834,-87.710746,,,,
