In [1]:
import numpy as np
import pandas as pd
from sodapy import Socrata

In [2]:
#Make API call to City of Chicago Open Data Portal
#Include only inspections prior to 3/31/2021
client = Socrata("data.cityofchicago.org", None)
results = client.get("4ijn-s7e5", where="inspection_date < '2021-04-01T00:00:00.000'", limit=100000000)

inspections_df = pd.DataFrame.from_records(results)
inspections_df.head()



Unnamed: 0,inspection_id,dba_name,aka_name,license_,facility_type,risk,address,city,state,zip,inspection_date,inspection_type,results,latitude,longitude,location,violations
0,2484532,VOLARE,VOLARE,2141813,Restaurant,Risk 1 (High),201 E GRAND AVE,CHICAGO,IL,60611,2021-02-04T00:00:00.000,Non-Inspection,No Entry,41.89165221441017,-87.62260381408842,"{'latitude': '-87.62260381408842', 'longitude'...",
1,2473041,ROSATI'S GRANT PARK,ROSATI'S,2762683,Restaurant,Risk 1 (High),23 E ADAMS ST,CHICAGO,IL,60603,2021-01-22T00:00:00.000,License,Pass,41.879391313239694,-87.62684825563626,"{'latitude': '-87.62684825563626', 'longitude'...",
2,2472595,PORTO,PORTO,2636886,Restaurant,Risk 1 (High),1600 W CHICAGO AVE,CHICAGO,IL,60622,2021-01-12T00:00:00.000,Non-Inspection,No Entry,41.896182324221726,-87.66735606423251,"{'latitude': '-87.66735606423251', 'longitude'...",
3,2457202,PI18EN,PI18EN,2626471,Restaurant,Risk 1 (High),1519 W 18TH ST,CHICAGO,IL,60608,2020-11-30T00:00:00.000,Non-Inspection,No Entry,41.8577138739859,-87.66454238198166,"{'latitude': '-87.66454238198166', 'longitude'...",
4,2386070,STREETERS TAVERN,STREETERS TAVERN,8864,TAVERN,Risk 3 (Low),48-50 E CHICAGO AVE,CHICAGO,IL,60611,2020-08-13T00:00:00.000,Canvass,Pass,41.89681264961122,-87.62636337526784,"{'latitude': '-87.62636337526784', 'longitude'...",


In [3]:
inspections_df["inspection_type"].unique()

array(['Non-Inspection', 'License', 'Canvass', 'Complaint',
       'Canvass Re-Inspection', 'License Re-Inspection',
       'Short Form Complaint', 'Complaint Re-Inspection',
       'Suspected Food Poisoning', 'Recent Inspection', 'Consultation',
       nan, 'License-Task Force',
       'Suspected Food Poisoning Re-inspection', 'Complaint-Fire',
       'Short Form Fire-Complaint', 'Complaint-Fire Re-inspection',
       'Task Force Liquor 1475', 'Tag Removal', 'Out of Business',
       'Special Events (Festivals)', 'No Entry', 'Business Not Located',
       'NO ENTRY', 'LICENSE REQUEST', 'NO ENTRY-SHORT COMPLAINT)',
       'OUT OF BUSINESS', 'citation re-issued', 'OFFICE ASSIGNMENT',
       'COVID COMPLAINT', 'Not Ready', 'Recent inspection',
       'fire complaint', 'FIRE', 'KITCHEN CLOSED FOR RENOVATION',
       'CORRECTIVE ACTION', 'O.B.', 'OWNER SUSPENDED OPERATION/LICENSE',
       'LICENSE CANCELED BY OWNER', 'LICENSE CONSULTATION',
       'License consultation', 'Illegal Operation

In [4]:
#Take only re-inspections
inspections_df['inspection_type_clean'] = inspections_df['inspection_type'].str.lower()
inspections_df['inspection_type_clean'] = inspections_df['inspection_type_clean'].str.replace('-', '')
inspections_df['inspection_type_clean'] = inspections_df['inspection_type_clean'].str.replace('re inspection', 'reinspection')

#inspections_df[inspections_df['inspection_type_clean'].isna()]
#One record is missing inspection type; will drop
inspections_df = inspections_df.dropna(subset=['inspection_type_clean'])
#inspections_df['inspection_type_clean'].unique()

reinspections_df = inspections_df[inspections_df['inspection_type_clean'].str.contains('reinspection')]
reinspections_df.shape

(41456, 18)

In [5]:
inspections_df.shape

(218257, 18)

In [6]:
#matching steps:
#merge full inspections_df with reinspections_df
#delete where inspection_id = reinspection_id
#group by ID of reinspection
#within each group, retain only inspection with closest date before (but not equal to) reinspection
full_merge = inspections_df.merge(reinspections_df, on=['dba_name', 'address', 'license_', 'latitude', 'longitude'],
                                    suffixes=('_orig', '_re'))
full_merge.shape

(494337, 31)

In [7]:
full_merge.columns

Index(['inspection_id_orig', 'dba_name', 'aka_name_orig', 'license_',
       'facility_type_orig', 'risk_orig', 'address', 'city_orig', 'state_orig',
       'zip_orig', 'inspection_date_orig', 'inspection_type_orig',
       'results_orig', 'latitude', 'longitude', 'location_orig',
       'violations_orig', 'inspection_type_clean_orig', 'inspection_id_re',
       'aka_name_re', 'facility_type_re', 'risk_re', 'city_re', 'state_re',
       'zip_re', 'inspection_date_re', 'inspection_type_re', 'results_re',
       'location_re', 'violations_re', 'inspection_type_clean_re'],
      dtype='object')

In [8]:
full_merge_clean = full_merge[full_merge['inspection_id_orig'] != full_merge['inspection_id_re']]
full_merge_clean.shape

(452881, 31)

In [9]:
full_merge_clean['date_orig'] = pd.to_datetime(full_merge_clean['inspection_date_orig'])
full_merge_clean['date_re'] = pd.to_datetime(full_merge_clean['inspection_date_re'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_merge_clean['date_orig'] = pd.to_datetime(full_merge_clean['inspection_date_orig'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_merge_clean['date_re'] = pd.to_datetime(full_merge_clean['inspection_date_re'])


In [10]:
#remove rows where original inspection occurred after re-inspection
full_merge_clean['time_between'] = full_merge_clean['date_orig'] - full_merge_clean['date_re']
full_merge_clean['time_between'] = full_merge_clean['time_between'].dt.days
full_merge_clean = full_merge_clean[full_merge_clean['time_between'] <= 0]
full_merge_clean.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_merge_clean['time_between'] = full_merge_clean['date_orig'] - full_merge_clean['date_re']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_merge_clean['time_between'] = full_merge_clean['time_between'].dt.days


(248091, 34)

In [11]:
#full_merge_grp = full_merge_clean.groupby('inspection_id_re')
grped = full_merge_clean[full_merge_clean['time_between'] == full_merge_clean.groupby('inspection_id_re')['time_between'].transform('max')]

grped.head()

Unnamed: 0,inspection_id_orig,dba_name,aka_name_orig,license_,facility_type_orig,risk_orig,address,city_orig,state_orig,zip_orig,...,zip_re,inspection_date_re,inspection_type_re,results_re,location_re,violations_re,inspection_type_clean_re,date_orig,date_re,time_between
10,1989768,VOLARE,VOLARE,2141813,Restaurant,Risk 1 (High),201 E GRAND AVE,CHICAGO,IL,60611,...,60611,2017-03-01T00:00:00.000,Canvass Re-Inspection,Pass,"{'latitude': '-87.62260381408842', 'longitude'...",32. FOOD AND NON-FOOD CONTACT SURFACES PROPERL...,canvass reinspection,2017-02-27,2017-03-01,-2
13,1632809,VOLARE,VOLARE,2141813,Restaurant,Risk 1 (High),201 E GRAND AVE,CHICAGO,IL,60611,...,60611,2016-03-15T00:00:00.000,Canvass Re-Inspection,Pass,"{'latitude': '-87.62260381408842', 'longitude'...",,canvass reinspection,2016-03-08,2016-03-15,-7
32,1448101,STREETERS TAVERN,STREETERS TAVERN,8864,TAVERN,Risk 3 (Low),48-50 E CHICAGO AVE,CHICAGO,IL,60611,...,60611,2015-12-11T00:00:00.000,Complaint Re-Inspection,Pass,"{'latitude': '-87.62636337526784', 'longitude'...",,complaint reinspection,2015-12-01,2015-12-11,-10
36,2453925,MISS SAIGON,MISS SAIGON,2699550,Restaurant,Risk 1 (High),1129 W ARGYLE ST,CHICAGO,IL,60640,...,60640,2020-11-05T00:00:00.000,Canvass Re-Inspection,Pass,"{'latitude': '-87.65895612095852', 'longitude'...",,canvass reinspection,2020-10-28,2020-11-05,-8
39,2352289,MISS SAIGON,MISS SAIGON,2699550,Restaurant,Risk 1 (High),1129 W ARGYLE ST,CHICAGO,IL,60640,...,60640,2019-11-27T00:00:00.000,License Re-Inspection,Pass w/ Conditions,"{'latitude': '-87.65895612095852', 'longitude'...",5. PROCEDURES FOR RESPONDING TO VOMITING AND D...,license reinspection,2019-11-25,2019-11-27,-2


In [12]:
grped.shape

(41600, 34)

In [13]:
duplicates = grped[grped.duplicated('inspection_id_re')]
print(duplicates.shape)
duplicates[['inspection_id_orig', 'inspection_date_orig', 'inspection_id_re', 'inspection_date_re', 'time_between']].head(20)

(262, 34)


Unnamed: 0,inspection_id_orig,inspection_date_orig,inspection_id_re,inspection_date_re,time_between
1786,2492464,2021-03-05T00:00:00.000,2492814,2021-03-12T00:00:00.000,-7
2314,2252347,2019-01-10T00:00:00.000,2252797,2019-01-18T00:00:00.000,-8
18957,2285631,2019-04-23T00:00:00.000,2290882,2019-06-04T00:00:00.000,-42
21946,2484257,2021-01-28T00:00:00.000,2484727,2021-02-10T00:00:00.000,-13
23396,2214079,2018-09-10T00:00:00.000,2240981,2018-12-04T00:00:00.000,-85
41707,1760212,2016-05-19T00:00:00.000,1931711,2016-05-26T00:00:00.000,-7
41971,2144588,2018-02-06T00:00:00.000,2144892,2018-02-13T00:00:00.000,-7
42266,2050307,2017-05-12T00:00:00.000,2050414,2017-05-15T00:00:00.000,-3
51975,2213686,2018-08-30T00:00:00.000,2222313,2018-09-13T00:00:00.000,-14
60531,1591629,2015-12-11T00:00:00.000,1609295,2016-01-05T00:00:00.000,-25


In [14]:
grped['results_orig'].unique()

array(['Fail', 'Pass w/ Conditions', 'Pass', 'No Entry',
       'Out of Business', 'Not Ready'], dtype=object)

In [15]:
grped.groupby('results_orig').count()

Unnamed: 0_level_0,inspection_id_orig,dba_name,aka_name_orig,license_,facility_type_orig,risk_orig,address,city_orig,state_orig,zip_orig,...,zip_re,inspection_date_re,inspection_type_re,results_re,location_re,violations_re,inspection_type_clean_re,date_orig,date_re,time_between
results_orig,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Fail,37853,37853,37419,37851,37826,37853,37853,37820,37844,37846,...,37846,37853,37853,37853,37724,24824,37853,37853,37853,37853
No Entry,343,343,338,343,343,343,343,343,343,343,...,343,343,343,343,343,254,343,343,343,343
Not Ready,70,70,70,70,70,70,70,70,70,70,...,70,70,70,70,68,36,70,70,70,70
Out of Business,300,300,297,300,298,300,300,300,300,300,...,300,300,300,300,299,175,300,300,300,300
Pass,596,596,591,596,596,596,596,596,596,596,...,596,596,596,596,595,277,596,596,596,596
Pass w/ Conditions,2438,2438,2429,2438,2438,2438,2438,2436,2438,2437,...,2437,2438,2438,2438,2433,1803,2438,2438,2438,2438


In [16]:
grped_clean = grped[(grped['results_orig'] == 'Fail') | (grped['results_orig'] == 'Pass w/ Conditions')]
grped_clean.shape

(40291, 34)

In [17]:
duplicates2 = grped_clean[grped_clean.duplicated('inspection_id_re')]
print(duplicates2.shape)
duplicates2[['inspection_id_orig', 'inspection_date_orig', 'inspection_id_re', 'inspection_date_re', 'time_between']].head(20)

(69, 34)


Unnamed: 0,inspection_id_orig,inspection_date_orig,inspection_id_re,inspection_date_re,time_between
41707,1760212,2016-05-19T00:00:00.000,1931711,2016-05-26T00:00:00.000,-7
85582,1516051,2015-01-16T00:00:00.000,1516204,2015-01-23T00:00:00.000,-7
93876,80288,2010-05-03T00:00:00.000,80327,2010-06-17T00:00:00.000,-45
97807,679768,2012-02-17T00:00:00.000,670797,2012-02-27T00:00:00.000,-10
101821,580190,2011-04-04T00:00:00.000,580188,2011-04-04T00:00:00.000,0
101822,580190,2011-04-04T00:00:00.000,580167,2011-04-04T00:00:00.000,0
101823,580190,2011-04-04T00:00:00.000,580176,2011-04-04T00:00:00.000,0
101824,580190,2011-04-04T00:00:00.000,580179,2011-04-04T00:00:00.000,0
101825,580190,2011-04-04T00:00:00.000,580186,2011-04-04T00:00:00.000,0
101827,580190,2011-04-04T00:00:00.000,580169,2011-04-04T00:00:00.000,0


In [18]:
#Dropping anything that's been duplicated. This only drops 69 rows. 
grped_no_duplicates = grped_clean.drop_duplicates(subset='inspection_id_re')
grped_no_duplicates = grped_no_duplicates.drop_duplicates(subset='inspection_id_orig')
grped_no_duplicates.shape

(40192, 34)

In [19]:
grped_no_duplicates['time_between'].describe()

count    40192.000000
mean       -11.131046
std         20.853949
min      -1121.000000
25%        -10.000000
50%         -7.000000
75%         -7.000000
max          0.000000
Name: time_between, dtype: float64

In [20]:
grped_no_duplicates.groupby('results_re').count()

Unnamed: 0_level_0,inspection_id_orig,dba_name,aka_name_orig,license_,facility_type_orig,risk_orig,address,city_orig,state_orig,zip_orig,...,state_re,zip_re,inspection_date_re,inspection_type_re,location_re,violations_re,inspection_type_clean_re,date_orig,date_re,time_between
results_re,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Fail,3512,3512,3433,3511,3509,3512,3512,3507,3512,3511,...,3512,3511,3512,3512,3495,3433,3512,3512,3512,3512
No Entry,339,339,334,339,339,339,339,339,339,339,...,339,339,339,339,338,259,339,339,339,339
Not Ready,36,36,36,36,36,36,36,36,36,36,...,36,36,36,36,36,27,36,36,36,36
Out of Business,81,81,81,81,80,81,81,81,81,81,...,81,81,81,81,81,14,81,81,81,81
Pass,31031,31031,30693,31030,31012,31031,31031,31003,31024,31025,...,31024,31025,31031,31031,30930,17804,31031,31031,31031,31031
Pass w/ Conditions,5193,5193,5173,5193,5189,5193,5193,5191,5191,5192,...,5191,5192,5193,5193,5178,5057,5193,5193,5193,5193


In [21]:
grped2 = grped_no_duplicates[(grped_no_duplicates['results_re'] == 'Fail') | (grped_no_duplicates['results_re'] == 'Pass') | (grped_no_duplicates['results_re'] == 'Pass w/ Conditions') ]
grped2['time_between'].describe()

count    39736.000000
mean       -10.996779
std         19.786114
min       -685.000000
25%        -10.000000
50%         -7.000000
75%         -7.000000
max          0.000000
Name: time_between, dtype: float64

In [22]:
final_grped = grped2[grped2['time_between'] >= -180]

In [23]:
final_grped.columns

Index(['inspection_id_orig', 'dba_name', 'aka_name_orig', 'license_',
       'facility_type_orig', 'risk_orig', 'address', 'city_orig', 'state_orig',
       'zip_orig', 'inspection_date_orig', 'inspection_type_orig',
       'results_orig', 'latitude', 'longitude', 'location_orig',
       'violations_orig', 'inspection_type_clean_orig', 'inspection_id_re',
       'aka_name_re', 'facility_type_re', 'risk_re', 'city_re', 'state_re',
       'zip_re', 'inspection_date_re', 'inspection_type_re', 'results_re',
       'location_re', 'violations_re', 'inspection_type_clean_re', 'date_orig',
       'date_re', 'time_between'],
      dtype='object')

In [24]:
final_df = final_grped[['inspection_id_orig', 'dba_name', 'license_', 
                        'facility_type_orig', 'date_orig', 'inspection_type_orig',
                        'results_orig', 'violations_orig', 'inspection_id_re',
                        'date_re', 'results_re', 'time_between']]
final_df = final_df.rename({'inspection_id_orig': 'id_orig', 'dba_name': 'name', 'license_': 'license', 
                        'facility_type_orig': 'facility_type', 'inspection_id_re': 'id_re'}, axis=1)

In [25]:
final_df['inspection_type_orig'].unique()

array(['Canvass', 'Complaint', 'License', 'License Re-Inspection',
       'Canvass Re-Inspection', 'Suspected Food Poisoning',
       'Complaint Re-Inspection', 'Short Form Complaint',
       'Complaint-Fire', 'Consultation', 'Recent Inspection',
       'License-Task Force', 'Tag Removal', 'Special Events (Festivals)',
       'SFP', 'Suspected Food Poisoning Re-inspection',
       'TWO PEOPLE ATE AND GOT SICK.', 'Short Form Fire-Complaint',
       'No Entry', 'Task Force Liquor 1475', 'SFP/Complaint',
       'TAVERN 1470', 'Complaint-Fire Re-inspection', 'SFP/COMPLAINT',
       'Non-Inspection', 'RECALL INSPECTION', 'no entry',
       'RE-INSPECTION OF CLOSE-UP', 'Illegal Operation',
       'REINSPECTION OF 48 HOUR NOTICE', 'Package Liquor 1474',
       '1315 license reinspection', 'NO ENTRY', 'TASK FORCE NIGHT',
       'task force(1470) liquor tavern', 'TASK FORCE PACKAGE LIQUOR',
       'LIQUOR CATERING', 'SPECIAL TASK FORCE'], dtype=object)

In [26]:
final_df = final_df[(final_df['inspection_type_orig'] != 'No Entry') & (final_df['inspection_type_orig'] != 'no entry')]

In [27]:
print(final_df.shape)
final_df.head()

(39623, 12)


Unnamed: 0,id_orig,name,license,facility_type,date_orig,inspection_type_orig,results_orig,violations_orig,id_re,date_re,results_re,time_between
10,1989768,VOLARE,2141813,Restaurant,2017-02-27,Canvass,Fail,"11. ADEQUATE NUMBER, CONVENIENT, ACCESSIBLE, D...",1989902,2017-03-01,Pass,-2
13,1632809,VOLARE,2141813,Restaurant,2016-03-08,Canvass,Fail,"16. FOOD PROTECTED DURING STORAGE, PREPARATION...",1734225,2016-03-15,Pass,-7
32,1448101,STREETERS TAVERN,8864,TAVERN,2015-12-01,Complaint,Fail,"16. FOOD PROTECTED DURING STORAGE, PREPARATION...",1448119,2015-12-11,Pass,-10
36,2453925,MISS SAIGON,2699550,Restaurant,2020-10-28,Canvass,Fail,58. ALLERGEN TRAINING AS REQUIRED - Comments: ...,2456264,2020-11-05,Pass,-8
39,2352289,MISS SAIGON,2699550,Restaurant,2019-11-25,License,Fail,5. PROCEDURES FOR RESPONDING TO VOMITING AND D...,2352403,2019-11-27,Pass w/ Conditions,-2


In [28]:
#Drop conditions:
#repeat re-inspection ID or original inspection ID (may investigate further, but not a big impact)
#original outcome of pass, no entry, or not ready
#more than 150 days between original and reinspection
#went from 41,600 rows to 39,621

#save to pickle
final_df.to_pickle("initial_clean.pkl")