In [1]:
import numpy as np
import pandas as pd
from sodapy import Socrata

In [2]:
#Make API call to City of Chicago Open Data Portal
#Include only inspections prior to 3/31/2021
client = Socrata("data.cityofchicago.org", None)
results = client.get("4ijn-s7e5", where="inspection_date < '2021-04-01T00:00:00.000'", limit=100000000)

inspections_df = pd.DataFrame.from_records(results)
inspections_df.head()



Unnamed: 0,inspection_id,dba_name,aka_name,license_,facility_type,risk,address,city,state,zip,inspection_date,inspection_type,results,latitude,longitude,location,violations
0,2493173,"PIEROGI HEAVEN, INC.",PIEROGI HEAVEN,2022158,Restaurant,Risk 1 (High),169 N WELLS ST,CHICAGO,IL,60606,2021-03-18T00:00:00.000,Canvass,Out of Business,41.88507356203613,-87.63376475726596,"{'latitude': '-87.63376475726596', 'longitude'...",
1,2497619,CRUSHED BY GIANT,CRUSHED BY GIANT,2647349,Restaurant,Risk 1 (High),600 N MICHIGAN AVE,CHICAGO,IL,60611,2021-03-31T00:00:00.000,Non-Inspection,No Entry,41.89259322553828,-87.6243340479495,"{'latitude': '-87.6243340479495', 'longitude':...",
2,2497613,MONARCA P.L.A.C.E .7 INC,MONARCA P.L.A.C.E. 7,2391062,Restaurant,Risk 1 (High),4553 S ASHLAND AVE,CHICAGO,IL,60609,2021-03-31T00:00:00.000,Non-Inspection,No Entry,41.81068114659683,-87.6648796277618,"{'latitude': '-87.6648796277618', 'longitude':...",
3,2497603,EDDIE V'S WILD FISH,EDDIE V'S WILD FISH,2428608,Restaurant,Risk 1 (High),521 N RUSH ST,CHICAGO,IL,60611,2021-03-31T00:00:00.000,Canvass,Pass,41.89167129985412,-87.62522472785261,"{'latitude': '-87.62522472785261', 'longitude'...","55. PHYSICAL FACILITIES INSTALLED, MAINTAINED ..."
4,2497593,BAKE FOR ME!,BAKE FOR ME!,2443317,Bakery,Risk 1 (High),4305 S HALSTED ST,CHICAGO,IL,60609,2021-03-31T00:00:00.000,Complaint,Fail,41.816000608796905,-87.64557477487227,"{'latitude': '-87.64557477487227', 'longitude'...","3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E..."


In [3]:
inspections_df["inspection_type"].unique()

array(['Canvass', 'Non-Inspection', 'Complaint', 'License',
       'Canvass Re-Inspection', 'Complaint Re-Inspection',
       'License Re-Inspection', 'Short Form Complaint',
       'Recent Inspection', 'Suspected Food Poisoning', 'Not Ready',
       'Consultation', 'Task Force Liquor 1475', 'License-Task Force',
       'Complaint-Fire Re-inspection', 'Complaint-Fire', 'Tag Removal',
       'Out of Business', 'No Entry',
       'Suspected Food Poisoning Re-inspection',
       'TASK FORCE LIQUOR (1481)', 'TASK FORCE LIQUOR 1474',
       'Package Liquor 1474', 'TASK FORCE PACKAGE GOODS 1474',
       'OFFICE ASSIGNMENT', 'COVID COMPLAINT',
       'Special Events (Festivals)', 'Recent inspection',
       'fire complaint', 'FIRE', nan, 'Short Form Fire-Complaint',
       'KITCHEN CLOSED FOR RENOVATION', 'O.B.', 'CORRECTIVE ACTION',
       'OWNER SUSPENDED OPERATION/LICENSE', 'LICENSE CANCELED BY OWNER',
       'LICENSE CONSULTATION', 'License consultation',
       'Illegal Operation', 'Pre-

In [4]:
#Take only re-inspections
inspections_df['inspection_type_clean'] = inspections_df['inspection_type'].str.lower()
inspections_df['inspection_type_clean'] = inspections_df['inspection_type_clean'].str.replace('-', '')
inspections_df['inspection_type_clean'] = inspections_df['inspection_type_clean'].str.replace('re inspection', 'reinspection')

#inspections_df[inspections_df['inspection_type_clean'].isna()]
#One record is missing inspection type; will drop
inspections_df = inspections_df.dropna(subset=['inspection_type_clean'])
#inspections_df['inspection_type_clean'].unique()

reinspections_df = inspections_df[inspections_df['inspection_type_clean'].str.contains('reinspection')]
reinspections_df.shape

(41456, 18)

In [5]:
inspections_df.shape

(218256, 18)

In [6]:
#matching steps:
#merge full inspections_df with reinspections_df
#delete where inspection_id = reinspection_id
#group by ID of reinspection
#within each group, retain only inspection with closest date before (but not equal to) reinspection
full_merge = inspections_df.merge(reinspections_df, on=['dba_name', 'address', 'license_', 'latitude', 'longitude'],
                                    suffixes=('_orig', '_re'))
full_merge.shape

(494335, 31)

In [7]:
full_merge.columns

Index(['inspection_id_orig', 'dba_name', 'aka_name_orig', 'license_',
       'facility_type_orig', 'risk_orig', 'address', 'city_orig', 'state_orig',
       'zip_orig', 'inspection_date_orig', 'inspection_type_orig',
       'results_orig', 'latitude', 'longitude', 'location_orig',
       'violations_orig', 'inspection_type_clean_orig', 'inspection_id_re',
       'aka_name_re', 'facility_type_re', 'risk_re', 'city_re', 'state_re',
       'zip_re', 'inspection_date_re', 'inspection_type_re', 'results_re',
       'location_re', 'violations_re', 'inspection_type_clean_re'],
      dtype='object')

In [8]:
full_merge_clean = full_merge[full_merge['inspection_id_orig'] != full_merge['inspection_id_re']]
full_merge_clean.shape

(452879, 31)

In [9]:
full_merge_clean['date_orig'] = pd.to_datetime(full_merge_clean['inspection_date_orig'])
full_merge_clean['date_re'] = pd.to_datetime(full_merge_clean['inspection_date_re'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_merge_clean['date_orig'] = pd.to_datetime(full_merge_clean['inspection_date_orig'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_merge_clean['date_re'] = pd.to_datetime(full_merge_clean['inspection_date_re'])


In [10]:
#remove rows where original inspection occurred after re-inspection
full_merge_clean['time_between'] = full_merge_clean['date_orig'] - full_merge_clean['date_re']
full_merge_clean['time_between'] = full_merge_clean['time_between'].dt.days
full_merge_clean = full_merge_clean[full_merge_clean['time_between'] <= 0]
full_merge_clean.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_merge_clean['time_between'] = full_merge_clean['date_orig'] - full_merge_clean['date_re']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_merge_clean['time_between'] = full_merge_clean['time_between'].dt.days


(248091, 34)

In [11]:
#full_merge_grp = full_merge_clean.groupby('inspection_id_re')
grped = full_merge_clean[full_merge_clean['time_between'] == full_merge_clean.groupby('inspection_id_re')['time_between'].transform('max')]

grped.head()

Unnamed: 0,inspection_id_orig,dba_name,aka_name_orig,license_,facility_type_orig,risk_orig,address,city_orig,state_orig,zip_orig,...,zip_re,inspection_date_re,inspection_type_re,results_re,location_re,violations_re,inspection_type_clean_re,date_orig,date_re,time_between
14,1684274,"PIEROGI HEAVEN, INC.",PIEROGI HEAVEN,2022158,Restaurant,Risk 1 (High),169 N WELLS ST,CHICAGO,IL,60606,...,60606,2016-03-24T00:00:00.000,Canvass Re-Inspection,Pass,"{'latitude': '-87.63376475726596', 'longitude'...",,canvass reinspection,2016-03-17,2016-03-24,-7
19,1447377,"PIEROGI HEAVEN, INC.",PIEROGI HEAVEN,2022158,Restaurant,Risk 1 (High),169 N WELLS ST,CHICAGO,IL,60606,...,60606,2014-07-30T00:00:00.000,Canvass Re-Inspection,Pass,"{'latitude': '-87.63376475726596', 'longitude'...",32. FOOD AND NON-FOOD CONTACT SURFACES PROPERL...,canvass reinspection,2014-07-22,2014-07-30,-8
33,2282269,MONARCA P.L.A.C.E .7 INC,MONARCA P.L.A.C.E. 7,2391062,Restaurant,Risk 1 (High),4553 S ASHLAND AVE,CHICAGO,IL,60609,...,60609,2019-04-04T00:00:00.000,Canvass Re-Inspection,No Entry,"{'latitude': '-87.6648796277618', 'longitude':...",,canvass reinspection,2019-03-28,2019-04-04,-7
38,2282633,MONARCA P.L.A.C.E .7 INC,MONARCA P.L.A.C.E. 7,2391062,Restaurant,Risk 1 (High),4553 S ASHLAND AVE,CHICAGO,IL,60609,...,60609,2019-04-08T00:00:00.000,Canvass Re-Inspection,Fail,"{'latitude': '-87.6648796277618', 'longitude':...","3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",canvass reinspection,2019-04-04,2019-04-08,-4
43,2282806,MONARCA P.L.A.C.E .7 INC,MONARCA P.L.A.C.E. 7,2391062,Restaurant,Risk 1 (High),4553 S ASHLAND AVE,CHICAGO,IL,60609,...,60609,2019-04-10T00:00:00.000,Canvass Re-Inspection,Pass w/ Conditions,"{'latitude': '-87.6648796277618', 'longitude':...","3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",canvass reinspection,2019-04-08,2019-04-10,-2


In [12]:
grped.shape

(41600, 34)

In [13]:
duplicates = grped[grped.duplicated('inspection_id_re')]
print(duplicates.shape)
duplicates[['inspection_id_orig', 'inspection_date_orig', 'inspection_id_re', 'inspection_date_re', 'time_between']].head(20)

(262, 34)


Unnamed: 0,inspection_id_orig,inspection_date_orig,inspection_id_re,inspection_date_re,time_between
4208,2492464,2021-03-05T00:00:00.000,2492814,2021-03-12T00:00:00.000,-7
10966,2472750,2021-01-15T00:00:00.000,2472744,2021-01-15T00:00:00.000,0
14155,1931653,2016-05-19T00:00:00.000,1931655,2016-05-26T00:00:00.000,-7
19658,2252450,2019-01-10T00:00:00.000,2252797,2019-01-18T00:00:00.000,-8
22896,1307394,2014-04-17T00:00:00.000,1307399,2014-04-18T00:00:00.000,-1
25788,2303973,2019-07-31T00:00:00.000,2304284,2019-08-07T00:00:00.000,-7
34004,1966455,2016-10-14T00:00:00.000,1970656,2016-11-01T00:00:00.000,-18
37084,2484257,2021-01-28T00:00:00.000,2484727,2021-02-10T00:00:00.000,-13
38161,2050307,2017-05-12T00:00:00.000,2050414,2017-05-15T00:00:00.000,-3
41047,233570,2010-05-06T00:00:00.000,233645,2010-05-10T00:00:00.000,-4


In [14]:
grped['results_orig'].unique()

array(['Fail', 'No Entry', 'Pass w/ Conditions', 'Pass', 'Not Ready',
       'Out of Business'], dtype=object)

In [15]:
grped.groupby('results_orig').count()

Unnamed: 0_level_0,inspection_id_orig,dba_name,aka_name_orig,license_,facility_type_orig,risk_orig,address,city_orig,state_orig,zip_orig,...,zip_re,inspection_date_re,inspection_type_re,results_re,location_re,violations_re,inspection_type_clean_re,date_orig,date_re,time_between
results_orig,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Fail,37853,37853,37419,37851,37826,37853,37853,37820,37844,37846,...,37846,37853,37853,37853,37724,24824,37853,37853,37853,37853
No Entry,343,343,338,343,343,343,343,343,343,343,...,343,343,343,343,343,254,343,343,343,343
Not Ready,70,70,70,70,70,70,70,70,70,70,...,70,70,70,70,68,36,70,70,70,70
Out of Business,300,300,297,300,298,300,300,300,300,300,...,300,300,300,300,299,175,300,300,300,300
Pass,596,596,591,596,596,596,596,596,596,596,...,596,596,596,596,595,277,596,596,596,596
Pass w/ Conditions,2438,2438,2429,2438,2438,2438,2438,2436,2438,2437,...,2437,2438,2438,2438,2433,1803,2438,2438,2438,2438


In [16]:
grped_clean = grped[(grped['results_orig'] == 'Fail') | (grped['results_orig'] == 'Pass w/ Conditions')]
grped_clean.shape

(40291, 34)

In [17]:
duplicates2 = grped_clean[grped_clean.duplicated('inspection_id_re')]
print(duplicates2.shape)
duplicates2[['inspection_id_orig', 'inspection_date_orig', 'inspection_id_re', 'inspection_date_re', 'time_between']].head(20)

(69, 34)


Unnamed: 0,inspection_id_orig,inspection_date_orig,inspection_id_re,inspection_date_re,time_between
14155,1931653,2016-05-19T00:00:00.000,1931655,2016-05-26T00:00:00.000,-7
22896,1307394,2014-04-17T00:00:00.000,1307399,2014-04-18T00:00:00.000,-1
34004,1966455,2016-10-14T00:00:00.000,1970656,2016-11-01T00:00:00.000,-18
41047,233570,2010-05-06T00:00:00.000,233645,2010-05-10T00:00:00.000,-4
41483,154306,2010-03-12T00:00:00.000,154309,2010-03-19T00:00:00.000,-7
56903,1931211,2016-05-19T00:00:00.000,1931711,2016-05-26T00:00:00.000,-7
91482,1516051,2015-01-16T00:00:00.000,1516204,2015-01-23T00:00:00.000,-7
110480,580191,2011-04-04T00:00:00.000,580179,2011-04-04T00:00:00.000,0
110481,580191,2011-04-04T00:00:00.000,580176,2011-04-04T00:00:00.000,0
110482,580191,2011-04-04T00:00:00.000,580185,2011-04-04T00:00:00.000,0


In [18]:
#Dropping anything that's been duplicated. This only drops 69 rows. 
grped_no_duplicates = grped_clean.drop_duplicates(subset='inspection_id_re')
grped_no_duplicates = grped_no_duplicates.drop_duplicates(subset='inspection_id_orig')
grped_no_duplicates.shape

(40191, 34)

In [19]:
grped_no_duplicates['time_between'].describe()

count    40191.000000
mean       -11.131223
std         20.854148
min      -1121.000000
25%        -10.000000
50%         -7.000000
75%         -7.000000
max          0.000000
Name: time_between, dtype: float64

In [20]:
grped_no_duplicates.groupby('results_re').count()

Unnamed: 0_level_0,inspection_id_orig,dba_name,aka_name_orig,license_,facility_type_orig,risk_orig,address,city_orig,state_orig,zip_orig,...,state_re,zip_re,inspection_date_re,inspection_type_re,location_re,violations_re,inspection_type_clean_re,date_orig,date_re,time_between
results_re,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Fail,3511,3511,3432,3510,3508,3511,3511,3506,3511,3510,...,3511,3510,3511,3511,3494,3432,3511,3511,3511,3511
No Entry,340,340,335,340,340,340,340,340,340,340,...,340,340,340,340,339,259,340,340,340,340
Not Ready,36,36,36,36,36,36,36,36,36,36,...,36,36,36,36,36,27,36,36,36,36
Out of Business,81,81,81,81,80,81,81,81,81,81,...,81,81,81,81,81,14,81,81,81,81
Pass,31030,31030,30692,31029,31011,31030,31030,31002,31023,31024,...,31023,31024,31030,31030,30929,17804,31030,31030,31030,31030
Pass w/ Conditions,5193,5193,5173,5193,5189,5193,5193,5191,5191,5192,...,5191,5192,5193,5193,5178,5057,5193,5193,5193,5193


In [21]:
grped2 = grped_no_duplicates[(grped_no_duplicates['results_re'] == 'Fail') | (grped_no_duplicates['results_re'] == 'Pass') | (grped_no_duplicates['results_re'] == 'Pass w/ Conditions') ]
grped2['time_between'].describe()

count    39734.000000
mean       -10.997232
std         19.786472
min       -685.000000
25%        -10.000000
50%         -7.000000
75%         -7.000000
max          0.000000
Name: time_between, dtype: float64

In [22]:
final_grped = grped2[grped2['time_between'] >= -180]

In [23]:
final_grped.columns

Index(['inspection_id_orig', 'dba_name', 'aka_name_orig', 'license_',
       'facility_type_orig', 'risk_orig', 'address', 'city_orig', 'state_orig',
       'zip_orig', 'inspection_date_orig', 'inspection_type_orig',
       'results_orig', 'latitude', 'longitude', 'location_orig',
       'violations_orig', 'inspection_type_clean_orig', 'inspection_id_re',
       'aka_name_re', 'facility_type_re', 'risk_re', 'city_re', 'state_re',
       'zip_re', 'inspection_date_re', 'inspection_type_re', 'results_re',
       'location_re', 'violations_re', 'inspection_type_clean_re', 'date_orig',
       'date_re', 'time_between'],
      dtype='object')

In [24]:
final_df = final_grped[['inspection_id_orig', 'dba_name', 'license_', 
                        'facility_type_orig', 'date_orig', 'inspection_type_orig',
                        'results_orig', 'violations_orig', 'inspection_id_re',
                        'date_re', 'results_re', 'time_between']]
final_df = final_df.rename({'inspection_id_orig': 'id_orig', 'dba_name': 'name', 'license_': 'license', 
                        'facility_type_orig': 'facility_type', 'inspection_id_re': 'id_re'}, axis=1)

In [25]:
final_df['inspection_type_orig'].unique()

array(['Canvass', 'Canvass Re-Inspection', 'Complaint',
       'License Re-Inspection', 'License', 'License-Task Force',
       'Short Form Complaint', 'Complaint Re-Inspection',
       'Recent Inspection', 'Tag Removal', 'Suspected Food Poisoning',
       'Short Form Fire-Complaint', 'Consultation', 'Complaint-Fire',
       'RECALL INSPECTION', 'Special Events (Festivals)',
       'Suspected Food Poisoning Re-inspection', 'SFP', 'No Entry',
       'Task Force Liquor 1475', 'Complaint-Fire Re-inspection',
       'REINSPECTION OF 48 HOUR NOTICE', 'SFP/COMPLAINT',
       'Non-Inspection', 'TWO PEOPLE ATE AND GOT SICK.', 'no entry',
       'RE-INSPECTION OF CLOSE-UP', 'Illegal Operation', 'SFP/Complaint',
       'Package Liquor 1474', '1315 license reinspection', 'NO ENTRY',
       'LIQUOR CATERING', 'TASK FORCE NIGHT', 'TAVERN 1470',
       'task force(1470) liquor tavern', 'TASK FORCE PACKAGE LIQUOR',
       'SPECIAL TASK FORCE'], dtype=object)

In [27]:
final_df = final_df[(final_df['inspection_type_orig'] != 'No Entry') & (final_df['inspection_type_orig'] != 'no entry')]

In [28]:
print(final_df.shape)
final_df.head()

(39621, 12)


Unnamed: 0,id_orig,name,license,facility_type,date_orig,inspection_type_orig,results_orig,violations_orig,id_re,date_re,results_re,time_between
14,1684274,"PIEROGI HEAVEN, INC.",2022158,Restaurant,2016-03-17,Canvass,Fail,19. OUTSIDE GARBAGE WASTE GREASE AND STORAGE A...,1734731,2016-03-24,Pass,-7
19,1447377,"PIEROGI HEAVEN, INC.",2022158,Restaurant,2014-07-22,Canvass,Fail,24. DISH WASHING FACILITIES: PROPERLY DESIGNED...,1447391,2014-07-30,Pass,-8
43,2282806,MONARCA P.L.A.C.E .7 INC,2391062,Restaurant,2019-04-08,Canvass Re-Inspection,Fail,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",2282937,2019-04-10,Pass w/ Conditions,-2
63,2200614,EDDIE V'S WILD FISH,2428608,Restaurant,2018-08-07,Complaint,Fail,16. FOOD-CONTACT SURFACES: CLEANED & SANITIZED...,2200969,2018-08-15,Pass,-8
79,1609983,EDDIE V'S WILD FISH,2428608,Restaurant,2016-01-21,License Re-Inspection,Fail,"16. FOOD PROTECTED DURING STORAGE, PREPARATION...",1632823,2016-03-09,Pass,-48


In [29]:
#Drop conditions:
#repeat re-inspection ID or original inspection ID (may investigate further, but not a big impact)
#original outcome of pass, no entry, or not ready
#more than 150 days between original and reinspection
#went from 41,600 rows to 39,621

#save to pickle
final_df.to_pickle("initial_clean.pkl")