In [1]:
import numpy as np
import pandas as pd
from sodapy import Socrata

In [2]:
#Make API call to City of Chicago Open Data Portal
#Include only inspections prior to 3/31/2021
client = Socrata("data.cityofchicago.org", None)
results = client.get("4ijn-s7e5", where="inspection_date < '2021-04-01T00:00:00.000'", limit=100000000)

inspections_df = pd.DataFrame.from_records(results)
inspections_df.head()



Unnamed: 0,inspection_id,dba_name,aka_name,license_,facility_type,risk,address,city,state,zip,inspection_date,inspection_type,results,latitude,longitude,location,violations
0,2484567,ROYALTY,ROYALTY,1306130,Restaurant,Risk 1 (High),3810 W 63RD ST,CHICAGO,IL,60629,2021-02-05T00:00:00.000,Non-Inspection,No Entry,41.778836516734856,-87.71836138998039,"{'latitude': '-87.71836138998039', 'longitude'...",
1,2473041,ROSATI'S GRANT PARK,ROSATI'S,2762683,Restaurant,Risk 1 (High),23 E ADAMS ST,CHICAGO,IL,60603,2021-01-22T00:00:00.000,License,Pass,41.879391313239694,-87.62684825563626,"{'latitude': '-87.62684825563626', 'longitude'...",
2,2472982,ENZO & EMILIA,ENZO & EMILIA,2762590,Restaurant,Risk 1 (High),2840-2542 N MILKWAUKEE AVE,CHICAGO,IL,60647,2021-01-21T00:00:00.000,License,Pass,,,,
3,2472728,PRET A MANGER,PRET A MANGER,2138418,Restaurant,Risk 1 (High),225 N MICHIGAN AVE FL,CHICAGO,IL,60601,2021-01-14T00:00:00.000,Canvass,Out of Business,41.88637740620821,-87.62438167043969,"{'latitude': '-87.62438167043969', 'longitude'...",
4,2472613,CHIPOTLE MEXICAN GRILL,CHIPOTLE MEXICAN GRILL,1379435,Restaurant,Risk 1 (High),235 W LAKE ST,CHICAGO,IL,60606,2021-01-12T00:00:00.000,Canvass,Pass,41.885620943905664,-87.63506653338979,"{'latitude': '-87.63506653338979', 'longitude'...",


In [3]:
inspections_df["inspection_type"].unique()

array(['Non-Inspection', 'License', 'Canvass', 'Complaint',
       'Canvass Re-Inspection', 'License Re-Inspection',
       'Short Form Complaint', 'Complaint Re-Inspection',
       'Suspected Food Poisoning', 'Recent Inspection', 'Consultation',
       'License-Task Force', 'Suspected Food Poisoning Re-inspection',
       'Complaint-Fire', 'Short Form Fire-Complaint', 'Not Ready',
       'Pre-License Consultation', 'OUT OF BUSINESS',
       'Complaint-Fire Re-inspection', 'Tag Removal',
       'Task Force Liquor 1475', 'LICENSE REQUEST', 'Out of Business',
       'No Entry', 'OFFICE ASSIGNMENT', 'COVID COMPLAINT',
       'Recent inspection', 'Special Events (Festivals)',
       'fire complaint', 'FIRE', nan, 'KITCHEN CLOSED FOR RENOVATION',
       'O.B.', 'CORRECTIVE ACTION', 'LICENSE CANCELED BY OWNER',
       'LICENSE CONSULTATION', 'OWNER SUSPENDED OPERATION/LICENSE',
       'License consultation', 'Package Liquor 1474', 'Illegal Operation',
       'CANVASS SPECIAL EVENTS', 'TWO PE

In [4]:
#Take only re-inspections
inspections_df['inspection_type_clean'] = inspections_df['inspection_type'].str.lower()
inspections_df['inspection_type_clean'] = inspections_df['inspection_type_clean'].str.replace('-', '')
inspections_df['inspection_type_clean'] = inspections_df['inspection_type_clean'].str.replace('re inspection', 'reinspection')

#inspections_df[inspections_df['inspection_type_clean'].isna()]
#One record is missing inspection type; will drop
inspections_df = inspections_df.dropna(subset=['inspection_type_clean'])
#inspections_df['inspection_type_clean'].unique()

reinspections_df = inspections_df[inspections_df['inspection_type_clean'].str.contains('reinspection')]
reinspections_df.shape

(41456, 18)

In [5]:
inspections_df.shape

(218277, 18)

In [6]:
#matching steps:
#merge full inspections_df with reinspections_df
#delete where inspection_id = reinspection_id
#group by ID of reinspection
#within each group, retain only inspection with closest date before (but not equal to) reinspection
full_merge = inspections_df.merge(reinspections_df, on=['dba_name', 'address', 'license_', 'latitude', 'longitude'],
                                    suffixes=('_orig', '_re'))
full_merge.shape

(494375, 31)

In [7]:
full_merge.columns

Index(['inspection_id_orig', 'dba_name', 'aka_name_orig', 'license_',
       'facility_type_orig', 'risk_orig', 'address', 'city_orig', 'state_orig',
       'zip_orig', 'inspection_date_orig', 'inspection_type_orig',
       'results_orig', 'latitude', 'longitude', 'location_orig',
       'violations_orig', 'inspection_type_clean_orig', 'inspection_id_re',
       'aka_name_re', 'facility_type_re', 'risk_re', 'city_re', 'state_re',
       'zip_re', 'inspection_date_re', 'inspection_type_re', 'results_re',
       'location_re', 'violations_re', 'inspection_type_clean_re'],
      dtype='object')

In [8]:
full_merge_clean = full_merge[full_merge['inspection_id_orig'] != full_merge['inspection_id_re']]
full_merge_clean.shape

(452919, 31)

In [9]:
full_merge_clean['date_orig'] = pd.to_datetime(full_merge_clean['inspection_date_orig'])
full_merge_clean['date_re'] = pd.to_datetime(full_merge_clean['inspection_date_re'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_merge_clean['date_orig'] = pd.to_datetime(full_merge_clean['inspection_date_orig'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_merge_clean['date_re'] = pd.to_datetime(full_merge_clean['inspection_date_re'])


In [10]:
#remove rows where original inspection occurred after re-inspection
full_merge_clean['time_between'] = full_merge_clean['date_orig'] - full_merge_clean['date_re']
full_merge_clean['time_between'] = full_merge_clean['time_between'].dt.days
full_merge_clean = full_merge_clean[full_merge_clean['time_between'] <= 0]
full_merge_clean.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_merge_clean['time_between'] = full_merge_clean['date_orig'] - full_merge_clean['date_re']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_merge_clean['time_between'] = full_merge_clean['time_between'].dt.days


(248093, 34)

In [11]:
#full_merge_grp = full_merge_clean.groupby('inspection_id_re')
grped = full_merge_clean[full_merge_clean['time_between'] == full_merge_clean.groupby('inspection_id_re')['time_between'].transform('max')]

grped.head()

Unnamed: 0,inspection_id_orig,dba_name,aka_name_orig,license_,facility_type_orig,risk_orig,address,city_orig,state_orig,zip_orig,...,zip_re,inspection_date_re,inspection_type_re,results_re,location_re,violations_re,inspection_type_clean_re,date_orig,date_re,time_between
16,577275,ROYALTY,ROYALTY,1306130,Restaurant,Risk 1 (High),3810 W 63RD ST,CHICAGO,IL,60629,...,60629,2011-05-24T00:00:00.000,Canvass Re-Inspection,Pass,"{'latitude': '-87.71836138998039', 'longitude'...",33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTENSI...,canvass reinspection,2011-04-18,2011-05-24,-36
38,1345428,PRET A MANGER,PRET A MANGER,2138418,Restaurant,Risk 1 (High),225 N MICHIGAN AVE FL,CHICAGO,IL,60601,...,60601,2013-08-13T00:00:00.000,Canvass Re-Inspection,Pass,"{'latitude': '-87.62438167043969', 'longitude'...",,canvass reinspection,2013-08-06,2013-08-13,-7
43,1114379,PRET A MANGER,PRET A MANGER,2138418,Restaurant,Risk 1 (High),225 N MICHIGAN AVE FL,CHICAGO,IL,60601,...,60601,2012-07-31T00:00:00.000,Complaint Re-Inspection,Pass,"{'latitude': '-87.62438167043969', 'longitude'...","34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GOO...",complaint reinspection,2012-07-23,2012-07-31,-8
65,343293,CHIPOTLE MEXICAN GRILL,CHIPOTLE MEXICAN GRILL,1379435,Restaurant,Risk 1 (High),235 W LAKE ST,CHICAGO,IL,60606,...,60606,2010-08-26T00:00:00.000,Canvass Re-Inspection,Pass,"{'latitude': '-87.63506653338979', 'longitude'...",32. FOOD AND NON-FOOD CONTACT SURFACES PROPERL...,canvass reinspection,2010-08-17,2010-08-26,-9
72,2484973,HILLTOP FAMILY RESTAURANT,HILLTOP FAMILY RESTAURANT,2652370,Restaurant,Risk 1 (High),2800 W FOSTER AVE,CHICAGO,IL,60625,...,60625,2021-02-23T00:00:00.000,Canvass Re-Inspection,Pass w/ Conditions,"{'latitude': '-87.69904568571212', 'longitude'...",23. PROPER DATE MARKING AND DISPOSITION - Comm...,canvass reinspection,2021-02-19,2021-02-23,-4


In [12]:
grped.shape

(41600, 34)

In [13]:
duplicates = grped[grped.duplicated('inspection_id_re')]
print(duplicates.shape)
duplicates[['inspection_id_orig', 'inspection_date_orig', 'inspection_id_re', 'inspection_date_re', 'time_between']].head(20)

(262, 34)


Unnamed: 0,inspection_id_orig,inspection_date_orig,inspection_id_re,inspection_date_re,time_between
3933,2492457,2021-03-05T00:00:00.000,2492814,2021-03-12T00:00:00.000,-7
4667,2252347,2019-01-10T00:00:00.000,2252797,2019-01-18T00:00:00.000,-8
19662,2285631,2019-04-23T00:00:00.000,2290882,2019-06-04T00:00:00.000,-42
22362,2484255,2021-01-28T00:00:00.000,2484727,2021-02-10T00:00:00.000,-13
23194,154305,2010-03-12T00:00:00.000,154309,2010-03-19T00:00:00.000,-7
23514,2214129,2018-09-10T00:00:00.000,2240981,2018-12-04T00:00:00.000,-85
43154,1760212,2016-05-19T00:00:00.000,1931711,2016-05-26T00:00:00.000,-7
43334,2144587,2018-02-06T00:00:00.000,2144892,2018-02-13T00:00:00.000,-7
43649,2050307,2017-05-12T00:00:00.000,2050414,2017-05-15T00:00:00.000,-3
49181,1467468,2014-09-09T00:00:00.000,1467490,2014-09-23T00:00:00.000,-14


In [14]:
grped['results_orig'].unique()

array(['Fail', 'Pass w/ Conditions', 'Pass', 'Not Ready', 'No Entry',
       'Out of Business'], dtype=object)

In [15]:
grped.groupby('results_orig').count()

Unnamed: 0_level_0,inspection_id_orig,dba_name,aka_name_orig,license_,facility_type_orig,risk_orig,address,city_orig,state_orig,zip_orig,...,zip_re,inspection_date_re,inspection_type_re,results_re,location_re,violations_re,inspection_type_clean_re,date_orig,date_re,time_between
results_orig,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Fail,37854,37854,37420,37852,37827,37854,37854,37821,37845,37847,...,37847,37854,37854,37854,37725,24825,37854,37854,37854,37854
No Entry,343,343,338,343,343,343,343,343,343,343,...,343,343,343,343,343,254,343,343,343,343
Not Ready,70,70,70,70,70,70,70,70,70,70,...,70,70,70,70,68,36,70,70,70,70
Out of Business,300,300,297,300,298,300,300,300,300,300,...,300,300,300,300,299,175,300,300,300,300
Pass,596,596,591,596,596,596,596,596,596,596,...,596,596,596,596,595,277,596,596,596,596
Pass w/ Conditions,2437,2437,2428,2437,2437,2437,2437,2435,2437,2436,...,2436,2437,2437,2437,2432,1802,2437,2437,2437,2437


In [16]:
#grped_clean = grped[(grped['results_orig'] == 'Fail') | (grped['results_orig'] == 'Pass w/ Conditions')]
grped_clean = grped[grped['results_orig'] == 'Fail']
grped_clean.shape

(37854, 34)

In [17]:
duplicates2 = grped_clean[grped_clean.duplicated('inspection_id_re')]
print(duplicates2.shape)
duplicates2[['inspection_id_orig', 'inspection_date_orig', 'inspection_id_re', 'inspection_date_re', 'time_between']].head(20)

(64, 34)


Unnamed: 0,inspection_id_orig,inspection_date_orig,inspection_id_re,inspection_date_re,time_between
23194,154305,2010-03-12T00:00:00.000,154309,2010-03-19T00:00:00.000,-7
43154,1760212,2016-05-19T00:00:00.000,1931711,2016-05-26T00:00:00.000,-7
52136,1965764,2016-09-30T00:00:00.000,1965767,2016-10-03T00:00:00.000,-3
75513,88601,2010-11-19T00:00:00.000,88604,2010-11-23T00:00:00.000,-4
79510,580611,2011-04-27T00:00:00.000,580732,2011-05-04T00:00:00.000,-7
92796,670705,2012-02-17T00:00:00.000,670797,2012-02-27T00:00:00.000,-10
94927,580191,2011-04-04T00:00:00.000,580186,2011-04-04T00:00:00.000,0
94928,580191,2011-04-04T00:00:00.000,580176,2011-04-04T00:00:00.000,0
94929,580191,2011-04-04T00:00:00.000,580167,2011-04-04T00:00:00.000,0
94931,580191,2011-04-04T00:00:00.000,580234,2011-04-04T00:00:00.000,0


In [18]:
#Dropping anything that's been duplicated. This only drops 69 rows. 
grped_no_duplicates = grped_clean.drop_duplicates(subset='inspection_id_re')
grped_no_duplicates = grped_no_duplicates.drop_duplicates(subset='inspection_id_orig')
grped_no_duplicates.shape

(37756, 34)

In [19]:
grped_no_duplicates['time_between'].describe()

count    37756.000000
mean       -11.004556
std         19.771317
min       -685.000000
25%        -10.000000
50%         -7.000000
75%         -7.000000
max          0.000000
Name: time_between, dtype: float64

In [20]:
grped_no_duplicates.groupby('results_re').count()

Unnamed: 0_level_0,inspection_id_orig,dba_name,aka_name_orig,license_,facility_type_orig,risk_orig,address,city_orig,state_orig,zip_orig,...,state_re,zip_re,inspection_date_re,inspection_type_re,location_re,violations_re,inspection_type_clean_re,date_orig,date_re,time_between
results_re,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Fail,3489,3489,3410,3488,3486,3489,3489,3485,3489,3488,...,3489,3488,3489,3489,3472,3412,3489,3489,3489,3489
No Entry,329,329,324,329,329,329,329,329,329,329,...,329,329,329,329,329,252,329,329,329,329
Not Ready,36,36,36,36,36,36,36,36,36,36,...,36,36,36,36,36,27,36,36,36,36
Out of Business,78,78,78,78,77,78,78,78,78,78,...,78,78,78,78,78,14,78,78,78,78
Pass,29043,29043,28713,29042,29024,29043,29043,29016,29036,29038,...,29036,29038,29043,29043,28944,16438,29043,29043,29043,29043
Pass w/ Conditions,4781,4781,4762,4781,4777,4781,4781,4779,4779,4780,...,4779,4780,4781,4781,4768,4654,4781,4781,4781,4781


In [21]:
grped2 = grped_no_duplicates[(grped_no_duplicates['results_re'] == 'Fail') | (grped_no_duplicates['results_re'] == 'Pass') | (grped_no_duplicates['results_re'] == 'Pass w/ Conditions') ]
grped2['time_between'].describe()

count    37313.000000
mean       -10.889422
std         19.417266
min       -685.000000
25%        -10.000000
50%         -7.000000
75%         -7.000000
max          0.000000
Name: time_between, dtype: float64

In [22]:
final_grped = grped2[grped2['time_between'] >= -180]

In [23]:
final_grped.columns

Index(['inspection_id_orig', 'dba_name', 'aka_name_orig', 'license_',
       'facility_type_orig', 'risk_orig', 'address', 'city_orig', 'state_orig',
       'zip_orig', 'inspection_date_orig', 'inspection_type_orig',
       'results_orig', 'latitude', 'longitude', 'location_orig',
       'violations_orig', 'inspection_type_clean_orig', 'inspection_id_re',
       'aka_name_re', 'facility_type_re', 'risk_re', 'city_re', 'state_re',
       'zip_re', 'inspection_date_re', 'inspection_type_re', 'results_re',
       'location_re', 'violations_re', 'inspection_type_clean_re', 'date_orig',
       'date_re', 'time_between'],
      dtype='object')

In [24]:
final_df = final_grped[['inspection_id_orig', 'dba_name', 'license_', 
                        'facility_type_orig', 'date_orig', 'inspection_type_orig',
                        'results_orig', 'violations_orig', 'inspection_id_re',
                        'date_re', 'results_re', 'time_between']]
final_df = final_df.rename({'inspection_id_orig': 'id_orig', 'dba_name': 'name', 'license_': 'license', 
                        'facility_type_orig': 'facility_type', 'inspection_id_re': 'id_re'}, axis=1)

In [25]:
final_df['inspection_type_orig'].unique()

array(['Canvass', 'Complaint', 'Canvass Re-Inspection', 'License',
       'License Re-Inspection', 'Complaint Re-Inspection',
       'License-Task Force', 'Short Form Complaint',
       'Suspected Food Poisoning', 'Consultation', 'Recent Inspection',
       'RECALL INSPECTION', 'Complaint-Fire',
       'Special Events (Festivals)', 'SFP',
       'Suspected Food Poisoning Re-inspection', 'No Entry',
       'Tag Removal', 'Short Form Fire-Complaint',
       'Task Force Liquor 1475', 'Complaint-Fire Re-inspection',
       'SFP/COMPLAINT', 'Non-Inspection', 'TWO PEOPLE ATE AND GOT SICK.',
       'no entry', 'RE-INSPECTION OF CLOSE-UP', 'Illegal Operation',
       'REINSPECTION OF 48 HOUR NOTICE', 'SFP/Complaint',
       'Package Liquor 1474', '1315 license reinspection', 'NO ENTRY',
       'TASK FORCE NIGHT', 'TAVERN 1470',
       'task force(1470) liquor tavern', 'TASK FORCE PACKAGE LIQUOR',
       'LIQUOR CATERING', 'SPECIAL TASK FORCE'], dtype=object)

In [26]:
final_df = final_df[(final_df['inspection_type_orig'] != 'No Entry') & (final_df['inspection_type_orig'] != 'no entry')]

In [27]:
print(final_df.shape)
final_df.head()

(37206, 12)


Unnamed: 0,id_orig,name,license,facility_type,date_orig,inspection_type_orig,results_orig,violations_orig,id_re,date_re,results_re,time_between
16,577275,ROYALTY,1306130,Restaurant,2011-04-18,Canvass,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,577343,2011-05-24,Pass,-36
38,1345428,PRET A MANGER,2138418,Restaurant,2013-08-06,Canvass,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,1345448,2013-08-13,Pass,-7
43,1114379,PRET A MANGER,2138418,Restaurant,2012-07-23,Complaint,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,1114384,2012-07-31,Pass,-8
65,343293,CHIPOTLE MEXICAN GRILL,1379435,Restaurant,2010-08-17,Canvass,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,343310,2010-08-26,Pass,-9
72,2484973,HILLTOP FAMILY RESTAURANT,2652370,Restaurant,2021-02-19,Canvass Re-Inspection,Fail,"44. UTENSILS, EQUIPMENT & LINENS: PROPERLY STO...",2485081,2021-02-23,Pass w/ Conditions,-4


In [28]:
#Drop conditions:
#repeat re-inspection ID or original inspection ID (may investigate further, but not a big impact)
#original outcome of pass, pass w/conditions, no entry, or not ready
#more than 150 days between original and reinspection
#went from 41,600 rows to 37,201

#save to pickle
final_df.to_pickle("initial_clean.pkl")