In [188]:
# 1. Import libraries
import pandas as pd

# 2. Define raw data path and load data

RAW_PATH = "../data/raw/food_inspections.csv"
df = pd.read_csv(RAW_PATH, low_memory= False)

# 3. Inspect Shape and preview

df.shape

(296215, 17)

In [189]:
df.head()

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,Location
0,2623075,HEALTHY WALA TIFFIN / FRIED JUNCTION,FRIED JUNCTION,2943658.0,Restaurant,Risk 1 (High),6349 N CLAREMONT AVE,CHICAGO,IL,60659.0,08/29/2025,Canvass Re-Inspection,No Entry,"38. INSECTS, RODENTS, & ANIMALS NOT PRESENT - ...",41.99738,-87.688504,"(41.99737973336337, -87.68850409776527)"
1,2623084,KISHA'S KITCHEN,KISHA'S KITCHEN,3006551.0,Restaurant,Risk 1 (High),857 W 115TH ST,CHICAGO,IL,60643.0,08/29/2025,Complaint,Fail,2. CITY OF CHICAGO FOOD SERVICE SANITATION CER...,41.684943,-87.644369,"(41.68494313726917, -87.644368901148)"
2,2623074,BIG SAMS FOOD MART,BIG SAMS FOOD MART,2817600.0,Grocery Store,Risk 1 (High),133 E 75TH ST,CHICAGO,IL,60619.0,08/29/2025,Complaint,Pass w/ Conditions,10. ADEQUATE HANDWASHING SINKS PROPERLY SUPPLI...,41.758208,-87.620088,"(41.75820844474927, -87.62008844011588)"
3,2623081,FOOD TOWN,FOOD TOWN,31219.0,Grocery Store,Risk 2 (Medium),935 E 79TH ST,CHICAGO,IL,60619.0,08/29/2025,Canvass Re-Inspection,Pass,"38. INSECTS, RODENTS, & ANIMALS NOT PRESENT - ...",41.751218,-87.601401,"(41.75121793394575, -87.60140141021087)"
4,2623078,JIMMY JOHNS,JIMMY JOHNS,3041206.0,Restaurant,Risk 1 (High),3696 S ARCHER AVE,CHICAGO,IL,60609.0,08/29/2025,License,Pass w/ Conditions,5. PROCEDURES FOR RESPONDING TO VOMITING AND D...,41.826519,-87.684205,"(41.826518542923324, -87.68420498239537)"


In [190]:
# Missing Values

count_missing_values = df.isna().sum().sort_values( ascending = False)
count_missing_values

missing_value_percentage = df.isna().mean().sort_values(ascending = False) * 100 

missing = pd.DataFrame({
    "count_missing_values": count_missing_values,
    "missing_value_percentage": missing_value_percentage
})
missing



Unnamed: 0,count_missing_values,missing_value_percentage
Violations,82449,27.834175
Facility Type,5240,1.768985
AKA Name,2413,0.814611
Longitude,1011,0.341306
Location,1011,0.341306
Latitude,1011,0.341306
City,159,0.053677
Risk,83,0.02802
State,58,0.01958
Zip,42,0.014179


In [191]:
# Duplicates

df.duplicated().sum()
duplicate_count = int(df.duplicated().sum())
print(f"Total duplicate rows: {duplicate_count}")


Total duplicate rows: 0


In [192]:
# Renaming Columns

df.columns = (
    df.columns.str.lower()
    .str.strip()
    .str.replace("#", "number")   # replace # first
    .str.replace(" ", "_")        # then handle spaces
    .str.replace(r"[^a-z0-9_]", "", regex=True)  # cleanup
)
df.columns




Index(['inspection_id', 'dba_name', 'aka_name', 'license_number',
       'facility_type', 'risk', 'address', 'city', 'state', 'zip',
       'inspection_date', 'inspection_type', 'results', 'violations',
       'latitude', 'longitude', 'location'],
      dtype='object')

In [193]:
df.dtypes

df["license_number"] = df["license_number"].astype(str)
df["zip"] = df["zip"].astype(str)
df["inspection_id"] = df["inspection_id"].astype(str)
df["inspection_date"] = pd.to_datetime(df["inspection_date"], errors= "coerce")

# Categorical columns (object, string, category)
categorical_cols = df.select_dtypes(include=["object", "string", "category", "datetime64[ns]"]).columns.tolist()

# Numerical columns (int, float)
numerical_cols = df.select_dtypes(include=["int64", "float64", "Int64"]).columns.tolist()

print("Categorical columns:")
print(categorical_cols)

print("\nNumerical columns:")
print(numerical_cols, "\n")



Categorical columns:
['inspection_id', 'dba_name', 'aka_name', 'license_number', 'facility_type', 'risk', 'address', 'city', 'state', 'zip', 'inspection_date', 'inspection_type', 'results', 'violations', 'location']

Numerical columns:
['latitude', 'longitude'] 



In [194]:
df.head()

Unnamed: 0,inspection_id,dba_name,aka_name,license_number,facility_type,risk,address,city,state,zip,inspection_date,inspection_type,results,violations,latitude,longitude,location
0,2623075,HEALTHY WALA TIFFIN / FRIED JUNCTION,FRIED JUNCTION,2943658.0,Restaurant,Risk 1 (High),6349 N CLAREMONT AVE,CHICAGO,IL,60659.0,2025-08-29,Canvass Re-Inspection,No Entry,"38. INSECTS, RODENTS, & ANIMALS NOT PRESENT - ...",41.99738,-87.688504,"(41.99737973336337, -87.68850409776527)"
1,2623084,KISHA'S KITCHEN,KISHA'S KITCHEN,3006551.0,Restaurant,Risk 1 (High),857 W 115TH ST,CHICAGO,IL,60643.0,2025-08-29,Complaint,Fail,2. CITY OF CHICAGO FOOD SERVICE SANITATION CER...,41.684943,-87.644369,"(41.68494313726917, -87.644368901148)"
2,2623074,BIG SAMS FOOD MART,BIG SAMS FOOD MART,2817600.0,Grocery Store,Risk 1 (High),133 E 75TH ST,CHICAGO,IL,60619.0,2025-08-29,Complaint,Pass w/ Conditions,10. ADEQUATE HANDWASHING SINKS PROPERLY SUPPLI...,41.758208,-87.620088,"(41.75820844474927, -87.62008844011588)"
3,2623081,FOOD TOWN,FOOD TOWN,31219.0,Grocery Store,Risk 2 (Medium),935 E 79TH ST,CHICAGO,IL,60619.0,2025-08-29,Canvass Re-Inspection,Pass,"38. INSECTS, RODENTS, & ANIMALS NOT PRESENT - ...",41.751218,-87.601401,"(41.75121793394575, -87.60140141021087)"
4,2623078,JIMMY JOHNS,JIMMY JOHNS,3041206.0,Restaurant,Risk 1 (High),3696 S ARCHER AVE,CHICAGO,IL,60609.0,2025-08-29,License,Pass w/ Conditions,5. PROCEDURES FOR RESPONDING TO VOMITING AND D...,41.826519,-87.684205,"(41.826518542923324, -87.68420498239537)"


In [195]:
"""
Categorical Variable Exploration & Cleaning

Here we explore categorical columns (facility_type, risk, results)
to identify inconsistent values, typos, and casing issues.
We apply normalization steps such as title-casing and whitespace cleanup
to make categories consistent and ready for analysis.
"""
# facility_type

df["facility_type"] = (
    df["facility_type"]
    .str.lower()
    .str.title()
    .str.replace(r"S\b", "'s", regex = True)
    .str.replace(r"'", "", regex = True)
)
print(df["facility_type"].value_counts().head(10), "\n")
print(df["facility_type"].value_counts().tail(10))




facility_type
Restaurant                         200211
Grocery Store                       36095
School                              18503
Childrens Services Facility          6939
Bakery                               4259
Daycare Above And Under 2 Years      3939
Daycare (2 - 6 Years)                3190
Long Term Care                       2291
Catering                             1835
Liquor                               1270
Name: count, dtype: int64 

facility_type
Grocery/Liquor Store                     1
Protein Shake Bar                        1
Pop-Up Food Establishment User-Tier I    1
Gas/Mini Mart                            1
Day Spa                                  1
Mobile Dessert Vendor                    1
Cat/Liquor                               1
Mobile Dessert Cart                      1
Book Store                               1
Car Wash                                 1
Name: count, dtype: int64


In [196]:
# risk

df["risk"].value_counts()
df[df["risk"] == "All"]

df.loc[df["risk"] == "All", "risk"] = pd.NA

df["risk"].value_counts(dropna = False)




risk
Risk 1 (High)      219081
Risk 2 (Medium)     53589
Risk 3 (Low)        23387
NaN                    83
<NA>                   75
Name: count, dtype: int64

In [197]:
# results

print(df["results"].value_counts().head(), "\n")
print(df["results"].value_counts().tail())

results_mapping = {
    "Pass": "Pass-related",
    "Pass w/ Conditions": "Pass-related",
    "Fail": "Fail-related",
    "Fail w/ Conditions": "Fail-related",
    "Out of Business": "Other/Not Inspected",
    "No Entry": "Other/Not Inspected",
    "Not Ready": "Other/Not Inspected",
    "Business Not Located": "Other/Not Inspected"

}

df["results_grouped"] = df["results"].map(results_mapping)

df["results_grouped"].value_counts(dropna=False)


results
Pass                  153013
Fail                   57337
Pass w/ Conditions     44413
Out of Business        24578
No Entry               12746
Name: count, dtype: int64 

results
Pass w/ Conditions      44413
Out of Business         24578
No Entry                12746
Not Ready                4035
Business Not Located       93
Name: count, dtype: int64


results_grouped
Pass-related           197426
Fail-related            57337
Other/Not Inspected     41452
Name: count, dtype: int64

In [198]:
# Numerical Cleanup

print(df[["latitude", "longitude"]])
print(df[["latitude", "longitude"]].describe())

missing_coord = df[df["latitude"].isna() | df["longitude"].isna()]
print("Missing coords: ", len(missing_coord))

df = df.dropna(subset= ["latitude", "longitude"])

print("After cleanup:", df.shape)


         latitude  longitude
0       41.997380 -87.688504
1       41.684943 -87.644369
2       41.758208 -87.620088
3       41.751218 -87.601401
4       41.826519 -87.684205
...           ...        ...
296210  41.884586 -87.631010
296211  41.898431 -87.628009
296212  41.938443 -87.768318
296213  41.938007 -87.644755
296214        NaN        NaN

[296215 rows x 2 columns]
            latitude      longitude
count  295204.000000  295204.000000
mean       41.880592     -87.676392
std         0.081087       0.058336
min        41.644670     -87.906874
25%        41.831302     -87.707598
50%        41.891797     -87.666524
75%        41.939753     -87.634955
max        42.021064     -87.525094
Missing coords:  1011
After cleanup: (295204, 18)


In [199]:
# Define Chicago bounding box

lat_min, lat_max = 41.6, 42.1
lon_min, lon_max = -88.0, -87.5

df  = df[df["latitude"].between(lat_min, lat_max) & df["longitude"].between(lon_min, lon_max)]
print("After bounding box filter:", df.shape)


After bounding box filter: (295204, 18)


In [200]:
# --- State Cleanup ---

df["state"].value_counts()

state
IL    295146
Name: count, dtype: int64

In [201]:
# --- City Cleanup ---

# Normalize to uppercase so we don't get 'Chicago', 'CHICAGO', 'chicago' separately
df["city"] = df["city"].str.upper()

# Quick look at counts
df["city"].value_counts().head(15)

# Map obvious typos back to CHICAGO
city_dict = {
    "CCHICAGO": "CHICAGO",
    "CHICAGOCHICAGO": "CHICAGO",
    "CHICAGOO": "CHICAGO",
    "CHICAGO.": "CHICAGO",
    "CHCHICAGO": "CHICAGO",
    "312CHICAGO": "CHICAGO",
    "CHICAGOI": "CHICAGO",
    "CHCICAGO": "CHICAGO",
    "CHICAGOC": "CHICAGO"
}
df["city"] = df["city"].replace(city_dict)

# Keep only Chicago rows (dropping suburbs + weird values)
before = df.shape[0]
df = df[df["city"] == "CHICAGO"]
after = df.shape[0]

print(f"Dropped {before - after} rows that weren’t Chicago.")
df.head()






Dropped 179 rows that weren’t Chicago.


Unnamed: 0,inspection_id,dba_name,aka_name,license_number,facility_type,risk,address,city,state,zip,inspection_date,inspection_type,results,violations,latitude,longitude,location,results_grouped
0,2623075,HEALTHY WALA TIFFIN / FRIED JUNCTION,FRIED JUNCTION,2943658.0,Restaurant,Risk 1 (High),6349 N CLAREMONT AVE,CHICAGO,IL,60659.0,2025-08-29,Canvass Re-Inspection,No Entry,"38. INSECTS, RODENTS, & ANIMALS NOT PRESENT - ...",41.99738,-87.688504,"(41.99737973336337, -87.68850409776527)",Other/Not Inspected
1,2623084,KISHA'S KITCHEN,KISHA'S KITCHEN,3006551.0,Restaurant,Risk 1 (High),857 W 115TH ST,CHICAGO,IL,60643.0,2025-08-29,Complaint,Fail,2. CITY OF CHICAGO FOOD SERVICE SANITATION CER...,41.684943,-87.644369,"(41.68494313726917, -87.644368901148)",Fail-related
2,2623074,BIG SAMS FOOD MART,BIG SAMS FOOD MART,2817600.0,Grocery Store,Risk 1 (High),133 E 75TH ST,CHICAGO,IL,60619.0,2025-08-29,Complaint,Pass w/ Conditions,10. ADEQUATE HANDWASHING SINKS PROPERLY SUPPLI...,41.758208,-87.620088,"(41.75820844474927, -87.62008844011588)",Pass-related
3,2623081,FOOD TOWN,FOOD TOWN,31219.0,Grocery Store,Risk 2 (Medium),935 E 79TH ST,CHICAGO,IL,60619.0,2025-08-29,Canvass Re-Inspection,Pass,"38. INSECTS, RODENTS, & ANIMALS NOT PRESENT - ...",41.751218,-87.601401,"(41.75121793394575, -87.60140141021087)",Pass-related
4,2623078,JIMMY JOHNS,JIMMY JOHNS,3041206.0,Restaurant,Risk 1 (High),3696 S ARCHER AVE,CHICAGO,IL,60609.0,2025-08-29,License,Pass w/ Conditions,5. PROCEDURES FOR RESPONDING TO VOMITING AND D...,41.826519,-87.684205,"(41.826518542923324, -87.68420498239537)",Pass-related


In [214]:
# --- ZIP Cleanup ---

# Convert to string, drop decimals (e.g. 60610.0 → 60610)
df["zip"] = df["zip"].astype(str).str.split(".", n=1, expand=True)[0]

# Check lengths (expect mostly 5 digits)
df["zip"].str.len().value_counts()

# Keep only 5-digit ZIPs
df = df[df["zip"].str.len() == 5]

# Keep only Chicago ZIPs
df = df[df["zip"].astype(int).between(60601, 60827)]

# ZIPs by count
print(df["zip"].value_counts().sort_index().tail(15))




zip
60649     3467
60651     4326
60652     2611
60653     2524
60654     6562
60655     1299
60656     1171
60657    10082
60659     6213
60660     4426
60661     3829
60666     3702
60706        3
60707     1716
60827      212
Name: count, dtype: int64
