# Data Loading

In [None]:
!pip install sodapy pandas
import os

import pandas as pd
from sodapy import Socrata

Collecting sodapy
  Downloading sodapy-2.2.0-py2.py3-none-any.whl.metadata (15 kB)
Downloading sodapy-2.2.0-py2.py3-none-any.whl (15 kB)
Installing collected packages: sodapy
Successfully installed sodapy-2.2.0


In [None]:
# Read the Socrata app token from the environment rather than committing it to
# the notebook. Any token that was previously pasted here should be treated as
# exposed and rotated. When the variable is unset the client is created without
# a token; requests still work but are throttled more strictly.
APP_TOKEN = os.environ.get("SOCRATA_APP_TOKEN")
client = Socrata("data.cityofchicago.org", APP_TOKEN)

# Grab a tiny sample to prove the connection works.
results = client.get(
    "4ijn-s7e5",
    select="dba_name,address,inspection_date,results,violations",
    limit=5,
)
df = pd.DataFrame.from_records(results)
df.head()

Unnamed: 0,dba_name,address,inspection_date,results,violations
0,CHINA COURT RESTAURANT,1146 N MILWAUKEE AVE,2012-03-14T00:00:00.000,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...
1,CUDDLE CARE,4800 S LAKE PARK AVE,2012-10-22T00:00:00.000,Pass,31. CLEAN MULTI-USE UTENSILS AND SINGLE SERVIC...
2,CUDDLE CARE,4800 S LAKE PARK AVE,2012-10-22T00:00:00.000,Pass,31. CLEAN MULTI-USE UTENSILS AND SINGLE SERVIC...
3,SHARKS FISH & CHICKEN,101 E 51ST ST,2012-10-26T00:00:00.000,Pass,"34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GOO..."
4,SALAAM RESTAURANT AND BAKERY,700-706 W 79TH ST,2013-01-24T00:00:00.000,Pass,33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTENSI...


In [None]:
def search_inspections(name_query: str, limit: int = 50) -> pd.DataFrame:
    """Fetch inspections whose business name contains ``name_query``.

    The match is case-insensitive (both sides are upper-cased) and rows come
    back newest first.

    Args:
        name_query: Substring to look for in ``dba_name``.
        limit: Maximum number of rows to request.

    Returns:
        A DataFrame with the columns ``dba_name``, ``address``,
        ``inspection_date`` (parsed to datetime), ``results`` and
        ``violations``. When nothing matches, the frame is empty but still
        carries these columns.
    """
    # NOTE(review): a later cell defines `search_inspections` again; whichever
    # definition ran last is the one in effect.
    columns = ["dba_name", "address", "inspection_date", "results", "violations"]

    # SoQL string literals escape a single quote by doubling it. Without this,
    # a name such as "BARRACO'S" terminates the literal early and the query
    # fails (or is altered by whatever follows the quote).
    needle = name_query.upper().replace("'", "''")

    results = client.get(
        "4ijn-s7e5",
        where=f"upper(dba_name) LIKE '%{needle}%'",
        select=",".join(columns),
        order="inspection_date DESC",
        limit=limit,
    )

    # reindex guarantees the expected columns exist even for an empty result,
    # so the datetime conversion below cannot raise KeyError.
    df = pd.DataFrame.from_records(results).reindex(columns=columns)
    df["inspection_date"] = pd.to_datetime(df["inspection_date"])
    return df

# Example: up to ten of the newest inspections whose name contains "TACO"
df_taco = search_inspections(name_query="TACO", limit=10)
df_taco

Unnamed: 0,dba_name,address,inspection_date,results,violations
0,GOMEZ TACOS REST,3016 E 91ST ST,2025-04-16,Pass w/ Conditions,16. FOOD-CONTACT SURFACES: CLEANED & SANITIZED...
1,RAYMONDS TACOS LLC,229 S WESTERN AVE,2025-04-16,Out of Business,
2,EL BUEN TACO #4,2300 S THROOP ST,2025-04-16,Pass,37. FOOD PROPERLY LABELED; ORIGINAL CONTAINER ...
3,TACO FIRME,2640 W CHICAGO AVE,2025-04-16,Fail,47. FOOD & NON-FOOD CONTACT SURFACES CLEANABLE...
4,RAYMONDS TACOS LLC,229 S WESTERN AVE,2025-04-16,Pass,49. NON-FOOD/FOOD CONTACT SURFACES CLEAN - Com...
5,TACO MOTORA INC,2300 S THROOP ST,2025-04-15,Pass,37. FOOD PROPERLY LABELED; ORIGINAL CONTAINER ...
6,TACO PROS/EGG HOLIC,1400 E 47TH DR,2025-04-14,Pass,
7,TACO BELL #2513,6944 W ARCHER AVE,2025-04-14,Pass,49. NON-FOOD/FOOD CONTACT SURFACES CLEAN - Com...
8,TACO PROS,7108 S WESTERN AVE,2025-04-14,Fail,43. IN-USE UTENSILS: PROPERLY STORED - Comment...
9,PEPE TACO,11652 S WESTERN AVE,2025-04-14,Pass,39. CONTAMINATION PREVENTED DURING FOOD PREPAR...


In [None]:
def search_inspections(name_query: str, limit: int = 50) -> pd.DataFrame:
    """Fetch inspections by business name and return a presentation-ready frame.

    The name match is case-insensitive (both sides are upper-cased) and rows
    come back newest first. The four location fields are merged into a single
    ``Address`` column and the remaining columns are given display names.

    Args:
        name_query: Substring to look for in ``dba_name``.
        limit: Maximum number of rows to request.

    Returns:
        A DataFrame with the columns ``Business Name``, ``Facility Type``,
        ``Inspection Date`` (datetime), ``Inspection Type``, ``Risk Category``,
        ``Result``, ``Raw Violations`` and ``Address``. When nothing matches,
        the frame is empty but still carries these columns.
    """
    columns = [
        "dba_name", "facility_type", "address", "city", "state", "zip",
        "inspection_date", "inspection_type", "risk", "results", "violations",
    ]

    # SoQL string literals escape a single quote by doubling it. Without this,
    # a name such as "BARRACO'S" terminates the literal early and the query
    # fails (or is altered by whatever follows the quote).
    needle = name_query.upper().replace("'", "''")

    results = client.get(
        "4ijn-s7e5",
        where=f"upper(dba_name) LIKE '%{needle}%'",
        select=",".join(columns),
        order="inspection_date DESC",
        limit=limit,
    )

    # reindex guarantees the expected columns exist even for an empty result,
    # so the column lookups below cannot raise KeyError.
    df = pd.DataFrame.from_records(results).reindex(columns=columns)
    df["inspection_date"] = pd.to_datetime(df["inspection_date"])

    # "<address>, <city>, <state> <zip>". str.cat without na_rep leaves the
    # whole result missing for any row where one of the parts is missing.
    df["Address"] = (
        df["address"]
        .str.cat(df["city"], sep=", ")
        .str.cat(df["state"], sep=", ")
        .str.cat(df["zip"], sep=" ")
    )

    return (
        df
        .drop(columns=["address", "city", "state", "zip"])
        .rename(columns={
            "dba_name":        "Business Name",
            "facility_type":   "Facility Type",
            "inspection_date": "Inspection Date",
            "inspection_type": "Inspection Type",
            "risk":            "Risk Category",
            "results":         "Result",
            "violations":      "Raw Violations",
        })
    )

# Try it: up to ten of the newest inspections whose name contains "PIZZA"
df_sample = search_inspections(name_query="PIZZA", limit=10)
df_sample


Unnamed: 0,Business Name,Facility Type,Inspection Date,Inspection Type,Risk Category,Result,Raw Violations,Address
0,PIZZA CAPRI / ZIG ZAG KITCHEN,Restaurant,2025-04-17,License,Risk 1 (High),Pass,10. ADEQUATE HANDWASHING SINKS PROPERLY SUPPLI...,"3310 N ELSTON AVE, CHICAGO, IL 60618"
1,DOMINOS PIZZA,Restaurant,2025-04-14,Canvass,Risk 1 (High),Pass w/ Conditions,5. PROCEDURES FOR RESPONDING TO VOMITING AND D...,"2337 W CERMAK RD, CHICAGO, IL 60608"
2,BARRACO'S PIZZA AND THE VINEYARD BANQUETS,Restaurant,2025-04-10,Canvass,Risk 1 (High),Pass,37. FOOD PROPERLY LABELED; ORIGINAL CONTAINER ...,"2105-2121 W 95TH ST, CHICAGO, IL 60643"
3,WEST END TATA'S PIZZA,Restaurant,2025-04-09,Non-Inspection,Risk 1 (High),No Entry,,"3019 W 111TH ST, CHICAGO, IL 60655"
4,PEQUOD'S PIZZA,Restaurant,2025-04-09,Complaint,Risk 1 (High),Pass,36. THERMOMETERS PROVIDED & ACCURATE - Comment...,"2207 N CLYBOURN AVE, CHICAGO, IL 60614"
5,JETS PIZZA,Restaurant,2025-04-07,License,Risk 2 (Medium),Not Ready,,"749 W 31ST ST, CHICAGO, IL 60616"
6,DE ARCOS PIZZA & RESTAURANT,Restaurant,2025-04-07,Canvass,Risk 1 (High),Pass,39. CONTAMINATION PREVENTED DURING FOOD PREPAR...,"2832 E 87TH ST, CHICAGO, IL 60617"
7,VINIS PIZZA,Restaurant,2025-04-04,Canvass,Risk 1 (High),Pass,,"4009 W LAWRENCE AVE, CHICAGO, IL 60630"
8,LITTLE CAESARS PIZZA,Restaurant,2025-04-04,Complaint,Risk 2 (Medium),Pass w/ Conditions,"1. PERSON IN CHARGE PRESENT, DEMONSTRATES KNOW...","6940 S ASHLAND AVE, CHICAGO, IL 60636"
9,BARRACO'S PIZZA,Restaurant,2025-04-03,Canvass,Risk 1 (High),Pass,39. CONTAMINATION PREVENTED DURING FOOD PREPAR...,"3043-3047 W 111TH ST, CHICAGO, IL 60655"


In [None]:
def format_and_reorder(df: pd.DataFrame, drop_risk: bool = False) -> pd.DataFrame:
    """Format the date, optionally derive a risk level, and order the columns.

    The input frame is left untouched; all changes are made on a copy, so the
    function can be called repeatedly on the same frame.

    Args:
        df: Frame with at least ``Business Name``, ``Facility Type``,
            ``Address``, ``Inspection Date`` (datetime dtype),
            ``Inspection Type``, ``Raw Violations`` and ``Result``; also
            ``Risk Category`` unless ``drop_risk`` is true.
        drop_risk: When true, no ``Risk Level`` column is produced.

    Returns:
        A new DataFrame with ``Inspection Date`` rendered as ``MM/DD/YYYY``
        strings and columns in display order; ``Risk Level`` (the text inside
        the parentheses of ``Risk Category``) sits just before ``Result``
        unless ``drop_risk`` is true.
    """
    # Work on a copy: the original mutated the caller's frame, which turned
    # its date column into strings and made a second call fail on `.dt`.
    out = df.copy()
    out["Inspection Date"] = out["Inspection Date"].dt.strftime("%m/%d/%Y")

    cols = [
        "Business Name", "Facility Type", "Address",
        "Inspection Date", "Inspection Type", "Raw Violations",
    ]
    if not drop_risk:
        # "Risk 1 (High)" -> "High"; expand=False yields a Series.
        out["Risk Level"] = out["Risk Category"].str.extract(
            r"\((.*?)\)", expand=False
        )
        cols.append("Risk Level")
    cols.append("Result")
    return out[cols]


In [None]:
def clean_and_reorder(df: pd.DataFrame) -> pd.DataFrame:
    """Derive ``Inspection Frequency``, format the date, and order the columns.

    The input frame is left untouched; all changes are made on a copy, so the
    function can be called repeatedly on the same frame.

    Args:
        df: Frame with ``Business Name``, ``Facility Type``, ``Address``,
            ``Inspection Date`` (datetime dtype), ``Inspection Type``,
            ``Risk Category``, ``Raw Violations`` and ``Result``.

    Returns:
        A new DataFrame without ``Risk Category``, with ``Inspection
        Frequency`` holding the text inside the parentheses of
        ``Risk Category`` and ``Inspection Date`` rendered as ``MM/DD/YYYY``
        strings, columns in display order.
    """
    # Work on a copy: the original wrote both derived columns into the
    # caller's frame before rebinding, so a second call failed on `.dt`.
    out = df.copy()
    # "Risk 1 (High)" -> "High"; expand=False yields a Series.
    out["Inspection Frequency"] = out["Risk Category"].str.extract(
        r"\((.*?)\)", expand=False
    )
    out["Inspection Date"] = out["Inspection Date"].dt.strftime("%m/%d/%Y")

    # Selecting the display columns also drops "Risk Category".
    cols = [
        "Business Name", "Facility Type", "Address",
        "Inspection Date", "Inspection Type",
        "Inspection Frequency", "Raw Violations", "Result",
    ]
    return out[cols]


In [None]:
# Raw inspection-type value -> reader-friendly label. Values not listed here
# are passed through unchanged by humanize_inspection_type.
inspection_type_map = {
    "Canvass":                "Routine Inspection",
    "Consultation":           "Pre‑Opening Consultation",
    "Complaint":              "Complaint‑Driven Inspection",
    "License":                "Licensing Inspection",
    "Suspect Food Poisoning": "Food‑Poisoning Investigation",
    "Task-Force Inspection":  "Bar/Tavern Task‑Force Inspection"
}

def humanize_inspection_type(df: pd.DataFrame) -> pd.DataFrame:
    """Replace raw ``Inspection Type`` values with reader-friendly labels.

    A value is treated as a re-inspection when it starts with ``"Re-"`` or
    ends with ``" Re-Inspection"`` (any letter case). The marker is removed,
    the remaining base type is looked up in ``inspection_type_map`` (falling
    back to the base text itself), and a re-check suffix is appended.
    Non-string values such as NaN/None are passed through unchanged.

    Args:
        df: Frame containing an ``Inspection Type`` column.

    Returns:
        A copy of ``df`` with ``Inspection Type`` relabelled; the input frame
        is not modified.
    """
    prefix = "Re-"
    # The trailing form is how the source dataset appears to label
    # re-inspections (e.g. "Canvass Re-Inspection") -- TODO confirm against
    # the live `inspection_type` values.
    suffix = " re-inspection"

    def map_type(raw):
        # Missing values have no string methods; leave them as they are.
        if not isinstance(raw, str):
            return raw
        if raw.startswith(prefix):
            base, is_recheck = raw[len(prefix):], True
        elif raw.lower().endswith(suffix):
            base, is_recheck = raw[:-len(suffix)], True
        else:
            # No marker: keep the text intact. The original removed "Re-"
            # wherever it occurred, mangling values that merely contain it.
            base, is_recheck = raw, False
        label = inspection_type_map.get(base, base)
        return f"{label} (Re‑Check)" if is_recheck else label

    out = df.copy()
    out["Inspection Type"] = out["Inspection Type"].map(map_type)
    return out

# Pipeline example: fetch, tidy the columns, then humanize the inspection type.
df = (
    search_inspections("PIZZA", limit=10)
    .pipe(clean_and_reorder)
    .pipe(humanize_inspection_type)
)
df


Unnamed: 0,Business Name,Facility Type,Address,Inspection Date,Inspection Type,Inspection Frequency,Raw Violations,Result
0,PIZZA CAPRI / ZIG ZAG KITCHEN,Restaurant,"3310 N ELSTON AVE, CHICAGO, IL 60618",04/17/2025,Licensing Inspection,High,10. ADEQUATE HANDWASHING SINKS PROPERLY SUPPLI...,Pass
1,DOMINOS PIZZA,Restaurant,"2337 W CERMAK RD, CHICAGO, IL 60608",04/14/2025,Routine Inspection,High,5. PROCEDURES FOR RESPONDING TO VOMITING AND D...,Pass w/ Conditions
2,BARRACO'S PIZZA AND THE VINEYARD BANQUETS,Restaurant,"2105-2121 W 95TH ST, CHICAGO, IL 60643",04/10/2025,Routine Inspection,High,37. FOOD PROPERLY LABELED; ORIGINAL CONTAINER ...,Pass
3,WEST END TATA'S PIZZA,Restaurant,"3019 W 111TH ST, CHICAGO, IL 60655",04/09/2025,Non-Inspection,High,,No Entry
4,PEQUOD'S PIZZA,Restaurant,"2207 N CLYBOURN AVE, CHICAGO, IL 60614",04/09/2025,Complaint‑Driven Inspection,High,36. THERMOMETERS PROVIDED & ACCURATE - Comment...,Pass
5,JETS PIZZA,Restaurant,"749 W 31ST ST, CHICAGO, IL 60616",04/07/2025,Licensing Inspection,Medium,,Not Ready
6,DE ARCOS PIZZA & RESTAURANT,Restaurant,"2832 E 87TH ST, CHICAGO, IL 60617",04/07/2025,Routine Inspection,High,39. CONTAMINATION PREVENTED DURING FOOD PREPAR...,Pass
7,VINIS PIZZA,Restaurant,"4009 W LAWRENCE AVE, CHICAGO, IL 60630",04/04/2025,Routine Inspection,High,,Pass
8,LITTLE CAESARS PIZZA,Restaurant,"6940 S ASHLAND AVE, CHICAGO, IL 60636",04/04/2025,Complaint‑Driven Inspection,Medium,"1. PERSON IN CHARGE PRESENT, DEMONSTRATES KNOW...",Pass w/ Conditions
9,BARRACO'S PIZZA,Restaurant,"3043-3047 W 111TH ST, CHICAGO, IL 60655",04/03/2025,Routine Inspection,High,39. CONTAMINATION PREVENTED DURING FOOD PREPAR...,Pass


In [None]:
import pandas as pd

# 1. Ensure no NaNs in the source column
# NOTE(review): `df` here is whatever frame an earlier cell left behind, and it
# is rebuilt from a fresh query further down in this cell, so this fill does
# not carry over to the rebuilt frame by itself.
df["Raw Violations"] = df["Raw Violations"].fillna("")

# 2. Updated extractor
def extract_key_phrases(text, top_n: int = 5) -> str:
    """Return up to ``top_n`` distinct noun chunks of ``text``, comma-separated.

    Chunks are de-duplicated case-insensitively (after stripping whitespace)
    and kept in first-seen order with their original text.

    NOTE(review): ``nlp`` is not defined in this function or in any cell
    visible here; it is presumably a loaded spaCy pipeline (the code relies on
    ``doc.noun_chunks``) -- confirm it is created before this cell runs.

    Args:
        text: Text to analyse. Non-string or blank values give ``""``.
        top_n: Maximum number of phrases to return; zero or less gives ``""``.

    Returns:
        The selected phrases joined with ``", "``, or ``""`` when there is
        nothing to extract.
    """
    # Checking top_n up front fixes the case where top_n <= 0 still yielded
    # one phrase, because the limit was only tested after appending.
    if top_n <= 0 or not isinstance(text, str) or not text.strip():
        return ""

    doc = nlp(text)
    seen = set()
    phrases = []
    for chunk in doc.noun_chunks:
        key = chunk.text.strip().lower()
        if key in seen:
            continue
        seen.add(key)
        phrases.append(chunk.text)
        if len(phrases) >= top_n:
            break
    return ", ".join(phrases)

# 3. Re‑run the pipeline
# The frame is rebuilt from a fresh query here, so any NaN fill done on the
# previous frame is gone. Fill again after the rebuild so the column handed to
# the extractor really has no NaNs.
df = (
    search_inspections("PIZZA", limit=10)
    .pipe(clean_and_reorder)
    .pipe(humanize_inspection_type)
)
df["Raw Violations"] = df["Raw Violations"].fillna("")
df["Key Phrases"] = df["Raw Violations"].apply(extract_key_phrases)
df