# NYC Inspections - part II
Data Preprocessing, ML Model, Insights & GenAI

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
from sklearn.metrics import confusion_matrix


sys.path.append('../src/')
from preprocess import Preprocess


pd.set_option('display.max_columns', None)

In [3]:
# Load the raw inspections extract (path is relative to the notebook's working directory).
df = pd.read_csv('../data/43nn-pn8j.csv')

In [4]:
# clean_data returns two frames: the row-level gradable inspections and `df_agg`,
# which the charts below describe as aggregated by camis and inspection_date.
df_gradable_inspection, df_agg = Preprocess().clean_data(df)

2025-07-24 22:44:01,051 - INFO - Starting preprocessing of data.
2025-07-24 22:44:01,066 - INFO - Initial sample size: (100000, 27)
2025-07-24 22:44:01,318 - INFO - Sample size after removing duplicates: (99998, 27)
2025-07-24 22:44:01,363 - INFO - Converted date columns: ['inspection_date', 'record_date', 'grade_date'] to datetime format.
2025-07-24 22:44:01,384 - INFO - Sample size after removing 1/1/1900 entries: (98293, 27)
2025-07-24 22:44:01,395 - INFO - 9 inspections took place during COVID-19 (March 17, 2020 - July 19, 2021)
2025-07-24 22:44:01,415 - INFO - Sample size after removing COVID-19 inspections: (98284, 27)
2025-07-24 22:44:01,528 - INFO - Sample size after filtering for gradable inspections: (89910, 27)
2025-07-24 22:44:01,532 - INFO - Computing grades based on scores. Initial grades: [nan 'B' 'A' 'C' 'N' 'Z']
2025-07-24 22:44:01,558 - INFO - Computed grades based on scores. Final grades: ['B' 'C' 'A']
2025-07-24 22:44:01,571 - INFO - Found 63 entries where the grade

In [5]:
def side_by_side_bar_chart(df1, df2, suplot_title1, suplot_title2, title):
    """Draw two grade-count bar charts next to each other in one figure.

    Parameters
    ----------
    df1, df2
        Tables exposing a 'grade' and a 'count' column; `df1` is drawn in the
        left panel (trace name 'Original'), `df2` in the right panel
        (trace name 'Corrected').
    suplot_title1, suplot_title2
        Titles of the left and right panels.
    title
        Title of the whole figure.

    Returns
    -------
    The plotly figure (not yet shown).
    """
    fig = make_subplots(rows=1, cols=2, subplot_titles=(suplot_title1, suplot_title2))

    panels = ((df1, 'Original'), (df2, 'Corrected'))
    for panel_col, (counts, trace_name) in enumerate(panels, start=1):
        bars = go.Bar(
            x=counts['grade'],
            y=counts['count'],
            name=trace_name,
            text=counts['count'],
            textposition='outside',
        )
        fig.add_trace(bars, row=1, col=panel_col)

    fig.update_layout(title_text=title, showlegend=False, height=500, width=900)

    # Same axis titles on both panels.
    for panel_col in (1, 2):
        fig.update_yaxes(title_text="Count", row=1, col=panel_col)
    for panel_col in (1, 2):
        fig.update_xaxes(title_text="Grade", row=1, col=panel_col)

    return fig

In [6]:
# Compare original and corrected grade distributions.
# Original grades: NaN is counted too and shown as its own 'Missing' bar.
grade_counts_raw_agg = (
    df_agg['original_grade']
    .value_counts(dropna=False)
    .reset_index()
    .set_axis(['grade', 'count'], axis=1)
)
grade_counts_raw_agg['grade'] = grade_counts_raw_agg['grade'].fillna('Missing')

# Computed grades, prepared the same way.
grade_counts_clean_agg = (
    df_agg['computed_grade']
    .value_counts(dropna=False)
    .reset_index()
    .set_axis(['grade', 'count'], axis=1)
)
grade_counts_clean_agg['grade'] = grade_counts_clean_agg['grade'].fillna('Missing')

fig = side_by_side_bar_chart(
    grade_counts_raw_agg,
    grade_counts_clean_agg,
    "Original Grade Distribution",
    "Corrected Grade Distribution",
    title="Gradable Inspections (aggregated by camis and inspection_date)",
)
fig.show()

In [7]:
# Grade counts per inspection type (one column per grade) plus the total number of rows per type.
grades_by_inspection_type = df_gradable_inspection.groupby('inspection_type')
(
    grades_by_inspection_type['grade']
    .value_counts()
    .unstack()
    .join(grades_by_inspection_type.size().rename('total'))
    .reset_index()
)

Unnamed: 0,inspection_type,A,B,C,N,Z,total
0,Cycle Inspection / Initial Inspection,18923.0,1.0,2.0,562.0,,53104
1,Cycle Inspection / Re-inspection,9047.0,4995.0,3203.0,10.0,1426.0,19452
2,Pre-permit (Operational) / Initial Inspection,3629.0,3.0,5.0,1811.0,,13585
3,Pre-permit (Operational) / Re-inspection,1599.0,942.0,653.0,13.0,449.0,3769


In [8]:
# Frequency of every violation code at row level (NaN is counted as well).
violation_code_counts = (
    df_gradable_inspection['violation_code']
    .value_counts(dropna=False)
    .reset_index()
    .set_axis(['violation_code', 'count'], axis=1)
)

fig = px.bar(
    violation_code_counts,
    x='violation_code',
    y='count',
    title='Violation Codes',
    labels={'violation_code': 'Violation Code', 'count': 'Count'},
)
# Treat codes as categories so they are not parsed as numbers or dates.
fig.update_xaxes(type='category')
fig.show()

In [9]:
# Distinct violation codes present in the row-level data (the result includes NaN).
df_gradable_inspection['violation_code'].unique()

array(['09B', '04K', '02B', '10J', '04M', '04L', '09E', '06F', '10H',
       '08A', nan, '06I', '10C', '09C', '05A', '10F', '10B', '02G', '05D',
       '10G', '06C', '04J', '06D', '02H', '10A', '10E', '06E', '04A',
       '09A', '04N', '06A', '08C', '05F', '03A', '04O', '04H', '28-06',
       '04D', '04E', '05H', '10D', '03B', '08B', '05E', '28-05', '04F',
       '04C', '02C', '02A', '05C', '02I', '06B', '03I', '10I', '04P',
       '05B', '06G', '03C', '02F', '03F', '06H', '02D', '09D', '03E',
       '28-07', '04B', '07A', '03D', '03G', '18-11', '04I'], dtype=object)

In [10]:
## Grades per Borough

# Number of aggregated inspections for every (borough, computed grade) pair.
grade_borough_counts = (
    df_agg
    .groupby(['boro', 'computed_grade'])
    .size()
    .reset_index(name='count')
)

grade_palette = {"A": "green", "B": "orange", "C": "red"}
axis_labels = {
    'boro': 'Borough',
    'count': 'Number of Grades',
    'computed_grade': 'Computed Grade',
}

fig = px.bar(
    grade_borough_counts,
    x='boro',
    y='count',
    color='computed_grade',
    barmode='group',
    title='Grade Distribution per Borough',
    labels=axis_labels,
    color_discrete_map=grade_palette,
)

fig.show()

In [11]:
# Preview the first rows of the aggregated frame.
df_agg.head()

Unnamed: 0,camis,inspection_date,inspection_type,original_grade,computed_grade,action,boro,cuisine_description,latitude,longitude,critical_flag,violation_code
0,30075445,2023-01-31,Cycle Inspection / Initial Inspection,,B,Establishment Closed by DOHMH. Violations were...,Bronx,Bakery Products/Desserts,40.848231,-73.855972,[Critical],[06D]
1,30075445,2023-08-22,Cycle Inspection / Re-inspection,A,A,Violations were cited in the following area(s).,Bronx,Bakery Products/Desserts,40.848231,-73.855972,[Not Critical],[08A]
2,30075445,2024-11-08,Cycle Inspection / Initial Inspection,A,A,Violations were cited in the following area(s).,Bronx,Bakery Products/Desserts,40.848231,-73.855972,[Not Critical],[10F]
3,30191841,2023-04-23,Cycle Inspection / Initial Inspection,A,A,Violations were cited in the following area(s).,Manhattan,Irish,40.767326,-73.98431,[Critical],[06E]
4,30191841,2024-11-20,Cycle Inspection / Initial Inspection,,B,Violations were cited in the following area(s).,Manhattan,Irish,40.767326,-73.98431,"[Not Critical, Not Critical]","[10F, 08A]"


In [12]:
# Binary feature: 1 when the inspection lists at least one violation code, 0 when the list is empty.
df_agg['violation_reported'] = df_agg['violation_code'].apply(lambda codes: int(len(codes) > 0))
df_agg['violation_reported'].value_counts()

violation_reported
1    53012
0      177
Name: count, dtype: int64

In [13]:
# Per inspection, count how many flags in `critical_flag` equal each label exactly.
for count_column, flag_label in (
    ('nr_critical_violations', 'Critical'),
    ('nr_not_critical_violations', 'Not Critical'),
):
    # Bind the label as a default argument so each lambda counts its own label.
    df_agg[count_column] = df_agg['critical_flag'].apply(
        lambda flags, wanted=flag_label: flags.count(wanted)
    )

In [14]:
# Calendar month of the inspection; one-hot encoded later as a categorical feature.
df_agg['month'] = pd.to_datetime(df_agg['inspection_date']).dt.month


In [15]:
# Violation Codes

# Some violation codes appear very rarely; they are collapsed into a single 'OTHER' category.
# NOTE(review): the comparison is strict, so a code needs MORE than `min_freq` rows to be kept.

min_freq = 10

is_frequent = violation_code_counts['count'] > min_freq
frequent_violation_codes = set(violation_code_counts.loc[is_frequent, 'violation_code'])


def replace_rare(codes):
    """Return `codes` with every code outside `frequent_violation_codes` replaced by 'OTHER'."""
    bucketed = []
    for code in codes:
        bucketed.append(code if code in frequent_violation_codes else 'OTHER')
    return bucketed


df_agg['filtered_codes'] = df_agg['violation_code'].apply(replace_rare)

In [16]:
# The modelling target: the risk category is simply the computed grade.
df_agg['risk_category'] = df_agg['computed_grade']

### Train/Test Split

In [17]:
#### Train test split

from sklearn.model_selection import train_test_split

# Columns that feed the model (features plus the `risk_category` target).
model_columns = [
    'boro',
    'month',
    'filtered_codes',
    'violation_reported',
    'risk_category',
    "nr_critical_violations",
    "nr_not_critical_violations",
]
df_agg_model = df_agg[model_columns].copy()

# 80/20 split, stratified on the target so each grade keeps its share in both sets.
train_df, test_df = train_test_split(
    df_agg_model,
    test_size=0.2,
    stratify=df_agg_model['risk_category'],
    random_state=42,
)

print(f"Train set size: {len(train_df)}")
print(f"Test set size: {len(test_df)}")



Train set size: 42551
Test set size: 10638


### Feature and Label Preprocessing

In [18]:
from sklearn.feature_extraction import DictVectorizer

def _codes_to_indicator_dict(codes):
    """Map a list of codes to {code: 1}, the input format DictVectorizer expects."""
    return dict.fromkeys(codes, 1)


train_df['violation_code_dict'] = train_df['filtered_codes'].apply(_codes_to_indicator_dict)
test_df['violation_code_dict'] = test_df['filtered_codes'].apply(_codes_to_indicator_dict)

# Fit the vocabulary on the training set only, then reuse it for the test set.
violation_code_vectorizer = DictVectorizer(sparse=True)
X_train_violation = violation_code_vectorizer.fit_transform(train_df['violation_code_dict'])

violation_feature_names = violation_code_vectorizer.feature_names_
print(f"Shape: {X_train_violation.shape}")
print(f"Feature names: {violation_feature_names}")
print(f"Number of features: {len(violation_feature_names)}")

X_test_violation = violation_code_vectorizer.transform(test_df['violation_code_dict'])

Shape: (42551, 59)
Feature names: ['02A', '02B', '02C', '02F', '02G', '02H', '02I', '03A', '03B', '03C', '03I', '04A', '04C', '04D', '04E', '04F', '04H', '04J', '04K', '04L', '04M', '04N', '04O', '04P', '05A', '05B', '05C', '05D', '05E', '05F', '05H', '06A', '06B', '06C', '06D', '06E', '06F', '06G', '08A', '08B', '08C', '09A', '09B', '09C', '09D', '09E', '10A', '10B', '10C', '10D', '10E', '10F', '10G', '10H', '10I', '10J', '28-05', '28-06', 'OTHER']
Number of features: 59


In [19]:
## One Hot encode: Borough & Month
from sklearn.preprocessing import OneHotEncoder

cat_features = ['boro', 'month']

# Fit the encoder on the training split only; the test split is transformed with the same categories.
ohe = OneHotEncoder(sparse_output=True)
X_train_cat = ohe.fit_transform(train_df[cat_features])

ohe_feature_names = ohe.get_feature_names_out()
print(f"Shape: {X_train_cat.shape}")
print(f"Feature names: {ohe_feature_names}")
print(f"Number of features: {len(ohe_feature_names)}")

X_test_cat = ohe.transform(test_df[cat_features])

Shape: (42551, 17)
Feature names: ['boro_Bronx' 'boro_Brooklyn' 'boro_Manhattan' 'boro_Queens'
 'boro_Staten Island' 'month_1' 'month_2' 'month_3' 'month_4' 'month_5'
 'month_6' 'month_7' 'month_8' 'month_9' 'month_10' 'month_11' 'month_12']
Number of features: 17


In [20]:
from scipy.sparse import hstack

# Numeric features, kept as single-column frames so they can be stacked beside the sparse blocks.
X_train_no_violation = train_df[['violation_reported']]
X_train_nr_critical_violations = train_df[['nr_critical_violations']]
X_train_nr_not_critical_violations = train_df[['nr_not_critical_violations']]

X_test_no_violation = test_df[['violation_reported']]
X_test_nr_critical_violations = test_df[['nr_critical_violations']]
X_test_nr_not_critical_violations = test_df[['nr_not_critical_violations']]

# Combine all features column-wise. Train and test must use the same block order.
train_blocks = [
    X_train_violation,
    X_train_cat,
    X_train_nr_critical_violations,
    X_train_nr_not_critical_violations,
    X_train_no_violation,
]
test_blocks = [
    X_test_violation,
    X_test_cat,
    X_test_nr_critical_violations,
    X_test_nr_not_critical_violations,
    X_test_no_violation,
]

X_train = hstack(train_blocks)
X_test = hstack(test_blocks)

In [21]:
# Encoding the target:

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

y_train = train_df['risk_category']
y_test = test_df['risk_category']

# The encoder learns the classes from the training labels; the test labels reuse that mapping.
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

# `classes_` gives the label for each encoded integer, in order.
labels_order = le.classes_
print(labels_order)
y_test_enc

['A' 'B' 'C']


array([0, 0, 2, ..., 1, 2, 0], shape=(10638,))

In [22]:
# Model Training

from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameter space sampled by the randomized search.
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

# class_weight='balanced' re-weights classes inversely to their frequency.
rf = RandomForestClassifier(random_state=42, class_weight='balanced')

# random_state is set on the search as well: without it the 20 sampled candidates
# (and therefore the reported best parameters) change on every run.
random_search = RandomizedSearchCV(
    rf, param_dist, n_iter=20, cv=5,
    scoring='f1_macro', verbose=1, n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, y_train_enc)
best_model = random_search.best_estimator_

print("Best parameters found:", random_search.best_params_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best parameters found: {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 20}


In [23]:
import joblib
import json

#                               ******Uncomment this cell to save the model and its parameters: ******

# #save the best estimator
# joblib.dump(best_model, '../models/best_random_forest_model.pkl')

# #save parameters 
# best_params = best_model.get_params()

# with open('../models/best_rf_params.json', 'w') as f:
#     json.dump(best_params, f, indent=2)

In [24]:
from sklearn.metrics import classification_report

# Predict on the held-out set and map the encoded classes back to grade letters.
y_test_pred_enc = best_model.predict(X_test)
y_test_pred = le.inverse_transform(y_test_pred_enc)

test_report = classification_report(y_test, y_test_pred)
print(test_report)

              precision    recall  f1-score   support

           A       0.71      0.74      0.72      5447
           B       0.36      0.35      0.36      2728
           C       0.58      0.54      0.56      2463

    accuracy                           0.59     10638
   macro avg       0.55      0.54      0.55     10638
weighted avg       0.59      0.59      0.59     10638



In [25]:
# Accuracy, Macro F1, Confusion Matrix
from sklearn.metrics import accuracy_score, f1_score

y_train_pred_enc = best_model.predict(X_train)
y_train_pred = le.inverse_transform(y_train_pred_enc)

# Report the same two metrics for both splits; comparing them shows how much the model overfits.
for split_name, y_true, y_pred in (
    ("Train", y_train, y_train_pred),
    ("Test", y_test, y_test_pred),
):
    accuracy = accuracy_score(y_true, y_pred)
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    print(f"Performance {split_name} Set:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Macro F1 Score: {macro_f1:.4f}")

Performance Train Set:
Accuracy: 0.6566
Macro F1 Score: 0.6261
Performance Test Set:
Accuracy: 0.5918
Macro F1 Score: 0.5451


In [26]:
# Rows are the actual grades, columns the predicted grades, both in `labels_order`.
cm = confusion_matrix(y_test, y_test_pred, labels=labels_order)

heatmap_axis_labels = dict(x="Predicted", y="Actual", color="Count")
fig = px.imshow(
    cm,
    x=labels_order,
    y=labels_order,
    labels=heatmap_axis_labels,
    text_auto=True,
    color_continuous_scale='Reds',
    title="Confusion Matrix",
)
fig.update_layout(width=600, height=600)
fig.show()


##  Map high-risk by borough

In [27]:
# Share of aggregated inspections graded C (high risk) in each borough.

total_per_boro = df_agg['boro'].value_counts()

is_high_risk = df_agg['computed_grade'] == 'C'
total_per_boro_high_risk = df_agg[is_high_risk].groupby('boro').size()

# The division aligns both series on the borough index.
high_risk_proportion = total_per_boro_high_risk.div(total_per_boro).sort_values(ascending=False)
high_risk_proportion


boro
Queens           0.260768
Brooklyn         0.243276
Manhattan        0.218569
Bronx            0.198507
Staten Island    0.176877
dtype: float64

In [28]:
import plotly.express as px

# `high_risk_proportion` is an unnamed Series, so after reset_index its values sit in column 0.
high_risk_by_boro = high_risk_proportion.reset_index()

fig = px.bar(
    high_risk_by_boro,
    x='boro',
    y=0,
    color=0,
    color_continuous_scale='Reds',
    labels={'0': 'Proportion of High-Risk (Grade C)', 'boro': 'Borough'},
    title='Proportion of High-Risk Restaurants by Borough',
)

fig.show()


In [29]:
# Proportion of each computed grade within its borough.
grade_counts = df_agg.groupby(['boro', 'computed_grade']).size().reset_index(name='count')

# Borough totals are computed into their own frame instead of overwriting `total_per_boro`:
# reassigning that Series to a DataFrame made this cell non-idempotent and broke a
# re-run of the earlier high-risk proportion cell.
boro_totals = df_agg.groupby('boro').size().reset_index(name='boro_count')

grade_counts = grade_counts.merge(boro_totals, on='boro', how='left')

grade_counts['proportion'] = grade_counts['count'] / grade_counts['boro_count']
grade_counts

Unnamed: 0,boro,computed_grade,count,boro_count,proportion
0,Bronx,A,2497,4957,0.503732
1,Bronx,B,1476,4957,0.297761
2,Bronx,C,984,4957,0.198507
3,Brooklyn,A,7175,14054,0.510531
4,Brooklyn,B,3460,14054,0.246193
5,Brooklyn,C,3419,14054,0.243276
6,Manhattan,A,10524,19710,0.533942
7,Manhattan,B,4878,19710,0.247489
8,Manhattan,C,4308,19710,0.218569
9,Queens,A,6005,12444,0.482562


In [30]:
import pandas as pd
import plotly.express as px

# Proportion of each computed grade within its borough.
grade_counts = df_agg.groupby(['boro', 'computed_grade']).size().reset_index(name='count')

# Totals are derived directly from df_agg rather than by reshaping `total_per_boro` in place,
# so the cell gives the same result however many times it is run.
boro_totals = df_agg.groupby('boro').size().reset_index(name='boro_count')

grade_counts = grade_counts.merge(boro_totals, on='boro', how='left')

grade_counts['proportion'] = grade_counts['count'] / grade_counts['boro_count']

# Stacked bar plot: one bar per borough, one segment per grade.
fig = px.bar(
    grade_counts,
    x='boro',
    y='proportion',
    color='computed_grade',
    title='Proportion of Grades by Borough',
    labels={'boro': 'Borough', 'proportion': 'Proportion'},
    # The key must be the actual colour column; the previous 'grade' key matched no
    # column, so the requested A/B/C ordering was silently ignored.
    category_orders={'computed_grade': ['A', 'B', 'C']},
    color_discrete_map={
        'A': 'green',
        'B': 'orange',
        'C': 'red'
    }
)

fig.update_layout(
    barmode='stack',
    yaxis=dict(tickformat=".0%", title='Percentage'),
    xaxis_title='Borough'
)

fig.show()


In [31]:
# Keep only high-risk (grade C) inspections, then drop those that cannot be placed on a map.
is_grade_c = df_agg['computed_grade'] == 'C'

df_agg_filter_C = df_agg[is_grade_c]
print(f"Entries with High Risk (Grade C): {df_agg_filter_C.shape}")

df_agg_filter_C = df_agg_filter_C.dropna(subset=['latitude', 'longitude'])
print(f"Entries with High Risk (Grade C) (and latitute and longitude filled in): {df_agg_filter_C.shape}")

Entries with High Risk (Grade C): (12314, 18)
Entries with High Risk (Grade C) (and latitute and longitude filled in): (12305, 18)


In [32]:

# Mean coordinates of all inspections.
# NOTE(review): these two values are not used below; the map is centred on fixed coordinates.
center_latitude = df_agg['latitude'].mean()
center_longitude = df_agg['longitude'].mean()

# One point per grade C inspection, coloured by borough; hovering shows the violation codes.
fig = px.scatter_map(
    df_agg_filter_C,
    lat='latitude',
    lon='longitude',
    color='boro',
    labels={'boro': 'Borough'},
    hover_data=['violation_code'],
    title='High-Risk (Grade C) Restaurants Across NYC',
    map_style='carto-positron',
    zoom=9,
    height=600,
)

fig.update_layout(map_center=dict(lat=40.7, lon=-74))

fig.show()


In [33]:

# Only inspections with both coordinates can be plotted.
df_agg_all_grades = df_agg.dropna(subset=['latitude', 'longitude'])

grade_palette_map = {'A': 'green', 'B': 'orange', 'C': 'red'}

fig = px.scatter_map(
    df_agg_all_grades,
    lat='latitude',
    lon='longitude',
    color='computed_grade',
    color_discrete_map=grade_palette_map,
    hover_data=['boro'],
    opacity=0.5,
    map_style='carto-positron',
    height=700,
)

fig.update_layout(
    title='NYC Restaurant Grades',
    map_center=dict(lat=40.7, lon=-74),
    map_zoom=9,
)

fig.show()


## Takeaway: Violation codes associated with higher risk (grade C)

In [34]:
# Compare how often each violation code appears in grade C inspections versus A/B inspections.
is_grade_c = df_gradable_inspection['computed_grade'] == 'C'
c_violations = df_gradable_inspection.loc[is_grade_c, 'violation_code'].value_counts(normalize=True)
other_violations = df_gradable_inspection.loc[~is_grade_c, 'violation_code'].value_counts(normalize=True)

# Series division aligns on the UNION of both indexes, so a plain `c / other` yields NaN both
# for codes seen only in grade C and for codes seen only in A/B. Filling the grade C share
# with 0 makes A/B-only codes get a ratio of 0, leaving NaN to mean "never seen outside
# grade C" and nothing else.
all_codes = c_violations.index.union(other_violations.index)
comparison = (
    c_violations.reindex(all_codes, fill_value=0) / other_violations.reindex(all_codes)
).sort_values(ascending=False)

# Kept as a Series: later cells test membership against its index.
exclusive_to_c = comparison[comparison.isna()]
print(f"Violation codes that show up exclusively for High Risk (Grade C) inspections: {exclusive_to_c.index.tolist()}")

# 30 is an arbitrary placeholder that pushes the grade-C-only codes to the top of the ranking
# (assumes every finite ratio stays below 30 -- TODO confirm against the data).
comparison_filled = comparison.fillna(30)
comparison_filled = comparison_filled.sort_values(ascending=False)

Violation codes that show up exclusively for High Risk (Grade C) inspections: ['03D', '04B', '04I', '06H', '07A', '18-11', '28-07']


In [35]:
## Violation Codes most predictive of high risk

top_codes = comparison_filled.sort_values(ascending=False).head(15)

# First non-null description found for each violation code.
described_rows = df_gradable_inspection.dropna(subset=['violation_description'])
violation_map = (
    described_rows
    .drop_duplicates(subset=['violation_code'])
    .set_index('violation_code')['violation_description']
    .to_dict()
)

# Build the label, bar colour and hover text of every bar in a single pass.
labels, colors, hover_texts = [], [], []
for code, val in zip(top_codes.index, top_codes.values):
    description = violation_map.get(code, code)  # fall back to the code itself
    only_in_c = code in exclusive_to_c  # membership test against the Series index
    labels.append(description)
    colors.append('crimson' if only_in_c else 'salmon')
    detail = "**Only found in Grade C**" if only_in_c else f"Risk Ratio: {val:.2f}"
    hover_texts.append(f"{description}<br>Violation Code: {code}<br>" + detail)

risk_bars = go.Bar(
    x=top_codes.values,
    y=top_codes.index.tolist(),
    orientation='h',
    marker_color=colors,
    hovertext=hover_texts,
    hoverinfo='text',
)
fig = go.Figure(risk_bars)

fig.update_layout(
    title='Violation Codes Most Predictive of High Risk (Grade C)',
    xaxis_title='Risk Ratio (C vs A+B)',
    yaxis_title='Violation Code',
    yaxis=dict(type='category'),
)

fig.show()


## GenAI: Mistral


This section requires a valid Mistral API key. Ensure you have added one to `secrets.yaml`.

In [36]:
import yaml
from mistralai import Mistral
import re
import json

# Load secrets
with open("secrets.yaml", "r") as file:
    secrets = yaml.safe_load(file)

# `api_key` is always bound, even when the key is absent or the YAML file is empty
# (safe_load returns None for an empty file). Previously a missing key was reported and
# then the very next line failed with a NameError on the undefined `api_key`.
if isinstance(secrets, dict) and 'mistral_api_key' in secrets:
    api_key = secrets['mistral_api_key']
else:
    api_key = None
    print("`mistral_api_key` was not found in the secrets file.")

if not api_key:
    print("Ensure you fill in the `mistral_api_key` in secrets.yaml to successfully run this section of the notebook")


client = Mistral(api_key=api_key)

def call_llm(user_prompt: str, model ="mistral-large-latest"):
    """Send a single user message to the Mistral chat API and return the reply text.

    Parameters
    ----------
    user_prompt : str
        Content of the one user message sent to the model.
    model : str
        Name of the Mistral model to call.

    Returns
    -------
    The `content` of the first choice in the response.

    Raises
    ------
    Exception
        If the reply text cannot be read from the response object. Errors raised
        by the API call itself are not wrapped and propagate unchanged.
    """
    chat_messages = [{"role": "user", "content": user_prompt}]
    llm_response = client.chat.complete(model=model, messages=chat_messages)

    try:
        return llm_response.choices[0].message.content
    except Exception as e:
        raise Exception(f"Failure parsing LLM output: {e}")
    
#call_llm(user_prompt="Hello")

In [37]:
def get_top_violation_codes(df_agg, eval_date, boro, top_n = 3) -> set:
    """Return the `top_n` most frequent violation codes for one borough and month.

    Parameters
    ----------
    df_agg
        Frame with 'month_start', 'boro' and a list-valued 'violation_code' column.
    eval_date
        Value compared for equality against 'month_start'.
    boro
        Borough to keep.
    top_n : int
        Number of codes to return.

    Returns
    -------
    set
        The most frequent codes (unordered). Rows whose code is missing after
        exploding are dropped by the group-by.
    """
    in_scope = (df_agg['month_start'] == eval_date) & (df_agg['boro'] == boro)

    # One row per (inspection, violation code).
    exploded = df_agg[in_scope].explode("violation_code")

    code_counts = (
        exploded.groupby(["boro", "violation_code"])
        .size()
        .reset_index(name="count")
        .sort_values(by='count', ascending=False)
    )

    return set(code_counts['violation_code'].head(top_n))


def get_violation_description(violation_codes: set, df_gradable_inspection) -> list:
    """Look up a human-readable description for each violation code.

    For every code the first non-null 'violation_description' found in
    `df_gradable_inspection` is used. A code with no usable description falls back to
    the code itself, mirroring the `violation_map.get(code, code)` lookup used for the
    risk-ratio chart. Previously the first row was taken even when its description was
    NaN, and an unknown code raised IndexError.

    Parameters
    ----------
    violation_codes
        Iterable of violation codes.
    df_gradable_inspection
        Frame with 'violation_code' and 'violation_description' columns.

    Returns
    -------
    list
        One entry per input code, in iteration order.
    """
    first_description = (
        df_gradable_inspection
        .dropna(subset=['violation_description'])
        .drop_duplicates(subset=['violation_code'])
        .set_index('violation_code')['violation_description']
        .to_dict()
    )
    return [first_description.get(code, code) for code in violation_codes]

In [38]:
    
def extract_json_from_code_block(text):
    """Parse the JSON object contained in an LLM reply.

    The reply may wrap the object in a ```json fence, in a bare ``` fence, or return it
    with no fence at all; previously only the ```json form was accepted and every other
    reply yielded None.

    Parameters
    ----------
    text : str
        Raw reply text.

    Returns
    -------
    dict or None
        The parsed object, or None (after printing a message) when no JSON object is
        found or it cannot be parsed.
    """
    fenced = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL | re.IGNORECASE)
    if fenced:
        json_str = fenced.group(1)
    else:
        # No fence: fall back to the outermost {...} span in the reply.
        start, end = text.find('{'), text.rfind('}')
        if start == -1 or end < start:
            print("No JSON code block found.")
            return None
        json_str = text[start:end + 1]

    try:
        return json.loads(json_str)
    except ValueError as e:  # json.JSONDecodeError is a ValueError
        print(f"Failed to parse JSON: {e}")
        return None
    
def create_prompt(perc_increase, boro, top_n_violation_descriptions):
    """Build the LLM prompt asking for a public-health press release.

    Each argument is interpolated into the template as-is (via its string form):

    perc_increase: value reported as the increase in high-risk inspections.
    boro: borough the alert is about.
    top_n_violation_descriptions: the most common violations to mention.

    Returns the prompt string, which asks for a JSON answer with a single
    `press_release` key.
    """
    return f"""You work for the Public Health Department of NYC.
    Draft a public health department alert about a recent increase in high-risk restaurant violations in {boro}. The alert should mention:
    The percentage increase in restaurant inspections associated with high risk: {perc_increase}.
    The most common types of violations identified: {top_n_violation_descriptions}.
    A reminder to restaurant operators about safe practices
    A call to action for residents or business owners
    Tone should be professional and informative, suitable for a city department press release.
    Return your answer in json format with a single key `press_release`. Do not add additional keys.
    """

def get_monthly_alert(monthly_risk, eval_date,df_agg, df_gradable_inspection, threshold =10):
    """Generate one press-release alert per borough whose high-risk count rose sharply.

    Parameters
    ----------
    monthly_risk
        Frame with 'boro', 'month_start' and 'perc_change' columns.
    eval_date
        Month to evaluate; compared for equality against 'month_start'.
    df_agg
        Aggregated inspections, used to find the most common violation codes.
    df_gradable_inspection
        Row-level inspections, used to look up violation descriptions.
    threshold
        A borough triggers an alert when its 'perc_change' is strictly greater than this.

    Returns
    -------
    list
        The generated press-release texts. Empty when no borough exceeds the threshold
        (previously None was returned in that case) or when no reply could be parsed.
    """
    monthly_risk_filter_month = monthly_risk[monthly_risk["month_start"] == eval_date]
    rising_areas = monthly_risk_filter_month[monthly_risk_filter_month["perc_change"] > threshold]

    alerts = []
    if rising_areas.empty:
        print(f"No alerts on month {eval_date}")
        return alerts

    for _, row in rising_areas.iterrows():
        boro = row['boro']
        # Pass a formatted scalar: `row[['perc_change']]` is a Series, and its whole repr
        # (index label, name and dtype) used to be pasted into the prompt.
        perc_increase = f"{row['perc_change']:.1f}%"
        print(f"Generatig alert for: {boro} - {eval_date}")

        top_violation_codes = get_top_violation_codes(df_agg, eval_date, boro, top_n = 3)
        top_descriptions = get_violation_description(top_violation_codes, df_gradable_inspection)

        prompt = create_prompt(perc_increase, boro, top_descriptions)
        answer = call_llm(user_prompt=prompt, model ="mistral-small-2506")

        parsed_answer = extract_json_from_code_block(answer)
        # The parser returns None on malformed replies; skip instead of raising a TypeError.
        if not parsed_answer or 'press_release' not in parsed_answer:
            print(f"Skipping alert for {boro}: LLM response did not contain a `press_release` key.")
            continue

        alerts.append(parsed_answer['press_release'])

    return alerts
    

# Month-over-month relative change (in %) in the NUMBER of high-risk (grade C) inspections per borough.
# `month_start` is written onto df_agg itself because get_top_violation_codes filters on it.
df_agg["month_start"] = df_agg["inspection_date"].dt.to_period("M").dt.start_time

high_risk_rows = df_agg[df_agg["computed_grade"] == "C"]
monthly_risk = (
    high_risk_rows
    .groupby(["boro", "month_start"])
    .size()
    .reset_index(name="high_risk_count")
    .sort_values(by='month_start', ascending=True)
)

# pct_change compares each row with the previous row of the same borough; the first month
# of every borough has no predecessor and is set to 0.
# NOTE(review): a month with no grade C inspections has no row here, so the comparison then
# spans non-adjacent months.
monthly_risk["perc_change"] = (
    monthly_risk.groupby("boro")["high_risk_count"].pct_change().fillna(0) * 100
)

alerts = get_monthly_alert(
    monthly_risk,
    eval_date='2025-06-01',
    df_agg=df_agg,
    df_gradable_inspection=df_gradable_inspection,
)
alerts


Generatig alert for: Manhattan - 2025-06-01


2025-07-24 22:51:00,928 - INFO - HTTP Request: POST https://api.mistral.ai/v1/chat/completions "HTTP/1.1 200 OK"


['FOR IMMEDIATE RELEASE\n\nPUBLIC HEALTH DEPARTMENT ALERT: INCREASE IN HIGH-RISK RESTAURANT VIOLATIONS IN MANHATTAN\n\nThe New York City Public Health Department is issuing an alert regarding a significant increase in high-risk restaurant violations in Manhattan. Over the past reporting period, there has been a 34.8% increase in restaurant inspections associated with high-risk violations, posing potential health risks to the public.\n\nThe most common violations identified include:\n1. Non-food contact surfaces or equipment made of unacceptable material, not kept clean, or improperly sealed, raised, spaced, or movable to allow accessibility for cleaning.\n2. Establishments not free of harborage or conditions conducive to rodents, insects, or other pests.\n3. Cold time/temperature control for safety (TCS) food items held above required temperatures, including smoked or processed fish, raw eggs, or reduced oxygen packaged (ROP) TCS foods.\n\nRestaurant operators are reminded to adhere to