In [1]:
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt
import joblib

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score, roc_curve


In [2]:
df = pl.read_csv("Rodent_Inspection_20250410.csv")

df = df.select(["BOROUGH", "INSPECTION_DATE", "RESULT"])

df.head()

BOROUGH,INSPECTION_DATE,RESULT
str,str,str
"""Bronx""","""08/30/2010 03:23:11 PM""","""Passed"""
"""Manhattan""","""08/18/2011 12:05:54 PM""","""Passed"""
"""Brooklyn""","""10/10/2018 12:57:02 PM""","""Passed"""
"""Manhattan""","""02/07/2019 12:48:34 PM""","""Passed"""
"""Bronx""","""10/16/2017 01:02:51 PM""","""Rat Activity"""


In [3]:
pdf = df.to_pandas()

# Parse date
pdf["INSPECTION_DATE"] = pd.to_datetime(pdf["INSPECTION_DATE"], errors="coerce")

# Drop bad dates
pdf = pdf[
    (pdf["INSPECTION_DATE"].dt.year >= 2010) &
    (pdf["INSPECTION_DATE"].dt.year <= 2025)
]

# Create target
pdf["target"] = (pdf["RESULT"] == "Rat Activity").astype(int)

# Create two separate datasets
pdf_month = pdf.copy()
pdf_year = pdf.copy()

# For monthly model: extract month
pdf_month["INSPECTION_MONTH"] = pdf_month["INSPECTION_DATE"].dt.month
pdf_month = pdf_month[["BOROUGH", "INSPECTION_MONTH", "target"]].dropna()

# For yearly model: extract year
pdf_year["INSPECTION_YEAR"] = pdf_year["INSPECTION_DATE"].dt.year
pdf_year = pdf_year[["BOROUGH", "INSPECTION_YEAR", "target"]].dropna()

  pdf["INSPECTION_DATE"] = pd.to_datetime(pdf["INSPECTION_DATE"], errors="coerce")


In [4]:
# Monthly
X_month = pdf_month.drop(columns="target")
y_month = pdf_month["target"]

X_train_month, X_test_month, y_train_month, y_test_month = train_test_split(
    X_month, y_month, test_size=0.3, random_state=1, stratify=y_month
)

# Yearly
X_year = pdf_year.drop(columns="target")
y_year = pdf_year["target"]

X_train_year, X_test_year, y_train_year, y_test_year = train_test_split(
    X_year, y_year, test_size=0.3, random_state=1, stratify=y_year
)

In [5]:
# Preprocessing for monthly model
preprocessor_month = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), ["BOROUGH", "INSPECTION_MONTH"])
])

# Preprocessing for yearly model
preprocessor_year = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), ["BOROUGH", "INSPECTION_YEAR"])
])

In [6]:
rf = RandomForestClassifier(class_weight="balanced", random_state=1, n_jobs=-1)

param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [10, 20, 30, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 5],
    "max_features": ["sqrt", "log2"]
}

# Monthly pipeline
search_month = RandomizedSearchCV(
    rf,
    param_distributions=param_grid,
    n_iter=20,
    scoring="roc_auc",
    cv=3,
    random_state=1,
    verbose=2,
    n_jobs=-1
)

pipe_month = Pipeline([
    ("preprocessor", preprocessor_month),
    ("classifier", search_month)
])

# Yearly pipeline
search_year = RandomizedSearchCV(
    rf,
    param_distributions=param_grid,
    n_iter=20,
    scoring="roc_auc",
    cv=3,
    random_state=1,
    verbose=2,
    n_jobs=-1
)

pipe_year = Pipeline([
    ("preprocessor", preprocessor_year),
    ("classifier", search_year)
])

In [8]:
# Fit month model
pipe_month.fit(X_train_month, y_train_month)

# Fit year model
pipe_year.fit(X_train_year, y_train_year)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time= 1.3min
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time= 1.3min
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time= 1.4min
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=5, min_samples_split=10, n_estimators=100; total time= 1.5min
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=5, min_samples_split=10, n_estimators=100; total time= 1.5min
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=5, min_samples_split=10, n_estimators=100; total time= 1.5min
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time= 2.6min
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estima

In [9]:
# pipe_month = joblib.load("rf_month_model.pkl")
# pipe_year = joblib.load("rf_year_model.pkl")

# Monthly model evaluation
y_pred_month = pipe_month.predict(X_test_month)
y_proba_month = pipe_month.predict_proba(X_test_month)[:, 1]

print("MONTHLY MODEL:")
print(classification_report(y_test_month, y_pred_month))
print("ROC AUC:", roc_auc_score(y_test_month, y_proba_month))

# Yearly model evaluation
y_pred_year = pipe_year.predict(X_test_year)
y_proba_year = pipe_year.predict_proba(X_test_year)[:, 1]

print("\nYEARLY MODEL:")
print(classification_report(y_test_year, y_pred_year))
print("ROC AUC:", roc_auc_score(y_test_year, y_proba_year))


MONTHLY MODEL:
              precision    recall  f1-score   support

           0       0.85      0.42      0.56    680408
           1       0.18      0.64      0.29    138577

    accuracy                           0.46    818985
   macro avg       0.52      0.53      0.42    818985
weighted avg       0.74      0.46      0.52    818985

ROC AUC: 0.5463170475161885

YEARLY MODEL:
              precision    recall  f1-score   support

           0       0.88      0.67      0.76    680408
           1       0.25      0.55      0.34    138577

    accuracy                           0.65    818985
   macro avg       0.56      0.61      0.55    818985
weighted avg       0.77      0.65      0.69    818985

ROC AUC: 0.6375531791900324


In [10]:
joblib.dump(pipe_month, "rf_month_model.pkl")
joblib.dump(pipe_year, "rf_year_model.pkl")

['rf_year_model.pkl']