# NYC 311 Service Requests Data Pipeline

*Library + jobs to fetch, cache, and analyze NYC 311 service request data.*

In [45]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# As Jobs

## LTP Pipeline

In [46]:
from pathlib import Path

# Per-user, cross-platform base under the home dir
# retrieve for one day only:
dir_base = Path.home() / "TASK_HBC_TSY"

%cd -q ..
!python -m hbc.jobs.dispatch \
  --job-name=job_poll_nyc_311 \
  --as-of=20091231 \
  --dir-base={dir_base} \
  --incremental=True \
  --log-level=INFO
%cd -q notebooks/

Log file: /Users/alexandershubert/TASK_HBC_TSY/LOGS/job_poll_nyc_311/job_poll_nyc_311_20251222183553.txt


In [47]:
# restore cache integrity for the last missing dates
%cd -q ..
!python -m hbc.jobs.dispatch  \
      --job-name=job_poll_nyc_311 \
      --as-of=20091231 \
      --incremental=false \
      --log-level=INFO \
      --last-missing-dates=5
%cd -q notebooks/

Log file: /var/folders/jj/dn25brln45j26cvj4y_lgbzr0000gn/T/hbc_nyc_dp/LOGS/job_poll_nyc_311/job_poll_nyc_311_20251222183600.txt


## Analytics

In [48]:
%cd -q ..
!python -m hbc.jobs.dispatch  \
      --job-name=job_analysis_nyc_311 \
      --as-of=20091231 \
      --log-level=INFO \
      --n-worst=10 \
      --n-best=10 \
      --n-days=10
%cd -q notebooks/

Log file: /var/folders/jj/dn25brln45j26cvj4y_lgbzr0000gn/T/hbc_nyc_dp/LOGS/job_analysis_nyc_311/job_analysis_nyc_311_20251222183625.txt


***

# As Library 

### Imports

In [49]:
import sys
from pathlib import Path

# Put the parent of the current working directory at the front of sys.path,
# unless it is already listed (presumably so the `hbc` package imported below
# resolves from the repository root — confirm the layout).
p = str(Path.cwd().parent)
if sys.path.count(p) == 0:
    sys.path.insert(0, p)

In [50]:
import pandas as pd
import os
import numpy as np
import logging
import matplotlib.pyplot as plt

In [51]:
import warnings
warnings.filterwarnings("ignore")

## API

In [52]:
from hbc import app_context, DataContainer, utils as ul

In [53]:
from hbc.quant.analysis import AnalyticalEngine
from hbc.quant.plots import PlotEngine

In [54]:
# Show the application context; the recorded output lists the as-of date and
# the analytics, base, cache and logging directories.
app_context

AppContext
as_of : 2025-12-22
dir_analytics: PosixPath('/var/folders/jj/dn25brln45j26cvj4y_lgbzr0000gn/T/hbc_nyc_dp/ANALYTICS'),
dir_base: PosixPath('/var/folders/jj/dn25brln45j26cvj4y_lgbzr0000gn/T/hbc_nyc_dp'),
dir_cache: PosixPath('/var/folders/jj/dn25brln45j26cvj4y_lgbzr0000gn/T/hbc_nyc_dp/CACHE'),
dir_logging: PosixPath('/var/folders/jj/dn25brln45j26cvj4y_lgbzr0000gn/T/hbc_nyc_dp/LOGS')

## Logging

In [55]:
# Three alternative logging set-ups, applied one after another.
# NOTE(review): every call passes reset_handlers=True, so presumably only the
# last configuration (console + file, INFO) stays in effect — confirm in
# ul.conf_log. The value of the last call is displayed as the cell output.

# Console-only (no file writes):
ul.conf_log(level=logging.DEBUG, console=True, file=False, reset_handlers=True)

# File-only (no console output at all):
ul.conf_log(level=logging.INFO, console=False, file=True, reset_handlers=True)

# Both:
ul.conf_log(level=logging.INFO, console=True, file=True, reset_handlers=True)

Log file: /var/folders/jj/dn25brln45j26cvj4y_lgbzr0000gn/T/hbc_nyc_dp/LOGS/hbc_job_generic.txt
Log file: /var/folders/jj/dn25brln45j26cvj4y_lgbzr0000gn/T/hbc_nyc_dp/LOGS/hbc_job_generic.txt


<RootLogger root (INFO)>

## DataContainer

In [56]:
# Container for the dataset named 'nyc_open_data_311_service_requests'.
dc = DataContainer('nyc_open_data_311_service_requests')

In [57]:
# Fetch without a filter; the recorded run retrieved 100 rows and logged a
# schema error about missing columns for this unfiltered sample.
dc.get()

2025-12-22 18:36:37 fetch_nycopen.py       85 INFO  root    : Fetched 100 rows
2025-12-22 18:36:37 base.py                52 INFO  root    : using validator: ValidatorNYCOpen311Service
2025-12-22 18:36:37 base.py                53 INFO  root    : cleaning...
2025-12-22 18:36:37 base.py                56 INFO  root    : normalizing...
2025-12-22 18:36:37 base.py                59 INFO  root    : validating...
2025-12-22 18:36:37 valid_nycopen.py      183 INFO  root    : Validation summary -> flagged 10 rows. closed_date before created_date: 4; closed_date set but status not closed: 5; incident_zip outside NYC range: 2; resolution_action_updated_date before created_date: 7
2025-12-22 18:36:37 base.py                62 INFO  root    : finalizing...
2025-12-22 18:36:37 container.py           73 ERROR root    : DataContainer nyc_open_data_311_service_requests does not adhere to schema. Missing columns: bridge_highway_direction, bridge_highway_name, bridge_highway_segment, garage_lot_name, l

In [58]:
# Fetch only the requests created on 2009-12-31: the compact date string is
# parsed and re-formatted by the project helpers before it goes into the filter.
created_iso = ul.date_as_iso_format(ul.str_as_date('20091231'))
dc.get(where=f"created_date = '{created_iso}' ")

2025-12-22 18:36:37 fetch_nycopen.py       56 INFO  root    : using pagination at fetching with page_size=10000 timeout=30
2025-12-22 18:36:38 fetch_nycopen.py       85 INFO  root    : Fetched 4155 rows
2025-12-22 18:36:38 base.py                52 INFO  root    : using validator: ValidatorNYCOpen311Service
2025-12-22 18:36:38 base.py                53 INFO  root    : cleaning...
2025-12-22 18:36:38 base.py                56 INFO  root    : normalizing...
2025-12-22 18:36:38 base.py                59 INFO  root    : validating...
2025-12-22 18:36:38 valid_nycopen.py      183 INFO  root    : Validation summary -> flagged 311 rows. closed_date before created_date: 52; closed_date set but status not closed: 89; incident_zip outside NYC range: 4; resolution_action_updated_date before created_date: 270
2025-12-22 18:36:38 base.py                62 INFO  root    : finalizing...


In [59]:
# Write the container's current data to the cache (a .csv.gz file in the
# recorded run).
dc.to_cache()

2025-12-22 18:36:39 cache.py               34 INFO  root    : Cached: /var/folders/jj/dn25brln45j26cvj4y_lgbzr0000gn/T/hbc_nyc_dp/CACHE/nyc_open_data_311_service_requests/20251222/nyc_open_data_311_service_requests.csv.gz


In [60]:
# Read the data back from the cache and preview the first rows.
dc.from_cache().head()

2025-12-22 18:36:39 cache.py               53 INFO  root    : Retrieved from cache: /var/folders/jj/dn25brln45j26cvj4y_lgbzr0000gn/T/hbc_nyc_dp/CACHE/nyc_open_data_311_service_requests/20251222/nyc_open_data_311_service_requests.csv.gz


Unnamed: 0,unique_key,created_date,closed_date,agency,agency_name,complaint_type,descriptor,incident_zip,intersection_street_1,intersection_street_2,...,ferry_direction,ferry_terminal_name,school_or_citywide_complaint,taxi_company_borough,vehicle_type,school_not_found,garage_lot_name,landmark,DROP_FLAG,DROP_REASON
0,15654551,2009-12-31 00:00:00,2009-12-31 00:00:00,DOT,Department of Transportation,Street Condition,Pothole,10472.0,EAST 182 STREET,SOUTHERN BOULEVARD,...,,,,,,,,,False,
1,15631023,2009-12-31 00:00:00,2011-03-10 00:00:00,DOT,Department of Transportation,Street Light Condition,Street Light Out,,86 ST,7 ST W,...,,,,,,,,,False,
2,15632945,2009-12-31 00:00:00,2009-12-31 00:00:00,DOT,Department of Transportation,Street Light Condition,Street Light Out,,WEBSTER AVE,168 ST E,...,,,,,,,,,True,resolution_action_updated_date before created_...
3,15632946,2009-12-31 00:00:00,2009-12-31 00:00:00,DOT,Department of Transportation,Street Light Condition,Street Light Cycling,,3 AVE,167 ST E,...,,,,,,,,,True,resolution_action_updated_date before created_...
4,15619185,2009-12-31 00:00:00,2010-02-17 00:00:00,TLC,Correspondence - Taxi and Limousine Commission,Taxi Compliment,Driver Compliment,,,,...,,,,,,,,,False,


In [61]:
# All cached dates joined into one comma-separated string.
', '.join(dc.all_cached_dates)

'20251222, 20091231, 20091230, 20091229, 20091228, 20091227, 20091226, 20091225, 20091224, 20091223, 20091222, 20091221, 20091220, 20091219, 20091218, 20091217, 20091216, 20091215, 20091214, 20091213, 20091212, 20091211, 20091210, 20091209, 20091208, 20091207, 20091206, 20091205, 20091204, 20091203, 20091202, 20091201, 20091130, 20091129, 20091128, 20091127, 20091126'

## Analytics

In [62]:
# Work on the frame currently held by the container.
df = dc.df

In [63]:
# Column accessor built from the frame; used below as cols.<column_name>.
cols = ul.cols_as_named_tuple(df)

In [64]:
# Keep only the rows whose DROP_FLAG is not set. The explicit copy makes `df`
# an independent frame, so adding columns to it later is not a write into a
# filtered slice (pandas' SettingWithCopyWarning, which this notebook silences
# with its global warnings filter).
df = df[~df[cols.DROP_FLAG]].copy()

In [65]:
# Whole days between creation and closing, stored as nullable Int64 so rows
# without a parseable closed date hold <NA>. `assign` returns a new frame
# instead of writing a column into a filtered slice, which avoids the
# chained-assignment pitfall and keeps the cell safe to re-run.
df = df.assign(
    hbc_days_to_close=(
        pd.to_datetime(df[cols.closed_date]) - pd.to_datetime(df[cols.created_date])
    ).dt.days.astype("Int64")
)
# Rebuild the column accessor so it also exposes the new column.
cols = ul.cols_as_named_tuple(df)

In [66]:
# Mask of requests whose days-to-close is exactly 0 (closed on the creation day).
m = df[cols.hbc_days_to_close] == 0
# Everything not closed on the same day.
# NOTE(review): the metric is nullable Int64, so the mask is <NA> where it is
# missing; presumably pandas treats <NA> as False when indexing, which would
# also drop rows without a closed date — confirm this is intended.
df_closed_not_same_day = df[~m]

In [67]:
# Draw a clustered map of the requests in df_closed_not_same_day, counting
# requests per location, and save it as an HTML file in the "plots" folder
# of the analytics directory. The output path is printed at the end.
plots_dir = ul.mk_dir(ul.get_dir_analytics() / "plots")
path = ul.path_to_str(plots_dir / "closed_requests_by_location.html")
_ = PlotEngine.plot_geo_map(
    df=df_closed_not_same_day,
    col_latitude=cols.latitude,
    col_longitude=cols.longitude,
    aggregation="count",
    round_precision=3,
    cluster=True,
    start_zoom=11,
    tiles="CartoDB positron",
    savepath=path,
)
print(path)

/var/folders/jj/dn25brln45j26cvj4y_lgbzr0000gn/T/hbc_nyc_dp/ANALYTICS/plots/closed_requests_by_location.html


In [68]:
# Days-to-close statistics grouped by agency (code and name), requesting the
# 10 best and 10 worst groups.
res = AnalyticalEngine.descriptive_stats(
    df=df_closed_not_same_day,
    col_metric=cols.hbc_days_to_close,
    group=[cols.agency, cols.agency_name],
    n_best=10,
    n_worst=10,
)


In [69]:
# Names of the result tables (recorded output: best, worst, median, mean).
res.keys()

dict_keys(['best', 'worst', 'median', 'mean'])

In [70]:
# Show the 'worst' table of the per-agency results.
res['worst']

Unnamed: 0_level_0,Unnamed: 1_level_0,hbc_days_to_close
agency,agency_name,Unnamed: 2_level_1
DCA,Department of Consumer Affairs,559
TLC,Taxi and Limousine Commission,539
DOT,Department of Transportation,434
DSNY,Department of Sanitation,421
DOB,Department of Buildings,296
DEP,Department of Environmental Protection,223
HPD,Department of Housing Preservation and Development,126
DOE,Central - Department of Education,119
DOHMH,Department of Health and Mental Hygiene,61
DPR,Department of Parks and Recreation,50


### 311 Call Center Inquiry dataset onboarding:

In [71]:
# Rebind `dc` to the dataset named 'nyc_open_data_311_call_center_inquiry';
# from here on `dc` no longer refers to the service-requests container.
dc = DataContainer('nyc_open_data_311_call_center_inquiry')

In [72]:
# Fetch without a filter (100 rows in the recorded run).
dc.get()

2025-12-22 18:36:42 fetch_nycopen.py       85 INFO  root    : Fetched 100 rows
2025-12-22 18:36:42 base.py                52 INFO  root    : using validator: ValidatorGeneric
2025-12-22 18:36:42 base.py                53 INFO  root    : cleaning...
2025-12-22 18:36:42 base.py                56 INFO  root    : normalizing...
2025-12-22 18:36:42 base.py                59 INFO  root    : validating...
2025-12-22 18:36:42 base.py                62 INFO  root    : finalizing...


In [73]:
# Fetch at most 250 inquiries for the NYPD agency. The filter is a plain
# string: it has no placeholders, so no f-string prefix is needed.
dc.get(where="agency='NYPD'", limit=250)

2025-12-22 18:36:43 fetch_nycopen.py       85 INFO  root    : Fetched 250 rows
2025-12-22 18:36:43 base.py                52 INFO  root    : using validator: ValidatorGeneric
2025-12-22 18:36:43 base.py                53 INFO  root    : cleaning...
2025-12-22 18:36:43 base.py                56 INFO  root    : normalizing...
2025-12-22 18:36:43 base.py                59 INFO  root    : validating...
2025-12-22 18:36:43 base.py                62 INFO  root    : finalizing...


In [74]:
# (rows, columns) of the fetched frame.
dc.df.shape

(250, 9)

In [75]:
# Preview the first rows of the NYPD inquiries.
dc.df.head()

Unnamed: 0,unique_id,date,time,date_time,agency,agency_name,inquiry_name,brief_description,call_resolution
0,100000002,2014-03-27T00:00:00.000,8:49:03 PM,2014-03-27T20:49:03.000,NYPD,New York City Police Department,Noise from Neighbor,Report a noisy neighbor.,CSMS SR
1,100000001,2014-03-27T00:00:00.000,3:03:22 PM,2014-03-27T15:03:22.000,NYPD,New York City Police Department,Hot Transfer 911,Requires immediate transfer to 911.,Hot Transfer 911
2,100011603,2014-03-27T00:00:00.000,6:39:52 PM,2014-03-27T18:39:52.000,NYPD,New York City Police Department,Lost or Found Property,Report lost or found property.,Information Provided
3,100011951,2014-03-27T00:00:00.000,4:08:20 PM,2014-03-27T16:08:20.000,NYPD,New York City Police Department,Division Information Provided,Hidden service for activity coding.,Information Provided
4,100012105,2014-03-28T00:00:00.000,1:13:07 AM,2014-03-28T01:13:07.000,NYPD,New York City Police Department,Noise from Club or Bar,Report noise from inside a club or bar.,CSMS SR


In [76]:
# NOTE(review): this repeats the previous preview cell unchanged and shows the
# same rows — likely redundant.
dc.df.head()

Unnamed: 0,unique_id,date,time,date_time,agency,agency_name,inquiry_name,brief_description,call_resolution
0,100000002,2014-03-27T00:00:00.000,8:49:03 PM,2014-03-27T20:49:03.000,NYPD,New York City Police Department,Noise from Neighbor,Report a noisy neighbor.,CSMS SR
1,100000001,2014-03-27T00:00:00.000,3:03:22 PM,2014-03-27T15:03:22.000,NYPD,New York City Police Department,Hot Transfer 911,Requires immediate transfer to 911.,Hot Transfer 911
2,100011603,2014-03-27T00:00:00.000,6:39:52 PM,2014-03-27T18:39:52.000,NYPD,New York City Police Department,Lost or Found Property,Report lost or found property.,Information Provided
3,100011951,2014-03-27T00:00:00.000,4:08:20 PM,2014-03-27T16:08:20.000,NYPD,New York City Police Department,Division Information Provided,Hidden service for activity coding.,Information Provided
4,100012105,2014-03-28T00:00:00.000,1:13:07 AM,2014-03-28T01:13:07.000,NYPD,New York City Police Department,Noise from Club or Bar,Report noise from inside a club or bar.,CSMS SR


In [77]:
# Fetch the inquiries dated 2010-01-03: the date string is parsed and
# re-formatted by the project helpers before it goes into the filter.
inquiry_day_iso = ul.date_as_iso_format(ul.str_as_date('2010-01-03'))
dc.get(where=f"date = '{inquiry_day_iso}'")

2025-12-22 18:36:43 fetch_nycopen.py       56 INFO  root    : using pagination at fetching with page_size=10000 timeout=30
2025-12-22 18:36:44 fetch_nycopen.py       85 INFO  root    : Fetched 14971 rows
2025-12-22 18:36:44 base.py                52 INFO  root    : using validator: ValidatorGeneric
2025-12-22 18:36:44 base.py                53 INFO  root    : cleaning...
2025-12-22 18:36:44 base.py                56 INFO  root    : normalizing...
2025-12-22 18:36:44 base.py                59 INFO  root    : validating...
2025-12-22 18:36:44 base.py                62 INFO  root    : finalizing...


In [78]:
# Preview the first rows fetched for 2010-01-03.
dc.df.head()

Unnamed: 0,unique_id,date,time,date_time,agency,agency_name,inquiry_name,brief_description,call_resolution
0,59855644,2010-01-03T00:00:00.000,12:52:55 AM,2010-01-03T00:52:55.000,NYPD,New York City Police Department,Noise from Neighbor,Report a noisy neighbor.,CSMS SR
1,59855645,2010-01-03T00:00:00.000,12:27:41 AM,2010-01-03T00:27:41.000,NYPD,New York City Police Department,Noise from Neighbor,Report a noisy neighbor.,CSMS SR
2,59855646,2010-01-03T00:00:00.000,12:34:13 AM,2010-01-03T00:34:13.000,NYPD,New York City Police Department,Noise from Neighbor,Report a noisy neighbor.,CSMS SR
3,59871947,2010-01-03T00:00:00.000,11:54:54 AM,2010-01-03T11:54:54.000,NYPD,New York City Police Department,Noise from Neighbor,Report a noisy neighbor.,CSMS SR
4,59873887,2010-01-03T00:00:00.000,11:00:39 PM,2010-01-03T23:00:39.000,NYPD,New York City Police Department,Noise from Neighbor,Report a noisy neighbor.,CSMS SR
