# HBC_TSY_ENHANCED

# As Jobs

## Pipeline (load/transform/persist)

In [None]:
from pathlib import Path
import tempfile

# Working directories for this run, all located under the OS temp directory.
dir_base = Path(tempfile.gettempdir()) / "HBC_TASK_BASE_FOLDER"
dir_analytics = dir_base / "ANALYTICS"
dir_logging = dir_base / "LOGGING"

# Run the fetch job through the dispatcher from the parent directory
# (presumably the project root, so that `hbc` resolves as a module), then
# step back into notebooks/. IPython expands {dir_base}, {dir_analytics} and
# {dir_logging} from the Python variables defined above.
%cd -q ..
!python -m hbc.jobs.dispatch \
  --job-name=job_fetch_nyc_open_data_311_service_requests \
  --as-of=20091231 \
  --dir-base="{dir_base}" \
  --dir-analytics="{dir_analytics}" \
  --dir-logging="{dir_logging}" \
  --incremental=True \
  --log-level=DEBUG
%cd -q notebooks/


In [None]:
# Re-run the fetch job non-incrementally to restore cache integrity for the
# last missing dates (--last-missing-dates=10).
# NOTE(review): this cell passes --incremental=false while the pipeline cell
# above passes --incremental=True; confirm the dispatcher parses both
# spellings as booleans.
# NOTE(review): no --dir-base/--dir-analytics/--dir-logging are passed here,
# so presumably the dispatcher defaults apply; confirm they point at the same
# cache as the pipeline run above.
%cd -q ..
!python -m hbc.jobs.dispatch  \
      --job-name=job_fetch_nyc_open_data_311_service_requests \
      --as-of=20091231 \
      --incremental=false \
      --log-level=INFO \
      --last-missing-dates=10
%cd -q notebooks/

## Analytics

In [None]:
# Run the analysis job for the same as-of date through the dispatcher, again
# from the parent directory. The --n-worst/--n-best/--n-days values are
# forwarded as given; their exact meaning is defined by the job itself.
%cd -q ..
!python -m hbc.jobs.dispatch  \
      --job-name=job_analyse_nyc_open_data_311_service_requests \
      --as-of=20091231 \
      --log-level=INFO \
      --n-worst=10 \
      --n-best=10 \
      --n-days=10
%cd -q notebooks/

***

# As Library 

In [1]:
# Reload edited modules automatically so changes to imported code are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook.
%matplotlib inline

### Imports

In [2]:
import os
import sys
from pathlib import Path

In [3]:
# Put the parent of the current working directory at the front of sys.path,
# unless it is already listed, so that imports resolve from one level up.
p = os.fspath(Path.cwd().parent)
if sys.path.count(p) == 0:
    sys.path.insert(0, p)

import pandas as pd
import os
import numpy as np
import logging
import matplotlib.pyplot as plt

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices; confirm that is intended.
warnings.filterwarnings("ignore")

## API

In [4]:
from hbc import app_context, DataContainer, utils as ul
from hbc.quant.analysis import AnalyticalEngine
from hbc.quant.plots import PlotEngine

In [5]:
# Display the active application context (as-of date and working directories).
app_context

AppContext
as_of : 2025-12-27
dir_base: PosixPath('/var/folders/jj/dn25brln45j26cvj4y_lgbzr0000gn/T/hbc_nyc_dp'),
dir_analytics: PosixPath('/var/folders/jj/dn25brln45j26cvj4y_lgbzr0000gn/T/hbc_nyc_dp/ANALYTICS'),
dir_logging: PosixPath('/var/folders/jj/dn25brln45j26cvj4y_lgbzr0000gn/T/hbc_nyc_dp/LOGS')

## Logging

In [6]:
import logging
# Root logger handle; conf_log is asked for INFO level with console and file
# output and a reset of existing handlers (per its keyword arguments).
logger = logging.getLogger()
ul.conf_log(level=logging.INFO, console=True, file=True, reset_handlers=True)

Log file: /var/folders/jj/dn25brln45j26cvj4y_lgbzr0000gn/T/hbc_nyc_dp/LOGS/hbc_job_generic.txt


<RootLogger root (INFO)>

## Working on WebPortal

In [7]:
!ls /Users/alexandershubert/git/hbc_tsy_enhanced/hbc_configs/

nyc_open_data_311_call_center_inquiry.yaml
nyc_open_data_311_customer_satisfaction_survey.yaml
nyc_open_data_311_service_requests.yaml


In [8]:
# Config file names, one per 311 dataset, matching the directory listing above.
monikers = [
    f"nyc_open_data_311_{dataset}.yaml"
    for dataset in (
        "call_center_inquiry",
        "customer_satisfaction_survey",
        "service_requests",
    )
]

In [82]:
# Container bound to the 311 service-requests dataset.
dc = DataContainer('nyc_open_data_311_service_requests')

In [83]:
# Read cached rows filtered by city.
# NOTE(review): 'BROOKLYN2' matches no rows (the recorded run returned shape
# (0, 53)); confirm whether 'BROOKLYN' was intended or this is a deliberate
# empty-result check.
dc.from_cache(query="$filter=city eq 'BROOKLYN2'")

2025-12-27 22:55:57 container.py           58 INFO  root    : Retrieved dataFrame with shape=(0, 53)


In [74]:
# Group the cached rows by city; the recorded run returned 16 rows and an
# additional `count` column.
dc.from_cache(query="$apply=groupby((city))")

2025-12-27 22:38:19 container.py           58 INFO  root    : Retrieved dataFrame with shape=(16, 54)


In [81]:
# Show only the columns that carry data. After the group-by most columns are
# entirely empty, so drop the all-NaN ones. The default how="any" would also
# drop `city` itself, because it is missing for the rows without a city.
dc.df.dropna(axis=1, how="all")

Unnamed: 0,address_type,agency,agency_name,borough,bridge_highway_direction,bridge_highway_name,bridge_highway_segment,city,closed_date,community_board,...,status,street_name,taxi_company_borough,taxi_pick_up_location,unique_key,vehicle_type,x_coordinate_state_plane_,y_coordinate_state_plane_,count,hbc_unique_key
0,,,,,,,,,,,...,,,,,,,,,859,
1,,,,,,,,ASTORIA,,,...,,,,,,,,,1,
2,,,,,,,,BAYSIDE,,,...,,,,,,,,,1,
3,,,,,,,,BELLEROSE,,,...,,,,,,,,,2,
4,,,,,,,,BRONX,,,...,,,,,,,,,8,
5,,,,,,,,BROOKLYN,,,...,,,,,,,,,11,
6,,,,,,,,FOREST HILLS,,,...,,,,,,,,,1,
7,,,,,,,,FRESH MEADOWS,,,...,,,,,,,,,1,
8,,,,,,,,HEMPSTEAD,,,...,,,,,,,,,1,
9,,,,,,,,JACKSONVILLE,,,...,,,,,,,,,1,


## Data Container : nyc_open_data_311_customer_satisfaction_survey

In [35]:
# Container bound to the 311 customer-satisfaction-survey dataset.
dc = DataContainer("nyc_open_data_311_customer_satisfaction_survey")

In [36]:
# Fetch with no query (the recorded run fetched the first 100 rows and ran
# them through the validator), then show the resulting frame shape.
dc.get()
dc.df.shape

2025-12-27 22:16:35 fetch_nycopen.py       84 INFO  root    : Fetched 100 rows
2025-12-27 22:16:35 base.py                52 INFO  root    : using validator: ValidatorGeneric
2025-12-27 22:16:35 base.py                53 INFO  root    : cleaning...
2025-12-27 22:16:35 base.py                56 INFO  root    : normalizing...
2025-12-27 22:16:35 base.py                59 INFO  root    : validating...
2025-12-27 22:16:35 base.py                62 INFO  root    : dropping flagged rows...
2025-12-27 22:16:35 base.py                65 INFO  root    : finalizing...
2025-12-27 22:16:35 container.py           35 INFO  root    : Retrieved dataFrame with shape=(100, 14)


(100, 14)

In [37]:
# Fetch the distinct campaigns via a group-by query (the recorded run fetched
# 6 rows), then show the resulting frame shape.
dc.get(query="$apply=groupby((campaign))")
dc.df.shape

2025-12-27 22:16:36 fetch_nycopen.py       55 INFO  root    : using pagination at fetching with page_size=10000 timeout=30
2025-12-27 22:16:37 fetch_nycopen.py       84 INFO  root    : Fetched 6 rows
2025-12-27 22:16:37 base.py                52 INFO  root    : using validator: ValidatorGeneric
2025-12-27 22:16:37 base.py                53 INFO  root    : cleaning...
2025-12-27 22:16:37 base.py                56 INFO  root    : normalizing...
2025-12-27 22:16:37 base.py                59 INFO  root    : validating...
2025-12-27 22:16:37 base.py                62 INFO  root    : dropping flagged rows...
2025-12-27 22:16:37 base.py                65 INFO  root    : finalizing...
2025-12-27 22:16:37 container.py           35 INFO  root    : Retrieved dataFrame with shape=(6, 14)


(6, 14)

In [None]:
# Fetch only the rows whose campaign equals 'Campaign 4', then show the shape.
dc.get(query="$filter=campaign eq 'Campaign 4'")
dc.df.shape

In [None]:
# Persist the container's current data to the cache.
dc.to_cache()

In [None]:
# Read back from the cache with no query (expected to yield 100 rows, per the
# original note), then show the shape.
dc.from_cache()
dc.df.shape

In [None]:
# Read from the cache only the rows whose campaign equals 'Campaign 4'.
dc.from_cache(query="$filter=campaign eq 'Campaign 4'")
dc.df.shape

In [None]:
# Read the distinct years from the cache via a group-by query.
dc.from_cache(query="$apply=groupby((year))")
dc.df.shape

In [None]:
# Paging: skip the first 50 rows and take the next 50 (page 2 at page size 50).
dc.from_cache(query="$top=50&$skip=50")
dc.df.shape

In [None]:
# Ask the cache for the total row count instead of the rows themselves.
dc.from_cache(query="$count=true")
dc.df.shape

## Analytics

In [None]:
# Pin the as-of date used by the analytics cells below
# (presumably parsed as YYYYMMDD, i.e. 2009-12-31; confirm against str_as_date).
app_context.as_of  = ul.str_as_date('20091231')

In [None]:
# Container bound to the 311 service-requests dataset, used for the analytics below.
dc = DataContainer('nyc_open_data_311_service_requests')

In [None]:
# Query, load and validate the service requests whose created_date equals
# the as-of date (formatted via date_as_iso_format).
dc.get(query=f"$filter=created_date eq '{ul.date_as_iso_format(app_context.as_of)}'")

In [None]:
# Persist the fetched rows to the cache.
dc.to_cache()

In [None]:
# Read the same as-of-date slice back from the cache; the analytics below
# work on this cached copy.
dc.from_cache(query=f"$filter=created_date eq '{ul.date_as_iso_format(app_context.as_of)}'")

In [None]:
# Short handle on the container's frame.
# NOTE(review): presumably the same object rather than a copy, in which case
# the column added below also lands on dc.df; confirm.
df = dc.df

In [None]:
# Column names exposed as attributes, so columns can be referenced as
# cols.closed_date, cols.created_date, ... in the cells below.
cols = ul.cols_as_named_tuple(df)

In [None]:
# Whole days between creation and closure, stored as nullable Int64 so that
# missing timestamps do not force a float column.
df["hbc_days_to_close"] = (
    pd.to_datetime(df[cols.closed_date])
    .sub(pd.to_datetime(df[cols.created_date]))
    .dt.days
    .astype("Int64")
)
# Rebuild the column tuple so the new column is reachable as
# cols.hbc_days_to_close.
cols = ul.cols_as_named_tuple(df)

In [None]:
# Flag the requests closed on their creation day, then keep everything else.
# NOTE(review): rows with a missing hbc_days_to_close yield <NA> in the mask
# and are presumably excluded as well; confirm that is the intent.
m = df[cols.hbc_days_to_close].eq(0)
df_closed_not_same_day = df[~m]

In [None]:
# Render the geo map for the not-same-day requests and save it as an HTML
# file under <dir_analytics>/plots; the plot's return value is discarded and
# the output location is printed instead.
plots_dir = ul.mk_dir(app_context.dir_analytics / "plots")
path = ul.path_to_str(plots_dir / "closed_requests_by_location.html")
_ = PlotEngine.plot_geo_map(
    df=df_closed_not_same_day,
    col_latitude=cols.latitude,
    col_longitude=cols.longitude,
    aggregation="count",
    round_precision=3,
    cluster=True,
    start_zoom=11,
    tiles="CartoDB positron",
    savepath=path,
)
print(path)

In [None]:
# Descriptive statistics of days-to-close, grouped by agency (code and name),
# requesting the 10 best and 10 worst groups.
res = AnalyticalEngine.descriptive_stats(
    df=df_closed_not_same_day,
    col_metric=cols.hbc_days_to_close,
    group=[cols.agency, cols.agency_name],
    n_best=10,
    n_worst=10,
)


In [None]:
# List the result sections returned by descriptive_stats.
res.keys()

In [None]:
# Show the 'worst' section of the result.
res['worst']

## Data Container : nyc_open_data_311_call_center_inquiry

In [None]:
# Container bound to the 311 call-center-inquiry dataset.
dc = DataContainer('nyc_open_data_311_call_center_inquiry')

In [None]:
# Load and validate with no query, then preview the first two rows.
dc.get()
dc.df.head(2)

In [None]:
# Persist the loaded rows to the cache.
dc.to_cache()

In [None]:
# Filter at load time: NYPD inquiries dated 2014-03-27, capped at 250 rows
# via $top; then preview the first two rows.
dc.get(query="$filter=agency eq 'NYPD' and date eq '2014-03-27'&$top=250")
dc.df.head(2)

In [None]:
# Filter when reading from the cache: rows dated 2014-03-27; then preview the
# first two rows.
dc.from_cache(query="$filter=date eq '2014-03-27'")
dc.df.head(2)

## Data Container : nyc_open_data_311_customer_satisfaction_survey

In [None]:
# Container bound to the 311 customer-satisfaction-survey dataset.
dc = DataContainer('nyc_open_data_311_customer_satisfaction_survey')

In [None]:
# Load and validate with no query, then preview the first two rows.
dc.get()
dc.df.head(2)

In [None]:
# Persist the loaded rows to the cache.
dc.to_cache()

In [None]:
# Read from the cache only the rows whose answer_satisfaction is 'Neutral',
# then preview the first two rows.
dc.from_cache(query="$filter=answer_satisfaction eq 'Neutral'")
dc.df.head(2)

***

## `Raw DB Access`

In [39]:
from hbc.ltp.persistence.db import SqlLiteDataBase

In [69]:
# Row count per table in the SQLite store. Table names come from the database
# itself (db.all_tables), so interpolating them into the SQL text is safe here.
# The connection is closed in a `finally` block so that a failing query does
# not leave the database open.
db = SqlLiteDataBase()
try:
    myd = {
        t: db.run_query(f'select count(*) from {t}').values.tolist()[0]
        for t in db.all_tables
    }
finally:
    db.close()
# One row per table, with the count in a single column.
pd.DataFrame.from_dict(myd).T

Unnamed: 0,0
nyc_open_data_311_call_center_inquiry,1100
nyc_open_data_311_customer_satisfaction_survey,1000
nyc_open_data_311_service_requests,925


In [59]:
# Inspect the raw table-name -> count mapping built above.
myd

{'nyc_open_data_311_call_center_inquiry': array([[1100]]),
 'nyc_open_data_311_customer_satisfaction_survey': array([[1000]]),
 'nyc_open_data_311_service_requests': array([[925]])}

## `File/folder-based cache`

In [None]:
from hbc.ltp.persistence.cache import Cache

In [None]:
# Container bound to the 311 customer-satisfaction-survey dataset.
dc = DataContainer('nyc_open_data_311_customer_satisfaction_survey')

In [None]:
# Fetch a small sample: the first 10 rows via $top.
dc.get(query='$top=10')

In [None]:
# Write the container's data to the file/folder cache for the current as-of date.
Cache.to_cache(dc, as_of=app_context.as_of)

In [None]:
# Read the same as-of-date entry back from the file/folder cache and preview
# the first two rows.
Cache.from_cache(dc, as_of=app_context.as_of).head(2)