# AADT Confidence Interval - State Route 99 (SR 99)

## FHWA Links
* Guidelines for Obtaining AADT Estimates from Non-Traditional Sources:
    * https://www.fhwa.dot.gov/policyinformation/travel_monitoring/pubs/aadtnt/Guidelines_for_AADT_Estimates_Final.pdf
  
  
## AADT Analysis Locations
* 10 locations were used in the analysis
* Locations were chosen based on where Traffic Operations cameras are installed and recording
    * for additional information contact Zhenyu Zhu with Traffic Operations

## Traffic Census Data
* https://dot.ca.gov/programs/traffic-operations/census/traffic-volumes
* Back AADT, Peak Month, and Peak Hour usually represent traffic South or West of the count location.  
* Ahead AADT, Peak Month, and Peak Hour usually represent traffic North or East of the count location. A listing of routes with their designated direction of travel is available from the Traffic Census page linked above.  

* Because both the Back & Ahead counts are included at each location in the Traffic Census Data (e.g., "IRWINDALE, ARROW HIGHWAY"), only one [OBJECTID*] per location was pulled; the northbound nodes were used for this analysis.
    * for more information see the diagram: https://traffic.onramp.dot.ca.gov/downloads/traffic/files/performance/census/Back_and_Ahead_Leg_Traffic_Count_Diagram.pdf

## StreetLight Analysis Data
* StreetLight locations on State Route 99 are one-directional, so each location contains two points: northbound and southbound
    * Analysis Type == Network Performance
    * Segment Metrics
    * 2022 was used to match currently available Traffic Census Data (as of 8/27/2025)
    * pulled a variety of Day Types, but plan to just look at "All Day Types"
    * pulled a variety of Day Parts, but plan to just look at "All Day Parts"




In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats

### Pull in the Location Dictionaries

In [2]:
# pull in the coordinates from the utils docs
#from osow_frp_o_d_utils_v3 import origin_intersections, destination_intersections
from sr99_tc_locations_utils import sr_99_d3_tc_aadt_locations

SyntaxError: closing parenthesis ']' does not match opening parenthesis '{' on line 912 (sr99_tc_locations_utils.py, line 916)

### Identify the Google Cloud Storage path

In [None]:
# Base Google Cloud Storage folder for this analysis. The input CSV file names
# are appended directly to this string, so the trailing slash is required.
gcs_path = "gs://calitp-analytics-data/data-analyses/big_data/compare_traffic_counts/sr99_d3/"

### Pull in the Data

In [None]:
# This function will pull in the data and clean the column headers in a way that will make them easier to work with
def getdata_and_cleanheaders(path):
    """Read a CSV into a DataFrame and normalize its column headers.

    Every header has its space characters removed, is lowercased, and then
    has any trailing asterisks stripped (e.g. "OBJECTID *" -> "objectid").

    Parameters
    ----------
    path : str or path-like
        Location handed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded data with the cleaned column names.
    """
    frame = pd.read_csv(path)
    # Order matters: spaces are removed first so that "NAME *" ends in "*"
    # before the trailing-asterisk strip runs.
    frame.columns = [
        header.replace(" ", "").lower().rstrip("*") for header in frame.columns
    ]
    return frame

In [None]:
# Pull in the two 2022 source files from the GCS folder and build DataFrames.
# getdata_and_cleanheaders also normalizes the headers (no spaces, lowercase,
# no trailing "*"), which is the form the column names take in the code below.
df_tc = getdata_and_cleanheaders(f"{gcs_path}caltrans_traffic_census_2022.csv")  # Traffic Census
df_stl = getdata_and_cleanheaders(f"{gcs_path}streetlight_sr99_d3_all_vehicles_2022_np.csv")  # StreetLight

In [None]:
# Write the cleaned Traffic Census frame to a local CSV (row index omitted)
# so it can be inspected and compared by hand.
df_tc.to_csv("df_tc.csv", index=False)

In [None]:
# Write the cleaned StreetLight frame to a local CSV (row index omitted)
# so it can be inspected and compared by hand.
df_stl.to_csv("df_stl.csv", index=False)

In [None]:
# ----------------------------
# 1) Normalize your locations
# ----------------------------
def _ensure_list(x):
    """Coerce *x* to a list.

    ``None`` becomes an empty list; a list, tuple, or set is copied into a
    new list; any other value is wrapped in a single-element list.
    """
    if isinstance(x, (list, tuple, set)):
        return list(x)
    return [] if x is None else [x]

def _gather_objectids(node_dict):
    """
    Collect the object ids stored on a nodes[<direction>] dict.

    Values under the 'objectid' key are taken first, then those under
    'objectids'; each may be a single value or a list/tuple/set. Every id is
    returned as a string, and ids that are None or blank once stringified are
    dropped. Anything that is not a dict yields an empty list.
    """
    if not isinstance(node_dict, dict):
        return []

    collected = []
    for key in ("objectid", "objectids"):
        if key in node_dict:
            collected += _ensure_list(node_dict[key])

    return [
        str(value)
        for value in collected
        if value is not None and str(value).strip() != ""
    ]

def _gather_zones(node_dict):
    """
    Returns (ahead_zones, behind_zones) lists from a nodes[<direction>] dict.

    The lists come from the 'zonename_ahead' and 'zonename_behind' keys; each
    may hold a single value or a list/tuple/set, and a missing key gives an
    empty list. A node that is not a dict yields two empty lists, mirroring
    how _gather_objectids treats such input, so one malformed node entry does
    not abort processing of the whole location.
    """
    if not isinstance(node_dict, dict):
        return [], []
    ahead = _ensure_list(node_dict.get("zonename_ahead", []))
    behind = _ensure_list(node_dict.get("zonename_behind", []))
    return ahead, behind

def _iter_locations(aadt_locations):
    """
    Generate one normalized record per location, shaped as:
    {
      'name': <location name>,
      'daytype': <daytype string>,
      'objectids': [list of str],
      'ahead_zones': [list of str],
      'behind_zones': [list of str]
    }
    Accepted layouts for aadt_locations:
      - a dict keyed by location names,
      - a list of location dicts (each carrying a 'nodes' key), or
      - a list holding dict(s) keyed by location names.
    Any other input, and non-dict list items, produce no records.
    """
    if isinstance(aadt_locations, dict):
        for label, record in aadt_locations.items():
            yield _normalize_one_location(label, record)
        return

    if not isinstance(aadt_locations, list):
        return

    for entry in aadt_locations:
        if not isinstance(entry, dict):
            continue
        if "nodes" in entry:
            # A bare location dict: take its name from the record itself.
            label = entry.get("location_description") or entry.get("name") or "UNKNOWN"
            yield _normalize_one_location(label, entry)
            continue
        # Otherwise the dict maps location names to location records.
        for label, record in entry.items():
            yield _normalize_one_location(label, record)

def _normalize_one_location(name, loc):
    """
    Flatten one location record into a single normalized dict.

    Object ids and ahead/behind zone names are pooled across every entry of
    loc['nodes'], empty zone names are discarded, and duplicates are removed
    while keeping first-seen order. 'daytype' falls back to
    "0: All Days (M-Su)" when the record does not carry one.
    """
    daytype = loc.get("daytype", "0: All Days (M-Su)")
    node_map = loc.get("nodes", {}) or {}

    object_ids, zones_ahead, zones_behind = [], [], []
    for node in node_map.values():
        object_ids += _gather_objectids(node)
        node_ahead, node_behind = _gather_zones(node)
        zones_ahead += [zone for zone in node_ahead if zone]
        zones_behind += [zone for zone in node_behind if zone]

    # dict.fromkeys keeps the first occurrence of each key in insertion order,
    # which gives an order-preserving de-duplication.
    return {
        "name": name,
        "daytype": daytype,
        "objectids": list(dict.fromkeys(object_ids)),
        "ahead_zones": list(dict.fromkeys(zones_ahead)),
        "behind_zones": list(dict.fromkeys(zones_behind)),
    }


In [None]:
# ------------------------------------------------------
# 2) Traditional (Traffic Census) AADT for a location
# ------------------------------------------------------
def _traditional_aadt_for_ids(df_tc, obj_ids):
    """
    Compute the Traffic Census AADT for the rows matching a set of objectids.

    Parameters
    ----------
    df_tc : pandas.DataFrame
        Traffic Census data with an 'objectid' column and, normally,
        'ahead_aadt' and 'back_aadt' columns.
    obj_ids : list
        Object ids to select; both sides are compared as strings.

    Returns
    -------
    tuple
        (overall, mean_ahead, mean_back, count_used) where mean_ahead and
        mean_back are the means of the numeric 'ahead_aadt' / 'back_aadt'
        values of the matched rows, overall is the average of whichever of
        those two means is available, and count_used is the number of matched
        rows. When nothing matches, the three values are NaN and the count is
        0. A mean is NaN when its column is absent or has no numeric values.
    """
    if not obj_ids:
        return np.nan, np.nan, np.nan, 0

    wanted = [str(obj_id) for obj_id in obj_ids]
    matched = df_tc[df_tc["objectid"].astype(str).isin(wanted)]
    if matched.empty:
        return np.nan, np.nan, np.nan, 0

    def _column_mean(column):
        # A missing column is treated like a column with no usable values
        # instead of failing on a None lookup.
        if column not in matched.columns:
            return np.nan
        values = pd.to_numeric(matched[column], errors="coerce").dropna()
        return values.mean() if not values.empty else np.nan

    mean_ahead = _column_mean("ahead_aadt")
    mean_back = _column_mean("back_aadt")

    # Average only the means that exist; np.nanmean would emit a
    # "Mean of empty slice" RuntimeWarning when both are NaN.
    available = [m for m in (mean_ahead, mean_back) if pd.notna(m)]
    overall = np.mean(available) if available else np.nan

    return overall, mean_ahead, mean_back, len(matched)

In [None]:
# ------------------------------------------------------
# 3) StreetLight (Non-Traditional) AADT for a location
# ------------------------------------------------------
def _stl_aadt_for_zones(
    df_stl, ahead_zones, behind_zones,
    daytype="0: All Days (M-Su)",
    daypart="0: All Day (12am-12am)",
    modeoftravel=None,
    zonename_col="zonename",
    stl_volume_col="averagedailysegmenttraffic(stlvolume)"
):
    """
    Compute the non-traditional (StreetLight) AADT for one location.

    Rows of df_stl are first filtered to the given daytype and daypart, and
    to modeoftravel when one is supplied. The mean of stl_volume_col is then
    taken over the rows whose zonename_col is in ahead_zones, and likewise
    for behind_zones. The overall value is the average of whichever of those
    two means is available (NaN when neither is).

    Returns (overall, mean_ahead, mean_behind, ahead_n, behind_n,
    missing_ahead, missing_behind), where the *_n values are the number of
    filtered rows matched for each side and the missing_* lists hold the
    requested zone names that had no matching row.
    """
    filt = (df_stl["daytype"] == daytype) & (df_stl["daypart"] == daypart)
    if modeoftravel:
        filt = filt & (df_stl["modeoftravel"] == modeoftravel)
    stl = df_stl.loc[filt]

    def mean_for(zones):
        # Returns (mean volume, matched row count, zones with no rows).
        if not zones:
            return np.nan, 0, []
        sub = stl[stl[zonename_col].isin(zones)]
        present = set(sub[zonename_col].unique())
        missing = [z for z in zones if z not in present]
        vals = pd.to_numeric(sub[stl_volume_col], errors="coerce").dropna()
        return (vals.mean() if not vals.empty else np.nan, len(sub), missing)

    mean_ahead, ahead_n, miss_a = mean_for(ahead_zones)
    mean_behind, behind_n, miss_b = mean_for(behind_zones)

    # Average only the sides that produced a value; np.nanmean would emit a
    # "Mean of empty slice" RuntimeWarning when both sides are NaN.
    available = [m for m in (mean_ahead, mean_behind) if pd.notna(m)]
    overall = np.mean(available) if available else np.nan

    return overall, mean_ahead, mean_behind, ahead_n, behind_n, miss_a, miss_b

In [None]:
# ------------------------------------------------------
# 4) Build the per-location comparison DataFrame
# ------------------------------------------------------
def build_aadt_comparison_df(
    aadt_locations,
    df_tc,
    df_stl,
    daytype_filter="0: All Days (M-Su)",
    daypart_filter="0: All Day (12am-12am)",
    modeoftravel_filter=None,
    zonename_col="zonename",
    stl_volume_col="averagedailysegmenttraffic(stlvolume)"
):
    """
    Build a tidy DataFrame with one row per location comparing Traffic Census
    (traditional) AADT with StreetLight (non-traditional) AADT.

    Parameters
    ----------
    aadt_locations : dict or list
        Location definitions in any layout accepted by _iter_locations.
    df_tc, df_stl : pandas.DataFrame
        Traffic Census and StreetLight data with cleaned column headers.
    daytype_filter, daypart_filter, modeoftravel_filter
        Filters applied to df_stl; the mode filter is skipped when falsy.
    zonename_col, stl_volume_col : str
        Names of the StreetLight zone-name and volume columns.

    Returns
    -------
    pandas.DataFrame
        Columns, in order:
          • location, objectids, n_objectids, ahead_zones, behind_zones
          • traditional_ahead_mean, traditional_behind_mean, traditional_aadt
          • non_trad_ahead_mean, non_trad_behind_mean, non_trad_aadt
          • tce_percent = 100 * (non_trad_aadt - traditional_aadt) / traditional_aadt
            (NaN when either AADT is missing or the traditional AADT is 0)
          • daytype_expected, daytype_used, daypart_used
          • stl_ahead_rows, stl_behind_rows, missing_ahead_zones,
            missing_behind_zones (counts & missing info for debugging)
        The same columns are present even when there are no locations, so
        downstream code can always select 'tce_percent'.
    """
    def _joined(items):
        # str() keeps the join from failing on non-string ids / zone names.
        return ",".join(str(item) for item in items)

    records = []

    for loc in _iter_locations(aadt_locations):
        # traditional
        trad_overall, trad_ahead, trad_behind, _n_rows = _traditional_aadt_for_ids(
            df_tc, loc["objectids"]
        )

        # non-traditional
        stl_overall, stl_ahead, stl_behind, ahead_n, behind_n, miss_a, miss_b = _stl_aadt_for_zones(
            df_stl,
            loc["ahead_zones"],
            loc["behind_zones"],
            daytype=daytype_filter,
            daypart=daypart_filter,
            modeoftravel=modeoftravel_filter,
            zonename_col=zonename_col,
            stl_volume_col=stl_volume_col
        )

        # TCE: percent difference of StreetLight relative to Traffic Census
        tce = np.nan
        if pd.notna(trad_overall) and trad_overall != 0 and pd.notna(stl_overall):
            tce = 100.0 * (stl_overall - trad_overall) / trad_overall

        records.append({
            "location": loc["name"],
            "daytype_expected": loc["daytype"],
            "daytype_used": daytype_filter,
            "daypart_used": daypart_filter,
            "objectids": _joined(loc["objectids"]),
            "n_objectids": len(loc["objectids"]),
            "ahead_zones": _joined(loc["ahead_zones"]),
            "behind_zones": _joined(loc["behind_zones"]),

            "traditional_aadt": trad_overall,
            "traditional_ahead_mean": trad_ahead,
            "traditional_behind_mean": trad_behind,

            "non_trad_aadt": stl_overall,
            "non_trad_ahead_mean": stl_ahead,
            "non_trad_behind_mean": stl_behind,

            "tce_percent": tce,

            "stl_ahead_rows": ahead_n,
            "stl_behind_rows": behind_n,
            "missing_ahead_zones": _joined(miss_a),
            "missing_behind_zones": _joined(miss_b),
        })

    # Stable, readable column order
    preferred_cols = [
        "location", "objectids", "n_objectids",
        "ahead_zones", "behind_zones",
        "traditional_ahead_mean", "traditional_behind_mean", "traditional_aadt",
        "non_trad_ahead_mean", "non_trad_behind_mean", "non_trad_aadt",
        "tce_percent",
        "daytype_expected", "daytype_used", "daypart_used",
        "stl_ahead_rows", "stl_behind_rows",
        "missing_ahead_zones", "missing_behind_zones",
    ]
    # reindex (rather than column selection) keeps every column present even
    # when `records` is empty and the frame would otherwise have no columns.
    return pd.DataFrame.from_records(records).reindex(columns=preferred_cols)


In [None]:
# ------------------------------------------------------
# 5) Confidence interval over TCE
# ------------------------------------------------------
def tce_confidence_interval(detail_df, confidence=0.95):
    """
    Summarize the 'tce_percent' column of detail_df with a t-based interval.

    Non-numeric and missing values are ignored. Returns the tuple
    (mean_tce, ci_lo, ci_hi, tcrit, t_stat):
      - all five are None when no numeric values remain;
      - only mean_tce is set (the rest None) when there is a single value or
        the standard error is not positive;
      - otherwise tcrit is the two-sided Student-t critical value for the
        given confidence with n - 1 degrees of freedom, the interval is
        mean ± tcrit * SE, and t_stat is mean / SE.
    """
    sample = pd.to_numeric(detail_df["tce_percent"], errors="coerce").dropna().values
    size = len(sample)
    if size == 0:
        return None, None, None, None, None

    center = float(np.mean(sample))
    if size < 2:
        return center, None, None, None, None

    # Sample standard deviation (ddof=1) scaled to a standard error.
    std_err = float(np.std(sample, ddof=1)) / np.sqrt(size)
    if not std_err > 0:
        return center, None, None, None, None

    critical = float(stats.t.ppf((1 + confidence) / 2, size - 1))
    lower = center - critical * std_err
    upper = center + critical * std_err
    return center, lower, upper, critical, center / std_err

In [None]:
# 1) Build the per-location comparison table: one row per SR 99 location,
#    with Traffic Census AADT, StreetLight AADT, and their percent difference
#    (tce_percent). The StreetLight rows are restricted to the day type,
#    day part, and mode of travel given here.
detail = build_aadt_comparison_df(
    aadt_locations=sr_99_d3_tc_aadt_locations,
    df_tc=df_tc,
    df_stl=df_stl,
    daytype_filter="0: All Days (M-Su)",
    daypart_filter="0: All Day (12am-12am)",
    modeoftravel_filter="All Vehicles - StL All Vehicles Volume",  # or None if not needed
    zonename_col="zonename",
    stl_volume_col="averagedailysegmenttraffic(stlvolume)"
)

# e.g., export for inspection
# detail.to_csv("sr99_d3_aadt_comparison.csv", index=False)

In [None]:
# 2) Compute the CI across locations from the tce_percent column of `detail`.
#    Printed in order: mean TCE, CI lower bound, CI upper bound,
#    t critical value, t-test statistic (entries are None when they cannot
#    be computed, e.g. with fewer than two usable locations).
mean_tce, ci_lo, ci_hi, tcrit, t_stat = tce_confidence_interval(detail, confidence=0.95)
print(mean_tce, ci_lo, ci_hi, tcrit, t_stat)

### Mean TCE: -3.62%
Traffic Census Error (TCE), computed per location as 100 × (StreetLight AADT − Traffic Census AADT) / Traffic Census AADT
* 

### 95% Confidence Interval (-10.78%, 3.54%)
* 

### T-Test Statistic  
* 
### Summary
* 
