# GLM Analysis for NYC Taxi Arrivals

In [4]:
from __future__ import annotations

import json
import sys
from pathlib import Path

# Locate the project root: the working directory itself, or the nearest
# ancestor, that contains a `src/` folder.
_start_dir = Path.cwd()
_project_root = next(
    (folder for folder in (_start_dir, *_start_dir.parents) if (folder / "src").exists()),
    None,
)
if _project_root is None:
    raise FileNotFoundError("Could not find project root containing src/")
BASE_DIR = _project_root

# Put `src/` on the import path once, so re-running the cell adds no duplicates.
_src_dir = str(BASE_DIR / "src")
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

# Raw inputs live under <project root>/data/raw.
DATA_DIR = BASE_DIR / "data" / "raw"
lookup_path = DATA_DIR / "taxi_zone_lookup.csv"

In [5]:
from __future__ import annotations

import json
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from statsmodels.tsa.stattools import acf

# Resolve the project root with the same "nearest directory containing src/"
# search as the setup cell, so both cells agree on BASE_DIR no matter which
# directory the kernel was started from. The previous hard-coded
# `Path.cwd().resolve().parent` was only right when the notebook ran exactly one
# level below the root; it is kept as the fallback when no `src/` is found.
_cwd = Path.cwd().resolve()
BASE_DIR = next(
    (folder for folder in (_cwd, *_cwd.parents) if (folder / "src").exists()),
    _cwd.parent,
)
DATA_DIR = BASE_DIR / "data" / "raw"
lookup_path = DATA_DIR / "taxi_zone_lookup.csv"

from modeling.poisson_zone import (
    load_taxi_pickups,
    attach_zone_metadata,
    bucket_counts_by_group,
)
from modeling.glm_counts import fit_glm, compare_models



# Load a capped sample of January 2024 yellow-cab trips and attach the zone
# lookup to it.
_trips_path = DATA_DIR / "yellow_tripdata_2024-01.parquet"
poisson_sample = attach_zone_metadata(
    load_taxi_pickups(_trips_path, max_rows=500_000),
    lookup_path,
)

# Keep only rows that have a Borough value, then strip the timezone from
# event_time (tz_convert(None) leaves naive timestamps).
in_scope = poisson_sample.dropna(subset=["Borough"])
in_scope = in_scope.assign(event_time=in_scope["event_time"].dt.tz_convert(None))

# 15-minute counts at two levels of spatial grouping.
borough_counts = bucket_counts_by_group(in_scope, freq="15min", group_cols="Borough")
zone_counts = bucket_counts_by_group(in_scope, freq="15min", group_cols="Zone")
print("Prepared counts with shape", borough_counts.shape)

Prepared counts with shape (569, 7)


## Single-borough GLM example (Manhattan)

In [6]:
# Fit a Poisson and a negative binomial GLM to the same count series so their
# AICs can be compared side by side.
zone = "Manhattan"  # selects a column of borough_counts, i.e. a borough
counts = borough_counts[zone]
poisson_fit, nb_fit = (fit_glm(counts, family=family) for family in ("poisson", "nb"))

# NOTE(review): warnings captured later in this notebook say the negative
# binomial alpha defaults to 1.0. If fit_glm does the same, the NegBin AIC here
# is for a fixed rather than an estimated dispersion -- confirm.
for label, fit in (("AIC Poisson", poisson_fit), ("AIC NegBin", nb_fit)):
    print(label, fit.aic)

AIC Poisson 81205.95090813618
AIC NegBin 8500.815938778102




In [7]:
# Overlay the residual series of both fits so their spread can be compared.
fig = go.Figure()
fig.add_trace(go.Scatter(x=poisson_fit.residuals.index, y=poisson_fit.residuals.values, name="Poisson"))
fig.add_trace(go.Scatter(x=nb_fit.residuals.index, y=nb_fit.residuals.values, name="NegBin"))
# Title, axis labels and legend title so the figure can be read on its own.
# NOTE(review): the x-axis label assumes fit_glm keeps the 15-minute bucket
# index of its input series, and "Residual" is whatever residual type fit_glm
# exposes -- confirm both against modeling.glm_counts.
fig.update_layout(
    title=f"GLM residuals for {zone}: Poisson vs. negative binomial",
    xaxis_title="15-minute bucket",
    yaxis_title="Residual",
    legend_title_text="Family",
)
fig.show()

## Top zones summary

In [8]:
# Rank zones by total count over the sample and keep the ten largest.
_zone_totals = zone_counts.sum().sort_values(ascending=False)
top_zones = _zone_totals.index[:10]

# One summary row per zone: AIC and dispersion for each fitted family.
rows = []
for zn in top_zones:
    models = compare_models(zone_counts[zn])
    poisson_model, nb_model = models["poisson"], models["nb"]
    rows.append(
        {
            "zone": zn,
            "poisson_aic": poisson_model.aic,
            "poisson_disp": poisson_model.dispersion,
            "nb_aic": nb_model.aic,
            "nb_disp": nb_model.dispersion,
        }
    )

summary_df = pd.DataFrame(rows)
summary_df


Negative binomial dispersion parameter alpha not set. Using default value alpha=1.0.


Negative binomial dispersion parameter alpha not set. Using default value alpha=1.0.


Negative binomial dispersion parameter alpha not set. Using default value alpha=1.0.


Negative binomial dispersion parameter alpha not set. Using default value alpha=1.0.


Negative binomial dispersion parameter alpha not set. Using default value alpha=1.0.


Negative binomial dispersion parameter alpha not set. Using default value alpha=1.0.


Negative binomial dispersion parameter alpha not set. Using default value alpha=1.0.


Negative binomial dispersion parameter alpha not set. Using default value alpha=1.0.


Negative binomial dispersion parameter alpha not set. Using default value alpha=1.0.


Negative binomial dispersion parameter alpha not set. Using default value alpha=1.0.



Unnamed: 0,zone,poisson_aic,poisson_disp,nb_aic,nb_disp
0,JFK Airport,5949.77834,4.68083,5512.89997,0.137832
1,Midtown Center,6241.059083,8.46501,4922.99174,0.426036
2,Upper East Side South,6257.017456,10.610643,4795.293423,0.562123
3,Upper East Side North,5167.61746,7.740622,4621.745397,0.588313
4,Lincoln Square East,7963.781758,13.704984,4755.202311,0.607484
5,Penn Station/Madison Sq West,7470.731382,8.897334,4972.677688,0.378076
6,Times Sq/Theatre District,5287.267113,4.675985,4819.462848,0.262336
7,Midtown East,5531.626961,6.544211,4722.433073,0.45024
8,Upper West Side South,5832.199563,8.307551,4568.34759,0.548103
9,LaGuardia Airport,4905.745076,4.54,4267.649444,0.455423
