# 7-Day Average NYC Subway Riders
## Summary
The purpose of this notebook is to predict the next day's number of daily subway riders. Once the prediction is made, the prediction is appended to a file containing these predictions. 

## Pull data

In [35]:
import requests
import json

def get_ny_data():
    url = "https://data.ny.gov/resource/sayj-mze2.json"
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
        data = response.json()
        return data
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None

if __name__ == "__main__":
    ny_data = get_ny_data()
    if ny_data:
        print(json.dumps(ny_data[:3], indent=4))
    else:
        print("No data retrieved.")

[
    {
        "date": "2025-08-11T00:00:00.000",
        "mode": "LIRR",
        "count": "254512.0"
    },
    {
        "date": "2025-08-11T00:00:00.000",
        "mode": "MNR",
        "count": "211894.0"
    },
    {
        "date": "2025-08-11T00:00:00.000",
        "mode": "Subway",
        "count": "3536290.0"
    }
]


## Pre-process data

In [36]:
import pandas as pd

def clean_data(ny_data_df):
    # Filter rows based on column: 'mode'
    ny_data_df = ny_data_df[ny_data_df['mode'].str.contains("Subway", regex=False, na=False, case=False)]
    # Calculate 7-day trailing average of 'count' in descending date order
    ny_data_df['7_day_avg'] = (
        ny_data_df['count']
        .astype(float)
        .iloc[::-1]  # Reverse the order for correct trailing average
        .rolling(window=7)
        .mean()
        .iloc[::-1]  # Reverse back to original order
    )
    ny_data_df.drop(columns=['mode'], inplace=True)
    return ny_data_df

# Loaded variable 'ny_data' from kernel state
ny_data_df = pd.DataFrame(ny_data)
ny_data_df_clean = clean_data(ny_data_df.copy())
ny_data_df_clean = ny_data_df_clean.sort_values(by='date', ascending=True)
ny_data_df_clean.head()

Unnamed: 0,date,count,7_day_avg
995,2025-04-04T00:00:00.000,3990036.0,
987,2025-04-05T00:00:00.000,2658935.0,
971,2025-04-06T00:00:00.000,2059867.0,
965,2025-04-07T00:00:00.000,3848628.0,
961,2025-04-08T00:00:00.000,4324220.0,


In [37]:
ny_data_df_clean.tail()

Unnamed: 0,date,count,7_day_avg
30,2025-08-07T00:00:00.000,3944006.0,3414440.0
27,2025-08-08T00:00:00.000,3653380.0,3422360.0
19,2025-08-09T00:00:00.000,2669104.0,3423458.0
12,2025-08-10T00:00:00.000,2199245.0,3436332.0
2,2025-08-11T00:00:00.000,3536290.0,3426744.0


In [None]:
# Initialize darts_model before fitting
from darts.models import LightGBMModel

lags = [-1, -2, -3, -4, -5, -6, -7, -14, -21]
darts_model = LightGBMModel(
    lags=lags,
    lags_future_covariates=[0],  # use contemporaneous calendar/holiday features
    output_chunk_length=1,
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=31,
    random_state=42,
)

# Fit on full data and forecast next day
darts_model.fit(series_scaled, future_covariates=future_covariates)

NameError: name 'darts_model' is not defined

In [None]:
# (Optional) Save the Darts prediction to the same CSV log used earlier
SAVE_TO_CSV = True
if SAVE_TO_CSV:
    import os, csv
    from datetime import datetime
    log_file = 'C:\\Users\\Setup User\\Documents\\Codespaces\\MR Technology projects\\kalshi\\transportation\\Zach\\subway_predictions.csv'
    file_exists = os.path.isfile(log_file)
    with open(log_file, 'a', newline='') as f:
        writer = csv.writer(f)
        if not file_exists:
            writer.writerow(['target_date', 'predicted', 'predicted_at'])
        writer.writerow([DARTS_NEXT_DATE.strftime('%Y-%m-%d'), DARTS_NEXT_VALUE, datetime.now().isoformat()])
    print(f"Logged Darts prediction {DARTS_NEXT_VALUE:.2f} for {DARTS_NEXT_DATE.date()}")

## Forecast with Darts (LightGBMModel)

We use Darts + LightGBM with calendar and holiday covariates. We backtest with rolling 1-step forecasts, then fit on all data to forecast the next day and log it.

In [None]:
from darts import TimeSeries
from darts.dataprocessing.transformers import Scaler
from darts.models import LightGBMModel
from darts.utils.timeseries_generation import datetime_attribute_timeseries
from darts.metrics import mae, smape
import numpy as np
import pandas as pd
import holidays as pyholidays

print("Darts imports OK")

In [None]:
# Build daily TimeSeries from cleaned dataframe
ny_df = ny_data_df_clean.copy()
ny_df['date'] = pd.to_datetime(ny_df['date'])
ny_df = ny_df.sort_values('date')
ny_df['count'] = pd.to_numeric(ny_df['count'], errors='coerce')
ny_df = ny_df.dropna(subset=['count']).set_index('date').asfreq('D')
ny_df['count'] = ny_df['count'].interpolate(method='time').ffill().bfill()

series = TimeSeries.from_series(ny_df['count'], fill_missing_dates=True, freq='D').astype(np.float32)
series

In [None]:
# Future covariates: calendar + holiday flags
idx = pd.date_range(start=series.start_time(), end=series.end_time(), freq='D')
# One-hot weekday (0-6)
dow = datetime_attribute_timeseries(idx, attribute="weekday", one_hot=True)
# One-hot month (1-12)
moy = datetime_attribute_timeseries(idx, attribute="month", one_hot=True)
# Weekend flag (Sat/Sun)
weekend_flag = pd.Series((idx.weekday >= 5).astype(np.int8), index=idx)
is_weekend = TimeSeries.from_series(weekend_flag, freq='D')

# US holiday flag
us_holidays = pyholidays.UnitedStates()
holiday_flag = pd.Series(idx.map(lambda d: 1 if d in us_holidays else 0), index=idx)
holidays_ts = TimeSeries.from_series(holiday_flag, freq='D')

future_covariates = dow.stack(moy).stack(is_weekend).stack(holidays_ts)
future_covariates

In [None]:
# Scale target and backtest
scaler = Scaler()
series_scaled = scaler.fit_transform(series)

lags = [-1, -2, -3, -4, -5, -6, -7, -14, -21]
darts_model = LightGBMModel(
    lags=lags,
    lags_future_covariates=[0],  # use contemporaneous calendar/holiday features
    output_chunk_length=1,
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=31,
    random_state=42,
)

# Rolling 1-step backtest from 80%
backtest_fcst_scaled = darts_model.historical_forecasts(
    series=series_scaled,
    future_covariates=future_covariates,
    start=0.8,
    forecast_horizon=1,
    stride=1,
    retrain=True,
    last_points_only=True,
    verbose=True,
)

backtest_fcst = scaler.inverse_transform(backtest_fcst_scaled)
series_orig = scaler.inverse_transform(series_scaled)
print(f"Backtest MAE: {mae(series_orig.slice_intersect(backtest_fcst), backtest_fcst):.2f}")
print(f"Backtest sMAPE: {smape(series_orig.slice_intersect(backtest_fcst), backtest_fcst):.2f}%")

In [None]:
# Fit on full data and forecast next day
darts_model.fit(series_scaled, future_covariates=future_covariates)

last_date = series.end_time()
next_date = last_date + pd.Timedelta(days=1)
full_idx = pd.date_range(start=series.start_time(), end=next_date, freq='D')

# Recreate covariates over the extended range using same attributes
dow_f = datetime_attribute_timeseries(full_idx, attribute="weekday", one_hot=True)
moy_f = datetime_attribute_timeseries(full_idx, attribute="month", one_hot=True)
weekend_flag_f = pd.Series((full_idx.weekday >= 5).astype(np.int8), index=full_idx)
is_weekend_f = TimeSeries.from_series(weekend_flag_f, freq='D')
us_holidays = pyholidays.UnitedStates()
holiday_flag_f = pd.Series(full_idx.map(lambda d: 1 if d in us_holidays else 0), index=full_idx)
holidays_ts_f = TimeSeries.from_series(holiday_flag_f, freq='D')
future_covariates_ext = dow_f.stack(moy_f).stack(is_weekend_f).stack(holidays_ts_f)

next_scaled = darts_model.predict(n=1, future_covariates=future_covariates_ext)
next_value = float(scaler.inverse_transform(next_scaled).values()[-1][0])
print(f"Next-day forecast (Darts LightGBM): {next_value:.0f}")

DARTS_NEXT_VALUE = next_value
DARTS_NEXT_DATE = next_date

In [None]:
# Save forecast to CSV (same file as before)
SAVE_TO_CSV = True
if SAVE_TO_CSV:
    import os, csv
    from datetime import datetime
    log_file = 'C:\\Users\\Setup User\\Documents\\Codespaces\\MR Technology projects\\kalshi\\transportation\\Zach\\subway_predictions.csv'
    file_exists = os.path.isfile(log_file)
    with open(log_file, 'a', newline='') as f:
        writer = csv.writer(f)
        if not file_exists:
            writer.writerow(['target_date', 'predicted', 'predicted_at'])
        writer.writerow([DARTS_NEXT_DATE.strftime('%Y-%m-%d'), DARTS_NEXT_VALUE, datetime.now().isoformat()])
    print(f"Logged Darts prediction {DARTS_NEXT_VALUE:.2f} for {DARTS_NEXT_DATE.date()}")