# 7-Day Average NYC Subway Riders
## Summary
This notebook predicts the next day's NYC subway ridership. Each prediction is then appended to a CSV log (`subway_predictions.csv`) that accumulates the predictions over time.

## Pull data

In [1]:
import requests
import json

def get_ny_data(timeout=30):
    """Download ridership records from the data.ny.gov JSON endpoint.

    Args:
        timeout: Seconds to wait for the server (connect and read) before
            giving up. Without a timeout, ``requests`` waits indefinitely and
            a stalled connection would hang the notebook.

    Returns:
        The decoded JSON payload (the sample output in this notebook shows a
        list of records with "date", "mode" and "count" fields), or ``None``
        if the request failed or the body was not valid JSON.
    """
    url = "https://data.ny.gov/resource/sayj-mze2.json"
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
        return response.json()
    # ValueError covers invalid JSON on requests releases where the decode
    # error is not a RequestException subclass.
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"An error occurred: {e}")
        return None

if __name__ == "__main__":
    # Keep the payload in `ny_data`; the pre-processing cell reads it later.
    ny_data = get_ny_data()
    if not ny_data:
        print("No data retrieved.")
    else:
        # Preview only the first three records to keep the cell output short.
        preview = ny_data[:3]
        print(json.dumps(preview, indent=4))

[
    {
        "date": "2025-08-12T00:00:00.000",
        "mode": "MNR",
        "count": "225090.0"
    },
    {
        "date": "2025-08-12T00:00:00.000",
        "mode": "SIR",
        "count": "7365.0"
    },
    {
        "date": "2025-08-12T00:00:00.000",
        "mode": "Bus",
        "count": "1327749.0"
    }
]


## Pre-process data

In [36]:
import pandas as pd

def clean_data(ny_data_df):
    """Keep the subway rows and attach a 7-day trailing average of 'count'.

    Args:
        ny_data_df: DataFrame with 'date', 'mode' and 'count' columns. Rows
            may be in any order; 'date' values are assumed to sort
            chronologically (the ISO-style strings shown in this notebook's
            output do). One row per date is assumed for the subway mode.

    Returns:
        A new DataFrame with the rows whose 'mode' contains "subway"
        (case-insensitive), in their original order and with their original
        index, without the 'mode' column and with an added '7_day_avg'
        column. '7_day_avg' is the mean of that day's count and the six
        preceding days; it is NaN for the six earliest dates. The input frame
        is not modified.
    """
    is_subway = ny_data_df['mode'].str.contains("Subway", regex=False, na=False, case=False)
    # Work on an independent copy so the column assignment below never writes
    # into a view of the caller's frame (avoids SettingWithCopyWarning).
    subway_df = ny_data_df[is_subway].copy()

    # Order oldest-to-newest explicitly so the rolling window is always a
    # trailing one, whatever order the API returned the rows in.
    chronological_counts = (
        subway_df
        .sort_values(by='date', kind='stable')['count']
        .astype(float)
    )
    # Assignment aligns on the index, so each average lands on its own row
    # and the original row order is preserved.
    subway_df['7_day_avg'] = chronological_counts.rolling(window=7).mean()

    return subway_df.drop(columns=['mode'])

# `ny_data` comes from the data-pull cell earlier in the notebook (kernel state)
ny_data_df = pd.DataFrame(ny_data)
# Clean a copy so the raw frame stays untouched, then order oldest-to-newest
ny_data_df_clean = (
    clean_data(ny_data_df.copy())
    .sort_values(by='date', ascending=True)
)
ny_data_df_clean.head()

Unnamed: 0,date,count,7_day_avg
995,2025-04-04T00:00:00.000,3990036.0,
987,2025-04-05T00:00:00.000,2658935.0,
971,2025-04-06T00:00:00.000,2059867.0,
965,2025-04-07T00:00:00.000,3848628.0,
961,2025-04-08T00:00:00.000,4324220.0,


In [37]:
# Show the latest dates; unlike the head, these rows have a populated 7_day_avg
ny_data_df_clean.tail()

Unnamed: 0,date,count,7_day_avg
30,2025-08-07T00:00:00.000,3944006.0,3414440.0
27,2025-08-08T00:00:00.000,3653380.0,3422360.0
19,2025-08-09T00:00:00.000,2669104.0,3423458.0
12,2025-08-10T00:00:00.000,2199245.0,3436332.0
2,2025-08-11T00:00:00.000,3536290.0,3426744.0


## Save to subway_predictions.csv

In [None]:
# True/False to save the run to CSV
SAVE_TO_CSV = False
# Destination of the prediction log. NOTE(review): this is a machine-specific
# absolute path; change it here when running the notebook elsewhere.
PREDICTIONS_CSV = 'C:\\Users\\Setup User\\Documents\\Codespaces\\MR Technology projects\\kalshi\\transportation\\Zach\\subway_predictions.csv'
if SAVE_TO_CSV:
    import csv
    import os
    from datetime import datetime

    log_file = PREDICTIONS_CSV

    # Build the row and the confirmation message before touching the file, so
    # a missing or malformed prediction fails without creating or altering
    # the log. DARTS_NEXT_DATE / DARTS_NEXT_VALUE are not defined in the cells
    # shown here; presumably a forecasting cell sets them (TODO confirm).
    row = [DARTS_NEXT_DATE.strftime('%Y-%m-%d'), DARTS_NEXT_VALUE, datetime.now().isoformat()]
    confirmation = f"Logged Darts prediction {DARTS_NEXT_VALUE:.2f} for {DARTS_NEXT_DATE.date()}"

    file_exists = os.path.isfile(log_file)
    # An existing but empty file still needs the header row.
    needs_header = (not file_exists) or os.path.getsize(log_file) == 0

    # newline='' lets the csv module control line endings itself.
    with open(log_file, 'a', newline='') as f:
        writer = csv.writer(f)
        if needs_header:
            writer.writerow(['target_date', 'predicted', 'predicted_at'])
        writer.writerow(row)
    print(confirmation)