In [2]:


import requests
import json

def get_ny_data():
    url = "https://data.ny.gov/resource/sayj-mze2.json"
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
        data = response.json()
        return data
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None

if __name__ == "__main__":
    ny_data = get_ny_data()
    if ny_data:
        print(json.dumps(ny_data, indent=4))
    else:
        print("No data retrieved.")

[
    {
        "date": "2025-08-10T00:00:00.000",
        "mode": "MNR",
        "count": "116733.0"
    },
    {
        "date": "2025-08-10T00:00:00.000",
        "mode": "BT",
        "count": "961204.0"
    },
    {
        "date": "2025-08-10T00:00:00.000",
        "mode": "SIR",
        "count": "2107.0"
    },
    {
        "date": "2025-08-10T00:00:00.000",
        "mode": "Bus",
        "count": "712941.0"
    },
    {
        "date": "2025-08-10T00:00:00.000",
        "mode": "Subway",
        "count": "2198432.0"
    },
    {
        "date": "2025-08-10T00:00:00.000",
        "mode": "AAR",
        "count": "27679.0"
    },
    {
        "date": "2025-08-10T00:00:00.000",
        "mode": "LIRR",
        "count": "136708.0"
    },
    {
        "date": "2025-08-09T00:00:00.000",
        "mode": "Subway",
        "count": "2669029.0"
    },
    {
        "date": "2025-08-09T00:00:00.000",
        "mode": "AAR",
        "count": "28414.0"
    },
    {
        "date": "2025-08-

In [3]:
"""
Cell generated by Data Wrangler.
"""
import pandas as pd

def clean_data(ny_data_df):
    # Filter rows based on column: 'mode'
    ny_data_df = ny_data_df[ny_data_df['mode'].str.contains("Subway", regex=False, na=False, case=False)]
    # Calculate 7-day trailing average of 'count' in descending date order
    ny_data_df['7_day_avg'] = (
        ny_data_df['count']
        .astype(float)
        .iloc[::-1]  # Reverse the order for correct trailing average
        .rolling(window=7)
        .mean()
        .iloc[::-1]  # Reverse back to original order
    )
    ny_data_df.drop(columns=['mode'], inplace=True)
    return ny_data_df

# Loaded variable 'ny_data' from kernel state
ny_data_df = pd.DataFrame(ny_data)

ny_data_df_clean = clean_data(ny_data_df.copy())
ny_data_df_clean = ny_data_df_clean.sort_values(by='date', ascending=True)
ny_data_df_clean.head()

Unnamed: 0,date,count,7_day_avg
995,2025-04-03T00:00:00.000,4286943.0,
988,2025-04-04T00:00:00.000,3990036.0,
980,2025-04-05T00:00:00.000,2658935.0,
964,2025-04-06T00:00:00.000,2059867.0,
958,2025-04-07T00:00:00.000,3848628.0,


## Predict Next Value with Supervised Learning
Convert the 1-dimensional time series into lagged features and train a regression model.

In [4]:
# 1) Build lag features
df = ny_data_df_clean[['date','count']].copy()
df['lag1'] = df['count'].shift(1)
df['lag7'] = df['count'].shift(7)
df = df.dropna().set_index('date')

# 2) Define X/y
X = df[['lag1','lag7']]
y = df['count']

# 3) Train/test split
split_idx = int(len(df) * 0.8)
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

# 4) Train model
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 5) Evaluate
from sklearn.metrics import mean_absolute_error
y_pred = model.predict(X_test)
print(f"MAE: {mean_absolute_error(y_test, y_pred):.2f}")

# 6) Next-day prediction
last_row = X.iloc[[-1]]

print(f"Next-day forecast: {model.predict(last_row.round())[0]:.0f}")

MAE: 121118.78
Next-day forecast: 2228815


In [5]:
import os
import csv
from datetime import datetime

# 1) Determine the next chronological date
target_date = pd.to_datetime(df.index.max()) + pd.Timedelta(days=1)

# 2) Capture prediction and current timestamp
prediction_value = model.predict(last_row)[0]
predicted_at = datetime.now().isoformat()

# 3) Append to CSV log
log_file = 'transportation\\subway_predictions.csv'
file_exists = os.path.isfile(log_file)
with open(log_file, 'a', newline='') as f:
    writer = csv.writer(f)
    if not file_exists:
        writer.writerow(['target_date', 'predicted', 'predicted_at'])
    writer.writerow([target_date.strftime('%Y-%m-%d'), prediction_value, predicted_at])

print(f"Logged prediction {prediction_value:.2f} for {target_date.date()} at {predicted_at}")

Logged prediction 2228815.20 for 2025-08-11 at 2025-08-12T01:37:42.984408
