In [1]:
import matplotlib.pyplot as plt
import math
import numpy as np
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA

In [2]:
# Load the daily trip counts and cut them into overlapping sliding windows,
# each split chronologically into a training part and a validation part.
#
# NOTE(review): absolute, machine-specific path -- this cell only runs where
# this exact file exists; consider a configurable data directory.
path = r"C:\Users\yanzh\Desktop\code_and_data\4. Deep learning part\处理数据\daily\2015-2019(daily_total_trips).csv"
trips_frame = pd.read_csv(path)
# The parsed dates are not used by the windowing below; only `total_trips` is kept.
trips_frame['date'] = pd.to_datetime(trips_frame['date'])
data = trips_frame['total_trips']

num_windows = 5
step_fraction = 0.7   # each window starts this fraction of a window after the previous one
train_fraction = 0.9  # leading share of every window used for fitting; the rest is validation

# `//` floors, so window_size is rounded DOWN to a whole number of rows.
# NOTE(review): if rounding up was intended, use
# math.ceil(len(data) / (num_windows - 1.2)); that changes every window.
window_size = int(len(data) // (num_windows - 1.2))
step_size = math.ceil(window_size * step_fraction)

windows = []             # full windows
train_dataset = []       # first `train_fraction` of each window
validation_dataset = []  # remaining tail of each window

# A window is started for every offset that still leaves at least one full
# step of data; the slice is clipped at the end of the series, so the last
# window may be shorter than window_size. The number of windows produced is
# determined by these bounds and is not forced to equal num_windows.
# range() raises ValueError when step_size is 0 (series too short to window).
for start in range(0, len(data) - step_size + 1, step_size):
    window_data = data.iloc[start:start + window_size]
    train_val_split = int(len(window_data) * train_fraction)

    windows.append(window_data)
    train_dataset.append(window_data.iloc[:train_val_split])
    validation_dataset.append(window_data.iloc[train_val_split:])

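Therefore, we use p = 3 (autoregressive order) and q = 5 (moving-average order), i.e. `order=(3, 0, 5)` in the models below.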

In [3]:
# Evaluate a non-seasonal ARIMA model on every sliding window: fit on the
# window's training split, forecast the length of its validation split, and
# collect MAE / MAPE / RMSE / R^2 per window before averaging them.
total_mae_losses = []
total_mape_losses = []
total_rmse_losses = []
total_r2_scores = []

# Iterate over the windows that actually exist instead of a hard-coded count,
# so a change in the windowing cannot cause an IndexError or skipped windows.
for train_series, validation_series in zip(train_dataset, validation_dataset):
    # order=(p, d, q) = (3, 0, 5): no differencing.
    model = ARIMA(train_series, order=(3, 0, 5))
    model_fit = model.fit()

    forecast_size = len(validation_series)
    forecast = model_fit.forecast(steps=forecast_size)
    # Forecasts are rounded up to whole numbers before scoring
    # (presumably because trip counts are integers -- confirm).
    forecast = np.ceil(forecast).astype(int)

    all_predicted = np.asarray(forecast)
    all_actual = np.asarray(validation_series)
    errors = all_actual - all_predicted

    # NOTE(review): MAPE divides by the actual values; a zero actual value
    # yields inf/nan for that window.
    mae = np.mean(np.abs(errors))
    mape = np.mean(np.abs(errors / all_actual)) * 100
    rmse = np.sqrt(np.mean(np.square(errors)))

    # R^2 = 1 - SS_res / SS_tot, relative to the validation-split mean.
    ss_res = np.sum(errors ** 2)
    ss_tot = np.sum((all_actual - np.mean(all_actual)) ** 2)
    r2 = 1 - (ss_res / ss_tot)

    total_mae_losses.append(mae)
    total_mape_losses.append(mape)
    total_rmse_losses.append(rmse)
    total_r2_scores.append(r2)

# Calculate average of metrics across all windows
average_mae_loss = np.mean(total_mae_losses)
average_mape_loss = np.mean(total_mape_losses)
average_rmse_loss = np.mean(total_rmse_losses)
average_r2_score = np.mean(total_r2_scores)

# NOTE(review): the labels say "Test Set", but these metrics are computed on
# the validation split of each window.
print(f'Average MAE on Test Set: {average_mae_loss:.4f}')
print(f'Average MAPE on Test Set: {average_mape_loss:.4f}')
print(f'Average RMSE on Test Set: {average_rmse_loss:.4f}')
print(f'R^2 Score on Test Set: {average_r2_score:.4f}')



Average MAE on Test Set: 23865.3208
Average MAPE on Test Set: 12.9175
Average RMSE on Test Set: 31842.0265
R^2 Score on Test Set: 0.0791




Next, we fit a seasonal ARIMA (SARIMA) model with a weekly seasonal period of 7 days.

In [3]:
# NOTE(review): this import belongs in the notebook's top imports cell; it is
# kept here so the cell runs on its own.
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Evaluate a seasonal ARIMA model on every sliding window: fit on the
# window's training split, forecast the length of its validation split, and
# collect MAE / MAPE / RMSE / R^2 per window before averaging them.
total_mae_losses = []
total_mape_losses = []
total_rmse_losses = []
total_r2_scores = []

# Iterate over the windows that actually exist instead of a hard-coded count,
# so a change in the windowing cannot cause an IndexError or skipped windows.
for train_series, validation_series in zip(train_dataset, validation_dataset):
    # Fit the SARIMAX model: order=(p, d, q) = (3, 0, 5) and
    # seasonal_order=(P, D, Q, s) = (1, 1, 1, 7), i.e. a seasonal period of 7.
    model = SARIMAX(train_series, order=(3, 0, 5), seasonal_order=(1, 1, 1, 7))
    model_fit = model.fit()

    forecast_size = len(validation_series)
    forecast = model_fit.forecast(steps=forecast_size)
    # Forecasts are rounded up to whole numbers before scoring
    # (presumably because trip counts are integers -- confirm).
    forecast = np.ceil(forecast).astype(int)

    all_predicted = np.asarray(forecast)
    all_actual = np.asarray(validation_series)
    errors = all_actual - all_predicted

    # NOTE(review): MAPE divides by the actual values; a zero actual value
    # yields inf/nan for that window.
    mae = np.mean(np.abs(errors))
    mape = np.mean(np.abs(errors / all_actual)) * 100
    rmse = np.sqrt(np.mean(np.square(errors)))

    # R^2 = 1 - SS_res / SS_tot, relative to the validation-split mean.
    ss_res = np.sum(errors ** 2)
    ss_tot = np.sum((all_actual - np.mean(all_actual)) ** 2)
    r2 = 1 - (ss_res / ss_tot)

    total_mae_losses.append(mae)
    total_mape_losses.append(mape)
    total_rmse_losses.append(rmse)
    total_r2_scores.append(r2)

# Calculate average of metrics across all windows
average_mae_loss = np.mean(total_mae_losses)
average_mape_loss = np.mean(total_mape_losses)
average_rmse_loss = np.mean(total_rmse_losses)
average_r2_score = np.mean(total_r2_scores)

# NOTE(review): the labels say "Test Set", but these metrics are computed on
# the validation split of each window.
print(f'Average MAE on Test Set: {average_mae_loss:.4f}')
print(f'Average MAPE on Test Set: {average_mape_loss:.4f}')
print(f'Average RMSE on Test Set: {average_rmse_loss:.4f}')
print(f'R^2 Score on Test Set: {average_r2_score:.4f}')



Average MAE on Test Set: 29309.8167
Average MAPE on Test Set: 14.3552
Average RMSE on Test Set: 37955.5427
R^2 Score on Test Set: -0.4545
