In [1]:
from _Setup import *
from _Functions import *

Installing dependencies from requirements.txt...
All dependencies installed successfully.


In [2]:
# Load data as csv
sector_train = pd.read_csv(sector_data_csv_path_train_pivoted, index_col='Area of Responsibility')

# Import the scaler
with open(scalers_file, "rb") as file:
    scalers = pickle.load(file)

# DONE : Create scaled version of original Dataframe
# Apply pre-trained scalers row-wise

# Create an empty DataFrame to store scaled values
sector_train_scaled = pd.DataFrame(index=sector_train.index, columns=sector_train.columns)

# Apply each pre-trained scaler row-wise
for index, row in sector_train.iterrows():
	scaler = scalers[index]
	scaled_row = scaler.transform(row.values.reshape(-1, 1)).flatten()
	sector_train_scaled.loc[index] = scaled_row

sector_train_scaled.head()

Unnamed: 0_level_0,2019-10-01,2019-11-01,2019-12-01,2020-01-01,2020-02-01,2020-03-01,2020-04-01,2020-05-01,2020-06-01,2020-07-01,...,2023-09-01,2023-10-01,2023-11-01,2023-12-01,2024-01-01,2024-02-01,2024-03-01,2024-04-01,2024-05-01,2024-06-01
Area of Responsibility,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Atlanta Field Office,0.309259,0.148765,0.026543,0.003704,0.0,0.201235,0.318519,1.0,0.624074,0.559877,...,0.388272,0.244444,0.401852,0.287654,0.340741,0.267284,0.329012,0.324691,0.340741,0.301235
Baltimore Field Office,0.202374,0.26286,0.323912,0.346523,0.325042,0.159412,0.0,0.052007,0.036744,0.017524,...,0.940079,0.851328,0.811758,0.821933,0.836631,0.848502,0.765969,0.681176,0.66026,0.74788
Big Bend Sector,0.090589,0.064323,0.067839,0.080455,0.071768,0.09514,0.060393,0.085419,0.092037,0.109824,...,0.071355,0.054602,0.04364,0.02213,0.022544,0.073009,0.045708,0.064943,0.110858,0.0
Blaine Sector,0.091463,0.067073,0.103659,0.070122,0.07622,0.042683,0.012195,0.033537,0.009146,0.039634,...,0.658537,0.692073,0.597561,0.881098,0.67378,0.710366,0.682927,1.0,0.804878,0.545732
Boston Field Office,0.242847,0.171496,0.272365,0.16226,0.234698,0.100869,0.0431,0.0,0.040746,0.027888,...,1.0,0.61427,0.578776,0.757334,0.806592,0.644151,0.847881,0.88573,0.861101,0.807678


In [3]:
# Import Test data
test_data = pd.read_csv(sector_data_csv_path_test_pivoted, index_col='Area of Responsibility')
test_data.shape

(41, 6)

In [15]:
from statsmodels.tsa.arima.model import ARIMA
import numpy as np
import pandas as pd
import itertools
import warnings

warnings.filterwarnings("ignore")  # Suppress warnings

# Define the range of ARIMA parameters to search
p_values = range(0, 4)  # Autoregressive terms
d_values = range(0, 2)  # Differencing order
q_values = range(0, 4)  # Moving average terms

# Generate all (p, d, q) combinations safely
pdq_combinations = list(itertools.product(p_values, d_values, q_values))

mapes = []

# Ensure all values are valid integers (avoid NaNs, None, etc.)
fixed_pdq_combinations = [
    tuple(int(value) for value in order)  # Explicit integer conversion
    for order in pdq_combinations
]

# Dictionary to store predictions
predictions_dict = {}

# Iterate over each time series row
for index, row in sector_train_scaled.iterrows():
    try:
        # Convert row to numeric and drop NaN values
        input_series = pd.to_numeric(row, errors='coerce').dropna().values.flatten()

        # Ensure input_series is numeric and of sufficient length
        if len(input_series) < 3:
            print(f"Skipping index {index}: Not enough data points for ARIMA.")
            continue
        
        # Grid search to find the best ARIMA parameters
        best_aic = float("inf")
        best_order = None
        best_model = None
        best_residuals = None

        for order in fixed_pdq_combinations:
            try:
                model = ARIMA(input_series, order=order)
                fitted_model = model.fit()
                aic = fitted_model.aic  # Akaike Information Criterion

                # TODO : Calculate the MAPE of the training data
                

                if aic < best_aic:
                    best_aic = aic
                    best_order = order
                    best_model = fitted_model
                    best_residuals = np.array(fitted_model.resid)

            except Exception as e:
                print(f"Skipping ARIMA{order} for index {index}: {e}")
                continue  # Skip parameter sets that fail to converge


        row_array = np.array(row)

        # Inverse transform predictions and true values
        true_values = scalers[index].inverse_transform(row_array.reshape(-1, 1)).flatten()
        pred_values = true_values - scalers[index].inverse_transform(best_residuals.reshape(-1, 1)).flatten()

        # Avoid division by zero by masking zero true values
        nonzero_mask = true_values != 0
        if not np.any(nonzero_mask):
            print(f"Skipping index {index}: All true values are zero after inverse transform.")
            continue

        mape = np.mean(np.abs((true_values[nonzero_mask] - pred_values[nonzero_mask]) / true_values[nonzero_mask])) * 100
        mapes.append(mape)


    except Exception as e:
        print(f"Skipping index {index} due to unexpected error: {e}")

In [17]:
np.mean(mapes)

np.float64(53.66799295988368)