In [1]:
import os
import io
import sys
import json
import time
import random
import logging

import numpy as np
import pandas as pd
import datetime

import plotly.io as pio
import streamlit as st
import seaborn as sns
import zipfile

import plotly.graph_objects as go
from sklearn.metrics import mean_squared_error

from utils.manager.login import *
from utils.inputs.validation import *
from utils.inputs.ads import *
from utils.modeling.general import *
from utils.modeling.skforecast_utils import *
from utils.modeling.sktime_utils import *

# Set up the logging configuration for cmdstanpy
logger = logging.getLogger()

# Add NullHandler with CRITICAL log level
null_handler = logging.NullHandler()
null_handler.setLevel(logging.CRITICAL)
logger.addHandler(null_handler)

# Add StreamHandler with INFO log level
stream_handler = logging.StreamHandler()
stream_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
stream_handler.setLevel(logging.INFO)
logger.addHandler(stream_handler)

logger.propagate = False

## Inputs

In [2]:
# Add dropdown for Country
country_name = "CA"

# Add dropdown for frequency
forecast_freq = "B"

# Add dropdown for data selection
data_selection = True

# Add dropdown for data selection
external_features = True

# Add file uploader to the sidebar
uploaded_historical_file = 'Agency Services_multi.csv'

if external_features:
    uploaded_forecast_file = 'Agency Services_multi_forecast.csv'

In [3]:
if forecast_freq == "D":
    forecast_period = 92
elif forecast_freq == "B":
    forecast_period = 66
elif forecast_freq == "W":
    forecast_period = 26
elif forecast_freq == "M":
    forecast_period = 12

In [4]:
# Organize these selections into a dictionary
run_params = {
    "country_name": country_name,
    "forecast_freq": forecast_freq,
    "forecast_period": forecast_period,
    "data_selection": data_selection,
    "external_features": external_features,
    "weekend_weight": 5,
    "holiday_weight": 10,
    "metric_key": "mspe"
}

## Validation

In [5]:
try:
    # Validate the input file
    historical_df = validate_input_file(uploaded_historical_file, external_features)
    logging.info(f"Historical Data Size: {historical_df.shape}")
    # Find the min data for optimal train data
    run_params["historical_start_date"] = historical_df['ds'].min()
    run_params["historical_end_date"] = historical_df['ds'].max()
    run_params["forecast_start_date"] = historical_df['ds'].max() + pd.Timedelta(days=1)
    run_params["forecast_end_date"] = historical_df['ds'].max() + pd.Timedelta(days=run_params["forecast_period"])
    
except Exception as e:
    # Log this exception or handle it further up the call stack
    raise Exception(f"An error occurred while validating the uploaded historical data: {str(e)}")

In [6]:
historical_df

Unnamed: 0,ds,y,Insurance_Actual
0,2021-01-01,0,9315434.0
1,2021-01-02,0,9315434.0
2,2021-01-03,0,9315434.0
3,2021-01-04,228,9315434.0
4,2021-01-05,186,9315434.0
...,...,...,...
937,2023-07-27,265,17681592.0
938,2023-07-28,247,17681592.0
939,2023-07-29,0,17681592.0
940,2023-07-30,0,17681592.0


In [7]:
try:
    if run_params["external_features"]:
        # Validate the input file
        forecast_df = validate_input_file(uploaded_forecast_file, external_features)
        logging.info(f"Forecast Data Size: {forecast_df.shape}")
        
        assert forecast_df['ds'].min() == run_params["forecast_start_date"], 'Forecast Start Data is not aligned with Historical End Date'
        
        assert forecast_df['ds'].max() >= run_params["forecast_end_date"], 'Forecast Data is not available for entire Forecast Period'
        
    else:
        forecast_df = pd.DataFrame(columns=historical_df.columns)
    
except Exception as e:
    # Log this exception or handle it further up the call stack
    raise Exception(f"An error occurred while validating the uploaded forecast data: {str(e)}")

In [8]:
forecast_df

Unnamed: 0,ds,y,Insurance_Actual
0,2023-08-01,0.0,19162757.00
1,2023-08-02,0.0,19162757.00
2,2023-08-03,0.0,19162757.00
3,2023-08-04,0.0,19162757.00
4,2023-08-05,0.0,19162757.00
...,...,...,...
484,2024-11-27,0.0,31824538.23
485,2024-11-28,0.0,31824538.23
486,2024-11-29,0.0,31824538.23
487,2024-11-30,0.0,31824538.23


## Automated Data Selection

In [9]:
try:
    if data_selection:
        
        # Find optimal window 
        optimal_window_size = find_optimal_window(historical_df)
            
        logging.info(f"Optimal Window Size: {optimal_window_size}")

        # Add 180 days for feature engineering to optimal window
        optimal_window_size += 180

    else:
        optimal_window_size = len(historical_df)
        
except Exception as e:
    # Log this exception or handle it further up the call stack
    raise Exception(f"An error occurred while finding the optimal window: {str(e)}")

    
# Truncate the train set based on optimal window
optimal_df = historical_df[-optimal_window_size:].copy(deep=True)
    
logging.info(f"Optimal Train Data Size: {optimal_df.shape}")

# Find the min data for optimal train data
run_params["optimal_window_size"] = optimal_window_size
run_params["optimal_window_start_date"] = optimal_df['ds'].min()

In [10]:
optimal_window_size

540

## Final Data Checks

In [11]:
try:
    # Validate column counts based on whether external features are used
    if external_features:
        assert optimal_df.shape[1] > 2 and forecast_df.shape[1] > 2, "Uploaded Historical or Forecast Data does have required number of columns!"
    else:
        assert optimal_df.shape[1] == 2 and forecast_df.shape[1] == 2, "Uploaded Historical or Forecast Data does have required number of columns!"
    # Ensure non-empty data structure
    assert optimal_df.shape[1] > 0, "Uploaded Historical Data does not have enough rows!"
    # Ensure same number of columns
    assert optimal_df.shape[1] == forecast_df.shape[1], "Uploaded Historical and Forecast Data do not have the same number of columns"
except Exception as e:
    raise ValueError("Invalid input data format.")

In [12]:
# Get the names of the exogenous variables from the train data
run_params["exog_cols"] = list((optimal_df.columns).difference(['y', 'ds']))

## Feature Engineering

In [13]:
try:
    # Generate date features
    optimal_df = resample_dataframe(optimal_df, run_params["forecast_freq"])
    forecast_df = resample_dataframe(forecast_df, run_params["forecast_freq"])
except Exception as e:
    raise Exception(f"Failed to set the data frequency to {forecast_freq}: {e}")

In [14]:
try:
    # Generate date features
    optimal_df = generate_date_features(optimal_df, forecast_freq, country_name)
    forecast_df = generate_date_features(forecast_df, forecast_freq, country_name)
except Exception as e:
    raise ValueError(f"Failed to generate features using 'ds': {e}")

In [15]:
# Get the names of the exogenous variables from the train data
run_params["exog_cols_all"]  = list((optimal_df.columns).difference(['y', 'ds']))

In [16]:
try:
    initial_window_size, lag_window_range, rolling_window_range, test_size, test_steps = determine_params(forecast_freq)
    logger.info(f"Initial Window Size: {initial_window_size}, Lag Window Range: {lag_window_range}")
    logger.info(f"Test Size: {test_size}, Test Steps: {test_steps}")
except Exception as e:
    raise Exception(e)

In [17]:
run_params.update({
    "initial_window_size": initial_window_size,
    "lag_window_range": lag_window_range,
    "rolling_window_range": rolling_window_range,
    "test_size": test_size,
    "test_steps": test_steps
})

## Train and Test Split

In [18]:
try:
    test_df = optimal_df[-test_size:].copy(deep=True)
    test_df = test_df.set_index('ds').resample(run_params["forecast_freq"]).sum()
    test_df = test_df.fillna(0)
    
    train_df = optimal_df[:-test_size].copy(deep=True)
    train_df = train_df.set_index('ds').resample(run_params["forecast_freq"]).sum()
    train_df = train_df.fillna(0)
    
    assert len(train_df) + len(test_df) == len(optimal_df)
    
    run_params["train_start_date"] = train_df.index.min()
    run_params["train_end_date"] = train_df.index.max()
    
    run_params["test_start_date"] = test_df.index.min()
    run_params["test_end_date"] = test_df.index.max()
    
    optimal_df = optimal_df.set_index('ds').resample(run_params["forecast_freq"]).sum()
    optimal_df = optimal_df.fillna(0)
    
    forecast_df = forecast_df.set_index('ds').resample(run_params["forecast_freq"]).sum()
    forecast_df = forecast_df.fillna(0)
except Exception as e:
    raise ValueError(f"Failed to split into train and test: {e}")

## Grid Search

In [19]:
current_dir = 'utils/modeling'

model_types = {
    'prophet': 'sktime',
    'naive': 'sktime',
    'random_forest': 'skforecast',
    'xgboost': 'skforecast'
}

search_results = {}

for model_type, package_type in model_types.items():
    # Load parameters for grid search
    model, param_grid = load_model_params_and_create_instance(model_type, current_dir)
    
    if package_type == 'sktime':
        # Find best model
        best_configuration, all_results, best_model = find_best_model_sktime(
            train_df['y'], run_params, model, param_grid
        )
    
    elif package_type == 'skforecast':
        # Find best model
        best_configuration, all_results, best_model = find_best_model_skforecast(
            lag_window_range, model, train_df, param_grid, run_params
        )
    
    else:
        raise Exception('Unknown package type!')
    
    # Save best model and config
    search_results[model_type] = {
        'best_model': best_model,
        'best_configuration': best_configuration,
        'all_results': all_results,
        'package_type': package_type
    }

Fitting 10 folds for each of 1 candidates, totalling 10 fits


INFO:prophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
2024-06-23 15:01:47,176 - prophet - INFO - Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
2024-06-23 15:01:47,177 - prophet - INFO - Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /var/folders/4t/nrkfp1595tb366616gpf9d540000gn/T/tmplmjiuqcd/y76gptn3.json
DEBUG:cmdstanpy:input tempfile: /var/folders/4t/nrkfp1595tb366616gpf9d540000gn/T/tmplmjiuqcd/1jalayw4.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/Users/abhishekagarwal/opt/anaconda3/lib/python3.9/site-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=96480', 'data', 'file=/var/folders/4t/nrkfp1595tb366616gpf9d540000gn/T/tmplmjiuqcd/y76gptn3.json'

Fitting 10 folds for each of 1 candidates, totalling 10 fits
Number of models compared: 1.


lags grid:   0%|          | 0/1 [00:00<?, ?it/s]

params grid:   0%|          | 0/1 [00:00<?, ?it/s]

Number of models compared: 1.


lags grid:   0%|          | 0/1 [00:00<?, ?it/s]

params grid:   0%|          | 0/1 [00:00<?, ?it/s]

Number of models compared: 1.


lags grid:   0%|          | 0/1 [00:00<?, ?it/s]

params grid:   0%|          | 0/1 [00:00<?, ?it/s]

Number of models compared: 1.


lags grid:   0%|          | 0/1 [00:00<?, ?it/s]

params grid:   0%|          | 0/1 [00:00<?, ?it/s]

Number of models compared: 1.


lags grid:   0%|          | 0/1 [00:00<?, ?it/s]

params grid:   0%|          | 0/1 [00:00<?, ?it/s]

Number of models compared: 1.


lags grid:   0%|          | 0/1 [00:00<?, ?it/s]

params grid:   0%|          | 0/1 [00:00<?, ?it/s]

Number of models compared: 1.


lags grid:   0%|          | 0/1 [00:00<?, ?it/s]

params grid:   0%|          | 0/1 [00:00<?, ?it/s]

Number of models compared: 1.


lags grid:   0%|          | 0/1 [00:00<?, ?it/s]

params grid:   0%|          | 0/1 [00:00<?, ?it/s]

Number of models compared: 1.


lags grid:   0%|          | 0/1 [00:00<?, ?it/s]

params grid:   0%|          | 0/1 [00:00<?, ?it/s]

Number of models compared: 1.


lags grid:   0%|          | 0/1 [00:00<?, ?it/s]

params grid:   0%|          | 0/1 [00:00<?, ?it/s]

## Test Set Evaluation

In [20]:
test_eval = {}

for model_type, model_results in search_results.items():
    if model_results['package_type'] == 'sktime':
        best_model = search_results[model_type]['best_model']
        best_model.fit(y=train_df['y'])
        predictions_df = generate_forecast_sktime(best_model, len(test_df))
    elif model_results['package_type'] == 'skforecast':
        best_model = search_results[model_type]['best_model']
        best_model.fit(y=train_df['y'], exog=train_df[run_params["exog_cols_all"]])
        predictions_df = generate_forecast_skforecast(best_model, run_params, train_df['y'], test_df.drop('y', axis=1),
                                      run_params["test_start_date"])
    else:
        raise Exception('Unknown package type!')

    test_eval[model_type] = compute_metrics(predictions_df.merge(test_df.reset_index()), train_df["y"])
    
# Convert the list of dictionaries into a DataFrame for easy manipulation
metrics_df = pd.DataFrame(test_eval).T

# Round off the values in the DataFrame to 3 decimal places for better readability
metrics_df = metrics_df.round(3)

# Sort the DataFrame based on the performance metrics in the order of preference
# MASE > RMSSE > Coverage > MAPE > RMSPE
metric_order = ['MASE', 'RMSSE', 'Coverage', 'MAPE', 'RMSPE']
ascending_order = [True, True, False, True, True]

metrics_df = metrics_df.sort_values(by=metric_order, ascending=ascending_order).reset_index()

metrics_df.rename(columns={'index': 'Model'}, inplace=True)
metrics_df['Model'] = metrics_df['Model'].str.replace('_', ' ').str.title()

INFO:prophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
2024-06-23 15:01:57,445 - prophet - INFO - Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
2024-06-23 15:01:57,446 - prophet - INFO - Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /var/folders/4t/nrkfp1595tb366616gpf9d540000gn/T/tmplmjiuqcd/t2uzfdeu.json
DEBUG:cmdstanpy:input tempfile: /var/folders/4t/nrkfp1595tb366616gpf9d540000gn/T/tmplmjiuqcd/tbz0c900.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/Users/abhishekagarwal/opt/anaconda3/lib/python3.9/site-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=80030', 'data', 'file=/var/folders/4t/nrkfp1595tb366616gpf9d540000gn/T/tmplmjiuqcd/t2uzfdeu.json'

## Forecast

In [21]:
forecasts_all = {}

for model_type, model_results in search_results.items():
    if model_results['package_type'] == 'sktime':
        best_model = search_results[model_type]['best_model']
        best_model.fit(y=optimal_df['y'])
        forecasts_all[model_type] = generate_forecast_sktime(best_model, len(forecast_df))
    elif model_results['package_type'] == 'skforecast':
        best_model = search_results[model_type]['best_model']
        best_model.fit(y=optimal_df['y'], exog=optimal_df[run_params["exog_cols_all"]])
        forecasts_all[model_type] = generate_forecast_skforecast(best_model, run_params, optimal_df['y'],
                                                      forecast_df.drop('y', axis=1), run_params["forecast_start_date"])
    else:
        raise Exception('Unknown package type!')

INFO:prophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
2024-06-23 15:02:00,826 - prophet - INFO - Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
2024-06-23 15:02:00,827 - prophet - INFO - Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /var/folders/4t/nrkfp1595tb366616gpf9d540000gn/T/tmplmjiuqcd/bwodkig3.json
DEBUG:cmdstanpy:input tempfile: /var/folders/4t/nrkfp1595tb366616gpf9d540000gn/T/tmplmjiuqcd/r6qspu6v.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/Users/abhishekagarwal/opt/anaconda3/lib/python3.9/site-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=38021', 'data', 'file=/var/folders/4t/nrkfp1595tb366616gpf9d540000gn/T/tmplmjiuqcd/bwodkig3.json'