In [1]:
import os
import io
import sys
import json
import time
import random
import logging

import numpy as np
import pandas as pd
import datetime

import plotly.io as pio
import streamlit as st
import seaborn as sns
import zipfile

import plotly.graph_objects as go
from sklearn.metrics import mean_squared_error

from utils.manager.login import *
from utils.inputs.validation import *
from utils.inputs.preprocess import *
from utils.inputs.ads import *
from utils.modeling.general import *
from utils.modeling.skforecast_utils import *
from utils.modeling.sktime_utils import *

# Set up the logging configuration for cmdstanpy
logger = logging.getLogger()

# Add NullHandler with CRITICAL log level
null_handler = logging.NullHandler()
null_handler.setLevel(logging.CRITICAL)
logger.addHandler(null_handler)

# Add StreamHandler with INFO log level
stream_handler = logging.StreamHandler()
stream_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
stream_handler.setLevel(logging.INFO)
logger.addHandler(stream_handler)

logger.propagate = False

## Inputs

In [2]:
# Add dropdown for Country
country_name = "CA"

# Add dropdown for frequency
forecast_freq = "D"

# Add dropdown for data selection
data_selection = False

# Add dropdown for data selection
external_features = False

# Add file uploader to the sidebar
uploaded_file = 'Agency Services.csv'

In [3]:
if forecast_freq == "D":
    forecast_period = 92
elif forecast_freq == "B":
    forecast_period = 66
elif forecast_freq == "W":
    forecast_period = 26
elif forecast_freq == "M":
    forecast_period = 12

In [4]:
# Organize these selections into a dictionary
run_params = {
    "country_name": country_name,
    "forecast_freq": forecast_freq,
    "forecast_period": forecast_period,
    "data_selection": data_selection,
    "external_features": external_features,
    "weekend_weight": 5,
    "holiday_weight": 10,
    "metric_key": "mspe"
}

## Validation

In [5]:
try:
    # Validate the input file
    df = validate_input_file(uploaded_file, external_features)
    logging.info(f"Train Data Size: {df.shape}")
except Exception as e:
    # Log this exception or handle it further up the call stack
    raise Exception(f"An error occurred while validating the file: {str(e)}")

In [6]:
df

Unnamed: 0,ds,y
0,2018-01-01,0.0
1,2018-01-02,367.0
2,2018-01-03,391.0
3,2018-01-04,431.0
4,2018-01-05,395.0
...,...,...
360,2018-12-27,280.0
361,2018-12-28,278.0
362,2018-12-29,0.0
363,2018-12-30,0.0


## Processing

In [7]:
try:
    # Process the input file
    processed_df, forecast_df = process_input_file(df)
    logging.info(f"Train Data Size: {processed_df.shape}")
    logging.info(f"Forecast Data Size: {forecast_df.shape}")
except Exception as e:
    # Log this exception or handle it further up the call stack
    raise Exception(f"An error occurred while processing the file: {str(e)}")

-1


In [8]:
# Find the min data for optimal train data
run_params["historical_start_date"] = processed_df['ds'].min()
run_params["historical_end_date"] = processed_df['ds'].max()
run_params["forecast_start_date"] = processed_df['ds'].max() + pd.Timedelta(days=1)
run_params["forecast_end_date"] = processed_df['ds'].max() + pd.Timedelta(days=run_params["forecast_period"])

## Automated Data Selection

In [9]:
try:
    if data_selection:
        
        # Find optimal window 
        optimal_window_size = find_optimal_window(processed_df)
            
        logging.info(f"Optimal Window Size: {optimal_window_size}")

        # Add 180 days for feature engineering to optimal window
        optimal_window_size += 180

    else:
        optimal_window_size = len(processed_df)
        
except Exception as e:
    # Log this exception or handle it further up the call stack
    raise Exception(f"An error occurred while finding the optimal window: {str(e)}")
    

run_params["optimal_window_size"] = optimal_window_size
    
# Truncate the train set based on optimal window
optimal_df = processed_df[-optimal_window_size:].copy(deep=True)
    
logging.info(f"Optimal Train Data Size: {optimal_df.shape}")

# Find the min data for optimal train data
run_params["optimal_window_start_date"] = optimal_df['ds'].min()

In [10]:
optimal_window_size

365

## Final Data Checks

In [11]:
# Set forecast start and end dates
min_forecast_date = optimal_df['ds'].min() + pd.Timedelta(days=1)
max_forecast_date = min_forecast_date + pd.Timedelta(days=forecast_period)
logging.info(f"Forecast Range: {min_forecast_date} to {max_forecast_date}")

try:
    # Validate column counts based on whether external features are used
    if external_features:
        assert optimal_df.shape[1] > 2 and forecast_df.shape[1] > 2
    else:
        assert optimal_df.shape[1] == 2 and forecast_df.shape[1] == 2
    # Ensure non-empty data structure
    assert optimal_df.shape[1] > 0
    # Ensure same number of columns
    assert optimal_df.shape[1] == forecast_df.shape[1]
except Exception as e:
    raise ValueError("Invalid input data format.")

try:
    # Check coverage of forecast period by data
    if external_features:
        assert forecast_df['ds'].max() > max_forecast_date
except Exception as e:
    raise Exception("Incomplete external variable coverage for forecast period.")

In [12]:
# Get the names of the exogenous variables from the train data
run_params["exog_cols"] = list((optimal_df.columns).difference(['y', 'ds']))

## Feature Engineering

In [13]:
try:
    # Generate date features
    optimal_df = resample_dataframe(optimal_df, forecast_freq)
    forecast_df = resample_dataframe(forecast_df, forecast_freq)
except Exception as e:
    raise Exception(f"Failed to set the data frequency to {forecast_freq}: {e}")

In [14]:
try:
    # Generate date features
    optimal_df = generate_date_features(optimal_df, forecast_freq, country_name)
    forecast_df = generate_date_features(forecast_df, forecast_freq, country_name)
except Exception as e:
    raise ValueError(f"Failed to generate features using 'ds': {e}")

In [15]:
# Get the names of the exogenous variables from the train data
run_params["exog_cols_all"]  = list((optimal_df.columns).difference(['y', 'ds']))

In [16]:
try:
    initial_window_size, lag_window_range, rolling_window_range, test_size, test_steps = determine_params(forecast_freq)
    logger.info(f"Initial Window Size: {initial_window_size}, Lag Window Range: {lag_window_range}")
    logger.info(f"Test Size: {test_size}, Test Steps: {test_steps}")
except Exception as e:
    raise Exception(e)

In [17]:
run_params.update({
    "initial_window_size": initial_window_size,
    "lag_window_range": lag_window_range,
    "rolling_window_range": rolling_window_range,
    "test_size": test_size,
    "test_steps": test_steps
})

## Train and Test Split

In [18]:
try:
    test_df = optimal_df[-test_size:].copy(deep=True)
    test_df = test_df.set_index('ds').asfreq(forecast_freq)
    
    train_df = optimal_df[:-test_size].copy(deep=True)
    train_df = train_df.set_index('ds').asfreq(forecast_freq)
    
    assert len(train_df) + len(test_df) == len(optimal_df)
    
    run_params["train_start_date"] = train_df.index.min()
    run_params["train_end_date"] = train_df.index.max()
    
    run_params["test_start_date"] = test_df.index.min()
    run_params["test_end_date"] = test_df.index.max()
    
except Exception as e:
    raise ValueError(f"Failed to split into train and test: {e}")

## Grid Search

In [19]:
current_dir = 'utils/modeling'

search_results = {}

for model_type in ['random_forest', 'xgboost']:
    # Load parameters for grid search
    model, param_grid = load_model_params_and_create_instance(model_type, current_dir)
    
    # Find best model
    best_configuration, all_results, best_model = find_best_model_skforecast(
        lag_window_range, model, train_df, param_grid, run_params
    )
    
    # Save best model and config
    search_results[model_type] = {
        'best_model': best_model,
        'best_configuration': best_configuration,
        'all_results': all_results,
        'package_type': 'skforecast'
    }
    
    
for model_type in ['prophet', 'naive']:
    # Load parameters for grid search
    model, param_grid = load_model_params_and_create_instance(model_type, current_dir)
    
    # Find best model
    best_configuration, all_results, best_model = find_best_model_sktime(
        train_df['y'], run_params, model, param_grid
    )
    
    # Save best model and config
    search_results[model_type] = {
        'best_model': best_model,
        'best_configuration': best_configuration,
        'all_results': all_results,
        'package_type': 'sktime'
    }

Number of models compared: 1.


lags grid:   0%|          | 0/1 [00:00<?, ?it/s]

params grid:   0%|          | 0/1 [00:00<?, ?it/s]

Number of models compared: 1.


lags grid:   0%|          | 0/1 [00:00<?, ?it/s]

params grid:   0%|          | 0/1 [00:00<?, ?it/s]

Number of models compared: 1.


lags grid:   0%|          | 0/1 [00:00<?, ?it/s]

params grid:   0%|          | 0/1 [00:00<?, ?it/s]

Number of models compared: 1.


lags grid:   0%|          | 0/1 [00:00<?, ?it/s]

params grid:   0%|          | 0/1 [00:00<?, ?it/s]

Number of models compared: 1.


lags grid:   0%|          | 0/1 [00:00<?, ?it/s]

params grid:   0%|          | 0/1 [00:00<?, ?it/s]

Number of models compared: 1.


lags grid:   0%|          | 0/1 [00:00<?, ?it/s]

params grid:   0%|          | 0/1 [00:00<?, ?it/s]

Number of models compared: 1.


lags grid:   0%|          | 0/1 [00:00<?, ?it/s]

params grid:   0%|          | 0/1 [00:00<?, ?it/s]

Number of models compared: 1.


lags grid:   0%|          | 0/1 [00:00<?, ?it/s]

params grid:   0%|          | 0/1 [00:00<?, ?it/s]

Number of models compared: 1.


lags grid:   0%|          | 0/1 [00:00<?, ?it/s]

params grid:   0%|          | 0/1 [00:00<?, ?it/s]

Number of models compared: 1.


lags grid:   0%|          | 0/1 [00:00<?, ?it/s]

params grid:   0%|          | 0/1 [00:00<?, ?it/s]

Fitting 10 folds for each of 1 candidates, totalling 10 fits


INFO:prophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
2024-06-23 09:57:41,991 - prophet - INFO - Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
2024-06-23 09:57:41,992 - prophet - INFO - Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /var/folders/4t/nrkfp1595tb366616gpf9d540000gn/T/tmpgu1sbioo/2p1ja1tj.json
DEBUG:cmdstanpy:input tempfile: /var/folders/4t/nrkfp1595tb366616gpf9d540000gn/T/tmpgu1sbioo/ivd2j271.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/Users/abhishekagarwal/opt/anaconda3/lib/python3.9/site-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=15478', 'data', 'file=/var/folders/4t/nrkfp1595tb366616gpf9d540000gn/T/tmpgu1sbioo/2p1ja1tj.json'

Fitting 10 folds for each of 1 candidates, totalling 10 fits


## Test Set Evaluation

In [20]:
best_model = search_results['prophet']['best_model']
best_model.fit(y=train_df['y'])
generate_forecast_sktime(best_model, 30)

INFO:prophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
2024-06-23 09:57:43,526 - prophet - INFO - Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
2024-06-23 09:57:43,527 - prophet - INFO - Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /var/folders/4t/nrkfp1595tb366616gpf9d540000gn/T/tmpgu1sbioo/c479ebtv.json
DEBUG:cmdstanpy:input tempfile: /var/folders/4t/nrkfp1595tb366616gpf9d540000gn/T/tmpgu1sbioo/p5srkdye.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/Users/abhishekagarwal/opt/anaconda3/lib/python3.9/site-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=26025', 'data', 'file=/var/folders/4t/nrkfp1595tb366616gpf9d540000gn/T/tmpgu1sbioo/c479ebtv.json'

Unnamed: 0,ds,y_pred,min_pred,max_pred
0,2018-12-02,16.476869,-72.10954,112.623233
1,2018-12-03,321.821408,222.228378,417.262439
2,2018-12-04,349.966183,253.328865,443.595518
3,2018-12-05,324.809638,231.64807,421.108353
4,2018-12-06,311.461711,215.192698,406.102719
5,2018-12-07,300.084428,205.62787,395.058986
6,2018-12-08,16.275035,-81.919747,110.203283
7,2018-12-09,16.433143,-85.572413,107.613539
8,2018-12-10,320.967026,222.537752,416.71119
9,2018-12-11,349.036729,248.321754,441.294058


In [21]:
best_model = search_results['random_forest']['best_model']
best_model.fit(y=train_df['y'], exog=train_df[run_params["exog_cols_all"]])

In [22]:
 generate_forecast_skforecast(best_model, run_params, train_df['y'], forecast_df,
                              run_params["test_start_date"], 30)

Unnamed: 0,ds,y_pred,min_pred,max_pred
0,2018-12-02,0.128795,-40.662935,51.775708
1,2018-12-03,301.701402,278.317365,371.868564
2,2018-12-04,331.730097,285.868052,395.058669
3,2018-12-05,319.382581,179.37791,370.722927
4,2018-12-06,323.474521,267.397214,382.948702
5,2018-12-07,308.138399,293.578125,384.038684
6,2018-12-08,0.281346,-45.745426,54.834958
7,2018-12-09,0.090617,-20.278804,48.863218
8,2018-12-10,301.701402,256.708968,366.813984
9,2018-12-11,331.730097,264.015908,383.189429


# add check if forecast df euqla, to forcats period