<a href="https://colab.research.google.com/github/achett/Hierarchical-Model/blob/main/Bayesian_Hierarchical_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install hierarchicalforecast
!pip install statsforecast
!pip install datasetsforecast
!pip install nixtlats>=0.1.0
!pip install darts
!pip install mlforecast



In [2]:
########################
# PACKAGES
########################
# !pip install -U numba statsforecast datasetsforecast
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import openpyxl
from datetime import datetime
import pymc as pm

# compute base forecast no coherent
from statsforecast.core import StatsForecast
from statsforecast.models import AutoARIMA, Naive, AutoETS, AutoCES

#obtain hierarchical reconciliation methods and evaluation
from hierarchicalforecast.core import HierarchicalReconciliation
from hierarchicalforecast.evaluation import HierarchicalEvaluation
from hierarchicalforecast.methods import BottomUp, TopDown, MiddleOut, MinTrace, OptimalCombination, ERM
from hierarchicalforecast.utils import aggregate

# #obtain hierarchical dataset
# from datasetsforecast.hierarchical import HierarchicalData

  from tqdm.autonotebook import tqdm


In [3]:
##############
# PARAMS
##############
fct_periods = 12
fct_st_date = '2023-04-01'
fct_end_date = '2023-12-01'

# Create hierarchical structure and constraints
hierarchy_levels = [['top_level'],
                    ['top_level', 'level2'],
                    ['top_level', 'level2', 'level3'],
                    ['top_level', 'level2', 'level3', 'bottom_level']]

inputFile = '/content/drive/MyDrive/Colab Notebooks/Revenue Prediction/data/regional_hierarchy.csv'
r_hier = pd.read_csv(inputFile)

# Sample data
products2include = ['TAMSUL_TAB - Tamsulosin tab', 'TAMSULOSIN - Tamsulosin HCl', 'SOLIFENACN - Solifenacin Succinate', 'SOLIF_TAMS - Solifenacin / Tamsulosin']
# products2include = ['SOLIFENACN - Solifenacin Succinate', 'TAMSULOSIN - Tamsulosin HCl']


In [4]:
##############
# DATA LOAD - BUDGET
##############
inputFile = '/content/drive/MyDrive/Colab Notebooks/Revenue Prediction/data/budgetFY23.csv'
budget = pd.read_csv(inputFile)

# Filter
budget = budget[budget['product'].isin(products2include)]

# Merge hierarchy
budget = budget.merge(r_hier, how='left', left_on='country', right_on='cost_object')


# Transform date and y
budget['ds'] = pd.to_datetime(budget['ds'])
budget['y'] = budget['y'].astype(float)

# Filter
budget = budget[budget['category']=='EQUIV_UNIT - Equivalent Units']

# Address NA values
budget['y'] = budget['y'].fillna(0)
budget['region'] = budget['region'].fillna('')
budget['country'] = budget['country'].fillna('')
budget['product'] = budget['product'].fillna('')

# Create hierarchical dataframe
budget['top_level'] = 'global'
budget.rename(columns={'region': 'level2'}, inplace=True)
budget.rename(columns={'country': 'level3'}, inplace=True)
budget.rename(columns={'product': 'bottom_level'}, inplace=True)
budget = budget[['level2', 'level3', 'bottom_level', 'top_level', 'ds', 'y']]

# Drop duplicates
budget = budget.drop_duplicates()

budget_hier_df, budget_S_df, budget_tags = aggregate(df=budget, spec=hierarchy_levels)

# Rename column y
budget_hier_df=budget_hier_df.reset_index()
budget_hier_df.rename(columns={'y': 'budget'}, inplace=True)

In [5]:
##############
# DATA LOAD - PAM
##############
inputFile = '/content/drive/MyDrive/Colab Notebooks/Revenue Prediction/data/PAMFY23.csv'
pam = pd.read_csv(inputFile)

# Filter
pam = pam[pam['product'].isin(products2include)]

# Merge hierarchy
pam = pam.merge(r_hier, how='left', left_on='country', right_on='cost_object')

# List of date columns to transform
date_columns = ['4/1/2022', '5/1/2022', '6/1/2022', '7/1/2022', '8/1/2022',
       '9/1/2022', '10/1/2022', '11/1/2022', '12/1/2022', '1/1/2023',
       '2/1/2023', '3/1/2023', '4/1/2023', '5/1/2023', '6/1/2023', '7/1/2023',
       '8/1/2023', '9/1/2023', '10/1/2023', '11/1/2023', '12/1/2023',
       '1/1/2024', '2/1/2024', '3/1/2024', '4/1/2024', '5/1/2024', '6/1/2024',
       '7/1/2024', '8/1/2024', '9/1/2024', '10/1/2024', '11/1/2024',
       '12/1/2024', '1/1/2025', '2/1/2025', '3/1/2025']

# Melting the DataFrame
pam = pam.melt(id_vars=['region','country', 'product', 'Version', 'Category'], value_vars=date_columns, var_name='ds', value_name='y')

# Transform date
pam['ds'] = pd.to_datetime(pam['ds'])

# Filter and merge
pam = pam[pam['Category']=='Volume']
# pam['pam'] = pam['pam'].astype(float)*1000000

# Address NA values
pam['y'] = pam['y'].fillna(0)
pam['region'] = pam['region'].fillna('')
pam['product'] = pam['product'].fillna('')

# Create hierarchical dataframe
pam['top_level'] = 'global'
pam.rename(columns={'region': 'level2'}, inplace=True)
pam.rename(columns={'country': 'level3'}, inplace=True)
pam.rename(columns={'product': 'bottom_level'}, inplace=True)
pam = pam[['level2', 'level3', 'bottom_level', 'top_level', 'ds', 'y']]

# Drop duplicates
pam = pam.drop_duplicates()

pam_hier_df, pam_S_df, pam_tags = aggregate(df=pam, spec=hierarchy_levels)

# Rename column y
pam_hier_df=pam_hier_df.reset_index()
pam_hier_df.rename(columns={'y': 'pam'}, inplace=True)

pam_hier_df = pam_hier_df.drop_duplicates()

In [7]:
########################
# DATA LOAD - ACTUALS
########################
inputFile = '/content/drive/MyDrive/Colab Notebooks/Revenue Prediction/data/volume_output.csv'
data_raw = pd.read_csv(inputFile)

# Filter
data = data_raw[data_raw['product'].isin(products2include)]

# Merge hierarchy
data = data.merge(r_hier, how='left', on='cost_object')

# Update columns
data['ds'] = pd.to_datetime(data['ds'])
data.rename(columns={'value': 'y'}, inplace=True)

# Address NA values
data['y'] = data['y'].fillna(0)
data['region'] = data['region'].fillna('')
data['product'] = data['product'].fillna('')

# Create hierarchical dataframe
data['top_level'] = 'global'
data.rename(columns={'region': 'level2'}, inplace=True)
data.rename(columns={'cost_object': 'level3'}, inplace=True)
data.rename(columns={'product': 'bottom_level'}, inplace=True)
data = data[['level2', 'level3', 'bottom_level', 'top_level', 'ds', 'y']]

Y_hier_df, S_df, tags = aggregate(df=data, spec=hierarchy_levels)
Y_hier_df = Y_hier_df.reset_index()

#split train/test sets
Y_test_df  = Y_hier_df.groupby('unique_id').tail(fct_periods)
Y_train_df = Y_hier_df.drop(Y_test_df.index)

In [80]:
########################
# COVARIATE: REVENUE
########################
inputFile = '/content/drive/MyDrive/Colab Notebooks/Revenue Prediction/data/revenue_output.csv'
data_rawr = pd.read_csv(inputFile)

# Filter
datar = data_rawr[data_rawr['product'].isin(products2include)]

# Merge hierarchy
datar = datar.merge(r_hier, how='left', on='cost_object')

# Update columns
datar['ds'] = pd.to_datetime(datar['ds'])
datar.rename(columns={'value': 'y'}, inplace=True)

# Address NA values
datar['y'] = datar['y'].fillna(0)
datar['region'] = datar['region'].fillna('')
datar['product'] = datar['product'].fillna('')

# Create hierarchical dataframe
datar['top_level'] = 'global'
datar.rename(columns={'region': 'level2'}, inplace=True)
datar.rename(columns={'cost_object': 'level3'}, inplace=True)
datar.rename(columns={'product': 'bottom_level'}, inplace=True)
datar = datar[['level2', 'level3', 'bottom_level', 'top_level', 'ds', 'y']]

Y_hier_df_rev, S_df, tags = aggregate(df=datar, spec=hierarchy_levels)
Y_hier_df_rev = Y_hier_df_rev.reset_index()

#split train/test sets
Y_test_df_rev  = Y_hier_df_rev.groupby('unique_id').tail(fct_periods)
Y_train_df_rev = Y_hier_df_rev.drop(Y_test_df_rev.index)

# Compute base predictions
fcst_rev = StatsForecast(df=Y_train_df_rev,
                     models=[AutoETS(season_length=12)],
                     freq='MS', n_jobs=-1)

Y_hat_df_rev = fcst_rev.forecast(h=fct_periods)

Y_hat_df_rev = Y_hat_df_rev.reset_index()
Y_hat_df_rev.rename(columns={'AutoETS': 'cov_rev'}, inplace=True)

# Rename
Y_train_df_rev.rename(columns={'y': 'cov_rev'}, inplace=True)

# Add covariate
Y_train_df_rev_cov = Y_train_df.merge(Y_train_df_rev, how='left', on=['unique_id', 'ds'])

# Filter for common ids
common_ids = pd.Series(list(set(Y_train_df_rev_cov['unique_id']) & set(Y_hat_df_rev['unique_id'])))
Y_train_df_rev_cov = Y_train_df_rev_cov[Y_train_df_rev_cov['unique_id'].isin(common_ids)]
Y_hat_df_rev = Y_hat_df_rev[Y_hat_df_rev['unique_id'].isin(common_ids)]

# Forecast
timegpt_fcst_ex_vars_df = timegpt.forecast(df=Y_train_df_rev_cov, X_df=Y_hat_df_rev, h=fct_periods, freq='MS', time_col='ds', target_col='y')
timegpt_fcst_ex_vars_df.head()



Unnamed: 0,unique_id,ds,TimeGPT
0,global,2023-01-01,111815300.0
1,global,2023-02-01,114413900.0
2,global,2023-03-01,123221800.0
3,global,2023-04-01,108497600.0
4,global,2023-05-01,110404000.0


In [67]:
########################
# RUN HIERARCHICAL
########################
# Compute base predictions
fcst = StatsForecast(df=Y_train_df,
                     models=[AutoETS(season_length=12)],
                     freq='MS', n_jobs=-1)

Y_hat_df = fcst.forecast(h=fct_periods)

# Reconcile the base predictions
reconcilers = [
    TopDown(method='forecast_proportions')
    # OptimalCombination(method = 'ols', nonnegative=True)
    # BottomUp()
    # ERM(method='closed')
]

hrec = HierarchicalReconciliation(reconcilers=reconcilers)
Y_rec_df = hrec.reconcile(Y_hat_df=Y_hat_df, Y_df=Y_train_df,
                          S=S_df, tags=tags)



Exception: Check `S_df`, `Y_hat_df` series difference, S\Y_hat=26, Y_hat\S=1

In [9]:
########################
# RUN TIMEGPT
########################
from nixtlats import TimeGPT

timegpt = TimeGPT(
    # defaults to os.environ.get("TIMEGPT_TOKEN")
    token = 'CgM6BBwvTfRrFetlmNCxZzRUAmSRCXndbU1CQJhPryVhgxWp0WgIDIpV9xagvCbg4ZH3TqG012wKMGNs8L7voLeNiydi5U6nFZtnu1P3VJrqECHKzZxCyef8FwNroHEkHM9vR5ltTGAFYW7bOgiiBRzjrdzgk2FIJFsLpXAP9GgMGqa0dVd8it3zbWQ9d02T7JBf9ikAPZyy1bIU206eQZV1Zdd1Rsd9a0PMjUyWxlgg0mckf59d7lwDhzGVBUpd'
)
tgpt = timegpt.forecast(Y_train_df, h=fct_periods, freq='MS', time_col='ds', target_col='y', level=[80, 90])

# Create hierarchical forecast
tgpt_h = tgpt.set_index('unique_id')
tgpt_h = tgpt_h[['ds', 'TimeGPT']]

# Reconcile the base predictions
reconcilers = [
    TopDown(method='forecast_proportions')
    # OptimalCombination(method = 'ols', nonnegative=True)
    # BottomUp()
    # ERM(method='closed')
]

hrec = HierarchicalReconciliation(reconcilers=reconcilers)
Y_rec_df_tgpt = hrec.reconcile(Y_hat_df=tgpt_h, Y_df=Y_train_df,
                          S=S_df, tags=tags)

In [10]:
########################
# RUN PROPHET
########################
from darts.models import Prophet
from darts import TimeSeries
import pandas as pd

# Assuming Y_train_df is your DataFrame

# Function to create a list of TimeSeries objects from the DataFrame
def create_training_list(df):
    training_list = []
    for uid in df['unique_id'].unique():
        temp_df = df[df['unique_id'] == uid].copy()
        temp_df.set_index('ds', inplace=True)
        ts = TimeSeries.from_dataframe(temp_df, value_cols='y')
        training_list.append(ts)
    return training_list

# Modified function to use Prophet models
def ProphetModelBuilder(training_list):
    listOfProphetModels = []
    for ts in training_list:
        prophet_model = Prophet()
        prophet_model.fit(ts)
        listOfProphetModels.append(prophet_model)
    return listOfProphetModels

# Function to generate forecasts with Prophet models
def ProphetForecaster(model_list):
    listOfProphetPreds = []
    for model in model_list:
        pred_Prophet = model.predict(n=fct_periods)
        listOfProphetPreds.append(pred_Prophet)
    return listOfProphetPreds

# Creating a list of TimeSeries objects
training_list = create_training_list(Y_train_df)

# Building Prophet models
listOfProphetModels = ProphetModelBuilder(training_list)

# Generating forecasts with Prophet
listOfProphetPreds = ProphetForecaster(listOfProphetModels)

# Convert predictions to a DataFrame
prophet_df = pd.DataFrame(columns=['unique_id', 'ds', 'Prophet'])

# Get unique IDs
unique_ids = Y_train_df['unique_id'].unique()

# Iterate over the forecasts and unique IDs
for uid, forecast in zip(unique_ids, listOfProphetPreds):
    # Convert forecast to DataFrame
    forecast_df = forecast.pd_dataframe()

    # Reset index to make 'ds' a column
    forecast_df.reset_index(inplace=True)

    # Rename columns to match the desired output
    forecast_df.rename(columns={'index': 'ds', forecast_df.columns[1]: 'Prophet'}, inplace=True)

    # Add the unique_id column
    forecast_df['unique_id'] = uid

    # Append to the final DataFrame
    prophet_df = pd.concat([prophet_df, forecast_df], ignore_index=True)

# prophet_df now contains the data in the format ['unique_id', 'ds', 'Prophet']
prophet_df.head()


INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmpnvurfpjq/l81ju0o1.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmpnvurfpjq/8rzq4kc0.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.10/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=72824', 'data', 'file=/tmp/tmpnvurfpjq/l81ju0o1.json', 'init=/tmp/tmpnvurfpjq/8rzq4kc0.json', 'output', 'file=/tmp/tmpnvurfpjq/prophet_model5perz_pz/prophet_model-20240201200121.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
20:01:21 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
20:01:21 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonalit

Unnamed: 0,unique_id,ds,Prophet
0,global,2023-01-01,177643600.0
1,global,2023-02-01,179602300.0
2,global,2023-03-01,181704000.0
3,global,2023-04-01,190545900.0
4,global,2023-05-01,178978000.0


In [11]:
########################
# RUN LGBM
########################
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn import linear_model

# Feature Engineering
inputFile = '/content/drive/MyDrive/Colab Notebooks/Revenue Prediction/data/volume_output.csv'
features = pd.read_csv(inputFile)

# Merge hierarchy
features = features.merge(r_hier, how='left', on='cost_object')

features['unique_id'] = 'global' + '/' + features['region'] + '/' + features['cost_object'] + '/' + features['product']
features = features[['unique_id', 'region', 'cost_object', 'product']].drop_duplicates()

Y_train_df_ml = Y_train_df.merge(features, on='unique_id', how='left')

# Creating LabelEncoder instance
label_encoder = LabelEncoder()

# Encoding 'cost_object' and 'product'
Y_train_df_ml['region_encoded'] = label_encoder.fit_transform(Y_train_df_ml['region'])
Y_train_df_ml['cost_object_encoded'] = label_encoder.fit_transform(Y_train_df_ml['cost_object'])
Y_train_df_ml['product_encoded'] = label_encoder.fit_transform(Y_train_df_ml['product'])

# Subset
Y_train_df_ml2 = Y_train_df_ml[['unique_id', 'ds', 'y', 'region_encoded', 'cost_object_encoded','product_encoded']]

models = [
    # linear_model.BayesianRidge(),
    # ElasticNet(alpha=0.08, l1_ratio=0.5),
    lgb.LGBMRegressor(verbosity=-1),
    xgb.XGBRegressor(),
    RandomForestRegressor(random_state=0),
]

from mlforecast import MLForecast
from mlforecast.target_transforms import Differences
from numba import njit
from window_ops.expanding import expanding_mean
from window_ops.rolling import rolling_mean


@njit
def rolling_mean_quarter(x):
    return rolling_mean(x, window_size=6)


fcst = MLForecast(
    models=models,
    freq='MS',
    lags=[1, 2, 3, 6, 12],
    lag_transforms={
        1: [expanding_mean],
        7: [rolling_mean_quarter]
    },
    date_features=['month']
    # ,target_transforms=[Differences([1])],
)

fcst.fit(Y_train_df_ml2, static_features=['region_encoded', 'cost_object_encoded', 'product_encoded'])

# Predict
ml_models = fcst.predict(fct_periods)
ml_models.head()

Unnamed: 0,unique_id,ds,LGBMRegressor,XGBRegressor,RandomForestRegressor
0,global,2023-01-01,70017280.0,110784200.0,109314100.0
1,global,2023-02-01,70017280.0,110535368.0,117217400.0
2,global,2023-03-01,78068280.0,129574712.0,123643300.0
3,global,2023-04-01,70017280.0,128322408.0,109464500.0
4,global,2023-05-01,69434810.0,110498096.0,108880700.0


In [81]:
# Create data to plot
data2plot = Y_hier_df.reset_index(drop=True)
data2plot['ds'] = pd.to_datetime(data2plot['ds'])

# Create a new column 'Actuals'
fct_st_date = pd.to_datetime(fct_st_date)
data2plot['Actuals'] = np.where(data2plot['ds'] >= fct_st_date, data2plot['y'], np.nan)

# Replace 'y' values with NaN where the condition is met
data2plot['y'] = np.where(data2plot['ds'] >= fct_st_date, np.nan, data2plot['y'])
data2plot.rename(columns={'y': 'Actuals (Train)'}, inplace=True)

# Add budget, pam
data2plot = data2plot.merge(budget_hier_df[['unique_id', 'ds', 'budget']], on=['ds', 'unique_id'], how='left')
data2plot = data2plot.merge(pam_hier_df[pam_hier_df['ds']>=fct_st_date][['unique_id', 'ds', 'pam']], on=['ds', 'unique_id'], how='left')

# Add hierarchical
Y_rec_df['ds'] = pd.to_datetime(Y_rec_df['ds'])
data2plot = data2plot.merge(Y_rec_df[Y_rec_df['ds']>=fct_st_date], on=['ds', 'unique_id'], how='left')

# # Add tgpt
# tgpt['ds'] = pd.to_datetime(tgpt['ds'])
# data2plot = data2plot.merge(tgpt[tgpt['ds']>=fct_st_date][['unique_id', 'ds', 'TimeGPT']], on=['ds', 'unique_id'], how='left')

# Add hierarchical tgpt
Y_rec_df_tgpt['ds'] = pd.to_datetime(Y_rec_df_tgpt['ds'])
data2plot = data2plot.merge(Y_rec_df_tgpt[Y_rec_df_tgpt['ds']>=fct_st_date], on=['ds', 'unique_id'], how='left')

# Add tgpt+covariate
timegpt_fcst_ex_vars_df['ds'] = pd.to_datetime(timegpt_fcst_ex_vars_df['ds'])
data2plot = data2plot.merge(timegpt_fcst_ex_vars_df[timegpt_fcst_ex_vars_df['ds']>=fct_st_date], on=['ds', 'unique_id'], how='left')

# Add prophet
data2plot = data2plot.merge(prophet_df[prophet_df['ds']>=fct_st_date][['unique_id', 'ds', 'Prophet']], on=['ds', 'unique_id'], how='left')

# Add ml_models
data2plot = data2plot.merge(ml_models[ml_models['ds']>=fct_st_date], on=['ds', 'unique_id'], how='left')

# Rename columns
data2plot.columns = ['unique_id', 'ds', 'Actuals (Train)', 'Actuals', 'Budget','PAM', 'AETS', 'AETS_H', 'TGPT', 'TGPT_H', 'TGPT_C', 'Prophet', 'LGBM', 'XGB', 'RF']

# Filter to end date
data2plot = data2plot[data2plot['ds']<=fct_end_date]

In [85]:
# Filter for common ids
data2plot = data2plot[data2plot['unique_id'].isin(common_ids)]

In [86]:
# # Install ipywidgets if not already installed
# !pip install ipywidgets

import ipywidgets as widgets
from ipywidgets import interact
import matplotlib.pyplot as plt
import pandas as pd

# Example DataFrame
# data = pd.DataFrame({'x': range(1, 11), 'y1': range(10, 20), 'y2': range(20, 30)})

# Update the function to include filtering based on 'unique_id'
def plot_data(unique_id, x_column, y_columns):
    # Filter data based on selected unique_id
    filtered_data = data2plot[data2plot['unique_id'] == unique_id]

    # Plotting
    plt.figure(figsize=(8, 5))
    for y_column in y_columns:
        plt.plot(filtered_data[x_column], filtered_data[y_column], label=y_column)
    plt.xlabel(x_column)
    plt.ylabel('Values')
    plt.title(f'Multiple Y-Axis Plot for unique_id {unique_id}')
    plt.legend()
    plt.show()

    # Displaying the sum table
    display_data = filtered_data[[x_column] + list(y_columns)].copy()

    # Create a sum row
    sum_values = {x_column: 'Sum'}
    for col in list(y_columns):
        sum_values[col] = display_data[col].sum()
    sum_row = pd.DataFrame([sum_values])

    display_data = pd.concat([display_data, sum_row], ignore_index=True)

    # Calculating and displaying the percentage difference table
    perc_diff_rows = []
    actuals_sum = sum_values['Actuals']
    for y_col in y_columns:
        y_sum = sum_values[y_col]
        perc_diff = ((y_sum - actuals_sum) / actuals_sum) * 100 if actuals_sum != 0 else None
        perc_diff_rows.append({'Y Column': y_col, 'Percentage Difference': perc_diff})

    perc_diff_data = pd.DataFrame(perc_diff_rows)
    display(perc_diff_data)
    display(display_data)

# Create widgets
# unique_id_selector = widgets.Dropdown(options=data2plot['unique_id'].unique(), description='unique_id:')
# Create widgets
unique_id_selector = widgets.SelectionSlider(
    options=data2plot['unique_id'].unique(),
    description='unique_id:',
    orientation='horizontal',
    readout=True
)

x_column = widgets.Dropdown(options=data2plot.columns, value=data2plot.columns[1], description='X-axis:')
y_columns = widgets.SelectMultiple(options=data2plot.columns, value=[data2plot.columns[2], data2plot.columns[3], data2plot.columns[4], data2plot.columns[5], data2plot.columns[6], data2plot.columns[7], data2plot.columns[8]], description='Y-axis:')

# Display interactive plot
interact(plot_data, unique_id=unique_id_selector, x_column=x_column, y_columns=y_columns)
print('hi')

interactive(children=(SelectionSlider(description='unique_id:', options=('global', 'global/BENELUX', 'global/B…

hi


In [96]:
########################
# CHECK WINNER
########################
# Filter data2plot
data4metrics = data2plot[data2plot['ds'].isin(budget['ds'].unique())]
data4metrics = data4metrics[data4metrics['ds']<=fct_end_date]

numeric_cols = data4metrics.columns.drop(['unique_id', 'ds'])

# Sum up the values for each unique_id
summed_df = data4metrics.groupby('unique_id')[numeric_cols].sum()

# Remove columns with no actuals
summed_df = summed_df[summed_df['Actuals'] != 0]

# Calculate percentage differences from 'Actuals'
percentage_diff = summed_df.subtract(summed_df['Actuals'], axis=0).div(summed_df['Actuals'], axis=0).abs()

# Drop the 'Actuals' column as we don't need to compare it with itself
percentage_diff.drop(columns=['Actuals', 'Actuals (Train)', 'Prophet', 'XGB', 'TGPT_H', 'TGPT', 'TGPT_C', 'AETS', 'LGBM', 'RF', 'PAM'], inplace=True)
# percentage_diff.drop(columns=['Actuals', 'Actuals (Train)'], inplace=True)

# Find the column with the lowest percentage difference for each unique_id
min_diff_col = percentage_diff.idxmin(axis=1)

# Add this as a new column in your original DataFrame
data4metrics['lowest_diff_col'] = data4metrics['unique_id'].map(min_diff_col)

winner = data4metrics[data4metrics['unique_id'].isin(budget_hier_df['unique_id'])].groupby('lowest_diff_col')

winner['unique_id'].nunique()

lowest_diff_col
AETS_H     76
Budget    136
Name: unique_id, dtype: int64

In [15]:
winner.get_group('Budget')['unique_id'].unique()

array(['global', 'global/CIS', 'global/East Europe',
       'global/Greater China', 'global/Japan', 'global/MEA',
       'global/North America', 'global/SESA',
       'global/CIS/D_I_CIS_BEL - Belarus',
       'global/East Europe/D_E_BG - Bulgaria',
       'global/East Europe/D_E_HU - Hungary',
       'global/East Europe/D_E_PO - Poland',
       'global/East Europe/D_E_RO - Romania',
       'global/East Europe/D_E_SLOVAKIA - Slovakia',
       'global/East Europe/D_E_SLVNA - Slovenia',
       'global/East Europe/D_E_UA - Ukraine',
       'global/East Europe/D_GR10_CY - Cyprus',
       'global/East Europe/D_SI10_ME - Montenegro',
       'global/Greater China/D_CN_TOTAL - China Total',
       'global/Greater China/D_HK_TOTAL - Hong Kong Total',
       'global/Greater China/D_TW_TOTAL - Taiwan Total',
       'global/Japan/JP10 - Astellas Pharma Inc',
       'global/LATAM/D_I_AR - Domestic Argentina',
       'global/LATAM/D_I_BRAZIL - Brazil',
       'global/LATAM/D_I_CO - Domestic Colombia

In [16]:
with pm.Model() as hierarchical_model:
    # Hyperpriors
    mu_alpha = pm.Normal('mu_alpha', mu=0, sigma=10)
    sigma_alpha = pm.HalfNormal('sigma_alpha', sigma=10)
    mu_beta = pm.Normal('mu_beta', mu=0, sigma=10)
    sigma_beta = pm.HalfNormal('sigma_beta', sigma=10)

    # Priors
    alpha = pm.Normal('alpha', mu=mu_alpha, sigma=sigma_alpha, shape=n_groups)  # group-specific intercepts
    beta = pm.Normal('beta', mu=mu_beta, sigma=sigma_beta, shape=n_groups)  # group-specific slopes
    sigma = pm.HalfNormal('sigma', sigma=1)

    # Expected value
    mu = alpha[group_indicator] + beta[group_indicator] * x

    # Likelihood
    y_obs = pm.Normal('y_obs', mu=mu, sigma=sigma, observed=y)

    # Sampling
    trace = pm.sample(2000, tune=1000)

NameError: name 'n_groups' is not defined

In [None]:
# Checking the trace
pm.plot_trace(trace,var_names=['alpha','beta'])
plt.show()

In [None]:
# Posterior samples
alpha_samples = trace.posterior['alpha'].values
beta_samples = trace.posterior['beta'].values

# New x values for predictions
x_new = np.linspace(0, 10, 200)

plt.figure(figsize=(10, 5))

# Plot raw data and predictions for each group
for i in range(n_groups):
    # Plot raw data

    plt.plot(x[group_indicator == i], y[group_indicator == i], 'o', color=colors[i], label=f'Group {i+1} observed')
    x_new = x[group_indicator == i]
    # Generate and plot predictions
    alpha = trace.posterior.sel(alpha_dim_0=i,beta_dim_0=i)['alpha'].values
    beta = trace.posterior.sel(alpha_dim_0=i,beta_dim_0=i)['beta'].values
    y_hat = alpha[..., None] + beta[..., None] * x_new[None,:]
    y_hat_mean = y_hat.mean(axis=(0, 1))
    y_hat_std = y_hat.std(axis=(0, 1))
    plt.plot(x_new, y_hat_mean, color=colors[i], label=f'Group {i+1} predicted')
    plt.fill_between(x_new, y_hat_mean - 2*y_hat_std, y_hat_mean + 2*y_hat_std, color=colors[i], alpha=0.3)

plt.title('Raw Data with Posterior Predictions by Group')
plt.xlabel('Time')
plt.ylabel('Value')
plt.legend()
plt.show()