<a href="https://colab.research.google.com/github/achett/Hierarchical-Model/blob/main/Bayesian_Hierarchical_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install hierarchicalforecast
!pip install statsforecast
!pip install datasetsforecast
!pip install nixtlats>=0.1.0

Collecting hierarchicalforecast
  Downloading hierarchicalforecast-0.4.1-py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.1/45.1 kB[0m [31m296.0 kB/s[0m eta [36m0:00:00[0m
Collecting quadprog (from hierarchicalforecast)
  Downloading quadprog-0.1.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (452 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m452.8/452.8 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: quadprog, hierarchicalforecast
Successfully installed hierarchicalforecast-0.4.1 quadprog-0.1.11


In [None]:
########################
# PACKAGES
########################
# !pip install -U numba statsforecast datasetsforecast
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import openpyxl
from datetime import datetime
import pymc as pm

# compute base forecast no coherent
from statsforecast.core import StatsForecast
from statsforecast.models import AutoARIMA, Naive

#obtain hierarchical reconciliation methods and evaluation
from hierarchicalforecast.core import HierarchicalReconciliation
from hierarchicalforecast.evaluation import HierarchicalEvaluation
from hierarchicalforecast.methods import BottomUp, TopDown, MiddleOut

# #obtain hierarchical dataset
# from datasetsforecast.hierarchical import HierarchicalData

In [None]:
##############
# DATA LOAD - BUDGET
##############
inputFile = '/content/drive/MyDrive/Colab Notebooks/Revenue Prediction/data/budgetFY23.csv'
budget = pd.read_csv(inputFile)

# List of date columns to transform
date_columns = ['4/1/2023', '5/1/2023', '6/1/2023', '7/1/2023', '8/1/2023', '9/1/2023', '10/1/2023',
                '11/1/2023', '12/1/2023', '1/1/2024', '2/1/2024', '3/1/2024']

# Melting the DataFrame
budget = budget.melt(id_vars=['country', 'product', 'Version', 'Category'], value_vars=date_columns, var_name='ds', value_name='budget')

# Transform date
budget['ds'] = pd.to_datetime(budget['ds'])

# Create unique_id
budget['unique_id'] = 'global/' + budget['country'] + '/' + budget['product']

# Filter and merge
budget = budget[budget['Category']=='Volume']
# budget['budget'] = budget['budget'].astype(float)*1000000

In [None]:
##############
# DATA LOAD - PAM
##############
inputFile = '/content/drive/MyDrive/Colab Notebooks/Revenue Prediction/data/PAMFY23.csv'
pam = pd.read_csv(inputFile)

# List of date columns to transform
date_columns = ['4/1/2022', '5/1/2022', '6/1/2022', '7/1/2022', '8/1/2022',
       '9/1/2022', '10/1/2022', '11/1/2022', '12/1/2022', '1/1/2023',
       '2/1/2023', '3/1/2023', '4/1/2023', '5/1/2023', '6/1/2023', '7/1/2023',
       '8/1/2023', '9/1/2023', '10/1/2023', '11/1/2023', '12/1/2023',
       '1/1/2024', '2/1/2024', '3/1/2024', '4/1/2024', '5/1/2024', '6/1/2024',
       '7/1/2024', '8/1/2024', '9/1/2024', '10/1/2024', '11/1/2024',
       '12/1/2024', '1/1/2025', '2/1/2025', '3/1/2025']

# Melting the DataFrame
pam = pam.melt(id_vars=['country', 'product', 'Version', 'Category'], value_vars=date_columns, var_name='ds', value_name='pam')

# Transform date
pam['ds'] = pd.to_datetime(pam['ds'])

# Create unique_id
pam['unique_id'] = 'global/' + pam['country'] + '/' + pam['product']

# Filter and merge
pam = pam[pam['Category']=='Volume']
# pam['pam'] = pam['pam'].astype(float)*1000000

In [None]:
########################
# PREP DATA
########################
from hierarchicalforecast.utils import aggregate

inputFile = '/content/drive/MyDrive/Colab Notebooks/Revenue Prediction/data/volume_output.csv'
data = pd.read_csv(inputFile)

# Sample data
products2include = ['TAMSUL_TAB - Tamsulosin tab', 'TAMSULOSIN - Tamsulosin HCl', 'SOLIFENACN - Solifenacin Succinate']
data = data[data['product'].isin(products2include)]

# Create hierarchical dataframe
data['top_level'] = 'global'
data.rename(columns={'cost_object': 'middle_level'}, inplace=True)
data.rename(columns={'product': 'bottom_level'}, inplace=True)
data = data[['middle_level', 'bottom_level', 'top_level', 'ds', 'value']]

# Update columns
data['ds'] = pd.to_datetime(data['ds'])
data.rename(columns={'value': 'y'}, inplace=True)

# Create hierarchical structure and constraints
hierarchy_levels = [['top_level'],
                    ['top_level', 'middle_level'],
                    ['top_level', 'middle_level', 'bottom_level']]

Y_hier_df, S_df, tags = aggregate(df=data, spec=hierarchy_levels)
Y_hier_df = Y_hier_df.reset_index()

In [21]:
########################
# RUN HIERARCHICAL
########################
#split train/test sets
Y_test_df  = Y_hier_df.groupby('unique_id').tail(9)
Y_train_df = Y_hier_df.drop(Y_test_df.index)

# Compute base auto-ARIMA predictions
fcst = StatsForecast(df=Y_train_df,
                     models=[AutoARIMA(season_length=4)],
                     freq='M', n_jobs=-1)

Y_hat_df = fcst.forecast(h=9)

# Reconcile the base predictions
reconcilers = [
    TopDown(method='forecast_proportions')
]

hrec = HierarchicalReconciliation(reconcilers=reconcilers)
Y_rec_df = hrec.reconcile(Y_hat_df=Y_hat_df, Y_df=Y_train_df,
                          S=S_df, tags=tags)



KeyboardInterrupt: 

In [31]:
########################
# RUN TIMEGPT
########################
from nixtlats import TimeGPT

from nixtlats import TimeGPT
timegpt = TimeGPT(
    # defaults to os.environ.get("TIMEGPT_TOKEN")

)
tgpt = timegpt.forecast(Y_train_df, h=9, freq='MS', time_col='ds', target_col='y', level=[80, 90])

In [43]:
# Create data to plot
data2plot = Y_rec_df.reset_index()
data2plot['ds'] = pd.to_datetime(data2plot['ds'])
data2plot['ds'] = data2plot['ds'] + pd.Timedelta(days=1)
data2plot = data2plot.merge(Y_test_df, on=['ds', 'unique_id'], how='left')
data2plot.rename(columns={'y': 'Actuals'}, inplace=True)

# Add budget, pam
data2plot = data2plot.merge(budget[['unique_id', 'ds', 'budget']], on=['ds', 'unique_id'], how='left')
data2plot = data2plot.merge(pam[['unique_id', 'ds', 'pam']], on=['ds', 'unique_id'], how='left')

# Add tgpt
tgpt['ds'] = pd.to_datetime(tgpt['ds'])
data2plot = data2plot.merge(tgpt[['unique_id', 'ds', 'TimeGPT']], on=['ds', 'unique_id'], how='left')

# Add older actuals
Y_train_df.rename(columns={'y': 'Actuals (Train)'}, inplace=True)
data2plot = pd.concat([data2plot, Y_train_df])

# Rename columns
data2plot.columns = ['unique_id', 'ds', 'AutoARIMA','AA/TopDown', 'Actuals', 'Budget','PAM', 'TimeGPT', 'Actuals (Train)']

In [45]:
# # Install ipywidgets if not already installed
# !pip install ipywidgets

import ipywidgets as widgets
from ipywidgets import interact
import matplotlib.pyplot as plt
import pandas as pd

# Example DataFrame
# data = pd.DataFrame({'x': range(1, 11), 'y1': range(10, 20), 'y2': range(20, 30)})

# Update the function to include filtering based on 'unique_id'
def plot_data(unique_id, x_column, y_columns):
    # Filter data based on selected unique_id
    filtered_data = data2plot[data2plot['unique_id'] == unique_id]

    # Plotting
    plt.figure(figsize=(10, 6))
    for y_column in y_columns:
        plt.plot(filtered_data[x_column], filtered_data[y_column], label=y_column)
    plt.xlabel(x_column)
    plt.ylabel('Values')
    plt.title(f'Multiple Y-Axis Plot for unique_id {unique_id}')
    plt.legend()
    plt.show()

    # Displaying the sum table
    display_data = filtered_data[[x_column] + list(y_columns)].copy()

    # Create a sum row
    sum_values = {x_column: 'Sum'}
    for col in list(y_columns):
        sum_values[col] = display_data[col].sum()
    sum_row = pd.DataFrame([sum_values])

    display_data = pd.concat([display_data, sum_row], ignore_index=True)

    # Calculating and displaying the percentage difference table
    perc_diff_rows = []
    actuals_sum = sum_values['Actuals']
    for y_col in y_columns:
        y_sum = sum_values[y_col]
        perc_diff = ((y_sum - actuals_sum) / actuals_sum) * 100 if actuals_sum != 0 else None
        perc_diff_rows.append({'Y Column': y_col, 'Percentage Difference': perc_diff})

    perc_diff_data = pd.DataFrame(perc_diff_rows)
    display(perc_diff_data)
    display(display_data)

# Create widgets
# unique_id_selector = widgets.Dropdown(options=data2plot['unique_id'].unique(), description='unique_id:')
# Create widgets
unique_id_selector = widgets.SelectionSlider(
    options=data2plot['unique_id'].unique(),
    description='unique_id:',
    orientation='horizontal',
    readout=True
)

x_column = widgets.Dropdown(options=data2plot.columns, value=data2plot.columns[1], description='X-axis:')
y_columns = widgets.SelectMultiple(options=data2plot.columns, value=[data2plot.columns[3], data2plot.columns[4], data2plot.columns[5], data2plot.columns[6], data2plot.columns[7], data2plot.columns[8]], description='Y-axis:')

# Display interactive plot
interact(plot_data, unique_id=unique_id_selector, x_column=x_column, y_columns=y_columns)
print('hi')

interactive(children=(SelectionSlider(description='unique_id:', options=('global', 'global/D_CN_TOTAL - China …

hi


In [46]:
########################
# CHECK WINNER
########################
# Filter data2plot
data4metrics = data2plot[data2plot['ds'].isin(budget['ds'].unique())]

numeric_cols = data4metrics.columns.drop(['unique_id', 'ds'])

# Sum up the values for each unique_id
summed_df = data4metrics.groupby('unique_id')[numeric_cols].sum()

# Calculate percentage differences from 'Actuals'
percentage_diff = summed_df.subtract(summed_df['Actuals'], axis=0).div(summed_df['Actuals'], axis=0).abs()

# Drop the 'Actuals' column as we don't need to compare it with itself
percentage_diff.drop(columns=['Actuals', 'Actuals (Train)', 'AutoARIMA'], inplace=True)

# Find the column with the lowest percentage difference for each unique_id
min_diff_col = percentage_diff.idxmin(axis=1)

# Add this as a new column in your original DataFrame
data4metrics['lowest_diff_col'] = data4metrics['unique_id'].map(min_diff_col)

data4metrics[data4metrics['unique_id'].isin(budget['unique_id'])].groupby('lowest_diff_col')['unique_id'].nunique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data4metrics['lowest_diff_col'] = data4metrics['unique_id'].map(min_diff_col)


lowest_diff_col
AA/TopDown    25
Budget        47
PAM           23
TimeGPT       21
Name: unique_id, dtype: int64

In [18]:
summed_df.columns

Index(['AutoARIMA', 'AutoARIMA/TopDown_method-forecast_proportions', 'Actuals',
       'budget', 'pam', 'Actuals (Train)'],
      dtype='object')

In [10]:
with pm.Model() as hierarchical_model:
    # Hyperpriors
    mu_alpha = pm.Normal('mu_alpha', mu=0, sigma=10)
    sigma_alpha = pm.HalfNormal('sigma_alpha', sigma=10)
    mu_beta = pm.Normal('mu_beta', mu=0, sigma=10)
    sigma_beta = pm.HalfNormal('sigma_beta', sigma=10)

    # Priors
    alpha = pm.Normal('alpha', mu=mu_alpha, sigma=sigma_alpha, shape=n_groups)  # group-specific intercepts
    beta = pm.Normal('beta', mu=mu_beta, sigma=sigma_beta, shape=n_groups)  # group-specific slopes
    sigma = pm.HalfNormal('sigma', sigma=1)

    # Expected value
    mu = alpha[group_indicator] + beta[group_indicator] * x

    # Likelihood
    y_obs = pm.Normal('y_obs', mu=mu, sigma=sigma, observed=y)

    # Sampling
    trace = pm.sample(2000, tune=1000)

NameError: name 'n_groups' is not defined

In [None]:
# Checking the trace
pm.plot_trace(trace,var_names=['alpha','beta'])
plt.show()

In [None]:
# Posterior samples
alpha_samples = trace.posterior['alpha'].values
beta_samples = trace.posterior['beta'].values

# New x values for predictions
x_new = np.linspace(0, 10, 200)

plt.figure(figsize=(10, 5))

# Plot raw data and predictions for each group
for i in range(n_groups):
    # Plot raw data

    plt.plot(x[group_indicator == i], y[group_indicator == i], 'o', color=colors[i], label=f'Group {i+1} observed')
    x_new = x[group_indicator == i]
    # Generate and plot predictions
    alpha = trace.posterior.sel(alpha_dim_0=i,beta_dim_0=i)['alpha'].values
    beta = trace.posterior.sel(alpha_dim_0=i,beta_dim_0=i)['beta'].values
    y_hat = alpha[..., None] + beta[..., None] * x_new[None,:]
    y_hat_mean = y_hat.mean(axis=(0, 1))
    y_hat_std = y_hat.std(axis=(0, 1))
    plt.plot(x_new, y_hat_mean, color=colors[i], label=f'Group {i+1} predicted')
    plt.fill_between(x_new, y_hat_mean - 2*y_hat_std, y_hat_mean + 2*y_hat_std, color=colors[i], alpha=0.3)

plt.title('Raw Data with Posterior Predictions by Group')
plt.xlabel('Time')
plt.ylabel('Value')
plt.legend()
plt.show()