<a href="https://colab.research.google.com/github/achett/Hierarchical-Model/blob/main/Bayesian_Hierarchical_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install hierarchicalforecast
!pip install statsforecast
!pip install datasetsforecast
!pip install nixtlats>=0.1.0
!pip install darts
!pip install mlforecast

Collecting hierarchicalforecast
  Downloading hierarchicalforecast-0.4.1-py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.1/45.1 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting numpy<1.24 (from hierarchicalforecast)
  Downloading numpy-1.23.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.1/17.1 MB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
Collecting quadprog (from hierarchicalforecast)
  Downloading quadprog-0.1.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (508 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m508.2/508.2 kB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: numpy, quadprog, hierarchicalforecast
  Attempting uninstall: numpy
    Found existing installation: numpy 1.25.2
    Uninstalling numpy-1.25.2:
      Successfully uninstalled numpy-1.25.2
[31mERROR: pip's de

Collecting statsforecast
  Downloading statsforecast-1.7.3-py3-none-any.whl (120 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/120.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m120.7/120.7 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Collecting fugue>=0.8.1 (from statsforecast)
  Downloading fugue-0.8.7-py3-none-any.whl (279 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m279.8/279.8 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting utilsforecast>=0.0.24 (from statsforecast)
  Downloading utilsforecast-0.0.27-py3-none-any.whl (38 kB)
Collecting triad>=0.9.3 (from fugue>=0.8.1->statsforecast)
  Downloading triad-0.9.5-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting adagio>=0.2.4 (from fugue>=0.8.1->statsforecast)
  Downloading adagio-0.2.4-py3-none-any.wh

In [3]:
########################
# PACKAGES
########################
# !pip install -U numba statsforecast datasetsforecast
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import openpyxl
from datetime import datetime
import pymc as pm

# compute base forecast no coherent
from statsforecast.core import StatsForecast
from statsforecast.models import AutoARIMA, Naive, AutoETS, AutoCES, AutoTheta

#obtain hierarchical reconciliation methods and evaluation
from hierarchicalforecast.core import HierarchicalReconciliation
from hierarchicalforecast.evaluation import HierarchicalEvaluation
from hierarchicalforecast.methods import BottomUp, TopDown, MiddleOut, MinTrace, OptimalCombination, ERM
from hierarchicalforecast.utils import aggregate

# #obtain hierarchical dataset
# from datasetsforecast.hierarchical import HierarchicalData
from nixtlats import TimeGPT
import os
os.environ['NIXTLA_ID_AS_COL'] = '1'


  from tqdm.autonotebook import tqdm


In [4]:
##############
# PARAMS
##############
fct_periods = 12
fct_st_date = '2023-04-01'
fct_end_date = '2023-12-01'

# Create hierarchical structure and constraints
hierarchy_levels = [['top_level'],
                    ['top_level', 'level2'],
                    ['top_level', 'level2', 'level3'],
                    ['top_level', 'level2', 'level3', 'bottom_level']]

inputFile = '/content/drive/MyDrive/Colab Notebooks/Revenue Prediction/data/regional_hierarchy.csv'
r_hier = pd.read_csv(inputFile)


In [5]:
def prepare_data(data, r_hier):
    # Merge hierarchy
    data = data.merge(r_hier, how='left', left_on='cost_object', right_on='cost_object')

    # Transform date and y
    data['ds'] = pd.to_datetime(data['ds'])
    data['y'] = data['y'].astype(float)

    # Address NA values
    data['volume'] = data['y'].fillna(0)
    data['region'] = data['region'].fillna('')
    data['cost_object'] = data['cost_object'].fillna('')
    data['product'] = data['product'].fillna('')

    # Create hierarchical dataframe
    data['top_level'] = 'global'  # Assuming 'top_level' does not contain '/', otherwise add a replace line for it too.
    data.rename(columns={'region': 'level2', 'cost_object': 'level3', 'product': 'bottom_level'}, inplace=True)
    data = data[['level2', 'level3', 'bottom_level', 'top_level', 'ds', 'y']]

    # Replace '/' with '_' in the four columns
    data['level2'] = data['level2'].str.replace('/', '_')
    data['level3'] = data['level3'].str.replace('/', '_')
    data['bottom_level'] = data['bottom_level'].str.replace('/', '_')

    data['unique_id'] = data['top_level'] + '/' + data['level2'] + '/' + data['level3'] + '/' + data['bottom_level']

    return data

In [6]:
##############
# DATA LOAD
##############
inputFile = '/content/drive/MyDrive/Colab Notebooks/Revenue Prediction/data/budgetFY23.csv'
budget = pd.read_csv(inputFile)
budget = budget[budget['category']=='EQUIV_UNIT - Equivalent Units']
budget.rename(columns={'country': 'cost_object'}, inplace=True)
budget = prepare_data(budget, r_hier)

inputFile = '/content/drive/MyDrive/Colab Notebooks/Revenue Prediction/data/volume_output.csv'
volume_act = pd.read_csv(inputFile)
volume_act.rename(columns={'value': 'y'}, inplace=True)
volume_act = prepare_data(volume_act, r_hier)

inputFile = '/content/drive/MyDrive/Colab Notebooks/Revenue Prediction/data/ids2smooth.csv'
ids2smooth = pd.read_csv(inputFile)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  budget.rename(columns={'country': 'cost_object'}, inplace=True)


In [9]:
# Subset
regs2include = ['US10 - Astellas Pharma US, Inc.', 'US21 - Agensys, Inc.', 'JP10 - Astellas Pharma Inc']
volume_act = volume_act[volume_act['level3'].isin(regs2include)]

In [10]:
########################
# IDENTIFY UNIVERSE
########################
tested_ts = set(budget['unique_id'].unique()).intersection(volume_act['unique_id'].unique())

# Find unique IDs present in budget_h but not in rev
unique_ids_in_budget_not_in_rev = set(budget['unique_id'].unique()).difference(volume_act['unique_id'].unique())

# Find unique IDs present in rev but not in budget_h
unique_ids_in_rev_not_in_budget = set(volume_act['unique_id'].unique()).difference(budget['unique_id'].unique())

# Filter volume
volume_act = volume_act[volume_act['unique_id'].isin(tested_ts)]

In [None]:
########################
# TIME SERIES TYPES
########################
# new_products = ['ENFORTUMAB - Enforumab Vedotin', 'ROXADUSTNT - Roxadustant', 'GILTERITNB - Gilteritinib']
# loe_products = ['MICAFUNGIN - Micafungin Sodium']

# volume_np_act = volume_act[volume_act['bottom_level'].isin(new_products)]
# volume_loe_act = volume_act[volume_act['bottom_level'].isin(loe_products)]

# # Remove from original
# volume_act = volume_act[~(volume_act['bottom_level'].isin(new_products + loe_products))]

In [11]:
########################
# INTERMITTENT DEMAND
########################

# Function to calculate the percentage of zeros after the first non-zero
def calculate_percentage_zeros(df):
    # Find the index of the first non-zero entry in 'y'
    first_non_zero_index = df.loc[df['y'] != 0].index.min()
    # If there are no non-zero values, return None or 0 based on your preference
    if pd.isna(first_non_zero_index):
        return None  # Or return 0 if you want to treat this as 0% zeros following non-zero
    # Select the subset of 'y' after the first non-zero
    post_non_zero_series = df.loc[first_non_zero_index:, 'y']
    # Count the number of zeros in this subset
    num_zeros = (post_non_zero_series == 0).sum()
    # Calculate the percentage of zeros
    percentage_zeros = num_zeros / len(post_non_zero_series) * 100
    return percentage_zeros

# Apply the function to each group and reset index to make unique_id a column
percentage_zeros_df = volume_act.groupby('unique_id').apply(calculate_percentage_zeros).reset_index(name='percentage_zeros')

percentage_zeros_df


Unnamed: 0,unique_id,percentage_zeros
0,global/Japan/JP10 - Astellas Pharma Inc/ACLARB...,0.0
1,global/Japan/JP10 - Astellas Pharma Inc/BELZER...,0.0
2,global/Japan/JP10 - Astellas Pharma Inc/BIXALO...,0.0
3,global/Japan/JP10 - Astellas Pharma Inc/BLINAT...,0.0
4,global/Japan/JP10 - Astellas Pharma Inc/CERTOL...,0.0
5,global/Japan/JP10 - Astellas Pharma Inc/ENFORT...,0.0
6,global/Japan/JP10 - Astellas Pharma Inc/ENZA -...,0.0
7,global/Japan/JP10 - Astellas Pharma Inc/EVOLOC...,0.0
8,global/Japan/JP10 - Astellas Pharma Inc/GABAPE...,0.0
9,global/Japan/JP10 - Astellas Pharma Inc/GARENO...,0.0


In [12]:
########################
# CANDIDATES FOR AUTO-ARIMA
########################
import pandas as pd
from statsmodels.tsa.stattools import adfuller

# Function to test stationarity with handling for constant series
def test_stationarity(timeseries):
    # Check if the series is constant
    if np.all(timeseries == timeseries.iloc[0]):
        return 'constant'  # Return 'constant' if all values in the series are the same

    # Perform Dickey-Fuller test otherwise:
    dftest = adfuller(timeseries, autolag='AIC')
    # Return True if series is stationary, False otherwise
    return dftest[1] <= 0.05

# Function to difference the series
def difference_series(timeseries):
    return timeseries.diff().dropna()

# Your original dataframe
df = volume_act

# Prepare a DataFrame to store the results
results_df = pd.DataFrame(columns=['unique_id', 'original_stationary', 'differenced_stationary'])

# Loop through each unique_id
for unique_id in df['unique_id'].unique():
    # Extract the time series for the current unique_id
    sub_series = df[df['unique_id'] == unique_id].set_index('ds')['y']

    # Test for stationarity on the original series
    original_stationary = test_stationarity(sub_series)

    # Initialize differenced_stationary as None (it will stay None if differencing is not needed)
    differenced_stationary = None

    # If the original series is not stationary, difference the series and test again
    if not original_stationary:
        differenced_series = difference_series(sub_series)
        differenced_stationary = test_stationarity(differenced_series)

    # Append the results to the results DataFrame
    results_df = results_df.append({'unique_id': unique_id,
                                    'original_stationary': original_stationary,
                                    'differenced_stationary': differenced_stationary},
                                   ignore_index=True)


  results_df = results_df.append({'unique_id': unique_id,
  results_df = results_df.append({'unique_id': unique_id,
  results_df = results_df.append({'unique_id': unique_id,
  results_df = results_df.append({'unique_id': unique_id,
  results_df = results_df.append({'unique_id': unique_id,
  results_df = results_df.append({'unique_id': unique_id,
  results_df = results_df.append({'unique_id': unique_id,
  results_df = results_df.append({'unique_id': unique_id,
  results_df = results_df.append({'unique_id': unique_id,
  results_df = results_df.append({'unique_id': unique_id,
  results_df = results_df.append({'unique_id': unique_id,
  results_df = results_df.append({'unique_id': unique_id,
  results_df = results_df.append({'unique_id': unique_id,
  results_df = results_df.append({'unique_id': unique_id,
  results_df = results_df.append({'unique_id': unique_id,
  results_df = results_df.append({'unique_id': unique_id,
  results_df = results_df.append({'unique_id': unique_id,
  results_df =

In [13]:
########################
# SEGMENT TIME SERIES
########################
inter_demand_list = percentage_zeros_df[percentage_zeros_df['percentage_zeros']>=50]['unique_id'].tolist()

original_stationary_list = results_df[results_df['original_stationary']==True]['unique_id'].tolist()
differenced_stationary_list = results_df[(results_df['original_stationary']==False) & (results_df['differenced_stationary']==True)]['unique_id'].tolist()

auto_arima_list = [item for item in original_stationary_list if item not in inter_demand_list]

In [None]:
########################
# SMOOTH DATA
########################
# import numpy as np
# import pandas as pd
# import statsmodels.api as sm

# # Filter
# volume_act_sm = volume_act[volume_act['unique_id'].isin(ids2smooth['unique_id'])]

# # Columns to smooth
# columns_to_smooth = ['y']

# # New column names for the smoothed data
# new_columns = ['Smoothed_y']

# # Iterate through each unique ID
# for unique_id in volume_act_sm['unique_id'].unique():
#     # Filter the DataFrame for the current ID
#     temp_df = volume_act_sm[volume_act_sm['unique_id'] == unique_id]

#     # Iterate through each column to smooth
#     for col, new_col in zip(columns_to_smooth, new_columns):
#         # Ensure the time series is a pandas Series (assuming 'ds' is the datetime column)
#         time_series = pd.Series(temp_df[col].values, index=pd.to_datetime(temp_df['ds']))

#         # Set up and fit the model
#         model = sm.tsa.UnobservedComponents(time_series, 'local linear trend')
#         print(unique_id)
#         results = model.fit()

#         # Get the smoothed series
#         smoothed_series = results.smoothed_state[0]

#         # Ensure smoothed values are non-negative
#         smoothed_series = np.maximum(smoothed_series, 0)

#         # Adjust the smoothed series to maintain the original sum
#         original_sum = time_series.sum()
#         smoothed_sum = smoothed_series.sum()
#         if smoothed_sum > 0:  # Prevent division by zero
#             adjustment_factor = original_sum / smoothed_sum
#             adjusted_smoothed_series = smoothed_series * adjustment_factor
#         else:
#             adjusted_smoothed_series = smoothed_series  # Keep original if smoothed sum is zero

#         # Store the adjusted smoothed series back in the DataFrame
#         # Note: We're using 'temp_df.index' to ensure the indices match
#         volume_act_sm.loc[temp_df.index, new_col] = adjusted_smoothed_series

# # After looping, 'sga_output2' will contain the new adjusted smoothed columns
# volume_act_sm = volume_act_sm[['unique_id', 'ds', 'Smoothed_y']]
# volume_act_sm.columns = ['unique_id', 'ds', 'y']

# # Re-add non smoothed
# volume_act_smx = volume_act[~(volume_act['unique_id'].isin(ids2smooth['unique_id'])) & (volume_act['unique_id'].isin(budget['unique_id'])) & (volume_act['unique_id'].isin(tested_ts))]

# # Concat
# volume_act2 = pd.concat([volume_act_smx, volume_act_sm])

In [14]:
########################
# RUN AUTO ARIMA
########################
# volume_act2 = volume_act[volume_act['unique_id'].isin(auto_arima_list)]
volume_act2 = volume_act

#split train/test sets
test  = volume_act2.groupby('unique_id').tail(fct_periods)
train = volume_act2.drop(test.index)

models=[AutoETS(season_length=12), AutoARIMA(season_length = 12), AutoTheta(season_length = 12)]
# models=[AutoARIMA(season_length = 12)]
# models=[AutoETS(season_length=12)]

# Compute base predictions
fcst = StatsForecast(df=train[['unique_id', 'ds', 'y']],
                     models=models,
                     freq='MS', n_jobs=-1)

volume_fct = fcst.forecast(h=fct_periods)

# volume_fct.columns = ['unique_id', 'ds', 'Forecast']

  acf = avf[: nlags + 1] / avf[0]
  acf = avf[: nlags + 1] / avf[0]


In [15]:
volume_fct['Forecast'] = volume_fct[['AutoETS', 'AutoARIMA', 'AutoTheta']].mean(axis=1)
volume_fct = volume_fct[['unique_id', 'ds', 'Forecast']]

In [None]:
########################
# RUN TIMEGPT HIERARCHICAL
########################
# #split train/test sets
# Y_test_df  = Y_hier_df.groupby('unique_id').tail(fct_periods)
# Y_train_df = Y_hier_df.drop(Y_test_df.index)

# timegpt = TimeGPT(
#     # defaults to os.environ.get("TIMEGPT_TOKEN")
#     token = 'CgM6BBwvTfRrFetlmNCxZzRUAmSRCXndbU1CQJhPryVhgxWp0WgIDIpV9xagvCbg4ZH3TqG012wKMGNs8L7voLeNiydi5U6nFZtnu1P3VJrqECHKzZxCyef8FwNroHEkHM9vR5ltTGAFYW7bOgiiBRzjrdzgk2FIJFsLpXAP9GgMGqa0dVd8it3zbWQ9d02T7JBf9ikAPZyy1bIU206eQZV1Zdd1Rsd9a0PMjUyWxlgg0mckf59d7lwDhzGVBUpd'
# )
# tgpt = timegpt.forecast(Y_train_df, h=fct_periods, freq='MS', time_col='ds', target_col='y', level=[80, 90])

# # Create hierarchical forecast
# tgpt_h = tgpt.set_index('unique_id')
# tgpt_h = tgpt_h[['ds', 'TimeGPT']]

# # Reconcile the base predictions
# reconcilers = [
#     TopDown(method='forecast_proportions')
#     # OptimalCombination(method = 'ols', nonnegative=True)
#     # BottomUp()
#     # ERM(method='closed')
# ]

# hrec = HierarchicalReconciliation(reconcilers=reconcilers)
# Y_rec_df_tgpt = hrec.reconcile(Y_hat_df=tgpt_h, Y_df=Y_train_df,
#                           S=S_df, tags=tags)

# # Reset Index and columns
# Y_rec_df_tgpt = Y_rec_df_tgpt.reset_index()
# Y_rec_df_tgpt.columns = ['unique_id', 'ds', 'TGPT','TGPT_H']

# Save data
# Y_rec_df_tgpt.to_csv('/content/drive/MyDrive/Colab Notebooks/Revenue Prediction/data/tgpt_results.csv', index=False)

In [35]:
########################
# RUN LGBM
########################
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn import linear_model

# Feature Engineering
inputFile = '/content/drive/MyDrive/Colab Notebooks/Revenue Prediction/data/volume_output.csv'
features = pd.read_csv(inputFile)

# Merge hierarchy
features = features.merge(r_hier, how='left', on='cost_object')

features['unique_id'] = 'global' + '/' + features['region'] + '/' + features['cost_object'] + '/' + features['product']
features = features[['unique_id', 'region', 'cost_object', 'product']].drop_duplicates()

#split train/test sets
test  = volume_act2.groupby('unique_id').tail(fct_periods)
train = volume_act2.drop(test.index)

train_ml = train.merge(features, on='unique_id', how='left')

# Creating LabelEncoder instance
label_encoder = LabelEncoder()

# Encoding 'cost_object' and 'product'
train_ml['region_encoded'] = label_encoder.fit_transform(train_ml['region'])
train_ml['cost_object_encoded'] = label_encoder.fit_transform(train_ml['cost_object'])
train_ml['product_encoded'] = label_encoder.fit_transform(train_ml['product'])

# Subset
train_ml = train_ml[['unique_id', 'ds', 'y', 'region_encoded', 'cost_object_encoded','product_encoded']]

models = [
    lgb.LGBMRegressor(verbosity=-1),
]

from mlforecast import MLForecast
from mlforecast.target_transforms import Differences
from numba import njit
from window_ops.expanding import expanding_mean
from window_ops.rolling import rolling_mean

@njit
def rolling_mean_quarter(x):
    return rolling_mean(x, window_size=6)


fcst = MLForecast(
    models=models,
    freq='MS',
    lags=[1, 2, 3, 6, 12],
    lag_transforms={
        1: [expanding_mean],
        7: [rolling_mean_quarter]
    },
    date_features=['month']
    # ,target_transforms=[Differences([1])],
)

fcst.fit(train_ml, static_features=['region_encoded', 'cost_object_encoded', 'product_encoded'])

# Predict
lgbm_fct = fcst.predict(fct_periods)
lgbm_fct.head()

Unnamed: 0,unique_id,ds,LGBMRegressor
0,global/Japan/JP10 - Astellas Pharma Inc/ACLARB...,2023-01-01,-5158.652409
1,global/Japan/JP10 - Astellas Pharma Inc/ACLARB...,2023-02-01,-5158.652409
2,global/Japan/JP10 - Astellas Pharma Inc/ACLARB...,2023-03-01,-6217.438742
3,global/Japan/JP10 - Astellas Pharma Inc/ACLARB...,2023-04-01,-9295.740904
4,global/Japan/JP10 - Astellas Pharma Inc/ACLARB...,2023-05-01,-9295.740904


In [54]:
import pandas as pd
from darts import TimeSeries
from darts.models import RegressionModel
from darts.dataprocessing.transformers import Scaler
from lightgbm import LGBMRegressor
from darts.metrics import mae



# Prepare individual time series for each 'unique_id'
time_series_dict = {}
for uid in train_ml['unique_id'].unique():
    df = train_ml[train_ml['unique_id'] == uid]
    # Ensure the dataframe is sorted by date
    df = df.sort_values('ds')
    time_series_dict[uid] = TimeSeries.from_dataframe(df, 'ds', 'y')

# Define feature columns and target column
feature_columns = ['region_encoded', 'cost_object_encoded', 'product_encoded']
target_column = 'y'

# Create and train a regression model for each time series
models = {}
scalers = {}  # It's often a good practice to scale your features
forecasts = {}
for uid, ts in time_series_dict.items():
    # We're assuming here that you want to use historical target values ('y') as features.
    # Darts can automatically lag the target series for this purpose when calling 'fit()'.
    # Here, we're also scaling the features for improved performance.
    scaler = Scaler()
    ts_transformed = scaler.fit_transform(ts)
    scalers[uid] = scaler  # Store scaler to inverse transform later

    model = RegressionModel(
        model=LGBMRegressor(),
        lags=len(ts) // 2,  # Number of lags (past values) to use; you can choose another strategy
        output_chunk_length=1,  # Predict one step at a time
        use_static_covariates=True
    )
    model.fit(series=ts_transformed)
    models[uid] = model

    # Generate forecasts (example: forecast 3 time steps ahead)
    forecast = model.predict(n=fct_periods, series=ts_transformed)
    forecasts[uid] = scalers[uid].inverse_transform(forecast)  # Inverse transform to get back to original scale

# Convert dict to dataframe
forecast_dfs = []

for unique_id, forecast_ts in forecasts.items():
    # Convert each TimeSeries to a DataFrame
    df = forecast_ts.pd_dataframe()
    # Add a column for 'unique_id' to identify the forecasts
    df['unique_id'] = unique_id
    # Append the DataFrame to the list
    forecast_dfs.append(df)

# Concatenate all forecast DataFrames into a single DataFrame
all_forecasts_df = pd.concat(forecast_dfs, axis=0)

# Reset the index if you want the DataFrame to have a default integer index
all_forecasts_df.reset_index(inplace=True)

all_forecasts_df = all_forecasts_df[['unique_id', 'ds', 'y']]
all_forecasts_df.columns.name = None


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000059 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 284
[LightGBM] [Info] Number of data points in the train set: 53, number of used features: 24
[LightGBM] [Info] Start training from score 0.384464
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 53, number of used features: 0
[LightGBM] [Info] Start training from score 0.134379
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000057 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 284
[LightGBM] [Info] Number of data points in the train set: 53, number of used features: 24
[LightGBM] [Info] Start training from score 0.318499
[LightGBM] [Info] Auto-

In [76]:
########################
# METRICS
########################
inputFile = '/content/drive/MyDrive/Colab Notebooks/Revenue Prediction/data/tgpt_results.csv'
tgpt_fct = pd.read_csv(inputFile)

# Subset
volume_act_xsm = volume_act[['unique_id', 'ds', 'y']]
volume_act_sm = volume_act2[['unique_id', 'ds', 'y']]
budget2 = budget[['unique_id', 'ds', 'y']]

# Assign names
volume_act_xsm.rename(columns={'y': 'Actuals'}, inplace=True)
budget2.rename(columns={'y': 'Budget'}, inplace=True)
volume_act_sm.rename(columns={'y': 'Smoothed'}, inplace=True)
all_forecasts_df.rename(columns={'y': 'LGBM'}, inplace=True)

# Merge actuals, budget and forecast
rev_at = volume_act_xsm.merge(volume_fct, on=['unique_id', 'ds'], how='left')
rev_at = rev_at.merge(volume_act_sm, on=['unique_id', 'ds'], how='left')
rev_at = rev_at.merge(budget2, on=['unique_id', 'ds'], how='left')
rev_at = rev_at.merge(all_forecasts_df, on=['unique_id', 'ds'], how='left')

# Only keep tested ts
rev_at = rev_at[rev_at['unique_id'].isin(tested_ts)]

rev_at = rev_at[rev_at['unique_id'].isin(volume_act['unique_id'].unique())]




# Filter for dates
data4metrics = rev_at[(rev_at['ds']<=fct_end_date) & (rev_at['ds']>=fct_st_date)]

# Sum up the values for each unique_id
numeric_cols = data4metrics.columns.drop(['unique_id', 'ds'])
summed_df = data4metrics.groupby('unique_id')[numeric_cols].sum()

# Calculate difference and percentage differences from 'Actuals'
absolute_diff = summed_df.subtract(summed_df['Actuals'], axis=0).abs()
percentage_diff = summed_df.subtract(summed_df['Actuals'], axis=0).div(summed_df['Actuals'], axis=0).abs()

# Drop the 'Actuals' column as we don't need to compare it with itself
absolute_diff.drop(columns=['Actuals', 'Smoothed'], inplace=True)

# Find the column with the lowest difference for each unique_id and add to metrics table
min_diff_col = absolute_diff.idxmin(axis=1)
data4metrics['lowest_diff_col'] = data4metrics['unique_id'].map(min_diff_col)

# Find winner
winner = data4metrics.groupby('lowest_diff_col')

# Get Budget winners
bud_winners = winner.get_group('Budget')['unique_id'].unique()

winner['unique_id'].nunique()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  volume_act_xsm.rename(columns={'y': 'Actuals'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  budget2.rename(columns={'y': 'Budget'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  volume_act_sm.rename(columns={'y': 'Smoothed'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-doc

lowest_diff_col
Budget      25
Forecast    10
LGBM         9
Name: unique_id, dtype: int64

In [None]:
print(percentage_diff[percentage_diff['Forecast']<10].shape)
print(percentage_diff[percentage_diff['Budget']<10].shape)

In [77]:
########################
# CREATE PLOT DATA
########################
fct_st_date = pd.to_datetime(fct_st_date)

# Add revenue actuals
data2plot = rev_at.copy()
data2plot['ds'] = pd.to_datetime(data2plot['ds'])

# Update Actuals columns
data2plot['Actuals (Train)'] = data2plot['Actuals'].copy()
data2plot['Actuals'] = data2plot.apply(lambda row: row['Actuals'] if row['ds'] >= fct_st_date else None, axis=1)
data2plot['Actuals (Train)'] = data2plot.apply(lambda row: row['Actuals (Train)'] if row['ds'] < fct_st_date else None, axis=1)

# Add TGPT forecast
tgpt_fct['ds'] = pd.to_datetime(tgpt_fct['ds'])
data2plot = data2plot.merge(tgpt_fct, on=['ds', 'unique_id'], how='left')

# Filter to end date
data2plot = data2plot[data2plot['ds']<=fct_end_date]

# Find TS to fix
ts2fix = data2plot[data2plot['unique_id'].isin(bud_winners)]

data2plot.head()

Unnamed: 0,unique_id,ds,Actuals,Forecast,Smoothed,Budget,LGBM,Actuals (Train),TGPT,TGPT_H
0,global/Japan/JP10 - Astellas Pharma Inc/ACLARB...,2014-04-01,,,0.0,,,0.0,,
1,global/Japan/JP10 - Astellas Pharma Inc/ACLARB...,2014-05-01,,,0.0,,,0.0,,
2,global/Japan/JP10 - Astellas Pharma Inc/ACLARB...,2014-06-01,,,0.0,,,0.0,,
3,global/Japan/JP10 - Astellas Pharma Inc/ACLARB...,2014-07-01,,,0.0,,,0.0,,
4,global/Japan/JP10 - Astellas Pharma Inc/ACLARB...,2014-08-01,,,0.0,,,0.0,,


In [78]:
########################
# PLOT
########################
import ipywidgets as widgets
from ipywidgets import interact
import matplotlib.pyplot as plt
import pandas as pd

import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display, HTML
import base64
from io import BytesIO

# Update the function to include filtering based on 'unique_id'
def plot_data(unique_id):
    # Define x_column and y_columns directly
    x_column = data2use.columns[1]
    y_columns = [data2use.columns[2], data2use.columns[3], data2use.columns[5], data2use.columns[6], data2use.columns[7]]

    # Filter data based on selected unique_id
    filtered_data = data2plot[data2plot['unique_id'] == unique_id]

    # Set up a 1x3 grid of subplots
    fig, (ax1, ax4) = plt.subplots(1, 2, figsize=(25, 5), gridspec_kw={'width_ratios': [4, 1]}) # Adjust layout for table

    # Plotting multiple y-axes on the first subplot
    for y_column in y_columns:
        ax1.plot(filtered_data[x_column], filtered_data[y_column], label=y_column)
    ax1.set_xlabel(x_column)
    ax1.set_ylabel('Values')
    ax1.set_title(f'Revenue for {unique_id}')
    ax1.legend()

    # Remove axis for table
    ax4.axis('off')
    ax4.axis('tight')

    # Displaying the sum table
    display_data = filtered_data[[x_column] + list(y_columns)].copy()
    display_data = display_data[display_data['ds']>=fct_st_date]
    display_data['ds'] = display_data['ds'].dt.strftime('%m/%d/%Y')

    # Create a sum row
    sum_values = {x_column: 'Sum'}
    for col in list(y_columns):
        sum_values[col] = display_data[col].sum()
    sum_row = pd.DataFrame([sum_values])

    # Create a % diff row
    actuals_sum = sum_values['Actuals']
    pdiff_values = {x_column: '% Diff'}
    for col in list(y_columns):
        pdiff_values[col] = ((display_data[col].sum()-actuals_sum) / actuals_sum) * 100 if actuals_sum != 0 else None
        pdiff_values[col] = round(pdiff_values[col], 2)
    perc_diff_row = pd.DataFrame([pdiff_values])

    # Stack the sum row
    display_data = pd.concat([sum_row, display_data], ignore_index=True)

    # Round the values and add commas
    for column in y_columns:
        if column in display_data.columns:
            # Round to two decimal places
            display_data[column] = display_data[column].round(2)
            # Format with commas
            display_data[column] = display_data[column].apply(lambda x: f"{x:,.2f}")

    # Stack the % diff and remove 'Actuals Train'
    display_data = pd.concat([perc_diff_row, display_data], ignore_index=True)
    display_data = display_data.drop('Actuals (Train)', axis=1)

    # Convert perc_diff_data to array for table
    table_data = display_data.to_numpy()
    # Add table at the right
    table = ax4.table(cellText=table_data, colLabels=display_data.columns, loc='right')
    table.auto_set_font_size(False)
    table.set_fontsize(8.5)  # Set smaller font size if necessary
    table.scale(4, 1.8)  # Adjust scale to fit

    plt.tight_layout()
    plt.show()


data2use = ts2fix
# data2use = data2plot

# Create widgets
unique_id_selector = widgets.SelectionSlider(
    options=data2use['unique_id'].unique(),
    description='unique_id:',
    orientation='horizontal',
    readout=True
)

# Display interactive plot
interact(plot_data, unique_id=unique_id_selector)

interactive(children=(SelectionSlider(description='unique_id:', options=('global/Japan/JP10 - Astellas Pharma …

In [None]:
####################
# CREATE FINAL DF AND SAVE DATA
####################
volume_fct = data4metrics[data4metrics['unique_id'].isin(tested_ts)]

volume_fct = volume_fct[['unique_id', 'ds', 'Forecast','Actuals', 'Budget', 'lowest_diff_col']]

# Sum up the values for each unique_id
numeric_cols = volume_fct.columns.drop(['unique_id', 'ds', 'lowest_diff_col'])
volume_sum = volume_fct.groupby('unique_id')[numeric_cols].sum()

# Calculate difference and percentage differences from 'Actuals'
absolute_diff = volume_sum.subtract(volume_sum['Actuals'], axis=0).abs()
percentage_diff = volume_sum.subtract(volume_sum['Actuals'], axis=0).div(volume_sum['Actuals'], axis=0).abs()

# Drop uneeded columns
absolute_diff.drop(columns=['Actuals', 'Budget'], inplace=True)

# Find the column with the lowest difference for each unique_id and add to metrics table
min_diff_col = absolute_diff.idxmin(axis=1)
volume_fct['lowest_diff_col'] = volume_fct['unique_id'].map(min_diff_col)

# Define a function to apply to each row
def get_value_from_column(row):
    return row[row['lowest_diff_col']]

# Apply this function to each row
volume_fct['fct'] = volume_fct.apply(get_value_from_column, axis=1)

# Select columns
volume_fct = volume_fct[['unique_id', 'ds','fct']]

volume_fct.to_csv('/content/drive/MyDrive/Colab Notebooks/Revenue Prediction/data/volume_fct.csv', index=False)