<a href="https://colab.research.google.com/github/achett/Hierarchical-Model/blob/main/ROI_Framework.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [252]:
# !pip install hierarchicalforecast
# !pip install statsforecast
# !pip install datasetsforecast
# !pip install "nixtlats>=0.1.0"
# !pip install darts
# !pip install mlforecast

In [253]:
########################
# PACKAGES
########################
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import openpyxl
from datetime import datetime
from functools import reduce
from dateutil.relativedelta import relativedelta

from statsmodels.tsa.stattools import adfuller
from sklearn.preprocessing import LabelEncoder
import statsmodels.api as sm
from itertools import product
import ast


pd.options.display.float_format = '{:,.2f}'.format

In [254]:
##############
# PARAMS
##############
# Forecast settings: number of trailing periods used as the hold-out window,
# and the first / last month of the evaluated fiscal year.
fct_periods = 15
fct_st_date = '2023-04-01'
fct_end_date = '2024-03-01'

# Create hierarchical structure and constraints:
# each entry is the cumulative list of grouping columns for one level,
# from ['TopLv'] down to the full seven-column path ending in 'Lv5'.
_hierarchy_columns = ['TopLv', 'ProductLv', 'Lv1', 'Lv2', 'Lv3', 'Lv4', 'Lv5']
hierarchy_levels = [
    _hierarchy_columns[:depth]
    for depth in range(1, len(_hierarchy_columns) + 1)
]

# Regional hierarchy lookup (read from a named sheet)
inputFile = '/content/drive/MyDrive/Colab Notebooks/Revenue Prediction/data/regional_hierarchy.xlsx'
sheet_name = 'regional_hierarchy v2'
r_hier = pd.read_excel(inputFile, sheet_name=sheet_name)

# Model selection table (default sheet)
inputFile = '/content/drive/MyDrive/Colab Notebooks/Revenue Prediction/data/model_selection.xlsx'
model_selection = pd.read_excel(inputFile)

# Product naming convention table (default sheet)
inputFile = '/content/drive/MyDrive/Colab Notebooks/Revenue Prediction/data/Product Naming Convention.xlsx'
product_naming_convention = pd.read_excel(inputFile)

In [255]:
##############
# FUNCTIONS
##############
def prepare_data(data, r_hier):
    """Attach the regional hierarchy to a long-format table and aggregate it.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'cost_object', 'product', 'ds' and 'y'.
        The frame passed in is not modified.
    r_hier : pandas.DataFrame
        Hierarchy lookup with the columns 'TopLv', 'Lv1', 'Lv2', 'Lv3',
        'Lv4' and 'Lv5'; it is inner-joined on ``cost_object == Lv5``, so
        rows whose cost object is missing from the lookup are dropped.

    Returns
    -------
    pandas.DataFrame
        One row per (hierarchy path, 'ds') with the columns
        'TopLv', 'ProductLv', 'Lv1', 'Lv2', 'Lv3', 'Lv4', 'Lv5', 'ds',
        'unique_id', 'y', where 'y' is the sum over duplicate rows and
        'unique_id' is the seven level values joined with '/'.
    """
    level_columns = ['TopLv', 'ProductLv', 'Lv1', 'Lv2', 'Lv3', 'Lv4', 'Lv5']

    # Merge hierarchy (merge/rename return new frames, the caller's data is untouched)
    data = data.merge(r_hier, how='inner', left_on='cost_object', right_on='Lv5')
    data = data.rename(columns={'product': 'ProductLv'})

    # Transform date and y; missing y counts as 0
    data['ds'] = pd.to_datetime(data['ds'])
    data['y'] = data['y'].astype(float).fillna(0)

    # Create hierarchical dataframe. The explicit copy makes the column
    # assignments below operate on an independent frame instead of a slice.
    data = data[level_columns + ['ds', 'y']].copy()

    # Missing levels become '' and any '/' inside a level name is replaced by
    # '_' because '/' is the separator used in unique_id.
    for column in level_columns:
        data[column] = data[column].fillna('').str.replace('/', '_', regex=False)

    unique_id = data[level_columns[0]]
    for column in level_columns[1:]:
        unique_id = unique_id + '/' + data[column]
    data['unique_id'] = unique_id

    # Group by every column except 'y' and sum 'y'
    grouping_columns = level_columns + ['ds', 'unique_id']
    data = data.groupby(grouping_columns)['y'].sum().reset_index()

    return data

def prepare_feature(data, r_hier, volume_act2, feature_name):
    """Prepare one feature column and align it to a reference set of series.

    The column ``feature_name`` is temporarily renamed to 'y' so that it can
    be run through ``prepare_data`` (hierarchy merge, cleaning, aggregation),
    then renamed back. The result is right-joined onto the
    ('unique_id', 'ds') pairs of ``volume_act2``, so every pair of the
    reference frame appears in the output; pairs without a feature value
    carry NaN in the feature and hierarchy columns.
    """
    # Route the feature through prepare_data under the name 'y'
    feature_as_y = data[['cost_object', 'product', 'ds', feature_name]].rename(
        columns={feature_name: 'y'}
    )
    prepared = prepare_data(feature_as_y, r_hier).rename(columns={'y': feature_name})

    # Keep exactly the (unique_id, ds) combinations of the reference frame
    reference_keys = volume_act2[['unique_id', 'ds']]
    return prepared.merge(reference_keys, how='right', on=['unique_id', 'ds'])


In [256]:
##############
# DATA LOAD
##############
# Input locations: revenue / volume history with their model settings, and the budget
inputFile_dx_r = '/content/drive/MyDrive/Colab Notebooks/Revenue Prediction/data/revenue_output.csv'
inputFile_ms_r = '/content/drive/MyDrive/Colab Notebooks/Revenue Prediction/data/unrestricted_model_settings.csv'

inputFile_dx_v = '/content/drive/MyDrive/Colab Notebooks/Revenue Prediction/data/volume_output.csv'
inputFile_ms_v = '/content/drive/MyDrive/Colab Notebooks/Revenue Prediction/data/unrestricted_model_settings_volume.csv'

inputFile_bud = '/content/drive/MyDrive/Colab Notebooks/Revenue Prediction/data/budgetFY23.csv'

# Budget: its 'country' column plays the role of 'cost_object'
budget = prepare_data(
    pd.read_csv(inputFile_bud).rename(columns={'country': 'cost_object'}),
    r_hier,
)

# Volume and revenue history: the 'value' column is the target 'y'
data_v = prepare_data(
    pd.read_csv(inputFile_dx_v).rename(columns={'value': 'y'}),
    r_hier,
)
data_r = prepare_data(
    pd.read_csv(inputFile_dx_r).rename(columns={'value': 'y'}),
    r_hier,
)

# SG&A spend, used to build the 'AP' and 'Field_Sales' features
inputFile = '/content/drive/MyDrive/Colab Notebooks/SGA Prediction/data/sga_output.csv'
sga = pd.read_csv(inputFile)

# Each feature is aligned to the (unique_id, ds) pairs of the revenue data;
# rows without a product (pairs that had no spend record) are removed and a
# Product+Lv5 key is added.
sga1 = (
    prepare_feature(sga, r_hier, data_r, 'AP')[['unique_id', 'ProductLv', 'Lv5', 'ds', 'AP']]
    .dropna(subset=['ProductLv'])
    .assign(ID=lambda frame: frame['ProductLv'] + frame['Lv5'])
)

sga2 = (
    prepare_feature(sga, r_hier, data_r, 'Field_Sales')[['unique_id', 'ProductLv', 'Lv5', 'ds', 'Field_Sales']]
    .dropna(subset=['ProductLv'])
    .assign(ID=lambda frame: frame['ProductLv'] + frame['Lv5'])
)

In [257]:
########################
# IDENTIFY UNIVERSE
########################
# Series that are present in the budget, the volume data AND the revenue data
temp_intersection = set(budget['unique_id'].unique()).intersection(data_v['unique_id'].unique())
tested_ts = temp_intersection.intersection(data_r['unique_id'].unique())

# Filter. The explicit copies matter: these frames are modified in place by
# later cells, and assigning into a boolean-mask slice is ambiguous in pandas
# (SettingWithCopyWarning).
data_v = data_v[data_v['unique_id'].isin(tested_ts)].copy()
data_r = data_r[data_r['unique_id'].isin(tested_ts)].copy()
budget = budget[budget['unique_id'].isin(tested_ts)].copy()

In [258]:
########################
# SEGMENT TIME SERIES
########################
# Total A&P and Field Sales spend per series
grouped1 = sga1.groupby('unique_id')[['AP']].sum()
grouped2 = sga2.groupby('unique_id')[['Field_Sales']].sum()

# A series counts as "spend" when either total is strictly positive
spend_ids = set(grouped1.index[grouped1['AP'] > 0]) | set(grouped2.index[grouped2['Field_Sales'] > 0])

# Every remaining revenue series is "no spend"
non_spend_ids = data_r.loc[~data_r['unique_id'].isin(spend_ids), 'unique_id'].unique()

In [259]:
########################
# DATA CONVERSION
########################
# unique_id values whose history before 2022-04-01 is overwritten with 0 below
set2zero_list=['Global/TAMSULOSIN - Tamsulosin HCl/D_GCN - Greater China/D_CN_TOTAL - China Total/D_CN_TOTAL - China Total/D_CN_TOTAL - China Total/D_CN_TOTAL - China Total',
       'Global/TAMSULOSIN - Tamsulosin HCl/D_E_ESTMKT - Established Markets/D_E_MSM - Mid Size Markets/D_E_BENELUX - Benelux/D_E_BELGIUM - Belgium/D_E_BELGIUM - Belgium',
       'Global/TAMSULOSIN - Tamsulosin HCl/D_E_ESTMKT - Established Markets/D_E_MSM - Mid Size Markets/D_E_BBMCI - BBMCI group/D_E_BALKANS - Balkans/D_E_BOS_HER - Bosnia-Herz.',
       'Global/TAMSUL_TAB - Tamsulosin tab/D_E_ESTMKT - Established Markets/D_E_MSM - Mid Size Markets/D_E_BBMCI - BBMCI group/D_E_BALKANS - Balkans/D_E_BOS_HER - Bosnia-Herz.',
       'Global/TAMSULOSIN - Tamsulosin HCl/D_E_ESTMKT - Established Markets/D_E_MSM - Mid Size Markets/D_E_ADRCS_PT - Adriatics & Portugal/D_E_ADRCS - Adriatic Adriatics/D_E_CROATIA - Croatia',
       'Global/TAMSUL_TAB - Tamsulosin tab/D_E_ESTMKT - Established Markets/D_E_MSM - Mid Size Markets/D_E_ADRCS_PT - Adriatics & Portugal/D_E_ADRCS - Adriatic Adriatics/D_E_CROATIA - Croatia',
       'Global/TAMSUL_TAB - Tamsulosin tab/D_E_ESTMKT - Established Markets/D_E_MSM - Mid Size Markets/D_E_PCSU - PCSU/D_E_CZSK - Czech + Slovakia/D_E_CZECH - Czech',
       'Global/TAMSULOSIN - Tamsulosin HCl/D_E_ESTMKT - Established Markets/D_E_FRANCE - France/D_E_FRANCE - France/D_E_FRANCE - France/D_E_FRANCE - France',
       'Global/TAMSUL_TAB - Tamsulosin tab/D_E_ESTMKT - Established Markets/D_E_FRANCE - France/D_E_FRANCE - France/D_E_FRANCE - France/D_E_FRANCE - France',
       'Global/TAMSUL_TAB - Tamsulosin tab/D_E_ESTMKT - Established Markets/D_E_GB - Great Britain/D_E_GB - Great Britain/D_E_GB - Great Britain/D_E_GB - Great Britain',
       'Global/TAMSULOSIN - Tamsulosin HCl/D_E_ESTMKT - Established Markets/D_E_MSM - Mid Size Markets/D_E_HUBGROGR - HBRG/D_E_HUBGRO - Hungary  Bulgaria & Romania/D_E_HU - Hungary',
       'Global/TAMSUL_TAB - Tamsulosin tab/D_E_ESTMKT - Established Markets/D_E_MSM - Mid Size Markets/D_E_IE - Ireland/D_E_IE - Ireland/D_E_IE - Ireland',
       'Global/TAMSULOSIN - Tamsulosin HCl/D_E_ESTMKT - Established Markets/D_E_IT - Italy/D_E_IT - Italy/D_E_IT - Italy/D_E_IT - Italy',
       'Global/TAMSULOSIN - Tamsulosin HCl/D_E_ESTMKT - Established Markets/D_E_MSM - Mid Size Markets/D_E_BBMCI - BBMCI group/D_E_MTCYIS - Malta  Cyprus & Iceland/D_E_MALTA - Malta',
       'Global/TAMSUL_TAB - Tamsulosin tab/D_E_ESTMKT - Established Markets/D_E_MSM - Mid Size Markets/D_E_NORDIC - Nordic/D_E_NORWAY - Norway/D_E_NORWAY - Norway',
       'Global/TAMSULOSIN - Tamsulosin HCl/D_E_ESTMKT - Established Markets/D_E_MSM - Mid Size Markets/D_E_PCSU - PCSU/D_E_PO - Poland/D_E_PO - Poland',
       'Global/TAMSUL_TAB - Tamsulosin tab/D_E_ESTMKT - Established Markets/D_E_MSM - Mid Size Markets/D_E_PCSU - PCSU/D_E_PO - Poland/D_E_PO - Poland',
       'Global/TAMSUL_TAB - Tamsulosin tab/D_E_ESTMKT - Established Markets/D_E_MSM - Mid Size Markets/D_E_PCSU - PCSU/D_E_CZSK - Czech + Slovakia/D_E_SLOVAKIA - Slovakia',
       'Global/TAMSULOSIN - Tamsulosin HCl/D_E_ESTMKT - Established Markets/D_E_SPAIN - Spain/D_E_SPAIN - Spain/D_E_SPAIN - Spain/D_E_SPAIN - Spain',
       'Global/TAMSULOSIN - Tamsulosin HCl/D_E_ESTMKT - Established Markets/D_E_MSM - Mid Size Markets/D_E_PCSU - PCSU/D_E_UA - Ukraine/D_E_UA - Ukraine',
       'Global/TAMSUL_TAB - Tamsulosin tab/D_E_ESTMKT - Established Markets/D_E_MSM - Mid Size Markets/D_E_PCSU - PCSU/D_E_UA - Ukraine/D_E_UA - Ukraine',
       'Global/TAMSULOSIN - Tamsulosin HCl/D_GCN - Greater China/D_HK_TOTAL - Hong Kong Total/D_HK_TOTAL - Hong Kong Total/D_HK_TOTAL - Hong Kong Total/D_HK_TOTAL - Hong Kong Total',
       'Global/TAMSUL_TAB - Tamsulosin tab/D_GCN - Greater China/D_HK_TOTAL - Hong Kong Total/D_HK_TOTAL - Hong Kong Total/D_HK_TOTAL - Hong Kong Total/D_HK_TOTAL - Hong Kong Total',
       'Global/TAMSULOSIN - Tamsulosin HCl/D_I_INTL - International Markets/D_I_RBK_CORE - RBK Core/D_I_CIS_BEL - Belarus/D_I_CIS_BEL - Belarus/D_I_CIS_BEL - Belarus',
       'Global/TAMSUL_TAB - Tamsulosin tab/D_I_INTL - International Markets/D_I_RBK_CORE - RBK Core/D_I_CIS_BEL - Belarus/D_I_CIS_BEL - Belarus/D_I_CIS_BEL - Belarus',
       'Global/TAMSULOSIN - Tamsulosin HCl/D_I_INTL - International Markets/D_I_RBK_CORE - RBK Core/D_I_CIS_KAZ - Kazakhstan/D_I_CIS_KAZ - Kazakhstan/D_I_CIS_KAZ - Kazakhstan',
       'Global/TAMSUL_TAB - Tamsulosin tab/D_I_INTL - International Markets/D_I_RBK_CORE - RBK Core/D_I_CIS_KAZ - Kazakhstan/D_I_CIS_KAZ - Kazakhstan/D_I_CIS_KAZ - Kazakhstan',
       'Global/TAMSULOSIN - Tamsulosin HCl/D_I_INTL - International Markets/D_I_RBK_CORE - RBK Core/D_I_CIS_RUS - Russia/D_I_CIS_RUS - Russia/D_I_CIS_RUS - Russia',
       'Global/TAMSUL_TAB - Tamsulosin tab/D_I_INTL - International Markets/D_I_RBK_CORE - RBK Core/D_I_CIS_RUS - Russia/D_I_CIS_RUS - Russia/D_I_CIS_RUS - Russia',
       'Global/TAMSUL_TAB - Tamsulosin tab/D_I_INTL - International Markets/D_I_MEA_OB - MEA Own Business/D_I_EGYPT - Egypt/D_I_EGYPT - Egypt/D_I_EGYPT - Egypt',
       'Global/TAMSULOSIN - Tamsulosin HCl/D_I_INTL - International Markets/D_I_APAC_CORE - APAC CORE/D_I_INDONESIA - Indonesia/D_I_INDONESIA - Indonesia/D_I_INDONESIA - Indonesia',
       'Global/TAMSUL_TAB - Tamsulosin tab/D_I_INTL - International Markets/D_I_MEA_DB - Distributor Business/D_I_IRAQ - Iraq/D_I_IRAQ - Iraq/D_I_IRAQ - Iraq',
       'Global/TAMSUL_TAB - Tamsulosin tab/D_I_INTL - International Markets/D_I_MEA_DB - Distributor Business/D_I_JORDAN - Jordan/D_I_JORDAN - Jordan/D_I_JORDAN - Jordan',
       'Global/TAMSULOSIN - Tamsulosin HCl/D_I_INTL - International Markets/D_I_LATAM - Domestic - LatAM/D_I_LATAM_REST - Domestic Rest of Latam/D_I_LATAM_REST_OTH - Domestic Rest of Latam Others/D_I_LATAM_REST_OTH - Domestic Rest of Latam Others',
       'Global/TAMSUL_TAB - Tamsulosin tab/D_I_INTL - International Markets/D_I_MEA_DB - Distributor Business/D_I_LEBANON - Lebanon/D_I_LEBANON - Lebanon/D_I_LEBANON - Lebanon',
       'Global/TAMSULOSIN - Tamsulosin HCl/D_I_INTL - International Markets/D_I_APAC_CORE - APAC CORE/D_I_PHILIPPINES - Philippines/D_I_PHILIPPINES - Philippines/D_I_PHILIPPINES - Philippines',
       'Global/TAMSULOSIN - Tamsulosin HCl/D_I_INTL - International Markets/D_I_MEA_OB - MEA Own Business/D_I_SAFRICA - South Africa/D_I_SAFRICA - South Africa/D_I_SAFRICA - South Africa',
       'Global/TAMSUL_TAB - Tamsulosin tab/D_I_INTL - International Markets/D_I_APAC_CORE - APAC CORE/D_I_SINMAL - SINMAL/D_I_SINGAPORE - SINGAPORE/D_I_SINGAPORE - SINGAPORE',
       'Global/TAMSULOSIN - Tamsulosin HCl/D_I_INTL - International Markets/D_I_TURKEY - Turkey/D_I_TURKEY - Turkey/D_I_TURKEY - Turkey/D_I_TURKEY - Turkey',
       'Global/TAMSUL_TAB - Tamsulosin tab/D_I_INTL - International Markets/D_I_APAC_CORE - APAC CORE/D_I_VIETNAM - Vietnam/D_I_VIETNAM - Vietnam/D_I_VIETNAM - Vietnam',
       'Global/TAMSULOSIN - Tamsulosin HCl/D_GCN - Greater China/D_TW_TOTAL - Taiwan Total/D_TW_TOTAL - Taiwan Total/D_TW_TOTAL - Taiwan Total/D_TW_TOTAL - Taiwan Total',
       'Global/TAMSUL_TAB - Tamsulosin tab/D_GCN - Greater China/D_TW_TOTAL - Taiwan Total/D_TW_TOTAL - Taiwan Total/D_TW_TOTAL - Taiwan Total/D_TW_TOTAL - Taiwan Total',
      'Global/SOLIF_TAMS - Solifenacin _ Tamsulosin/D_E_ESTMKT - Established Markets/D_E_IT - Italy/D_E_IT - Italy/D_E_IT - Italy/D_E_IT - Italy',
       'Global/SOLIF_TAMS - Solifenacin _ Tamsulosin/D_E_ESTMKT - Established Markets/D_E_MSM - Mid Size Markets/D_E_BENELUX - Benelux/D_E_NETHLND - Netherlands/D_E_NETHLND - Netherlands',
       'Global/SOLIF_TAMS - Solifenacin _ Tamsulosin/D_E_ESTMKT - Established Markets/D_E_MSM - Mid Size Markets/D_E_PCSU - PCSU/D_E_CZSK - Czech + Slovakia/D_E_SLOVAKIA - Slovakia',
       'Global/SOLIF_TAMS - Solifenacin _ Tamsulosin/D_E_ESTMKT - Established Markets/D_E_MSM - Mid Size Markets/D_E_PCSU - PCSU/D_E_UA - Ukraine/D_E_UA - Ukraine',
       'Global/SOLIF_TAMS - Solifenacin _ Tamsulosin/D_I_INTL - International Markets/D_I_LATAM - Domestic - LatAM/D_I_LATAM_DB - Domestic Latam Distributor Business/D_I_AR - Domestic Argentina/D_I_AR - Domestic Argentina',
       'Global/SOLIF_TAMS - Solifenacin _ Tamsulosin/D_I_INTL - International Markets/D_I_RBK_CORE - RBK Core/D_I_CIS_KAZ - Kazakhstan/D_I_CIS_KAZ - Kazakhstan/D_I_CIS_KAZ - Kazakhstan',
        'Global/ENZA - Enzalutamide/D_I_INTL - International Markets/D_I_RBK_CORE - RBK Core/D_I_CIS_RUS - Russia/D_I_CIS_RUS - Russia/D_I_CIS_RUS - Russia']

# unique_id values whose history before 2021-04-01 is overwritten with 0 below
set2zero_list2=['Global/ISA_SULFAT - Isavuconazonium Sulfate/D_USCOM - US Commercial/D_USCOM - US Commercial/D_USCOM - US Commercial/D_USCOM - US Commercial/US10 - Astellas Pharma US, Inc.']

# Overwrite 'y' with 0 for the listed series on every date before the cutoff.
# The same rule is applied to the volume data ...
data_v.loc[(data_v['unique_id'].isin(set2zero_list)) & (data_v['ds'] < '2022-04-01'), 'y'] = 0
data_v.loc[(data_v['unique_id'].isin(set2zero_list2)) & (data_v['ds'] < '2021-04-01'), 'y'] = 0

# ... and to the revenue data.
data_r.loc[(data_r['unique_id'].isin(set2zero_list)) & (data_r['ds'] < '2022-04-01'), 'y'] = 0
data_r.loc[(data_r['unique_id'].isin(set2zero_list2)) & (data_r['ds'] < '2021-04-01'), 'y'] = 0

# **ORIGINAL RUN**

In [203]:
########################
# XTREND - DECAY
########################
# import pandas as pd
# from dateutil.relativedelta import relativedelta
# import numpy as np

# def apply_exponential_decay(df, start_date, end_date, end_value_percentage, target_unique_ids):

#     # Convert 'ds' to datetime if it's not already and sort
#     df['ds'] = pd.to_datetime(df['ds'])
#     df = df.sort_values(by='ds')
#     start_date = pd.to_datetime(start_date)
#     end_date = pd.to_datetime(end_date)

#     # Loop through each group (unique_id)
#     for unique_id in target_unique_ids:
#         group = df[df['unique_id'] == unique_id]

#         # Columns to apply decay to
#         decay_columns = [col for col in group.columns if col not in ['unique_id', 'ds']]

#         # Initialize a dictionary to keep the end values for each decay column
#         end_values = {}

#         # Find start and end values and dates for each column
#         for col in decay_columns:
#             if start_date in group['ds'].values and end_date in group['ds'].values:
#                 start_value = group.loc[group['ds'] == start_date, col].iloc[0]
#                 end_value = start_value * end_value_percentage
#                 end_values[col] = end_value  # Store the end value for this column

#                 # Calculate the decay rate based on exponential decay formula
#                 months = relativedelta(end_date, start_date).months
#                 decay_rate = np.log(end_value / start_value) / months

#                 # Apply exponential decay for dates between start_date and end_date
#                 for date in pd.date_range(start_date, end_date):
#                     if date in group['ds'].values:
#                         t = relativedelta(date, start_date).months
#                         new_value = start_value * np.exp(decay_rate * t)
#                         df.loc[(df['unique_id'] == unique_id) & (df['ds'] == date), col] = new_value

#         # Replace column values for dates after end_date with the respective end values
#         for col, end_value in end_values.items():
#             if end_value is not None:  # Ensure there was an end value calculated
#                 df.loc[(df['unique_id'] == unique_id) & (df['ds'] > end_date), col] = end_value

#     return df


# # Apply exponential decay
# # lgbm_fct.rename(columns={'LGBM': 'y'}, inplace=True)
# ets_df.rename(columns={'ETS': 'y'}, inplace=True)
# arima_df.rename(columns={'ARIMA': 'y'}, inplace=True)

# # Micafungin
# arima_df = apply_exponential_decay(arima_df, '2023-07-01', '2023-08-01', 0, divested_ids)
# ets_df = apply_exponential_decay(ets_df, '2023-07-01', '2023-08-01', 0, divested_ids)

# # Lexiscan
# arima_df = apply_exponential_decay(arima_df, '2023-01-01', '2023-12-01', .1, loe_ids)
# ets_df = apply_exponential_decay(ets_df, '2023-01-01', '2023-12-01', .1, loe_ids)

# # # Tamsulosin
# # tamsulosin_ids = volume_act[volume_act['ProductLv'].isin(['TAMSULOSIN - Tamsulosin HCl', 'TAMSUL_TAB - Tamsulosin tab'])]['unique_id'].unique()
# # arima_df = apply_exponential_decay(arima_df, '2023-04-01', '2023-12-01', .9, tamsulosin_ids)
# # ets_df = apply_exponential_decay(ets_df, '2023-04-01', '2023-12-01', .9, tamsulosin_ids)

# # Solifinacin Tamsulosin
# solif_tams_ids = volume_act[(volume_act['ProductLv'].isin(['SOLIF_TAMS - Solifenacin _ Tamsulosin'])) & (volume_act['Lv5'].isin(['D_E_PORTUGAL - Portugal', 'D_E_SPAIN - Spain', 'D_E_GB - Great Britain', 'D_E_BG - Bulgaria']))]['unique_id'].unique()
# arima_df = apply_exponential_decay(arima_df, '2023-06-01', '2023-12-01', .7, solif_tams_ids)
# ets_df = apply_exponential_decay(ets_df, '2023-06-01', '2023-12-01', .7, solif_tams_ids)

# # Xtandi
# xtandi_ids = volume_act[(volume_act['ProductLv'].isin(['ENZA - Enzalutamide']))]['unique_id'].unique()
# arima_df = apply_exponential_decay(arima_df, '2027-11-01', '2028-11-01', .1, xtandi_ids)
# ets_df = apply_exponential_decay(ets_df, '2027-11-01', '2028-11-01', .1, xtandi_ids)

# # Mirabegron
# mira_ids = volume_act[(volume_act['ProductLv'].isin(['MIRABEGRON - Mirabegron']))]['unique_id'].unique()
# arima_df = apply_exponential_decay(arima_df, '2025-11-01', '2026-11-01', .1, mira_ids)
# ets_df = apply_exponential_decay(ets_df, '2025-11-01', '2026-11-01', .1, mira_ids)

# # Cresemba
# cres_ids = volume_act[(volume_act['ProductLv'].isin(['ISA_SULFAT - Isavuconazonium Sulfate']))]['unique_id'].unique()
# arima_df = apply_exponential_decay(arima_df, '2027-03-01', '2028-03-01', .1, cres_ids)
# ets_df = apply_exponential_decay(ets_df, '2027-03-01', '2028-03-01', .1, cres_ids)

# **BULK RUN**

In [260]:
def distribute_negatives(series):
    """Remove negative values by netting them against the same fiscal year.

    Within each fiscal year (April to March) every negative value is set to
    zero and the removed amount is subtracted from the non-negative values
    of that year, proportionally to their size. The amount taken away is
    capped at the year's non-negative total, so the result never goes below
    zero. A year with no non-negative values, or whose non-negative values
    sum to zero, has nothing to absorb the negatives: its negatives are
    simply set to zero.

    Parameters
    ----------
    series : pandas.Series
        Values indexed by a ``DatetimeIndex``. The input is not modified.

    Returns
    -------
    pandas.Series
        Adjusted values in fiscal-year order. The index has two levels:
        the fiscal-year period and the original timestamp.

    Raises
    ------
    ValueError
        If the index of ``series`` is not a ``DatetimeIndex``.
    """
    # Make sure the index is a DatetimeIndex
    if not isinstance(series.index, pd.DatetimeIndex):
        raise ValueError("The index of the series must be a DatetimeIndex")

    # Group by fiscal year (April to March). Newer pandas spells the annual
    # alias 'Y-MAR'; older releases only know the legacy 'A-MAR'.
    try:
        fiscal_years = series.index.to_period('Y-MAR')
    except ValueError:
        fiscal_years = series.index.to_period('A-MAR')

    def adjust_fiscal_year(group):
        # Identify negative and non-negative values (NaN belongs to neither)
        negative_values = group[group < 0]
        positive_values = group[group >= 0]

        # Work on a copy so the data handed in by groupby is never mutated
        adjusted = group.copy()
        adjusted.loc[negative_values.index] = 0

        # Nothing can absorb the negatives: either there are no non-negative
        # values, or they are all zero (dividing by that total would turn the
        # zeros into NaN).
        total_positive_value = positive_values.sum()
        if positive_values.empty or total_positive_value == 0:
            return adjusted

        # Never remove more than the year's non-negative total
        total_negative_value = max(negative_values.sum(), -total_positive_value)

        # Distribute the negative amount proportionally to the positive values
        weights = positive_values / total_positive_value
        adjusted.loc[positive_values.index] += total_negative_value * weights

        return adjusted

    # group_keys=True pins the two-level (fiscal year, timestamp) result index
    # instead of relying on the pandas default, which changed between releases.
    adjusted_series = series.groupby(fiscal_years, group_keys=True).apply(adjust_fiscal_year)

    return adjusted_series

In [261]:
def forecast_error(df, volume_act, exog_df, fct_periods, fct_st_date, fct_end_date='2024-03-01'):
    """Back-test one UnobservedComponents model per row of ``df``.

    For every row the matching series is taken from ``volume_act``, the last
    ``fct_periods`` observations are held out, the model is fitted on the
    rest and a forecast up to ``fct_end_date`` is produced. Negative forecast
    values are redistributed with ``distribute_negatives``. Actuals and
    forecast are then compared from ``fct_st_date`` onwards.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per model. 'Product' and 'Lv5' identify the series, 'exog' is
        either falsy or a list of ``{column: lag}`` dicts, and every other
        column is passed as a keyword argument to
        ``sm.tsa.UnobservedComponents``.
    volume_act : pandas.DataFrame
        Actuals with the columns 'unique_id', 'ProductLv', 'Lv5', 'ds', 'y'.
    exog_df : pandas.DataFrame
        Exogenous values keyed by 'unique_id' and 'ds'; must contain every
        column named in 'exog'.
    fct_periods : int
        Number of trailing observations used as the hold-out period.
    fct_st_date : str or datetime-like
        First date included in the error calculation.
    fct_end_date : str or datetime-like, default '2024-03-01'
        Last forecast date handed to ``get_forecast``. When exogenous
        variables are used it has to match the end of the hold-out period.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``results`` with one row per model and the columns 'id2test',
        'Product', 'Lv5', 'actuals_fy23', 'forecast_fy23', 'error' (error in
        percent of actuals), and ``results_monthly`` with one row per model
        and date and the columns 'id2test', 'Product', 'Lv5', 'ds',
        'Actuals', 'DX'.
    """
    summary_columns = ['id2test', 'Product', 'Lv5', 'actuals_fy23', 'forecast_fy23', 'error']
    monthly_columns = ['id2test', 'Product', 'Lv5', 'ds', 'Actuals', 'DX']

    # Collected per model and assembled once after the loop; growing a
    # DataFrame with concat on every iteration is quadratic.
    summary_rows = []
    monthly_frames = []

    for _, row in df.iterrows():
        product = row['Product']
        Lv5 = row['Lv5']
        model_settings = row.drop(['Product', 'Lv5', 'exog']).to_dict()

        # Flatten [{name: lag}, ...] into a single {name: lag} mapping
        if row['exog']:
            exog2include = {k: v for d in row['exog'] for k, v in d.items()}
        else:
            exog2include = {}
        exog_names = list(exog2include)

        # Determine unique ID based on product and Lv5
        id2test = volume_act[(volume_act['ProductLv'] == product) & (volume_act['Lv5'] == Lv5)]['unique_id'].unique()[0]

        # Filter the data for this specific ID and add exog variables
        ssm_data = volume_act[volume_act['unique_id'] == id2test]
        ssm_data = ssm_data.merge(exog_df[['unique_id', 'ds'] + exog_names], how='left', on=['unique_id', 'ds'])
        # Copy so that the column assignments below do not write into a slice
        ssm_data = ssm_data[['ds', 'y'] + exog_names].copy()

        # Missing exog values count as 0; each variable is then lagged and the
        # rows opened up by the shift are filled with 0 as well
        for column, lag in exog2include.items():
            ssm_data[column] = ssm_data[column].fillna(0).shift(lag).fillna(0)

        # Set ds as index, with an explicit frequency for statsmodels
        ssm_data = ssm_data.set_index('ds')
        ssm_data.index = pd.DatetimeIndex(ssm_data.index.values, freq=ssm_data.index.inferred_freq)

        # Set train and test period
        ssm_test = ssm_data.tail(fct_periods)
        ssm_train = ssm_data.drop(ssm_test.index)

        if row['exog']:
            exog_test = ssm_test[exog_names]
            exog_train = ssm_train[exog_names]
        else:
            exog_test = None
            exog_train = None

        # Fit the model
        mod = sm.tsa.UnobservedComponents(ssm_train['y'], exog=exog_train, **model_settings)
        res = mod.fit(method='powell', disp=False)

        # Forecast the hold-out period
        forecast = res.get_forecast(fct_end_date, exog=exog_test)

        # Negative value distribution. Only the timestamp (last index level)
        # of the adjusted series is kept, whatever shape its index has.
        adjusted = distribute_negatives(forecast.predicted_mean)
        forecast_mean = pd.DataFrame(
            {'y': adjusted.to_numpy()},
            index=pd.Index(adjusted.index.get_level_values(-1), name='ds'),
        )

        actuals_in_window = ssm_test.loc[ssm_test.index >= fct_st_date, 'y']
        forecast_in_window = forecast_mean.loc[forecast_mean.index >= fct_st_date, 'y']

        # Yearly Error. Zero actuals deliberately yield inf/NaN; errstate only
        # silences the RuntimeWarning that would otherwise be printed per model.
        actuals_fy23 = actuals_in_window.sum()
        forecast_fy23 = forecast_in_window.sum()
        with np.errstate(divide='ignore', invalid='ignore'):
            error = ((forecast_fy23 - actuals_fy23) / actuals_fy23) * 100

        # Monthly Error: actuals and forecast side by side on common dates
        error_monthly = pd.DataFrame({'Actuals': actuals_in_window}).merge(
            pd.DataFrame({'DX': forecast_in_window}), left_index=True, right_index=True
        )
        error_monthly['id2test'] = id2test
        error_monthly['Product'] = product
        error_monthly['Lv5'] = Lv5
        error_monthly['ds'] = error_monthly.index
        error_monthly = error_monthly[monthly_columns]

        summary_rows.append({
            'id2test': id2test,
            'Product': product,
            'Lv5': Lv5,
            'actuals_fy23': actuals_fy23,
            'forecast_fy23': forecast_fy23,
            'error': error,
        })
        monthly_frames.append(error_monthly)

    results = pd.DataFrame(summary_rows, columns=summary_columns)
    if monthly_frames:
        results_monthly = pd.concat(monthly_frames, ignore_index=True)
    else:
        results_monthly = pd.DataFrame(columns=monthly_columns)

    return results, results_monthly

In [262]:
# Dataframes needed for both price and volume

# Budget: monthly values per series, plus the total over all budget rows
budget_monthly = budget[['unique_id', 'ds', 'y']].rename(
    columns={'unique_id': 'id2test', 'y': 'BUD'}
)
grouped_budget = (
    budget_monthly.groupby(['id2test'])['BUD']
    .sum()
    .reset_index()
    .rename(columns={'BUD': 'budget_fy23'})
)

# Create exog dataframe: A&P rows with the matching Field Sales value attached
field_sales = sga2[['unique_id', 'ds', 'Field_Sales']]
sga_exog = sga1.merge(field_sales, how='left', on=['unique_id', 'ds'])

## VOLUME

In [263]:
# Run volume
unrestricted_model_settings = pd.read_csv(inputFile_ms_v, dtype={'autoregressive': 'Int64', 'seasonal': 'Int64'})

def safe_literal_eval(s):
    """Parse a Python literal stored as text; missing values become None."""
    if pd.isna(s):
        return None
    return ast.literal_eval(s)

# 'freq_seasonal' and 'exog' are stored as text in the CSV and need parsing
unrestricted_model_settings['freq_seasonal'] = unrestricted_model_settings['freq_seasonal'].apply(safe_literal_eval)
unrestricted_model_settings['exog'] = unrestricted_model_settings['exog'].apply(safe_literal_eval)

# Drop the models flagged 'No Data' and the bookkeeping columns
unrestricted_model_settings = unrestricted_model_settings[(unrestricted_model_settings['Success']!='No Data')]
unrestricted_model_settings = unrestricted_model_settings.drop(['Success', 'LostinOrig', '%ofSales'], axis=1)

# Keep only the models whose Product+Lv5 key exists in the volume data.
# The keys are built as standalone Series, so no temporary column has to be
# added to (and removed from) data_v or the settings frame.
settings_keys = unrestricted_model_settings['Product'] + unrestricted_model_settings['Lv5']
volume_keys = (data_v['ProductLv'] + data_v['Lv5']).unique()
unrestricted_model_settings = unrestricted_model_settings[settings_keys.isin(volume_keys)]

In [245]:
# product2test = 'ENZA - Enzalutamide'
# lv52test = 'US10 - Astellas Pharma US, Inc.'

# unrestricted_model_settings = unrestricted_model_settings[(unrestricted_model_settings['Product']==product2test) & (unrestricted_model_settings['Lv5']==lv52test)]

# unrestricted_model_settings

Unnamed: 0,Product,Lv5,irregular,autoregressive,level,stochastic_level,trend,stochastic_trend,cycle,damped_cycle,stochastic_cycle,seasonal,stochastic_seasonal,freq_seasonal,exog
0,ENZA - Enzalutamide,"US10 - Astellas Pharma US, Inc.",True,,True,True,True,True,True,False,True,6,True,,[{'Field_Sales': 5}]


In [264]:
# Calculate the number of chunks
num_rows = len(unrestricted_model_settings)
chunk_size = 50  # Process 50 rows at a time
num_chunks = (num_rows // chunk_size) + (1 if num_rows % chunk_size else 0)

# 1-based index of the first chunk to process (raise it to resume a run)
start_chunk = 1
# num_chunks = 1

# Column layout of the combined outputs
results_columns_v = ['winner','Lv5', 'Product', 'actuals_fy23', 'forecast_fy23', 'error', 'id2test']
results_monthly_columns_v = ['id2test','Product', 'Lv5', 'ds', 'Actuals', 'DX', 'BUD']

# Per-chunk outputs are collected here and concatenated once after the loop
results_chunks_v = []
results_monthly_chunks_v = []

# Loop through each chunk
for i in range(start_chunk - 1, num_chunks):
    start_index = i * chunk_size
    end_index = start_index + chunk_size

    # Get the subset of the DataFrame
    subset = unrestricted_model_settings.iloc[start_index:end_index]

    # Defensive stop in case a chunk turns out to be empty
    if subset.empty:
        break

    # Run the function
    results_df, results_monthly_df = forecast_error(subset, data_v, sga_exog, fct_periods, fct_st_date)

    results_chunks_v.append(results_df)
    results_monthly_chunks_v.append(results_monthly_df)

# Concat; columns that forecast_error does not produce ('winner', 'BUD') are left empty
if results_chunks_v:
    results2comp_full_v = pd.concat(results_chunks_v, ignore_index=True).reindex(columns=results_columns_v)
    results2comp_full_monthly_v = pd.concat(results_monthly_chunks_v, ignore_index=True).reindex(columns=results_monthly_columns_v)
else:
    results2comp_full_v = pd.DataFrame(columns=results_columns_v)
    results2comp_full_monthly_v = pd.DataFrame(columns=results_monthly_columns_v)

results2comp_full_v.shape

  unconstrained[offset] = np.log(
  direc1 = x - x1
  warn("Specified model does not contain a stochastic element;"
  unconstrained[offset] = np.log(
  direc1 = x - x1
  unconstrained[offset] = np.log(
  direc1 = x - x1
  error = ((forecast_fy23 - actuals_fy23) / actuals_fy23) * 100
  unconstrained[offset] = np.log(
  direc1 = x - x1
  unconstrained[offset] = np.log(
  direc1 = x - x1
  unconstrained[offset] = np.log(
  direc1 = x - x1
  error = ((forecast_fy23 - actuals_fy23) / actuals_fy23) * 100
  error = ((forecast_fy23 - actuals_fy23) / actuals_fy23) * 100
  error = ((forecast_fy23 - actuals_fy23) / actuals_fy23) * 100
  warn("Specified model does not contain a stochastic element;"
  error = ((forecast_fy23 - actuals_fy23) / actuals_fy23) * 100
  unconstrained[offset] = np.log(
  direc1 = x - x1
  warn("Specified model does not contain a stochastic element;"
  error = ((forecast_fy23 - actuals_fy23) / actuals_fy23) * 100
  unconstrained[offset] = np.log(
  direc1 = x - x1
  uncons

(549, 7)

## REVENUE

In [265]:
# Run Revenue
unrestricted_model_settings = pd.read_csv(inputFile_ms_r, dtype={'autoregressive': 'Int64', 'seasonal': 'Int64'})

# 'freq_seasonal' and 'exog' are stored as text in the CSV and need parsing.
# safe_literal_eval is defined once, in the volume settings cell; it is not
# redefined here.
unrestricted_model_settings['freq_seasonal'] = unrestricted_model_settings['freq_seasonal'].apply(safe_literal_eval)
unrestricted_model_settings['exog'] = unrestricted_model_settings['exog'].apply(safe_literal_eval)

# Drop the models flagged 'No Data' and the bookkeeping columns
unrestricted_model_settings = unrestricted_model_settings[(unrestricted_model_settings['Success']!='No Data')]
unrestricted_model_settings = unrestricted_model_settings.drop(['Success', 'LostinOrig', '%ofSales'], axis=1)

# Keep only the models whose Product+Lv5 key exists in the revenue data.
# The keys are built as standalone Series, so no temporary column has to be
# added to (and removed from) data_r or the settings frame.
settings_keys = unrestricted_model_settings['Product'] + unrestricted_model_settings['Lv5']
revenue_keys = (data_r['ProductLv'] + data_r['Lv5']).unique()
unrestricted_model_settings = unrestricted_model_settings[settings_keys.isin(revenue_keys)]

In [236]:
# Debug helper (disabled): uncomment the lines below to restrict
# unrestricted_model_settings to a single Product / Lv5 combination.
# product2test = 'ENZA - Enzalutamide'
# lv52test = 'US10 - Astellas Pharma US, Inc.'

# unrestricted_model_settings = unrestricted_model_settings[(unrestricted_model_settings['Product']==product2test) & (unrestricted_model_settings['Lv5']==lv52test)]

# unrestricted_model_settings

Unnamed: 0,Product,Lv5,irregular,autoregressive,level,stochastic_level,trend,stochastic_trend,cycle,damped_cycle,stochastic_cycle,seasonal,stochastic_seasonal,freq_seasonal,exog
0,ENZA - Enzalutamide,"US10 - Astellas Pharma US, Inc.",True,,True,True,True,True,True,False,True,6,True,,[{'Field_Sales': 5}]


In [266]:
# Calculate the number of chunks
num_rows = len(unrestricted_model_settings)
chunk_size = 50  # Process 50 rows at a time
num_chunks = (num_rows + chunk_size - 1) // chunk_size  # ceiling division

start_chunk = 1  # 1-based index of the first chunk to process
# num_chunks = 1

# Create dataframes to concat to (empty, but they carry the expected columns)
results2comp_full_r = pd.DataFrame(columns=['winner','Lv5', 'Product', 'actuals_fy23', 'forecast_fy23', 'error', 'id2test'])
results2comp_full_monthly_r = pd.DataFrame(columns=['id2test','Product', 'Lv5', 'ds', 'Actuals', 'DX', 'BUD'])

# Per-chunk results are collected in lists and concatenated once, instead of
# re-copying the accumulated frames on every iteration. The empty frames go
# first so the result is the same as the previous running concat.
summary_parts_r = [results2comp_full_r]
monthly_parts_r = [results2comp_full_monthly_r]

# Loop through each chunk
try:
    for i in range(start_chunk - 1, num_chunks):
        start_index = i * chunk_size
        end_index = start_index + chunk_size

        # Get the subset of the DataFrame
        subset = unrestricted_model_settings.iloc[start_index:end_index]

        # Safety guard: with the ceiling division above this only triggers if
        # num_chunks is overridden by hand to a value past the last row.
        if subset.empty:
            break

        # Run the function
        results_df, results_monthly_df = forecast_error(subset, data_r, sga_exog, fct_periods, fct_st_date)

        summary_parts_r.append(results_df)
        monthly_parts_r.append(results_monthly_df)
finally:
    # Concat once. Doing it in `finally` keeps the chunks that already
    # finished available if a later chunk raises or the run is interrupted.
    results2comp_full_r = pd.concat(summary_parts_r, ignore_index=True)
    results2comp_full_monthly_r = pd.concat(monthly_parts_r, ignore_index=True)

results2comp_full_r.shape

  unconstrained[offset] = np.log(
  direc1 = x - x1
  unconstrained[offset] = np.log(
  direc1 = x - x1
  unconstrained[offset] = np.log(
  direc1 = x - x1
  unconstrained[offset] = np.log(
  direc1 = x - x1
  unconstrained[offset] = np.log(
  direc1 = x - x1
  unconstrained[offset] = np.log(
  direc1 = x - x1
  unconstrained[offset] = np.log(
  direc1 = x - x1
  unconstrained[offset] = np.log(
  direc1 = x - x1
  unconstrained[offset] = np.log(
  direc1 = x - x1
  unconstrained[offset] = np.log(
  direc1 = x - x1
  unconstrained[offset] = np.log(
  direc1 = x - x1
  unconstrained[offset] = np.log(
  direc1 = x - x1
  warn("Specified model does not contain a stochastic element;"
  unconstrained[offset] = np.log(
  direc1 = x - x1
  unconstrained[offset] = np.log(
  direc1 = x - x1
  unconstrained[offset] = np.log(
  direc1 = x - x1
  unconstrained[offset] = np.log(
  direc1 = x - x1
  unconstrained[offset] = np.log(
  direc1 = x - x1
  unconstrained[offset] = np.log(
  direc1 = x - x1


(610, 7)

## PRICE

In [267]:
# Relabel the monthly result columns positionally so the forecast column is
# tagged by run: DX_v for the volume results, DX_r for the revenue results.
shared_monthly_cols = ['id2test', 'Product', 'Lv5', 'ds', 'Actuals']
results2comp_full_monthly_v.columns = shared_monthly_cols + ['DX_v', 'BUD']
results2comp_full_monthly_r.columns = shared_monthly_cols + ['DX_r', 'BUD']

# Pair each revenue row with the volume row for the same series and month.
# The join is inner, so rows present on only one side are dropped.
join_keys = ['id2test', 'Product', 'Lv5', 'ds']
revenue_side = results2comp_full_monthly_r[shared_monthly_cols + ['DX_r']]
volume_side = results2comp_full_monthly_v[join_keys + ['DX_v']]
results2comp_full_monthly = revenue_side.merge(volume_side, how='inner', on=join_keys)

# Calculate price: revenue forecast divided by volume forecast.
# Rows where DX_v is zero or missing come out as inf / NaN.
results2comp_full_monthly['DX_p'] = results2comp_full_monthly['DX_r'].div(results2comp_full_monthly['DX_v'])

results2comp_full_monthly

Unnamed: 0,id2test,Product,Lv5,ds,Actuals,DX_r,DX_v,DX_p
0,Global/ENZA - Enzalutamide/D_USCOM - US Commer...,ENZA - Enzalutamide,"US10 - Astellas Pharma US, Inc.",2023-04-01,24090218306.25,25431312044.92,30428.36,835776.60
1,Global/ENZA - Enzalutamide/D_USCOM - US Commer...,ENZA - Enzalutamide,"US10 - Astellas Pharma US, Inc.",2023-05-01,33083833436.25,27080544134.89,31523.68,859054.03
2,Global/ENZA - Enzalutamide/D_USCOM - US Commer...,ENZA - Enzalutamide,"US10 - Astellas Pharma US, Inc.",2023-06-01,21820056005.00,28262233897.83,32849.12,860364.96
3,Global/ENZA - Enzalutamide/D_USCOM - US Commer...,ENZA - Enzalutamide,"US10 - Astellas Pharma US, Inc.",2023-07-01,26651095577.50,24330720020.65,29527.68,823997.14
4,Global/ENZA - Enzalutamide/D_USCOM - US Commer...,ENZA - Enzalutamide,"US10 - Astellas Pharma US, Inc.",2023-08-01,29541795543.75,26435684698.99,31219.90,846757.54
...,...,...,...,...,...,...,...,...
6583,Global/ENZA - Enzalutamide/D_I_INTL - Internat...,ENZA - Enzalutamide,D_I_AR - Domestic Argentina,2023-11-01,0.00,0.00,423.75,0.00
6584,Global/ENZA - Enzalutamide/D_I_INTL - Internat...,ENZA - Enzalutamide,D_I_AR - Domestic Argentina,2023-12-01,-78923750.00,0.00,296.88,0.00
6585,Global/ENZA - Enzalutamide/D_I_INTL - Internat...,ENZA - Enzalutamide,D_I_AR - Domestic Argentina,2024-01-01,98640148.75,0.00,11.10,0.00
6586,Global/ENZA - Enzalutamide/D_I_INTL - Internat...,ENZA - Enzalutamide,D_I_AR - Domestic Argentina,2024-02-01,0.00,0.00,572.54,0.00


In [226]:
# Spot check: show the monthly rows for one Product / Lv5 combination.
is_enza = results2comp_full_monthly['Product'] == 'ENZA - Enzalutamide'
is_north_macedonia = results2comp_full_monthly['Lv5'] == 'D_E_MACEDONIA - North Macedonia'
results2comp_full_monthly[is_enza & is_north_macedonia]

Unnamed: 0,id2test,Product,Lv5,ds,Actuals,DX_r,DX_v,DX_p
48,Global/ENZA - Enzalutamide/D_E_ESTMKT - Establ...,ENZA - Enzalutamide,D_E_MACEDONIA - North Macedonia,2023-04-01,0.0,0.0,,
49,Global/ENZA - Enzalutamide/D_E_ESTMKT - Establ...,ENZA - Enzalutamide,D_E_MACEDONIA - North Macedonia,2023-05-01,0.0,0.0,,
50,Global/ENZA - Enzalutamide/D_E_ESTMKT - Establ...,ENZA - Enzalutamide,D_E_MACEDONIA - North Macedonia,2023-06-01,0.0,0.0,,
51,Global/ENZA - Enzalutamide/D_E_ESTMKT - Establ...,ENZA - Enzalutamide,D_E_MACEDONIA - North Macedonia,2023-07-01,0.0,0.0,,
52,Global/ENZA - Enzalutamide/D_E_ESTMKT - Establ...,ENZA - Enzalutamide,D_E_MACEDONIA - North Macedonia,2023-08-01,0.0,3869.3,,
53,Global/ENZA - Enzalutamide/D_E_ESTMKT - Establ...,ENZA - Enzalutamide,D_E_MACEDONIA - North Macedonia,2023-09-01,0.0,0.0,,
54,Global/ENZA - Enzalutamide/D_E_ESTMKT - Establ...,ENZA - Enzalutamide,D_E_MACEDONIA - North Macedonia,2023-10-01,0.0,0.0,,
55,Global/ENZA - Enzalutamide/D_E_ESTMKT - Establ...,ENZA - Enzalutamide,D_E_MACEDONIA - North Macedonia,2023-11-01,0.0,0.0,,
56,Global/ENZA - Enzalutamide/D_E_ESTMKT - Establ...,ENZA - Enzalutamide,D_E_MACEDONIA - North Macedonia,2023-12-01,0.0,0.0,,
57,Global/ENZA - Enzalutamide/D_E_ESTMKT - Establ...,ENZA - Enzalutamide,D_E_MACEDONIA - North Macedonia,2024-01-01,0.0,0.0,,


In [269]:
# Count the distinct series (id2test) that have at least one month where the
# derived price DX_p is missing or infinite. Uses a vectorised mask instead
# of a row-by-row lambda, and a single .loc selection instead of chained indexing.
dx_p = results2comp_full_monthly['DX_p']
invalid_price = dx_p.isna() | dx_p.isin([np.inf, -np.inf])
results2comp_full_monthly.loc[invalid_price, 'id2test'].nunique()

87

## COMPARE TO BUDGET

In [None]:
# Add budget: left-join budget_monthly on series id and month, then keep the
# reporting columns in a fixed order.
budget_report_cols = ['id2test', 'Product', 'Lv5', 'ds', 'Actuals', 'BUD', 'DX_r', 'DX_v', 'DX_p']
results2comp_monthly = (
    results2comp_full_monthly
    .merge(budget_monthly, how='left', on=['id2test', 'ds'])
    [budget_report_cols]
)

In [None]:
# Create ID (Product + Lv5) on both the summary and the monthly results.
# NOTE(review): results2comp_full is not created in the cells visible here;
# confirm it is defined before this cell runs.
for frame in (results2comp_full, results2comp_full_monthly):
    frame['ID'] = frame['Product'] + frame['Lv5']

# Save the output to a CSV file, naming them uniquely
# results2comp_full.to_csv(f'/content/drive/MyDrive/Colab Notebooks/Revenue Prediction/results/Blind Test FY23 Volume/results_summary_v+p.csv', index=False)
# NOTE(review): this exports results2comp_full_monthly, not the budget-merged
# results2comp_monthly; confirm that is the intended frame.
monthly_csv_path = '/content/drive/MyDrive/Colab Notebooks/Revenue Prediction/results/results_summary_volume_monthly_v+p.csv'
results2comp_full_monthly.to_csv(monthly_csv_path, index=False)