# Libraries and Functions

In [1]:
import pandas as pd
import numpy as np
import pandas as pd
import openpyxl
from PIL import Image
import io
import re
import dateutil.relativedelta
import os
import math
from datetime import date
from datetime import datetime
import functools
import statistics
import functools

def folder_create(wd, lev_2_folder, folder):
    # Create the string path
    path = os.path.join(wd, lev_2_folder)
    path = os.path.join(path, folder)
    try: 
        # Create the directory stored in 'path'
        os.mkdir(path)
    except OSError as error: 
        print('Folder already exists!') 

def data_prep(wd, folder, df, data_type):
    # Drop NaN rows & reset index
    df = df.dropna(how='all').reset_index(drop=True)
    # Slice 'df' from the first valid index row to all remaining rows to all columns then reset the index
    df = df.iloc[df.iloc[:,3].first_valid_index():,:].reset_index(drop=True)
    # Replace column headers with an index
    df.columns = np.arange(len(df.columns))
    # Assign new values to the first 3 cells in row zero
    df.iloc[0, 0:3] = [0, 1, 2]
    # Make row zero the column names
    df.columns = df.iloc[0]
    # Slice out row zero & reset the index again
    df = df.iloc[1:, :].reset_index(drop=True)
    # Replace zeros with 'NaN'
    df = df.replace(0, np.nan)
    # Replace 'LC - LC' with 'JPY - Japan Yen' for all 'D_INTL_LEGACY - International Legacy'
    if data_type == 'units':
        conds = [df[1] != 'D_INTL_LEGACY - International Legacy',
                 df[1] == 'D_INTL_LEGACY - International Legacy']
        choices = ['LC - LC', 'JPY - Japan Yen']
        rslt = np.select(conds, choices).tolist()
        df[0] = rslt

        
    # Create a list of the currency
    currency_list = list(df[0])
    # Create a list of the costobjects
    costobject_list = list(df[1])
    # Create a list of the products
    product_list = list(df[2])
    # Drop columns '0', '1', & '2'
    df.drop([0, 1, 2], inplace=True, axis=1)
    # Concatenate the currency, costobject & product to create the tag then append it to 'tags'
    tags = list()
    i = 0
    while i < len(df):
        x = currency_list[i] + '_' + costobject_list[i] + '_' + product_list[i]
        tags.append(x)
        i += 1
    # Add the tags to 'df'
    df_tags = pd.DataFrame(tags, columns=['tag'])
    df = pd.concat([df_tags, df], axis=1, ignore_index=False)
    
    
    # Convert all the column headers to a list
    df_header_list = list(df)
    # Covert all the dates in 'df_header_list' to %Y-%m-%d format & store them in 'converted_dates_list'
    converted_dates_list = list()
    for x in df_header_list:
        if x == 'tag':
            pass
        else:
            try:
                z = re.search('[A-Z][a-z]+', x)
                z = z.group(0)
                y = re.search('[0-9][0-9]', x)
                y = y.group(0)
                if z in ['January', 'February', 'March']:
                    y = str(int(y) + 1)
                year = '20' + y
                date = z + '01' + year
                d = datetime.strptime(date, '%B%d%Y').strftime('%Y-%m-%d')
                converted_dates_list.append(d)
            except:
                pass
    # Transpose & reset the index to get rid of the old dates
    df_T = df.T.reset_index(drop=True)
    # Make row 0 the column headers
    df_T.columns = df_T.iloc[0]
    # Drop the now duplicate row 0 & reset the index again
    df_T = df_T.drop(0).reset_index(drop=True)
    # Convert the entire dataframe to numeric data types
    df_T = df_T.apply(pd.to_numeric)
    # Add the months back to the data
    df_months = pd.DataFrame(converted_dates_list, columns=['ds'])
    df_concat = pd.concat([df_months, df_T], axis=1, ignore_index=False)
    # Add '-01' to create 'accrual_month'
    accrual_month = folder + '-01'
    accrual_month = datetime.strptime(accrual_month, '%Y%m-%d')
    accrual_month = str(accrual_month.date())

    # Identify the row index number of the accrual month
    idx_num = df_concat.index[df_concat['ds'] == accrual_month].tolist()
    # Create a list of row index numbers
    idx_rng = list(range(idx_num[0], len(df_concat)))
    # Drop rows
    df_concat.drop(idx_rng, inplace=True, axis=0)
    # Drop elements
    converted_dates_list = converted_dates_list[:-1*len(idx_rng)]
    
    
    # Create a copy of 'df_concat'
    df_export = df_concat.copy()
    # Reformat 'df_export'
    df_export.set_index('ds', inplace=True)
    # Transpose & reset the index
    df_export = df_export.T.reset_index(drop=True)
    # Add the tags column because at this point the tags are the index, which will get lost when exported
    df_export = pd.concat([df_tags, df_export], axis=1, ignore_index=False)
    
    
    # Restructure the data into the format needed for ForecastAI. To do this, create a list of dataframes
    i = 0
    df_list = list()
    while i < len(costobject_list):
        # Subset 'df_concat' so it only contains 2 columns of data: a specific tag & the dates column
        df_subset = df_concat[[tags[i], 'ds']]
        # Rename the columns
        df_subset.columns = ['value', 'ds']
        # Create a list of the repeated tag (repeated for each month of data)
        repeated_tag = [tags[i] for x in range(len(df_subset))]
        # Create a list of the repeated costobject (repeated for each month of data)
        repeated_costobject = [costobject_list[i] for x in range(len(df_subset))]
        # Create a list of the repeated product (repeated for each month of data)
        repeated_product = [product_list[i] for x in range(len(df_subset))]
        # Add the lists to 'df_subset'
        df_subset['tag'], df_subset['Cost Objects'],df_subset['Products'] = repeated_tag, repeated_costobject, repeated_product
        # Reorder the columns
        df_subset = df_subset[['tag', 'Cost Objects', 'Products', 'value', 'ds']]
        # Add 'df_subset' to 'df_list'
        df_list.append(df_subset)
        i += 1
        
      
    # Convert the string dates to datetime objects
    datetime_objects = list()
    for x in converted_dates_list:
        y = datetime.strptime(x, '%Y-%m-%d')
        datetime_objects.append(y)
    # Pull the last date in 'datetime_objects'
    last_date = datetime_objects[-1]
    
    
    ######
    # TAG1
    ######
    # Modified. The net sales TAG1 doesn't eliminate time series if there isn't enough data in the most 
    # recent 6 months. It assigns it a '1' in the 'noSalesLastYear' column if there isn't enough data

    # Subtract 6 months from 'last_date'
    ld6 = last_date - dateutil.relativedelta.relativedelta(months=6)
    # Create a list of all the dates > 'ld6' (dates more recent than 'ld6')
    selected_dates = list()
    for x in datetime_objects:
        if x > ld6:
            # Extract only the %Y-%m-%d
            y = str(x)[:10]
            selected_dates.append(y)

    # Filter out time series with less than the minimum number of months of data in the most recent 6 
    # months & add the 'no_sales' column to the dataframes
    i = 0
    df_list2 = list()   
    elim_via_TAG1 = list()
    while i < len(df_list):
        df_copy = df_list[i].copy()
        # Filter 'df_copy' by rows with dates in 'selected_dates'
        result_df = df_copy.loc[df_copy['ds'].isin(selected_dates)]
        # Drop all rows with 'NaN' in the 'value' column
        result_df.dropna(subset=['value'], inplace=True)
        if len(result_df) >= 4:
            # Create a list of zeros
            repeated_zeros = [0 for x in range(len(df_copy))]
            # Add the 'no_sales' column
            df_copy['no_sales'] = repeated_zeros
            # Append
            df_list2.append(df_copy)
        else:
            elim_via_TAG1.append(df_copy.iloc[0, 1])
        i += 1
        
        
    ######
    # TAG2
    ######
    
    # Subtract 14 months from 'last_date'
    ld14 = last_date - dateutil.relativedelta.relativedelta(months=14)
    # Create a list of all the dates <= 'ld14' (dates equal to or older than 'ld14')
    selected_dates2 = list()
    for x in datetime_objects:
        if x <= ld14:
            # Extract only the %Y-%m-%d
            y = str(x)[:10]
            selected_dates2.append(y)

    # Add the 'short_history' column to the dataframes. Assign a '0' if there is at least 2 months of 
    # data in the months preceding or equal to 'ld14' & a '1' if there isn't
    i = 0
    df_list3 = list()
    while i < len(df_list2):
        df_copy = df_list2[i].copy()
        # Filter 'df_copy' by rows with dates in 'selected_dates2'
        result_df = df_copy.loc[df_copy['ds'].isin(selected_dates2)]
        # Drop all rows with 'NaN' in the 'value' column
        result_df.dropna(subset=['value'], inplace=True)
        if len(result_df) >= 2:
            # Create a list of zeros
            repeated_zeros = [0 for x in range(len(df_copy))]
            # Add the 'short_history' column
            df_copy['short_history'] = repeated_zeros
            # Append
            df_list3.append(df_copy)
        else:
            # Create a list of ones
            repeated_ones = [1 for x in range(len(df_copy))]
            # Add the 'short_history' column
            df_copy['short_history'] = repeated_ones
            # Append
            df_list3.append(df_copy)
        i += 1
        
        
    ######
    # TAG3
    ######

    # Add the 'theta_check' column to the dataframes. Assign a '1' if there is at least 12 months of 
    # data in the months preceding or equal to 'ld14' & a '0' if there isn't
    i = 0
    df_list4 = list()
    while i < len(df_list3):
        df_copy = df_list3[i].copy()
        # Filter 'df_copy' by rows with dates in 'selected_dates2'
        result_df = df_copy.loc[df_copy['ds'].isin(selected_dates2)]
        # Drop all rows with 'NaN' in the 'value' column
        result_df.dropna(subset=['value'], inplace=True)
        if len(result_df) >= 12:
            # Create a list of ones
            repeated_ones = [1 for x in range(len(df_copy))]
            # Add the 'theta_check' column
            df_copy['theta_check'] = repeated_ones
            # Append
            df_list4.append(df_copy)
        else:
            # Create a list of zeros
            repeated_zeros = [0 for x in range(len(df_copy))]
            # Add the 'theta_check' column
            df_copy['theta_check'] = repeated_zeros
            # Append
            df_list4.append(df_copy)
        i += 1
        
        
    ######
    # TAG4
    ######
    # Modified. The net sales TAG4 filters out time series that don't have at least 2 months of data. 
    # The remaining time series from this data prep all have at least 4 months of data due to TAG1. This 
    # TAG4 filters out time series that don't have the minimum number of months of data in the 6 months 
    # preceding the 12 month holdout period
    
    # Subtract 12 months from 'last_date'
    ld12 = last_date - dateutil.relativedelta.relativedelta(months=12)
    # Subtract 18 months from 'last_date'
    ld18 = last_date - dateutil.relativedelta.relativedelta(months=18)
    # Create a list of all the dates <= 'ld12' (dates equal to or older than 'ld12') & > 'ld18' (dates 
    # more recent than 'ld18')
    selected_dates3 = list()
    for x in datetime_objects:
        if x <= ld12 and x > ld18:
            # Extract only the %Y-%m-%d
            y = str(x)[:10]
            selected_dates3.append(y)

    # Filter the time series and add the 'data_periods' column to the dataframes. 'data_periods' contains 
    # the total number of non-NaN months of data 
    i = 0
    df_list5 = list()
    elim_via_TAG4 = list()
    while i < len(df_list4):
        df_copy = df_list4[i].copy()
        # Filter 'df_copy' by rows with dates in 'selected_dates3'
        result_df = df_copy.loc[df_copy['ds'].isin(selected_dates3)]
        # Drop all rows with 'NaN' in the 'value' column
        result_df.dropna(subset=['value'], inplace=True)
        if len(result_df) >= 4:
            # Temporarily drop all rows with 'NaN' in the 'value' column to get the data period count
            d_periods = len(df_copy.dropna(subset=['value']))
            # Create a list of the number of data periods
            repeated_periods = [d_periods for x in range(len(df_copy))]
            # Add the 'data_periods' column
            df_copy['data_periods'] = repeated_periods
            # Append
            df_list5.append(df_copy)
        else:
            elim_via_TAG4.append(df_copy.iloc[0, 1])
        i += 1
        
        
    # Concatenate    
    df_final = pd.concat(df_list5)
    # Replace all NaNs with zeros & reset the index
    df_final = df_final.fillna(0).reset_index(drop=True)
    # Create dataframes
    df_elim_via_TAG1 = pd.DataFrame(elim_via_TAG1, columns=['tag'])
    df_elim_via_TAG4 = pd.DataFrame(elim_via_TAG4, columns=['tag'])
    
    return df_final, df_export, df_elim_via_TAG1, df_elim_via_TAG4



# Main

In [3]:
import warnings
warnings.filterwarnings("ignore")

input_folder = r'C:\Users\A4023862\OneDrive - Astellas Pharma Inc\Forecasting Product\price_model\inputs\v1'
inputs_p = 'inputs_p.xlsx'
units = 'equiv_units.xlsx'
sales = 'net_sales.xlsx'

input_path = os.path.join(input_folder, inputs_p)
inputFile=os.path.join(input_folder, units)
inputFile2=os.path.join(input_folder, sales)

# Read input file
inputs = pd.read_excel(input_path, engine='openpyxl')

# Read in inputs
run_id=inputs[inputs['key']=='run_id']['value'].to_string(index=False).strip()
run_time=inputs[inputs['key']=='run_time']['value'].to_string(index=False).strip()
deployment=inputs[inputs['key']=='deployment']['value'].to_string(index=False).strip()

wd=inputs[inputs['key']=='wd']['value'].to_string(index=False).strip()
input_path=inputs[inputs['key']=='Input Path']['value'].to_string(index=False).strip()
output_path=inputs[inputs['key']=='Output Path']['value'].to_string(index=False).strip()
supp_folder=inputs[inputs['key']=='Supplementary Folder']['value'].to_string(index=False).strip()
output_folder_run=inputs[inputs['key']=='Output Folder Run']['value'].to_string(index=False).strip()
month_folder=inputs[inputs['key']=='Month Folder']['value'].to_string(index=False).strip()
id_cols=inputs[inputs['key']=='ID Columns']['value'].to_string(index=False).strip()

id_col_no=int(inputs[inputs['key']=='ID Column #']['value'].to_string(index=False).strip())
sub_cores=int(inputs[inputs['key']=='Subtracted Cores']['value'].to_string(index=False).strip())
minMonths=int(inputs[inputs['key']=='Minimum Months']['value'].to_string(index=False).strip())
dis_months=int(inputs[inputs['key']=='Discontinued Months']['value'].to_string(index=False).strip())
fct_months=int(inputs[inputs['key']=='Forecast Months']['value'].to_string(index=False).strip())
hdt_months=int(inputs[inputs['key']=='Holdout Months']['value'].to_string(index=False).strip())


actual_start_dt=datetime.strptime(inputs[inputs['key']=='Actuals Start']['value'].to_string(index=False).strip(), '%Y-%m-%d %X')
actual_end_dt=datetime.strptime(inputs[inputs['key']=='Actuals End']['value'].to_string(index=False).strip(), '%Y-%m-%d %X')
date1=datetime.strptime(inputs[inputs['key']=='Holdout ACT End']['value'].to_string(index=False).strip(), '%Y-%m-%d %X')
date2=datetime.strptime(inputs[inputs['key']=='Holdout FCT Start']['value'].to_string(index=False).strip(), '%Y-%m-%d %X')
date3=datetime.strptime(inputs[inputs['key']=='Forecast ACT End']['value'].to_string(index=False).strip(), '%Y-%m-%d %X')
date4=datetime.strptime(inputs[inputs['key']=='Forecast FCT Start']['value'].to_string(index=False).strip(), '%Y-%m-%d %X')
dvbl_start_dt=datetime.strptime(inputs[inputs['key']=='DVBL Start']['value'].to_string(index=False).strip(), '%Y-%m-%d %X')
dvbl_end_dt=datetime.strptime(inputs[inputs['key']=='DVBL End']['value'].to_string(index=False).strip(), '%Y-%m-%d %X')

# Read in data
df_u = pd.read_excel(inputFile, engine='openpyxl')
df_s = pd.read_excel(inputFile2, engine='openpyxl')

In [4]:
# Execute 'data_prep' function
units_format, units_acts_df, units_elim_via_TAG1_df, units_elim_via_TAG4_df = data_prep(wd, month_folder, df_u, 'units')
rev_format, rev_lc_acts_df, rev_lc_elim_via_TAG1_df, rev_lc_elim_via_TAG4_df = data_prep(wd, month_folder, df_s, 'rev')

df_units = units_format.rename(columns={'value': 'value.units'})
df_units = df_units.reset_index(drop=True)

df_rev = rev_format.rename(columns={'value': 'value.rev'})
df_rev = df_rev.reset_index(drop=True)

# Filter & sort
df_units = df_units.sort_values('tag')
df_rev = df_rev.sort_values('tag')

# df_units=df_units[df_units['tag']=='LC - LC_D_US - US_P_XTD_TOT - Xtandi Total']
# df_rev=df_rev[df_rev['tag']=='LC - LC_D_US - US_P_XTD_TOT - Xtandi Total']

df_price=df_units[['tag','Cost Objects','Products','ds', 'value.units', 'no_sales', 'short_history', 'theta_check', 'data_periods']].merge(df_rev[['tag','ds', 'value.rev']], how='left', on=['tag','ds'])
df_price['value.price']=df_price['value.rev'] / df_price['value.units']

In [None]:
# !curl -L https://github.com/unit8co/amld2022-forecasting-and-metalearning/blob/main/data/m3_dataset.xls\?raw\=true -o m3_dataset.xls
# !curl -L https://github.com/unit8co/amld2022-forecasting-and-metalearning/blob/main/data/passengers.pkl\?raw\=true -o passengers.pkl
# !curl -L https://github.com/unit8co/amld2022-forecasting-and-metalearning/blob/main/data/m4_monthly_scaled.pkl\?raw\=true -o m4_monthly_scaled.pkl

In [6]:
%matplotlib inline

import os
import time
import random
import pandas as pd
import pickle
import numpy as np
from tqdm.auto import tqdm
from datetime import datetime
from itertools import product
import torch
from torch import nn
from typing import List, Tuple, Dict
from sklearn.preprocessing import MaxAbsScaler
from sklearn.linear_model import Ridge
import matplotlib.pyplot as plt

from darts import TimeSeries
from darts.utils.losses import SmapeLoss
from darts.dataprocessing.transformers import Scaler
from darts.metrics import smape
from darts.utils.utils import SeasonalityMode, TrendMode, ModelMode
from darts.models import *

def load_m4() -> Tuple[List[TimeSeries], List[TimeSeries]]:
    # load TimeSeries - the splitting and scaling has already been done
    print("loading M4 TimeSeries...")
    with open("m4_monthly_scaled.pkl", "rb") as f:
        m4_series = pickle.load(f)

    # filter and keep only series that contain at least 48 training points
    m4_series = list(filter(lambda t: len(t[0]) >= 48, m4_series))

    m4_train_scaled, m4_test_scaled = zip(*m4_series)

    print(
        "done. There are {} series, with average training length {}".format(
            len(m4_train_scaled), np.mean([len(s) for s in m4_train_scaled])
        )
    )
    return m4_train_scaled, m4_test_scaled

m4_train, m4_test = load_m4()

loading M4 TimeSeries...
done. There are 47992 series, with average training length 216.32901316886148


In [8]:
# Slicing hyper-params:
IN_LEN = 36
OUT_LEN = 4

# Architecture hyper-params:
NUM_STACKS = 20
NUM_BLOCKS = 1
NUM_LAYERS = 2
LAYER_WIDTH = 136
COEFFS_DIM = 11

# Training settings:
LR = 1e-3
BATCH_SIZE = 1024
MAX_SAMPLES_PER_TS = (
    10  # <-- new parameter, limiting the number of training samples per series
)
NUM_EPOCHS = 5

In [10]:
# reproducibility
np.random.seed(42)
torch.manual_seed(42)

nbeats_model_m4 = NBEATSModel(
    input_chunk_length=IN_LEN,
    output_chunk_length=OUT_LEN,
    batch_size=BATCH_SIZE,
    num_stacks=NUM_STACKS,
    num_blocks=NUM_BLOCKS,
    num_layers=NUM_LAYERS,
    layer_widths=LAYER_WIDTH,
    expansion_coefficient_dim=COEFFS_DIM,
    loss_fn=SmapeLoss(),
    optimizer_kwargs={"lr": LR},
    pl_trainer_kwargs={
        "enable_progress_bar": True,
        "accelerator": "cpu",
    },
)

# Train
nbeats_model_m4.fit(
    m4_train,
    num_loader_workers=4,
    epochs=NUM_EPOCHS,
    max_samples_per_ts=MAX_SAMPLES_PER_TS,
)

2022-09-18 15:45:59 pytorch_lightning.utilities.rank_zero INFO: GPU available: False, used: False
2022-09-18 15:45:59 pytorch_lightning.utilities.rank_zero INFO: TPU available: False, using: 0 TPU cores
2022-09-18 15:45:59 pytorch_lightning.utilities.rank_zero INFO: IPU available: False, using: 0 IPUs
2022-09-18 15:45:59 pytorch_lightning.utilities.rank_zero INFO: HPU available: False, using: 0 HPUs
2022-09-18 15:46:00 pytorch_lightning.callbacks.model_summary INFO: 
  | Name          | Type             | Params
---------------------------------------------------
0 | criterion     | SmapeLoss        | 0     
1 | train_metrics | MetricCollection | 0     
2 | val_metrics   | MetricCollection | 0     
3 | stacks        | ModuleList       | 543 K 
---------------------------------------------------
541 K     Trainable params
1.9 K     Non-trainable params
543 K     Total params
2.173     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

2022-09-18 16:21:06 pytorch_lightning.utilities.rank_zero INFO: `Trainer.fit` stopped: `max_epochs=5` reached.


<darts.models.forecasting.nbeats.NBEATSModel at 0x226d1199e80>

In [17]:
start_time = time.time()
HORIZON=18
# df_price['value.price']=df_price['value.price'].float()
series = TimeSeries.from_dataframe(df_price[(df_price['Cost Objects']=='D_DE - Germany') & (df_price['Products']=='P_XTD_TOT - Xtandi Total')], 'ds', 'value.price')
preds = nbeats_model_m4.predict(series=series, n=HORIZON)  # get forecasts
# nbeats_m4_elapsed_time = time.time() - start_time

# nbeats_m4_smapes = eval_forecasts(preds, air_test)

Predicting: 469it [00:00, ?it/s]

RuntimeError: expected scalar type Float but found Double

In [37]:
df_price[(df_price['Cost Objects']=='D_DE - Germany') & (df_price['Products']=='P_XTD_TOT - Xtandi Total')]['ds']

15376    2018-12-01
15377    2018-09-01
15378    2018-10-01
15379    2018-11-01
15380    2019-01-01
            ...    
15433    2020-01-01
15434    2019-11-01
15435    2022-04-01
15436    2017-04-01
15437    2017-05-01
Name: ds, Length: 62, dtype: object