In [1]:
import pandas as pd
from tqdm.notebook import tqdm

import os
import sys

project_path = os.path.abspath(os.path.join('..'))

if project_path not in sys.path:
    sys.path.append(project_path)

## Generate AGG_HYB_FCST table

In [858]:
AGG_HYB_FCST = pd.DataFrame(
    {
        'PRODUCT_LVL_ID6': [x for x in range(600001, 602001)],
        'LOCATION_LVL_ID8': [x for x in range(800001, 802001)],
        'CUSTOMER_LVL_ID6': [x for x in range(600001, 602001)],
        'DISTR_CHANNEL_LVL_ID6': [x for x in range(600001, 602001)],
        'PERIOD_DT': pd.date_range(start='2015-01-01', periods=2000, freq='MS'),
        'PERIOD_END_DT': pd.date_range(start='2015-02-01', periods=2000, freq='MS'),
        'SEGMENT_NAME': ['name1' for x in range(600001, 602001)],
        'VF_FORECAST_VALUE': np.random.uniform(0, 100, 2000),
        'DEMAND_TYPE': np.random.randint(0, 2, 2000),
        'ASSORTMENT_TYPE': np.random.choice(['new', 'old'], 2000),
        'ML_FORECAST_VALUE': np.random.uniform(0, 100, 2000),
        'HYBRID_FORECAST_VALUE': np.random.uniform(0, 100, 2000)
    }
                           )

AGG_HYB_FCST['PERIOD_DT'] += pd.Timedelta('1D')

In [859]:
AGG_HYB_FCST.head()

Unnamed: 0,PRODUCT_LVL_ID6,LOCATION_LVL_ID8,CUSTOMER_LVL_ID6,DISTR_CHANNEL_LVL_ID6,PERIOD_DT,PERIOD_END_DT,SEGMENT_NAME,VF_FORECAST_VALUE,DEMAND_TYPE,ASSORTMENT_TYPE,ML_FORECAST_VALUE,HYBRID_FORECAST_VALUE
0,600001,800001,600001,600001,2015-01-02,2015-02-01,name1,78.434058,0,old,94.334636,16.237587
1,600002,800002,600002,600002,2015-02-02,2015-03-01,name1,30.282416,1,new,9.056554,80.811102
2,600003,800003,600003,600003,2015-03-02,2015-04-01,name1,3.719096,0,old,69.37642,74.197846
3,600004,800004,600004,600004,2015-04-02,2015-05-01,name1,24.284059,1,new,93.552082,5.556441
4,600005,800005,600005,600005,2015-05-02,2015-06-01,name1,19.226696,0,new,17.354831,81.377174


## Check granularity

In [860]:
def check_granulatiry(out_time_lvl):
    FINAL_GRANULARITY_DELIVERED = True
    if out_time_lvl == 'D':
        if (AGG_HYB_FCST['PERIOD_DT'] != AGG_HYB_FCST['PERIOD_END_DT']).any():
            FINAL_GRANULARITY_DELIVERED = False
    elif 'W' in out_time_lvl:
        if (AGG_HYB_FCST['PERIOD_DT'].apply(lambda x: x - pd.Timedelta(f'{x.dayofweek}D')) != \
            AGG_HYB_FCST['PERIOD_END_DT'].apply(lambda x: x - pd.Timedelta(f'{x.dayofweek}D'))).any():
            FINAL_GRANULARITY_DELIVERED = False
    if out_time_lvl == 'M':
        if (AGG_HYB_FCST['PERIOD_DT'].apply(lambda x: x - pd.Timedelta(f'{x.day - 1}D')) != \
            AGG_HYB_FCST['PERIOD_END_DT'].apply(lambda x: x - pd.Timedelta(f'{x.day - 1}D'))).any():
            FINAL_GRANULARITY_DELIVERED = False
            
    return FINAL_GRANULARITY_DELIVERED

In [862]:
check_granulatiry('W')

False

## Change Granularity

In [866]:
def change_granularity(AGG_HYB_FCST, out_time_lvl):
    data = AGG_HYB_FCST.copy()
    data['OUT_PERIOD_DT'] = data['PERIOD_DT']
    data['OUT_PERIOD_END_DT'] = data['PERIOD_END_DT']

    for ind, row in tqdm(AGG_HYB_FCST.iterrows(), total=AGG_HYB_FCST.shape[0]):
        cur_dates = pd.to_datetime(np.array([row['PERIOD_DT'], row['PERIOD_END_DT']]))
        split_dates = pd.period_range(cur_dates[0], cur_dates[1], freq=out_time_lvl).to_timestamp()
        taken_dates = split_dates[(split_dates > cur_dates[0]) & (split_dates < cur_dates[1])]
        needed_dates = np.append(taken_dates, cur_dates)
        needed_dates.sort()
        start_ind = 1
        if out_time_lvl == 'D':
            start_ind = 0
            data.loc[ind, 'OUT_PERIOD_END_DT'] = needed_dates[1] - pd.Timedelta('1D')
        else:
            data.loc[ind, 'OUT_PERIOD_END_DT'] = needed_dates[1]

        for i in range(start_ind, len(needed_dates) - 1):        
            new_row = row.copy()
            new_row['OUT_PERIOD_DT'] = needed_dates[i] + pd.Timedelta('1D')
            new_row['OUT_PERIOD_END_DT'] = needed_dates[i + 1]
            data = data.append(new_row, ignore_index=True)

    data_filled = data.sort_values(data.columns[:4].to_list()).reset_index(drop=True)
    
    return data_filled

data_filled = change_granularity(AGG_HYB_FCST, out_time_lvl)
data_filled

  0%|          | 0/2000 [00:00<?, ?it/s]

Unnamed: 0,PRODUCT_LVL_ID6,LOCATION_LVL_ID8,CUSTOMER_LVL_ID6,DISTR_CHANNEL_LVL_ID6,PERIOD_DT,PERIOD_END_DT,SEGMENT_NAME,VF_FORECAST_VALUE,DEMAND_TYPE,ASSORTMENT_TYPE,ML_FORECAST_VALUE,HYBRID_FORECAST_VALUE,OUT_PERIOD_DT,OUT_PERIOD_END_DT
0,600001,800001,600001,600001,2015-01-02,2015-02-01,name1,78.434058,0,old,94.334636,16.237587,2015-01-02,2015-01-05
1,600001,800001,600001,600001,2015-01-02,2015-02-01,name1,78.434058,0,old,94.334636,16.237587,2015-01-06,2015-01-12
2,600001,800001,600001,600001,2015-01-02,2015-02-01,name1,78.434058,0,old,94.334636,16.237587,2015-01-13,2015-01-19
3,600001,800001,600001,600001,2015-01-02,2015-02-01,name1,78.434058,0,old,94.334636,16.237587,2015-01-20,2015-01-26
4,600001,800001,600001,600001,2015-01-02,2015-02-01,name1,78.434058,0,old,94.334636,16.237587,2015-01-27,2015-02-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10119,602000,802000,602000,602000,2181-08-02,2181-09-01,name1,96.753685,0,old,7.602299,74.409482,2181-08-02,2181-08-06
10120,602000,802000,602000,602000,2181-08-02,2181-09-01,name1,96.753685,0,old,7.602299,74.409482,2181-08-07,2181-08-13
10121,602000,802000,602000,602000,2181-08-02,2181-09-01,name1,96.753685,0,old,7.602299,74.409482,2181-08-14,2181-08-20
10122,602000,802000,602000,602000,2181-08-02,2181-09-01,name1,96.753685,0,old,7.602299,74.409482,2181-08-21,2181-08-27


In [867]:
def split_forecast(data_filled):
    def share_forecast(x, target):
        return x[target] * ((x['OUT_PERIOD_END_DT'] - x['OUT_PERIOD_DT']) / np.timedelta64(1, 'D') + 1) / \
    ((x['PERIOD_END_DT'] - x['PERIOD_DT']) / np.timedelta64(1, 'D') + 1)

    data_filled['VF_FORECAST_VALUE_SHARED'] = data_filled.apply(lambda x: share_forecast(x, 'VF_FORECAST_VALUE'), axis=1)
    data_filled['ML_FORECAST_VALUE_SHARED'] = data_filled.apply(lambda x: share_forecast(x, 'ML_FORECAST_VALUE'), axis=1)
    data_filled['HYBRID_FORECAST_VALUE_SHARED'] = data_filled.apply(lambda x: share_forecast(x, 'HYBRID_FORECAST_VALUE'), axis=1)

    data_filled = data_filled.drop(['PERIOD_DT', 'PERIOD_END_DT'], axis=1)
    data_filled = data_filled.rename(columns={'OUT_PERIOD_DT': 'PERIOD_DT', 'OUT_PERIOD_END_DT': 'PERIOD_END_DT'})
    data_filled = data_filled.set_index(['PERIOD_DT', 'PERIOD_END_DT']).reset_index()

    return data_filled



In [868]:
data_filled = split_forecast(data_filled)
data_filled

Unnamed: 0,PERIOD_DT,PERIOD_END_DT,PRODUCT_LVL_ID6,LOCATION_LVL_ID8,CUSTOMER_LVL_ID6,DISTR_CHANNEL_LVL_ID6,SEGMENT_NAME,VF_FORECAST_VALUE,DEMAND_TYPE,ASSORTMENT_TYPE,ML_FORECAST_VALUE,HYBRID_FORECAST_VALUE,VF_FORECAST_VALUE_SHARED,ML_FORECAST_VALUE_SHARED,HYBRID_FORECAST_VALUE_SHARED
0,2015-01-02,2015-01-05,600001,800001,600001,600001,name1,78.434058,0,old,94.334636,16.237587,10.120524,12.172211,2.095173
1,2015-01-06,2015-01-12,600001,800001,600001,600001,name1,78.434058,0,old,94.334636,16.237587,17.710916,21.301369,3.666552
2,2015-01-13,2015-01-19,600001,800001,600001,600001,name1,78.434058,0,old,94.334636,16.237587,17.710916,21.301369,3.666552
3,2015-01-20,2015-01-26,600001,800001,600001,600001,name1,78.434058,0,old,94.334636,16.237587,17.710916,21.301369,3.666552
4,2015-01-27,2015-02-01,600001,800001,600001,600001,name1,78.434058,0,old,94.334636,16.237587,15.180785,18.258317,3.142759
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10119,2181-08-02,2181-08-06,602000,802000,602000,602000,name1,96.753685,0,old,7.602299,74.409482,15.605433,1.226177,12.001529
10120,2181-08-07,2181-08-13,602000,802000,602000,602000,name1,96.753685,0,old,7.602299,74.409482,21.847606,1.716648,16.802141
10121,2181-08-14,2181-08-20,602000,802000,602000,602000,name1,96.753685,0,old,7.602299,74.409482,21.847606,1.716648,16.802141
10122,2181-08-21,2181-08-27,602000,802000,602000,602000,name1,96.753685,0,old,7.602299,74.409482,21.847606,1.716648,16.802141


In [869]:
class Disaccumulation:
    def __init__(self, data, out_time_lvl):
        """
        Provide forecasts at the required time granularity level.
        
        Parameters
        ----------
        data : pd.DataFrame
            Table with ID, Period and Forecast columns
        
        out_time_lvl : string
            Required time granularity level
            
            Possible values:
            
            D - days
            W - weeks (default starting from Sunday)
            W-MON/W-TUE/.../W-SUN - weeks, starting from specified day of week
            M - months
        """
        self.data = data
        self.data_splitted = data
        self.out_time_lvl = out_time_lvl
        self.FINAL_GRANULARITY_DELIVERED = True
        
        
    def check_granulatiry(self):
        """
        Check whether period_dt and period_end_dt in AGG_HYB_FCST correspond to out_time_lvl
        
        Returns
        -------
        bool
            Returns flag which shows whether forecast split needed or not
        """
        if self.out_time_lvl == 'D':
            if (self.data['PERIOD_DT'] != self.data['PERIOD_END_DT']).any():
                self.FINAL_GRANULARITY_DELIVERED = False
        elif 'W' in self.out_time_lvl:
            if (self.data['PERIOD_DT'].apply(lambda x: x - pd.Timedelta(f'{x.dayofweek}D')) != \
                self.data['PERIOD_END_DT'].apply(lambda x: x - pd.Timedelta(f'{x.dayofweek}D'))).any():
                self.FINAL_GRANULARITY_DELIVERED = False
        if self.out_time_lvl == 'M':
            if (self.data['PERIOD_DT'].apply(lambda x: x - pd.Timedelta(f'{x.day - 1}D')) != \
                self.data['PERIOD_END_DT'].apply(lambda x: x - pd.Timedelta(f'{x.day - 1}D'))).any():
                self.FINAL_GRANULARITY_DELIVERED = False
                
        return self.FINAL_GRANULARITY_DELIVERED
            
        
    def change_granularity(self):
        """
        If FINAL_GRANULARITY_DELIVERED == False then transform original table by splitting forecast periods
        to more granular time stamps
        
        Returns
        -------
        pd.DataFrame
            Splitted data to more granular time stamps
        """
        df = self.data.copy()
        df['OUT_PERIOD_DT'] = df['PERIOD_DT']
        df['OUT_PERIOD_END_DT'] = df['PERIOD_END_DT']

        for ind, row in tqdm(self.data.iterrows(), total=self.data.shape[0]):
            cur_dates = pd.to_datetime(np.array([row['PERIOD_DT'], row['PERIOD_END_DT']]))
            split_dates = pd.period_range(cur_dates[0], cur_dates[1], freq=self.out_time_lvl).to_timestamp()
            taken_dates = split_dates[(split_dates > cur_dates[0]) & (split_dates < cur_dates[1])]
            needed_dates = np.append(taken_dates, cur_dates)
            needed_dates.sort()
            start_ind = 1
            if self.out_time_lvl == 'D':
                start_ind = 0
                df.loc[ind, 'OUT_PERIOD_END_DT'] = needed_dates[1] - pd.Timedelta('1D')
            else:
                df.loc[ind, 'OUT_PERIOD_END_DT'] = needed_dates[1]

            for i in range(start_ind, len(needed_dates) - 1):        
                new_row = row.copy()
                new_row['OUT_PERIOD_DT'] = needed_dates[i] + pd.Timedelta('1D')
                new_row['OUT_PERIOD_END_DT'] = needed_dates[i + 1]
                df = df.append(new_row, ignore_index=True)

        self.data_filled = df.sort_values(df.columns[df.columns.str.contains('_ID')].to_list() + ['PERIOD_DT']).reset_index(drop=True)
        
        return self.data_filled
    
    def share_forecast(self):
        """
        Calculate forecast share and volume of VF_FORECAST_VALUE, ML_FORECAST_VALUE,
        HYBRID_FORECAST proportionally to number of days in interval [PERIOD_DT, PERIOD_END_DT]
        
        Returns
        -------
        pd.DataFrame
            Data with shared forecast
        """
        def split(x, target):
            return x[target] * ((x['OUT_PERIOD_END_DT'] - x['OUT_PERIOD_DT']) / np.timedelta64(1, 'D') + 1) / \
        ((x['PERIOD_END_DT'] - x['PERIOD_DT']) / np.timedelta64(1, 'D') + 1)

        self.data_filled['VF_FORECAST_VALUE'] = self.data_filled.apply(lambda x: split(x, 'VF_FORECAST_VALUE'), axis=1)
        self.data_filled['ML_FORECAST_VALUE'] = self.data_filled.apply(lambda x: split(x, 'ML_FORECAST_VALUE'), axis=1)
        self.data_filled['HYBRID_FORECAST_VALUE'] = self.data_filled.apply(lambda x: split(x, 'HYBRID_FORECAST_VALUE'), axis=1)

        self.data_filled = self.data_filled.drop(['PERIOD_DT', 'PERIOD_END_DT'], axis=1)
        self.data_filled = self.data_filled.rename(columns={'OUT_PERIOD_DT': 'PERIOD_DT', 'OUT_PERIOD_END_DT': 'PERIOD_END_DT'})
        self.data_filled = self.data_filled.set_index(['PERIOD_DT', 'PERIOD_END_DT']).reset_index()
        self.data_splitted = self.data_filled
        
        return self.data_filled


    def split_forecasts(self):
        """
        Main function that calls all others to get answer
        
        Returns
        -------
        pd.DataFrame
            Data with shared forecast
        """
        self.check_granulatiry()
        if not self.FINAL_GRANULARITY_DELIVERED:
            self.change_granularity()
            self.share_forecast()
        
        return self.data_splitted
    
    

In [871]:
Dis = Disaccumulation(AGG_HYB_FCST, 'W')

In [872]:
data_splitted = Dis.split_forecasts()

  0%|          | 0/2000 [00:00<?, ?it/s]

In [873]:
Dis.data

Unnamed: 0,PRODUCT_LVL_ID6,LOCATION_LVL_ID8,CUSTOMER_LVL_ID6,DISTR_CHANNEL_LVL_ID6,PERIOD_DT,PERIOD_END_DT,SEGMENT_NAME,VF_FORECAST_VALUE,DEMAND_TYPE,ASSORTMENT_TYPE,ML_FORECAST_VALUE,HYBRID_FORECAST_VALUE
0,600001,800001,600001,600001,2015-01-02,2015-02-01,name1,78.434058,0,old,94.334636,16.237587
1,600002,800002,600002,600002,2015-02-02,2015-03-01,name1,30.282416,1,new,9.056554,80.811102
2,600003,800003,600003,600003,2015-03-02,2015-04-01,name1,3.719096,0,old,69.376420,74.197846
3,600004,800004,600004,600004,2015-04-02,2015-05-01,name1,24.284059,1,new,93.552082,5.556441
4,600005,800005,600005,600005,2015-05-02,2015-06-01,name1,19.226696,0,new,17.354831,81.377174
...,...,...,...,...,...,...,...,...,...,...,...,...
1995,601996,801996,601996,601996,2181-04-02,2181-05-01,name1,14.184972,1,old,82.542409,55.577960
1996,601997,801997,601997,601997,2181-05-02,2181-06-01,name1,5.198137,1,old,78.491837,84.543585
1997,601998,801998,601998,601998,2181-06-02,2181-07-01,name1,28.018850,1,new,50.701640,40.174174
1998,601999,801999,601999,601999,2181-07-02,2181-08-01,name1,94.511561,1,new,94.283929,2.781974


In [874]:
Dis.data_splitted

Unnamed: 0,PERIOD_DT,PERIOD_END_DT,PRODUCT_LVL_ID6,LOCATION_LVL_ID8,CUSTOMER_LVL_ID6,DISTR_CHANNEL_LVL_ID6,SEGMENT_NAME,VF_FORECAST_VALUE,DEMAND_TYPE,ASSORTMENT_TYPE,ML_FORECAST_VALUE,HYBRID_FORECAST_VALUE
0,2015-01-02,2015-01-05,600001,800001,600001,600001,name1,10.120524,0,old,12.172211,2.095173
1,2015-01-06,2015-01-12,600001,800001,600001,600001,name1,17.710916,0,old,21.301369,3.666552
2,2015-01-13,2015-01-19,600001,800001,600001,600001,name1,17.710916,0,old,21.301369,3.666552
3,2015-01-20,2015-01-26,600001,800001,600001,600001,name1,17.710916,0,old,21.301369,3.666552
4,2015-01-27,2015-02-01,600001,800001,600001,600001,name1,15.180785,0,old,18.258317,3.142759
...,...,...,...,...,...,...,...,...,...,...,...,...
10119,2181-08-02,2181-08-06,602000,802000,602000,602000,name1,15.605433,0,old,1.226177,12.001529
10120,2181-08-07,2181-08-13,602000,802000,602000,602000,name1,21.847606,0,old,1.716648,16.802141
10121,2181-08-14,2181-08-20,602000,802000,602000,602000,name1,21.847606,0,old,1.716648,16.802141
10122,2181-08-21,2181-08-27,602000,802000,602000,602000,name1,21.847606,0,old,1.716648,16.802141
