In [1]:
# import pandas for data manipulation
import pandas as pd

In [2]:
import warnings
# NOTE(review): this silences *every* warning for the rest of the notebook,
# including pandas deprecation warnings. Consider narrowing the filter.
warnings.filterwarnings('ignore')

# Read in sales data, discard any column whose name starts with 'Unnamed',
# and remove the unnecessary 'STATE' column. The result is assigned back to
# `sales` so the drop actually persists for later cells (DataFrame.drop
# returns a new frame; it does not modify the frame it is called on).
sales = (
    pd.read_csv('fake_data_customer_view.csv', encoding='utf-8')
    .filter(regex='^(?!Unnamed)')
    .drop(columns=['STATE'])
)

# Last expression of the cell: show the cleaned frame
sales

Unnamed: 0,STORE,WEEK,SESSION,PROPOSITION,SALES,CUST_ID,ACTIVE_SUB
0,1,1,Morning,pick-up,230,70,1
1,1,1,Morning,delivery,152,26,0
2,1,1,Morning,in-store,336,50,1
3,1,1,Afternoon,pick-up,304,13,0
4,1,1,Afternoon,delivery,297,33,0
...,...,...,...,...,...,...,...
139,3,4,Night,delivery,366,31,1
140,3,4,Night,in-store,366,70,1
141,3,4,Night,delivery,279,50,0
142,3,4,Night,delivery,144,60,1


In [3]:
# Default columns (and relevant functions) to aggregate on
DEFAULT_AGGREGATIONS = {'SALES': 'sum', 'CUST_ID': 'count', 'ACTIVE_SUB': 'sum', 
                        'SALES_DELTA': 'sum', 'SUB_DELTA': 'sum'}

def group_on_variable(cols, df):
    """
    Group ``df`` on ``cols`` and aggregate the known measure columns.

    Only the columns named in DEFAULT_AGGREGATIONS that are actually present
    in ``df`` are aggregated (each with the function listed there); every
    other non-grouping column is dropped from the result. Note that the
    renamed output columns (TOTAL_CUST, TOTAL_ACTIVE) are not keys of
    DEFAULT_AGGREGATIONS, so they are dropped if the output of this function
    is passed back in.

    Parameters
    ----------
    cols : str or list of str
        Names to group on.
    df : pandas.DataFrame
        Frame to aggregate. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by ``cols`` with CUST_ID renamed to TOTAL_CUST
        and ACTIVE_SUB renamed to TOTAL_ACTIVE. When WEEK is one of the
        grouping keys the rows are ordered by WEEK, keeping the groupby
        order within each week.
    """

    aggregation_functions = {
        col: func for (col, func) in DEFAULT_AGGREGATIONS.items() if col in df
    }

    grouped = (
        df.groupby(cols)
        .aggregate(aggregation_functions)
        # Rename CUST_ID to TOTAL_CUST and ACTIVE_SUB to TOTAL_ACTIVE
        .rename(columns={'CUST_ID': 'TOTAL_CUST', 'ACTIVE_SUB': 'TOTAL_ACTIVE'})
    )

    # sort_values returns a new frame, so the result must be kept. The sort is
    # skipped when WEEK is not a grouping key (it would raise KeyError), and a
    # stable algorithm is used so the order of rows inside a week is preserved.
    if 'WEEK' in grouped.index.names:
        grouped = grouped.sort_values(['WEEK'], kind='mergesort')
    return grouped

In [4]:
# Demonstrate the grouping helper at four different granularities
display(*[
    group_on_variable(keys, sales).head()
    for keys in (
        ['WEEK', 'SESSION'],
        ['WEEK', 'PROPOSITION'],
        ['WEEK', 'SESSION', 'PROPOSITION'],
        ['WEEK', 'STORE'],
    )
])

Unnamed: 0_level_0,Unnamed: 1_level_0,SALES,TOTAL_CUST,TOTAL_ACTIVE
WEEK,SESSION,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Afternoon,1823,9,4
1,Morning,2134,9,6
1,Night,4445,18,10
2,Afternoon,1940,9,5
2,Morning,2316,9,5


Unnamed: 0_level_0,Unnamed: 1_level_0,SALES,TOTAL_CUST,TOTAL_ACTIVE
WEEK,PROPOSITION,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,delivery,3644,16,9
1,in-store,2721,11,6
1,pick-up,2037,9,5
2,delivery,4186,15,5
2,in-store,2721,12,4


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,SALES,TOTAL_CUST,TOTAL_ACTIVE
WEEK,SESSION,PROPOSITION,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Afternoon,delivery,562,3,2
1,Afternoon,in-store,473,3,1
1,Afternoon,pick-up,788,3,1
1,Morning,delivery,654,3,1
1,Morning,in-store,1019,3,3


Unnamed: 0_level_0,Unnamed: 1_level_0,SALES,TOTAL_CUST,TOTAL_ACTIVE
WEEK,STORE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,2768,12,7
1,2,3056,12,7
1,3,2578,12,6
2,1,2946,12,5
2,2,2532,12,6


In [5]:
# IMPORTANT: aggregate sales to the finest level, keyed in the order
# 'WEEK', 'STORE', 'PROPOSITION', 'SESSION'. week_over_week() diffs rows by
# position, so it depends on this row ordering.
sales = group_on_variable(cols=['WEEK', 'STORE', 'PROPOSITION', 'SESSION'], df=sales)

In [6]:
# How many distinct propositions, sessions and stores the data contains.
# NB: assumes every proposition/session/store is present for every week in
# the base table.
NUM_PROPOSITIONS, NUM_SESSIONS, NUM_STORES = 3, 3, 3

In [9]:
def week_over_week(sales, num_weeks, granularity):
    """
    Compute sales and active-subscriber deltas against ``num_weeks`` weeks
    earlier, then aggregate them to the requested granularity.

    The deltas are taken positionally at the finest level: each row is
    compared with the row ``num_weeks * NUM_PROPOSITIONS * NUM_SESSIONS *
    NUM_STORES`` positions before it. This assumes ``sales`` is ordered by
    week first and that every week contains every
    proposition/session/store combination.

    Parameters
    ----------
    sales : pandas.DataFrame
        Finest-granularity frame with SALES and TOTAL_ACTIVE columns. It is
        not modified; the delta columns are added to a new frame.
    num_weeks : int
        Number of weeks to look back for the comparison.
    granularity : list
        Names to group the result on. To inspect at a STORE/PROPOSITION
        level, define: granularity = ['STORE', 'PROPOSITION']

    Returns
    -------
    pandas.DataFrame
        Frame grouped on ``granularity`` whose delta columns are named
        '<num_weeks>_WEEKS_SALES_DELTA' and '<num_weeks>_WEEKS_SUB_DELTA'.
        Rows with no earlier row to compare against have no delta at the
        finest level and show up as 0.0 after the sum aggregation.
    """

    # Row offset between a row and its counterpart `num_weeks` weeks earlier
    periods = num_weeks * NUM_PROPOSITIONS * NUM_SESSIONS * NUM_STORES

    # Compute highest level granularity sales and active subscriber diff.
    # assign() builds a new frame, so the caller's `sales` is left untouched.
    with_deltas = sales.assign(
        SALES_DELTA=sales.SALES.diff(periods),
        SUB_DELTA=sales.TOTAL_ACTIVE.diff(periods),
    )

    # Squash the deltas according to the required granularity.
    grouped = group_on_variable(granularity, with_deltas)

    return grouped.rename(columns={
        'SALES_DELTA': "{}_WEEKS_SALES_DELTA".format(num_weeks),
        'SUB_DELTA': "{}_WEEKS_SUB_DELTA".format(num_weeks),
    })

In [13]:
# Example 2-week comparison at a WEEK / SESSION / STORE granularity
display(week_over_week(sales, num_weeks=2, granularity=['WEEK', 'SESSION', 'STORE']))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,SALES,2_WEEKS_SALES_DELTA,2_WEEKS_SUB_DELTA
WEEK,SESSION,STORE,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Afternoon,1,717,0.0,0.0
1,Afternoon,2,552,0.0,0.0
1,Afternoon,3,554,0.0,0.0
1,Morning,1,718,0.0,0.0
1,Morning,2,804,0.0,0.0
1,Morning,3,612,0.0,0.0
1,Night,1,1333,0.0,0.0
1,Night,2,1700,0.0,0.0
1,Night,3,1412,0.0,0.0
2,Afternoon,1,652,0.0,0.0
