### Imports

#### Import Packages

In [117]:
import polars as pl
import altair as alt
import numpy as np
from great_tables import GT
import gc

# alt.JupyterChart.enable_offline()
# alt.renderers.enable("jupyter", offline=True)

#### Import Panel Data

In [118]:
grocery_lf = pl.scan_csv(source="data/panel-datasets/edible_grocery.csv",
                         has_header=True,
                         separator=",",
                         schema={'panel_id': pl.Int32,
                                 'trans_id': pl.Int32,
                                 'week': pl.Int16,
                                 'sku_id': pl.Categorical,
                                 'units': pl.Int16,
                                 'price': pl.Float32,
                                 'brand': pl.Categorical})
grocery_lf.head().collect()

panel_id,trans_id,week,sku_id,units,price,brand
i32,i32,i16,cat,i16,f32,cat
3102011,1569,6,"""19""",1,2.79,"""Alpha"""
3102012,4301,15,"""15""",1,3.19,"""Alpha"""
3102012,4301,15,"""15""",1,3.19,"""Alpha"""
3102012,4301,15,"""38""",1,3.49,"""Bravo"""
3102012,4301,15,"""44""",1,3.49,"""Bravo"""


In [119]:
sku_lf = pl.scan_csv(source="data/panel-datasets/sku_weight.csv",
                         has_header=True,
                         separator=",",
                         schema={'sku_id': pl.Categorical,
                                 'weight': pl.Int16})
sku_lf.head().collect()

sku_id,weight
cat,i16
"""1""",400
"""2""",400
"""3""",400
"""4""",250
"""5""",1000


In [120]:
kiwi_lf = pl.scan_csv(source="data/panel-datasets/kiwibubbles_trans.csv",
                      has_header=True,
                      separator=",",
                      schema={'ID': pl.Int16,
                              'Market': pl.Categorical,
                              'Week': pl.Int16,
                              'Day': pl.Int16,
                              'Units': pl.Int16})
kiwi_lf.head().collect()

ID,Market,Week,Day,Units
i16,cat,i16,i16,i16
10001,"""1""",19,3,1
10002,"""1""",12,5,1
10003,"""1""",37,7,1
10004,"""1""",30,6,1
10004,"""1""",47,3,1


#### Reusable Functions

In [121]:
def weekly_plot(dataframe, y, color=None, title="", y_axis_label="", pct=False, legend=False):
    
    # Configure the color encoding only if color is provided
    if color is not None:
        color_encoding = alt.Color(
            f'{color}:N',  # N = a discrete unordered category
            legend=alt.Legend(title=color) if legend else None  # Add legend conditionally
        )
    else:
        color_encoding = alt.Color()  # No color encoding    
    
    chart = alt.Chart(dataframe).mark_line(strokeWidth=1).encode(
        x = alt.X(
            'week',
            axis=alt.Axis(
                values=np.arange(0, 104 + 1, 13), # Explicitly specify quarter-end weeks
                labelExpr="datum.value", # Show only these labels
                title='Week'
            )
        ),
        y = alt.Y(
            f'{y}:Q', # Q = a continuous real-valued quantity
            title=y_axis_label,
            axis=alt.Axis(format="$,.0f") if not pct else alt.Axis(format=",.0%")
        ),
        color = color_encoding
    ).properties(
        width=650,
        height=250,
        title=title
    ).configure_view(
        stroke=None
    ).configure_axisY(
        # grid=False # turn off y-axis grid if required
    )

    return alt.JupyterChart(chart)

### Preliminaries

#### Weekly Sales Pattern

In [122]:
# Grocery Sales LazyFrame (Query Plan): Category Weekly 'spend' - units x price = spend
grocery_spend_category = (
    grocery_lf
    .select(
        ['week', 'units', 'price']
    )
    .with_columns(
        ((pl.col('units') * pl.col('price')).cast(pl.Float64)).alias('spend')
    )
    .group_by('week')
    .agg(
        pl.col("spend").sum().cast(pl.Float64).alias('Weekly Spend')
    )
    .sort('week')
)

In [123]:
# Grocery Sales LazyFrame (Query Plan): Weekly 'spend' by Brand - units x price = spend
grocery_spend_brand = (
    grocery_lf
    .select(
        ['week', 'units', 'price', 'brand']
    )
    .with_columns(
         ((pl.col('units') * pl.col('price')).cast(pl.Float64)).alias('spend')
    )
    .group_by('week', 'brand')
    .agg(
        pl.col("spend").sum().cast(pl.Float64).alias('Weekly Spend')
    )
    .sort('week')
)

In [124]:
# Weekly Sales Pivot Table - Polars DataFrame
# For visualizing and inspecting only
weekly_sales_pivot = grocery_spend_brand.collect().pivot(
    on="brand",
    index="week",
    values="Weekly Spend",
    sort_columns=True,
).with_columns(
    pl.sum_horizontal(pl.exclude('week')).cast(pl.Float64).alias("Total") # Row total
).sort("week")

weekly_sales_pivot

week,Alpha,Bravo,Charlie,Delta,Other,Total
i16,f64,f64,f64,f64,f64,f64
1,331.459999,247.780003,51.79,17.64,18.75,667.420002
2,567.249997,398.640007,45.43,34.19,23.23,1068.740004
3,497.969999,639.980008,51.11,39.39,14.55,1243.000006
4,1389.960004,472.500004,51.87,85.04,39.779999,2039.150007
5,358.239999,252.540003,40.21,69.54,45.93,766.460002
…,…,…,…,…,…,…
100,692.480006,420.479996,69.600001,54.590001,47.97,1285.120004
101,265.770002,264.659999,64.030001,35.05,26.88,656.390003
102,379.610005,456.78999,43.5,20.4,43.61,943.909995
103,532.500005,440.999996,76.400001,40.44,45.55,1135.890002


In [125]:
plotdata = grocery_spend_category.collect()

weekly_plot(dataframe=plotdata, 
            y='Weekly Spend', 
            title='Category - Weekly Revenue', 
            y_axis_label='Spend ($)',
            pct=False,
            legend=False)

JupyterChart(spec={'config': {'view': {'continuousWidth': 300, 'continuousHeight': 300, 'stroke': None}, 'axis…

In [126]:
plotdata = grocery_spend_brand.filter(
    pl.col('brand') == 'Alpha'
).collect()

weekly_plot(dataframe=plotdata, 
            y='Weekly Spend', 
            color='brand',
            title='Alpha - Weekly Revenue', 
            y_axis_label='Spend ($)',
            pct=False,
            legend=False)

JupyterChart(spec={'config': {'view': {'continuousWidth': 300, 'continuousHeight': 300, 'stroke': None}, 'axis…

In [127]:
plotdata = grocery_spend_brand.filter(
    pl.col('brand') == 'Bravo'
).collect()

weekly_plot(dataframe=plotdata, 
            y='Weekly Spend', 
            color='brand',
            title='Bravo - Weekly Revenue', 
            y_axis_label='Spend ($)',
            pct=False,
            legend=False)

JupyterChart(spec={'config': {'view': {'continuousWidth': 300, 'continuousHeight': 300, 'stroke': None}, 'axis…

#### Weekly (Volume) Market Share

In [128]:
# Grocery Volume LazyFrame: Weekly 'volume' - units x weight = volume
grocery_volume = (
    grocery_lf
    .join(
        other=sku_lf,
        left_on="sku_id",
        right_on="sku_id"
    )
    .select(
        ['week', 'units', 'brand', 'weight']
    )
    .with_columns(
        # volume column that is the product of weight of each SKU and the units of SKU sold
        (((pl.col('units') * pl.col('weight'))/1000).cast(pl.Float64)).alias('volume') # # weight from grams to kilograms
    )
    .group_by('week', 'brand')
    .agg(
        pl.col("volume").sum().cast(pl.Float64).alias('Weekly Volume')
    )
    .sort('week')
)

In [129]:
# Weekly (volume) market share
pct_volume =(
    grocery_volume
    .collect()
    .pivot(
        on='brand',
        index='week',
        values='Weekly Volume',
        sort_columns=True
    )
    .with_columns(
        (pl.col("*").exclude("week")) / pl.sum_horizontal(pl.exclude('week')).cast(pl.Float64)
    )
)

pct_volume

week,Alpha,Bravo,Charlie,Delta,Other
i16,f64,f64,f64,f64,f64
1,0.481077,0.355011,0.11194,0.02532,0.026652
2,0.524953,0.371525,0.059391,0.029696,0.014435
3,0.389338,0.508812,0.060676,0.032505,0.008668
4,0.677032,0.230511,0.036686,0.044984,0.010787
5,0.428735,0.324306,0.07574,0.119348,0.051871
…,…,…,…,…,…
100,0.545707,0.31711,0.080381,0.039087,0.017715
101,0.36891,0.40552,0.140805,0.059138,0.025627
102,0.342581,0.543494,0.071147,0.019689,0.023089
103,0.424609,0.415495,0.100574,0.042429,0.016893


In [130]:
pct_volume_plot = (
    grocery_volume
    .group_by('week')
    .agg(
        pl.col('Weekly Volume').sum().alias('Total Weekly Volume')
    )
    .join(
        other=grocery_volume,
        on='week',
    )
    .filter(
        (pl.col('brand') == 'Alpha') |
        (pl.col("brand") == 'Bravo')
    )
    .with_columns(
        # compute brand wise % of total volume sale
        (pl.col('Weekly Volume') / pl.col('Total Weekly Volume')).alias('pct_volume')
    )
).collect()

weekly_plot(dataframe=pct_volume_plot, 
            y='pct_volume', 
            color='brand', 
            title="", 
            y_axis_label="", 
            pct=True, 
            legend=True)

JupyterChart(spec={'config': {'view': {'continuousWidth': 300, 'continuousHeight': 300, 'stroke': None}, 'axis…

In [131]:
corr_matrix = (
    pct_volume.select(
        pl.col('*').exclude('week')  # Exclude 'week' column
    )
    .corr()  # Compute the correlation matrix
    .with_columns(
        pl.Series(pct_volume.columns[1:]).alias("index")
    )
)

(
    GT(corr_matrix, rowname_col='index')
    .tab_header(title="Correlation Matrix of Weekly (Volume) Market Share")
    .fmt_number(columns=['Alpha', 'Bravo', 'Charlie', 'Delta', 'Other'])
    .data_color(
        domain=[-1, 1],
        palette=["rebeccapurple", "white", 'orange']
    )
)

Correlation Matrix of Weekly (Volume) Market Share,Correlation Matrix of Weekly (Volume) Market Share,Correlation Matrix of Weekly (Volume) Market Share,Correlation Matrix of Weekly (Volume) Market Share,Correlation Matrix of Weekly (Volume) Market Share,Correlation Matrix of Weekly (Volume) Market Share
Unnamed: 0_level_1,Alpha,Bravo,Charlie,Delta,Other
Alpha,1.00,−0.85,−0.34,−0.11,−0.15
Bravo,−0.85,1.00,−0.11,−0.23,−0.10
Charlie,−0.34,−0.11,1.00,0.13,0.24
Delta,−0.11,−0.23,0.13,1.00,0.31
Other,−0.15,−0.10,0.24,0.31,1.00


#### Annual Sales

In [132]:
# Annual sales by brand
annual_sales = (
    grocery_spend_brand
    .select(
        ['week', 'Weekly Spend', 'brand']
    )
    .with_columns(
        (pl.col("week") / 52).ceil().cast(pl.Int32).alias('year')
    )
    .group_by(['year', 'brand'])
    .agg(
        pl.col("Weekly Spend").sum().cast(pl.Float64).alias('Yearly Sales')
    )
    .sort('year')
)

In [133]:
annual_sales_pivot = annual_sales.collect().pivot(
    on='brand',
    index='year',
    sort_columns=True
).with_columns(
    pl.sum_horizontal(pl.all()).alias("Total") # add totals column
)

(
    GT(annual_sales_pivot, rowname_col="year")
    .tab_header(title="Annual Sales ($) by Brand")
    .tab_stubhead(label="Year")
    .fmt_currency()
    .data_color(
        columns=annual_sales_pivot.columns[:-1],
        domain=[100, 36_000],
        palette=["white", "rebeccapurple"]
    )    
)

Annual Sales ($) by Brand,Annual Sales ($) by Brand,Annual Sales ($) by Brand,Annual Sales ($) by Brand,Annual Sales ($) by Brand,Annual Sales ($) by Brand,Annual Sales ($) by Brand
Year,Alpha,Bravo,Charlie,Delta,Other,Total
1,"$33,570.94","$28,603.35","$5,120.87","$3,271.51","$1,535.23","$72,102.90"
2,"$35,250.75","$26,926.87","$3,922.68","$2,820.81","$1,739.82","$70,662.93"


In [134]:
annual_change = annual_sales_pivot.with_columns(
    pl.col('*').exclude('year').pct_change()
).filter(
    pl.col('year') == 2
).unpivot(
    index="year",  # Keep 'year' as a fixed identifier
    value_name='% Change',  
    variable_name='brand'
)

(
    GT(annual_change, rowname_col='brand')
    .tab_header(title='% Change in Annual Sales')
    .tab_stubhead(label="Brands")
    .fmt_percent()
    .data_color(
        domain=[-0.3, 0.3],
        palette=['orange', 'white', 'rebeccapurple']
    ) 
    .cols_hide('year')   
    .cols_label(brand='Y-o-Y % Change')
)

% Change in Annual Sales,% Change in Annual Sales
Brands,% Change
Alpha,5.00%
Bravo,−5.86%
Charlie,−23.40%
Delta,−13.78%
Other,13.33%
Total,−2.00%


In [135]:
chart = alt.Chart(annual_sales.collect()).mark_bar().encode(
    x=alt.X("brand:N", 
            axis=alt.Axis(labelAngle=0)),
    xOffset="year:N",
    y=alt.Y("Yearly Sales:Q",
            axis=alt.Axis(format="$,.0f")),
    color=alt.Color("year:N"),
).properties(
    width=650,
    height=250,
    title='Year 1 & Year 2 Revenues'
).show()

alt.JupyterChart(chart)

JupyterChart()

In [136]:
base = alt.Chart(annual_change).encode(
    x=alt.X("brand:N", 
            axis=alt.Axis(labelAngle=0)
    ),
    y=alt.Y("% Change:Q",
            axis=alt.Axis(format=".0%")
    ),
    color=alt.condition(
        alt.datum["% Change"] > 0, # fixed this 
        alt.value("green"),  # The positive color
        alt.value("red")  # The negative color
    ),
    text=alt.Text(
        '% Change',
        format=(".0%")
    )
        
).properties(
    width=650,
    height=250,
    title='Y-o-Y % Change in Revenue'
)

chart = base.mark_bar() + base.mark_text(align='center', 
                                 baseline=alt.expr(alt.expr.if_(alt.datum['% Change'] >= 0, 'bottom', 'top')),
                                 dy=alt.expr(alt.expr.if_(alt.datum['% Change'] >= 0, -2, 2)),
                                 dx=0)

alt.JupyterChart(chart)

JupyterChart(spec={'config': {'view': {'continuousWidth': 300, 'continuousHeight': 300}}, 'layer': [{'mark': {…

#### Annual Market Share

In [137]:
market_share = annual_sales_pivot.with_columns(
    pl.col('*').exclude('year') / pl.col('Total')
)

(
    GT(market_share, rowname_col="year")
    .tab_header(title="Annual Market Share (%)")
    .tab_stubhead(label="Year")
    .fmt_percent()
    .cols_hide('Total')
    .data_color(
        domain=[0, 0.5],
        palette=["white", "rebeccapurple"]
    )    
)

Annual Market Share (%),Annual Market Share (%),Annual Market Share (%),Annual Market Share (%),Annual Market Share (%),Annual Market Share (%)
Year,Alpha,Bravo,Charlie,Delta,Other
1,46.56%,39.67%,7.10%,4.54%,2.13%
2,49.89%,38.11%,5.55%,3.99%,2.46%


In [138]:
annual_change = market_share.drop('Total').with_columns(
    pl.col('*').exclude('year').pct_change()
).filter(
    pl.col('year') == 2
).unpivot(
    index="year",  # Keep 'year' as a fixed identifier
    value_name='% Change',  
    variable_name='brand'
)

(
    GT(annual_change, rowname_col='brand')
    .tab_header(title='% Change in Market Share')
    .tab_stubhead(label="Brands")
    .fmt_percent()
    .data_color(
        domain=[-0.3, 0.3],
        palette=['orange', 'white', 'rebeccapurple']
    ) 
    .cols_hide('year')   
    .cols_label(brand='Y-o-Y % Change')
)

% Change in Market Share,% Change in Market Share
Brands,% Change
Alpha,7.14%
Bravo,−3.94%
Charlie,−21.84%
Delta,−12.02%
Other,15.64%


In [139]:
base = alt.Chart(annual_change).encode(
    x=alt.X("brand:N", 
            axis=alt.Axis(labelAngle=0)
    ),
    y=alt.Y("% Change:Q",
            axis=alt.Axis(format=".0%")
    ),
    color=alt.condition(
        alt.datum["% Change"] > 0, # fixed this 
        alt.value("green"),  # The positive color
        alt.value("red")  # The negative color
    ),
    text=alt.Text(
        '% Change',
        format=(".0%")
    )
        
).properties(
    width=650,
    height=250,
    title='% Change in Annual Market Share'
)

chart = base.mark_bar() + base.mark_text(align='center', 
                                 baseline=alt.expr(alt.expr.if_(alt.datum['% Change'] >= 0, 'bottom', 'top')),
                                 dy=alt.expr(alt.expr.if_(alt.datum['% Change'] >= 0, -3, 3)),
                                 dx=0)

alt.JupyterChart(chart)

JupyterChart(spec={'config': {'view': {'continuousWidth': 300, 'continuousHeight': 300}}, 'layer': [{'mark': {…

#### Average SKU Price

In [140]:
avg_sku_price = grocery_lf.select(
    ['week', 'sku_id', 'price', 'brand']
).filter(
    (pl.col('week') <= 52) & # pricing in the first year
    (pl.col('brand') == 'Alpha') # Filter by Alpha brand 
).group_by('brand', 'sku_id').agg(
    pl.col('price').cast(pl.Float64).mean()
).drop('brand').sort(
    pl.col('sku_id').cast(pl.Int8)
)

(
    GT(avg_sku_price.collect(), rowname_col='sku_id') 
    .tab_header(title='Alpha - Average SKU Retail Prices')
    .tab_stubhead(label="Brands")
    .fmt_currency()
    .data_color(
        domain=[1, 15],
        palette=['rebeccapurple', 'white', 'orange']
    )  
    .cols_label(price='Average Price ($)')
)

Alpha - Average SKU Retail Prices,Alpha - Average SKU Retail Prices
Brands,Average Price ($)
4,$1.85
5,$3.44
6,$2.61
7,$5.10
8,$12.43
9,$6.59
10,$10.49
11,$2.27
12,$2.79
14,$1.99


In [141]:
avg_sku_price.drop(pl.col('sku_id')).describe()

statistic,price
str,f64
"""count""",18.0
"""null_count""",0.0
"""mean""",4.042794
"""std""",2.947076
"""min""",1.84705
"""25%""",2.582056
"""50%""",2.99
"""75%""",3.438107
"""max""",12.43


#### Garbage Collect

In [142]:
exceptions = ['grocery_lf', 'sku_lf', 'kiwi_lf', 'In', 'exceptions', 'active_variables']

active_variables = [
    var for var, value in globals().items()
    if not var.startswith('_')   # Exclude variables that start with "_"
    and var not in exceptions    # Exclude variables in the exceptions list
    and isinstance(value, (pl.LazyFrame, pl.DataFrame, pl.Series, alt.Chart, alt.LayerChart, list, int, float, str, np.ndarray, np.int64, np.float32))  # Remove these types only
]

for var in active_variables:
    del globals()[var]
del active_variables, exceptions, var

gc.collect()

2702

### Exploring Variation in Buyer Behaviour

#### Panellist-Level Data Preparation

<div style="max-width:600px;margin-left: auto; margin-right: auto;">
<img src="references\Consumer-Panel-Data-Relationship.png" width="600"/>
</div>

In [143]:
def trans_summary(brand, lf, year):
    
    # Primary Step: Filter by Year 1 and Remove Unused Columns
    filtered_lf = lf.filter(
        (pl.col('week') <= (year * 52)) &
        (pl.col('week') >= ((year - 1) * 52))
    ).drop(
        pl.col('week','sku_id')
    )

    # Intermediate Step: Group by trans_id, panel_id, and brand
    group_trans = filtered_lf.drop(
        pl.col('price', 'units')
    ).group_by(
        'trans_id', 'panel_id', 'brand'
    ).n_unique()
    
    if brand == "Category":
        # Panellist-level category transaction summary
        trans = group_trans.group_by(
            'panel_id'
        ).n_unique()
    else:
        # Panellist-level brand transaction summary
        trans = group_trans.filter(
            pl.col('brand') == brand
        ).group_by(
            'panel_id'
        ).n_unique()
    
    return trans.collect()

In [144]:
def trans_pivot(lf, year):
    
    # Primary Step: Filter by Year 1 and Remove Unused Columns
    filtered_lf = lf.filter(
        (pl.col('week') <= (year * 52)) &
        (pl.col('week') >= ((year - 1) * 52))
    ).drop(
        pl.col('week','sku_id')
    )

    # Intermediate Step: Group by trans_id, panel_id, and brand
    group_trans = filtered_lf.drop(
        pl.col('price', 'units')
    ).group_by(
        'trans_id', 'panel_id', 'brand'
    ).n_unique()
    
    trans = group_trans.collect().pivot(
        on='brand',
        index='panel_id',
        values='panel_id',
        aggregate_function="len"
        
    ).join(
        other=group_trans.group_by('panel_id').n_unique().drop('brand').collect(),
        on='panel_id'
    ).rename(
        {'trans_id': 'Category'}
    )
    return trans.sort('panel_id')

In [145]:
buyers = trans_pivot(grocery_lf, 1).drop('panel_id').count().select(
    'Alpha', 'Bravo', 'Charlie', 'Delta', 'Other', 'Category'
).unpivot(
    variable_name='brand'
)

(
    GT(buyers, rowname_col='brand')
    .tab_header(title='Purchase Occasions by Buyers')
    .tab_stubhead(label="Brands/Category")
    .fmt_integer()
    .data_color(
        rows=['Alpha', 'Bravo', 'Charlie', 'Delta', 'Other'],
        domain=[100, 3_000],
        palette=['orange', 'white', 'rebeccapurple']
    )  
    .cols_label(value='# Of Transactions')
)

Purchase Occasions by Buyers,Purchase Occasions by Buyers
Brands/Category,# Of Transactions
Alpha,2624
Bravo,2562
Charlie,813
Delta,380
Other,176
Category,4574


In [146]:
transactions = trans_pivot(grocery_lf, 1).drop('panel_id').sum().select(
    'Alpha', 'Bravo', 'Charlie', 'Delta', 'Other', 'Category'
).unpivot(
    variable_name='brand'
)

(
    GT(transactions, rowname_col='brand')
    .tab_header(title='Purchase Occasions by Buyers')
    .tab_stubhead(label="Brands/Category")
    .fmt_integer()
    .data_color(
        rows=['Alpha', 'Bravo', 'Charlie', 'Delta', 'Other'],
        domain=[400, 10_000],
        palette=['orange', 'white', 'rebeccapurple']
    )  
    .cols_label(value='# Of Transactions')
)

Purchase Occasions by Buyers,Purchase Occasions by Buyers
Brands/Category,# Of Transactions
Alpha,9060
Bravo,8255
Charlie,1882
Delta,859
Other,422
Category,20030
