In [1]:
import io
import pandas as pd

# Toy traffic report kept inline as tab-separated text: one row per
# (timestamp, dim) pair with its click and impression totals.
data = """
timestamp	dim	clicks	impression
2018-01-01	A	150	1000
2018-01-01	B	150	2000
2018-02-01	A	200	1000
2018-02-01	B	300	2000
2019-01-01	A	120	1100
2019-01-01	B	200	2150
2019-02-01	A	242	1100
2019-02-01	B	323	2150
"""

# Parse the inline text, then turn the timestamp strings into real datetimes so
# that the `.dt` accessor can be used on the column later on.
traffic_stats = (
    pd.read_csv(io.StringIO(data), sep='\t')
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
)
traffic_stats

Unnamed: 0,timestamp,dim,clicks,impression
0,2018-01-01,A,150,1000
1,2018-01-01,B,150,2000
2,2018-02-01,A,200,1000
3,2018-02-01,B,300,2000
4,2019-01-01,A,120,1100
5,2019-01-01,B,200,2150
6,2019-02-01,A,242,1100
7,2019-02-01,B,323,2150


In [2]:
import itertools

def _impression_events(stats):
    """Yield one record per impression described by the aggregated `stats` frame.

    For every row of `stats`, `clicks` records are emitted with ``click=True``
    followed by ``impression - clicks`` records with ``click=False``; each
    record carries the row's `timestamp` and `dim`.
    """
    for row in stats.to_dict(orient='records'):
        clicked_count = row['clicks']
        missed_count = row['impression'] - row['clicks']
        for was_clicked, repeats in ((True, clicked_count), (False, missed_count)):
            for _ in range(repeats):
                yield {'timestamp': row['timestamp'], 'dim': row['dim'], 'click': was_clicked}


# Blow the aggregated report back up into event-level data (one row per impression).
traffic = pd.DataFrame(_impression_events(traffic_stats))

# Calendar parts of the timestamp, used as period / grouping columns further down.
traffic = traffic.assign(
    year=lambda events: events['timestamp'].dt.year,
    month=lambda events: events['timestamp'].dt.month,
)
traffic.head()

Unnamed: 0,timestamp,dim,click,year,month
0,2018-01-01,A,True,2018,1
1,2018-01-01,A,True,2018,1
2,2018-01-01,A,True,2018,1
3,2018-01-01,A,True,2018,1
4,2018-01-01,A,True,2018,1


In [3]:
# Sanity check: collapsing the event-level rows again gives, per (timestamp, dim),
# the number of clicks (`sum`) and the number of impressions (`size`).
(
    traffic
    .groupby(['timestamp', 'dim'])['click']
    .agg(['sum', 'size'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,sum,size
timestamp,dim,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-01,A,150,1000
2018-01-01,B,150,2000
2018-02-01,A,200,1000
2018-02-01,B,300,2000
2019-01-01,A,120,1100
2019-01-01,B,200,2150
2019-02-01,A,242,1100
2019-02-01,B,323,2150


In [37]:
import ibis
import munpack

# Interactive mode: ibis expressions are evaluated when they are displayed.
ibis.options.interactive = True

# Settings handed to munpack's MeanUnpacker. NOTE(review): presumably these play
# the same roles as `fact` / `period` / `dimensions` in the hand-written
# pipelines elsewhere in this notebook - confirm against munpack.
unpacker_settings = dict(
    fact='click',
    period=['year', 'month'],
    group='dim',
)
unpacker = munpack.MeanUnpacker(**unpacker_settings)

# Disabled: manual computation of the `inner` / `mix` columns, kept for reference.
# unpack = unpack.mutate(
#     inner=unpack['share'] * (unpack['ratio'] - unpack['ratio_lag']),
#     mix=(unpack['share'] - unpack['share_lag']) * (unpack['ratio_lag'] - unpack['global_ratio'])
# )

# Wrap the event-level frame as an in-memory ibis table and run the unpacker on it.
traffics = ibis.memtable(traffic, name='traffics')
unpack = unpacker(traffics)
unpack

In [29]:
import functools

# Settings for this run: `fact` is the column being averaged, `period` the column
# along which lags are taken, and `dimensions` the columns identifying a series.
fact = 'click'
period = 'timestamp'
dimensions = ['month', 'dim']

table = ibis.memtable(traffic, name='traffics')

# Step 1 - sum and count of the fact for every (dimensions, period) cell.
cell_keys = [*dimensions, period]
cell_totals = table.aggregate(
    by=cell_keys,
    sum=table[fact].sum(),
    count=table[fact].count(),
)

# Disabled in this variant: artificially add rows with 0s when there are no data
# points for a given group at a given period. For instance, there might not be any
# dentist claims in 2022, but if there are some in 2021, then we want to have a 0
# recorded so that we can measure the difference.
# cartesian_product = functools.reduce(lambda x, y: x.cross_join(y), [table[[d]].distinct() for d in [*dimensions, period]])
# unpack = cartesian_product.left_join(unpack, cartesian_product.columns)[unpack.columns]
# unpack = unpack.mutate(
#     sum=unpack['sum'].fillna(0),
#     count=unpack['count'].fillna(0)
# )

# Step 2 - per-cell ratio (sum / count), with nulls replaced by 0.
cell_rates = cell_totals.mutate(
    ratio=(cell_totals['sum'] / cell_totals['count']).fillna(0)
)

# Step 3 - totals over the `g` columns, joined back onto every cell so that each
# cell knows its share of the total count and the overall ratio of its group.
g = ['timestamp']
#g = ['year', 'month']
yearly_figures = cell_rates.group_by(g).aggregate(
    sum_sum=cell_rates['sum'].sum(),
    count_sum=cell_rates['count'].sum(),
)
cells_with_totals = cell_rates.left_join(yearly_figures, g)
cells_with_shares = cells_with_totals.mutate(
    share=cells_with_totals['count'] / cells_with_totals['count_sum'],
    global_ratio=cells_with_totals['sum_sum'] / cells_with_totals['count_sum'],
)

# Step 4 - previous-period values: within each `dimensions` group, ordered by
# `period`, take the value one row back.
cells_with_lags = cells_with_shares.group_by(dimensions).order_by(period).mutate(
    ratio_lag=cells_with_shares['ratio'].lag(1),
    share_lag=cells_with_shares['share'].lag(1),
    global_ratio_lag=cells_with_shares['global_ratio'].lag(1),
)

# Step 5 - the two effects:
#   inner = current share * change in ratio
#   mix   = change in share * (previous ratio - current global ratio)
# NOTE(review): `global_ratio_lag` is computed above but not used here - confirm
# that `mix` is meant to reference `global_ratio`.
unpack = cells_with_lags.mutate(
    inner=cells_with_lags['share'] * (cells_with_lags['ratio'] - cells_with_lags['ratio_lag']),
    mix=(cells_with_lags['share'] - cells_with_lags['share_lag'])
    * (cells_with_lags['ratio_lag'] - cells_with_lags['global_ratio']),
)

# Show every column, sorted by period and then by dimensions. Optional trimming,
# currently disabled:
#   .select([period, *dimensions, 'inner', 'mix'])
#   .dropna(how="any")
unpack.order_by([period, *dimensions])

In [30]:
import functools

# Settings for this run: `fact` is the column being averaged, `period` the column
# along which lags are taken, and `dimensions` the columns identifying a series.
fact = 'click'
period = 'year'
dimensions = ['month', 'dim']

table = ibis.memtable(traffic, name='traffics')

# Sum and count of the fact for every (dimensions, period) cell.
unpack = table.aggregate(
    by=[*dimensions, period],
    sum=table[fact].sum(),
    count=table[fact].count()
)

# Artificially add rows with 0s when there are no data points for a given group at a given
# period. For instance, there might not be any dentist claims in 2022, but if there are some
# in 2021, then we want to have a 0 recorded so that we can measure the difference.
cartesian_product = functools.reduce(
    lambda x, y: x.cross_join(y),
    [table[[d]].distinct() for d in [*dimensions, period]]
)
unpack = cartesian_product.left_join(unpack, cartesian_product.columns)[unpack.columns]
unpack = unpack.mutate(
    sum=unpack['sum'].fillna(0),
    count=unpack['count'].fillna(0)
)

# Zero-filled cells have count == 0. Turn that denominator into NULL before
# dividing so the ratio is NULL (and then 0 via fillna) on every backend, instead
# of depending on how the engine treats division by zero (NULL, NaN/inf or error).
unpack = unpack.mutate(ratio=(unpack['sum'] / unpack['count'].nullif(0)).fillna(0))

# Totals are taken per (year, month); each cell is then compared with the total
# of its own year and month. Use g = ['year'] for whole-year totals instead.
g = ['year', 'month']
yearly_figures = unpack.group_by(g).aggregate(
    sum_sum=unpack['sum'].sum(),
    count_sum=unpack['count'].sum()
)
unpack = unpack.left_join(yearly_figures, g)
# NOTE(review): `count_sum` can also be 0 when a whole `g` group was zero-filled;
# `share` and `global_ratio` are left unguarded here - confirm the desired value.
unpack = unpack.mutate(
    share=unpack['count'] / unpack['count_sum'],
    global_ratio=unpack['sum_sum'] / unpack['count_sum']
)

# Calculate lag values: within each `dimensions` group, ordered by `period`,
# take the value one row back.
unpack = unpack.group_by(dimensions).order_by(period).mutate(
    ratio_lag=unpack['ratio'].lag(1),
    share_lag=unpack['share'].lag(1),
    global_ratio_lag=unpack['global_ratio'].lag(1)
)
# inner = current share * change in ratio
# mix   = change in share * (previous ratio - current global ratio)
# NOTE(review): `global_ratio_lag` is computed above but not used here - confirm
# that `mix` is meant to reference `global_ratio`.
unpack = unpack.mutate(
    inner=unpack['share'] * (unpack['ratio'] - unpack['ratio_lag']),
    mix=(unpack['share'] - unpack['share_lag']) * (unpack['ratio_lag'] - unpack['global_ratio'])
)
(
    unpack
    .order_by([period, *dimensions])
    #.select([period, *dimensions, 'inner', 'mix'])
        #.dropna(how="any")
)

In [36]:
import functools

# Settings for this run: `fact` is the column being averaged, `period` the list of
# columns describing a period (lags are taken along its first column, see below),
# and `dimensions` the columns identifying a series.
fact = 'click'
period = ['year', 'month']
dimensions = ['dim']

table = ibis.memtable(traffic, name='traffics')

# Sum and count of the fact for every (dimensions, period) cell.
unpack = table.aggregate(
    by=[*dimensions, *period],
    sum=table[fact].sum(),
    count=table[fact].count()
)

# Artificially add rows with 0s when there are no data points for a given group at a given
# period. For instance, there might not be any dentist claims in 2022, but if there are some
# in 2021, then we want to have a 0 recorded so that we can measure the difference.
cartesian_product = functools.reduce(
    lambda x, y: x.cross_join(y),
    [table[[d]].distinct() for d in [*dimensions, *period]]
)
unpack = cartesian_product.left_join(unpack, cartesian_product.columns)[unpack.columns]
unpack = unpack.mutate(
    sum=unpack['sum'].fillna(0),
    count=unpack['count'].fillna(0)
)

# Zero-filled cells have count == 0. Turn that denominator into NULL before
# dividing so the ratio is NULL (and then 0 via fillna) on every backend, instead
# of depending on how the engine treats division by zero (NULL, NaN/inf or error).
unpack = unpack.mutate(ratio=(unpack['sum'] / unpack['count'].nullif(0)).fillna(0))

# Totals are taken per (year, month); each cell is then compared with the total
# of its own year and month. Use g = ['year'] for whole-year totals instead.
g = ['year', 'month']
yearly_figures = unpack.group_by(g).aggregate(
    sum_sum=unpack['sum'].sum(),
    count_sum=unpack['count'].sum()
)
unpack = unpack.left_join(yearly_figures, g)
# NOTE(review): `count_sum` can also be 0 when a whole `g` group was zero-filled;
# `share` and `global_ratio` are left unguarded here - confirm the desired value.
unpack = unpack.mutate(
    share=unpack['count'] / unpack['count_sum'],
    global_ratio=unpack['sum_sum'] / unpack['count_sum']
)

# Calculate lag values. Rows are grouped by the dimensions plus every period
# column except the first, and ordered by the period columns, so lag(1) compares
# a cell with the same cell one step earlier along the first period column
# (here: same dim and month, previous year).
unpack = unpack.group_by([*dimensions, *period[1:]]).order_by(period).mutate(
    ratio_lag=unpack['ratio'].lag(1),
    share_lag=unpack['share'].lag(1),
    global_ratio_lag=unpack['global_ratio'].lag(1)
)
# inner = current share * change in ratio
# mix   = change in share * (previous ratio - current global ratio)
# NOTE(review): `global_ratio_lag` is computed above but not used here - confirm
# that `mix` is meant to reference `global_ratio`.
unpack = unpack.mutate(
    inner=unpack['share'] * (unpack['ratio'] - unpack['ratio_lag']),
    mix=(unpack['share'] - unpack['share_lag']) * (unpack['ratio_lag'] - unpack['global_ratio'])
)
# Keep only the effects, drop rows without a previous period (their lags are
# null), and sort as the very last step so the displayed order does not rely on
# the engine preserving an earlier ordering through the projection and filter.
(
    unpack
    .select([*period, *dimensions, 'inner', 'mix'])
    .dropna(how="any")
    .order_by([*period, *dimensions])
)

0.333333  0.100000
0.666667  0.100000
0.333333  0.166667
0.666667  0.166667
0.338462  0.098462
0.661538  0.098462
0.338462  0.173846
0.661538  0.173846

inner	mix
-1,38%	0,03%
1,19%	0,01%
	
0,68%	0,02%
0,00%	0,01%