In [1]:
import random
import pandas as pd

random.seed(42)

# Function to generate a random cost based on the claim type and year
def generate_claim_cost(claim_type, year):
    if claim_type == 'Dentist':
        base_cost = 100
    elif claim_type == 'Psychiatrist':
        base_cost = 150
    elif claim_type == 'General Physician':
        base_cost = 80
    elif claim_type == 'Physiotherapy':
        base_cost = 120
    else:
        base_cost = 50

    # Adjust cost based on year
    if year == 2021:
        base_cost *= 1.2
    elif year == 2023:
        base_cost *= 1.5

    # Add some random variation
    cost = random.uniform(base_cost - 20, base_cost + 20)
    return round(cost, 2)

# Generating sample data
claim_types = ['Dentist', 'Psychiatrist', 'General Physician', 'Physiotherapy']
years = [2021, 2022, 2023]
people = ['John', 'Jane', 'Michael', 'Emily', 'William', 'Emma', 'Daniel', 'Olivia', 'Lucas', 'Ava']

data = []
for year in years:
    for person in people:
        num_claims = random.randint(1, 5)  # Random number of claims per person per year
        for _ in range(num_claims):
            claim_type = random.choice(claim_types)
            cost = generate_claim_cost(claim_type, year)
            date = pd.to_datetime(f"{random.randint(1, 12)}/{random.randint(1, 28)}/{year}", format='%m/%d/%Y')
            data.append([person, claim_type, date, year, cost])

# Create the DataFrame
columns = ['person', 'claim_type', 'date', 'year', 'amount']
claims_df = pd.DataFrame(data, columns=columns)
claims_df.sample(5)

Unnamed: 0,person,claim_type,date,year,amount
70,Daniel,General Physician,2023-02-01,2023,118.29
62,William,Dentist,2023-01-02,2023,159.92
10,Emma,General Physician,2021-10-28,2021,109.18
55,Jane,Physiotherapy,2023-08-08,2023,167.34
48,Lucas,Physiotherapy,2022-03-24,2022,108.57


## Sum with gaps

In [2]:
claims_df = claims_df.drop(index=claims_df.query('year == 2021 and claim_type == "Dentist"').index)
claims_df = claims_df.drop(index=claims_df.query('year == 2022 and claim_type == "Physiotherapy"').index)

In [3]:
(
    claims_df
    .groupby('year')
    .agg({'amount': 'sum'})
    .assign(diff=lambda x: x.amount.diff())
    .reset_index()
)

Unnamed: 0,year,amount,diff
0,2021,2710.12,
1,2022,2550.84,-159.28
2,2023,4178.03,1627.19


In [4]:
import munpack
import ibis

ibis.options.interactive = True

unpacker = munpack.SumUnpacker(
    fact='amount',
    period='year',
    dimensions=['claim_type']
)
unpack = unpacker(ibis.memtable(claims_df))
unpack.execute().groupby('year')[['inner', 'mix']].sum().sum(axis=1)

year
2022    -159.28
2023    1627.19
dtype: float64

In [26]:
import functools

table = ibis.memtable(claims_df, name='foo')

unpack = table.aggregate(
    by=[*unpacker.dimensions, unpacker.period],
    mean=table[unpacker.fact].mean(),
    count=table[unpacker.fact].count()
)

cartesian_product = functools.reduce(lambda x, y: x.cross_join(y), [table[[d]].distinct() for d in ['claim_type', 'year']])
unpack = cartesian_product.left_join(unpack, cartesian_product.columns)[unpack.columns]
unpack = unpack.mutate(
    mean=unpack['mean'].fillna(0),
    count=unpack['count'].fillna(0)
)

unpack = (
    unpack
    .group_by(*unpacker.dimensions)
    .order_by(unpacker.period)
    .mutate(
        mean_lag=unpack['mean'].lag(1),
        count_lag=unpack['count'].lag(1)
    )
)
unpack = unpack.mutate(
    inner=unpack['count_lag'] * (unpack['mean'] - unpack['mean_lag']),
    mix=(unpack['count'] - unpack['count_lag']) * unpack['mean']
)
(
    unpack
    .order_by([unpacker.period, *unpacker.dimensions])
    .select([unpacker.period, *unpacker.dimensions, 'inner', 'mix'])
    .dropna(how="any")
    .execute()
    .groupby('year')
    .apply(lambda x: (x.inner + x.mix).sum())
    .rename('diff')
    .reset_index()
)


Unnamed: 0,year,diff
0,2022,-159.28
1,2023,1627.19


In [24]:
(
    unpack.execute()
    .groupby('year')
    .apply(lambda x: (x.inner + x.mix).sum())
    .rename('diff')
    .reset_index()
)


Unnamed: 0,year,diff
0,2021,0.0
1,2022,20.02
2,2023,825.41


In [86]:
import functools
import ibis

ibis.options.interactive = True

table = ibis.memtable(claims_df, name='claims')

unpack = table.aggregate(
    by=[*unpacker.dimensions, unpacker.period],
    sum=table[unpacker.fact].sum(),
    count=table[unpacker.fact].count()
)
cartesian_product = functools.reduce(lambda x, y: x.cross_join(y), [table[[d]].distinct() for d in ['claim_type', 'year']])
unpack = cartesian_product.left_join(unpack, cartesian_product.columns)[unpack.columns]
unpack = unpack.mutate(
    sum=unpack['sum'].fillna(0),
    count=unpack['count'].fillna(0)
)

unpack = unpack.mutate(ratio=(unpack['sum'] / unpack['count']).fillna(0))

# Period figures
yearly_figures = unpack.group_by(unpacker.period).aggregate(
    sum_sum=unpack['sum'].sum(),
    count_sum=unpack['count'].sum()
)
unpack = unpack.left_join(yearly_figures, unpack[unpacker.period] == yearly_figures[unpacker.period])
unpack = unpack.mutate(
    share=unpack['count'] / unpack['count_sum'],
    global_ratio=unpack['sum_sum'] / unpack['count_sum']
)

# Calculate lag values
unpack = unpack.group_by(*unpacker.dimensions).order_by(unpacker.period).mutate(
    ratio_lag=unpack['ratio'].lag(1),
    share_lag=unpack['share'].lag(1),
    global_ratio_lag=unpack['global_ratio'].lag(1)
)
unpack = unpack.mutate(
    inner=unpack['share'] * (unpack['ratio'] - unpack['ratio_lag']),
    mix=(unpack['share'] - unpack['share_lag']) * (unpack['ratio_lag'] - unpack['global_ratio_lag'])
)

unpack.order_by([unpacker.period, *unpacker.dimensions]).execute()


Unnamed: 0,claim_type,year,sum,count,ratio,year_right,sum_sum,count_sum,share,global_ratio,ratio_lag,share_lag,global_ratio_lag,inner,mix
0,General Physician,2021,594.44,6,99.073333,2021,2710.12,19,0.315789,142.637895,,,,,
1,Physiotherapy,2021,801.78,6,133.63,2021,2710.12,19,0.315789,142.637895,,,,,
2,Psychiatrist,2021,1313.9,7,187.7,2021,2710.12,19,0.368421,142.637895,,,,,
3,Dentist,2022,622.48,6,103.746667,2022,2550.84,23,0.26087,110.906087,,,,,
4,General Physician,2022,749.08,9,83.231111,2022,2550.84,23,0.391304,110.906087,99.073333,0.315789,142.637895,-6.19913,-3.289772
5,Psychiatrist,2022,1179.28,8,147.41,2022,2550.84,23,0.347826,110.906087,187.7,0.368421,142.637895,-14.013913,-0.928053
6,Dentist,2023,1440.99,9,160.11,2023,4178.03,26,0.346154,160.693462,103.746667,0.26087,110.906087,19.510385,-0.610586
7,General Physician,2023,826.18,7,118.025714,2023,4178.03,26,0.269231,160.693462,83.231111,0.391304,110.906087,9.367778,3.378383
8,Physiotherapy,2023,1049.15,6,174.858333,2023,4178.03,26,0.230769,160.693462,133.63,0.315789,142.637895,9.514231,0.765853
9,Psychiatrist,2023,861.71,4,215.4275,2023,4178.03,26,0.153846,160.693462,147.41,0.347826,110.906087,10.464231,-7.081027


In [87]:
unpack.execute().groupby('year')[['inner', 'mix']].sum().sum(axis=1)


year
2021     0.000000
2022   -24.430868
2023    45.309248
dtype: float64

In [28]:
import munpack
import ibis

unpacker = munpack.SumUnpacker(
    fact='amount',
    period='year',
    dimensions=['claim_type']
)
unpack = unpacker(ibis.memtable(claims_df))
(
    unpack.execute()
    .groupby('year')
    .apply(lambda x: (x.inner + x.mix).sum())
    .rename('diff')
    .reset_index()
)

Unnamed: 0,year,diff
0,2022,20.02
1,2023,825.41


	year	diff
0	2022	20.02
1	2023	578.04

## Sum

In [4]:
import locale
import numpy as np

locale.setlocale(locale.LC_MONETARY, 'en_US.UTF-8')
def fmt_currency(x):
    return 'N/A' if np.isnan(x) else locale.currency(x, grouping=True)

(
    claims_df.groupby('year')
    .agg({'amount': 'sum'})
    .assign(diff=lambda x: x['amount'].diff())
    .style.format(fmt_currency)
)

Unnamed: 0_level_0,amount,diff
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2021,"$3,814.54",
2022,"$2,890.29",-$924.25
2023,"$4,178.03","$1,287.74"


In [5]:
(
    claims_df.groupby(['year', 'claim_type'])
    .agg({'amount': 'sum'})
    .assign(diff=lambda x: x.groupby('claim_type')['amount'].diff())
    .style.format(fmt_currency)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,amount,diff
year,claim_type,Unnamed: 2_level_1,Unnamed: 3_level_1
2021,Dentist,"$1,104.42",
2021,General Physician,$594.44,
2021,Physiotherapy,$801.78,
2021,Psychiatrist,"$1,313.90",
2022,Dentist,$622.48,-$481.94
2022,General Physician,$749.08,$154.64
2022,Physiotherapy,$339.45,-$462.33
2022,Psychiatrist,"$1,179.28",-$134.62
2023,Dentist,"$1,440.99",$818.51
2023,General Physician,$826.18,$77.10


In [6]:
import munpack

unpacker = munpack.SumUnpacker(
    fact='amount',
    period='year',
    dimensions=['claim_type']
)
unpack = unpacker.transform(claims_df)
unpack.style.format(fmt_currency)

AttributeError: 'SumUnpacker' object has no attribute 'transform'

In [63]:
(
    unpack
    .groupby('year')
    .apply(lambda x: (x.inner + x.mix).sum())
    .to_frame('sum')
    .style.format(fmt_currency)
)

Unnamed: 0_level_0,sum
year,Unnamed: 1_level_1
2022,-$924.25
2023,"$1,287.74"


## Ratio

In [7]:
averages = claims_df.groupby('year')['amount'].mean()
averages = pd.DataFrame({
    'average': averages,
    'diff': averages - averages.shift()
})
averages.style.format(fmt_currency)

Unnamed: 0_level_0,average,diff
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2021,$136.23,
2022,$111.16,-$25.07
2023,$160.69,$49.53


In [12]:
unpacker = munpack.MeanUnpacker(
    fact='amount',
    period='year',
    dimensions=['claim_type']
)
unpack = unpacker(claims_df)
unpack.style.format(fmt_currency)

Unnamed: 0_level_0,Unnamed: 1_level_0,inner,mix
year,claim_type,Unnamed: 2_level_1,Unnamed: 3_level_1
2022,Dentist,-$4.38,$1.23
2022,General Physician,-$5.48,-$4.90
2022,Physiotherapy,-$2.36,$0.26
2022,Psychiatrist,-$12.40,$2.97
2023,Dentist,$19.51,-$0.86
2023,General Physician,$9.37,$2.15
2023,Physiotherapy,$14.24,$0.23
2023,Psychiatrist,$10.46,-$5.58


In [66]:
(
    unpack
    .groupby('year')
    .apply(lambda x: (x.inner + x.mix).sum())
    .to_frame('sum')
    .style.format(fmt_currency)
)

Unnamed: 0_level_0,sum
year,Unnamed: 1_level_1
2022,-$25.07
2023,$49.53


## Funnel

In [None]:
https://www.kaggle.com/code/paultimothymooney/how-to-query-the-google-analytics-sample-dataset?scriptVersionId=5165120

In [73]:
import pandas as pd

traffic = pd.DataFrame({
    'date': ['2018-01-01', '2018-01-01', '2018-01-01', '2019-01-01', '2019-01-01', '2019-01-01', '2018-02-01', '2018-02-01', '2018-02-01', '2019-02-01', '2019-02-01', '2019-02-01'],
    'group': ['A', 'B', 'C', 'A', 'B', 'C', 'A', 'B', 'C', 'A', 'B', 'C'],
    'impressions': [1000, 2000, 2500, 1000, 2150, 2000, 50, 2000, 2500, 2500, 2150, 2000],
    'clicks': [150, 150, 250, 120, 200, 400, 20, 300, 250, 1000, 323, 320],
    'conversions': [120, 150, 125, 100, 145, 166, 10, 150, 125, 500, 145, 166],
    'revenue': ['$8,600', '$9,400', '$10,750', '$9,055', '$8,739', '$10,147', '$500', '$11,400', '$8,750', '$50,000', '$10,739', '$12,147'],
})
traffic['date'] = pd.to_datetime(traffic['date'])
traffic['revenue'] = traffic['revenue'].str.replace('$', '', regex=False).str.replace(',', '', regex=False).astype(float)
traffic.style.format({'revenue': fmt_currency})

Unnamed: 0,date,group,impressions,clicks,conversions,revenue
0,2018-01-01 00:00:00,A,1000,150,120,"$8,600.00"
1,2018-01-01 00:00:00,B,2000,150,150,"$9,400.00"
2,2018-01-01 00:00:00,C,2500,250,125,"$10,750.00"
3,2019-01-01 00:00:00,A,1000,120,100,"$9,055.00"
4,2019-01-01 00:00:00,B,2150,200,145,"$8,739.00"
5,2019-01-01 00:00:00,C,2000,400,166,"$10,147.00"
6,2018-02-01 00:00:00,A,50,20,10,$500.00
7,2018-02-01 00:00:00,B,2000,300,150,"$11,400.00"
8,2018-02-01 00:00:00,C,2500,250,125,"$8,750.00"
9,2019-02-01 00:00:00,A,2500,1000,500,"$50,000.00"


In [74]:
(
    traffic
    .assign(year=traffic.date.dt.year)
    .groupby('year')
    .agg({'revenue': 'sum'})
    .diff()
    .dropna()
    .style.format(fmt_currency)
)

Unnamed: 0_level_0,revenue
year,Unnamed: 1_level_1
2019,"$51,427.00"


In [139]:
class FunnelUnpacker:
    def __init__(self, funnel, period, dimensions):
        self.funnel = funnel
        self.period = period
        self.dimensions = dimensions

    def transform(self, facts):

        unpack = (
            facts.groupby([self.period, *self.dimensions])
            [self.funnel]
            .sum().sort_values(self.period)
        )

        ratios = {
            (f'{num}_by_{den}' if den else num): (num, den)
            for den, num in [(None, self.funnel[0]), *zip(self.funnel, self.funnel[1:])]
        }

        for ratio_name, (num, den) in ratios.items():
            if den:
                unpack[ratio_name] = unpack[num] / unpack[den]
            unpack[f'{ratio_name}_lag'] = unpack.groupby(self.dimensions)[ratio_name].shift(1)

        ratio_names = list(ratios)
        for i, _ in enumerate(ratio_names):
            before = ratio_names[:i]
            current = f'({ratio_names[i]} - {ratio_names[i]}_lag)'
            after = [f'{x}_lag' for x in ratio_names[i+1:]]
            formula = ' * '.join(filter(None, [*before, current, *after]))
            unpack[f'{ratio_names[i]}_contribution'] = unpack.eval(formula)

        return (
            unpack[[col for col in unpack.columns if '_contribution' in col]]
            .rename(columns=lambda x: x.replace('_contribution', ''))
            .dropna()
        )


unpacker = FunnelUnpacker(
    funnel=['impressions', 'clicks', 'conversions', 'revenue'],
    period='year',
    dimensions=['month', 'group']
)
traffic = traffic.assign(
    month=traffic.date.dt.month_name(),
    year=traffic.date.dt.year
)
unpack = unpacker.transform(traffic)
unpack.style.format(fmt_currency)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,impressions,clicks_by_impressions,conversions_by_clicks,revenue_by_conversions
year,month,group,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019,February,A,"$24,500.00",$0.00,$0.00,"$25,000.00"
2019,February,B,$855.00,$19.00,"-$1,254.00",-$281.00
2019,February,C,"-$1,750.00","$4,200.00",$420.00,$527.00
2019,January,A,$0.00,"-$1,720.00",$286.67,"$1,888.33"
2019,January,B,$705.00,"$2,428.33","-$3,446.67",-$347.67
2019,January,C,"-$2,150.00","$8,600.00","-$2,924.00","-$4,129.00"


In [142]:
(
    unpack
    .groupby('year').sum().sum(axis=1)
    .to_frame('sum')
    .style.format(fmt_currency)
)

Unnamed: 0_level_0,sum
year,Unnamed: 1_level_1
2019,"$51,427.00"
