# Health insurance claims

In [2]:
import random
import pandas as pd

random.seed(42)

# Function to generate a random cost based on the claim type and year
def generate_claim_cost(claim_type, year):
    if claim_type == 'Dentist':
        base_cost = 100
    elif claim_type == 'Psychiatrist':
        base_cost = 150
    elif claim_type == 'General Physician':
        base_cost = 80
    elif claim_type == 'Physiotherapy':
        base_cost = 120
    else:
        base_cost = 50

    # Adjust cost based on year
    if year == 2021:
        base_cost *= 1.2
    elif year == 2023:
        base_cost *= 1.5

    # Add some random variation
    cost = random.uniform(base_cost - 20, base_cost + 20)
    return round(cost, 2)

# Generating sample data
claim_types = ['Dentist', 'Psychiatrist', 'General Physician', 'Physiotherapy']
years = [2021, 2022, 2023]
people = ['John', 'Jane', 'Michael', 'Emily', 'William', 'Emma', 'Daniel', 'Olivia', 'Lucas', 'Ava']

data = []
for year in years:
    for person in people:
        num_claims = random.randint(1, 5)  # Random number of claims per person per year
        for _ in range(num_claims):
            claim_type = random.choice(claim_types)
            cost = generate_claim_cost(claim_type, year)
            date = pd.to_datetime(f"{random.randint(1, 12)}/{random.randint(1, 28)}/{year}", format='%m/%d/%Y')
            data.append([person, claim_type, date, year, cost])

# Create the DataFrame
columns = ['person', 'claim_type', 'date', 'year', 'amount']
claims_df = pd.DataFrame(data, columns=columns)
claims_df.sample(5)

Unnamed: 0,person,claim_type,date,year,amount
47,Lucas,General Physician,2022-04-09,2022,62.83
14,Olivia,General Physician,2021-06-07,2021,82.51
38,Emma,Dentist,2022-09-25,2022,119.12
56,Jane,Dentist,2023-09-04,2023,147.73
69,Daniel,General Physician,2023-07-05,2023,109.55


## Sum with gaps

In [3]:
claims_df = claims_df.drop(index=claims_df.query('year == 2021 and claim_type == "Dentist"').index)
claims_df = claims_df.drop(index=claims_df.query('year == 2022 and claim_type == "Physiotherapy"').index)

In [4]:
(
    claims_df
    .groupby('year')
    .agg({'amount': 'sum'})
    .assign(diff=lambda x: x.amount.diff())
    .reset_index()
)

Unnamed: 0,year,amount,diff
0,2021,2710.12,
1,2022,2550.84,-159.28
2,2023,4178.03,1627.19


In [5]:
import mexplanation
import ibis

ibis.options.interactive = True

explainer = mexplanation.SumExplainer(
    fact='amount',
    period='year',
    dimensions=['claim_type']
)
explanation = explainer(ibis.memtable(claims_df))
explanation.execute().groupby('year')[['inner', 'mix']].sum().sum(axis=1)

ModuleNotFoundError: No module named 'mexplanation'

In [6]:
import functools

table = ibis.memtable(claims_df, name='foo')

explanation = table.aggregate(
    by=[*explainer.dimensions, explainer.period],
    mean=table[explainer.fact].mean(),
    count=table[explainer.fact].count()
)

cartesian_product = functools.reduce(lambda x, y: x.cross_join(y), [table[[d]].distinct() for d in ['claim_type', 'year']])
explanation = cartesian_product.left_join(explanation, cartesian_product.columns)[explanation.columns]
explanation = explanation.mutate(
    mean=explanation['mean'].fillna(0),
    count=explanation['count'].fillna(0)
)

explanation = (
    explanation
    .group_by(*explainer.dimensions)
    .order_by(explainer.period)
    .mutate(
        mean_lag=explanation['mean'].lag(1),
        count_lag=explanation['count'].lag(1)
    )
)
explanation = explanation.mutate(
    inner=explanation['count_lag'] * (explanation['mean'] - explanation['mean_lag']),
    mix=(explanation['count'] - explanation['count_lag']) * explanation['mean']
)
(
    explanation
    .order_by([explainer.period, *explainer.dimensions])
    .select([explainer.period, *explainer.dimensions, 'inner', 'mix'])
    .dropna(how="any")
    .execute()
    .groupby('year')
    .apply(lambda x: (x.inner + x.mix).sum())
    .rename('diff')
    .reset_index()
)


NameError: name 'ibis' is not defined

In [7]:
(
    explanation.execute()
    .groupby('year')
    .apply(lambda x: (x.inner + x.mix).sum())
    .rename('diff')
    .reset_index()
)


NameError: name 'explanation' is not defined

In [8]:
import functools
import ibis

ibis.options.interactive = True

table = ibis.memtable(claims_df, name='claims')

explanation = table.aggregate(
    by=[*explainer.dimensions, explainer.period],
    sum=table[explainer.fact].sum(),
    count=table[explainer.fact].count()
)
cartesian_product = functools.reduce(lambda x, y: x.cross_join(y), [table[[d]].distinct() for d in ['claim_type', 'year']])
explanation = cartesian_product.left_join(explanation, cartesian_product.columns)[explanation.columns]
explanation = explanation.mutate(
    sum=explanation['sum'].fillna(0),
    count=explanation['count'].fillna(0)
)

explanation = explanation.mutate(ratio=(explanation['sum'] / explanation['count']).fillna(0))

# Period figures
yearly_figures = explanation.group_by(explainer.period).aggregate(
    sum_sum=explanation['sum'].sum(),
    count_sum=explanation['count'].sum()
)
explanation = explanation.left_join(yearly_figures, explanation[explainer.period] == yearly_figures[explainer.period])
explanation = explanation.mutate(
    share=explanation['count'] / explanation['count_sum'],
    global_ratio=explanation['sum_sum'] / explanation['count_sum']
)

# Calculate lag values
explanation = explanation.group_by(*explainer.dimensions).order_by(explainer.period).mutate(
    ratio_lag=explanation['ratio'].lag(1),
    share_lag=explanation['share'].lag(1),
    global_ratio_lag=explanation['global_ratio'].lag(1)
)
explanation = explanation.mutate(
    inner=explanation['share'] * (explanation['ratio'] - explanation['ratio_lag']),
    mix=(explanation['share'] - explanation['share_lag']) * (explanation['ratio_lag'] - explanation['global_ratio_lag'])
)

explanation.order_by([explainer.period, *explainer.dimensions]).execute()


NameError: name 'explainer' is not defined

In [None]:
explanation.execute().groupby('year')[['inner', 'mix']].sum().sum(axis=1)


year
2021     0.000000
2022   -24.430868
2023    45.309248
dtype: float64

In [9]:
import mexplanation
import ibis

explainer = mexplanation.SumExplainer(
    fact='amount',
    period='year',
    dimensions=['claim_type']
)
explanation = explainer(ibis.memtable(claims_df))
(
    explanation.execute()
    .groupby('year')
    .apply(lambda x: (x.inner + x.mix).sum())
    .rename('diff')
    .reset_index()
)

ModuleNotFoundError: No module named 'mexplanation'

	year	diff
0	2022	20.02
1	2023	578.04

## Sum

In [10]:
import locale
import numpy as np

locale.setlocale(locale.LC_MONETARY, 'en_US.UTF-8')
def fmt_currency(x):
    return 'N/A' if np.isnan(x) else locale.currency(x, grouping=True)

(
    claims_df.groupby('year')
    .agg({'amount': 'sum'})
    .assign(diff=lambda x: x['amount'].diff())
    .style.format(fmt_currency)
)

Unnamed: 0_level_0,amount,diff
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2021,"$2,710.12",
2022,"$2,550.84",-$159.28
2023,"$4,178.03","$1,627.19"


In [11]:
(
    claims_df.groupby(['year', 'claim_type'])
    .agg({'amount': 'sum'})
    .assign(diff=lambda x: x.groupby('claim_type')['amount'].diff())
    .style.format(fmt_currency)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,amount,diff
year,claim_type,Unnamed: 2_level_1,Unnamed: 3_level_1
2021,General Physician,$594.44,
2021,Physiotherapy,$801.78,
2021,Psychiatrist,"$1,313.90",
2022,Dentist,$622.48,
2022,General Physician,$749.08,$154.64
2022,Psychiatrist,"$1,179.28",-$134.62
2023,Dentist,"$1,440.99",$818.51
2023,General Physician,$826.18,$77.10
2023,Physiotherapy,"$1,049.15",$247.37
2023,Psychiatrist,$861.71,-$317.57


In [12]:
import mexplanation

explainer = mexplanation.SumExplainer(
    fact='amount',
    period='year',
    dimensions=['claim_type']
)
explanation = explainer.transform(claims_df)
explanation.style.format(fmt_currency)

ModuleNotFoundError: No module named 'mexplanation'

In [13]:
(
    explanation
    .groupby('year')
    .apply(lambda x: (x.inner + x.mix).sum())
    .to_frame('sum')
    .style.format(fmt_currency)
)

NameError: name 'explanation' is not defined

## Ratio

In [14]:
averages = claims_df.groupby('year')['amount'].mean()
averages = pd.DataFrame({
    'average': averages,
    'diff': averages - averages.shift()
})
averages.style.format(fmt_currency)

Unnamed: 0_level_0,average,diff
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2021,$142.64,
2022,$110.91,-$31.73
2023,$160.69,$49.79


In [15]:
explainer = mexplanation.MeanExplainer(
    fact='amount',
    period='year',
    dimensions=['claim_type']
)
explanation = explainer(claims_df)
explanation.style.format(fmt_currency)

NameError: name 'mexplanation' is not defined

In [16]:
(
    explanation
    .groupby('year')
    .apply(lambda x: (x.inner + x.mix).sum())
    .to_frame('sum')
    .style.format(fmt_currency)
)

NameError: name 'explanation' is not defined

## Funnel

In [17]:
https://www.kaggle.com/code/paultimothymooney/how-to-query-the-google-analytics-sample-dataset?scriptVersionId=5165120

SyntaxError: invalid syntax (3498806826.py, line 1)

In [18]:
import pandas as pd

traffic = pd.DataFrame({
    'date': ['2018-01-01', '2018-01-01', '2018-01-01', '2019-01-01', '2019-01-01', '2019-01-01', '2018-02-01', '2018-02-01', '2018-02-01', '2019-02-01', '2019-02-01', '2019-02-01'],
    'group': ['A', 'B', 'C', 'A', 'B', 'C', 'A', 'B', 'C', 'A', 'B', 'C'],
    'impressions': [1000, 2000, 2500, 1000, 2150, 2000, 50, 2000, 2500, 2500, 2150, 2000],
    'clicks': [150, 150, 250, 120, 200, 400, 20, 300, 250, 1000, 323, 320],
    'conversions': [120, 150, 125, 100, 145, 166, 10, 150, 125, 500, 145, 166],
    'revenue': ['$8,600', '$9,400', '$10,750', '$9,055', '$8,739', '$10,147', '$500', '$11,400', '$8,750', '$50,000', '$10,739', '$12,147'],
})
traffic['date'] = pd.to_datetime(traffic['date'])
traffic['revenue'] = traffic['revenue'].str.replace('$', '', regex=False).str.replace(',', '', regex=False).astype(float)
traffic.style.format({'revenue': fmt_currency})

Unnamed: 0,date,group,impressions,clicks,conversions,revenue
0,2018-01-01 00:00:00,A,1000,150,120,"$8,600.00"
1,2018-01-01 00:00:00,B,2000,150,150,"$9,400.00"
2,2018-01-01 00:00:00,C,2500,250,125,"$10,750.00"
3,2019-01-01 00:00:00,A,1000,120,100,"$9,055.00"
4,2019-01-01 00:00:00,B,2150,200,145,"$8,739.00"
5,2019-01-01 00:00:00,C,2000,400,166,"$10,147.00"
6,2018-02-01 00:00:00,A,50,20,10,$500.00
7,2018-02-01 00:00:00,B,2000,300,150,"$11,400.00"
8,2018-02-01 00:00:00,C,2500,250,125,"$8,750.00"
9,2019-02-01 00:00:00,A,2500,1000,500,"$50,000.00"


In [28]:
(
    traffic
    .assign(year=traffic.date.dt.year)
    .groupby('year')
    .agg({'revenue': 'sum'})
    .diff()
    .dropna()
    .style.format(fmt_currency)
)

Unnamed: 0_level_0,revenue
year,Unnamed: 1_level_1
2019,"$51,427.00"


In [29]:
class FunnelExplainer:
    def __init__(self, funnel, period, dimensions):
        self.funnel = funnel
        self.period = period
        self.dimensions = dimensions

    def transform(self, facts):

        explanation = (
            facts.groupby([self.period, *self.dimensions])
            [self.funnel]
            .sum().sort_values(self.period)
        )

        ratios = {
            (f'{num}_by_{den}' if den else num): (num, den)
            for den, num in [(None, self.funnel[0]), *zip(self.funnel, self.funnel[1:])]
        }

        for ratio_name, (num, den) in ratios.items():
            if den:
                explanation[ratio_name] = explanation[num] / explanation[den]
            explanation[f'{ratio_name}_lag'] = explanation.groupby(self.dimensions)[ratio_name].shift(1)

        ratio_names = list(ratios)
        for i, _ in enumerate(ratio_names):
            before = ratio_names[:i]
            current = f'({ratio_names[i]} - {ratio_names[i]}_lag)'
            print(current)
            after = [f'{x}_lag' for x in ratio_names[i+1:]]
            formula = ' * '.join(filter(None, [*before, current, *after]))
            explanation[f'{ratio_names[i]}_contribution'] = explanation.eval(formula)

        return (
            #explanation[[col for col in explanation.columns if '_contribution' in col]]
            explanation
            #.rename(columns=lambda x: x.replace('_contribution', ''))
            .dropna()
        )


explainer = FunnelExplainer(
    funnel=['impressions', 'clicks', 'conversions', 'revenue'],
    period='year',
    dimensions=['month', 'group']
)
traffic = traffic.assign(
    month=traffic.date.dt.month_name(),
    year=traffic.date.dt.year
)
explanation = explainer.transform(traffic)
#explanation.style.format(fmt_currency)
explanation

(impressions - impressions_lag)
(clicks_by_impressions - clicks_by_impressions_lag)
(conversions_by_clicks - conversions_by_clicks_lag)
(revenue_by_conversions - revenue_by_conversions_lag)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,impressions,clicks,conversions,revenue,impressions_lag,clicks_by_impressions,clicks_by_impressions_lag,conversions_by_clicks,conversions_by_clicks_lag,revenue_by_conversions,revenue_by_conversions_lag,impressions_contribution,clicks_by_impressions_contribution,conversions_by_clicks_contribution,revenue_by_conversions_contribution
year,month,group,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2019,February,A,2500,1000,500,50000.0,50.0,0.4,0.4,0.5,0.5,100.0,50.0,24500.0,0.0,0.0,25000.0
2019,February,B,2150,323,145,10739.0,2000.0,0.150233,0.15,0.448916,0.5,74.062069,76.0,855.0,19.0,-1254.0,-281.0
2019,February,C,2000,320,166,12147.0,2500.0,0.16,0.1,0.51875,0.5,73.174699,70.0,-1750.0,4200.0,420.0,527.0
2019,January,A,1000,120,100,9055.0,1000.0,0.12,0.15,0.833333,0.8,90.55,71.666667,0.0,-1720.0,286.666667,1888.333333
2019,January,B,2150,200,145,8739.0,2000.0,0.093023,0.075,0.725,1.0,60.268966,62.666667,705.0,2428.333333,-3446.666667,-347.666667
2019,January,C,2000,400,166,10147.0,2500.0,0.2,0.1,0.415,0.5,61.126506,86.0,-2150.0,8600.0,-2924.0,-4129.0


In [21]:
(
    explanation
    .groupby('year').sum().sum(axis=1)
    .to_frame('sum')
    .style.format(fmt_currency)
)

Unnamed: 0_level_0,sum
year,Unnamed: 1_level_1
2019,"$51,427.00"
