In [233]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy import stats
from tqdm import tqdm

In [234]:
def build_groups(n_users: int, min_orders: int = 0) -> list:
    """Generating data with simulated AOV metric"""

    result = list()

    user_orders = np.random.randint(min_orders, 10, n_users)
    avg_costs = np.random.normal(1000, 100, n_users)

    for orders, cost in zip(user_orders, avg_costs):
        result.append(np.random.normal(cost, 100, orders))
        
    return result


Let's calculate the statistical significance using a t-test in a naive way expanding the purchases of each user

And let’s estimate the share of errors of the first type

In [239]:
p_values = list()

for _ in tqdm(range(1000)):

    a = np.hstack(build_groups(1000, 1))
    b = np.hstack(build_groups(1000, 1))
    
    p_value = stats.ttest_ind(a, b)[1]
    p_values.append(p_value < 0.05)

100%|██████████████████████████████████████| 1000/1000 [00:01<00:00, 522.72it/s]


In [240]:
np.mean(pvalues)

0.5015

The share of type I errors is greater than the expected value of 0.05

Let's calculate the statistical significance of the t-test by averaging the user's purchase costs (average of the average)

And let’s estimate the share of errors of the first type

In [241]:
p_values = list()

for _ in tqdm(range(1000)):

    a = [np.mean(val) for val in build_groups(1000, 1)]
    b = [np.mean(val) for val in build_groups(1000, 1)]
    
    p_value = stats.ttest_ind(a, b)[1]
    p_values.append(p_value < 0.05)

100%|██████████████████████████████████████| 1000/1000 [00:05<00:00, 190.76it/s]


In [242]:
np.mean(p_values)

0.053

The value of the share of errors of the first type corresponds to the expected

But such a metric is not aligned with the original metric of the average bill

In [243]:
a = {
    "user1": [1000, 1000],
    "user2": [2000]
}

b = {
    "user1": [1500, 1550],
    "user2": [1000]
}

aov_a, aov_b = np.mean(np.hstack(list(a.values()))), np.mean(np.hstack(list(b.values())))
mean_mean_a = (np.mean(a["user1"]) + np.mean(a["user2"])) / len(a)
mean_mean_b = (np.mean(b["user1"]) + np.mean(b["user2"])) / len(b)

print(f"AOV I: {aov_a}\nAOV II: {aov_b}\n\nAverage per user I: {mean_mean_a}\nAverage per user I: {mean_mean_b}")

AOV I: 1333.3333333333333
AOV II: 1350.0

Average per user I: 1500.0
Average per user I: 1262.5


Metrics are not directed equally

In [244]:
def delta_method(a: list, b: list) -> float:
    """Returns p-value calculated by the delta method"""
    
    stat_list = list()
    disp_list = list()
    
    for data in [a, b]:
        sum_values = np.array([np.sum(val) for val in data])
        cnt_values = np.array([len(val) for val in data])
    
        mu_sum = np.mean(sum_values)
        mu_cnt = np.mean(cnt_values)
    
        disp_sum = np.var(sum_values)
        disp_cnt = np.var(cnt_values)
    
        cov = np.cov(sum_values, cnt_values)[0, 1]
        
        score = np.sum(sum_values) / np.sum(cnt_values)
        
        disp = (disp_sum / mu_cnt ** 2 - 2 * (mu_sum / mu_cnt ** 3)\
                * cov + (mu_sum ** 2 / mu_cnt ** 4) * disp_cnt) / len(data)
        
        stat_list.append(score)
        disp_list.append(disp) 
    
    stat = disp_list[0] + disp_list[1]
    delta = stat_list[1] - stat_list[0]
    
    t = delta / np.sqrt(stat)
    
    p_value = (1 - stats.norm.cdf(np.abs(t))) * 2
    
    return p_value


In [245]:
p_values = list()

for _ in tqdm(range(3000)):

    a = build_groups(1000)
    b = build_groups(1000)
    
    p_value = delta_method(a, b)
    p_values.append(int(p_value < 0.05))


100%|██████████████████████████████████████| 3000/3000 [00:13<00:00, 226.07it/s]


In [246]:
np.mean(p_values)

0.05266666666666667

Type I error at expected level