In [84]:
import pandas as pd
import numpy as np
import scipy.stats as stats

def safe_divide(x, y):
    """
    Divide ``x`` by ``y``, returning NaN instead of failing on a zero denominator.

    Parameters
    ----------
    x : number or array-like
        Numerator.
    y : number or array-like
        Denominator.

    Returns
    -------
    The quotient ``x / y``; ``np.nan`` when both operands are scalars and
    ``y`` equals zero.

    Notes
    -----
    NumPy scalars (``np.float64`` etc.) do not raise ``ZeroDivisionError`` on
    division by zero: they return ``inf``/``nan`` and emit a RuntimeWarning.
    A plain ``try/except ZeroDivisionError`` therefore never triggers for
    them, so the scalar case is checked explicitly up front.
    Array operands are divided element-wise exactly as before.
    """
    # Explicit check: covers both Python numbers and NumPy scalars.
    if np.ndim(x) == 0 and np.ndim(y) == 0 and y == 0:
        return np.nan
    try:
        return x / y
    except ZeroDivisionError:
        # Kept for non-scalar operands whose division still raises.
        return np.nan

def delta_var(numerator, denominator):
    """
    Delta-method estimate of the variance of the ratio ``mean(numerator) / mean(denominator)``.

    Parameters
    ----------
    numerator : array-like
        Vector of numerator observations.
    denominator : array-like
        Vector of denominator observations, paired element-by-element with
        ``numerator``.

    Returns
    -------
    float
        ``(var_x / mu_y**2 - 2 * cov_xy * mu_x / mu_y**3 + var_y * mu_x**2 / mu_y**4) / n``
        using sample (``ddof=1``) variances and covariance.
        ``np.nan`` is returned when fewer than two observations are given or
        when the mean of ``denominator`` is zero, since the estimate is
        undefined in both cases.

    Raises
    ------
    ValueError
        If the two vectors have different lengths.
    """
    x = np.asarray(numerator, dtype=float)
    y = np.asarray(denominator, dtype=float)
    if len(x) != len(y):
        raise ValueError("numerator and denominator must have the same length")

    n = len(x)
    if n < 2:
        # Sample variance with ddof=1 is undefined for fewer than two points.
        return np.nan

    mu_x = x.mean()
    mu_y = y.mean()
    if mu_y == 0:
        # Every term divides by a power of mu_y; NumPy would silently yield
        # inf - inf here instead of raising, so bail out explicitly.
        return np.nan

    # One covariance matrix provides both variances and the covariance.
    cov = np.cov(x, y, ddof=1)
    var_x, var_y, cov_xy = cov[0, 0], cov[1, 1], cov[0, 1]

    ratio_var = (
        var_x / mu_y ** 2
        - 2 * cov_xy * (mu_x / mu_y ** 3)
        + var_y * (mu_x ** 2 / mu_y ** 4)
    )
    return ratio_var / n


# Read the raw rows of the experiment export.
data: pd.DataFrame = pd.read_csv('./synthetic_gmv_data_1.2.csv')

# Split the rows by experiment arm.
is_control = data['group_name'].eq('control')
is_test = data['group_name'].eq('test')

# Per-user aggregates shared by both arms: total gmv and number of gmv entries.
per_user_spec = {
    'gmv_sum': ('gmv', 'sum'),
    'gmv_count': ('gmv', 'count'),
}

# One row per user_id in each arm.
control = data[is_control].groupby('user_id', as_index=False).agg(**per_user_spec)
test = data[is_test].groupby('user_id', as_index=False).agg(**per_user_spec)

control, test

(           user_id  gmv_sum  gmv_count
 0       00062h7u56      733          1
 1       000ic5j18m     2933          6
 2       00174ganru     1496          4
 3       0018yddvkm     7776          8
 4       0018yxz3f4     1671          4
 ...            ...      ...        ...
 147286  zzx6n9u92d     3657          4
 147287  zzxa8wbrzd     2118          2
 147288  zzxq8adett     2364          4
 147289  zzyhn5f9zw     1235          2
 147290  zzztoicz6c     3579          5
 
 [147291 rows x 3 columns],
           user_id  gmv_sum  gmv_count
 0      00074uxybk     3187          3
 1      000plmykri     1695          5
 2      0026yqk83k     1293          3
 3      002ioy63br     3862          6
 4      0032wrbd7c      765          3
 ...           ...      ...        ...
 49095  zzv633k6jk     3809          7
 49096  zzvup22tly     3938          3
 49097  zzx7apehbt     3422          4
 49098  zzxoenzox7     1141          2
 49099  zzyggo4hgm     1923          4
 
 [49100 rows x 3 col

In [85]:
import math
# Per-user GMV: sample mean and sample variance (ddof=1) for each arm.
gmv_mean_c, gmv_mean_t = control['gmv_sum'].mean(), test['gmv_sum'].mean()
gmv_var_c, gmv_var_t = control['gmv_sum'].var(ddof=1), test['gmv_sum'].var(ddof=1)

# Per-user count of gmv entries: sample mean and sample variance for each arm.
count_mean_c, count_mean_t = control['gmv_count'].mean(), test['gmv_count'].mean()
count_var_c, count_var_t = control['gmv_count'].var(ddof=1), test['gmv_count'].var(ddof=1)

# Reference distributions used by the cells below; the t distribution uses
# n_control + n_test - 2 degrees of freedom.
pooled_df = len(control) + len(test) - 2
t_dist = stats.t(df=pooled_df)
norm_dist = stats.norm

# Significance level for the two-sided intervals.
alfa = 0.05

# Means, then standard deviations, control first.
print(gmv_mean_c, gmv_mean_t)
print(math.sqrt(gmv_var_c), math.sqrt(gmv_var_t))

2847.2169990019756 2870.650366598778
1888.2782395366196 1910.657840458099


In [86]:
# Confidence interval for the absolute difference in mean per-user GMV
# (test minus control).

gmv_average_delta = gmv_mean_t - gmv_mean_c

# Standard error of the difference; each arm contributes its own variance / n.
sigma_aver = math.sqrt(gmv_var_c / len(control) + gmv_var_t / len(test))

# Two-sided critical value from the t distribution (a quantile, despite the name).
t_stat = t_dist.ppf(1 - alfa / 2)

print(gmv_average_delta)

half_width = t_stat * sigma_aver
l, r = gmv_average_delta - half_width, gmv_average_delta + half_width

tuple(round(bound.item(), 3) for bound in (l, r))

23.433367596802327


(3.975, 42.891)

In [87]:
# Confidence interval for the relative change (in percent) of mean per-user GMV.
uplift = 100 * (gmv_mean_t - gmv_mean_c) / gmv_mean_c

# Variance of the ratio gmv_mean_t / gmv_mean_c, built from the variance of
# each arm's mean; the control term is weighted by the squared ratio of means.
test_term = gmv_var_t / len(test)
control_term = (gmv_mean_t / gmv_mean_c) ** 2 * gmv_var_c / len(control)
var = 1 / (gmv_mean_c ** 2) * (test_term + control_term)

sigma = math.sqrt(var)

print(uplift)

# Critical value is taken from t_dist even though the variable is called z_stat.
z_stat = t_dist.ppf(1 - alfa / 2)

# Scale by 100 so the half-width is in percent, like `uplift`.
half_width = 100 * z_stat * sigma
l, r = uplift - half_width, uplift + half_width

tuple(round(bound.item(), 3) for bound in (l, r))

0.823027103484432


(0.138, 1.508)

In [88]:
# Confidence interval for the absolute difference of the ratio metric
# mean(gmv_sum) / mean(gmv_count) between arms (test minus control).
# (This cell was originally labelled "average median gmv delta".)

ratio_test = gmv_mean_t / count_mean_t
ratio_control = gmv_mean_c / count_mean_c
delta = ratio_test - ratio_control

# Delta-method variance of each arm's ratio; the two are summed for the difference.
ratio_var_control = delta_var(control['gmv_sum'], control['gmv_count'])
ratio_var_test = delta_var(test['gmv_sum'], test['gmv_count'])
sigma = math.sqrt(ratio_var_control + ratio_var_test)

# Critical value is taken from t_dist even though the variable is called z_stat.
z_stat = t_dist.ppf(1 - alfa / 2)

half_width = z_stat * sigma
l, r = delta - half_width, delta + half_width

tuple(round(bound.item(), 3) for bound in (l, r))


(0.652, 7.313)

In [89]:
# Confidence interval for the relative change (in percent) of the ratio metric
# mean(gmv_sum) / mean(gmv_count).
# (This cell was originally labelled "average median gmv uplift".)

# Ratio metric per arm.
rt = gmv_mean_t / count_mean_t
rc = gmv_mean_c / count_mean_c

delta = 100 * (rt - rc) / rc

# Variance of rt / rc: each arm's delta-method ratio variance, weighted by
# 1 / rc**2 for the test arm and rt**2 / rc**4 for the control arm.
ratio_var_test = delta_var(test['gmv_sum'], test['gmv_count'])
ratio_var_control = delta_var(control['gmv_sum'], control['gmv_count'])
var = 1 / (rc ** 2) * ratio_var_test + (rt ** 2 / rc ** 4) * ratio_var_control

sigma = math.sqrt(var)

# Critical value is taken from t_dist even though the variable is called z_stat.
z_stat = t_dist.ppf(1 - alfa / 2)

# Scale by 100 so the half-width is in percent, like `delta`.
half_width = 100 * z_stat * sigma
l, r = delta - half_width, delta + half_width

tuple(round(bound.item(), 3) for bound in (l, r))

(0.092, 1.045)