In [2]:
import pandas as pd
import numpy as np
import scipy.stats as stats


def safe_divide(x, y):
    """
    Divide x by y, returning NaN when the denominator is a scalar zero.

    Catching ZeroDivisionError alone only protects plain Python numbers:
    NumPy scalars (such as the results of np.mean / np.var) do not raise on
    division by zero, they emit a RuntimeWarning and produce inf or nan.
    The zero denominator is therefore tested explicitly before dividing.

    Parameters
    ----------
    x : scalar or array-like
        Numerator.
    y : scalar or array-like
        Denominator.

    Returns
    -------
    x / y in the regular case. If y is a scalar equal to zero the result is
    NaN (an all-NaN array with the shape of x when x is not a scalar).
    Non-scalar denominators are divided as-is, element-wise.
    """
    if np.ndim(y) == 0 and y == 0:
        # Scalar zero denominator: report "undefined" instead of inf / a warning.
        return np.nan if np.ndim(x) == 0 else np.full(np.shape(x), np.nan)
    try:
        return x / y
    except ZeroDivisionError:
        # Kept for non-scalar operands whose element-wise division can still
        # raise (e.g. object-dtype arrays holding Python numbers).
        return np.nan

def delta_var(numerator, denominator):
    """
    Delta-method variance of the ratio mean(numerator) / mean(denominator).

    numerator is the vector of numerator values and denominator the vector of
    denominator values; both are passed together to np.cov, so they must have
    the same length. The returned value is

        (var_x / mu_y**2 - 2*cov_xy*mu_x / mu_y**3 + var_y*mu_x**2 / mu_y**4) / n

    where the variances and the covariance use ddof=1 and n = len(numerator).
    """
    size = len(numerator)
    num_mean, den_mean = np.mean(numerator), np.mean(denominator)
    num_var = np.var(numerator, ddof=1)
    den_var = np.var(denominator, ddof=1)
    # Off-diagonal entry of the 2x2 sample covariance matrix.
    covariance = np.cov(numerator, denominator, ddof=1)[0][1]

    # The three summands of the first-order (delta-method) expansion.
    variance_term = safe_divide(num_var, den_mean**2)
    covariance_term = 2*covariance*safe_divide(num_mean, den_mean**3)
    ratio_term = den_var*safe_divide(num_mean**2, den_mean**4)

    # Dividing by the sample size gives the variance of the ratio estimate.
    return safe_divide(variance_term - covariance_term + ratio_term, size)


# Raw input; the columns used below are group_name, user_id and gmv.
data: pd.DataFrame = pd.read_csv('./synthetic_gmv_data_1.2.csv')


def _per_user_totals(frame: pd.DataFrame, group: str) -> pd.DataFrame:
    """Return one row per user_id of `group` with the sum and the count of gmv."""
    in_group = frame[frame['group_name'] == group]
    return in_group.groupby('user_id', as_index=False).agg(
        gmv_sum=('gmv', 'sum'),
        gmv_count=('gmv', 'count'),
    )


# Same per-user aggregation for both experiment arms.
control = _per_user_totals(data, 'control')
test = _per_user_totals(data, 'test')

control, test

(           user_id  gmv_sum  gmv_count
 0       00062h7u56      733          1
 1       000ic5j18m     2933          6
 2       00174ganru     1496          4
 3       0018yddvkm     7776          8
 4       0018yxz3f4     1671          4
 ...            ...      ...        ...
 147286  zzx6n9u92d     3657          4
 147287  zzxa8wbrzd     2118          2
 147288  zzxq8adett     2364          4
 147289  zzyhn5f9zw     1235          2
 147290  zzztoicz6c     3579          5
 
 [147291 rows x 3 columns],
           user_id  gmv_sum  gmv_count
 0      00074uxybk     3187          3
 1      000plmykri     1695          5
 2      0026yqk83k     1293          3
 3      002ioy63br     3862          6
 4      0032wrbd7c      765          3
 ...           ...      ...        ...
 49095  zzv633k6jk     3809          7
 49096  zzvup22tly     3938          3
 49097  zzx7apehbt     3422          4
 49098  zzxoenzox7     1141          2
 49099  zzyggo4hgm     1923          4
 
 [49100 rows x 3 col

In [3]:
import math
# Per-user means of the numerator (gmv_sum) and denominator (gmv_count) in each group.
gmv_mean_c = control['gmv_sum'].mean()
gmv_mean_t = test['gmv_sum'].mean()

count_mean_c = control['gmv_count'].mean()
count_mean_t = test['gmv_count'].mean()

# Ratio metric of each group: mean gmv_sum divided by mean gmv_count.
mean_t = gmv_mean_t / count_mean_t
mean_c = gmv_mean_c / count_mean_c

# Delta-method variance of each group's ratio estimate.
var_t = delta_var(test['gmv_sum'], test['gmv_count'])
var_c = delta_var(control['gmv_sum'], control['gmv_count'])

# Difference of the ratios over its standard error (the two variances are summed).
# `t_stat` keeps full precision because the p-value is computed from it afterwards;
# only the displayed value is rounded.
t_stat = (mean_t - mean_c) / math.sqrt(var_c + var_t)

round(t_stat, 3)


np.float64(2.344)

In [4]:
# Student t reference distribution with len(control) + len(test) - 2 degrees of freedom.
t_dist = stats.t(df=len(control) + len(test) - 2)

# Two-sided p-value: twice the upper-tail probability of |t_stat|.
# abs() keeps the result inside [0, 1] when the statistic is negative
# (1 - (cdf(t) - cdf(-t)) would exceed 1 in that case), and the survival
# function avoids the precision loss of computing 1 - cdf for small tails.
p_value = round(2 * t_dist.sf(abs(t_stat)), 3)

p_value

np.float64(0.019)

In [9]:
def linearization_type_2(x_num, x_denom, y_num, y_denom):
    """
    Compare two ratio metrics through linearization followed by Welch's t-test.

    For each sample the ratio estimate is R = mean(num) / mean(denom). Every
    observation i is mapped to

        R + (num_i - R * denom_i) / mean(denom)

    and the two linearized samples are compared with
    stats.ttest_ind(..., equal_var=False).

    Parameters
    ----------
    x_num, x_denom : array-like
        Numerator and denominator values of the first sample (element-wise paired).
    y_num, y_denom : array-like
        Numerator and denominator values of the second sample (element-wise paired).

    Returns
    -------
    (t_stat, p_value) as produced by stats.ttest_ind on the linearized
    x sample versus the linearized y sample.
    """
    def _linearize(num, denom):
        # One linearized value per observation of a single sample.
        denom_bar = np.mean(denom)
        estimator = safe_divide(np.mean(num), denom_bar)
        return estimator + safe_divide(1, denom_bar)*(np.array(num) - estimator*np.array(denom))

    x_linear = _linearize(x_num, x_denom)
    y_linear = _linearize(y_num, y_denom)

    # equal_var=False selects Welch's test (no pooled-variance assumption).
    t_stat, p_value = stats.ttest_ind(x_linear, y_linear, equal_var=False)

    return t_stat, p_value


# Linearization test: the test group is the first (x) sample, control the second (y).
t, p = linearization_type_2(
    x_num=test['gmv_sum'],
    x_denom=test['gmv_count'],
    y_num=control['gmv_sum'],
    y_denom=control['gmv_count'],
)

# Convert both results to plain Python numbers and show them with 3 decimals.
tuple(round(value.item(), 3) for value in (t, p))

(2.344, 0.019)