In [1]:
import pandas as pd
import numpy as np
from scipy.stats import norm, mannwhitneyu
from tqdm.auto import tqdm

# Raise pandas' display limits so wide / long frames are shown without truncation.
for _option_name, _option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(_option_name, _option_value)

def get_bootstrap(data_column_1, data_column_2, boot_it = 1000, statistic = np.mean, bootstrap_conf_level = 0.95, random_state = None):
    """Bootstrap p-value for a difference between two samples.

    On every iteration both inputs are resampled with replacement to the
    length of the longer input, the resamples are subtracted element-wise and
    ``statistic`` is applied to that vector of differences.  The two-sided
    p-value is then taken from a normal distribution fitted to the resulting
    bootstrap statistics (their mean and standard deviation), evaluated at 0.

    Note that ``statistic`` is applied to the pairwise differences, not to
    each sample separately.  For ``np.mean`` this equals the difference of
    means; for non-linear statistics such as ``np.median`` it is the statistic
    of the differences, which is generally not the difference of statistics.

    Parameters
    ----------
    data_column_1, data_column_2 : pandas.Series or array-like
        The two samples.  Objects without a ``.sample`` method (lists, numpy
        arrays, ...) are wrapped in a ``pandas.Series``.
    boot_it : int, default 1000
        Number of bootstrap iterations.
    statistic : callable, default np.mean
        Function reducing a 1-D array of differences to a scalar.
    bootstrap_conf_level : float, default 0.95
        Accepted for interface compatibility; it is not used by the
        computation.
    random_state : None, int, numpy.random.RandomState or other object accepted by pandas, default None
        ``None`` keeps the previous behaviour (pandas draws from numpy's
        global random state).  An integer seeds a single ``RandomState`` that
        is shared by all iterations, which makes the result reproducible.
        Any other object is passed to ``Series.sample`` unchanged.

    Returns
    -------
    dict
        ``{'p_value': float}``.

    Raises
    ------
    ValueError
        If either sample is empty or ``boot_it`` is smaller than 1.
    """
    # Allow plain sequences / arrays in addition to pandas objects.
    if not hasattr(data_column_1, 'sample'):
        data_column_1 = pd.Series(data_column_1)
    if not hasattr(data_column_2, 'sample'):
        data_column_2 = pd.Series(data_column_2)

    if len(data_column_1) == 0 or len(data_column_2) == 0:
        raise ValueError('get_bootstrap requires two non-empty samples')
    if boot_it < 1:
        raise ValueError('boot_it must be a positive integer')

    # Build the generator once: passing the same integer seed to every
    # .sample() call would return the identical resample on each iteration.
    if isinstance(random_state, (int, np.integer)):
        random_state = np.random.RandomState(random_state)

    # Both resamples need the same length for the element-wise subtraction.
    boot_len = max(len(data_column_1), len(data_column_2))
    boot_data = []
    for _ in range(boot_it):
        samples_1 = data_column_1.sample(boot_len, replace = True, random_state = random_state).values
        samples_2 = data_column_2.sample(boot_len, replace = True, random_state = random_state).values

        boot_data.append(statistic(samples_1 - samples_2))

    boot_mean = np.mean(boot_data)
    boot_std = np.std(boot_data)

    # Degenerate bootstrap distribution: norm.cdf with scale=0 yields NaN, so
    # return the limiting value instead (no difference -> 1, otherwise -> 0).
    if boot_std == 0:
        return {'p_value': 1.0 if boot_mean == 0 else 0.0}

    # Two-sided p-value from the normal approximation of the bootstrap distribution.
    p_1 = norm.cdf(x = 0, loc = boot_mean, scale = boot_std)
    p_2 = norm.cdf(x = 0, loc = -boot_mean, scale = boot_std)
    p_value = min(p_1, p_2) * 2

    return {'p_value': p_value}

In [3]:
# Load the homework data set: a ';'-separated file that uses ',' as the decimal mark.
hw_csv_path = '/mnt/HC_Volume_18315164/home-jupyter/jupyter-a-a-30/stat_lesson_9/hw_bootstrap.csv'
hw = pd.read_csv(
    hw_csv_path,
    sep=";",
    decimal=",",
)
hw.head()

Unnamed: 0.1,Unnamed: 0,value,experimentVariant
0,1,10.380495,Control
1,2,9.546867,Control
2,3,11.088215,Control
3,4,10.147274,Control
4,5,9.789808,Control


In [8]:
# Apply the bootstrap (with np.mean) and the Mann-Whitney test, then compare the p-values.

# Split the metric by experiment arm.
x = hw.loc[hw['experimentVariant'] == 'Control', 'value']
y = hw.loc[hw['experimentVariant'] == 'Treatment', 'value']

# Computed once; the same Mann-Whitney result is printed on every row for comparison.
res_mw = mannwhitneyu(x, y, alternative = 'two-sided')

# Repeat the bootstrap 10 times to see how its p-values vary from run to run.
for i in range(10):
    res_b_npmean = get_bootstrap(x, y)
    res_b_npmedian = get_bootstrap(x, y, statistic = np.median)

    print(f'Boot mean: {res_b_npmean["p_value"]}, Boot median: {res_b_npmedian["p_value"]}, MW {res_mw.pvalue}')

Boot mean: 0.038055732406179314, Boot median: 0.8345645564817015, MW 0.8592148582485579
Boot mean: 0.03253316147732088, Boot median: 0.8785591802290611, MW 0.8592148582485579
Boot mean: 0.03300309371887371, Boot median: 0.8746091269980647, MW 0.8592148582485579
Boot mean: 0.043511116477950335, Boot median: 0.8976058328561722, MW 0.8592148582485579
Boot mean: 0.03407938527966393, Boot median: 0.8998362733383636, MW 0.8592148582485579
Boot mean: 0.040658370015627575, Boot median: 0.8971696398604363, MW 0.8592148582485579
Boot mean: 0.03703260632862535, Boot median: 0.9368056004335247, MW 0.8592148582485579
Boot mean: 0.0459837059542177, Boot median: 0.9264141357606335, MW 0.8592148582485579
Boot mean: 0.03840373823003397, Boot median: 0.8596727406150881, MW 0.8592148582485579
Boot mean: 0.03237386130161351, Boot median: 0.9040509911720119, MW 0.8592148582485579
