Проанализируйте результаты эксперимента и напишите свои рекомендации менеджеру.

Mobile Games AB Testing with Cookie Cats

In [65]:
from typing import Union
from tqdm import tqdm

import pandas as pd
import numpy as np
import plotly.express as px

from scipy import stats
from statsmodels.stats.meta_analysis import effectsize_smd
from statsmodels.stats import proportion
from statsmodels.stats.power import tt_ind_solve_power
from statsmodels.stats.power import zt_ind_solve_power
     

In [66]:
data = pd.read_csv('gb_sem_9_hw.csv')

In [67]:
data

Unnamed: 0,userid,version,sum_gamerounds,retention_1,retention_7
0,116,gate_30,3,0,0
1,337,gate_30,38,1,0
2,377,gate_40,165,1,0
3,483,gate_40,1,0,0
4,488,gate_40,179,1,1
...,...,...,...,...,...
90184,9999441,gate_40,97,1,0
90185,9999479,gate_40,30,0,0
90186,9999710,gate_30,28,1,0
90187,9999768,gate_40,51,1,0


In [68]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90189 entries, 0 to 90188
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   userid          90189 non-null  int64 
 1   version         90189 non-null  object
 2   sum_gamerounds  90189 non-null  int64 
 3   retention_1     90189 non-null  int64 
 4   retention_7     90189 non-null  int64 
dtypes: int64(4), object(1)
memory usage: 3.4+ MB


In [69]:
data.version.replace({'gate_30': 0, 'gate_40': 1}, inplace=True)

In [70]:
data

Unnamed: 0,userid,version,sum_gamerounds,retention_1,retention_7
0,116,0,3,0,0
1,337,0,38,1,0
2,377,1,165,1,0
3,483,1,1,0,0
4,488,1,179,1,1
...,...,...,...,...,...
90184,9999441,1,97,1,0
90185,9999479,1,30,0,0
90186,9999710,0,28,1,0
90187,9999768,1,51,1,0


In [71]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90189 entries, 0 to 90188
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   userid          90189 non-null  int64
 1   version         90189 non-null  int64
 2   sum_gamerounds  90189 non-null  int64
 3   retention_1     90189 non-null  int64
 4   retention_7     90189 non-null  int64
dtypes: int64(5)
memory usage: 3.4 MB


In [72]:
data.describe()

Unnamed: 0,userid,version,sum_gamerounds,retention_1,retention_7
count,90189.0,90189.0,90189.0,90189.0,90189.0
mean,4998412.0,0.504374,51.872457,0.44521,0.186065
std,2883286.0,0.499984,195.050858,0.496992,0.389161
min,116.0,0.0,0.0,0.0,0.0
25%,2512230.0,0.0,5.0,0.0,0.0
50%,4995815.0,1.0,16.0,0.0,0.0
75%,7496452.0,1.0,51.0,1.0,0.0
max,9999861.0,1.0,49854.0,1.0,1.0


In [73]:
def continious_result(control: pd.DataFrame,
                      treatment: pd.DataFrame,
                      column: str,
                      n_iters: int = 10_000) -> pd.DataFrame:
    # Статистика по выборкам
    size = control.loc[:, column].shape[0]
    
    control_mean = control.loc[:, column].mean()
    treatment_mean = treatment.loc[:, column].mean()
    
    control_std = control.loc[:, column].std(ddof=1)
    treatment_std = treatment.loc[:, column].std(ddof=1)
    
    # Бутсрап
    booted_diff = []
    for _ in tqdm(range(n_iters)):
        control_sample = control.loc[:, column].sample(n=size, replace=True).values
        treatment_sample = treatment.loc[:, column].sample(n=size, replace=True).values
        booted_diff.append(np.mean(control_sample - treatment_sample))
    
    # Считаем статистику после бустрапа
    md_ci, std_ci = np.mean(booted_diff), np.std(booted_diff, ddof=1)
    left_ci, right_ci = np.percentile(booted_diff, [2.5, 97.5])
    p_value_ci = 2 * (1 - stats.norm.cdf(np.abs(md_ci / std_ci)))
    
    # Считаем мощность эксперимента
    effect_size, _ = effectsize_smd(mean1=treatment_mean, sd1=treatment_std, nobs1=size,
                                    mean2=control_mean, sd2=control_std, nobs2=size)
    power = tt_ind_solve_power(effect_size=effect_size,
                               nobs1=size,
                               alpha=.05,
                               power=None,
                               ratio=1)
    # Формируем отчёт 
    result = pd.DataFrame({'effect_size': effect_size,
                           'alpha': p_value_ci, 
                           'beta': (1-power),
                           'CI': f'[{np.round(left_ci, 3)}, {np.round(right_ci, 3)}]',
                           'difference': md_ci,},
                          index=[column]) 
    return result

In [74]:
def proportion_result(control: pd.DataFrame,
                      treatment: pd.DataFrame,
                      column: str,
                      n_iters: int = 10_000) -> pd.DataFrame:
    # Вероятность событий
    size = control.loc[:, column].shape[0]
    prop_control = control.loc[:, column].sum() / size
    prop_treatment = treatment.loc[:, column].sum() / size
    
    # Бутсрап
    booted_diff = []
    for _ in tqdm(range(n_iters)):
        control_sample = stats.bernoulli.rvs(p=prop_control, size=size)
        treatment_sample = stats.bernoulli.rvs(p=prop_treatment, size=size)
        booted_diff.append(np.mean(control_sample - treatment_sample))
    
    # Считаем статистику после бустрапа
    md_ci, std_ci = np.mean(booted_diff), np.std(booted_diff, ddof=1)
    left_ci, right_ci = np.percentile(booted_diff, [2.5, 97.5])
    p_value_ci = 2 * (1 - stats.norm.cdf(np.abs(md_ci / std_ci)))
    
    # Считаем мощность эксперимента
    effect_size = proportion.proportion_effectsize(prop_control, prop_treatment)
    
    power = zt_ind_solve_power(effect_size=effect_size,
                               nobs1=size,
                               alpha=.05,
                               power=None,
                               ratio=1)
    # Формируем отчёт 
    result = pd.DataFrame({'effect_size': effect_size,
                           'alpha': p_value_ci, 
                           'beta': (1-power),
                           'CI': f'[{np.round(left_ci, 3)}, {np.round(right_ci, 3)}]',
                           'difference': md_ci,},
                          index=[column]) 
    return result

In [75]:
control = data[data.version == 0].copy(deep=True)
treatment = data[data.version == 1].copy(deep=True)

In [76]:
control.shape, treatment.shape

((44700, 5), (45489, 5))

In [None]:
fig = px.histogram(data,
                   x='sum_gamerounds',
                   color = 'version',
                   title='sum_gamerounds',
                   marginal = 'box',
                   nbins = 100,
                   barmode='overlay')

fig.show()

In [77]:
continious_result(control, treatment, 'sum_gamerounds')

100%|██████████| 10000/10000 [00:23<00:00, 418.78it/s]


Unnamed: 0,effect_size,alpha,beta,CI,difference
sum_gamerounds,-0.005915,0.39247,0.856725,"[-1.063, 4.127]",1.141434


Статистически значимых различий нет, интервал включает 0.

In [78]:
data.userid.value_counts()

116        1
6632278    1
6658202    1
6658194    1
6658134    1
          ..
3347358    1
3347337    1
3346992    1
3346979    1
9999861    1
Name: userid, Length: 90189, dtype: int64

In [79]:
### Bucketket

for _ in range(100, 1001): 
    if data.shape[0] % _ == 0:
        print(_)

911


In [80]:
n_buckets = 911
data = (data
 .sample(n=data.shape[0], replace=False)
 .reset_index(drop=True)
 .assign(bucket=list(range(n_buckets)) * int(data.shape[0] / n_buckets)))

In [81]:
data.head()

Unnamed: 0,userid,version,sum_gamerounds,retention_1,retention_7,bucket
0,5431787,0,3,0,0,0
1,3904068,1,6,0,0,1
2,4533797,0,3,0,0,2
3,7223636,0,7,1,0,3
4,526230,1,1,0,0,4


In [82]:
bucketed_data = data.groupby(['version', 'bucket'])['sum_gamerounds'].agg(mu=np.mean, std=np.std).reset_index()
bucketed_data

Unnamed: 0,version,bucket,mu,std
0,0,0,43.222222,92.625573
1,0,1,88.218182,205.484044
2,0,2,39.301887,73.805770
3,0,3,53.692308,94.189117
4,0,4,40.314815,51.470755
...,...,...,...,...
1817,1,906,75.186047,149.358510
1818,1,907,31.711111,62.554203
1819,1,908,83.042553,181.206339
1820,1,909,77.521739,147.930125


In [83]:
# Сравним исходное выборочное среднее и среднее бакетных средних 
round(np.mean(data["sum_gamerounds"]), 5), round(np.mean(bucketed_data["mu"]), 5)

(51.87246, 51.81999)

In [84]:
round(np.std(data["sum_gamerounds"]), 5), round(np.mean(bucketed_data["std"]), 5)

(195.04978, 97.04111)

In [85]:
control_bucket = bucketed_data[bucketed_data.version == 0]
treatment_bucket = bucketed_data[bucketed_data.version == 1]
continious_result(control_bucket, treatment_bucket, 'mu', n_iters=100000)

100%|██████████| 100000/100000 [00:26<00:00, 3755.16it/s]


Unnamed: 0,effect_size,alpha,beta,CI,difference
mu,-0.046888,0.316673,0.829881,"[-0.811, 3.739]",1.166712


In [None]:
#Testing retention_1
fig = px.histogram(data, x="retention_1",
                   color='version', barmode='group',
                   height=400)
fig.show()

In [86]:
proportion_result(control, treatment, 'retention_1')

100%|██████████| 10000/10000 [00:23<00:00, 421.76it/s]


Unnamed: 0,effect_size,alpha,beta,CI,difference
retention_1,-0.003823,0.561113,0.911819,"[-0.008, 0.005]",-0.001894


Статистически значимых различий не выявлено. Доверительный интервал включает 0

In [None]:
#Testing retention_7
fig = px.histogram(data, x="retention_7",
                   color='version', barmode='group',
                   height=500)
fig.show()

In [88]:
proportion_result(control, treatment, 'retention_7')   


100%|██████████| 10000/10000 [00:18<00:00, 532.79it/s]


Unnamed: 0,effect_size,alpha,beta,CI,difference
retention_7,0.012776,0.054285,0.519844,"[0.0, 0.01]",0.005008


В группах есть статистически значимые различия, интервал включает 0, но альфа равно 5% и бетта не высокая . По этой метрике можно анализировать.

Вывод: поскольку для всех трех метрик alpha и beta большие, а доверительный интервал включает 0 можно сделать вывод об отсутствиии статистически значимых различий. Значит, гипотеза не подтвердилась и менеджеру следует придумать что-то другое.