# Comparing against abtestguide.com

https://abtestguide.com/abtestsize/

In [9]:
import math

import statsmodels.api as sm


def how_many_visitors_do_you_need(
    conversion_rate_control, relative_improvement, hypothesis="two-sided"
):
    if hypothesis not in ("one-sided", "two-sided"):
        raise ValueError("hypothesis can only be one-sided or two-sided")
    # The null hypothesis is the B (treatment) is smaller than A (control).
    # If the test is significant, we can reject the null hypothesis,
    # which means B IS LARGER than A.
    if hypothesis == "one-sided":
        hypothesis = "smaller"

    A_conv = conversion_rate_control
    B_conv = A_conv * (1 + relative_improvement)
    effect_size = sm.stats.proportion_effectsize(A_conv, B_conv)
    sample_size = sm.stats.zt_ind_solve_power(
        effect_size=effect_size,
        nobs1=None,
        alpha=0.05,
        power=0.8,
        alternative=hypothesis,
    )
    # Round up, because visitors cannot be float.
    return math.ceil(sample_size)

In [15]:
one_sided = how_many_visitors_do_you_need(0.02, 0.15, hypothesis="one-sided")
two_sided = how_many_visitors_do_you_need(0.02, 0.15, hypothesis="two-sided")
one_sided, two_sided

(28870, 36650)

In [16]:
one_sided = how_many_visitors_do_you_need(0.7, 0.07, hypothesis="one-sided")
two_sided = how_many_visitors_do_you_need(0.7, 0.07, hypothesis="two-sided")
one_sided, two_sided

(1027, 1303)

In [43]:
from statsmodels.stats.power import zt_ind_solve_power
from statsmodels.stats.proportion import proportions_ztest


def post_test_evaluation(
    visitors_A, conversion_A, visitors_B, conversion_B, hypothesis="two-sided"
):
    if hypothesis not in ("one-sided", "two-sided"):
        raise ValueError("hypothesis can only be one-sided or two-sided")
    # The null hypothesis is the B (treatment) is smaller than A (control).
    # If the test is significant, we can reject the null hypothesis,
    # which means B IS LARGER than A.
    if hypothesis == "one-sided":
        hypothesis = "smaller"

    z_stat, p_value = proportions_ztest(
        count=[conversion_A, conversion_B],
        nobs=[visitors_A, visitors_B],  # [control, treatment]
        alternative=hypothesis,  # Alternative hypothesis: control is smaller than treatment.
    )

    cr_A = conversion_A / visitors_A
    cr_B = conversion_B / visitors_B
    se_A = math.sqrt(cr_A * (1 - cr_A) / visitors_A)
    se_B = math.sqrt(cr_B * (1 - cr_B) / visitors_B)
    se_delta = math.sqrt(se_A**2 + se_B**2)

    effect_size = sm.stats.proportion_effectsize(cr_A, cr_B)

    power = zt_ind_solve_power(
        effect_size=effect_size,
        nobs1=visitors_A,
        alpha=0.05,
        power=None,
        ratio=0,
        alternative=hypothesis,
    )

    return {
        "z_score": z_stat,
        "p_value": p_value,
        "is_significant": p_value <= 0.05,
        "visitors_A": visitors_A,
        "conversion_A": conversion_A,
        "visitors_B": visitors_B,
        "conversion_B": conversion_B,
        "conversion_rate_A": cr_A,
        "conversion_rate_B": cr_B,
        "relative_uplift_in_conversion_rate": (cr_B - cr_A) / cr_A,
        "standard_error_A": se_A,
        "standard_error_B": se_B,
        "standard_error_delta": se_delta,
        "power": power,
    }

In [44]:
one_sided = post_test_evaluation(
    conversion_A=875,
    visitors_A=1250,
    conversion_B=1000,
    visitors_B=1250,
    hypothesis="one-sided",
)
two_sided = post_test_evaluation(
    conversion_A=875,
    visitors_A=1250,
    conversion_B=1000,
    visitors_B=1250,
    hypothesis="two-sided",
)

one_sided, two_sided

({'z_score': -5.773502691896263,
  'p_value': 3.882018268965222e-09,
  'is_significant': True,
  'visitors_A': 1250,
  'conversion_A': 875,
  'visitors_B': 1250,
  'conversion_B': 1000,
  'conversion_rate_A': 0.7,
  'conversion_rate_B': 0.8,
  'relative_uplift_in_conversion_rate': 0.142857142857143,
  'standard_error_A': 0.012961481396815721,
  'standard_error_B': 0.011313708498984758,
  'standard_error_delta': 0.017204650534085254,
  'power': 0.9999999999725547},
 {'z_score': -5.773502691896263,
  'p_value': 7.764036537930444e-09,
  'is_significant': True,
  'visitors_A': 1250,
  'conversion_A': 875,
  'visitors_B': 1250,
  'conversion_B': 1000,
  'conversion_rate_A': 0.7,
  'conversion_rate_B': 0.8,
  'relative_uplift_in_conversion_rate': 0.142857142857143,
  'standard_error_A': 0.012961481396815721,
  'standard_error_B': 0.011313708498984758,
  'standard_error_delta': 0.017204650534085254,
  'power': 0.9999999997838817})

In [45]:
one_sided = post_test_evaluation(
    conversion_A=875,
    visitors_A=1250,
    conversion_B=888,
    visitors_B=1250,
    hypothesis="one-sided",
)
two_sided = post_test_evaluation(
    conversion_A=875,
    visitors_A=1250,
    conversion_B=888,
    visitors_B=1250,
    hypothesis="two-sided",
)

one_sided, two_sided

({'z_score': -0.5702344571435011,
  'p_value': 0.28425934424193905,
  'is_significant': False,
  'visitors_A': 1250,
  'conversion_A': 875,
  'visitors_B': 1250,
  'conversion_B': 888,
  'conversion_rate_A': 0.7,
  'conversion_rate_B': 0.7104,
  'relative_uplift_in_conversion_rate': 0.014857142857142966,
  'standard_error_A': 0.012961481396815721,
  'standard_error_B': 0.012829086951143483,
  'standard_error_delta': 0.018236926056767352,
  'power': 0.2009052297836434},
 {'z_score': -0.5702344571435011,
  'p_value': 0.5685186884838781,
  'is_significant': False,
  'visitors_A': 1250,
  'conversion_A': 875,
  'visitors_B': 1250,
  'conversion_B': 888,
  'conversion_rate_A': 0.7,
  'conversion_rate_B': 0.7104,
  'relative_uplift_in_conversion_rate': 0.014857142857142966,
  'standard_error_A': 0.012961481396815721,
  'standard_error_B': 0.012829086951143483,
  'standard_error_delta': 0.018236926056767352,
  'power': 0.1271858084138313})