In [2]:
import math
import numpy as np
import pandas as pd
import scipy.stats as stats
from dataclasses import dataclass
import utils

In [72]:
# Simulated session durations: about daily_users * 20 users per group (+/- 2%),
# drawn from log-normal distributions whose scale is slightly larger for the variation.
# NOTE(review): presumably this mirrors what utils.run_ab_test_background_color does
# internally - confirm; the arrays and dicts built here are not used by the load below.
daily_users = 104
n_control = int(daily_users*20*np.random.uniform(0.98, 1.02))
n_variation = int(daily_users*20*np.random.uniform(0.98, 1.02))
# `lognorm` is never imported by name in this notebook, so reach it through the
# `stats` alias (scipy.stats) to avoid a NameError on a fresh kernel.
data_control = stats.lognorm.rvs(0.5, loc=0, scale=np.exp(1)*10.5, size=n_control)
data_variation = stats.lognorm.rvs(0.5, loc=0, scale=np.exp(1)*11.01, size=n_variation)

control_dict = { "user_type": "control", "session_duration": data_control}
variation_dict = { "user_type": "variation", "session_duration": data_variation}

# Load the data from the test
data = utils.run_ab_test_background_color(n_days=20)

# Display the first 10 rows
data.head(10)

2061


Unnamed: 0,user_id,user_type,session_duration
0,MA70KX0BSX,variation,15.528769
1,X4A8UC7N1Q,variation,32.28759
2,30I3SC8VII,variation,43.718217
3,WX226C2LWX,variation,49.519702
4,D8LXC47OG1,control,61.709028
5,54IFDLYW2X,variation,71.779283
6,OVKT2N58GM,variation,23.291835
7,YERW03OHKD,control,25.219461
8,DRO908AFJT,control,26.240482
9,W4XGTH9ZVS,variation,20.780244


In [16]:
# Pull the session durations (sd) of each experiment arm with a single label-based lookup
control_sd_data = data.loc[data["user_type"] == "control", "session_duration"]
variation_sd_data = data.loc[data["user_type"] == "variation", "session_duration"]

# Report group sizes and mean durations
print(f"{len(control_sd_data)} users saw the original website old color palette with an average duration of {control_sd_data.mean():.2f} minutes\n")
print(f"{len(variation_sd_data)} users saw the new website new color patette with an average duration of {variation_sd_data.mean():.2f} minutes")


2069 users saw the original website old color palette with an average duration of 32.92 minutes

2117 users saw the new website new color patette with an average duration of 33.83 minutes


In [18]:
# A dataclass is a lightweight container for related fields; a tuple or a dict could be used instead.
@dataclass
class estimation_metrics_cont:
    """Summary statistics of a continuous sample."""
    n: int        # sample size
    xbar: float   # sample mean
    s: float      # sample standard deviation

    def __repr__(self):
        # Custom repr: floats are shown with three decimals under the label "sample_params"
        fields = f"n={self.n}, xbar={self.xbar:.3f}, s={self.s:.3f}"
        return f"sample_params({fields})"

In [20]:
def compute_continuous_metrics(data):
    """Summarise a continuous sample by its size, mean and sample standard deviation.

    Args:
        data (pandas.core.series.Series): The sample data. In this case the average session duration for each user.

    Returns:
        estimation_metrics_cont: The sample size (n), mean (xbar) and standard deviation (s).
    """

    ### START CODE HERE ###
    sample_size = len(data)
    sample_mean = np.mean(data)
    # ddof=1 divides by (n - 1), i.e. the sample (not population) standard deviation
    sample_std = np.std(data, ddof=1)
    metrics = estimation_metrics_cont(n=sample_size, xbar=sample_mean, s=sample_std)
    ### END CODE HERE ###

    return metrics

In [22]:
# Test your code

# Compute the metrics for a toy array and for both experiment groups ...
cm = compute_continuous_metrics(np.array([1,2,3,4,5]))
control_metrics = compute_continuous_metrics(control_sd_data)
variation_metrics = compute_continuous_metrics(variation_sd_data)

# ... then report them in the same order
print(f"n={cm.n}, xbar={cm.xbar:.2f} and s={cm.s:.2f} for example array\n")
print(f"n={control_metrics.n}, xbar={control_metrics.xbar:.2f} and s={control_metrics.s:.2f} for control data\n")
print(f"n={variation_metrics.n}, xbar={variation_metrics.xbar:.2f} and s={variation_metrics.s:.2f} for variation data")

n=5, xbar=3.00 and s=1.58 for example array

n=2069, xbar=32.92 and s=17.54 for control data

n=2117, xbar=33.83 and s=18.24 for variation data


In [24]:
def degrees_of_freedom(control_metrics, variation_metrics):
    """Degrees of freedom for comparing two sample means with unequal variances.

    Uses the Welch-Satterthwaite approximation based on each sample's size
    and standard deviation.

    Args:
        control_metrics (estimation_metrics_cont): The metrics for the control sample.
        variation_metrics (estimation_metrics_cont): The metrics for the variation sample.

    Returns:
        numpy.float64: The (generally non-integer) degrees of freedom.
    """

    ### START CODE HERE ###
    n1, n2 = control_metrics.n, variation_metrics.n

    # Squared standard error of each sample mean: s^2 / n
    var_mean_1 = control_metrics.s**2 / n1
    var_mean_2 = variation_metrics.s**2 / n2

    numerator = np.square(var_mean_1 + var_mean_2)
    denominator = var_mean_1**2 / (n1 - 1) + var_mean_2**2 / (n2 - 1)
    dof = numerator / denominator
    ### END CODE HERE ###

    return dof

In [26]:
# Test your code
# Sanity check on two tiny hand-made samples first, then on the real A/B groups.
test_m1, test_m2 = compute_continuous_metrics(np.array([1,2,3])), compute_continuous_metrics(np.array([4,5]))
dof = degrees_of_freedom(test_m1, test_m2)
print(f"DoF for example arrays: {dof:.2f}\n")

# NOTE: `dof` is rebound here; the later hypothesis-test cell reads this second value.
dof = degrees_of_freedom(control_metrics, variation_metrics)
print(f"DoF for AB test samples: {dof:.2f}")

DoF for example arrays: 2.88

DoF for AB test samples: 4182.97


In [28]:
def t_statistic_diff_means(control_metrics, variation_metrics):
    """t-statistic for the difference between two sample means (unequal variances).

    The sign follows control minus variation, so a larger variation mean
    gives a negative statistic.

    Args:
        control_metrics (estimation_metrics_cont): The metrics for the control sample.
        variation_metrics (estimation_metrics_cont): The metrics for the variation sample.

    Returns:
        numpy.float64: The value of the t-statistic.
    """

    ### START CODE HERE ###
    mean_gap = control_metrics.xbar - variation_metrics.xbar

    # Standard error of the difference: sqrt(s1^2 / n1 + s2^2 / n2)
    std_err = np.sqrt(
        (control_metrics.s ** 2) / control_metrics.n
        + (variation_metrics.s ** 2) / variation_metrics.n
    )

    t = mean_gap / std_err
    ### END CODE HERE ###

    return t


In [30]:
# Test your code

# Toy samples first (test_m1 / test_m2 come from the degrees-of-freedom check cell).
t = t_statistic_diff_means(test_m1, test_m2)
print(f"t statistic for example arrays: {t:.2f}\n")

# NOTE: `t` is rebound here; the later hypothesis-test cell reads this second value.
t = t_statistic_diff_means(control_metrics, variation_metrics)
print(f"t statistic for AB test: {t:.2f}")

t statistic for example arrays: -3.27

t statistic for AB test: -1.64


In [32]:
def reject_nh_t_statistic(t_statistic, dof, alpha=0.05):
    """Decide whether to reject (or not) the null hypothesis of the two-sided t-test.

    Args:
        t_statistic (numpy.float64): The computed value of the t-statistic for the two samples.
        dof (numpy.float64): The computed degrees of freedom for the two samples.
        alpha (float, optional): The desired level of significance. Defaults to 0.05.

    Returns:
        bool: True if the null hypothesis should be rejected. False otherwise.
    """

    ### START CODE HERE ###
    # Two-sided p-value. The survival function (sf) is used instead of 1 - cdf:
    # for large |t| the cdf rounds to 1.0, 1 - cdf collapses to exactly 0, and
    # the null hypothesis would then be rejected for any alpha > 0.
    p_value = 2 * stats.t.sf(abs(t_statistic), df=dof)

    reject = bool(p_value < alpha)
    ### END CODE HERE ###

    return reject

In [34]:

# Test your code

alpha = 0.05
reject_nh = reject_nh_t_statistic(t, dof, alpha)

print(f"The null hypothesis can be rejected at the {alpha} level of significance: {reject_nh}\n")

# The conclusion text differs only by the word "not"
if reject_nh:
    msg = ""
else:
    msg = " not"
print(f"There is{msg} enough statistical evidence against H0.\nIt can be concluded that there is{msg} a statistically significant difference between the means of the two samples.")


The null hypothesis can be rejected at the 0.05 level of significance: False

There is not enough statistical evidence against H0.
It can be concluded that there is not a statistically significant difference between the means of the two samples.


In [36]:
# PROPORTION

# Compute the sample size required to compare the actual vs desired CVR
# NOTE(review): presumably 0.12 is the current conversion rate and 0.14 the target one,
# and the returned size is per group (the next cell doubles it) - confirm against
# utils.sample_size_diff_proportions.
required_sample_size = utils.sample_size_diff_proportions(0.12, 0.14)
required_sample_size

4438

In [38]:
daily_active_users = 1038

# Total users needed is twice the required sample size (presumably one sample per group);
# round up so the test runs for a whole number of days.
n_days = math.ceil(2 * required_sample_size / daily_active_users)

print(f"AB test should run for {n_days} days to gather enough data")

AB test should run for 9 days to gather enough data


In [41]:
# Run the personalized-feed experiment for the number of days computed above.
# NOTE: this rebinds `data`, replacing the session-duration frame loaded earlier.
data = utils.run_ab_test_personalized_feed(n_days)

# Display the first 5 rows
data.head(5)

Unnamed: 0,user_id,user_type,converted
0,QN4IVF8JUJ,variation,0
1,P2P67LOTP0,control,1
2,N04H8IPGZH,variation,0
3,64P7OMM5FV,control,0
4,B050TALX0N,control,0


In [43]:
# Pull the conversion flags of each experiment arm with a single label-based lookup
control_data = data.loc[data["user_type"] == "control", "converted"]
variation_data = data.loc[data["user_type"] == "variation", "converted"]

# The mean of the 0/1 `converted` flag is the group's conversion rate (CVR)
print(f"{len(control_data)} users saw the original app with an average CVR of {control_data.mean():.4f}\n")
print(f"{len(variation_data)} users saw the app with the new feature with an average CVR of {variation_data.mean():.4f}")


4632 users saw the original app with an average CVR of 0.1244

4728 users saw the app with the new feature with an average CVR of 0.1519


In [45]:
@dataclass
class estimation_metrics_prop:
    """Summary statistics of a proportion-like (0/1) sample."""
    n: int     # sample size
    x: int     # number of successes (sum of the sample)
    p: float   # sample proportion (mean of the sample)

    def __repr__(self):
        # Custom repr: the proportion is shown with three decimals under the label "sample_params"
        fields = f"n={self.n}, x={self.x}, p={self.p:.3f}"
        return f"sample_params({fields})"

In [47]:
def compute_proportion_metrics(data):
    """Summarise a 0/1 sample by its size, number of successes and proportion.

    Args:
        data (pandas.core.series.Series): The sample data. In this case 1 if the user converted and 0 otherwise.

    Returns:
        estimation_metrics_prop: The sample size (n), success count (x) and proportion (p).
    """

    ### START CODE HERE ###
    sample_size = len(data)
    successes = data.sum()     # sum of the 0/1 flags = number of conversions
    proportion = data.mean()   # mean of the 0/1 flags = conversion rate
    metrics = estimation_metrics_prop(n=sample_size, x=successes, p=proportion)
    ### END CODE HERE ###

    return metrics

In [49]:
# Test your code

# Compute the metrics for a toy array and for both experiment groups ...
cm = compute_proportion_metrics(np.array([1,0,0,1]))
control_metrics = compute_proportion_metrics(control_data)
variation_metrics = compute_proportion_metrics(variation_data)

# ... then report them in the same order
print(f"n={cm.n}, x={cm.x} and p={cm.p:.4f} for sample array\n")
print(f"n={control_metrics.n}, x={control_metrics.x} and p={control_metrics.p:.4f} for control data\n")
print(f"n={variation_metrics.n}, x={variation_metrics.x} and p={variation_metrics.p:.4f} for variation data")

n=4, x=2 and p=0.5000 for sample array

n=4632, x=576 and p=0.1244 for control data

n=4728, x=718 and p=0.1519 for variation data


In [51]:
def pooled_proportion(control_metrics, variation_metrics):
    """Compute the pooled proportion for the two samples.

    The pooled proportion is the total number of successes across both
    samples divided by the total number of observations.

    Args:
        control_metrics (estimation_metrics_prop): The metrics for the control sample.
        variation_metrics (estimation_metrics_prop): The metrics for the variation sample.

    Returns:
        numpy.float64: The pooled proportion.
    """

    ### START CODE HERE ###
    x1, n1 = control_metrics.x, control_metrics.n
    x2, n2 = variation_metrics.x, variation_metrics.n

    pp = (x1 + x2) / (n1 + n2)
    ### END CODE HERE ###

    # Single exit point: the earlier duplicate `return pp` made this one unreachable
    return pp


In [53]:
# Test your code

# NOTE: `test_m1` / `test_m2` are rebound here to proportion metrics (they held
# continuous metrics earlier); the z-statistic check cell reuses these values.
test_m1, test_m2 = compute_proportion_metrics(np.array([1,0,1])), compute_proportion_metrics(np.array([1,1,1,0]))
pp = pooled_proportion(test_m1, test_m2)
print(f"pooled proportion for example arrays: {pp:.4f}\n")

# Same check on the real A/B groups (`pp` is rebound).
pp = pooled_proportion(control_metrics, variation_metrics)
print(f"pooled proportion for AB test samples: {pp:.4f}")

pooled proportion for example arrays: 0.7143

pooled proportion for AB test samples: 0.1382


In [55]:
def z_statistic_diff_proportions(control_metrics, variation_metrics):
    """z-statistic for the difference between two sample proportions.

    The sign follows control minus variation, so a larger variation
    proportion gives a negative statistic.

    Args:
        control_metrics (estimation_metrics_prop): The metrics for the control sample.
        variation_metrics (estimation_metrics_prop): The metrics for the variation sample.

    Returns:
        numpy.float64: The z-statistic.
    """

    ### START CODE HERE ###
    pooled = pooled_proportion(control_metrics, variation_metrics)

    # Standard error of the difference, using the pooled proportion for both groups
    std_err = np.sqrt(pooled * (1 - pooled) * ((1 / control_metrics.n) + (1 / variation_metrics.n)))

    z = (control_metrics.p - variation_metrics.p) / std_err
    ### END CODE HERE ###

    return z

In [57]:
# Test your code

# Toy samples first (test_m1 / test_m2 come from the pooled-proportion check cell).
z = z_statistic_diff_proportions(test_m1, test_m2)
print(f"z statistic for example arrays: {z:.4f}\n")

# NOTE: `z` is rebound here; the later hypothesis-test cell reads this second value.
z = z_statistic_diff_proportions(control_metrics, variation_metrics)
print(f"z statistic for AB test: {z:.4f}")

z statistic for example arrays: -0.2415

z statistic for AB test: -3.8551


In [59]:
def reject_nh_z_statistic(z_statistic, alpha=0.05):
    """Decide whether to reject (or not) the null hypothesis of the two-sided z-test.

    Args:
        z_statistic (numpy.float64): The computed value of the z-statistic for the two proportions.
        alpha (float, optional): The desired level of significance. Defaults to 0.05.

    Returns:
        bool: True if the null hypothesis should be rejected. False otherwise.
    """

    ### START CODE HERE ###
    # Two-sided p-value. The survival function (sf) is used instead of 1 - cdf:
    # for large |z| the cdf rounds to 1.0, 1 - cdf collapses to exactly 0, and
    # the null hypothesis would then be rejected for any alpha > 0.
    p_value = 2 * stats.norm.sf(np.abs(z_statistic))

    reject = bool(p_value < alpha)
    ### END CODE HERE ###

    return reject

In [61]:
# Test your code

alpha = 0.05
reject_nh = reject_nh_z_statistic(z, alpha)

print(f"The null hypothesis can be rejected at the {alpha} level of significance: {reject_nh}\n")

# The conclusion text differs only by the word "not"
if reject_nh:
    msg = ""
else:
    msg = " not"
print(f"There is{msg} enough statistical evidence against H0.\nThus it can be concluded that there is{msg} a statistically significant difference between the two proportions.")

The null hypothesis can be rejected at the 0.05 level of significance: True

There is enough statistical evidence against H0.
Thus it can be concluded that there is a statistically significant difference between the two proportions.


In [63]:
def confidence_interval_proportion(metrics, alpha=0.05):
    """Compute the confidence interval for a proportion-like sample.

    The interval is the sample proportion plus/minus the normal critical
    value times the standard error sqrt(p * (1 - p) / n).

    Args:
        metrics (estimation_metrics_prop): The metrics for the sample.
        alpha (float, optional): The desired level of significance. Defaults to 0.05.

    Returns:
        (numpy.float64, numpy.float64): The lower and upper bounds of the confidence interval.
    """

    ### START CODE HERE ###
    # Two-sided critical value, e.g. about 1.96 for alpha = 0.05
    z_critical = stats.norm.ppf(1 - alpha / 2)
    std_err = np.sqrt(metrics.p * (1 - metrics.p) / metrics.n)
    margin = z_critical * std_err

    lower, upper = metrics.p - margin, metrics.p + margin
    ### END CODE HERE ###

    return lower, upper

In [65]:
# Test your code

# Compute both intervals at the default alpha ...
c_lower, c_upper = confidence_interval_proportion(control_metrics)
v_lower, v_upper = confidence_interval_proportion(variation_metrics)

# ... then report them in the same order
print(f"Confidence interval for control group: [{c_lower:.3f}, {c_upper:.3f}]\n")
print(f"Confidence interval for variation group: [{v_lower:.3f}, {v_upper:.3f}]")

Confidence interval for control group: [0.115, 0.134]

Confidence interval for variation group: [0.142, 0.162]


In [67]:
# Pass the z-statistic and decision functions defined above to the dashboard helper
# from utils; the recorded output shows that it renders interactive widgets.
utils.AB_test_dashboard(z_statistic_diff_proportions, reject_nh_z_statistic)

interactive(children=(IntText(value=4632, description='Users A:'), IntText(value=576, description='Conversions…