# Chapter 2: A/B Testing: Evaluating a Change to the System 

In [None]:
import numpy as np
import scipy
import scipy.stats
import matplotlib as mpl
import matplotlib.pyplot as plt

In [None]:
# Render every figure at 300 dpi.
mpl.rcParams['figure.dpi'] = 300

# Grayscale palette, ordered from darkest to lightest, shared by the figures.
clr1, clr2, clr3, clr4 = "#333333", "#777777", "#AAAAAA", "#DDDDDD"
clrs = [clr1, clr2, clr3, clr4]

# Arrow styling handed to plt.annotate(arrowprops=...).
arrow_props = dict(width=1, color=clr1, headwidth=5, headlength=7)


# Directory the figure files are written to (an absolute, machine-specific path).
fig_dir = "/Users/dsweet2/Desktop/Tuning Up/Chapter 2/"


def save_fig_named(name):
    """Save the current figure as both ``<name>.eps`` and ``<name>.png``.

    The layout is tightened once before the files are written.
    """
    plt.tight_layout()
    for suffix in ("eps", "png"):
        target = f"{name}.{suffix}"
        plt.savefig(target)


def save_fig(fig_num):
    """Save the current figure under the numbered file name used for this chapter."""
    stem = f"{fig_dir}/CH02_F{fig_num:02d}_sweet"
    save_fig_named(stem)

In [None]:
def horizonal_line(y0):
    """Draw a thin dashed horizontal line at height ``y0`` across the current axes.

    Autoscaling is turned off before plotting so that adding the line does
    not move the existing axis limits.
    """
    x_min, x_max = plt.axis()[:2]
    plt.autoscale(False)
    plt.plot([x_min, x_max], [y0, y0], '--', linewidth=1, color=clr3)

## 2.1	Design I: Randomize to remove measurement bias

In [None]:
# Booleans and numbers convert to one another:
#  True/False become 1.0/0.0, and 0/1 become False/True.
print(float(True), float(False))
print(bool(0), bool(1))

### 2.1.1	A problematic design

In [None]:
def cost(strategy_A, server_1):
    """Return the simulated cost of one measurement.

    The cost starts at 10, rises by 1 when ``strategy_A`` is truthy and
    falls by 2 when ``server_1`` is truthy.
    """
    base_cost = 10
    strategy_effect = float(strategy_A)
    server_effect = 2*float(server_1)
    return base_cost + strategy_effect - server_effect

In [None]:
# Example: cost of running strategy A on server 1.
cost(strategy_A=True, server_1=True)

In [None]:
def biased_experiment():
    """Return cost_B - cost_A when A is always measured on server 1 and B never is.

    Because the strategy and the server change together, the server's
    effect on cost is mixed into the returned difference.
    """
    measured = {
        "A": cost(strategy_A=True, server_1=True),
        "B": cost(strategy_A=False, server_1=False),
    }
    return measured["B"] - measured["A"]

In [None]:
# Difference cost_B - cost_A as measured by the biased design.
biased_experiment()

### 2.1.2	An unbiased design

In [None]:
def unbiased_experiment():
    """Return cost_B - cost_A with each strategy measured once on each server.

    Averaging each strategy's two measurements gives both strategies the
    same mix of servers before the difference is taken.
    """
    costs_A = [cost(strategy_A=True, server_1=on_server_1)
               for on_server_1 in (True, False)]
    costs_B = [cost(strategy_A=False, server_1=on_server_1)
               for on_server_1 in (False, True)]
    cost_A = sum(costs_A)/2
    cost_B = sum(costs_B)/2
    return cost_B - cost_A

In [None]:
# Difference cost_B - cost_A as measured by the balanced (unbiased) design.
unbiased_experiment()

In [None]:
def customer_order_is_for_ABC():
    """Return True or False at random, from one draw of np.random.randint(2)."""
    coin = np.random.randint(2)
    return bool(coin)
def randomized_experiment():
    """Return cost_B - cost_A with the server picked at random for each measurement.

    The random draw for strategy A is made before the one for strategy B.
    """
    server_for_A = customer_order_is_for_ABC()
    cost_A = cost(strategy_A=True, server_1=server_for_A)

    server_for_B = customer_order_is_for_ABC()
    cost_B = cost(strategy_A=False, server_1=server_for_B)

    return cost_B - cost_A

In [None]:
# Seed the generator so the three printed measurements are reproducible;
#  each call draws fresh random server assignments.
np.random.seed(17)
print (randomized_experiment())
print (randomized_experiment())
print (randomized_experiment())

## 2.2	Design II: Replicate to reduce variation

### 2.2.1	Replication reduces variation

In [None]:
# Example: ten values, each drawn at random from {-1, 1}.
np.random.choice([-1,1], size=(10,))

In [None]:
def cost_complex(strategy_A, num_nuisance_factors):
    """Return a simulated cost perturbed by many random nuisance factors.

    Each nuisance factor is drawn as -1 or +1; their sum divided by 20 is
    added to 1.0 (strategy A) or 0.0 (otherwise).
    """
    nuisance = np.random.choice([-1, 1], size=(num_nuisance_factors,))
    nuisance_effect = nuisance.sum()/20
    return float(strategy_A) + nuisance_effect

def randomized_experiment_complex(num_nuisance_factors):
    """Return one individual measurement of cost_B - cost_A.

    Strategy A's cost is simulated first, then strategy B's, each with its
    own independent set of nuisance factors.
    """
    costs = [cost_complex(uses_A, num_nuisance_factors)
             for uses_A in (True, False)]
    cost_A, cost_B = costs
    return cost_B - cost_A

In [None]:
# Three individual measurements, each with 100 nuisance factors;
#  seeded so the printed values are reproducible.
np.random.seed(17)
print (randomized_experiment_complex(num_nuisance_factors=100))
print (randomized_experiment_complex(num_nuisance_factors=100))
print (randomized_experiment_complex(num_nuisance_factors=100))

In [None]:
# Record 10,000 individual measurements of cost_B - cost_A.
# This array is reused by later cells (bootstrap_mean, se_vs_N).
np.random.seed(17);
data_rec_10000 = np.array([
    randomized_experiment_complex(num_nuisance_factors=100)
                           for _ in range(10000)])

In [None]:
# Lower and upper ends of the interval: sample mean -/+ two standard deviations.
print (data_rec_10000.mean() - 2*data_rec_10000.std())
print (data_rec_10000.mean() + 2*data_rec_10000.std())

In [None]:
# Histogram (15 bins) of the individual measurements; saved as figure 7.
plt.hist(data_rec_10000, 15, color=clr1);
plt.xlabel(r'$cost_B - cost_A$')
plt.ylabel('count');
save_fig(7)

In [None]:
def aggregate_measurement(num_measurements):
    """Return the mean of ``num_measurements`` freshly simulated individual measurements.

    Every individual measurement uses 100 nuisance factors.
    """
    measurements = []
    for _ in range(num_measurements):
        measurements.append(
            randomized_experiment_complex(num_nuisance_factors=100))
    samples = np.array(measurements)
    return samples.mean()

In [None]:
# Three aggregate measurements, each the mean of 10 individual measurements;
#  seeded so the printed values are reproducible.
np.random.seed(17);
print (aggregate_measurement(10))
print (aggregate_measurement(10))
print (aggregate_measurement(10))

In [None]:
def bootstrap_mean(data, num_measurements):
    """Return the mean of ``num_measurements`` entries resampled from ``data``.

    Entries are chosen at random with replacement (indices come from
    np.random.randint), so no new measurements are simulated. This is used
    only to speed up figure generation in this notebook: it avoids the
    time needed to generate new data on every call.
    """
    num_available = data.shape[0]
    picks = np.random.randint(num_available, size=(num_measurements,))
    resampled = data[picks]
    return resampled.mean()

In [None]:
# 10,000 aggregate measurements, each the mean of 10 resampled individual measurements.
np.random.seed(17);
data_10 = np.array([bootstrap_mean(data_rec_10000, 10) for _ in range(10000)])

In [None]:
# Overlay the histogram of single measurements with that of 10-measurement
#  averages; saved as figure 8.
plt.hist(data_rec_10000,15,color=clr1);
plt.hist(data_10,15,color=clr2);
plt.xlabel(r'$cost_B - cost_A$')
plt.ylabel('count');
plt.legend(['single measurement', 'average of\n10 measurements'], fontsize=8);
save_fig(8)

In [None]:
# 10,000 aggregate measurements, each the mean of 100 resampled individual measurements.
np.random.seed(17);
data_100 = np.array([bootstrap_mean(data_rec_10000, 100) for _ in range(10000)])

In [None]:
# Overlay histograms of single measurements, 10-measurement averages and
#  100-measurement averages; saved as figure 9.
plt.hist(data_rec_10000,15,color=clr1);
plt.hist(data_10,15,color=clr2);
# NOTE(review): this literal gray is not one of the clr1..clr4 palette values.
plt.hist(data_100,15,color="#BBBBBB");
plt.xlabel(r'$cost_B - cost_A$')
plt.ylabel('count');
plt.legend(['single measurement', 'average of\n10 measurements',
           'average of\n100 measurements'], fontsize=8);
save_fig(9)

### 2.2.2	Quantify variation with standard error

#### ESTIMATE SD(1)

In [None]:
def calc_SD1():
    """Estimate SD(1) from 1,000 simulated measurements of strategy A alone.

    Returns the standard deviation of strategy A's cost multiplied by
    sqrt(2).
    """
    num_measurements = 1000
    measurements_A = np.array([
        cost_complex(strategy_A=True, num_nuisance_factors=100)
        for _ in range(num_measurements)
    ])
    # Presumably the sqrt(2) converts the spread of one cost into the spread
    #  of a difference of two independent costs (cost_B - cost_A) -- confirm.
    return np.sqrt(2)*measurements_A.std()


np.random.seed(17)
calc_SD1()

## 2.3	Design III: Determine the number of individual measurements to take

In [None]:
def se_vs_N(data_rec_10000, expectation, hline=None, histogram=True, N_range=(1, 10000), k=None):
    """Plot standard-error bands around ``expectation`` as a function of N.

    Parameters
    ----------
    data_rec_10000 : array of individual measurements; its standard
        deviation sets the band width, and it is the pool that
        bootstrap_mean() resamples from.
    expectation : value the bands (and the resampled points) are centered on.
    hline : if not None, a dashed horizontal line is drawn at this height.
    histogram : if True, also scatter 10 resampled aggregate measurements
        for every N (slow for large ranges).
    N_range : (start, stop) passed to np.arange; ``stop`` is excluded.
    k : if not None, additionally draw bands at +/- k standard errors and
        add a legend naming them.
    """
    N = np.arange(N_range[0], N_range[1])
    if histogram:
        data = []
        for n in N:
            for _ in range(10):
                m = bootstrap_mean(data_rec_10000, n)
                data.append( (n,m) )
        data = np.array(data)
        # NOTE(review): the "+ 1" presumably recenters the resampled means,
        #  which appear to be centered near -1, onto `expectation` -- confirm.
        plt.plot(data[:,0], expectation + data[:,1] + 1, '.', markersize=1, color=clr3)

    sd = data_rec_10000.std()
    # Show a marker at each N only when the range is short enough to read them.
    fmt = ".--" if N_range[1]-N_range[0] <= 100 else "--"
    fmtk = ":"

    # The plain S.E. band is drawn lighter when the k-band is also shown.
    clr = clr1 if k is None else clr2
    plt.plot(N, expectation + sd/np.sqrt(N), fmt, color=clr, label='-PS + S.E.')
    plt.plot(N, expectation - sd/np.sqrt(N), fmt, color=clr)
    if k is not None:
        # Format k without a leading zero (".84"). Only a leading "0" is
        #  dropped, so a value such as 1.96 keeps its integer digit.
        sk = f"{k:.2f}"
        if sk.startswith("0."):
            sk = sk[1:]
        plt.plot(N, expectation + k*sd/np.sqrt(N), fmtk, color=clr1, label=f'-PS + {sk}xS.E.')
        plt.plot(N, expectation - k*sd/np.sqrt(N), fmtk, color=clr1)
        plt.legend()


    plt.xlabel('number of individual measurements, N')
    plt.ylabel('$cost_B - cost_A$')

    if histogram:
        # Legend entries follow plot order: the scatter first, then the S.E. band.
        plt.legend(['aggregate measurement', 'standard error (S.E.)'], fontsize=8,
                  loc = 'lower right');

    if hline is not None:
        horizonal_line(hline)


In [None]:
# Resampled aggregate measurements and S.E. bands centered on -1; saved as figure 10.
np.random.seed(7)
se_vs_N(data_rec_10000, expectation=-1)
save_fig(10)

### 2.3.1	Minimize measurement costs

In [None]:
# Same plot as the previous figure, with a reference line at 0 and an arrow
#  pointing at the region where cost_B - cost_A > 0; saved as figure 11.
np.random.seed(7)
se_vs_N(data_rec_10000, expectation=-1, hline=0)

plt.annotate("$cost_B - cost_A > 0$", xy=[100, .07],
             xytext=[2000, -.6],
              arrowprops=arrow_props
            )


save_fig(11)

### 2.3.2	Limiting incorrect rejection (false negatives)

In [None]:
# S.E. bands only (no resampled points), centered on -PS, with a reference
#  line at 0; saved as figure 12.
PS = .3
se_vs_N(data_rec_10000, expectation=-PS, hline=0, histogram=False)
save_fig(12)

In [None]:
# Zoom of the previous figure onto N = 1..9 (np.arange excludes the stop value);
#  saved as figure 13.
se_vs_N(data_rec_10000, expectation=-PS, hline=0, histogram=False,
        N_range=(1,10))
save_fig(13)

In [None]:
# N at which one standard error equals PS:
#  SD1/sqrt(N) = PS  =>  N = (SD1/PS)**2.
PS = .3
np.random.seed(17)
SD1 = calc_SD1()
N = (SD1/PS)**2
print(N)

In [None]:
# As figure 13, plus bands at +/- .84 standard errors; saved as figure 14.
se_vs_N(data_rec_10000, expectation=-PS, hline=0, histogram=False,
        N_range=(1,10), k=.84)
save_fig(14)

### 2.3.3	Calculate the false-negatives threshold

#### STEP 1: UNDERSTAND THE DISTRIBUTION OF AGGREGATE MEASUREMENTS

In [None]:
# Sidebar figure: distribution of the mean of N values drawn (with replacement)
#  from a small, skewed set of values, for increasing N.
x = np.array([0, 0, 0, .5, .5, 1])
np.random.seed(17)
fig, axs = plt.subplots(2, 3)
axs=axs.flatten()
for i, N in enumerate([1, 3, 10, 30, 100, 300]):
    axs[i].set(adjustable='datalim')#, aspect='equal')
    axs[i].axis('square')
    axs[i].set_xticks([])
    axs[i].set_yticks([])
    # 1,000 sample means for this N, binned into 10 bins.
    y = [np.random.choice(x, N).mean() for _ in range(1000)]
    n, bins = np.histogram(y,10)
    # Scale so the tallest bar has height 1 in every panel.
    n = n/n.max()
    # Identical axis limits on every panel.
    axs[i].axis([-.1, 1.1, 0, 1.1])
    # Bars are placed at each bin's left edge.
    axs[i].bar(bins[:-1], n, width=.03, color=clr1)
    axs[i].set_title(f"N={N}", fontsize=7);
    
save_fig_named(f"{fig_dir}/CH02_Fsidebar")

#### STEP 2: SIMULATE AGGREGATE MEASUREMENTS

In [None]:
# Compare standard normal samples with samples shifted by 4 and scaled by .5;
#  saved as figure 16. (No seed is set in this cell.)
plt.hist(np.random.normal(size=(10000,)), 25, color=clr1);
plt.hist(4 + .5*np.random.normal(size=(10000,)), 25, color=clr2);
# Keep the automatic x-limits but fix the y-range to 0..1700.
c = plt.axis()
plt.axis([c[0], c[1], 0, 1700])
plt.legend(['np.random.normal(size=(10000,))', '4 + .5*np.random.normal(size=(10000,))'], loc='upper left');
save_fig(16)

In [None]:
def aggregate_measurement_model(N):
    """Simulate one aggregate measurement of N individual measurements.

    The result is a normal draw with mean -PS and standard deviation
    SD1/sqrt(N), using the fixed values PS = .3 and SD1 = .707.
    """
    PS = .3
    SD1 = .707
    standard_error = SD1/np.sqrt(N)
    noise = standard_error*np.random.normal()
    return -PS + noise

In [None]:
# Three simulated aggregate measurements for N=6; seeded for reproducibility.
np.random.seed(17)
print (aggregate_measurement_model(N=6))
print (aggregate_measurement_model(N=6))
print (aggregate_measurement_model(N=6))

In [None]:
def probability_false_negative(N):
    """Estimate how often a simulated aggregate measurement of size N exceeds 0.

    Returns the fraction of 10,000 draws from aggregate_measurement_model(N)
    that are strictly positive.
    """
    num_samples = 10000
    samples = np.array([aggregate_measurement_model(N)
                        for _ in range(num_samples)])
    num_positive = np.count_nonzero(samples > 0)
    return num_positive / len(samples)

In [None]:
# Estimated fraction of simulated aggregate measurements above 0 for N=6.
np.random.seed(17)
probability_false_negative(N=6)

In [None]:
# Same estimate for the smaller sample size N=4.
np.random.seed(17)
probability_false_negative(N=4)

In [None]:
def probability_above_k(mean, standard_deviation, k):
    """Estimate the chance that a normal draw exceeds mean + k*standard_deviation.

    Uses 100,000 samples from a normal distribution with the given mean and
    standard deviation, and returns the fraction above the cutoff.
    """
    draws = mean + standard_deviation*np.random.normal(size=(100000,))
    cutoff = mean + k*standard_deviation
    num_above = np.count_nonzero(draws > cutoff)
    return num_above / len(draws)

In [None]:
# Fraction of normal samples more than k = .84 standard deviations above the mean.
# NOTE(review): the mean passed here is -3, while PS is .3 elsewhere in this
#  notebook; -.3 may have been intended. The cutoff is defined relative to
#  the mean, so the mean's value cancels out of the comparison.
probability_above_k(-3, .707, .84)

In [None]:
def probability_above_k(k):
    """Estimate the chance that a standard normal draw exceeds ``k``.

    Returns the fraction of 100,000 standard normal samples above ``k``.
    This one-argument definition replaces the earlier three-argument one.
    """
    draws = np.random.normal(size=(100000,))
    num_above = np.count_nonzero(draws > k)
    return num_above / len(draws)

In [None]:
# Estimated fraction of standard normal samples above k = .84.
probability_above_k(.84)

#### STEP 3: FIND THE THRESHOLD THAT YIELDS 20% FALSE NEGATIVES

In [None]:
def overlap_fn_fp():
    """Tabulate the false-positive and false-negative bands for N = 1..99.

    With SE = SD1/sqrt(N), the false-positive band is 0 +/- 1.96*SE and the
    false-negative band is -PS +/- .84*SE (SD1 = .707, PS = .3).

    Returns (data, thresh, best_N). ``data`` has one row per N with columns
    (N, upper_fp_alpha, lower_fp_alpha, upper_fn_beta, lower_fn_beta).
    ``best_N`` is the smallest N at which the top of the false-negative band
    no longer exceeds the bottom of the false-positive band, and ``thresh``
    is the midpoint of those two edges at that N. Both are None if no such
    N is found.
    """
    SD1 = .707
    PS = .3
    sizes = np.arange(1, 100)
    SE = SD1 / np.sqrt(sizes)

    upper_fp_alpha = 0 + 1.96*SE
    lower_fp_alpha = 0 - 1.96*SE
    upper_fn_beta = -PS + .84*SE
    lower_fn_beta = -PS - .84*SE
    data = np.column_stack(
        [sizes, upper_fp_alpha, lower_fp_alpha, upper_fn_beta, lower_fn_beta])

    # Positions where the two bands have separated; the first one is kept.
    separated = np.flatnonzero(upper_fn_beta <= lower_fp_alpha)
    if separated.size == 0:
        return data, None, None
    first = separated[0]
    thresh = (upper_fn_beta[first] + lower_fp_alpha[first]) / 2
    best_N = int(sizes[first])
    return data, thresh, best_N

In [None]:
# Band edges for N = 1..99, plus the crossover threshold and the N where it occurs.
data, thresh, best_N = overlap_fn_fp()

In [None]:
# N at which .84 standard errors equals PS:
#  .84*SD1/sqrt(N) = PS  =>  N = (.84*SD1/PS)**2.
PS = .3
SD1 = .707
N = (.84 * SD1 / PS)**2
print (N)

In [None]:
# Shade the false-negative band (columns 3 and 4 of `data`: -PS +/- .84 S.E.)
#  against N, with a reference line at 0; saved as figure 17.
plt.fill_between(data[:,0], data[:,3], data[:,4], color=clr3, alpha=.75, linewidth=1)


plt.xlabel('number of measurements, N')
plt.ylabel('$cost_B - cost_A$')
plt.legend([r'$-.3 \pm .84 S.E$'])
horizonal_line(0)
save_fig(17)

### 2.3.4	Limiting incorrect acceptance (false positives)

In [None]:
# Shade the false-positive band (columns 1 and 2 of `data`: 0 +/- 1.96 S.E.)
#  against N; saved as figure 18.
plt.fill_between(data[:,0], data[:,1], data[:,2], color=clr2, alpha=.75, linewidth=1)


plt.xlabel('number of measurements, N')
plt.ylabel('$cost_B - cost_A$')
plt.legend([r'$0 \pm 1.96 S.E$'])

save_fig(18)

### 2.3.5	Limiting false negatives and false positives simultaneously

In [None]:
# Crossover threshold and the N at which the two bands first separate.
print (thresh, best_N)

In [None]:
# Both bands on one plot, with the crossover point circled; saved as figure 19.
data, thresh, best_N = overlap_fn_fp()
plt.fill_between(data[:,0], data[:,3], data[:,4], color=clr3, alpha=.75, linewidth=1)
plt.fill_between(data[:,0], data[:,1], data[:,2], color=clr2, alpha=.75, linewidth=1)


# An ellipse in data coordinates, 25 times wider than tall; presumably sized
#  so that it renders close to a circle given the axis scales -- confirm.
circle = mpl.patches.Ellipse( (best_N, thresh), .2*25, .2, color=clr1, fill=False)#, transform=plt.gca().transAxes)
plt.gcf().gca().add_artist(circle)

plt.xlabel('number of measurements, N')
plt.ylabel('$cost_B - cost_A$')
plt.legend([r'$-.3 \pm .84 S.E$', r'$0 \pm 1.96 S.E$'],
           fontsize=8,
          loc = 'lower right');

save_fig(19)

## 2.4	Run and analyze the A/B test

### 2.4.1	Run a small-sized A/A test

In [None]:
# Table 2.1
# Two-sided critical value k of Student's t distribution (alpha = .05)
#  for several small-test sizes; one (N_small, k) row per size.
alpha = .05
small_sizes = [10, 30, 100, 300, 1000]
table = np.array([
    (size, scipy.stats.t.ppf(1-alpha/2, df=size))
    for size in small_sizes
])
print(table)

In [None]:
# Same critical value on a fine grid, N_small = 10, 20, ..., 990,
#  stored as (N_small, k) rows for plotting.
alpha = .05
data = np.array([
    (N_small, scipy.stats.t.ppf(1-alpha/2, df=N_small))
    for N_small in np.arange(10, 1000, 10)
])

In [None]:
# Critical value k as a function of the small-test size; saved as figure 21.
# Column 0 (N_small) is on the x-axis and column 1 (k) on the y-axis, so the
#  axis labels follow that order. The dots mark the Table 2.1 entries.
plt.plot(data[:,0], data[:,1], '-', color=clr1);
plt.plot(table[:,0], table[:,1], '.', color=clr1)
plt.xlabel('$N_{small}$')
plt.ylabel('k')
# Reference line at k = 1.96.
horizonal_line(1.96)
save_fig(21)

### 2.4.2	Run a small-sized A/B test

### 2.4.3	Run and analyze the full-sized A/B test

## 2.5	Early stopping produces invalid conclusions

In [None]:
def t_stat_vs_n():
    """Simulate an A/A test and track the t statistic after every measurement.

    Standard normal measurements are drawn one at a time for n = 1..99.
    After the n-th draw the statistic sqrt(n)*mean/std of the first n
    measurements is recorded (std is NumPy's default ``.std()``); the entry
    for n = 1 is NaN. The threshold for each n is the two-sided alpha = .05
    critical value of Student's t with df=n.

    Returns (t_stat, threshold), two arrays with one entry per n.
    """
    alpha = .05
    num_individual_measurements = 100
    counts = range(1, num_individual_measurements)

    samples = np.empty(len(counts))
    t_stat = np.full(len(counts), np.nan)
    for n in counts:
        samples[n-1] = np.random.normal()
        if n == 1:
            # A single measurement has no spread; leave the NaN in place.
            continue
        seen = samples[:n]
        t_stat[n-1] = np.sqrt(n) * seen.mean() / seen.std()

    threshold = np.array([scipy.stats.t.ppf(1-alpha/2, df=n) for n in counts])
    return t_stat, threshold

In [None]:
# One simulated A/A test: the running t statistic against the +/- threshold;
#  saved as figure 22.
seed = 179
np.random.seed(seed)

t_stat, threshold = t_stat_vs_n()

# The line color comes from the `color` keyword only; also naming a color in
#  the format string ('--k') makes matplotlib warn about a redundant color.
plt.plot(threshold, '--', color=clr1)
plt.plot(t_stat, color=clr2, linewidth=1);
plt.plot(-threshold, '--', color=clr1)
plt.xlabel('n, index to individual measurement')
plt.legend(['threshold, k', 't statistic'])

# Two-sided check, matching the +/- thresholds drawn above: report the
#  indices where |t| crosses the threshold, and whether the final value does.
i = np.where(abs(t_stat) > threshold)[0]
print (i, abs(t_stat[-1]) > threshold[-1])

save_fig(22)

In [None]:
def false_positive_rates():
    """Estimate false-positive rates with and without early stopping.

    Runs 10,000 simulated A/A tests via t_stat_vs_n(). A test counts as a
    false positive "at end" when |t| exceeds the threshold at the final
    measurement, and as a false positive "early stopping" when |t| exceeds
    the threshold at any measurement after the first.

    Returns (rate_at_end, rate_early_stopping).
    """
    num_ab_tests = 10000
    fp_at_end = 0
    fp_early_stopping = 0
    for _ in range(num_ab_tests):
        t_stat, threshold = t_stat_vs_n()
        exceeds = abs(t_stat) > threshold
        fp_at_end += int(exceeds[-1])
        # The first entry (a single measurement) is excluded from the check.
        fp_early_stopping += int(exceeds[1:].any())
    return fp_at_end / num_ab_tests, fp_early_stopping / num_ab_tests
        

In [None]:
# (rate at end, rate with early stopping) over 10,000 simulated A/A tests.
np.random.seed(17); false_positive_rates()

In [None]:
# Faster versions of t_stat_vs_n() and false_positive_rates(), used to
#  generate the data for the false-positive-rate-vs-N figure. The original
#  versions were easier to use for teaching, but too slow to generate the
#  figure in a reasonable amount of time.
def t_stat_vs_sample_fast(N):
    """Return the running t statistic of N standard normal measurements.

    Entry i of the result is sqrt(n)*mean/std of the first n = i + 2
    measurements, so the result has N - 1 entries (n = 2..N). The value for
    n = 1 is omitted because a single measurement has zero spread.

    The running mean and standard deviation come from cumulative sums, so
    each prefix of n measurements is divided by n. (Dividing the sum of the
    first k measurements by k + 1 would bias both the mean and the spread.)
    """
    measurements = np.random.normal(size=(N,))
    n = np.arange(1, N+1)
    sx = np.cumsum(measurements)
    sxx = np.cumsum(measurements**2)
    # Drop the n = 1 prefix before dividing by the standard deviation.
    n, sx, sxx = n[1:], sx[1:], sxx[1:]
    mu = sx/n
    sd = np.sqrt(sxx/n - mu**2)
    t_stats = np.sqrt(n) * mu/sd
    return t_stats

def false_positive_rates_fast(N):
    """Estimate false-positive rates for A/A tests of N measurements.

    Runs 10,000 simulated tests via t_stat_vs_sample_fast(N) and compares
    |t| with the fixed critical value 1.96.

    Returns (rate_at_end, rate_early_stopping): the fraction of tests whose
    final |t| exceeds 1.96, and the fraction whose |t| exceeds 1.96 at any
    point.
    """
    num_ab_tests = 10000
    critical_value = 1.96
    fp_at_end = 0
    fp_early_stopping = 0
    for _ in range(num_ab_tests):
        exceeds = abs(t_stat_vs_sample_fast(N)) > critical_value
        fp_at_end += int(exceeds[-1])
        fp_early_stopping += int(exceeds.any())
    return fp_at_end / num_ab_tests, fp_early_stopping / num_ab_tests
        

In [None]:
# (rate at end, rate with early stopping) for tests of 1,000 measurements.
np.random.seed(17)
false_positive_rates_fast(1000)

In [None]:
# Early-stopping false-positive rate for test sizes from 10 to 100,000.
# Only element [1] of the returned pair (the early-stopping rate) is kept.
np.random.seed(17)
fpr = []
for N in [10, 30, 100, 300, 1000, 3000, int(1e4), int(3e4), int(1e5)]:
    fp_N = false_positive_rates_fast(N)[1]
    print (N, fp_N)
    fpr.append( (N, fp_N)) 

In [None]:
# Early-stopping false-positive rate against N on a logarithmic x-axis;
#  saved as figure 23.
fpr = np.array(fpr)
plt.semilogx(fpr[:,0], fpr[:,1], '.--', color=clr1);
plt.xlabel('N')
plt.ylabel('false positive rate')
save_fig(23)