# Chapter 2: A/B testing: Evaluating a modification of your system 

In [None]:
import numpy as np
import scipy
import scipy.stats
import matplotlib as mpl
import matplotlib.pyplot as plt
import tuningup

In [None]:
# Render figures at 300 dpi.
mpl.rcParams['figure.dpi']= 300
# Chapter helper from the project-local `tuningup` module; later cells use it
# for plot colors (tu.clr1, ...), tu.save_fig, tu.vertical_line and
# tu.arrow_props.
tu = tuningup.TuningUp(chapter=2)

## 2.1	Run an ad hoc experiment

### 2.1.1	Simulate the trading system

In [None]:
def trading_system(exchange):
    """Simulate the execution cost of a single trade on ``exchange``.

    The cost is a fixed per-exchange base (12 for "ASDAQ", 10 for "BYSE")
    plus one standard-normal draw from NumPy's global random state, so the
    result is reproducible after ``np.random.seed``.

    Args:
        exchange: Name of the exchange, either "ASDAQ" or "BYSE".

    Returns:
        The simulated execution cost as a float.

    Raises:
        ValueError: If ``exchange`` is not a known exchange.
    """
    if exchange == "ASDAQ":
        execution_cost = 12
    elif exchange == "BYSE":
        execution_cost = 10
    else:
        # Without this branch an unknown name fell through and surfaced as
        # an UnboundLocalError on the `+=` below.
        raise ValueError("unknown exchange: %r" % (exchange,))
    execution_cost += np.random.normal()
    return execution_cost

In [None]:
# Seed NumPy's global RNG so the single simulated cost below is reproducible.
np.random.seed(17)
trading_system("ASDAQ")

### 2.1.2	Compare execution costs

In [None]:
# One noisy measurement per exchange (ASDAQ first) from a fixed seed.
np.random.seed(17)
for exchange in ("ASDAQ", "BYSE"):
    print(trading_system(exchange))

In [None]:
# Same comparison as with seed 17, but with a different seed.
np.random.seed(18)
for exchange in ("ASDAQ", "BYSE"):
    print(trading_system(exchange))

#### Variation

In [None]:
# Draw 1000 individual measurements per exchange (ASDAQ first, so the seeded
# RNG stream is consumed in a fixed order) and overlay their histograms.
np.random.seed(17)
a = np.array([trading_system("ASDAQ") for _ in range(1000)])
b = np.array([trading_system("BYSE") for _ in range(1000)])
# 25 bins each; the trailing semicolons suppress the notebook's echo of the
# value returned by plt.hist.
plt.hist(a, 25, color=tu.clr1);
plt.hist(b, 25, color=tu.clr2);
# Legend entries follow the plotting order above.
plt.legend(['ASDAQ', 'BYSE'])
plt.xlabel('execution cost (mips)')
tu.save_fig(2)

In [None]:
# Fraction of paired draws in which the BYSE cost came out below the ASDAQ cost.
byse_is_cheaper = b < a
i = np.flatnonzero(byse_is_cheaper)
i.size / b.size

In [None]:
# Average 100 individual measurements per exchange, starting from a fixed seed.
np.random.seed(17)
for exchange in ("ASDAQ", "BYSE"):
    costs = np.array([trading_system(exchange) for _ in range(100)])
    print(costs.mean())

In [None]:
# Repeat the 100-measurement averages without reseeding, so the RNG stream
# simply continues and the printed means differ from the seeded run.
for exchange in ("ASDAQ", "BYSE"):
    costs = np.array([trading_system(exchange) for _ in range(100)])
    print(costs.mean())

#### Bias

In [None]:
def trading_system_tod(exchange, time_of_day):
    """Simulate an execution cost that also depends on the time of day.

    Adds a fixed bias (0.0 in the "morning", 2.5 in the "afternoon") to the
    cost returned by ``trading_system(exchange)``.

    Args:
        exchange: Exchange name, forwarded to ``trading_system``.
        time_of_day: Either "morning" or "afternoon".

    Returns:
        The biased simulated execution cost.

    Raises:
        ValueError: If ``time_of_day`` is not "morning" or "afternoon".
    """
    if time_of_day == "morning":
        bias = 0.0
    elif time_of_day == "afternoon":
        bias = 2.5
    else:
        # Without this branch an unknown value fell through and surfaced as
        # an UnboundLocalError on the return line.
        raise ValueError("unknown time_of_day: %r" % (time_of_day,))
    return bias + trading_system(exchange)

In [None]:
# Same exchange measured at two times of day: any gap between the two means
# comes from the time-of-day bias plus noise.
np.random.seed(17)
for tod in ("morning", "afternoon"):
    costs = np.array([trading_system_tod("ASDAQ", tod) for _ in range(100)])
    print(costs.mean())

In [None]:
# Confounded comparison: ASDAQ is only measured in the morning and BYSE only
# in the afternoon, 100 measurements each.
np.random.seed(17)
for exchange, tod in (("ASDAQ", "morning"), ("BYSE", "afternoon")):
    costs = np.array([trading_system_tod(exchange, tod) for _ in range(100)])
    print(costs.mean())

In [None]:
# Same confounded comparison with 1000 measurements each and no reseeding.
for exchange, tod in (("ASDAQ", "morning"), ("BYSE", "afternoon")):
    costs = np.array([trading_system_tod(exchange, tod) for _ in range(1000)])
    print(costs.mean())

In [None]:
def randomized_measurement():
    """Measure both exchanges with random assignment across times of day.

    For each of "morning" and "afternoon", makes 100 trades; each trade is
    sent to ASDAQ or BYSE according to ``np.random.randint(2)`` (0 selects
    ASDAQ).

    Returns:
        A tuple ``(asdaq_mean, byse_mean)`` of the mean simulated costs.
    """
    costs = {"ASDAQ": [], "BYSE": []}
    for tod in ("morning", "afternoon"):
        for _ in range(100):
            # Draw the assignment first, then the cost, to keep the RNG
            # consumption order fixed.
            exchange = "ASDAQ" if np.random.randint(2) == 0 else "BYSE"
            costs[exchange].append(trading_system_tod(exchange, tod))
    asdaq_mean = np.array(costs["ASDAQ"]).mean()
    byse_mean = np.array(costs["BYSE"]).mean()
    return asdaq_mean, byse_mean

In [None]:
# Reproducible run of the randomized experiment; displays the
# (ASDAQ mean, BYSE mean) tuple.
np.random.seed(17)
randomized_measurement()

### 2.1.3	Mitigate variation with replication

In [None]:
# Three replicated ASDAQ measurements from a fixed seed; `c` is reused by the
# following cells.
np.random.seed(17)
c = np.array([trading_system("ASDAQ") for _ in range(3)])
print(c)

In [None]:
# Aggregate measurement: the average of the three samples.
c.mean()

In [None]:
# Deviation of each sample from 12, the ASDAQ base cost set in trading_system.
print (c-12)

In [None]:
# Deviation of the aggregate (mean) from the ASDAQ base cost of 12.
c.mean()-12

In [None]:
# Root-mean-square deviation of the samples from the ASDAQ base cost of 12.
np.sqrt(((c-12)**2).mean())

In [None]:
# Root-mean-square deviation about the sample mean instead of the constant 12;
# this is the hand-written form of the population standard deviation.
np.sqrt(((c-c.mean())**2).mean())

In [None]:
# NumPy's built-in standard deviation (default ddof=0), for comparison with
# the manual root-mean-square computation.
c.std()

In [None]:
def aggregate_measurement(exchange, num_individual_measurements):
    """Average several individual simulated costs for one exchange.

    Args:
        exchange: Exchange name, forwarded to ``trading_system``.
        num_individual_measurements: How many individual costs to draw.

    Returns:
        The mean of the drawn costs.
    """
    samples = []
    for _ in range(num_individual_measurements):
        samples.append(trading_system(exchange))
    return np.array(samples).mean()

In [None]:
# Overlay the distribution of single ASDAQ measurements with that of
# 3-measurement averages (1000 draws of each).
plt.hist(np.array([trading_system("ASDAQ") for _ in range(1000)]), color=tu.clr1)
plt.hist(np.array([aggregate_measurement("ASDAQ", 3) for _ in range(1000)]), color=tu.clr2);
# Unit label fixed from 'mps' to 'mips' to match the other figures in this
# chapter.
plt.xlabel('execution cost (mips)')
# Print the current axis limits (xmin, xmax, ymin, ymax).
print (plt.axis())
plt.legend(['individual', 'aggregate of 3'])
tu.save_fig(3)

In [None]:
# Overlay single measurements with averages of 3, 30 and 300 measurements
# (1000 draws of each) to compare their spreads.
plt.hist(np.array([trading_system("ASDAQ") for _ in range(1000)]), color=tu.clr1)
plt.hist(np.array([aggregate_measurement("ASDAQ", 3) for _ in range(1000)]), color=tu.clr2);
plt.hist(np.array([aggregate_measurement("ASDAQ", 30) for _ in range(1000)]), color=tu.clr3);
plt.hist(np.array([aggregate_measurement("ASDAQ", 300) for _ in range(1000)]), color=tu.clr4);
# Unit label fixed from 'mps' to 'mips' to match the other figures in this
# chapter.
plt.xlabel('execution cost (mips)')
plt.legend(['individual', 'aggregate of 3', 'aggregate of 30', 'aggregate of 300'])
tu.save_fig(4)

In [None]:
# One 300-measurement aggregate per exchange, from a fixed seed.
np.random.seed(17)
for exchange in ("ASDAQ", "BYSE"):
    print(aggregate_measurement(exchange, 300))

In [None]:
# Repeat the 300-measurement aggregates without reseeding.
for exchange in ("ASDAQ", "BYSE"):
    print(aggregate_measurement(exchange, 300))

In [None]:
# Distributions of 300-measurement aggregates for the two exchanges
# (1000 draws of each).
plt.hist(np.array([aggregate_measurement("ASDAQ", 300) for _ in range(1000)]), color=tu.clr1);
plt.hist(np.array([aggregate_measurement("BYSE", 300) for _ in range(1000)]), color=tu.clr2);

# Pin the x-range to fixed limits while keeping the automatic y-range.
# Recorded axis tuple the limits were taken from:
# (6.6141395990492065, 13.526618148811357, 0.0, 270.9)
# A dedicated name is used so the sample array `c` from the replication cells
# is not overwritten.
axis_limits = plt.axis()
plt.axis([6.614, 13.52, axis_limits[2], axis_limits[3]])

# Unit label fixed from 'mps' to 'mips' to match the other figures in this
# chapter.
plt.xlabel('execution cost (mips)')

plt.legend(['ASDAQ', 'BYSE'])
tu.save_fig(5)

In [None]:
# Build 1000 aggregate measurements at each of three sizes (3, 30, 300
# individual measurements) from a fixed seed ...
np.random.seed(17)
a3 = np.array([aggregate_measurement("ASDAQ", 3)
               for _ in range(1000)])
a30 = np.array([aggregate_measurement("ASDAQ", 30)
                for _ in range(1000)])
a300 = np.array([aggregate_measurement("ASDAQ", 300)
                 for _ in range(1000)])

# ... and print the standard deviation of the aggregates at each size.
print (a3.std(), a30.std(), a300.std())

In [None]:
def aggregate_measurement_with_se(exchange, num_individual_measurements):
    """Return an aggregate measurement together with its standard error.

    Args:
        exchange: Exchange name, forwarded to ``trading_system``.
        num_individual_measurements: How many individual costs to draw.

    Returns:
        A tuple ``(mean, se)`` where ``mean`` is the average of the drawn
        costs and ``se`` is their standard deviation (NumPy default,
        ddof=0) divided by ``sqrt(num_individual_measurements)``.
    """
    n = num_individual_measurements
    samples = np.array([trading_system(exchange) for _ in range(n)])
    mean = samples.mean()
    standard_error = samples.std() / np.sqrt(n)
    return mean, standard_error

In [None]:
# (aggregate, standard error) for each exchange from 300 measurements.
np.random.seed(17)
for exchange in ("ASDAQ", "BYSE"):
    print(aggregate_measurement_with_se(exchange, 300))

In [None]:
# Presumably the BYSE aggregate plus one standard error, with both numbers
# copied by hand from the previous cell's output -- TODO confirm.
10.05 + .057

In [None]:
# Presumably the ASDAQ aggregate minus one standard error, with both numbers
# copied by hand from an earlier cell's output -- TODO confirm.
12.00 - .060

## 2.2	Run an A/B test

In [None]:
np.random.seed(17)
plt.hist(np.array([aggregate_measurement_with_se("ASDAQ", 10)[0] for _ in range(1000)]), 20, color=tu.clr1)
plt.xlabel('potential ASDAQ\naggregate measurement\nvalues (mips)')
tu.vertical_line(102 + 10)
plt.annotate("actual\naggregate\nmeasurement", xy=[112.3, 90],
             xytext=[120, 110],
             arrowprops=tu.arrow_props
            )
tu.save_fig(7)

In [None]:
# Take one 10-measurement aggregate (with standard error) per exchange,
# ASDAQ first so the seeded RNG stream is consumed in a fixed order.
np.random.seed(17)
num_individual_measurements = 10
asdaq, se_asdaq = aggregate_measurement_with_se("ASDAQ", num_individual_measurements)
byse, se_byse = aggregate_measurement_with_se("BYSE", num_individual_measurements)
# Difference of the aggregates (negative when BYSE is cheaper) ...
delta = byse - asdaq
# ... and its standard error: the two standard errors added in quadrature.
se_delta = np.sqrt(se_byse**2 + se_asdaq**2)

In [None]:
# Histogram (30 bins) of 10000 standard-normal draws.
z = np.random.normal(size=(10000,))
plt.hist(z, 30, color=tu.clr1)
plt.xlabel('z')
# Mark only the lower tail at z = -1.64 and label it "5%". The mirrored
# upper-tail marker (tu.vertical_line(1.64)) and its annotation
# (xy=[2.25, 112], xytext=[3, 600]) are disabled.
tu.vertical_line(-1.64)
plt.annotate("5%", xy=[-2.25, 112],
             xytext=[-3, 600],
             arrowprops=tu.arrow_props
            )

tu.save_fig(8)

In [None]:
def ab_test_design(sd1_delta, PS):
    """Return the number of individual measurements for an A/B test.

    Computes ``(1.64 * sd1_delta / PS) ** 2`` and rounds it up.

    Args:
        sd1_delta: Standard deviation of the difference between single
            measurements of the two variants.
        PS: Size of the difference the test should detect.

    Returns:
        The required count, rounded up with ``np.ceil`` (a float).
    """
    scaled_threshold = 1.64 * sd1_delta / PS
    return np.ceil(scaled_threshold ** 2)

In [None]:
# Estimate the per-measurement standard deviation from 100 simulated ASDAQ
# costs (fixed seed).
np.random.seed(17)
sd1_asdaq = np.array([trading_system("ASDAQ") for _ in range(100)]).std()
# BYSE is not sampled here; its standard deviation is taken to equal ASDAQ's.
sd1_byse = sd1_asdaq
# Standard deviation of the difference of two independent single measurements.
sd1_delta = np.sqrt(sd1_asdaq**2 + sd1_byse**2)
# PS is presumably the practical-significance threshold -- TODO confirm.
PS = 1.0
ab_test_design(sd1_delta, PS)

#### False negatives

In [None]:
# Two side-by-side panels: (a) a standard-normal histogram with the -1.64
# marker, (b) the same plus a copy shifted left by 1.64 + .84.
fig, (ax1, ax2) = plt.subplots(1,2)

# Panel (a): 10000 standard-normal draws, 30 bins.
z = np.random.normal(size=(10000,)) 
ax1.hist(z, 30, color=tu.clr1)
ax1.set_xlabel('z')
tu.vertical_line(-1.64, ax=ax1)

# Label the region left of the marker as "5%".
ax1.annotate("5%", xy=[-2.25, 112],
             xytext=[-3.7, 600],
             arrowprops=tu.arrow_props
            )

ax1.text(-3.5, 900,'(a)')
tu.aspect_square(ax1)


# Panel (b): a fresh set of draws, plotted semi-transparently so that the
# shifted copy added below remains visible where the two overlap.
z = np.random.normal(size=(10000,)) 
ax2.hist(z, 30, color=tu.clr1, alpha=.5)
ax2.set_xlabel('z')
tu.vertical_line(-1.64, clr=tu.clr1, ax=ax2)
ax2.annotate("5%", xy=[-2.25, 112],
             xytext=[-6, 600],
             arrowprops=tu.arrow_props
            )

ax2.annotate("20%", xy=[0, 112],
             xytext=[3, 600],
             arrowprops=tu.arrow_props
            )
# Same draws shifted left by 1.64 + .84 (= 2.48, the multiplier that
# ab_test_design2 uses).
ax2.hist(-(1.64 + .84) + z, 30, color=tu.clr2, alpha=.5)
# Fix the x-range and the lower y-limit; keep the automatic upper y-limit.
c = ax2.axis()
ax2.axis([-8, 5.5, 0, c[3]])
ax2.text(-7, 800,'(b)')
tu.aspect_square(ax2)

tu.save_fig(10)

In [None]:
# Sum of the two offsets used for the shifted histogram in the false-negatives
# figure; it matches the 2.48 multiplier hard-coded in ab_test_design2.
1.64 + .84

In [None]:
def ab_test_design2(sd1_delta, PS):
    """Return the number of individual measurements for an A/B test.

    Same form as ``ab_test_design`` but with the multiplier 2.48 in place
    of 1.64: computes ``(2.48 * sd1_delta / PS) ** 2`` and rounds it up.

    Args:
        sd1_delta: Standard deviation of the difference between single
            measurements of the two variants.
        PS: Size of the difference the test should detect.

    Returns:
        The required count, rounded up with ``np.ceil`` (a float).
    """
    scaled_threshold = 2.48 * sd1_delta / PS
    return np.ceil(scaled_threshold ** 2)

In [None]:
# Same inputs as the ab_test_design run, now sized with ab_test_design2
# (multiplier 2.48).
np.random.seed(17)
sd1_asdaq = np.array([trading_system("ASDAQ") for _ in range(100)]).std()
# BYSE is not sampled here; its standard deviation is taken to equal ASDAQ's.
sd1_byse = sd1_asdaq
# Standard deviation of the difference of two independent single measurements.
sd1_delta = np.sqrt(sd1_asdaq**2 + sd1_byse**2)
PS = 1.0
ab_test_design2(sd1_delta, PS)

In [None]:
def measure(min_individual_measurements):
    """Collect randomized individual measurements from both exchanges.

    Each trade is assigned to ASDAQ or BYSE by ``np.random.randint(2)``
    (0 selects ASDAQ). Sampling continues until BOTH exchanges have at
    least ``min_individual_measurements`` measurements, so one arm may end
    up with more than the minimum.

    Args:
        min_individual_measurements: Minimum number of measurements
            required for each exchange.

    Returns:
        A tuple ``(asdaq, byse)`` of NumPy arrays of simulated costs.
    """
    asdaq = []
    byse = []
    # `or`, not `and`: with `and` the loop stopped as soon as EITHER arm
    # reached the minimum, leaving the other arm short.
    while (len(asdaq) < min_individual_measurements
           or len(byse) < min_individual_measurements):
        if np.random.randint(2) == 0:
            asdaq.append(trading_system("ASDAQ"))
        else:
            byse.append(trading_system("BYSE"))
    return np.array(asdaq), np.array(byse)

In [None]:
def analyze(asdaq, byse):
    """Return the z score of the BYSE-minus-ASDAQ difference in mean cost.

    Args:
        asdaq: NumPy array of individual ASDAQ measurements.
        byse: NumPy array of individual BYSE measurements.

    Returns:
        ``delta / se_delta``, where ``delta`` is the difference of the two
        sample means and ``se_delta`` combines the two standard errors
        (``std / sqrt(n)``, NumPy default ddof=0) in quadrature.
    """
    def mean_and_se(samples):
        # Aggregate measurement and its standard error for one exchange.
        return samples.mean(), samples.std() / np.sqrt(len(samples))

    agg_asdaq, se_asdaq = mean_and_se(asdaq)
    agg_byse, se_byse = mean_and_se(byse)
    return (agg_byse - agg_asdaq) / np.sqrt(se_asdaq**2 + se_byse**2)

In [None]:
# Repeats the earlier ab_test_design2 sizing cell unchanged (same seed, same
# inputs), immediately before the measurement is taken.
np.random.seed(17)
sd1_asdaq = np.array([trading_system("ASDAQ") for _ in range(100)]).std()
# BYSE is not sampled here; its standard deviation is taken to equal ASDAQ's.
sd1_byse = sd1_asdaq
sd1_delta = np.sqrt(sd1_asdaq**2 + sd1_byse**2)
PS = 1.0
ab_test_design2(sd1_delta, PS)

In [None]:
# Run the randomized measurement stage from a fixed seed, passing 16 as the
# per-exchange minimum.
np.random.seed(17)
asdaq, byse = measure(16)

In [None]:
# Observed difference in mean cost (negative when BYSE is cheaper).
byse.mean() - asdaq.mean()

In [None]:
# z score of the observed difference; presumably to be compared with the
# -1.64 threshold marked in the earlier z histograms -- TODO confirm.
analyze(asdaq, byse)