In [1]:
import time
from olsEmpowered import sim_data         as sd
from olsEmpowered import power_estimation as pe
from olsEmpowered import isotonic         as iso
from olsEmpowered import binary_search    as bst

In [2]:
# Benchmark loop: keep drawing random DGPs until 500 of them have been run
# through all three sample-size searches (isotonic interpolation, naive binary
# search, informed binary search). DGPs that fail a screening check are
# removed and do not count towards the total.
dgps_analyzed = 0

while dgps_analyzed < 500:
    start = time.time()

    # Settings are rebuilt on every pass and handed to sd.create_random_dgp.
    dgp_settings = dict(
        max_covariates             = 10,
        permissible_distributions  = ['normal', 'exponential', 'uniform'],
        range_of_normal_loc        = [50, 250],
        range_of_normal_scale      = [5, 25],
        range_of_exponential_scale = [0.5, 4],
        range_of_uniform           = [0, 10],
        range_of_betas             = [0, 2.5],
        range_of_abs_mde           = [0.10, 0.20],
        range_of_noise_loc         = [25.0, 50.0],
        range_of_noise_scale       = [20.00, 35.00],
        sample_size                = 5000000,
    )
    dgp = sd.create_random_dgp(**dgp_settings)
    pe_ob = pe.power_estimation(dgp)
    print("############## Starting value is n = {:,}.    ###########".format(pe_ob.starting_value))
    print("############## Starting adj. R-squared is {}. ###########".format(round(pe_ob.rsquared_adj,2)))

    # Screening: the first failing check (in this order) decides the message.
    if pe_ob.starting_value > 475000:
        rejection_message = "Starting value too high."
    elif pe_ob.starting_value < 500:
        rejection_message = "Starting value too low."
    elif pe_ob.rsquared_adj > 0.95:
        rejection_message = "r-squared too high."
    else:
        rejection_message = None

    if rejection_message is not None:
        # Rejected DGPs are removed together with their meta-data.
        sd.remove_data(dgp, drop_meta_data = True)
        print(rejection_message)
        continue

    # Run the three searches against the same power-estimation object.
    iso_ob = iso.isotonic(pe_ob)
    n1, p1, df1 = iso_ob.isotonic_interpolation()

    naive_bst = bst.binary_search(pe_ob, informed = 0)
    n2, p2, df2 = naive_bst.binary_search()

    informed_bst = bst.binary_search(pe_ob, informed = 1)
    n3, p3, df3 = informed_bst.binary_search()

    # Save each result, dropping the search object as soon as it is saved.
    pe.save_results(df1, iso_ob)
    del iso_ob
    pe.save_results(df2, naive_bst)
    del naive_bst
    pe.save_results(df3, informed_bst)
    del informed_bst

    dgps_analyzed += 1
    # Analysed DGPs are removed with drop_meta_data = False.
    sd.remove_data(dgp, drop_meta_data = False)

    # Report the wall-clock time of this pass as whole minutes and seconds.
    end = time.time()
    time_elapsed = end - start
    time_min = int(time_elapsed/60)
    time_sec = int(time_elapsed - (time_min*60))
    print("Assessment took {} minutes and {} seconds.".format(time_min, time_sec))

2 additional covariates will be created.


  return ptp(axis=axis, out=out, **kwargs)


Now performing OLS to infer DGP parameters.
Simulation data was saved to: 
     /home/bknight/Documents/Power_Analysis_Techniques/v3/data/sim_data_2020_01_11_114618.csv.
Meta-data was saved to: 
     /home/bknight/Documents/Power_Analysis_Techniques/v3/data/log_files/sim_data_2020_01_11_114618_log_file.txt.
5000000 observations of simulation data created in 1 minutes and 12 seconds.
Reconstituting data object from file.
Successfully read in the .csv file specified at:
     /home/bknight/Documents/Power_Analysis_Techniques/v3/data/sim_data_2020_01_11_114618.csv.
/home/bknight/Documents/Power_Analysis_Techniques/v3/data/log_files/sim_data_2020_01_11_114618_log_file.txt
Successfully read in the meta-data specified at:
     /home/bknight/Documents/Power_Analysis_Techniques/v3/data/log_files/sim_data_2020_01_11_114618_log_file.txt.
############## Starting value is n = 267,215.    ###########
############## Starting adj. R-squared is 0.15. ###########
Estimating the effective power of n = 26

  return ptp(axis=axis, out=out, **kwargs)


The effective power of sample size n = 572,904 is 86.57%.
Estimating the effective power of n = 511,076 using 200 simulations.


  return ptp(axis=axis, out=out, **kwargs)


The effective power of sample size n = 511,076 is 85.07%.
Estimating the effective power of n = 479,544 using 200 simulations.


  return ptp(axis=axis, out=out, **kwargs)


The effective power of sample size n = 479,544 is 82.59%.
Estimating the effective power of n = 466,812 using 200 simulations.


  return ptp(axis=axis, out=out, **kwargs)


The effective power of sample size n = 466,812 is 80.1%.
Binary search commenced, naively centered on n = 2,500,000.
Estimating the effective power of n = 2,500,000 using 200 simulations.
The effective power of sample size n = 2,500,000 is 100%.
Binary search suggested the region of n = (0, 2,500,000).
Now assessing the value 1,250,000 within the range n = (0, 2,500,000), and power = (0%, 100%).
Estimating the effective power of n = 1,250,000 using 200 simulations.
The effective power of sample size n = 1,250,000 is 99.5%.
Now assessing the value 625,000 within the range n = (0, 1,250,000), and power = (0%, 99.5%).
Estimating the effective power of n = 625,000 using 200 simulations.
The effective power of sample size n = 625,000 is 91.04%.
Now assessing the value 312,500 within the range n = (0, 625,000), and power = (0%, 91.04%).
Estimating the effective power of n = 312,500 using 200 simulations.
The effective power of sample size n = 312,500 is 63.18%.
Now assessing the value 468,75

KeyboardInterrupt: 

In [49]:
import pandas as pd
import os, os.path

# Collect the saved result files and, for each search method, extract the
# simulations used, the seconds used and the sample size whose estimated
# power is closest to the 80% target.
RESULTS_DIR = './results/'


def _summarise_runs(file_names, results_dir=RESULTS_DIR, target_power=0.8):
    """Summarise one search method's result files.

    Parameters
    ----------
    file_names : list of str
        Names of the result .csv files (relative to ``results_dir``).
    results_dir : str
        Directory holding the result files.
    target_power : float
        Power level the selected sample size should be closest to.

    Returns
    -------
    tuple of (list, list, list)
        ``(sims_used, seconds_used, n)`` with one entry per file, in the
        order of ``file_names``.
    """
    sims, seconds, sizes = [], [], []
    for file_name in file_names:
        run = pd.read_csv(os.path.join(results_dir, file_name))
        run['delta'] = abs(run['power'] - target_power)
        # Non-mutating sort; index labels are preserved by sort_values.
        run = run.sort_values('delta', ascending=True)
        # First column of the row with the smallest distance to the target.
        sizes.append(run.iat[0, 0])
        # NOTE(review): these lookups are label-based, i.e. they read the row
        # that was first in the file, not the row selected above. Presumably
        # the columns hold per-run totals -- confirm against pe.save_results.
        sims.append(run['sims_used'].loc[0])
        seconds.append(run['seconds_used'].loc[0])
    return sims, seconds, sizes


# List the directory once so every method sees the same snapshot.
result_files = os.listdir(RESULTS_DIR)
files_found = len(result_files)
print("{} files were found in total.".format(files_found))
print("{} iterations.".format(files_found // 3))

# Sorted so that the i-th entry of each list lines up across methods.
isotonic_results        = sorted(name for name in result_files if "isotonic" in name)
naive_binary_results    = sorted(name for name in result_files if "naive_binary" in name)
informed_binary_results = sorted(name for name in result_files if "informed_binary" in name)

iso_sims, iso_time, iso_n = _summarise_runs(isotonic_results)
naive_sims, naive_time, naive_n = _summarise_runs(naive_binary_results)
informed_sims, informed_time, informed_n = _summarise_runs(informed_binary_results)

366 files were found in total.
122 iterations.


In [52]:
# One column per metric and search method, one row per analysed result file,
# ordered by the isotonic sample-size estimate (largest first).
metric_lists = {
    'iso_sims': iso_sims,
    'iso_time': iso_time,
    'iso_n': iso_n,
    'naive_sims': naive_sims,
    'naive_time': naive_time,
    'naive_n': naive_n,
    'informed_sims': informed_sims,
    'informed_time': informed_time,
    'informed_n': informed_n,
}
# Built as rows and then transposed, so the lists become columns.
results_df = pd.DataFrame(list(metric_lists.values())).T
results_df.columns = list(metric_lists.keys())
results_df = results_df.sort_values(by=['iso_n'], ascending=False)

Unnamed: 0,iso_sims,iso_time,iso_n,naive_sims,naive_time,naive_n,informed_sims,informed_time,informed_n
count,122.0,122.0,122.0,122.0,122.0,122.0,122.0,122.0,122.0
mean,750.819672,330.74001,577195.6,1490.163934,1241.29393,564831.3,1759.016393,1280.863144,575420.9
std,379.91863,240.87262,391875.2,469.993817,565.469193,384266.2,367.199385,829.406975,388889.5
min,400.0,75.617206,6672.0,600.0,285.07927,6591.0,1200.0,278.669772,8129.0
25%,400.0,154.843035,264989.2,1200.0,816.738791,246093.0,1600.0,645.201244,253786.5
50%,600.0,257.716824,531695.0,1400.0,1176.055182,546875.0,1800.0,1041.051054,552717.5
75%,1000.0,409.003227,834779.5,1800.0,1648.725082,843750.0,2000.0,1594.411918,813278.5
max,2200.0,1352.286171,1791028.0,3000.0,3411.946473,1669921.0,3000.0,4025.489231,1745263.0


In [57]:
# Runs where all three methods estimated a sample size of at most 1,000,000.
estimates0 = results_df[['iso_n', 'informed_n', 'naive_n']]
subset0 = results_df[(estimates0 <= 1000000).all(axis=1)]
subset0.describe()

Unnamed: 0,iso_sims,iso_time,iso_n,naive_sims,naive_time,naive_n,informed_sims,informed_time,informed_n
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,764.0,313.242924,447243.98,1550.0,1270.909079,435949.69,1744.0,1176.007488,445712.06
std,402.396859,242.195488,290527.191374,468.287229,518.629337,285097.458357,357.409309,761.094903,287833.26779
min,400.0,75.617206,6672.0,800.0,404.704021,6591.0,1200.0,278.669772,8129.0
25%,400.0,150.751638,162044.25,1200.0,864.777676,158203.0,1400.0,608.789096,176987.75
50%,600.0,234.392103,476644.0,1400.0,1201.349467,441893.5,1800.0,937.296494,478684.0
75%,1000.0,379.159441,707042.25,1800.0,1685.329577,703125.0,2000.0,1512.228182,683274.25
max,2200.0,1352.286171,984466.0,3000.0,2452.588519,984375.0,3000.0,3596.498493,978115.0


In [56]:
# Runs where all three methods estimated a sample size of at most 100,000.
estimates1 = results_df[['iso_n', 'informed_n', 'naive_n']]
subset1 = results_df[(estimates1 <= 100000).all(axis=1)]
subset1.describe()

Unnamed: 0,iso_sims,iso_time,iso_n,naive_sims,naive_time,naive_n,informed_sims,informed_time,informed_n
count,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0
mean,621.052632,154.426814,41556.157895,2157.894737,1574.160735,41530.631579,1715.789474,449.408022,42146.842105
std,404.940253,128.319642,27290.955659,429.878264,522.199109,26257.348549,260.902617,127.108421,26395.498384
min,400.0,75.617206,6672.0,1400.0,728.309509,6591.0,1200.0,278.669772,8129.0
25%,400.0,90.271731,20347.5,1900.0,1063.418372,19912.0,1600.0,363.718134,20350.5
50%,400.0,102.4829,37315.0,2200.0,1616.195602,35156.0,1600.0,417.401222,36842.0
75%,700.0,160.048984,57222.5,2400.0,1930.355553,62621.5,1800.0,503.146291,58853.5
max,1800.0,612.985641,98658.0,3000.0,2452.588519,87890.0,2200.0,775.829499,93306.0


In [21]:
# Runs where all three methods estimated a sample size above 100,000 and at
# most 1,000,000.
estimates2 = results_df[['iso_n', 'informed_n', 'naive_n']]
above_floor = (estimates2 > 100000).all(axis=1)
within_cap = (estimates2 <= 1000000).all(axis=1)
subset2 = results_df[above_floor & within_cap]
subset2.describe()

Unnamed: 0,iso_sims,iso_time,iso_n,naive_sims,naive_time,naive_n,informed_sims,informed_time,informed_n
count,81.0,81.0,81.0,81.0,81.0,81.0,81.0,81.0,81.0
mean,797.530864,350.496086,542405.320988,1407.407407,1199.775974,528467.740741,1750.617284,1346.4444,540375.506173
std,396.85492,247.971263,236701.626361,348.488801,494.508405,234128.095255,377.5326,747.266058,233696.453345
min,400.0,102.777681,110893.0,800.0,404.704021,105468.0,1200.0,357.186569,100123.0
25%,400.0,181.895254,391229.0,1200.0,814.211961,386718.0,1400.0,780.699575,391293.0
50%,600.0,285.009515,526793.0,1400.0,1166.489655,527343.0,1800.0,1123.908861,546944.0
75%,1000.0,402.25219,726511.0,1600.0,1509.755122,703125.0,2000.0,1610.897891,716924.0
max,2200.0,1352.286171,984466.0,2400.0,2336.246153,984375.0,3000.0,3596.498493,978115.0


In [6]:
# Data used for plotting: runs where all three methods estimated a sample size
# of at most 1,000,000. (Assign results_df itself to use every run instead.)
plot_estimates = results_df[['iso_n', 'informed_n', 'naive_n']]
subset = results_df[(plot_estimates <= 1000000).all(axis=1)]
subset.describe()

Unnamed: 0,iso_sims,iso_time,iso_n,naive_sims,naive_time,naive_n,informed_sims,informed_time,informed_n
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,764.0,313.242924,447243.98,1550.0,1270.909079,435949.69,1744.0,1176.007488,445712.06
std,402.396859,242.195488,290527.191374,468.287229,518.629337,285097.458357,357.409309,761.094903,287833.26779
min,400.0,75.617206,6672.0,800.0,404.704021,6591.0,1200.0,278.669772,8129.0
25%,400.0,150.751638,162044.25,1200.0,864.777676,158203.0,1400.0,608.789096,176987.75
50%,600.0,234.392103,476644.0,1400.0,1201.349467,441893.5,1800.0,937.296494,478684.0
75%,1000.0,379.159441,707042.25,1800.0,1685.329577,703125.0,2000.0,1512.228182,683274.25
max,2200.0,1352.286171,984466.0,3000.0,2452.588519,984375.0,3000.0,3596.498493,978115.0


In [47]:
import seaborn as sns
import matplotlib.ticker as ticker
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure


# Scatter + linear fit (with 95% confidence band) of estimation time against
# estimated sample size, one series per search method, saved to v1.png.
fig, ax = plt.subplots()
fig.set_size_inches(12, 4.8, forward=True)

# (sample-size column, runtime column, colour, legend label), in draw order.
method_series = [
    ('iso_n',      'iso_time',      '#b300b3', 'Isotonic Interpolation'),
    ('informed_n', 'informed_time', '#006600', 'Binary Search (with Starting Value)'),
    ('naive_n',    'naive_time',    '#3333cc', 'Binary Search (Uninformed)'),
]

for n_column, time_column, colour, legend_label in method_series:
    # x/y are passed by keyword: recent seaborn releases no longer accept them
    # positionally. The label is attached to the regression line itself so the
    # legend does not depend on the order in which artists were added.
    sns.regplot(x=subset[n_column],
                y=subset[time_column],
                ci=95,
                color=colour,
                marker='o',
                scatter_kws={'s': 10},
                line_kws={'label': legend_label},
                ax=ax)

ax.legend(prop={'size': 14})
# The test count in the title follows the number of rows actually plotted.
ax.set_title("Linear Trends of Power Estimation Runtimes \nfor {:,} Simulated A/B Tests (with 95% C.I.)".format(len(subset)),
             size=24, y=1.08)
ax.set_ylabel('Estimation \nTime \nin \nSeconds', size=18, rotation=0)
ax.set_xlabel('Estimated Sample Size Required', size=18)
ax.set_xlim(0, 1000000)


def _with_thousands_separator(value, pos):
    """Format a tick value as an integer with comma separators."""
    return '{:,}'.format(int(value))


# Separate formatter instances: a formatter is bound to the axis it is set on.
ax.xaxis.set_major_formatter(ticker.FuncFormatter(_with_thousands_separator))
ax.yaxis.set_major_formatter(ticker.FuncFormatter(_with_thousands_separator))

ax.tick_params(axis='both', which='major', labelsize=16)
# Nudge the horizontal y-label left of the axis so it clears the tick labels.
ax.yaxis.set_label_coords(-0.16, 0.35)

fig.savefig('v1.png', bbox_inches='tight')
# Close rather than clear, so no empty figure is left as the cell's output.
plt.close(fig)

<Figure size 864x345.6 with 0 Axes>