In [1]:
def _estimate_power(data, sample_size, iterations, p_value_threshold,
                    random_state=None, report_progress=False):
    """Estimate statistical power at one sample size by repeated subsampling.

    Draws ``iterations`` samples of ``sample_size`` rows (without replacement)
    from ``data``, regresses ``Order_Amt`` on a constant plus ``Treated`` with
    OLS, applies cluster-robust covariance grouped by ``Customer_ID``, and
    returns the fraction of fits whose ``Treated`` p-value is at or below
    ``p_value_threshold``.

    When ``report_progress`` is true, a "Verification <fraction> complete."
    line is printed at every tenth of the run.
    """
    # Function-local import, matching the rest of this notebook; statsmodels
    # is therefore only needed once a model is actually fitted.
    import statsmodels.api as sm

    progress_step = max(1, iterations // 10)
    significant = 0
    for draw in range(iterations):
        if report_progress and draw % progress_step == 0:
            print("Verification {} complete.".format(round(draw / iterations, 2)))
        sample = data.sample(sample_size, replace=False, random_state=random_state)
        design = sm.add_constant(sample.Treated)
        fit = sm.OLS(sample.Order_Amt, design).fit(method='pinv').get_robustcov_results(
            'cluster', groups=sample.Customer_ID,
            use_correction=True, df_correction=True)
        # Position 0 is the added constant, position 1 is the Treated term.
        if fit.pvalues[1] <= p_value_threshold:
            significant += 1
    return significant / iterations


def infer_power(sample_sizes_to_eval, sub_iterations, meta_iterations,
                p_value_threshold, desired_power, data, file_out,
                verification_iterations=500, max_consecutive_failures=10,
                random_state=None):
    """Infer the sample size needed for a target power, then verify it.

    Each meta-iteration:

    1. estimates power at every size in ``sample_sizes_to_eval`` using
       ``sub_iterations`` subsampled regressions per size,
    2. fits ``power = a * log(b * n) + c`` to those estimates and inverts the
       curve at ``desired_power`` to get a recommended sample size (capped at
       ``len(data)``),
    3. re-estimates power at the recommended size with
       ``verification_iterations`` further regressions.

    One row per successful meta-iteration is written to ``file_out`` as CSV
    with the columns ``Desired Power``, ``Actual Power``, ``Error``,
    ``Absolute Error``, ``Iterations`` and ``Total Iterations``.

    Parameters
    ----------
    sample_sizes_to_eval : sequence of int
        Sample sizes at which power is estimated; at least three are needed
        because the fitted curve has three parameters, and none may exceed
        ``len(data)`` since sampling is without replacement.
    sub_iterations : int
        Regressions run per sample size (must be >= 1).
    meta_iterations : int
        Number of successful curve-fit-and-verify rounds to collect.
    p_value_threshold : float
        A fit counts as significant when its p-value is <= this value.
    desired_power : float
        Power level at which the fitted curve is inverted.
    data : pandas.DataFrame
        Must provide the columns ``Treated``, ``Order_Amt`` and
        ``Customer_ID``.
    file_out : str or path-like
        Destination passed to ``DataFrame.to_csv``.
    verification_iterations : int, default 500
        Regressions run at the recommended sample size (must be >= 1).
    max_consecutive_failures : int or None, default 10
        A failed meta-iteration is retried; after this many failures in a row
        the loop stops and whatever has been collected is written. ``None``
        retries without limit.
    random_state : int, numpy RandomState/Generator or None, default None
        Source of randomness for the subsampling. An int seeds a single
        ``RandomState`` that is shared by every draw so that successive
        samples differ; ``None`` uses NumPy's global state.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If required columns are missing, fewer than three sample sizes are
        given, a sample size is outside ``1..len(data)``, or an iteration
        count is below 1.
    """
    import time

    import numpy as np
    import pandas as pd
    from scipy.optimize import curve_fit

    sample_sizes = list(sample_sizes_to_eval)
    max_n = len(data)

    # Fail fast on inputs that can never succeed; inside the retry loop these
    # would otherwise fail identically on every attempt.
    missing = [col for col in ('Treated', 'Order_Amt', 'Customer_ID')
               if col not in data.columns]
    if missing:
        raise ValueError("data is missing required column(s): {}".format(missing))
    if len(sample_sizes) < 3:
        raise ValueError("at least 3 sample sizes are required to fit the "
                         "3-parameter power curve")
    if any(size < 1 or size > max_n for size in sample_sizes):
        raise ValueError("every sample size must be between 1 and "
                         "len(data)={}".format(max_n))
    if sub_iterations < 1 or verification_iterations < 1:
        raise ValueError("sub_iterations and verification_iterations must be >= 1")

    # One shared generator: passing the raw int to every .sample() call would
    # reproduce the identical sample each time.
    if isinstance(random_state, (int, np.integer)):
        random_state = np.random.RandomState(random_state)

    def log_curve(x, a, b, c):
        # a*log(b*x) + c == a*log(x) + (a*log(b) + c), so b and c are not
        # separately identifiable; the inversion below uses all three fitted
        # values together and is unaffected.
        return a * np.log(b * x) + c

    start = time.time()
    rows = []  # (desired power, verified power) per successful meta-iteration
    consecutive_failures = 0

    while len(rows) < meta_iterations:
        try:
            powers = []
            for size in sample_sizes:
                power = _estimate_power(data, size, sub_iterations,
                                        p_value_threshold, random_state)
                print(power)
                powers.append(power)

            popt, _ = curve_fit(log_curve, np.asarray(sample_sizes),
                                np.asarray(powers))
            a, b, c = popt

            # Invert the fitted curve at the desired power.
            estimate = np.exp((desired_power - c) / a) / b
            if not np.isfinite(estimate) or estimate < 1:
                raise ValueError("fitted curve gives an unusable sample size: "
                                 "{}".format(estimate))
            recommended_n = min(int(estimate), max_n)

            print("The recommended sample size for a statistical power level of " +
                  "{} is {}.".format(desired_power, recommended_n))

            actual_power = _estimate_power(data, recommended_n,
                                           verification_iterations,
                                           p_value_threshold, random_state,
                                           report_progress=True)

            rows.append((desired_power, actual_power))
            consecutive_failures = 0
            print("{} iterations sucessfully completed.".format(len(rows)))
            print(str(time.time() - start) + ' time elapsed.')

        except Exception as err:
            # Exception (not a bare except) so Ctrl-C / kernel interrupt still
            # stops a long run.
            consecutive_failures += 1
            print("Power curve inference failed.")
            print("Reason: {!r}".format(err))
            if (max_consecutive_failures is not None
                    and consecutive_failures >= max_consecutive_failures):
                print("Giving up after {} consecutive failures; writing {} "
                      "completed iteration(s).".format(consecutive_failures,
                                                       len(rows)))
                break

    # Explicit column names keep this valid even when no rows were collected.
    df_out = pd.DataFrame(rows, columns=['Desired Power', 'Actual Power'])
    df_out['Error'] = df_out['Desired Power'] - df_out['Actual Power']
    df_out['Absolute Error'] = df_out['Error'].abs()
    df_out['Iterations'] = sub_iterations
    df_out['Total Iterations'] = (len(sample_sizes) * sub_iterations
                                  + verification_iterations)
    df_out.to_csv(file_out)

In [4]:
import pandas as pd

# Load the input data from CSV, then run one meta-iteration over five
# candidate sample sizes with 100 regressions each; results go to test4.csv.
data_file_to_read = pd.read_csv('data_c_1000000_mde_01.csv')
infer_power(
    sample_sizes_to_eval=[6000, 8000, 10000, 12000, 14000],
    sub_iterations=100,
    meta_iterations=1,
    p_value_threshold=0.05,
    desired_power=0.8,
    data=data_file_to_read,
    file_out='test4.csv',
)

0.37333333333333335
0.5
0.6
The recommended sample size for a statistical power level of 0.8 is 17136.
Verification 0.0 complete.




Verification 0.1 complete.
Verification 0.2 complete.
Verification 0.3 complete.
Verification 0.4 complete.
Verification 0.5 complete.
Verification 0.6 complete.
Verification 0.7 complete.
Verification 0.8 complete.
Verification 0.9 complete.
1 iterations sucessfully completed.
1126.7036168575287 time elapsed.


In [3]:
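# Disabled batch runs, kept for reference: the same analysis (300 sub-iterations,
# 30 meta-iterations) over grids of 3, 5 and 7 sample sizes, writing df7-df9.csv.
# import pandas as pd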
# data_file_to_read = pd.read_csv('data_c_1000000_mde_01.csv')
# infer_power([          8000,10000,12000],             300, 30, 0.05, 0.8, data_file_to_read, 'df7.csv')
# print("DF 7 created.")
# infer_power([     6000,8000,10000,12000,14000],       300, 30, 0.05, 0.8, data_file_to_read, 'df8.csv')
# print("DF 8 created.")
# infer_power([4000,6000,8000,10000,12000,14000,16000], 300, 30, 0.05, 0.8, data_file_to_read, 'df9.csv')
# print("DF 9 created.")