In [None]:
def infer_power(sample_sizes_to_eval, sub_iterations, meta_iterations,
                p_value_threshold, desired_power, data, file_out, seed=None,
                verification_iterations=1000, max_consecutive_failures=25):
    """Estimate the sample size needed for a target power, then verify it.

    One "meta iteration" does the following:

    1. For every size in ``sample_sizes_to_eval``, draw ``sub_iterations``
       samples (without replacement) from ``data``, regress ``Order_Amt`` on a
       constant plus ``Treated`` with covariance clustered on ``Customer_ID``,
       and record the share of draws whose ``Treated`` p-value is
       ``<= p_value_threshold`` (the empirical power at that size).
    2. Fit ``a * log(b * n) + c`` to the (size, power) points and invert it at
       ``desired_power`` to get a recommended sample size, capped at
       ``len(data)``.
    3. Draw ``verification_iterations`` samples of the recommended size and
       measure the power actually achieved.

    Results are written to ``file_out`` as CSV with the columns
    'Desired Power', 'Actual Power', 'Error', 'Absolute Error', 'Iterations'
    and 'Total Iterations'. Nothing is returned.

    Parameters
    ----------
    sample_sizes_to_eval : sequence of int
        Sample sizes at which power is estimated.
    sub_iterations : int
        Number of draws per sample size (must be >= 1).
    meta_iterations : int
        Number of successful estimate-and-verify rounds to collect.
    p_value_threshold : float
        Significance level used to count a draw as a rejection.
    desired_power : float
        Power level at which the fitted curve is inverted.
    data : pandas.DataFrame
        Must expose ``Treated``, ``Order_Amt`` and ``Customer_ID`` columns.
    file_out : str
        Destination CSV path.
    seed : int or None, optional
        Seeds a single random generator shared by every draw in this call.
    verification_iterations : int, optional
        Number of draws in the verification phase (default 1000).
    max_consecutive_failures : int or None, optional
        Stop retrying after this many failed rounds in a row and write whatever
        was completed. ``None`` retries indefinitely.

    Raises
    ------
    ValueError
        If ``sub_iterations`` or ``verification_iterations`` is below 1.
    """
    # Imports stay function-local so the function is self-contained when it is
    # used as a multiprocessing target.
    import time

    import numpy as np
    import pandas as pd
    import statsmodels.api as sm
    from scipy.optimize import curve_fit

    if sub_iterations < 1 or verification_iterations < 1:
        raise ValueError(
            "sub_iterations and verification_iterations must both be >= 1")

    start = time.time()
    max_n = len(data)

    # A single generator is advanced across all draws. Passing the integer seed
    # straight to DataFrame.sample would re-seed on every call and return the
    # exact same rows each time, making every p-value in a loop identical.
    rng = np.random.RandomState(seed)

    def log_curve(x, a, b, c):
        # Functional form assumed for power as a function of sample size.
        return a * np.log(b * x) + c

    def rejection_rate(n_rows, n_draws, report_progress=False):
        # Share of n_draws random samples of n_rows rows in which the Treated
        # coefficient is significant at p_value_threshold.
        progress_step = max(1, n_draws // 10)
        rejections = 0
        for draw in range(n_draws):
            if report_progress and draw % progress_step == 0:
                print("Verification {} complete.".format(
                    round(draw / n_draws, 2)))
            sample = data.sample(n_rows, replace=False, random_state=rng)
            X = sm.add_constant(sample.Treated)
            Y = sample.Order_Amt
            model = sm.OLS(Y, X).fit(method='pinv').get_robustcov_results(
                'cluster', groups=sample.Customer_ID,
                use_correction=True, df_correction=True)
            # Index 1 is the Treated coefficient (index 0 is the constant).
            if model.pvalues[1] <= p_value_threshold:
                rejections += 1
        return rejections / n_draws

    meta_results = []
    consecutive_failures = 0
    while len(meta_results) < meta_iterations:
        try:
            sizes = []
            powers = []
            for k in sample_sizes_to_eval:
                power_at_k = rejection_rate(k, sub_iterations)
                print(power_at_k)
                sizes.append(k)
                powers.append(power_at_k)

            popt, _pcov = curve_fit(log_curve, np.asarray(sizes),
                                    np.asarray(powers))

            # Invert a*log(b*n)+c = desired_power for n.
            raw_n = np.exp((desired_power - popt[2]) / popt[0]) / popt[1]
            if not np.isfinite(raw_n) or raw_n < 1:
                raise ValueError(
                    "fitted curve gives an unusable sample size: {}".format(raw_n))
            recommended_n = min(int(raw_n), max_n)

            print("The recommended sample size for a statistical power level of " +
                  "{} is {}.".format(desired_power, recommended_n))

            actual_power = rejection_rate(recommended_n,
                                          verification_iterations,
                                          report_progress=True)
        except Exception as exc:
            # A failed round (bad fit, degenerate sample, ...) is retried with
            # fresh draws, but not without bound.
            consecutive_failures += 1
            print("Power curve inference failed.")
            print(repr(exc))
            if (max_consecutive_failures is not None
                    and consecutive_failures >= max_consecutive_failures):
                print("Giving up after {} consecutive failures; "
                      "writing {} completed iteration(s).".format(
                          consecutive_failures, len(meta_results)))
                break
            continue

        consecutive_failures = 0
        meta_results.append(actual_power)
        print("{} iterations sucessfully completed.".format(len(meta_results)))
        print(str(time.time() - start) + ' time elapsed.')

    # Columns are named up front so an empty result set still yields a valid,
    # header-only CSV instead of failing on a column-count mismatch.
    df_out = pd.DataFrame({
        'Desired Power': pd.Series([desired_power] * len(meta_results),
                                   dtype=float),
        'Actual Power': pd.Series(meta_results, dtype=float),
    })
    df_out['Error'] = df_out['Desired Power'] - df_out['Actual Power']
    df_out['Absolute Error'] = df_out['Error'].abs()
    df_out['Iterations'] = sub_iterations
    df_out['Total Iterations'] = (
        len(sample_sizes_to_eval) * sub_iterations + verification_iterations)
    df_out.to_csv(file_out)

In [None]:
def multi_sim(sample_size_vector,
              simulations, meta_simulations,
              alpha, desired_power,
              dataframe, file_out, n_workers=5):
    """Run ``infer_power`` in parallel worker processes and merge the results.

    ``meta_simulations`` rounds are spread over ``n_workers`` processes. Each
    worker receives its own random subsample of ``len(dataframe) // n_workers``
    rows (drawn without replacement, independently per worker, so subsamples
    may overlap), writes its results to a scratch CSV, and uses its 1-based
    worker number as the seed. The per-worker CSVs are concatenated and written
    to ``file_out``. Nothing is returned.

    Parameters
    ----------
    sample_size_vector : sequence of int
        Sample sizes forwarded to ``infer_power``.
    simulations : int
        Draws per sample size, forwarded to ``infer_power``.
    meta_simulations : int
        Total number of rounds across all workers (must be >= 1).
    alpha : float
        Significance threshold forwarded to ``infer_power``.
    desired_power : float
        Target power forwarded to ``infer_power``.
    dataframe : pandas.DataFrame
        Source data that each worker's subsample is drawn from.
    file_out : str
        Destination CSV path for the merged results.
    n_workers : int, optional
        Number of worker processes (default 5).

    Raises
    ------
    ValueError
        If ``n_workers`` or ``meta_simulations`` is below 1.
    RuntimeError
        If a worker exits abnormally or produces no result file.

    Notes
    -----
    NOTE(review): ``infer_power`` is defined in the notebook itself; under the
    ``spawn`` start method child processes may be unable to import it - confirm
    on the target platform.
    """
    import multiprocessing
    import os
    import tempfile

    import pandas as pd

    if n_workers < 1:
        raise ValueError("n_workers must be >= 1")
    if meta_simulations < 1:
        raise ValueError("meta_simulations must be >= 1")

    # Hand out the remainder one round at a time so the total number of rounds
    # run equals the number requested instead of being rounded down.
    base, extra = divmod(int(meta_simulations), n_workers)
    shares = [base + 1 if i < extra else base for i in range(n_workers)]
    rows_per_worker = len(dataframe) // n_workers

    print("Will perform {} set(s) of {} simulations each partitioned {} ways, or {} per process".
          format(meta_simulations, simulations, n_workers, shares))

    # Scratch files live in a private directory that is always removed, so a
    # leftover file from an earlier run can never be mistaken for fresh output.
    with tempfile.TemporaryDirectory(prefix='multi_sim_') as scratch_dir:
        jobs = []
        for worker_id, share in enumerate(shares, start=1):
            if share == 0:
                continue  # nothing to do for this worker
            csv_path = os.path.join(
                scratch_dir, 'multi_sim_df{}.csv'.format(worker_id))
            process = multiprocessing.Process(
                target=infer_power,
                args=(sample_size_vector, simulations, share, alpha,
                      desired_power,
                      dataframe.sample(rows_per_worker, replace=False),
                      csv_path, worker_id))
            jobs.append((worker_id, process, csv_path))

        for _, process, _ in jobs:
            process.start()
        for _, process, _ in jobs:
            process.join()

        failed = [worker_id for worker_id, process, csv_path in jobs
                  if process.exitcode != 0 or not os.path.exists(csv_path)]
        if failed:
            raise RuntimeError(
                "multi_sim worker(s) {} did not produce results".format(failed))

        frames = [pd.read_csv(csv_path) for _, _, csv_path in jobs]

    df = pd.concat(frames, axis=0)
    df.to_csv(file_out)

In [None]:
import pandas as pd

# Driver cell: load the simulated order data and run the parallel power study
# for sample sizes 8k/10k/12k (100 simulations per size, 5 meta simulations,
# alpha 0.05, target power 0.8), writing the merged results to CSV.
#
# The __main__ guard is a no-op inside a live kernel, but keeps this cell from
# being re-executed by child processes if the notebook is exported to a script
# and run where multiprocessing starts workers by re-importing the main module.
if __name__ == '__main__':
    data_file_to_read = pd.read_csv('data_c_1000000_mde_01.csv')
    multi_sim([8000,10000,12000], 100, 5, 0.05, 0.8, data_file_to_read, 'results_8k_10k_12k.csv')

In [None]:
# import pandas as pd
# data_file_to_read = pd.read_csv('data_c_1000000_mde_01.csv')
# infer_power([          8000,10000,12000],             20, 1, 0.05, 0.8, data_file_to_read, 'df1.csv')
# print("DF 1 created.")
# infer_power([     6000,8000,10000,12000,14000],       20, 1, 0.05, 0.8, data_file_to_read, 'df2.csv')
# print("DF 2 created.")
# infer_power([4000,6000,8000,10000,12000,14000,16000], 20, 1, 0.05, 0.8, data_file_to_read, 'df3.csv')
# print("DF 3 created.")


In [None]:
# fig = plt.figure(1, figsize=(12, 6))

# # Create an axes instance
# ax = fig.add_subplot(111)

# # Create the boxplot
# bp = ax.boxplot(data)
# plt.ylim(0.0,0.2)

# ax.tick_params(axis='both', which='major', labelsize=16)
# ax.tick_params(axis='x', which='major', labelsize=10)
# ax.set_xticklabels(["Sample I\n500 Iterations", "Sample II\n500 Iterations", "Sample III\n500 Iterations",
#                     "Sample IV\n500 Iterations", "Sample V\n500 Iterations", "Sample VI\n500 Iterations",
#                     "Sample VII\n500 Iterations", "Sample VIII\n500 Iterations", "Sample IX\n500 Iterations"])

# style = dict(size=14, color='blue')
# # ax.text(1, -0.1, "\u03bc +/-", ha='center', **style)
# # fig.savefig('fig1.png', bbox_inches='tight')

# 

In [None]:
# def create_data_frame(list_of_sample_sizes, sub_iterations, 
#                       meta_iterations, rejection_region, target_power,
#                       csv_in):
#     import matplotlib.pyplot as plt
#     import numpy as np
#     import scipy.stats as stats
#     import math
    
# df = infer_power(list_of_sample_sizes, sub_iterations, 
#                  meta_iterations, rejection_region, target_power, csv_in)
# mu               = df['Error'].mean()
# sigma            = df['Error'].std()
# iterations       = df['Iterations'].unique()[0]
# total_iterations = df['Total Iterations'].unique()[0]

In [None]:
# def x(dataframe, verification_sim_n)

#     k = int(len(dataframe)/5.0)    
#     while counter_2 < verification_sim_n:
#         sample_1 = data.sample(k, replace=False)
#         sample_2 = data.sample(k, replace=False)
#         sample_3 = data.sample(k, replace=False)
#         sample_4 = data.sample(k, replace=False)
#         sample_5 = data.sample(k, replace=False)

    

#         working_df = data.sample(recommended_n, replace=False)
#         X = working_df.Treated
#         X = sm.add_constant(X)
#         Y = working_df.Order_Amt
#         model = sm.OLS(Y, X).fit(method='pinv').get_robustcov_results(
#                   'cluster', groups = working_df.Customer_ID, 
#                   use_correction=True, df_correction=True)
#         final_pvalues.append(model.pvalues[1])
#         counter_2 += 1

In [None]:
# # results_sd_3_n_500 = infer_power([8000, 10000, 12000], 
# #                                   500, 2, 0.05, 0.8, 'data_c_1000000_mde_01.csv')



# x = np.linspace(mu - 3*sigma, mu + 3*sigma, 1000)
# plt.plot(x, stats.norm.pdf(x, mu, sigma))
# plt.axvline(x=0, color = 'red', alpha=0.4)
# plt.xlim(-0.1,0.5)
# plt.ylim(0.0,15)
# bbox = dict(boxstyle="round", fc="0.9")
# plt.annotate('Mean Abs. Error  = {} \nError Std. = {} \nIterations per Point = {} \nTotal Iterations = {}'.
#              format(round(mu,2), round(sigma,2), iterations, total_iterations),
#             (5, 0), xytext=(195, 177),
#             xycoords='figure pixels',
#             textcoords='offset points',
#             size = 12,
#             bbox=bbox)
# plt.show()
# # plt.savefig('6k_10k_14k_n500.png')
# plt.clf()

# fig = plt.figure(1, figsize=(9, 6))

# # Create an axes instance
# ax = fig.add_subplot(111)

# # Create the boxplot
# data = [results_sd_3_n_500['Error'], [0.5, 0.6, 0.2, 0.1]]
# bp = ax.boxplot(data)
# plt.ylim(-1.0,1.0)
# ax.set_xticklabels(['Sample1', 'Sample2', 'Sample3', 'Sample4'])
# # fig.savefig('fig1.png', bbox_inches='tight')

In [None]:
# Gathers the 'Error' column of nine result frames into one list, one entry per
# frame.
# NOTE(review): df1..df9 are not defined at notebook scope in any visible cell
# (the only df1..df5 are locals inside multi_sim), so on a fresh kernel this
# cell raises NameError. Presumably each frame was loaded from a results CSV in
# a cell that has since been removed - confirm and restore that load step.
# NOTE(review): looks like this list feeds the commented-out `ax.boxplot(data)`
# cell - confirm before relying on the name `data`.
data = [df1['Error'], df2['Error'], df3['Error'], df4['Error'], df5['Error'], df6['Error'],
        df7['Error'], df8['Error'], df9['Error']]