In [1]:
def create_sample_size_rec(data_src, data_labels, rejection_region, desired_power):
    
    import random
    import numpy as np
    import pandas as pd
    import statsmodels.api as sm
    
    data_src.columns = data_labels
    
    absolute_mde = data_src[data_src['Treated'] == 1]['Order_Amt'].mean() - \
                   data_src[data_src['Treated'] == 0]['Order_Amt'].mean()

    
    print("The absolute MDE was estimated as {}.".format(absolute_mde))
    
    df = data_src[data_src['Treated'] == 0]
    assignment = []
    i = 0
    while i < len(df):
        assignment.append(random.randint(0,1)) 
        i += 1
    df['Partition'] = assignment
    power_analysis_df = df[df['Partition'] == 0]
    analysis_df = df[df['Partition'] == 1]
        
    del df
    
    pa_retailer_means = pd.DataFrame(power_analysis_df.groupby(['Retailer_ID'])['Order_Amt'].mean())
    pa_retailer_means.reset_index(inplace=True)
    pa_retailer_means.columns = ['Retailer_ID', 'Mean_Retailer_Order_Amt']
    ###############################################################################
    pa_dow_means = pd.DataFrame(power_analysis_df.groupby(['Dow_Rand'])['Order_Amt'].mean())
    pa_dow_means.reset_index(inplace=True)
    pa_dow_means.columns = ['Dow_Rand', 'Mean_DOW_Order_Amt']
    ###############################################################################
    analysis_df = pd.merge(analysis_df, pa_retailer_means, on='Retailer_ID', how='left')
    analysis_df = pd.merge(analysis_df, pa_dow_means, on='Dow_Rand', how='left')
    ###############################################################################
    analysis_df = analysis_df[['Order_Amt', 'Customer_ID', 'Mean_Order_Amt', 
                               'Mean_Retailer_Order_Amt','Mean_DOW_Order_Amt']]
    
    X = analysis_df[['Mean_Order_Amt', 'Mean_Retailer_Order_Amt','Mean_DOW_Order_Amt']]
    X = sm.add_constant(X)
    Y = analysis_df[['Order_Amt']]
    residuals_df = sm.OLS(Y.astype(float), X.astype(float)).fit()
    
    X2 = analysis_df[['Customer_ID']]
    X2['Residual'] = residuals_df.resid
    X2['Constant'] = 1
    clustered_res = sm.OLS(X2['Residual'], X2['Constant']).fit(method='pinv'). \
                       get_robustcov_results('cluster', groups = X2['Customer_ID'], 
                       use_correction=True, df_correction=True)
    
    clustered_sd = clustered_res.bse[0] * np.sqrt(analysis_df.shape[0])
    effect_size = absolute_mde / clustered_sd
    recommended_n = int(sm.stats.tt_ind_solve_power(effect_size = effect_size, 
                        alpha = rejection_region, power = desired_power, 
                        alternative = 'larger'))
    print("A sample size of {} was recommended.".format(recommended_n ))
    return recommended_n, absolute_mde

In [2]:
def verify_sample_size_est(sample_size, data_src, data_labels, alpha, verify_n_times):

    import statsmodels.api as sm
    import pandas as pd
    
    SAMPLE_SIZE = sample_size
    VERIFICATION_ITERATIONS = verify_n_times
    ALPHA = alpha

    i = 0
    pvals  = []
    r_sqr  = []
    cond_n = []
    while i < VERIFICATION_ITERATIONS:
        working_df = data_src.sample(SAMPLE_SIZE, replace=False)
        ###############################################################################
        pa_retailer_means = pd.DataFrame(working_df.groupby(['Retailer_ID'])['Order_Amt'].mean())
        pa_retailer_means.reset_index(inplace=True)
        pa_retailer_means.columns = ['Retailer_ID', 'Mean_Retailer_Order_Amt']
        ###############################################################################
        pa_dow_means = pd.DataFrame(working_df.groupby(['Dow_Rand'])['Order_Amt'].mean())
        pa_dow_means.reset_index(inplace=True)
        pa_dow_means.columns = ['Dow_Rand', 'Mean_DOW_Order_Amt']
        ###############################################################################
        analysis_df = pd.merge(working_df, pa_retailer_means, on='Retailer_ID', how='left')
        analysis_df = pd.merge(analysis_df, pa_dow_means, on='Dow_Rand', how='left')
        ###############################################################################
        analysis_df = analysis_df[['Order_Amt', 'Customer_ID', 'Treated', 'Mean_Order_Amt', 
                                 'Mean_Retailer_Order_Amt','Mean_DOW_Order_Amt']]
        ###############################################################################
        X = analysis_df[['Treated', 'Mean_Order_Amt', 'Mean_Retailer_Order_Amt','Mean_DOW_Order_Amt']]
        X = sm.add_constant(X)
        Y = analysis_df[['Order_Amt']]
        model = sm.OLS(Y.astype(float), X.astype(float)).fit(method='pinv'). \
                       get_robustcov_results('cluster', groups = analysis_df['Customer_ID'], 
                       use_correction=True, df_correction=True)
        if model.pvalues[1] < ALPHA: 
            pvals.append(1)
        else:
            pvals.append(0)  
        r_sqr.append(model.rsquared_adj)
        cond_n.append(model.condition_number)
        i += 1
        if i % int((VERIFICATION_ITERATIONS)/10.0) == 0:
            completion = str(round((i/VERIFICATION_ITERATIONS)*100, 2))+'%'
            print(completion + ' complete.')
            
    # ----- Exit inner loop     
    x = ['Treated', 'Mean_Order_Amt', 'Mean_Retailer_Order_Amt','Mean_DOW_Order_Amt']
    str_out = 'Order_Amt =' 
    d = 0
    for i in x:
        if d < 1:
            k = " '" + i + "'"
        else:
            k = " + '" + i + "'"
        str_out += k
        d += 1    
    
    actual_power = sum(pvals)/len(pvals)  
    mean_r_sqr   = sum(r_sqr)/len(r_sqr)   
    mean_cond_n  = sum(cond_n)/len(cond_n)  
    print("Actual power was estimated at {}.".format(actual_power))
    return actual_power, mean_r_sqr, mean_cond_n, str_out

In [3]:
def meta_assess(sub_iterations, meta_iterations, alpha, target_power, 
                pa_file, analysis_file, file_name):

    ALPHA = alpha
    SUB_ITERATIONS = sub_iterations
    META_ITERATIONS = meta_iterations
    TARGET_POWER = target_power

    dir = './residual_dfs/'
    import pandas as pd
    import os

    i = 0
    results      = []
    r_sqr_list   = []
    cond_n_list  = []
    abs_mde      = []
    sample_sizes = []
    while i < META_ITERATIONS:

        data_src_a = pd.read_csv(dir + pa_file)
        data_src_b = pd.read_csv(dir + analysis_file)
        data_labels = ['Order_ID', 'Customer_ID', 'Mean_Order_Amt', 'Treated',
                       'Treatment_Modifier', 'Retailer_ID', 'Retailer_Scalar',
                       'Dow_Rand', 'DOW', 'Noise', 'Order_Amt']

        recommended_n, absolute_mde = create_sample_size_rec(data_src_a, data_labels, 
                                               ALPHA, TARGET_POWER)

        actual_power,r_sqr,mean_cond_n, str_out = verify_sample_size_est(recommended_n, data_src_b, 
                                                  data_labels, ALPHA, SUB_ITERATIONS)   

        results.append(actual_power)
        r_sqr_list.append(r_sqr)
        cond_n_list.append(mean_cond_n)
        abs_mde.append(absolute_mde)
        sample_sizes.append(recommended_n)
        i += 1
        print("{} of {} iterations completed.".format(i, META_ITERATIONS))
    results                          = pd.DataFrame(results)    
    results['Target Power']          = TARGET_POWER
    results.columns                  = ['Achieved Power', 'Target Power']
    results['Abs. MDE']              = absolute_mde
    results['Model']                 = str_out
    results['Delta']                 = results['Achieved Power'] - results['Target Power']
    results['N']                     = sample_sizes
    results['Mean_R_Sqr']            = r_sqr_list
    results['Mean_Condition_Number'] = cond_n_list

        # Save to .csv
    if not os.path.exists('./residual_dfs/results'):
        os.makedirs('./residual_dfs/results')
    
    results.to_csv('./residual_dfs/results/' + file_name)
    print("{} was saved to disk.".format('./residual_dfs/results/' + file_name))

In [4]:
meta_assess(500, 100, 0.05, 0.8, 
            'part_II_df_mde_0_005_n_100000_a.csv', 
            'part_II_df_mde_0_005_n_100000_b.csv', 
            'mde_0_005_n_100000_model_I.csv')

The absolute MDE was estimated as 0.42411496603367027.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


A sample size of 3882 was recommended.
10.0% complete.
20.0% complete.
30.0% complete.
40.0% complete.
50.0% complete.
60.0% complete.
70.0% complete.
80.0% complete.
90.0% complete.
100.0% complete.
Actual power was estimated at 0.426.
1 of 100 iterations completed.
The absolute MDE was estimated as 0.42411496603367027.
A sample size of 3780 was recommended.
10.0% complete.
20.0% complete.
30.0% complete.
40.0% complete.
50.0% complete.
60.0% complete.
70.0% complete.
80.0% complete.
90.0% complete.
100.0% complete.
Actual power was estimated at 0.388.
2 of 100 iterations completed.
The absolute MDE was estimated as 0.42411496603367027.
A sample size of 3861 was recommended.
10.0% complete.
20.0% complete.
30.0% complete.
40.0% complete.
50.0% complete.
60.0% complete.
70.0% complete.
80.0% complete.
90.0% complete.
100.0% complete.
Actual power was estimated at 0.38.
3 of 100 iterations completed.
The absolute MDE was estimated as 0.42411496603367027.
A sample size of 3816 was recomm

60.0% complete.
70.0% complete.
80.0% complete.
90.0% complete.
100.0% complete.
Actual power was estimated at 0.382.
26 of 100 iterations completed.
The absolute MDE was estimated as 0.42411496603367027.
A sample size of 3942 was recommended.
10.0% complete.
20.0% complete.
30.0% complete.
40.0% complete.
50.0% complete.
60.0% complete.
70.0% complete.
80.0% complete.
90.0% complete.
100.0% complete.
Actual power was estimated at 0.436.
27 of 100 iterations completed.
The absolute MDE was estimated as 0.42411496603367027.
A sample size of 3794 was recommended.
10.0% complete.
20.0% complete.
30.0% complete.
40.0% complete.
50.0% complete.
60.0% complete.
70.0% complete.
80.0% complete.
90.0% complete.
100.0% complete.
Actual power was estimated at 0.402.
28 of 100 iterations completed.
The absolute MDE was estimated as 0.42411496603367027.
A sample size of 3802 was recommended.
10.0% complete.
20.0% complete.
30.0% complete.
40.0% complete.
50.0% complete.
60.0% complete.
70.0% comple

The absolute MDE was estimated as 0.42411496603367027.
A sample size of 3832 was recommended.
10.0% complete.
20.0% complete.
30.0% complete.
40.0% complete.
50.0% complete.
60.0% complete.
70.0% complete.
80.0% complete.
90.0% complete.
100.0% complete.
Actual power was estimated at 0.388.
52 of 100 iterations completed.
The absolute MDE was estimated as 0.42411496603367027.
A sample size of 3834 was recommended.
10.0% complete.
20.0% complete.
30.0% complete.
40.0% complete.
50.0% complete.
60.0% complete.
70.0% complete.
80.0% complete.
90.0% complete.
100.0% complete.
Actual power was estimated at 0.426.
53 of 100 iterations completed.
The absolute MDE was estimated as 0.42411496603367027.
A sample size of 3836 was recommended.
10.0% complete.
20.0% complete.
30.0% complete.
40.0% complete.
50.0% complete.
60.0% complete.
70.0% complete.
80.0% complete.
90.0% complete.
100.0% complete.
Actual power was estimated at 0.444.
54 of 100 iterations completed.
The absolute MDE was estimat

20.0% complete.
30.0% complete.
40.0% complete.
50.0% complete.
60.0% complete.
70.0% complete.
80.0% complete.
90.0% complete.
100.0% complete.
Actual power was estimated at 0.41.
77 of 100 iterations completed.
The absolute MDE was estimated as 0.42411496603367027.
A sample size of 3840 was recommended.
10.0% complete.
20.0% complete.
30.0% complete.
40.0% complete.
50.0% complete.
60.0% complete.
70.0% complete.
80.0% complete.
90.0% complete.
100.0% complete.
Actual power was estimated at 0.418.
78 of 100 iterations completed.
The absolute MDE was estimated as 0.42411496603367027.
A sample size of 3824 was recommended.
10.0% complete.
20.0% complete.
30.0% complete.
40.0% complete.
50.0% complete.
60.0% complete.
70.0% complete.
80.0% complete.
90.0% complete.
100.0% complete.
Actual power was estimated at 0.386.
79 of 100 iterations completed.
The absolute MDE was estimated as 0.42411496603367027.
A sample size of 3849 was recommended.
10.0% complete.
20.0% complete.
30.0% complet