In [63]:
import numpy as np
import pandas as pd
num_retailers    = 1000
num_customers    = 2
treatment_effect = 0.01


# Create retailers, their order amount scalars, and the probability space.
retailer_ids = list(range(1, num_retailers + 1,1))
retailer_scalars = list(np.random.normal(loc = 1, scale = 0.05, size = num_retailers))
retailers = pd.DataFrame(list(zip(retailer_ids, retailer_scalars)), 
                         columns =['Retailer_ID', 'Retailer_Scalar']) 
delimiters = list(np.linspace(0.0, 1.0, num = num_retailers + 1))
retailers['a'] = delimiters[0:len(delimiters)-1]
retailers['b'] = delimiters[1:len(delimiters)]
print('Retailer data frame created.')

# Create customers, their mean order amount, whether or not they got the treatment, 
# and the number of orders placed.
customer_ids = list(range(1, num_customers + 1,1))
num_orders = [int(i*10) + 1 for i in list(np.random.exponential(scale = 1.0, size = num_customers))]
customers = pd.DataFrame(list(zip(customer_ids, num_orders)), columns =['Customer_ID', 'N_Orders']) 
customers['Mean_Order_Amt'] = list(np.random.normal(loc = 100, scale = 25, size = num_customers))
customers['Mean_Order_Amt'] = customers['Mean_Order_Amt'].apply(lambda x: round(x, 2))
treatment_probs = []
for i in list(range(0, len(customers), 1)):
    treatment_probs.append(np.random.uniform())
customers['Treatment_Prob'] = treatment_probs
customers['Treated'] = np.where(customers['Treatment_Prob'] <= 0.5, 1, 0)
customers['Treatment_Modifier'] = np.where(customers['Treatment_Prob'] <= 0.5, 1+treatment_effect, 1.0)
customers = customers[['Customer_ID', 'N_Orders', 'Mean_Order_Amt', 'Treated', 'Treatment_Modifier']]
print('Customer data frame created.')

# Explode the customers dataframe by number of orders the customer placed.
#     explodedDF = pd.DataFrame(columns = ['Customer_ID', 'N_Orders', 'Mean_Order_Amt'])
j = 0
master_customer_entry = []
for i in list(range(0, len(customers), 1)):
    counter = 1
    customer_entry = []
    while counter <= int(customers.loc[[i]]['N_Orders']):
        customer_entry.append(list(customers.loc[[i]].values)) 
        counter += 1
    master_customer_entry.append(customer_entry)    
    if i % 1000 == 0:
        j += 1
        print(str(j*1000) + ' customers allocated (expansion ' + \
              '{} complete).'.format(str((j*1000)/num_customers)))
newlist = [item for items in master_customer_entry for item in items]
newlist = [item for items in newlist for item in items]
explodedDF = pd.DataFrame(newlist)
explodedDF.columns = ['Customer_ID', 'Num_Orders', 'Mean_Order_Amt', 'Treated', 'Treatment_Modifier']
explodedDF = explodedDF[['Customer_ID', 'Mean_Order_Amt', 'Treated', 'Treatment_Modifier']]
explodedDF

Retailer data frame created.
Customer data frame created.
1000 customers allocated (expansion 500.0 complete).


Unnamed: 0,Customer_ID,Mean_Order_Amt,Treated,Treatment_Modifier
0,1.0,139.32,1.0,1.01
1,1.0,139.32,1.0,1.01
2,2.0,105.97,0.0,1.0
3,2.0,105.97,0.0,1.0
4,2.0,105.97,0.0,1.0
5,2.0,105.97,0.0,1.0
6,2.0,105.97,0.0,1.0
7,2.0,105.97,0.0,1.0


In [64]:
def create_sim_data(num_retailers, num_customers, treatment_effect, file_name):

    import numpy as np
    import pandas as pd

    # Create retailers, their order amount scalars, and the probability space.
    retailer_ids = list(range(1, num_retailers + 1,1))
    retailer_scalars = list(np.random.normal(loc = 1, scale = 0.05, size = num_retailers))
    retailers = pd.DataFrame(list(zip(retailer_ids, retailer_scalars)), 
                             columns =['Retailer_ID', 'Retailer_Scalar']) 
    delimiters = list(np.linspace(0.0, 1.0, num = num_retailers + 1))
    retailers['a'] = delimiters[0:len(delimiters)-1]
    retailers['b'] = delimiters[1:len(delimiters)]
    print('Retailer data frame created.')
    
    # Create customers, their mean order amount, whether or not they got the treatment, 
    # and the number of orders placed.
    customer_ids = list(range(1, num_customers + 1,1))
    num_orders = [int(i*10) + 1 for i in list(np.random.exponential(scale = 1.0, size = num_customers))]
    customers = pd.DataFrame(list(zip(customer_ids, num_orders)), columns =['Customer_ID', 'N_Orders']) 
    customers['Mean_Order_Amt'] = list(np.random.normal(loc = 100, scale = 25, size = num_customers))
    customers['Mean_Order_Amt'] = customers['Mean_Order_Amt'].apply(lambda x: round(x, 2))
    treatment_probs = []
    for i in list(range(0, len(customers), 1)):
        treatment_probs.append(np.random.uniform())
    customers['Treatment_Prob'] = treatment_probs
    customers['Treated'] = np.where(customers['Treatment_Prob'] <= 0.5, 1, 0)
    customers['Treatment_Modifier'] = np.where(customers['Treatment_Prob'] <= 0.5, 1+treatment_effect, 1.0)
    customers = customers[['Customer_ID', 'N_Orders', 'Mean_Order_Amt', 'Treated', 'Treatment_Modifier']]
    print('Customer data frame created.')
    
    j = 0
    master_customer_entry = []
    for i in list(range(0, len(customers), 1)):
        counter = 1
        customer_entry = []
        while counter <= int(customers.loc[[i]]['N_Orders']):
            customer_entry.append(list(customers.loc[[i]].values)) 
            counter += 1
        master_customer_entry.append(customer_entry)    
        if i % 1000 == 0:
            j += 1
            print(str(j*1000) + ' customers allocated (expansion ' + \
                  '{} complete).'.format(str((j*1000)/num_customers)))
    newlist = [item for items in master_customer_entry for item in items]
    newlist = [item for items in newlist for item in items]
    explodedDF = pd.DataFrame(newlist)
    explodedDF.columns = ['Customer_ID', 'Num_Orders', 'Mean_Order_Amt', 'Treated', 'Treatment_Modifier']
    explodedDF = explodedDF[['Customer_ID', 'Mean_Order_Amt', 'Treated', 'Treatment_Modifier']]
    print('Customer dataframe exploded to order dataframe.')

    float_list = []
    for i in list(range(0, len(explodedDF), 1)):
        float_list.append(np.random.uniform())
    explodedDF['Retailer_Prob'] = float_list
    
    # Map the orders randomly to retailers and their correponding scalar.
    a = explodedDF.Retailer_Prob.values
    bh = retailers.b.values
    bl = retailers.a.values

    i, j = np.where((a[:, None] >= bl) & (a[:, None] <= bh))

    result_df = pd.DataFrame(
                np.column_stack([explodedDF.values[i], retailers.values[j]]),
                columns=explodedDF.columns.append(retailers.columns))
    result_df = result_df[['Customer_ID', 'Mean_Order_Amt', 'Treated', 'Treatment_Modifier','Retailer_ID', 
                           'Retailer_Scalar']]
    print('Orders randomly assigned to retailers.')
    
    # Add random noise 
    result_df['Noise'] = list(np.random.normal(loc = 10.0, scale = 1.0, size = len(result_df)))
    result_df['Noise'] = result_df['Noise'].apply(lambda x: round(x, 2))
    result_df['Order_Amt'] = result_df['Mean_Order_Amt']*result_df['Treatment_Modifier']* \
                             result_df['Retailer_Scalar'] + result_df['Noise']
    result_df['Order_Amt'] = result_df['Order_Amt'].apply(lambda x: round(x, 2))
    print('Noise added.')

    # Save to .csv
    print(str(len(result_df)) + ' unique observations created.')
    #result_df = result_df.sample(sample_size, replace=True)
    result_df = result_df.reset_index(drop=True)
    result_df.to_csv(file_name)


In [65]:
import time
start_time = time.time()
create_sim_data(1000, 1000000, 0.01, 'data_c_1000000_mde_01.csv')
print("--- {} minutes ---".format(str((time.time() - start_time)/60.0)))

Retailer data frame created.
Customer data frame created.
1000 customers allocated (expansion 0.001 complete).
2000 customers allocated (expansion 0.002 complete).
3000 customers allocated (expansion 0.003 complete).
4000 customers allocated (expansion 0.004 complete).
5000 customers allocated (expansion 0.005 complete).
6000 customers allocated (expansion 0.006 complete).
7000 customers allocated (expansion 0.007 complete).
8000 customers allocated (expansion 0.008 complete).
9000 customers allocated (expansion 0.009 complete).
10000 customers allocated (expansion 0.01 complete).
11000 customers allocated (expansion 0.011 complete).
12000 customers allocated (expansion 0.012 complete).
13000 customers allocated (expansion 0.013 complete).
14000 customers allocated (expansion 0.014 complete).
15000 customers allocated (expansion 0.015 complete).
16000 customers allocated (expansion 0.016 complete).
17000 customers allocated (expansion 0.017 complete).
18000 customers allocated (expansi

152000 customers allocated (expansion 0.152 complete).
153000 customers allocated (expansion 0.153 complete).
154000 customers allocated (expansion 0.154 complete).
155000 customers allocated (expansion 0.155 complete).
156000 customers allocated (expansion 0.156 complete).
157000 customers allocated (expansion 0.157 complete).
158000 customers allocated (expansion 0.158 complete).
159000 customers allocated (expansion 0.159 complete).
160000 customers allocated (expansion 0.16 complete).
161000 customers allocated (expansion 0.161 complete).
162000 customers allocated (expansion 0.162 complete).
163000 customers allocated (expansion 0.163 complete).
164000 customers allocated (expansion 0.164 complete).
165000 customers allocated (expansion 0.165 complete).
166000 customers allocated (expansion 0.166 complete).
167000 customers allocated (expansion 0.167 complete).
168000 customers allocated (expansion 0.168 complete).
169000 customers allocated (expansion 0.169 complete).
170000 cust

302000 customers allocated (expansion 0.302 complete).
303000 customers allocated (expansion 0.303 complete).
304000 customers allocated (expansion 0.304 complete).
305000 customers allocated (expansion 0.305 complete).
306000 customers allocated (expansion 0.306 complete).
307000 customers allocated (expansion 0.307 complete).
308000 customers allocated (expansion 0.308 complete).
309000 customers allocated (expansion 0.309 complete).
310000 customers allocated (expansion 0.31 complete).
311000 customers allocated (expansion 0.311 complete).
312000 customers allocated (expansion 0.312 complete).
313000 customers allocated (expansion 0.313 complete).
314000 customers allocated (expansion 0.314 complete).
315000 customers allocated (expansion 0.315 complete).
316000 customers allocated (expansion 0.316 complete).
317000 customers allocated (expansion 0.317 complete).
318000 customers allocated (expansion 0.318 complete).
319000 customers allocated (expansion 0.319 complete).
320000 cust

452000 customers allocated (expansion 0.452 complete).
453000 customers allocated (expansion 0.453 complete).
454000 customers allocated (expansion 0.454 complete).
455000 customers allocated (expansion 0.455 complete).
456000 customers allocated (expansion 0.456 complete).
457000 customers allocated (expansion 0.457 complete).
458000 customers allocated (expansion 0.458 complete).
459000 customers allocated (expansion 0.459 complete).
460000 customers allocated (expansion 0.46 complete).
461000 customers allocated (expansion 0.461 complete).
462000 customers allocated (expansion 0.462 complete).
463000 customers allocated (expansion 0.463 complete).
464000 customers allocated (expansion 0.464 complete).
465000 customers allocated (expansion 0.465 complete).
466000 customers allocated (expansion 0.466 complete).
467000 customers allocated (expansion 0.467 complete).
468000 customers allocated (expansion 0.468 complete).
469000 customers allocated (expansion 0.469 complete).
470000 cust

602000 customers allocated (expansion 0.602 complete).
603000 customers allocated (expansion 0.603 complete).
604000 customers allocated (expansion 0.604 complete).
605000 customers allocated (expansion 0.605 complete).
606000 customers allocated (expansion 0.606 complete).
607000 customers allocated (expansion 0.607 complete).
608000 customers allocated (expansion 0.608 complete).
609000 customers allocated (expansion 0.609 complete).
610000 customers allocated (expansion 0.61 complete).
611000 customers allocated (expansion 0.611 complete).
612000 customers allocated (expansion 0.612 complete).
613000 customers allocated (expansion 0.613 complete).
614000 customers allocated (expansion 0.614 complete).
615000 customers allocated (expansion 0.615 complete).
616000 customers allocated (expansion 0.616 complete).
617000 customers allocated (expansion 0.617 complete).
618000 customers allocated (expansion 0.618 complete).
619000 customers allocated (expansion 0.619 complete).
620000 cust

752000 customers allocated (expansion 0.752 complete).
753000 customers allocated (expansion 0.753 complete).
754000 customers allocated (expansion 0.754 complete).
755000 customers allocated (expansion 0.755 complete).
756000 customers allocated (expansion 0.756 complete).
757000 customers allocated (expansion 0.757 complete).
758000 customers allocated (expansion 0.758 complete).
759000 customers allocated (expansion 0.759 complete).
760000 customers allocated (expansion 0.76 complete).
761000 customers allocated (expansion 0.761 complete).
762000 customers allocated (expansion 0.762 complete).
763000 customers allocated (expansion 0.763 complete).
764000 customers allocated (expansion 0.764 complete).
765000 customers allocated (expansion 0.765 complete).
766000 customers allocated (expansion 0.766 complete).
767000 customers allocated (expansion 0.767 complete).
768000 customers allocated (expansion 0.768 complete).
769000 customers allocated (expansion 0.769 complete).
770000 cust

902000 customers allocated (expansion 0.902 complete).
903000 customers allocated (expansion 0.903 complete).
904000 customers allocated (expansion 0.904 complete).
905000 customers allocated (expansion 0.905 complete).
906000 customers allocated (expansion 0.906 complete).
907000 customers allocated (expansion 0.907 complete).
908000 customers allocated (expansion 0.908 complete).
909000 customers allocated (expansion 0.909 complete).
910000 customers allocated (expansion 0.91 complete).
911000 customers allocated (expansion 0.911 complete).
912000 customers allocated (expansion 0.912 complete).
913000 customers allocated (expansion 0.913 complete).
914000 customers allocated (expansion 0.914 complete).
915000 customers allocated (expansion 0.915 complete).
916000 customers allocated (expansion 0.916 complete).
917000 customers allocated (expansion 0.917 complete).
918000 customers allocated (expansion 0.918 complete).
919000 customers allocated (expansion 0.919 complete).
920000 cust