In [1]:
def create_sim_data(num_retailers, num_customers, treatment_effect, file_name):

    import os
    import random 
    import numpy as np
    import pandas as pd

    # Create retailers, their order amount scalars, and the probability space.
    retailer_ids = list(range(1, num_retailers + 1,1))
    retailer_scalars = list(np.random.normal(loc = 1, scale = 0.05, size = num_retailers))
    retailers = pd.DataFrame(list(zip(retailer_ids, retailer_scalars)), 
                             columns =['Retailer_ID', 'Retailer_Scalar']) 
    delimiters = list(np.linspace(0.0, 1.0, num = num_retailers + 1))
    retailers['a'] = delimiters[0:len(delimiters)-1]
    retailers['b'] = delimiters[1:len(delimiters)]
    print('Retailer data frame created.')
    
    # Create customers, their mean order amount, whether or not they got the treatment, 
    # and the number of orders placed.
    customer_ids = list(range(1, num_customers + 1,1))
    num_orders = [int(i*10) + 1 for i in list(np.random.exponential(scale = 1.0, size = num_customers))]
    customers = pd.DataFrame(list(zip(customer_ids, num_orders)), columns =['Customer_ID', 'N_Orders']) 
    customers['Mean_Order_Amt'] = list(np.random.normal(loc = 100, scale = 25, size = num_customers))
    customers['Mean_Order_Amt'] = customers['Mean_Order_Amt'].apply(lambda x: round(x, 2))
    treatment_probs = []
    for i in list(range(0, len(customers), 1)):
        treatment_probs.append(np.random.uniform())
    customers['Treatment_Prob'] = treatment_probs
    customers['Treated'] = np.where(customers['Treatment_Prob'] <= 0.5, 1, 0)
    customers['Treatment_Modifier'] = np.where(customers['Treatment_Prob'] <= 0.5, 1+treatment_effect, 1.0)
    customers = customers[['Customer_ID', 'N_Orders', 'Mean_Order_Amt', 'Treated', 'Treatment_Modifier']]
    print('Customer data frame created.')
    
    j = 0
    master_customer_entry = []
    for i in list(range(0, len(customers), 1)):
        counter = 1
        customer_entry = []
        while counter <= int(customers.loc[[i]]['N_Orders']):
            customer_entry.append(list(customers.loc[[i]].values)) 
            counter += 1
        master_customer_entry.append(customer_entry)    
        if i % 1000 == 0:
            j += 1
            print(str(j*1000) + ' customers allocated (expansion ' + \
                  '{} complete).'.format(str((j*1000)/num_customers)))
    newlist = [item for items in master_customer_entry for item in items]
    newlist = [item for items in newlist for item in items]
    explodedDF = pd.DataFrame(newlist)
    explodedDF.columns = ['Customer_ID', 'Num_Orders', 'Mean_Order_Amt', 'Treated', 'Treatment_Modifier']
    explodedDF = explodedDF[['Customer_ID', 'Mean_Order_Amt', 'Treated', 'Treatment_Modifier']]
    print('Customer dataframe exploded to order dataframe.')

    float_list = []
    for i in list(range(0, len(explodedDF), 1)):
        float_list.append(np.random.uniform())
    explodedDF['Retailer_Prob'] = float_list
    
    # Map the orders randomly to retailers and their correponding scalar.
    a = explodedDF.Retailer_Prob.values
    bh = retailers.b.values
    bl = retailers.a.values

    i, j = np.where((a[:, None] >= bl) & (a[:, None] <= bh))

    result_df = pd.DataFrame(
                np.column_stack([explodedDF.values[i], retailers.values[j]]),
                columns=explodedDF.columns.append(retailers.columns))
    result_df = result_df[['Customer_ID', 'Mean_Order_Amt', 'Treated', 'Treatment_Modifier','Retailer_ID', 
                           'Retailer_Scalar']]
    print('Orders randomly assigned to retailers.')
    
    
    # Add DoW 
    dow = []
    z = 0
    while z < len(result_df):
        dow.append(random.randint(1, 7))
        z += 1
        
    result_df['Dow_Rand'] = dow
    result_df['DOW'] = 0
    result_df['DOW'] = np.where(result_df['Dow_Rand'] == 1, 0.7, result_df['DOW'])    
    result_df['DOW'] = np.where(result_df['Dow_Rand'] == 2, 0.8, result_df['DOW'])  
    result_df['DOW'] = np.where(result_df['Dow_Rand'] == 3, 0.9, result_df['DOW'])  
    result_df['DOW'] = np.where(result_df['Dow_Rand'] == 4, 1.0, result_df['DOW'])  
    result_df['DOW'] = np.where(result_df['Dow_Rand'] == 5, 1.1, result_df['DOW'])  
    result_df['DOW'] = np.where(result_df['Dow_Rand'] == 6, 1.2, result_df['DOW'])  
    result_df['DOW'] = np.where(result_df['Dow_Rand'] == 7, 1.3, result_df['DOW'])  
    
    # Add random noise 
    result_df['Noise'] = list(np.random.normal(loc = 25.0, scale = 5.0, size = len(result_df)))
    result_df['Noise'] = result_df['Noise'].apply(lambda x: round(x, 2))
    result_df['Order_Amt'] = result_df['Mean_Order_Amt']*result_df['Treatment_Modifier']* \
                             result_df['Retailer_Scalar']*result_df['DOW'] + result_df['Noise']
    result_df['Order_Amt'] = result_df['Order_Amt'].apply(lambda x: round(x, 2))
    print('Noise added.')
    
    # Save to .csv
    print(str(len(result_df)) + ' unique observations created.')
    #result_df = result_df.sample(sample_size, replace=True)
    result_df = result_df.reset_index(drop=True)
    
    # Save to .csv
    if not os.path.exists('./residual_dfs'):
        os.makedirs('./residual_dfs')
    
    result_df.to_csv('./residual_dfs/' + file_name)
    print("{} was saved to disk.".format('./residual_dfs/' + file_name))


In [2]:
labels = ['part_II_df_mde_0_001_n_100000_c.csv']
mde = [0.05]

In [3]:
for i in list(range(0,len(labels))):
    import time
    start_time = time.time()
    create_sim_data(1000, 500000, mde[i], labels[i])
    print("--- {} minutes ---".format(str((time.time() - start_time)/60.0)))

Retailer data frame created.
Customer data frame created.
1000 customers allocated (expansion 0.002 complete).
2000 customers allocated (expansion 0.004 complete).
3000 customers allocated (expansion 0.006 complete).
4000 customers allocated (expansion 0.008 complete).
5000 customers allocated (expansion 0.01 complete).
6000 customers allocated (expansion 0.012 complete).
7000 customers allocated (expansion 0.014 complete).
8000 customers allocated (expansion 0.016 complete).
9000 customers allocated (expansion 0.018 complete).
10000 customers allocated (expansion 0.02 complete).
11000 customers allocated (expansion 0.022 complete).
12000 customers allocated (expansion 0.024 complete).
13000 customers allocated (expansion 0.026 complete).
14000 customers allocated (expansion 0.028 complete).
15000 customers allocated (expansion 0.03 complete).
16000 customers allocated (expansion 0.032 complete).
17000 customers allocated (expansion 0.034 complete).
18000 customers allocated (expansion

152000 customers allocated (expansion 0.304 complete).
153000 customers allocated (expansion 0.306 complete).
154000 customers allocated (expansion 0.308 complete).
155000 customers allocated (expansion 0.31 complete).
156000 customers allocated (expansion 0.312 complete).
157000 customers allocated (expansion 0.314 complete).
158000 customers allocated (expansion 0.316 complete).
159000 customers allocated (expansion 0.318 complete).
160000 customers allocated (expansion 0.32 complete).
161000 customers allocated (expansion 0.322 complete).
162000 customers allocated (expansion 0.324 complete).
163000 customers allocated (expansion 0.326 complete).
164000 customers allocated (expansion 0.328 complete).
165000 customers allocated (expansion 0.33 complete).
166000 customers allocated (expansion 0.332 complete).
167000 customers allocated (expansion 0.334 complete).
168000 customers allocated (expansion 0.336 complete).
169000 customers allocated (expansion 0.338 complete).
170000 custom

302000 customers allocated (expansion 0.604 complete).
303000 customers allocated (expansion 0.606 complete).
304000 customers allocated (expansion 0.608 complete).
305000 customers allocated (expansion 0.61 complete).
306000 customers allocated (expansion 0.612 complete).
307000 customers allocated (expansion 0.614 complete).
308000 customers allocated (expansion 0.616 complete).
309000 customers allocated (expansion 0.618 complete).
310000 customers allocated (expansion 0.62 complete).
311000 customers allocated (expansion 0.622 complete).
312000 customers allocated (expansion 0.624 complete).
313000 customers allocated (expansion 0.626 complete).
314000 customers allocated (expansion 0.628 complete).
315000 customers allocated (expansion 0.63 complete).
316000 customers allocated (expansion 0.632 complete).
317000 customers allocated (expansion 0.634 complete).
318000 customers allocated (expansion 0.636 complete).
319000 customers allocated (expansion 0.638 complete).
320000 custom

452000 customers allocated (expansion 0.904 complete).
453000 customers allocated (expansion 0.906 complete).
454000 customers allocated (expansion 0.908 complete).
455000 customers allocated (expansion 0.91 complete).
456000 customers allocated (expansion 0.912 complete).
457000 customers allocated (expansion 0.914 complete).
458000 customers allocated (expansion 0.916 complete).
459000 customers allocated (expansion 0.918 complete).
460000 customers allocated (expansion 0.92 complete).
461000 customers allocated (expansion 0.922 complete).
462000 customers allocated (expansion 0.924 complete).
463000 customers allocated (expansion 0.926 complete).
464000 customers allocated (expansion 0.928 complete).
465000 customers allocated (expansion 0.93 complete).
466000 customers allocated (expansion 0.932 complete).
467000 customers allocated (expansion 0.934 complete).
468000 customers allocated (expansion 0.936 complete).
469000 customers allocated (expansion 0.938 complete).
470000 custom