In [4]:
def create_sim_data(num_retailers, num_customers, treatment_effect, file_name):

    import os
    import random 
    import numpy as np
    import pandas as pd

    # Create retailers, their order amount scalars, and the probability space.
    retailer_ids = list(range(1, num_retailers + 1,1))
    retailer_scalars = list(np.random.normal(loc = 1, scale = 0.05, size = num_retailers))
    retailers = pd.DataFrame(list(zip(retailer_ids, retailer_scalars)), 
                             columns =['Retailer_ID', 'Retailer_Scalar']) 
    delimiters = list(np.linspace(0.0, 1.0, num = num_retailers + 1))
    retailers['a'] = delimiters[0:len(delimiters)-1]
    retailers['b'] = delimiters[1:len(delimiters)]
    print('Retailer data frame created.')
    
    # Create customers, their mean order amount, whether or not they got the treatment, 
    # and the number of orders placed.
    customer_ids = list(range(1, num_customers + 1,1))
    num_orders = [int(i*10) + 1 for i in list(np.random.exponential(scale = 1.0, size = num_customers))]
    customers = pd.DataFrame(list(zip(customer_ids, num_orders)), columns =['Customer_ID', 'N_Orders']) 
    customers['Mean_Order_Amt'] = list(np.random.normal(loc = 100, scale = 25, size = num_customers))
    customers['Mean_Order_Amt'] = customers['Mean_Order_Amt'].apply(lambda x: round(x, 2))
    treatment_probs = []
    for i in list(range(0, len(customers), 1)):
        treatment_probs.append(np.random.uniform())
    customers['Treatment_Prob'] = treatment_probs
    customers['Treated'] = np.where(customers['Treatment_Prob'] <= 0.5, 1, 0)
    customers['Treatment_Modifier'] = np.where(customers['Treatment_Prob'] <= 0.5, 1+treatment_effect, 1.0)
    customers = customers[['Customer_ID', 'N_Orders', 'Mean_Order_Amt', 'Treated', 'Treatment_Modifier']]
    print('Customer data frame created.')
    
    j = 0
    master_customer_entry = []
    for i in list(range(0, len(customers), 1)):
        counter = 1
        customer_entry = []
        while counter <= int(customers.loc[[i]]['N_Orders']):
            customer_entry.append(list(customers.loc[[i]].values)) 
            counter += 1
        master_customer_entry.append(customer_entry)    
        if i % 1000 == 0:
            j += 1
            print(str(j*1000) + ' customers allocated (expansion ' + \
                  '{} complete).'.format(str((j*1000)/num_customers)))
    newlist = [item for items in master_customer_entry for item in items]
    newlist = [item for items in newlist for item in items]
    explodedDF = pd.DataFrame(newlist)
    explodedDF.columns = ['Customer_ID', 'Num_Orders', 'Mean_Order_Amt', 'Treated', 'Treatment_Modifier']
    explodedDF = explodedDF[['Customer_ID', 'Mean_Order_Amt', 'Treated', 'Treatment_Modifier']]
    print('Customer dataframe exploded to order dataframe.')

    float_list = []
    for i in list(range(0, len(explodedDF), 1)):
        float_list.append(np.random.uniform())
    explodedDF['Retailer_Prob'] = float_list
    
    # Map the orders randomly to retailers and their correponding scalar.
    a = explodedDF.Retailer_Prob.values
    bh = retailers.b.values
    bl = retailers.a.values

    i, j = np.where((a[:, None] >= bl) & (a[:, None] <= bh))

    result_df = pd.DataFrame(
                np.column_stack([explodedDF.values[i], retailers.values[j]]),
                columns=explodedDF.columns.append(retailers.columns))
    result_df = result_df[['Customer_ID', 'Mean_Order_Amt', 'Treated', 'Treatment_Modifier','Retailer_ID', 
                           'Retailer_Scalar']]
    print('Orders randomly assigned to retailers.')
    
    
    # Add DoW 
    dow = []
    z = 0
    while z < len(result_df):
        dow.append(random.randint(1, 7))
        z += 1
        
    result_df['Dow_Rand'] = dow
    result_df['DOW'] = 0
    result_df['DOW'] = np.where(result_df['Dow_Rand'] == 1, 0.7, result_df['DOW'])    
    result_df['DOW'] = np.where(result_df['Dow_Rand'] == 2, 0.8, result_df['DOW'])  
    result_df['DOW'] = np.where(result_df['Dow_Rand'] == 3, 0.9, result_df['DOW'])  
    result_df['DOW'] = np.where(result_df['Dow_Rand'] == 4, 1.0, result_df['DOW'])  
    result_df['DOW'] = np.where(result_df['Dow_Rand'] == 5, 1.1, result_df['DOW'])  
    result_df['DOW'] = np.where(result_df['Dow_Rand'] == 6, 1.2, result_df['DOW'])  
    result_df['DOW'] = np.where(result_df['Dow_Rand'] == 7, 1.3, result_df['DOW'])  
    
    # Add random noise 
    result_df['Noise'] = list(np.random.normal(loc = 25.0, scale = 5.0, size = len(result_df)))
    result_df['Noise'] = result_df['Noise'].apply(lambda x: round(x, 2))
    result_df['Order_Amt'] = result_df['Mean_Order_Amt']*result_df['Treatment_Modifier']* \
                             result_df['Retailer_Scalar']*result_df['DOW'] + result_df['Noise']
    result_df['Order_Amt'] = result_df['Order_Amt'].apply(lambda x: round(x, 2))
    print('Noise added.')
    
    # Save to .csv
    print(str(len(result_df)) + ' unique observations created.')
    #result_df = result_df.sample(sample_size, replace=True)
    result_df = result_df.reset_index(drop=True)
    
    # Save to .csv
    if not os.path.exists('./residual_dfs'):
        os.makedirs('./residual_dfs')
    
    result_df.to_csv('./residual_dfs/' + file_name)
    print("{} was saved to disk.".format('./residual_dfs/' + file_name))


In [5]:
labels = ['part_II_df_mde_0_05_n_100000_a.csv',  
          'part_II_df_mde_0_05_n_100000_b.csv',
          'part_II_df_mde_0_025_n_100000_a.csv', 
          'part_II_df_mde_0_025_n_100000_b.csv']
mde = [0.05, 0.05, 0.025, 0.025]

In [6]:
for i in list(range(0,len(labels))):
    import time
    start_time = time.time()
    create_sim_data(1000, 100000, mde[i], labels[i])
    print("--- {} minutes ---".format(str((time.time() - start_time)/60.0)))

Retailer data frame created.
Customer data frame created.
1000 customers allocated (expansion 0.01 complete).
2000 customers allocated (expansion 0.02 complete).
3000 customers allocated (expansion 0.03 complete).
4000 customers allocated (expansion 0.04 complete).
5000 customers allocated (expansion 0.05 complete).
6000 customers allocated (expansion 0.06 complete).
7000 customers allocated (expansion 0.07 complete).
8000 customers allocated (expansion 0.08 complete).
9000 customers allocated (expansion 0.09 complete).
10000 customers allocated (expansion 0.1 complete).
11000 customers allocated (expansion 0.11 complete).
12000 customers allocated (expansion 0.12 complete).
13000 customers allocated (expansion 0.13 complete).
14000 customers allocated (expansion 0.14 complete).
15000 customers allocated (expansion 0.15 complete).
16000 customers allocated (expansion 0.16 complete).
17000 customers allocated (expansion 0.17 complete).
18000 customers allocated (expansion 0.18 complete)

50000 customers allocated (expansion 0.5 complete).
51000 customers allocated (expansion 0.51 complete).
52000 customers allocated (expansion 0.52 complete).
53000 customers allocated (expansion 0.53 complete).
54000 customers allocated (expansion 0.54 complete).
55000 customers allocated (expansion 0.55 complete).
56000 customers allocated (expansion 0.56 complete).
57000 customers allocated (expansion 0.57 complete).
58000 customers allocated (expansion 0.58 complete).
59000 customers allocated (expansion 0.59 complete).
60000 customers allocated (expansion 0.6 complete).
61000 customers allocated (expansion 0.61 complete).
62000 customers allocated (expansion 0.62 complete).
63000 customers allocated (expansion 0.63 complete).
64000 customers allocated (expansion 0.64 complete).
65000 customers allocated (expansion 0.65 complete).
66000 customers allocated (expansion 0.66 complete).
67000 customers allocated (expansion 0.67 complete).
68000 customers allocated (expansion 0.68 comple

100000 customers allocated (expansion 1.0 complete).
Customer dataframe exploded to order dataframe.
Orders randomly assigned to retailers.
Noise added.
1051576 unique observations created.
./residual_dfs/part_II_df_mde_0_025_n_100000_a.csv was saved to disk.
--- 24.841423892974852 minutes ---
Retailer data frame created.
Customer data frame created.
1000 customers allocated (expansion 0.01 complete).
2000 customers allocated (expansion 0.02 complete).
3000 customers allocated (expansion 0.03 complete).
4000 customers allocated (expansion 0.04 complete).
5000 customers allocated (expansion 0.05 complete).
6000 customers allocated (expansion 0.06 complete).
7000 customers allocated (expansion 0.07 complete).
8000 customers allocated (expansion 0.08 complete).
9000 customers allocated (expansion 0.09 complete).
10000 customers allocated (expansion 0.1 complete).
11000 customers allocated (expansion 0.11 complete).
12000 customers allocated (expansion 0.12 complete).
13000 customers alloc