In [1]:
def create_sim_data(num_retailers, num_customers, treatment_effect):

    import numpy as np
    import pandas as pd

    # Create retailers, their order amount scalars, and the probability space.
    retailer_ids = list(range(1, num_retailers + 1,1))
    retailer_scalars = list(np.random.normal(loc = 1, scale = 0.05, size = num_retailers))
    retailers = pd.DataFrame(list(zip(retailer_ids, retailer_scalars)), 
                             columns =['Retailer_ID', 'Retailer_Scalar']) 
    delimiters = list(np.linspace(0.0, 1.0, num = num_retailers + 1))
    retailers['a'] = delimiters[0:len(delimiters)-1]
    retailers['b'] = delimiters[1:len(delimiters)]
    print('Retailer data frame created.')
    
    # Create customers, their mean order amount, whether or not they got the treatment, 
    # and the number of orders placed.
    customer_ids = list(range(1, num_customers + 1,1))
    num_orders = [int(i*10) + 1 for i in list(np.random.exponential(scale = 1.0, size = num_customers))]
    customers = pd.DataFrame(list(zip(customer_ids, num_orders)), columns =['Customer_ID', 'N_Orders']) 
    customers['Mean_Order_Amt'] = list(np.random.normal(loc = 100, scale = 25, size = num_customers))
    customers['Mean_Order_Amt'] = customers['Mean_Order_Amt'].apply(lambda x: round(x, 2))
    treatment_probs = []
    for i in list(range(0, len(customers), 1)):
        treatment_probs.append(np.random.uniform())
    customers['Treatment_Prob'] = treatment_probs
    customers['Treated'] = np.where(customers['Treatment_Prob'] <= 0.5, 1, 0)
    customers['Treatment_Modifier'] = np.where(customers['Treatment_Prob'] <= 0.5, 1+treatment_effect, 1.0)
    customers = customers[['Customer_ID', 'N_Orders', 'Mean_Order_Amt', 'Treated', 'Treatment_Modifier']]
    print('Customer data frame created.')
    
    # Explode the customers dataframe by number of orders the customer placed.
    explodedDF = pd.DataFrame(columns = ['Customer_ID', 'N_Orders', 'Mean_Order_Amt'])
    j = 0
    for i in list(range(0, len(customers), 1)):
        counter = 1
        while counter <= int(customers.loc[[i]]['N_Orders']):
            explodedDF = explodedDF.append(customers.loc[[i]])
            counter += 1
        if i % 1000 == 0:
            j += 1
            print(str(j*1000) + ' customers allocated (expansion {} complete).'.format(str((j*1000)/num_customers)))

    explodedDF = explodedDF[['Customer_ID', 'Mean_Order_Amt', 'Treated', 'Treatment_Modifier']]
    print('Customer dataframe exploded to order dataframe.')

    float_list = []
    for i in list(range(0, len(explodedDF), 1)):
        float_list.append(np.random.uniform())
    explodedDF['Retailer_Prob'] = float_list
    
    # Map the orders randomly to retailers and their correponding scalar.
    a = explodedDF.Retailer_Prob.values
    bh = retailers.b.values
    bl = retailers.a.values

    i, j = np.where((a[:, None] >= bl) & (a[:, None] <= bh))

    result_df = pd.DataFrame(
                np.column_stack([explodedDF.values[i], retailers.values[j]]),
                columns=explodedDF.columns.append(retailers.columns))
    result_df = result_df[['Customer_ID', 'Mean_Order_Amt', 'Treated', 'Treatment_Modifier','Retailer_ID', 
                           'Retailer_Scalar']]
    print('Orders randomly assigned to retailers.')
    
    # Add random noise 
    result_df['Noise'] = list(np.random.normal(loc = 10.0, scale = 1.0, size = len(result_df)))
    result_df['Noise'] = result_df['Noise'].apply(lambda x: round(x, 2))
    result_df['Order_Amt'] = result_df['Mean_Order_Amt']*result_df['Treatment_Modifier']* \
                             result_df['Retailer_Scalar'] + result_df['Noise']
    result_df['Order_Amt'] = result_df['Order_Amt'].apply(lambda x: round(x, 2))
    print('Noise added.')

    # Save to .csv
    print(str(len(result_df)) + ' unique observations created.')
    #result_df = result_df.sample(sample_size, replace=True)
    result_df = result_df.reset_index(drop=True)
    result_df.to_csv('data_c_16000_mde_01.csv')


In [None]:
import time
start_time = time.time()
create_sim_data(1000, 16000, 0.01)
print("--- {} minutes ---".format(str((time.time() - start_time)/60.0)))

Retailer data frame created.
Customer data frame created.
1000 customers allocated (expansion 0.0625 complete).


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


2000 customers allocated (expansion 0.125 complete).
3000 customers allocated (expansion 0.1875 complete).
4000 customers allocated (expansion 0.25 complete).
5000 customers allocated (expansion 0.3125 complete).
6000 customers allocated (expansion 0.375 complete).
7000 customers allocated (expansion 0.4375 complete).
8000 customers allocated (expansion 0.5 complete).
