In [24]:
import pandas as pd
from numpy import random 
from mimesis import Generic
from faker import Faker
from datetime import datetime

In [25]:
# define seed for all data generators
fake = Faker()
fake.seed_instance(0)
generic = Generic(seed=2)
random.seed(2)

## Mock Sales Data

### Customer table

In [116]:
# function to create customer profile
def create_customer(num):
    customers=[fake.simple_profile() for x in range(num)]

    return customers

In [117]:
# create dataframe with customer info
customer_df = pd.DataFrame(create_customer(30)) # define number of customers in generated dataset
customer_df['customer_id'] = customer_df.index+1
customer_df['customer_id'] = customer_df['customer_id'].map("{:04}".format)
customer_df

Unnamed: 0,username,name,sex,address,mail,birthdate,customer_id
0,tpennington,Brittany Lewis,F,"842 Gene Trafficway\nDavismouth, KY 39633",kingsarah@yahoo.com,2020-11-30,1
1,kimberly69,Mrs. Carly Berger,F,"5238 Williams Centers\nNorth Elizabeth, VA 66575",wolfmonica@hotmail.com,1963-10-09,2
2,thorntonallison,Duane Smith,M,"1735 Michael Cove Apt. 965\nJenniferview, NC 2...",yhill@gmail.com,1992-05-18,3
3,daniel01,Chad Bennett,M,"728 John Prairie\nNew Seth, NJ 29294",jhobbs@hotmail.com,1972-03-26,4
4,jeremyclarke,Lisa Russell,F,"31739 Jones Spur\nSouth Emilyburgh, NV 85196",theresasweeney@gmail.com,1956-06-15,5
5,qharrington,Rebecca Harris,F,"751 Jacqueline Flats\nClarkshire, NV 83038",bakerdonna@hotmail.com,1931-02-12,6
6,jasoncarpenter,Lisa Miller,F,"3301 Roberson Field\nSouth Claudia, MN 64543",sjohnson@yahoo.com,1927-09-09,7
7,llane,Steve Bailey,M,"506 Anthony Valley Apt. 571\nVegaport, ME 20814",coxpeter@gmail.com,1984-03-06,8
8,jmitchell,Evan Mcdonald,M,"22221 Francis Station\nNew Monica, CO 23299",nsharp@gmail.com,1920-11-12,9
9,daniel58,Danny Martin,M,"PSC 3103, Box 1896\nAPO AA 27535",edward35@hotmail.com,1915-05-16,10


### Product table

In [118]:
# function to create products info
def create_product(num):
    category = ['fruit', 'vegetable', 'drink', 'dish']  # defined categories
    product_name = [generic.food.fruit, generic.food.vegetable, generic.food.drink, generic.food.dish] #respective product names
    price_range = [(1.99, 20), (0.99, 10), (5,10), (15,50)] #price range for respective product categories

    product=[]
    # iterate 
    for index, item in enumerate(category):
        product.extend([{
            'product_id': f'{len(product)+x+1:03}',
            'product_name': product_name[index](),
            'category': item,
            'unit_price': round(random.uniform(*price_range[index]), 2)
        } for x in range(num)])

    return product

In [119]:
product_df = pd.DataFrame(create_product(10)) # define count of products in each category
product_df

Unnamed: 0,product_id,product_name,category,unit_price
0,1,Bearberry,fruit,4.18
1,2,Giant granadilla,fruit,15.21
2,3,Hippophae,fruit,9.26
3,4,Bignay,fruit,19.87
4,5,Calabash tree,fruit,15.9
5,6,Citron,fruit,17.22
6,7,Bilberry,fruit,2.09
7,8,Karonda,fruit,14.43
8,9,Keule,fruit,10.75
9,10,Doubah,fruit,19.88


### Order table

In [120]:
def create_orders(num):
    orders=[]
    for x in range(num):
        
        date = fake.date_between_dates(date_start=datetime(2021, 1, 1), date_end=datetime(2023, 6, 30))
        
        transaction={
        'date': date,
        'order_id': f'{date.strftime("%Y%m%d")}{x:04}',
        'customer_id': customer_df['customer_id'].iloc[random.randint(0, len(customer_df.index))],
        'session_type': random.choice(['Web', 'Playstore', 'App Store'], p=[0.4, 0.3, 0.3])
        }
        
        # one order may contain multiple products sold
        # create row for each product within the order
        for y in range(1, random.randint(1,10)):
            ind = random.randint(0, len(product_df.index))
            transaction['product_id'] = product_df['product_id'].iloc[ind]
            # transaction['product_name'] = product_df['product_name'].iloc[ind]
            # transaction['unit_price'] = product_df['unit_price'].iloc[ind]
            transaction['unit_count']= random.randint(1,10)
            # transaction['amount'] = round(transaction['unit_price'] * transaction['unit_count'],2)

            add_trans = transaction.copy()

            orders.append(transaction)
    
    return orders

In [121]:
order_df = pd.DataFrame(create_orders(1000)) #define number of orders
order_df.head(30)

Unnamed: 0,date,order_id,customer_id,session_type,product_id,unit_count
0,2022-06-23,202206230000,23,Web,38,4
1,2022-06-23,202206230000,23,Web,38,4
2,2022-06-23,202206230000,23,Web,38,4
3,2022-06-23,202206230000,23,Web,38,4
4,2022-06-23,202206230000,23,Web,38,4
5,2022-06-23,202206230000,23,Web,38,4
6,2022-06-23,202206230000,23,Web,38,4
7,2022-06-23,202206230000,23,Web,38,4
8,2022-11-11,202211110001,19,Playstore,15,1
9,2022-11-11,202211110001,19,Playstore,15,1


### Join and export Sales

In [122]:
sales_df = order_df.merge(product_df, how='left', on='product_id')
sales_df = sales_df.merge(customer_df, how='left', on='customer_id')

sales_df.head()

Unnamed: 0,date,order_id,customer_id,session_type,product_id,unit_count,product_name,category,unit_price,username,name,sex,address,mail,birthdate
0,2022-06-23,202206230000,23,Web,38,4,Parma Ham,dish,25.29,tuckermichael,James Singh,M,"0319 Warren Hills\nLake Randyborough, GU 14637",rdelacruz@hotmail.com,1937-01-19
1,2022-06-23,202206230000,23,Web,38,4,Parma Ham,dish,25.29,tuckermichael,James Singh,M,"0319 Warren Hills\nLake Randyborough, GU 14637",rdelacruz@hotmail.com,1937-01-19
2,2022-06-23,202206230000,23,Web,38,4,Parma Ham,dish,25.29,tuckermichael,James Singh,M,"0319 Warren Hills\nLake Randyborough, GU 14637",rdelacruz@hotmail.com,1937-01-19
3,2022-06-23,202206230000,23,Web,38,4,Parma Ham,dish,25.29,tuckermichael,James Singh,M,"0319 Warren Hills\nLake Randyborough, GU 14637",rdelacruz@hotmail.com,1937-01-19
4,2022-06-23,202206230000,23,Web,38,4,Parma Ham,dish,25.29,tuckermichael,James Singh,M,"0319 Warren Hills\nLake Randyborough, GU 14637",rdelacruz@hotmail.com,1937-01-19


In [123]:
sales_df.to_csv('sales_data', index=False)

# Mock Sessions

In [125]:
def create_sessions(num):
    sessions = [
        {'date': fake.date_between_dates(date_start=datetime(2021, 1, 1), date_end=datetime(2023, 6, 30)),
         'session_type': random.choice(['Web', 'Playstore', 'App Store'], p=[0.4, 0.3, 0.3]),
         'lead': random.choice(['Yes', 'No'], p=[0.3, 0.7])
    } for x in range(num)]
    
    return sessions


In [126]:
engagement_df = pd.DataFrame(create_sessions(num=100000))
engagement_df['session_id'] = engagement_df.index+1
engagement_df['session_id'] = engagement_df['session_id'].map("{:09}".format)
engagement_df

Unnamed: 0,date,session_type,lead,session_id
0,2023-02-20,App Store,Yes,000000001
1,2021-11-26,App Store,No,000000002
2,2022-05-12,Playstore,No,000000003
3,2021-09-13,Web,No,000000004
4,2022-07-20,Web,No,000000005
...,...,...,...,...
99995,2022-01-17,Web,Yes,000099996
99996,2023-05-30,App Store,No,000099997
99997,2022-07-11,Playstore,Yes,000099998
99998,2021-03-22,Playstore,No,000099999


In [127]:
engagement_df.to_csv('engagement_data', index=False)

# Mock Satisfaction