<a href="https://colab.research.google.com/github/cbonnin88/GreenBox/blob/main/The_GreenBox_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# **Settings**

In [3]:
np.random.seed(99)
num_users = 10000
start_date = datetime(2025,1,1)
end_date = datetime(2025,4,30)

print('Initializing GreenBox Data Generation')

Initializing GreenBox Data Generation


# **Generate Users (Demographics & Groups)**

In [8]:
user_ids = [f'U{i:05d}' for i in range(num_users)]
channels = ['Facebook','Google Ads','TikTok','Instagram','Organic','Referral']
channel_probs = [0.30, 0.20, 0.20, 0.15, 0.10, 0.05]

In [9]:
# Devices
devices = ['Mobile','Desktop','Tablet']
device_probs = [0.70,0.20,0.10]

In [10]:
# A/B Test: 'Save Money' (Control) vs 'Save Planet' (Variant)
ab_groups = np.random.choice(['Control','Variant'], num_users, p=[0.5,0.5])

user_data = {
    'user_id': user_ids,
    'signup_date': [start_date + timedelta(days=np.random.randint(0,90)) for _ in range(num_users)],
    'acquisition_channel': np.random.choice(channels, num_users, p=channel_probs),
    'ab_test_group': ab_groups,
    'age_groups': np.random.choice(['18-24','25-34','35-44','45+'], num_users),
    'device': np.random.choice(devices, num_users, p=device_probs)
}

df_users = pd.DataFrame(user_data)

In [11]:
df_users.head()

Unnamed: 0,user_id,signup_date,acquisition_channel,ab_test_group,age_groups,device
0,U00000,2025-03-18,Facebook,Variant,25-34,Desktop
1,U00001,2025-02-18,Instagram,Control,35-44,Mobile
2,U00002,2025-03-21,Organic,Control,25-34,Mobile
3,U00003,2025-02-08,Instagram,Variant,35-44,Desktop
4,U00004,2025-03-31,Facebook,Variant,45+,Tablet


# **Generating Marketing Spend (Daily Agg)**

In [12]:
dates = pd.date_range(start_date,end_date)
spend_data = []

In [13]:
paid_channels = ['Facebook','Google Ads','TikTok','Instagram']

In [14]:
for d in dates:
  for pc in paid_channels:
    # Random Daily spend with some seasonality
    base_spend = np.random.uniform(200,800)

    if d.weekday() >= 5:
      base_spend *= 1.2

    daily_spend = round(base_spend,2)

    # CPC varies by channel
    if pc == 'Facebook':
      cpc = np.random.uniform(1.5,3.0)
    elif pc == 'Instagram':
      cpc = np.random.uniform(1.8,3.5) # Instagram slight pricier than FB
    elif pc == 'Google Ads':
      cpc = np.random.uniform(2.0,5.0)
    else:
      cpc = np.random.uniform(0.5,1.5) # TikTok is Cheap

    clicks = int(daily_spend / cpc)
    spend_data.append([d,pc,daily_spend,clicks])

df_spend = pd.DataFrame(spend_data, columns=['date','channels','spend','clicks'])

In [15]:
df_spend.head()

Unnamed: 0,date,channels,spend,clicks
0,2025-01-01,Facebook,220.12,79
1,2025-01-01,Google Ads,570.76,119
2,2025-01-01,TikTok,234.3,419
3,2025-01-01,Instagram,429.37,198
4,2025-01-02,Facebook,328.05,161


# **Generating Events (Behavior + Revenue)**

In [17]:
events_list = []
print('Simulating user behavoir...')

Simulating user behavoir...


In [20]:
for idx, user in df_users.iterrows():
  # Base Conversion Rate
  base_prob = 0.30 if user['ab_test_group'] == 'Variant' else 0.22

  # 1. View Landing Page
  curr_time = user['signup_date'] + timedelta(hours=random.randint(8,20))
  events_list.append([user['user_id'], 'view_landing_page', curr_time,0.0])

  # 2. Add to Cart
  if random.random() < 0.60:
    curr_time += timedelta(minutes=random.randint(2,10))
    events_list.append([user['user_id'], 'add_to_cart',curr_time,0.0])

    # 3. Checkout_start
    if random.random() < 0.50:
      curr_time += timedelta(minutes=random.randint(1,5))
      events_list.append([user['user_id'], 'checkout_start',curr_time,0.0])

    # 4. Purchase
      if random.random() < base_prob:
        curr_time += timedelta(minutes=random.randint(2,5))
        events_list.append([user['user_id'], 'purchase',curr_time, 49.99])

      # LTV Simulation
        renewal_date = curr_time
        for month in range(1,6):
          if random.random() < 0.70:
            renewal_date += timedelta(days=30)
            if renewal_date < end_date:
              events_list.append([user['user_id'], 'purchase', renewal_date, 49.99])
        else:
          pass # Changed 'break' to 'pass' to allow the loop to complete for other users

df_events = pd.DataFrame(events_list, columns=['user_id','event_name','timestamp','revenue'])

In [21]:
df_events.head()

Unnamed: 0,user_id,event_name,timestamp,revenue
0,U00000,view_landing_page,2025-03-18 11:00:00,0.0
1,U00001,view_landing_page,2025-02-18 18:00:00,0.0
2,U00002,view_landing_page,2025-03-21 18:00:00,0.0
3,U00002,add_to_cart,2025-03-21 18:09:00,0.0
4,U00003,view_landing_page,2025-02-08 08:00:00,0.0


# **Saving Files as a CSV**

In [23]:
df_users.to_csv('greenbox_users.csv', index=False)
df_spend.to_csv('greenbox_spend.csv', index=False)
df_events.to_csv('greenbox_events.csv', index=False)

print(f"SUCCESS! \nGenerated {len(df_users)} users.")
print(f"Generated {len(df_spend)} days of ad spend.")
print(f"Generated {len(df_events)} behavioral events.")

SUCCESS! 
Generated 10000 users.
Generated 480 days of ad spend.
Generated 21219 behavioral events.
