<a href="https://colab.research.google.com/github/cbonnin88/RescueBites/blob/main/data_creation_rescuebites.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install faker

Collecting faker
  Downloading faker-39.0.0-py3-none-any.whl.metadata (16 kB)
Downloading faker-39.0.0-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-39.0.0


In [None]:
import polars as pl
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta

In [None]:
# Seed every random source used in this notebook so the generated data is repeatable.
# Note: dates are still drawn relative to "today", so output varies with the run date.
fake = Faker()
Faker.seed(42)
np.random.seed(42)
# The stdlib `random` module is used below (random.choice / random.randint) and
# keeps its own state, separate from NumPy's; it has to be seeded too.
random.seed(42)

1. Generate Stores

In [None]:
# Build the store dimension table: one row per store with a synthetic name,
# a category, a city and a commission rate drawn uniformly from [0.15, 0.25).
n_stores = 200
categories = ['Supermarket','Bakery','Cafe','Buffet','Restaurant']
# Kept outside the f-string: reusing the enclosing quote character inside an
# f-string replacement field is a SyntaxError on Python versions before 3.12.
name_suffixes = ['Eats','Foods','Market']
cities = ['Paris','London','Dublin','Copenhagen']
stores_data = {
    'store_id': range(1, n_stores + 1),
    'store_name': [f'{fake.company()} {random.choice(name_suffixes)}' for _ in range(n_stores)],
    'category': [random.choice(categories) for _ in range(n_stores)],
    'city': [random.choice(cities) for _ in range(n_stores)],
    'commission_rate': np.random.uniform(0.15, 0.25, n_stores)
}

df_stores = pl.DataFrame(stores_data)

2. Generate Users

In [None]:
# Build the user dimension table: sequential ids, a join date drawn by Faker
# from the window '-1y' .. 'today', and a randomly chosen device platform.
n_users = 5000
users_data = dict(
    user_id=range(1, n_users + 1),
    join_date=[
        fake.date_between(start_date='-1y', end_date='today')
        for _ in range(n_users)
    ],
    device=[
        random.choice(['iOS','Android'])
        for _ in range(n_users)
    ],
)

df_users = pl.DataFrame(users_data)

3. Generate Orders (Transactions)

In [None]:
# Build the orders fact table. Each order is assigned to a user sampled with
# Pareto-distributed weights (normalised to sum to 1), so order volume is
# spread unevenly across users.
n_orders = 25000
user_weights = np.random.pareto(a=1.16, size=n_users)
user_weights /= user_weights.sum()

order_users = np.random.choice(df_users['user_id'], size=n_orders, p=user_weights)

user_join_map = dict(zip(df_users['user_id'], df_users['join_date']))

# Resolve "today" once so every order is measured against the same reference
# date, even if the loop happens to run across midnight.
today = datetime.today().date()

order_dates = []
for uid in order_users:
  join_dt = user_join_map[uid]
  # Random date between join date and today (inclusive). Clamp at 0 rather than
  # 1: a user who joined today must get an order dated today, never tomorrow.
  days_since = max((today - join_dt).days, 0)
  random_days = random.randint(0, days_since)
  order_dates.append(join_dt + timedelta(days=random_days))

orders_data = {
    'order_id': range(1, n_orders + 1),
    'user_id': order_users,
    'store_id': np.random.randint(1, n_stores + 1, size=n_orders),
    'order_date': order_dates,
    # Amounts are normally distributed around 5.50 and rounded to 2 decimals.
    'order_amount': np.round(np.random.normal(5.50, 1.50, size=n_orders), 2),
    'status': np.random.choice(['Completed','Cancelled','No_Show'], size=n_orders, p=[0.85,0.10,0.05]),
    # None stands for "no rating given" (10% of orders).
    'rating': np.random.choice([1,2,3,4,5,None], size=n_orders, p=[0.05,0.05,0.10,0.30,0.40,0.10])
}

df_orders = pl.DataFrame(orders_data)

4. Save to CSV

In [None]:
def _rating_to_int(value):
    """Return ``int(value)``, passing ``None`` through unchanged."""
    return None if value is None else int(value)


# The two dimension tables are written as-is.
df_stores.write_csv('stores.csv')
df_users.write_csv('users.csv')

# Convert the rating column to Int64 (missing ratings stay null) before export.
rating_as_int = pl.col('rating').map_elements(_rating_to_int, return_dtype=pl.Int64)
df_orders = df_orders.with_columns(rating_as_int)
df_orders.write_csv('orders.csv')

print('Datasets Created')

Datasets Created
