<a href="https://colab.research.google.com/github/cbonnin88/TerraLoop/blob/main/TerraLoop.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import polars as pl
import numpy as np
from datetime import datetime, timedelta
import random

# **Configuration**

In [None]:
# Sizes of the synthetic datasets and the first calendar day of the simulation.
num_users = 500
num_events = 5000
start_date = datetime(2026,1,1)

# Seed the stdlib RNG so that Restart Kernel -> Run All regenerates the exact
# same datasets. Every random draw in the generation cells goes through the
# `random` module, so this single seed covers all of them.
SEED = 42
random.seed(SEED)

# **1. Generate USERS Dataset**

In [None]:
# Synthetic user dimension: one row per user, `num_users` rows in total.
users = pl.DataFrame({
    # Use the hyphenated 'TL-NNNN' format so these keys match the format used
    # for events.user_id and listings.owner_id; without the hyphen no event or
    # listing row can be joined back to a user.
    'user_id':[f'TL-{i:04d}' for i in range(num_users)],
    # Signup falls on a random day within 0-30 days after start_date.
    'signup_date':[start_date + timedelta(days=random.randint(0,30)) for _ in range(num_users)],
    # Tier is sampled with 60/30/10 weights, so 'Basic' dominates.
    'user_tier': random.choices(['Basic','Eco-Ally','Premium'], weights=[60,30,10], k=num_users),
    # Unweighted draw over four options, one of which is None.
    'region': random.choices(['North America','Europe','Asia',None], k=num_users) # Added Nulls for dbt testing
})

In [None]:
# Preview the first five generated user rows.
display(users.head(5))

user_id,signup_date,user_tier,region
str,datetime[μs],str,str
"""TL0000""",2026-01-07 00:00:00,"""Eco-Ally""","""Europe"""
"""TL0001""",2026-01-24 00:00:00,"""Eco-Ally""","""Asia"""
"""TL0002""",2026-01-17 00:00:00,"""Eco-Ally""","""Europe"""
"""TL0003""",2026-01-18 00:00:00,"""Basic""",
"""TL0004""",2026-01-07 00:00:00,"""Basic""","""North America"""


# **2. Generate EVENTS Dataset**

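- I am simulating a funnel: *session_start -> search_item -> start_listing -> complete_listing*. Each event's type is drawn independently with decreasing weights, so later funnel stages appear less often than earlier ones.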

In [None]:
# Funnel stages, ordered from entry to completion.
event_types = [
    'session_start',
    'search_item',
    'start_listing',
    'complete_listing',
]
# Accumulator for the generated event records (filled by the generation loop).
event_list = list()

In [None]:
# Start from an empty list every time this cell runs, so re-executing it does
# not append a second batch of rows on top of the previous run.
event_list = []

for _ in range(num_events):
  # Each event is attributed to a uniformly random user index.
  user_id = f'TL-{random.randint(0,num_users-1):04d}'
  # Logic: More people start than finish
  e_type = random.choices(event_types, weights=[40,30,20,10],k=1)[0]

  event_list.append({
      'user_id':user_id,
      'event_type':e_type,
      # Timestamp at hour resolution, 0-45 days after start_date.
      'event_time':start_date + timedelta(days=random.randint(0,45), hours=random.randint(0,23)),
      'platform': random.choice(['iOS','Android','Web','WEB']), # Inconsistent casing for cleaning practice
      # Drawn independently for every event.
      'session_id': random.randint(100000,999999),
      # Only search/start-listing events carry a category; all others are null.
      'item_category': random.choice(['Battery','Laptop','Phone','Small Appliance']) if e_type in ['search_item','start_listing'] else None
  })

# Build the frame once, after the loop. Constructing it inside the loop rebuilt
# the whole DataFrame on every iteration and left `events` undefined when
# num_events is 0.
events = pl.DataFrame(event_list)

In [None]:
# Preview the first five generated event rows.
display(events.head(5))

user_id,event_type,event_time,platform,session_id,item_category
str,str,datetime[μs],str,i64,str
"""TL-0366""","""complete_listing""",2026-01-27 19:00:00,"""Web""",144548,
"""TL-0138""","""search_item""",2026-01-05 20:00:00,"""WEB""",351153,"""Phone"""
"""TL-0055""","""session_start""",2026-02-07 11:00:00,"""WEB""",837524,
"""TL-0159""","""session_start""",2026-01-21 20:00:00,"""WEB""",115904,
"""TL-0321""","""start_listing""",2026-01-10 20:00:00,"""Android""",314802,"""Phone"""


# **3. Generate LISTINGS (The Business Entity)**

In [None]:
# Number of listing rows to generate; a single constant keeps every column
# below at the same length.
num_listings = 800

# Synthetic listings entity: one row per listing, owned by a random user.
listings = pl.DataFrame({
    'listing_id':[f'LST-{i:05d}' for i in range(num_listings)],
    # Owner keys use the same 'TL-NNNN' format as events.user_id.
    'owner_id':[f'TL-{random.randint(0,num_users-1):04d}' for _ in range(num_listings)],
    # Integer score drawn uniformly from 10 to 100 inclusive.
    'eco_score_impact': [random.randint(10,100) for _ in range(num_listings)],
    # Status is sampled with 20/70/10 weights, so most listings are 'completed'.
    'status': random.choices(['active','completed','cancelled'], weights=[20,70,10],k=num_listings),
    # Creation falls on a random day within 0-40 days after start_date.
    'created_at':[start_date + timedelta(days=random.randint(0,40)) for _ in range(num_listings)]
})

In [None]:
# Preview the first five generated listing rows.
display(listings.head(5))

listing_id,owner_id,eco_score_impact,status,created_at
str,str,i64,str,datetime[μs]
"""LST-00000""","""TL-0001""",45,"""completed""",2026-01-19 00:00:00
"""LST-00001""","""TL-0137""",77,"""completed""",2026-01-12 00:00:00
"""LST-00002""","""TL-0028""",91,"""active""",2026-01-31 00:00:00
"""LST-00003""","""TL-0378""",22,"""completed""",2026-02-07 00:00:00
"""LST-00004""","""TL-0062""",94,"""completed""",2026-01-31 00:00:00


# **Exporting to CSV**

In [None]:
# Write each generated frame to its CSV file in the working directory.
for frame, csv_path in (
    (users, 'tl_users.csv'),
    (events, 'tl_events.csv'),
    (listings, 'tl_listings.csv'),
):
    frame.write_csv(csv_path)

# Confirmation message listing the files that were written.
print('✅ TerraLoop datasets created: tl_users.csv, tl_events.csv, tl_listings.csv')

✅ TerraLoop datasets created: tl_users.csv, tl_events.csv, tl_listings.csv
