In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
from faker import Faker
import sqlite3
import json

In [3]:


# Initialize Faker
fake = Faker()
np.random.seed(42)
random.seed(42)

class EcommerceDataGenerator:
    def __init__(self, num_users=5000, num_days=365, start_date='2023-01-01'):
        self.num_users = num_users
        self.num_days = num_days
        self.start_date = datetime.strptime(start_date, '%Y-%m-%d')
        self.end_date = self.start_date + timedelta(days=num_days)
        
        # Product categories and items
        self.categories = ['Electronics', 'Clothing', 'Home & Garden', 'Books', 'Sports', 'Beauty', 'Toys']
        self.products = self.generate_products()
        
        # User segments (for realistic behavior patterns)
        self.user_segments = ['High Value', 'Regular', 'Occasional', 'New']
        
    def generate_products(self):
        """Generate product catalog"""
        products = []
        product_names = {
            'Electronics': ['Smartphone', 'Laptop', 'Headphones', 'Tablet', 'Smartwatch', 'Camera'],
            'Clothing': ['T-shirt', 'Jeans', 'Dress', 'Jacket', 'Shoes', 'Sweater'],
            'Home & Garden': ['Coffee Maker', 'Vacuum', 'Plant Pot', 'Lamp', 'Curtains', 'Pillow'],
            'Books': ['Fiction Novel', 'Cookbook', 'Biography', 'Self-help', 'Textbook', 'Comics'],
            'Sports': ['Running Shoes', 'Yoga Mat', 'Dumbbell', 'Bicycle', 'Football', 'Water Bottle'],
            'Beauty': ['Skincare Set', 'Makeup Kit', 'Perfume', 'Shampoo', 'Moisturizer', 'Lipstick'],
            'Toys': ['Board Game', 'Action Figure', 'Puzzle', 'Building Blocks', 'Doll', 'Remote Car']
        }
        
        for category in self.categories:
            for product_name in product_names[category]:
                price = round(random.uniform(10, 500), 2)
                products.append({
                    'product_id': f"P{len(products)+1:04d}",
                    'product_name': product_name,
                    'category': category,
                    'price': price
                })
        
        return products
    
    def generate_users(self):
        """Generate user base with segments"""
        users = []
        
        for i in range(self.num_users):
            segment = np.random.choice(self.user_segments, p=[0.1, 0.5, 0.3, 0.1])
            
            # Registration date (some users joined recently, others are old)
            if segment == 'New':
                reg_date = self.start_date + timedelta(days=random.randint(300, 365))
            else:
                reg_date = self.start_date + timedelta(days=random.randint(0, 200))
                
            users.append({
                'user_id': f"U{i+1:06d}",
                'email': fake.email(),
                'registration_date': reg_date,
                'age': random.randint(18, 65),
                'gender': random.choice(['M', 'F']),
                'location': fake.city(),
                'segment': segment
            })
        
        return users
    
    def generate_user_events(self, users):
        """Generate realistic user behavior events"""
        events = []
        
        for user in users:
            user_id = user['user_id']
            segment = user['segment']
            reg_date = user['registration_date']
            
            # Different behavior patterns by segment
            if segment == 'High Value':
                sessions_per_month = random.randint(8, 20)
                purchase_probability = 0.3
                avg_session_length = random.randint(5, 15)
            elif segment == 'Regular':
                sessions_per_month = random.randint(3, 10)
                purchase_probability = 0.15
                avg_session_length = random.randint(3, 8)
            elif segment == 'Occasional':
                sessions_per_month = random.randint(1, 4)
                purchase_probability = 0.08
                avg_session_length = random.randint(2, 5)
            else:  # New
                sessions_per_month = random.randint(1, 3)
                purchase_probability = 0.05
                avg_session_length = random.randint(1, 4)
            
            # Generate sessions from registration date to end date
            days_active = (self.end_date - reg_date).days
            total_sessions = int((days_active / 30) * sessions_per_month)
            
            for session_num in range(total_sessions):
                # Random session date
                session_date = reg_date + timedelta(days=random.randint(0, days_active))
                session_id = f"S{len(events)+1:08d}"
                
                # Session start
                session_start = session_date + timedelta(
                    hours=random.randint(8, 22),
                    minutes=random.randint(0, 59),
                    seconds=random.randint(0, 59)
                )
                
                events.append({
                    'event_id': f"E{len(events)+1:08d}",
                    'user_id': user_id,
                    'session_id': session_id,
                    'event_type': 'session_start',
                    'timestamp': session_start,
                    'product_id': None,
                    'category': None,
                    'price': None,
                    'quantity': None
                })
                
                # Generate events within session
                current_time = session_start
                session_products = []
                
                # Browse products
                num_products_viewed = random.randint(1, avg_session_length)
                for _ in range(num_products_viewed):
                    product = random.choice(self.products)
                    session_products.append(product)
                    
                    current_time += timedelta(minutes=random.randint(1, 5))
                    
                    events.append({
                        'event_id': f"E{len(events)+1:08d}",
                        'user_id': user_id,
                        'session_id': session_id,
                        'event_type': 'product_view',
                        'timestamp': current_time,
                        'product_id': product['product_id'],
                        'category': product['category'],
                        'price': product['price'],
                        'quantity': None
                    })
                
                # Add to cart (subset of viewed products)
                cart_products = random.sample(session_products, 
                                            min(len(session_products), random.randint(1, 3)))
                
                for product in cart_products:
                    current_time += timedelta(minutes=random.randint(1, 3))
                    
                    events.append({
                        'event_id': f"E{len(events)+1:08d}",
                        'user_id': user_id,
                        'session_id': session_id,
                        'event_type': 'add_to_cart',
                        'timestamp': current_time,
                        'product_id': product['product_id'],
                        'category': product['category'],
                        'price': product['price'],
                        'quantity': random.randint(1, 3)
                    })
                
                # Purchase decision
                if random.random() < purchase_probability and cart_products:
                    # Purchase subset of cart items
                    purchase_products = random.sample(cart_products, 
                                                    random.randint(1, len(cart_products)))
                    
                    for product in purchase_products:
                        current_time += timedelta(minutes=random.randint(1, 2))
                        quantity = random.randint(1, 2)
                        
                        events.append({
                            'event_id': f"E{len(events)+1:08d}",
                            'user_id': user_id,
                            'session_id': session_id,
                            'event_type': 'purchase',
                            'timestamp': current_time,
                            'product_id': product['product_id'],
                            'category': product['category'],
                            'price': product['price'],
                            'quantity': quantity
                        })
                
                # Session end
                current_time += timedelta(minutes=random.randint(1, 5))
                events.append({
                    'event_id': f"E{len(events)+1:08d}",
                    'user_id': user_id,
                    'session_id': session_id,
                    'event_type': 'session_end',
                    'timestamp': current_time,
                    'product_id': None,
                    'category': None,
                    'price': None,
                    'quantity': None
                })
        
        return events
    
    def create_database(self, db_name='ecommerce_data.db'):
        """Create SQLite database with generated data"""
        conn = sqlite3.connect(db_name)
        
        # Generate data
        print("Generating users...")
        users = self.generate_users()
        
        print("Generating user events...")
        events = self.generate_user_events(users)
        
        print("Creating database tables...")
        
        # Create tables
        users_df = pd.DataFrame(users)
        products_df = pd.DataFrame(self.products)
        events_df = pd.DataFrame(events)
        
        # Save to database
        users_df.to_sql('users', conn, if_exists='replace', index=False)
        products_df.to_sql('products', conn, if_exists='replace', index=False)
        events_df.to_sql('events', conn, if_exists='replace', index=False)
        
        conn.close()
        
        print(f"Database created successfully!")
        print(f"Users: {len(users)}")
        print(f"Products: {len(self.products)}")
        print(f"Events: {len(events)}")
        
        return users_df, products_df, events_df
    
    def generate_summary_stats(self, events_df):
        """Generate summary statistics for data validation"""
        print("\n=== DATA SUMMARY ===")
        print(f"Total Events: {len(events_df):,}")
        print(f"Date Range: {events_df['timestamp'].min()} to {events_df['timestamp'].max()}")
        print(f"Unique Users: {events_df['user_id'].nunique():,}")
        print(f"Unique Sessions: {events_df['session_id'].nunique():,}")
        
        print("\nEvent Type Distribution:")
        event_counts = events_df['event_type'].value_counts()
        for event_type, count in event_counts.items():
            print(f"  {event_type}: {count:,}")
        
        print("\nCategory Distribution (Product Views):")
        category_views = events_df[events_df['event_type'] == 'product_view']['category'].value_counts()
        for category, count in category_views.items():
            print(f"  {category}: {count:,}")
        
        # Conversion funnel
        total_sessions = events_df[events_df['event_type'] == 'session_start']['session_id'].nunique()
        sessions_with_views = events_df[events_df['event_type'] == 'product_view']['session_id'].nunique()
        sessions_with_cart = events_df[events_df['event_type'] == 'add_to_cart']['session_id'].nunique()
        sessions_with_purchase = events_df[events_df['event_type'] == 'purchase']['session_id'].nunique()
        
        print(f"\nConversion Funnel:")
        print(f"  Sessions: {total_sessions:,}")
        print(f"  With Product Views: {sessions_with_views:,} ({sessions_with_views/total_sessions*100:.1f}%)")
        print(f"  With Cart Adds: {sessions_with_cart:,} ({sessions_with_cart/total_sessions*100:.1f}%)")
        print(f"  With Purchases: {sessions_with_purchase:,} ({sessions_with_purchase/total_sessions*100:.1f}%)")



In [5]:
# Usage example
if __name__ == "__main__":
    # Generate data
    generator = EcommerceDataGenerator(num_users=5000, num_days=365)
    users_df, products_df, events_df = generator.create_database()

Generating users...
Generating user events...
Creating database tables...
Database created successfully!
Users: 5000
Products: 42
Events: 1848518


In [9]:
# Show summary statistics
generator.generate_summary_stats(events_df)


=== DATA SUMMARY ===
Total Events: 1,848,518
Date Range: 2023-01-01 09:25:40 to 2024-01-01 23:39:24
Unique Users: 4,879
Unique Sessions: 239,908

Event Type Distribution:
  product_view: 891,826
  add_to_cart: 416,593
  session_start: 239,908
  session_end: 239,908
  purchase: 60,283

Category Distribution (Product Views):
  Toys: 127,795
  Electronics: 127,451
  Beauty: 127,399
  Home & Garden: 127,386
  Sports: 127,284
  Clothing: 127,275
  Books: 127,236

Conversion Funnel:
  Sessions: 239,908
  With Product Views: 239,908 (100.0%)
  With Cart Adds: 239,908 (100.0%)
  With Purchases: 43,570 (18.2%)


In [13]:
 # Save as CSV files too (optional)
users_df.to_csv('users.csv', index=False)
products_df.to_csv('products.csv', index=False)
events_df.to_csv('events.csv', index=False)
    
print("\nFiles created:")
print("- ecommerce_data.db (SQLite database)")
print("- users.csv")
print("- products.csv") 
print("- events.csv")
    
print("\nNext steps:")
print("1. Explore the data with SQL queries")
print("2. Start building your cohort analysis")
print("3. Create basic Streamlit app structure")


Files created:
- ecommerce_data.db (SQLite database)
- users.csv
- products.csv
- events.csv

Next steps:
1. Explore the data with SQL queries
2. Start building your cohort analysis
3. Create basic Streamlit app structure


In [15]:
events_df

Unnamed: 0,event_id,user_id,session_id,event_type,timestamp,product_id,category,price,quantity
0,E00000001,U000001,S00000001,session_start,2023-12-19 12:00:39,,,,
1,E00000002,U000001,S00000001,product_view,2023-12-19 12:01:39,P0002,Electronics,22.26,
2,E00000003,U000001,S00000001,product_view,2023-12-19 12:04:39,P0015,Home & Garden,328.44,
3,E00000004,U000001,S00000001,product_view,2023-12-19 12:06:39,P0028,Sports,57.39,
4,E00000005,U000001,S00000001,product_view,2023-12-19 12:08:39,P0017,Home & Garden,118.02,
...,...,...,...,...,...,...,...,...,...
1848513,E01848514,U005000,S01848513,product_view,2023-08-30 12:21:32,P0025,Sports,479.03,
1848514,E01848515,U005000,S01848513,product_view,2023-08-30 12:25:32,P0010,Clothing,24.60,
1848515,E01848516,U005000,S01848513,add_to_cart,2023-08-30 12:26:32,P0010,Clothing,24.60,1.0
1848516,E01848517,U005000,S01848513,add_to_cart,2023-08-30 12:27:32,P0025,Sports,479.03,1.0
