In [0]:
# Import required libraries
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, DateType, TimestampType
import pyspark.sql.functions as F

print("Libraries imported successfully")

Libraries imported successfully


## Configuration
Set up Unity Catalog details and table configurations

In [0]:
# Unity Catalog destination. Every table in this notebook is written to
# <CATALOG_NAME>.<SCHEMA_NAME>.<table>.
CATALOG_NAME = "data_catalog"
SCHEMA_NAME = "outputs"

# Optional one-time setup, left disabled. Uncomment if the catalog/schema do
# not exist yet in the target workspace:
# spark.sql(f"CREATE CATALOG IF NOT EXISTS {CATALOG_NAME}")
# spark.sql(f"USE CATALOG {CATALOG_NAME}")
# spark.sql(f"CREATE SCHEMA IF NOT EXISTS {SCHEMA_NAME}")
# print(f"Using catalog: {CATALOG_NAME}.{SCHEMA_NAME}")

# Seed NumPy's global generator so the synthetic values are repeatable when
# the cells below are run in order on a fresh kernel.
np.random.seed(42)

# Row counts for each generated dataset (per version).
NUM_CUSTOMERS = 10_000
NUM_TRANSACTIONS = 50_000
NUM_USER_EVENTS = 30_000

## Helper Functions for Data Generation

In [0]:
def generate_customer_data(num_records, version, drift_level='none'):
    """
    Build a synthetic customer table, optionally shifted away from the baseline.

    Version 'v1' always draws from the baseline distributions and is stamped
    30 days before the current time. Any other version is stamped with the
    current time and draws from the profile selected by drift_level.

    Values are drawn from NumPy's global random state, so results are only
    repeatable if the caller seeds it beforehand.

    Args:
        num_records: Number of customer rows to produce
        version: Version label written to the data_version column (e.g., 'v1', 'v2')
        drift_level: 'high' or 'medium' select shifted distributions; any
            other value (including 'none' and 'low') keeps the baseline

    Returns:
        pandas DataFrame with columns customer_id, age, income, region,
        credit_score, account_type, account_balance, data_version and
        processed_timestamp
    """
    region_names = ['North', 'South', 'East', 'West']
    tier_names = ['Basic', 'Premium', 'Gold']

    # Distribution parameters per drift profile:
    #   age / credit -> (mean, std) of a normal distribution
    #   income       -> (mean, sigma) of a lognormal distribution
    #   *_p          -> category probabilities, aligned with the name lists above
    baseline = {
        'age': (45, 15),
        'income': (10.5, 0.8),
        'region_p': [0.3, 0.25, 0.25, 0.2],
        'credit': (680, 80),
        'tier_p': [0.6, 0.3, 0.1],
    }
    moderate = {
        'age': (42, 14),
        'income': (10.6, 0.85),
        'region_p': [0.25, 0.27, 0.23, 0.25],
        'credit': (695, 78),
        'tier_p': [0.5, 0.35, 0.15],
    }
    heavy = {
        'age': (38, 12),
        'income': (10.8, 0.9),
        'region_p': [0.2, 0.3, 0.15, 0.35],
        'credit': (710, 75),
        'tier_p': [0.4, 0.35, 0.25],
    }

    is_reference = version == 'v1'
    if not is_reference and drift_level == 'high':
        profile = heavy
    elif not is_reference and drift_level == 'medium':
        profile = moderate
    else:
        profile = baseline

    # The reference version is back-dated by 30 days.
    stamp = datetime.now() - timedelta(days=30 if is_reference else 0)

    ids = [f"CUST_{n:06d}" for n in range(1, num_records + 1)]

    # The draws below must stay in this order: they all consume the same
    # global random stream.
    age_mean, age_std = profile['age']
    ages = np.clip(
        np.random.normal(age_mean, age_std, num_records).astype(int), 18, 90
    )

    income_mean, income_sigma = profile['income']
    incomes = np.round(
        np.random.lognormal(income_mean, income_sigma, num_records), 2
    )

    regions = np.random.choice(region_names, num_records, p=profile['region_p'])

    credit_mean, credit_std = profile['credit']
    credit_scores = np.clip(
        np.random.normal(credit_mean, credit_std, num_records).astype(int),
        300,
        850,
    )

    account_types = np.random.choice(tier_names, num_records, p=profile['tier_p'])

    # Account balance uses the same distribution for every profile.
    balances = np.round(np.random.exponential(5000, num_records), 2)

    return pd.DataFrame({
        'customer_id': ids,
        'age': ages,
        'income': incomes,
        'region': regions,
        'credit_score': credit_scores,
        'account_type': account_types,
        'account_balance': balances,
        'data_version': version,
        'processed_timestamp': stamp
    })

print("Customer data generation function created")

Customer data generation function created


In [0]:
def generate_product_sales_data(num_records, version, drift_level='none'):
    """
    Generate product sales transaction data with optional drift.

    Version 'v1' always draws from the baseline distributions and is stamped
    30 days before the current time. Any other version is stamped with the
    current time and draws from the profile selected by drift_level.

    Values are drawn from NumPy's global random state, so results are only
    repeatable if the caller seeds it beforehand.

    Args:
        num_records: Number of transaction records to generate
        version: Version label written to the data_version column
        drift_level: 'high' shifts categories, prices, discount rate and
            payment mix; 'medium' shifts categories and prices only; any
            other value (including 'none' and 'low') keeps the baseline

    Returns:
        pandas DataFrame with columns transaction_id, product_category,
        price, quantity, revenue, discount_applied, payment_method,
        data_version and processed_timestamp. revenue is price * quantity
        rounded to 2 decimals; discount_applied does not affect it.
    """
    category_names = ['Electronics', 'Clothing', 'Home', 'Books', 'Sports']
    payment_names = ['Credit Card', 'Debit Card', 'PayPal', 'Cash']

    # Distribution parameters per drift profile:
    #   price       -> (shape, scale) of a gamma distribution
    #   discount_p  -> probabilities for [True, False]
    #   *_p         -> category probabilities aligned with the name lists above
    baseline = {
        'category_p': [0.25, 0.25, 0.2, 0.15, 0.15],
        'price': (5, 20),
        'discount_p': [0.3, 0.7],
        'payment_p': [0.4, 0.3, 0.2, 0.1],
    }
    # Medium drift leaves the discount rate and payment mix at baseline.
    moderate = {
        **baseline,
        'category_p': [0.28, 0.22, 0.22, 0.14, 0.14],
        'price': (5.5, 22),
    }
    heavy = {
        'category_p': [0.35, 0.15, 0.25, 0.1, 0.15],
        'price': (6, 25),
        'discount_p': [0.5, 0.5],
        'payment_p': [0.35, 0.25, 0.35, 0.05],
    }

    is_reference = version == 'v1'
    if not is_reference and drift_level == 'high':
        profile = heavy
    elif not is_reference and drift_level == 'medium':
        profile = moderate
    else:
        profile = baseline

    # The reference version is back-dated by 30 days.
    base_date = datetime.now() - timedelta(days=30 if is_reference else 0)

    transaction_ids = [f"TXN_{i:08d}" for i in range(1, num_records + 1)]

    # The draws below must stay in this order: they all consume the same
    # global random stream.
    categories = np.random.choice(category_names, num_records, p=profile['category_p'])

    price_shape, price_scale = profile['price']
    prices = np.round(np.random.gamma(price_shape, price_scale, num_records), 2)

    # At least one unit per transaction.
    quantities = np.random.poisson(2, num_records) + 1

    # Round the product: multiplying a 2-decimal price by an integer can leave
    # binary floating-point residue (e.g. 254.19000000000003) in a money column.
    revenues = np.round(prices * quantities, 2)

    discounts = np.random.choice([True, False], num_records, p=profile['discount_p'])
    payment_methods = np.random.choice(payment_names, num_records, p=profile['payment_p'])

    df = pd.DataFrame({
        'transaction_id': transaction_ids,
        'product_category': categories,
        'price': prices,
        'quantity': quantities,
        'revenue': revenues,
        'discount_applied': discounts,
        'payment_method': payment_methods,
        'data_version': version,
        'processed_timestamp': base_date
    })

    return df

print("Product sales data generation function created")

Product sales data generation function created


In [0]:
def generate_user_behavior_data(num_records, version, drift_level='none'):
    """
    Build a synthetic user-engagement table, optionally shifted away from the baseline.

    Version 'v1' always draws from the baseline distributions and is stamped
    30 days before the current time. Any other version is stamped with the
    current time and draws from the profile selected by drift_level.

    Values are drawn from NumPy's global random state, so results are only
    repeatable if the caller seeds it beforehand.

    Args:
        num_records: Number of user event rows to produce
        version: Version label written to the data_version column
        drift_level: 'high' shifts session length, pages viewed, device mix,
            conversion rate and traffic mix; 'medium' shifts session length,
            pages viewed and device mix only; any other value (including
            'none' and 'low') keeps the baseline

    Returns:
        pandas DataFrame with columns user_id, session_duration_minutes,
        pages_viewed, device_type, conversion, bounce_rate, traffic_source,
        data_version and processed_timestamp
    """
    device_names = ['Mobile', 'Desktop', 'Tablet']
    source_names = ['Organic', 'Paid', 'Social', 'Direct', 'Referral']

    # Distribution parameters per drift profile:
    #   session_scale -> scale of the exponential session duration (minutes)
    #   pages_lam     -> Poisson rate for pages viewed
    #   conversion_p  -> probabilities for [True, False]
    #   *_p           -> category probabilities aligned with the name lists above
    baseline = {
        'session_scale': 15,
        'pages_lam': 8,
        'device_p': [0.5, 0.35, 0.15],
        'conversion_p': [0.15, 0.85],
        'source_p': [0.3, 0.25, 0.2, 0.15, 0.1],
    }
    # Medium drift leaves the conversion rate and traffic mix at baseline.
    moderate = {
        **baseline,
        'session_scale': 18,
        'pages_lam': 10,
        'device_p': [0.57, 0.30, 0.13],
    }
    heavy = {
        'session_scale': 22,
        'pages_lam': 12,
        'device_p': [0.65, 0.25, 0.1],
        'conversion_p': [0.22, 0.78],
        'source_p': [0.25, 0.2, 0.35, 0.12, 0.08],
    }

    is_reference = version == 'v1'
    if not is_reference and drift_level == 'high':
        profile = heavy
    elif not is_reference and drift_level == 'medium':
        profile = moderate
    else:
        profile = baseline

    # The reference version is back-dated by 30 days.
    stamp = datetime.now() - timedelta(days=30 if is_reference else 0)

    ids = [f"USER_{n:07d}" for n in range(1, num_records + 1)]

    # The draws below must stay in this order: they all consume the same
    # global random stream.
    session_durations = np.round(
        np.random.exponential(profile['session_scale'], num_records), 2
    )
    pages_viewed = np.random.poisson(profile['pages_lam'], num_records)
    devices = np.random.choice(device_names, num_records, p=profile['device_p'])
    conversions = np.random.choice(
        [True, False], num_records, p=profile['conversion_p']
    )
    # Bounce rate uses the same distribution for every profile.
    bounce_rates = np.round(np.random.beta(2, 5, num_records), 3)
    traffic_sources = np.random.choice(
        source_names, num_records, p=profile['source_p']
    )

    return pd.DataFrame({
        'user_id': ids,
        'session_duration_minutes': session_durations,
        'pages_viewed': pages_viewed,
        'device_type': devices,
        'conversion': conversions,
        'bounce_rate': bounce_rates,
        'traffic_source': traffic_sources,
        'data_version': version,
        'processed_timestamp': stamp
    })

print("User behavior data generation function created")

User behavior data generation function created


## Generate and Save Data to Unity Catalog

Generate two versions of each table:
- Version 1 (v1): Reference/baseline data
- Version 2 (v2): Current data with introduced drift

In [0]:
# Generate Customer Data: v1 is the undrifted baseline, v2 uses the 'high' drift profile
print("Generating customer data...")
customer_v1 = generate_customer_data(NUM_CUSTOMERS, 'v1', drift_level='none')
customer_v2 = generate_customer_data(NUM_CUSTOMERS, 'v2', drift_level='high')

# Convert both versions to Spark DataFrames
customer_spark_df1 = spark.createDataFrame(customer_v1)
customer_spark_df2 = spark.createDataFrame(customer_v2)

# Both versions are written to the SAME table with mode("overwrite"), v1 first
# and v2 second, so the table's current contents are the v2 rows only.
# NOTE(review): presumably v1 is meant to be read back through the table's
# history (e.g. Delta time travel) — confirm. If both versions should be
# queryable via the data_version column, write the concatenation of
# customer_v1 and customer_v2 in a single overwrite instead.
table_name = f"{CATALOG_NAME}.{SCHEMA_NAME}.customer_data"
customer_spark_df1.write.mode("overwrite").saveAsTable(table_name)
customer_spark_df2.write.mode("overwrite").saveAsTable(table_name)

# The check mark was mojibake ("âœ“") in the original message.
print(f"✓ Customer data saved to {table_name}")
print(f"  - Version 1: {len(customer_v1)} records")
print(f"  - Version 2: {len(customer_v2)} records")

Generating customer data...
âœ“ Customer data saved to data_catalog.outputs.customer_data
  - Version 1: 10000 records
  - Version 2: 10000 records


In [0]:
# Generate Product Sales Data: v1 is the undrifted baseline, v2 uses the 'high' drift profile
print("Generating product sales data...")
sales_v1 = generate_product_sales_data(NUM_TRANSACTIONS, 'v1', drift_level='none')
sales_v2 = generate_product_sales_data(NUM_TRANSACTIONS, 'v2', drift_level='high')

# Convert both versions to Spark DataFrames
sales_spark_df1 = spark.createDataFrame(sales_v1)
sales_spark_df2 = spark.createDataFrame(sales_v2)

# Both versions are written to the SAME table with mode("overwrite"), v1 first
# and v2 second, so the table's current contents are the v2 rows only.
# NOTE(review): presumably v1 is meant to be read back through the table's
# history (e.g. Delta time travel) — confirm. If both versions should be
# queryable via the data_version column, write the concatenation of
# sales_v1 and sales_v2 in a single overwrite instead.
table_name = f"{CATALOG_NAME}.{SCHEMA_NAME}.product_sales"
sales_spark_df1.write.mode("overwrite").saveAsTable(table_name)
sales_spark_df2.write.mode("overwrite").saveAsTable(table_name)

# The check mark was mojibake ("âœ“") in the original message.
print(f"✓ Product sales data saved to {table_name}")
print(f"  - Version 1: {len(sales_v1)} records")
print(f"  - Version 2: {len(sales_v2)} records")

Generating product sales data...
âœ“ Product sales data saved to data_catalog.outputs.product_sales
  - Version 1: 50000 records
  - Version 2: 50000 records


In [0]:
# Generate User Behavior Data: v1 is the undrifted baseline, v2 uses the 'high' drift profile
print("Generating user behavior data...")
behavior_v1 = generate_user_behavior_data(NUM_USER_EVENTS, 'v1', drift_level='none')
behavior_v2 = generate_user_behavior_data(NUM_USER_EVENTS, 'v2', drift_level='high')

# Combine versions
# NOTE(review): behavior_combined is not used anywhere in this cell, and the
# customer and sales cells have the equivalent line commented out. Remove it
# if no later cell reads it.
behavior_combined = pd.concat([behavior_v1, behavior_v2], ignore_index=True)

# Convert both versions to Spark DataFrames
behavior_spark_df1 = spark.createDataFrame(behavior_v1)
behavior_spark_df2 = spark.createDataFrame(behavior_v2)

# Both versions are written to the SAME table with mode("overwrite"), v1 first
# and v2 second, so the table's current contents are the v2 rows only.
# NOTE(review): presumably v1 is meant to be read back through the table's
# history (e.g. Delta time travel) — confirm. If both versions should be
# queryable via the data_version column, write behavior_combined in a single
# overwrite instead.
table_name = f"{CATALOG_NAME}.{SCHEMA_NAME}.user_behavior"
behavior_spark_df1.write.mode("overwrite").saveAsTable(table_name)
behavior_spark_df2.write.mode("overwrite").saveAsTable(table_name)

# The check mark was mojibake ("âœ“") in the original message.
print(f"✓ User behavior data saved to {table_name}")
print(f"  - Version 1: {len(behavior_v1)} records")
print(f"  - Version 2: {len(behavior_v2)} records")

Generating user behavior data...
âœ“ User behavior data saved to data_catalog.outputs.user_behavior
  - Version 1: 30000 records
  - Version 2: 30000 records


## Verify Data Creation

In [0]:
# List the tables that now exist in the target schema
print("Verifying created tables...")
tables = spark.sql(f"SHOW TABLES IN {CATALOG_NAME}.{SCHEMA_NAME}").collect()

print("\nTables in catalog:")
for table in tables:
    print(f"  - {table.tableName}")

# Show five v2 rows from each generated table. The three previews differ only
# in heading and table name, so they share one loop.
sample_previews = (
    ("Sample Customer Data (v2):", "customer_data"),
    ("Sample Product Sales Data (v2):", "product_sales"),
    ("Sample User Behavior Data (v2):", "user_behavior"),
)
for preview_heading, preview_table in sample_previews:
    print("\n" + "="*80)
    print(preview_heading)
    print("="*80)
    # Explicit "\n" escapes keep the query text, including its trailing
    # spaces, identical to the triple-quoted original.
    preview_query = (
        f"\n    SELECT * FROM {CATALOG_NAME}.{SCHEMA_NAME}.{preview_table} \n"
        "    WHERE data_version = 'v2' \n"
        "    LIMIT 5\n"
    )
    spark.sql(preview_query).show()

Verifying created tables...

Tables in catalog:
  - customer_data
  - product_sales
  - user_behavior

Sample Customer Data (v2):
+-----------+---+---------+------+------------+------------+---------------+------------+--------------------+
|customer_id|age|   income|region|credit_score|account_type|account_balance|data_version| processed_timestamp|
+-----------+---+---------+------+------------+------------+---------------+------------+--------------------+
|CUST_001251| 36| 84731.33| North|         655|        Gold|        2766.51|          v2|2026-01-19 16:20:...|
|CUST_001252| 43|  15048.8| South|         682|       Basic|         951.49|          v2|2026-01-19 16:20:...|
|CUST_001253| 33|120593.68|  East|         634|     Premium|        4913.81|          v2|2026-01-19 16:20:...|
|CUST_001254| 18| 31117.31|  West|         718|       Basic|        1899.18|          v2|2026-01-19 16:20:...|
|CUST_001255| 35|132706.51| North|         557|     Premium|        2159.24|          v2|2026