In [0]:
# Imports
import random
from datetime import datetime, timedelta, timezone

from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [0]:
# ============================================
# Parameters
# ============================================
dbutils.widgets.text("s3_base_path", "s3://databricks-storage-4052354327981619/raw", "S3 base path")
dbutils.widgets.text("min_transactions", "10000", "Minimum transactions per batch")
dbutils.widgets.text("max_transactions", "20000", "Maximum transactions per batch")
dbutils.widgets.text("batch_frequency_seconds", "300", "Batch frequency in seconds")
dbutils.widgets.text("transaction_id_start", "1000000", "Starting transaction_id")
dbutils.widgets.text("account_id_min", "1000000", "Minimum account_id")
dbutils.widgets.text("account_id_max", "10000000", "Maximum account_id")


def _get_int_widget(name):
    """Read widget `name` and convert it to int.

    Raises:
        ValueError: if the widget text is not an integer; the message names
            the offending widget so the run can be corrected quickly.
    """
    raw_value = dbutils.widgets.get(name)
    try:
        return int(raw_value)
    except ValueError as exc:
        raise ValueError(f"Widget '{name}' must be an integer, got {raw_value!r}") from exc


# Trailing slashes are stripped so that paths can be joined with "/" later on
base_path = dbutils.widgets.get("s3_base_path").rstrip("/")
min_transactions = _get_int_widget("min_transactions")
max_transactions = _get_int_widget("max_transactions")
batch_frequency_seconds = _get_int_widget("batch_frequency_seconds")
transaction_id_start = _get_int_widget("transaction_id_start")
account_id_min = _get_int_widget("account_id_min")
account_id_max = _get_int_widget("account_id_max")

# Fail fast with an actionable message instead of letting random.randint()
# or a later Spark expression fail on an inverted / negative range
if not 0 <= min_transactions <= max_transactions:
    raise ValueError(
        "Expected 0 <= min_transactions <= max_transactions, "
        f"got min_transactions={min_transactions}, max_transactions={max_transactions}"
    )
if account_id_min > account_id_max:
    raise ValueError(
        "Expected account_id_min <= account_id_max, "
        f"got account_id_min={account_id_min}, account_id_max={account_id_max}"
    )
if batch_frequency_seconds < 0:
    raise ValueError(
        f"Expected batch_frequency_seconds >= 0, got {batch_frequency_seconds}"
    )

# Random number of transactions for this batch (both bounds inclusive)
total_transactions = random.randint(min_transactions, max_transactions)

print(f"S3 Base Path: {base_path}")
print(f"Batch Frequency: {batch_frequency_seconds} seconds")
print(f"Transactions for this batch: {total_transactions}")
print(f"Account ID Range: {account_id_min} - {account_id_max}")

In [0]:
# ============================================
# Generate Timestamp for filename
# ============================================
# datetime.utcnow() returns a naive datetime (tzinfo is None), so passing its
# tzinfo to datetime.now() yields *local* time. Request UTC explicitly so the
# file name and export_id do not depend on the cluster's time zone.
now_utc = datetime.now(timezone.utc)
# Used in the output file name, e.g. 20240131-235959
timestamp = now_utc.strftime("%Y%m%d-%H%M%S")
# Same instant as a sortable integer, e.g. 20240131235959
export_id = int(now_utc.strftime("%Y%m%d%H%M%S"))

print(f"\nExport ID: {export_id}")
print(f"Timestamp: {timestamp}")

In [0]:
# ============================================
# Find Current Maximum Transaction ID
# ============================================
# Best-effort lookup of the highest transaction_id already exported, so this
# batch continues the sequence. Only the most recent export is inspected.
existing_transactions_path = f"{base_path}/transactions/"
# Default used when no previous export can be read
current_max_transaction_id = transaction_id_start - 1

try:
    # List files in the transactions directory
    files = dbutils.fs.ls(existing_transactions_path)

    # Keep parquet outputs, skipping entries whose name starts with "_"
    parquet_items = [f for f in files if '.parquet' in f.name and not f.name.startswith('_')]

    if len(parquet_items) > 0:
        print(f"\nüìÇ Found {len(parquet_items)} parquet file(s)")

        # Sort newest first. The name is a tie-breaker: export names embed a
        # sortable timestamp, so the choice stays well defined even when
        # several entries report the same modificationTime.
        # NOTE(review): directory entries on object storage may all report 0 - confirm.
        parquet_items_sorted = sorted(
            parquet_items,
            key=lambda x: (x.modificationTime, x.name),
            reverse=True,
        )
        latest_file = parquet_items_sorted[0]

        print(f"üìÑ Latest file: {latest_file.name}")

        # Read only the latest parquet file
        latest_file_path = latest_file.path
        df_existing_check = spark.read.parquet(latest_file_path)

        # Get count to verify data exists
        record_count = df_existing_check.count()
        print(f"üìä Records in latest file: {record_count}")

        if record_count > 0:
            max_id = df_existing_check.agg(F.max("transaction_id")).collect()[0][0]

            # max() is NULL when every transaction_id is NULL; keep the default then
            if max_id is not None:
                current_max_transaction_id = max_id
                print(f"üìä Current Maximum Transaction ID: {current_max_transaction_id}")

    else:
        print(f"\n‚ö†Ô∏è  No parquet files found in {existing_transactions_path}")
        print(f"Starting fresh with transaction_id from {transaction_id_start}")

except Exception as e:
    # Deliberately broad: a missing directory on the first run must not abort the batch.
    # NOTE(review): any other failure (permissions, transient storage errors) also
    # restarts numbering at transaction_id_start and may produce duplicate ids.
    print(f"\n‚ö†Ô∏è  Error accessing existing transactions: {str(e)}")
    print(f"Starting fresh with transaction_id from {transaction_id_start}")
    current_max_transaction_id = transaction_id_start - 1

# Calculate next transaction_id
next_transaction_id = current_max_transaction_id + 1
print(f"\nüÜï Transactions will start from transaction_id: {next_transaction_id}")

In [0]:
# ============================================
# UDF Definitions
# ============================================

# Merchant Category Distribution (with probabilities)
def generate_merchant_category():
    """Pick a merchant category at random using fixed weights.

    A uniform draw in [0, 100) is compared with cumulative upper bounds in
    order; the first bound strictly greater than the draw selects the
    category. Draws of 96 or more fall through to the "Other" category.

    Returns:
        str: the selected category label.
    """
    draw = random.random() * 100

    # (exclusive cumulative upper bound, category)
    # individual weights: 18, 16, 10, 8, 7, 12, 14, 5, 6 (remaining 4 -> "Other")
    cumulative_bounds = (
        (18, "Groceries & Supermarkets"),
        (34, "Restaurants & Fast Food"),
        (44, "Gas & Fuel"),
        (52, "Travel (Airline, Hotel, Car Rental)"),
        (59, "Healthcare & Pharmacies"),
        (71, "Retail ‚Äì Clothing & Department Stores"),
        (85, "Online Marketplaces (Amazon, Etsy, etc.)"),
        (90, "Entertainment (Movies, Events, Digital Media)"),
        (96, "Utilities & Bills (Electric, Telecom, Internet)"),
    )

    for upper_bound, category in cumulative_bounds:
        if draw < upper_bound:
            return category

    return "Other (Government, Insurance, Misc.)"

# The generator draws from `random`, so the UDF must be flagged as
# non-deterministic. Spark treats UDFs as deterministic by default and may
# then invoke them more or fewer times than they appear in the query. Since
# "merchant_category" is referenced repeatedly downstream, a re-evaluation
# could pair a row with values derived from a different category.
merchant_category_udf = F.udf(generate_merchant_category, StringType()).asNondeterministic()

# Transaction Status: every row samples its own approval probability from 96-100% (about 98% on average); otherwise the row is DECLINED
def generate_transaction_status():
    """Return "APPROVED" or "DECLINED" for a single transaction.

    Each call first samples an approval probability uniformly between 0.96
    and 1.0, then makes one uniform draw against it. Because the probability
    is re-sampled on every call, the chance of approval averages 98% per call.
    """
    approval_rate = random.uniform(0.96, 1.0)
    if random.random() < approval_rate:
        return "APPROVED"
    return "DECLINED"

# The status generator is random, so flag the UDF as non-deterministic.
# Spark assumes UDFs are deterministic by default and may otherwise
# eliminate or duplicate invocations during optimization.
transaction_status_udf = F.udf(generate_transaction_status, StringType()).asNondeterministic()

In [0]:
# ============================================
# Create Base DataFrame
# ============================================

# (category, number of ids in the band, first id of the band)
# The bands are contiguous: each one starts where the previous one ends.
_MERCHANT_ID_BANDS = [
    ("Groceries & Supermarkets", 18000, 10000000),
    ("Restaurants & Fast Food", 16000, 10018000),
    ("Gas & Fuel", 10000, 10034000),
    ("Travel (Airline, Hotel, Car Rental)", 8000, 10044000),
    ("Healthcare & Pharmacies", 7000, 10052000),
    ("Retail ‚Äì Clothing & Department Stores", 12000, 10059000),
    ("Online Marketplaces (Amazon, Etsy, etc.)", 14000, 10071000),
    ("Entertainment (Movies, Events, Digital Media)", 5000, 10085000),
    ("Utilities & Bills (Electric, Telecom, Internet)", 6000, 10090000),
]

# merchant_id: one WHEN branch per category, each with its own rand() draw
_merchant_id_expr = None
for _category, _id_count, _first_id in _MERCHANT_ID_BANDS:
    _is_category = F.col("merchant_category") == _category
    _random_id = (F.floor(F.rand() * _id_count) + _first_id).cast("long")
    if _merchant_id_expr is None:
        _merchant_id_expr = F.when(_is_category, _random_id)
    else:
        _merchant_id_expr = _merchant_id_expr.when(_is_category, _random_id)

# Every remaining category (i.e. "Other") uses the last band
_merchant_id_expr = _merchant_id_expr.otherwise(
    (F.floor(F.rand() * 4000) + 10096000).cast("long")
)

# Number of distinct account ids, both ends of the range included
_account_id_span = account_id_max - account_id_min + 1

df_transactions = (
    spark.range(0, total_transactions)
    # transaction_id: consecutive values starting at next_transaction_id
    .withColumn("transaction_id", F.lit(next_transaction_id) + F.col("id"))
    # account_id: uniform draw in [account_id_min, account_id_max]
    .withColumn("account_id", (F.floor(F.rand() * _account_id_span) + account_id_min).cast("long"))
    # merchant_category: weighted random pick
    .withColumn("merchant_category", merchant_category_udf())
    # merchant_id: random id from the band that belongs to the category
    .withColumn("merchant_id", _merchant_id_expr)
    .drop("id")
)

In [0]:
# ============================================
# Add Transaction Amount Based on Category
# ============================================

# Helper function to generate amount based on distribution
def calculate_amount(category):
    """Build a Spark Column that samples a transaction amount for `category`.

    Each category maps to an ordered list of tiers (probability, min_amt,
    max_amt). A tier is selected with its listed probability and the amount
    is then drawn uniformly from [min_amt, max_amt). Any probability mass not
    claimed by the earlier tiers goes to the last tier.

    Every tier test uses its own independent rand() draw, so the thresholds
    are conditional probabilities ("given that no earlier tier was chosen")
    rather than cumulative ones. Comparing independent draws against
    cumulative thresholds would skew the mix (e.g. 0.80/0.15/0.04/0.01 would
    be selected with 0.80/0.19/0.0099/0.0001).

    Args:
        category: merchant category label; unknown labels use the
            "Other (Government, Insurance, Misc.)" distribution.

    Returns:
        A Column expression; the result is not rounded.
    """
    # Distribution definitions per category: (probability, min_amt, max_amt)
    distributions = {
        "Groceries & Supermarkets": [(0.80, 10, 100), (0.15, 100, 250), (0.04, 250, 1000), (0.01, 1000, 2000)],
        "Restaurants & Fast Food": [(0.80, 10, 100), (0.15, 100, 250), (0.04, 250, 1000), (0.01, 1000, 2000)],
        "Gas & Fuel": [(0.80, 10, 100), (0.15, 100, 250), (0.04, 250, 1000), (0.01, 1000, 2000)],
        "Travel (Airline, Hotel, Car Rental)": [(0.60, 100, 300), (0.20, 300, 1500), (0.15, 1500, 3000), (0.05, 3000, 6000)],
        "Healthcare & Pharmacies": [(0.50, 10, 100), (0.20, 100, 250), (0.10, 250, 2000), (0.10, 2000, 10000)],
        "Retail ‚Äì Clothing & Department Stores": [(0.60, 100, 300), (0.20, 300, 1500), (0.15, 1500, 3000), (0.05, 3000, 6000)],
        "Online Marketplaces (Amazon, Etsy, etc.)": [(0.80, 10, 100), (0.15, 100, 250), (0.04, 250, 1000), (0.01, 1000, 10000)],
        "Entertainment (Movies, Events, Digital Media)": [(0.90, 10, 100), (0.05, 100, 250), (0.05, 250, 1000)],
        "Utilities & Bills (Electric, Telecom, Internet)": [(0.80, 10, 100), (0.15, 100, 250), (0.04, 250, 1000), (0.01, 1000, 2000)],
        "Other (Government, Insurance, Misc.)": [(0.80, 10, 100), (0.15, 100, 250), (0.04, 250, 1000), (0.01, 1000, 2000)]
    }

    dist = distributions.get(category, distributions["Other (Government, Insurance, Misc.)"])

    def uniform_amount(min_amt, max_amt):
        # Uniform draw in [min_amt, max_amt)
        return F.rand() * (max_amt - min_amt) + min_amt

    amount_expr = None
    remaining_prob = 1.0  # probability mass not yet claimed by an earlier tier

    # The last tier is the fallback, so only the tiers before it need a test
    for prob, min_amt, max_amt in dist[:-1]:
        if remaining_prob <= 0:
            break  # earlier tiers already cover everything
        conditional_prob = prob / remaining_prob
        tier_expr = uniform_amount(min_amt, max_amt)

        if amount_expr is None:
            amount_expr = F.when(F.rand() < conditional_prob, tier_expr)
        else:
            amount_expr = amount_expr.when(F.rand() < conditional_prob, tier_expr)

        remaining_prob -= prob

    _, last_min, last_max = dist[-1]
    fallback_expr = uniform_amount(last_min, last_max)

    # A single-tier distribution needs no CASE expression at all
    if amount_expr is None:
        return fallback_expr
    return amount_expr.otherwise(fallback_expr)

# Apply amount calculation for each category
# Tier tables: (upper bound on rand_for_amount, width, floor).
# A row whose rand_for_amount is below the bound gets rand() * width + floor.
# The final entry has no bound (None) and is the fallback tier.
_STANDARD_TIERS = ((0.80, 90, 10), (0.95, 150, 100), (0.99, 750, 250), (None, 1000, 1000))
_HIGH_TICKET_TIERS = ((0.60, 200, 100), (0.80, 1200, 300), (0.95, 1500, 1500), (None, 3000, 3000))
_HEALTHCARE_TIERS = ((0.50, 90, 10), (0.70, 150, 100), (0.80, 1750, 250), (None, 8000, 2000))
_ONLINE_TIERS = ((0.80, 90, 10), (0.95, 150, 100), (0.99, 750, 250), (None, 9000, 1000))
_ENTERTAINMENT_TIERS = ((0.90, 90, 10), (0.95, 150, 100), (None, 750, 250))

_TIERS_BY_CATEGORY = [
    ("Groceries & Supermarkets", _STANDARD_TIERS),
    ("Restaurants & Fast Food", _STANDARD_TIERS),
    ("Gas & Fuel", _STANDARD_TIERS),
    ("Travel (Airline, Hotel, Car Rental)", _HIGH_TICKET_TIERS),
    ("Healthcare & Pharmacies", _HEALTHCARE_TIERS),
    ("Retail ‚Äì Clothing & Department Stores", _HIGH_TICKET_TIERS),
    ("Online Marketplaces (Amazon, Etsy, etc.)", _ONLINE_TIERS),
    ("Entertainment (Movies, Events, Digital Media)", _ENTERTAINMENT_TIERS),
    ("Utilities & Bills (Electric, Telecom, Internet)", _STANDARD_TIERS),
]


def _tiered_amount(tiers):
    """Build the amount expression for one tier table.

    The tier is chosen by the shared "rand_for_amount" column; the amount
    inside the tier comes from a separate rand() per branch.
    """
    *bounded_tiers, (_, fallback_width, fallback_floor) = tiers
    expr = None
    for upper_bound, width, floor in bounded_tiers:
        in_tier = F.col("rand_for_amount") < upper_bound
        amount = F.rand() * width + floor
        expr = F.when(in_tier, amount) if expr is None else expr.when(in_tier, amount)
    return expr.otherwise(F.rand() * fallback_width + fallback_floor)


# One outer WHEN branch per category; each branch gets its own tier expression
_amount_by_category = None
for _category, _tiers in _TIERS_BY_CATEGORY:
    _is_category = F.col("merchant_category") == _category
    _branch = _tiered_amount(_tiers)
    if _amount_by_category is None:
        _amount_by_category = F.when(_is_category, _branch)
    else:
        _amount_by_category = _amount_by_category.when(_is_category, _branch)

# Other (Government, Insurance, Misc.)
_amount_by_category = _amount_by_category.otherwise(_tiered_amount(_STANDARD_TIERS))

# Apply amount calculation for each category
df_transactions = (
    df_transactions
    # One draw per row, materialized so every tier test sees the same value
    .withColumn("rand_for_amount", F.rand())
    .withColumn("transaction_amount", _amount_by_category)
    .drop("rand_for_amount")
    .withColumn("transaction_amount", F.round(F.col("transaction_amount"), 2))
)

In [0]:
# ============================================
# Add Transaction Timestamp (within batch window)
# ============================================

# Time window is already in seconds
# The batch frequency is already expressed in seconds, so it is used as-is
time_window_seconds = batch_frequency_seconds

# Whole number of seconds to subtract, drawn from [0, time_window_seconds)
_seconds_ago = (F.rand() * time_window_seconds).cast("long")

# make_interval(years, months, weeks, days, hours, mins, secs): only secs is set
_transaction_ts = F.expr("current_timestamp() - make_interval(0, 0, 0, 0, 0, 0, seconds_ago)")

df_transactions = df_transactions.withColumn("seconds_ago", _seconds_ago)
df_transactions = df_transactions.withColumn("transaction_timestamp", _transaction_ts)
# The helper column is only needed to compute the timestamp
df_transactions = df_transactions.drop("seconds_ago")

In [0]:
# ============================================
# Add Transaction Status and Export Metadata
# ============================================

# Random APPROVED / DECLINED flag per row
df_transactions = df_transactions.withColumn("transaction_status", transaction_status_udf())

# Batch-level metadata: the same export_id on every row of this run
df_transactions = df_transactions.withColumn("export_id", F.lit(export_id))
df_transactions = df_transactions.withColumn("export_ts", F.current_timestamp())

# export_date is derived from export_ts, so it has to be added after it
df_transactions = df_transactions.withColumn("export_date", F.to_date("export_ts"))


In [0]:
# ============================================
# Reorder Columns
# ============================================

# Final column order of the exported dataset
_output_columns = [
    "transaction_id",
    "account_id",
    "merchant_id",
    "merchant_category",
    "transaction_amount",
    "transaction_timestamp",
    "transaction_status",
    "export_id",
    "export_ts",
    "export_date",
]

df_transactions = df_transactions.select(*_output_columns)

In [0]:
# ============================================
# Write to S3
# ============================================

# One parquet output per run; the run's timestamp is part of the name
output_path = "{}/transactions/transactions_{}.parquet".format(base_path, timestamp)

print(f"\nüíæ Writing to: {output_path}")

# "overwrite" replaces anything already present at output_path
_parquet_writer = df_transactions.write.mode("overwrite")
_parquet_writer.parquet(output_path)

print(f"‚úÖ Successfully written {total_transactions} transactions to {output_path}")

In [0]:
# ============================================
# Verify and Show Statistics
# ============================================

print("\nüîç Verifying written data...")
# Read the batch back from storage so the statistics reflect what was stored
df_verify = spark.read.parquet(output_path)

verified_count = df_verify.count()
print(f"‚úÖ Verified: {verified_count} records written")

print("\nüìä Transaction ID Range:")
id_range = df_verify.agg(
    F.min("transaction_id").alias("min_transaction_id"),
    F.max("transaction_id").alias("max_transaction_id"),
)
id_range.show()

print("\nüìä Merchant Category Distribution:")
category_counts = df_verify.groupBy("merchant_category").count()
category_counts.orderBy(F.desc("count")).show(truncate=False)

print("\nüìä Transaction Status Distribution:")
status_counts = df_verify.groupBy("transaction_status").count()
status_counts.show()

print("\nüìä Transaction Amount Statistics by Category:")
amount_stats = df_verify.groupBy("merchant_category").agg(
    F.count("*").alias("count"),
    F.round(F.min("transaction_amount"), 2).alias("min_amount"),
    F.round(F.avg("transaction_amount"), 2).alias("avg_amount"),
    F.round(F.max("transaction_amount"), 2).alias("max_amount"),
)
amount_stats.orderBy(F.desc("count")).show(truncate=False)

print("\nüìã Sample Transactions:")
# Random ordering, so the ten rows shown differ from run to run
df_verify.orderBy(F.rand()).show(10, truncate=False)

summary_rule = "=" * 80
print("\n" + summary_rule)
print("Summary:")
print(f"  Next batch will start from transaction_id: {next_transaction_id + total_transactions}")
print(f"  Run this notebook again in {batch_frequency_seconds} seconds for next batch")
print(summary_rule)