In [0]:
# Imports
from pyspark.sql import functions as F
from pyspark.sql.types import *
from datetime import datetime
import random

In [0]:
# ============================================
# Parameters
# ============================================
# Notebook inputs are exposed as Databricks widgets so a job can override them.
# dbutils.widgets.text("s3_base_path", "s3a://databricks-storage-4052354327981619", "S3 base path")
dbutils.widgets.text("total_merchants", "100000", "Total number of merchants")
dbutils.widgets.text("schema_name", "silver", "schema name")
dbutils.widgets.text("merchants_table", "merchants", "Merchants table name")
# dbutils.widgets.text("partition_date", datetime.now().strftime("%Y-%m-%d"), "Partition Date (YYYY-MM-DD) - Business date for partitioning")
dbutils.widgets.text("partition_date", "2025-11-27", "Partition Date (YYYY-MM-DD) - Business date for partitioning")

#s3_base_path = dbutils.widgets.get("s3_base_path").rstrip("/")
total_merchants = int(dbutils.widgets.get("total_merchants"))
if total_merchants <= 0:
    # A non-positive total would generate no rows and the write step would
    # then replace the target partition with an empty result.
    raise ValueError(f"total_merchants must be a positive integer, got {total_merchants}")

schema_name = dbutils.widgets.get("schema_name")
merchants_table = dbutils.widgets.get("merchants_table")

# partition_date is later cast to a DATE column and interpolated into the
# replaceWhere predicate of the write step, so reject anything that is not a
# real calendar date and normalise it to zero-padded YYYY-MM-DD.
_raw_partition_date = dbutils.widgets.get("partition_date").strip()
try:
    partition_date = datetime.strptime(_raw_partition_date, "%Y-%m-%d").date().isoformat()
except ValueError as exc:
    raise ValueError(
        f"partition_date must be a valid date in YYYY-MM-DD format, got {_raw_partition_date!r}"
    ) from exc

# Generate ETL date and time (when the job actually runs)
# NOTE(review): datetime.now() is timezone-naive (local clock of the driver) —
# confirm whether UTC is expected for etl_date / etl_time.
etl_datetime = datetime.now()
etl_date = etl_datetime.strftime("%Y-%m-%d")
etl_time = etl_datetime.strftime("%H:%M:%S")

# print(f"S3 Base Path: {s3_base_path}")
print(f"Total Merchants: {total_merchants}")
print(f"Target: {schema_name}.{merchants_table}")
print(f"Partition Date (Business Date): {partition_date}")
print(f"ETL Date (Job Run Date): {etl_date}")
print(f"ETL Time: {etl_time}")

In [0]:
# ============================================
# Merchant Category Ranges and Distribution
# ============================================

# Maps each merchant category to a tuple of
#   (first merchant_id, last merchant_id, share of total_merchants).
# The ID range is inclusive on both ends (capacity = last - first + 1) and the
# ranges are contiguous, covering 10000000-10099999. The shares add up to 1.00.
#
# NOTE: range capacity and share do not line up for every category. At the
# default widget value of 100000 merchants:
#   - "Healthcare & Pharmacies" holds 6,900 IDs but 7% requests 7,000
#   - "Online Marketplaces (Amazon, Etsy, etc.)" holds 12,500 IDs but 14% requests 14,000
# The generation loop caps those two categories at their capacity, so the
# default run produces 98,400 merchants rather than 100,000.
merchant_ranges = {
    "Groceries & Supermarkets": (10000000, 10017999, 0.18),
    "Restaurants & Fast Food": (10018000, 10033999, 0.16),
    "Gas & Fuel": (10034000, 10043999, 0.10),
    "Travel (Airline, Hotel, Car Rental)": (10044000, 10051999, 0.08),
    "Healthcare & Pharmacies": (10052000, 10058899, 0.07),
    "Retail ‚Äì Clothing & Department Stores": (10058900, 10070999, 0.12),
    "Online Marketplaces (Amazon, Etsy, etc.)": (10071000, 10083499, 0.14),
    "Entertainment (Movies, Events, Digital Media)": (10083500, 10088499, 0.05),
    "Utilities & Bills (Electric, Telecom, Internet)": (10088500, 10094499, 0.06),
    "Other (Government, Insurance, Misc.)": (10094500, 10099999, 0.04)
}


In [0]:
# ============================================
# Merchant Name Templates
# ============================================

# Maps each merchant category to a list of str.format templates for merchant
# names. Every template uses the {city} and/or {adjective} placeholder, which
# generate_merchant_name fills from the `cities` and `adjectives` lists.
# The keys are the same category strings used in merchant_ranges; a category
# missing here falls back to the "Other (Government, Insurance, Misc.)" list.
# The order of each list matters: names are picked with a seeded random.choice,
# so reordering or inserting entries changes the name generated for a given ID.
merchant_name_templates = {
    "Groceries & Supermarkets": [
        "{city} Grocery", "{city} Market", "{adjective} Foods", "{city} Supermarket",
        "{adjective} Grocery Co", "Fresh {city} Market", "{city} Food Store",
        "{adjective} Mart", "Super {city} Foods", "{city} Fresh Market"
    ],
    "Restaurants & Fast Food": [
        "{city} Grill", "{city} Diner", "{adjective} Cafe", "{city} Bistro",
        "{city} Pizza", "{adjective} Burger", "{city} Steakhouse", "{adjective} Kitchen",
        "{city} BBQ", "{adjective} Wings", "{city} Sushi Bar", "{adjective} Tacos"
    ],
    "Gas & Fuel": [
        "{city} Gas Station", "{adjective} Fuel", "{city} Quick Stop", "{adjective} Petroleum",
        "{city} Fuel Depot", "{adjective} Gas & Go", "{city} Energy Station"
    ],
    "Travel (Airline, Hotel, Car Rental)": [
        "{city} Hotel", "{adjective} Inn", "{city} Suites", "{adjective} Airlines",
        "{city} Car Rental", "{adjective} Express Hotel", "{city} Vacation Rentals",
        "{adjective} Lodge", "{city} Travel Services"
    ],
    "Healthcare & Pharmacies": [
        "{city} Pharmacy", "{adjective} Health", "{city} Medical Center", "{adjective} Drugstore",
        "{city} Wellness", "{adjective} Care Pharmacy", "{city} Health Clinic"
    ],
    "Retail ‚Äì Clothing & Department Stores": [
        "{city} Fashion", "{adjective} Apparel", "{city} Department Store", "{adjective} Boutique",
        "{city} Clothing Co", "{adjective} Style", "{city} Outfitters", "{adjective} Wear"
    ],
    "Online Marketplaces (Amazon, Etsy, etc.)": [
        "{adjective} Shop Online", "{city} E-commerce", "{adjective} Marketplace",
        "{city} Online Store", "{adjective} Web Shop", "Digital {city} Market"
    ],
    "Entertainment (Movies, Events, Digital Media)": [
        "{city} Cinema", "{adjective} Theater", "{city} Entertainment", "{adjective} Streaming",
        "{city} Events", "{adjective} Media", "{city} Concert Hall", "{adjective} Gaming"
    ],
    "Utilities & Bills (Electric, Telecom, Internet)": [
        "{city} Electric", "{adjective} Power", "{city} Telecom", "{adjective} Internet",
        "{city} Utilities", "{adjective} Communications", "{city} Energy Co"
    ],
    "Other (Government, Insurance, Misc.)": [
        "{city} Insurance", "{adjective} Services", "{city} Government Office",
        "{adjective} Insurance Co", "{city} Municipal Services", "{adjective} Solutions"
    ]
}

# Lists for generating names
# `cities` supplies the {city} placeholder of the merchant-name templates.
# It is only used for naming and is separate from `city_state_pairs`, which
# drives merchant_location, so a merchant's name and location can differ.
# Order matters: entries are chosen with a seeded random.choice.
cities = [
    "Springfield", "Franklin", "Clinton", "Madison", "Georgetown", "Salem", "Fairview", "Bristol",
    "Manchester", "Oakland", "Ashland", "Burlington", "Riverside", "Chester", "Dover", "Hudson",
    "Jackson", "Kingston", "Marion", "Newport", "Oxford", "Princeton", "Winchester", "Arlington",
    "Auburn", "Concord", "Dayton", "Easton", "Greenville", "Milton", "Clifton", "Hamilton",
    "Lawrence", "Richmond", "Washington", "Lincoln", "Columbus", "Portland", "Denver", "Phoenix",
    "Austin", "Seattle", "Boston", "Dallas", "Miami", "Atlanta", "Detroit", "Minneapolis",
    "Cleveland", "Memphis", "Nashville", "Charlotte", "Milwaukee", "Pittsburgh", "Baltimore",
    "Tampa", "Orlando", "St. Louis", "Raleigh", "Buffalo", "Rochester", "Cincinnati"
]

# `adjectives` supplies the {adjective} placeholder of the merchant-name
# templates. Order matters for the same reason as `cities`.
adjectives = [
    "Golden", "Silver", "Blue", "Green", "Red", "Sunrise", "Sunset", "Mountain", "Valley",
    "River", "Ocean", "Star", "Crown", "Royal", "Prime", "Elite", "Premier", "Supreme",
    "Grand", "Main", "Central", "Metro", "Urban", "Summit", "Peak", "Haven", "Harbor",
    "Bay", "Coast", "Lakeside", "Park", "Garden", "Forest", "Meadow", "Hill", "Ridge"
]

# Real US city-state pairs
# Each entry is a "City, State" string with the full state name after the last
# ", ". generate_location returns one entry verbatim as merchant_location, and
# extract_region looks the state part up in state_to_region.
# Order matters: entries are chosen with a seeded random.choice.
city_state_pairs = [
    "New York, New York", "Los Angeles, California", "Chicago, Illinois", "Houston, Texas",
    "Phoenix, Arizona", "Philadelphia, Pennsylvania", "San Antonio, Texas", "San Diego, California",
    "Dallas, Texas", "San Jose, California", "Austin, Texas", "Jacksonville, Florida",
    "Fort Worth, Texas", "Columbus, Ohio", "San Francisco, California", "Charlotte, North Carolina",
    "Indianapolis, Indiana", "Seattle, Washington", "Denver, Colorado", "Boston, Massachusetts",
    "Nashville, Tennessee", "Detroit, Michigan", "Portland, Oregon", "Las Vegas, Nevada",
    "Memphis, Tennessee", "Louisville, Kentucky", "Baltimore, Maryland", "Milwaukee, Wisconsin",
    "Albuquerque, New Mexico", "Tucson, Arizona", "Fresno, California", "Mesa, Arizona",
    "Sacramento, California", "Atlanta, Georgia", "Kansas City, Missouri", "Colorado Springs, Colorado",
    "Omaha, Nebraska", "Raleigh, North Carolina", "Miami, Florida", "Cleveland, Ohio",
    "Tulsa, Oklahoma", "Oakland, California", "Minneapolis, Minnesota", "Wichita, Kansas",
    "Arlington, Texas", "Tampa, Florida", "New Orleans, Louisiana", "Bakersfield, California",
    "Aurora, Colorado", "Anaheim, California", "Honolulu, Hawaii", "Santa Ana, California",
    "Riverside, California", "Corpus Christi, Texas", "Lexington, Kentucky", "Pittsburgh, Pennsylvania",
    "Anchorage, Alaska", "Stockton, California", "Cincinnati, Ohio", "Saint Paul, Minnesota",
    "Toledo, Ohio", "Greensboro, North Carolina", "Newark, New Jersey", "Plano, Texas",
    "Henderson, Nevada", "Lincoln, Nebraska", "Buffalo, New York", "Jersey City, New Jersey",
    "Chula Vista, California", "Orlando, Florida", "Norfolk, Virginia", "Chandler, Arizona",
    "Laredo, Texas", "Madison, Wisconsin", "Winston-Salem, North Carolina", "Lubbock, Texas",
    "Baton Rouge, Louisiana", "Durham, North Carolina", "Garland, Texas", "Glendale, Arizona",
    "Reno, Nevada", "Hialeah, Florida", "Chesapeake, Virginia", "Gilbert, Arizona",
    "Boise, Idaho", "Irving, Texas", "Scottsdale, Arizona", "North Las Vegas, Nevada",
    "Fremont, California", "Richmond, Virginia", "San Bernardino, California", "Birmingham, Alabama",
    "Spokane, Washington", "Rochester, New York", "Des Moines, Iowa", "Modesto, California",
    "Fayetteville, North Carolina", "Tacoma, Washington", "Oxnard, California", "Fontana, California",
    "Springfield, Massachusetts", "Springfield, Missouri", "Springfield, Illinois", "Sioux Falls, South Dakota",
    "Little Rock, Arkansas", "Portland, Maine", "Burlington, Vermont", "Wilmington, Delaware",
    "Charleston, South Carolina", "Manchester, New Hampshire", "Billings, Montana", "Cheyenne, Wyoming",
    "Fargo, North Dakota", "Jackson, Mississippi", "Salt Lake City, Utah", "Bismarck, North Dakota"
]

# US Regions mapping
# Maps a full state name to one of four regions: "Northeast", "Midwest",
# "South" or "West". All 50 states are listed; anything else (for example the
# District of Columbia or a territory) is absent, and extract_region reports
# "Unknown" for it.
# NOTE(review): the grouping appears to follow the US Census Bureau regions —
# confirm before relying on it for reporting.
state_to_region = {
    # Northeast
    "Connecticut": "Northeast", "Maine": "Northeast", "Massachusetts": "Northeast",
    "New Hampshire": "Northeast", "Rhode Island": "Northeast", "Vermont": "Northeast",
    "New Jersey": "Northeast", "New York": "Northeast", "Pennsylvania": "Northeast",
    
    # Midwest
    "Illinois": "Midwest", "Indiana": "Midwest", "Michigan": "Midwest", "Ohio": "Midwest",
    "Wisconsin": "Midwest", "Iowa": "Midwest", "Kansas": "Midwest", "Minnesota": "Midwest",
    "Missouri": "Midwest", "Nebraska": "Midwest", "North Dakota": "Midwest", "South Dakota": "Midwest",
    
    # South
    "Delaware": "South", "Florida": "South", "Georgia": "South", "Maryland": "South",
    "North Carolina": "South", "South Carolina": "South", "Virginia": "South",
    "West Virginia": "South", "Alabama": "South", "Kentucky": "South", "Mississippi": "South",
    "Tennessee": "South", "Arkansas": "South", "Louisiana": "South", "Oklahoma": "South",
    "Texas": "South",
    
    # West
    "Arizona": "West", "Colorado": "West", "Idaho": "West", "Montana": "West",
    "Nevada": "West", "New Mexico": "West", "Utah": "West", "Wyoming": "West",
    "Alaska": "West", "California": "West", "Hawaii": "West", "Oregon": "West",
    "Washington": "West"
}


In [0]:
# ============================================
# UDF Definitions
# ============================================

def generate_merchant_name(category, seed):
    """Generate a deterministic, realistic-looking merchant name for a category.

    Args:
        category: Merchant category; selects the template list in
            ``merchant_name_templates``. Unknown categories use the
            "Other (Government, Insurance, Misc.)" templates.
        seed: Seed for the random draws. The same (category, seed) pair
            always yields the same name.

    Returns:
        The chosen template with ``{city}`` and ``{adjective}`` filled in.
    """
    # A private generator yields the same draw sequence as seeding the module
    # RNG, but does not reseed the process-wide `random` state on every row.
    rng = random.Random(seed)
    templates = merchant_name_templates.get(category, merchant_name_templates["Other (Government, Insurance, Misc.)"])
    # Draw order (template, city, adjective) is part of the output contract.
    template = rng.choice(templates)
    city = rng.choice(cities)
    adjective = rng.choice(adjectives)
    return template.format(city=city, adjective=adjective)

def generate_location(seed):
    """Pick a deterministic "City, State" location for the given seed.

    Args:
        seed: Seed for the random draw. The same seed always yields the same
            entry of ``city_state_pairs``.

    Returns:
        One entry of ``city_state_pairs``, unchanged.
    """
    # A private generator yields the same draw as seeding the module RNG, but
    # does not reseed the process-wide `random` state on every row.
    return random.Random(seed).choice(city_state_pairs)

def extract_region(location):
    """Return the region for a 'City, State' string.

    The text after the last ", " is treated as the state name and looked up
    in ``state_to_region``. Empty/None input, input without a ", "
    separator, and unrecognised states all yield "Unknown".
    """
    has_separator = bool(location) and ", " in location
    if has_separator:
        # Only the trailing segment is the state; cities may contain ", ".
        state_name = location.rsplit(", ", 1)[-1].strip()
        return state_to_region.get(state_name, "Unknown")
    return "Unknown"


# Wrap the plain-Python generators as Spark UDFs; each one returns a string.
merchant_name_udf = F.udf(generate_merchant_name, returnType=StringType())
location_udf = F.udf(generate_location, returnType=StringType())
region_udf = F.udf(extract_region, returnType=StringType())

In [0]:
# ============================================
# Generate Merchant IDs for Each Category
# ============================================

# One DataFrame per category, collected here and unioned afterwards.
dfs = []

for category, (start_id, end_id, distribution) in merchant_ranges.items():
    # The ID range is inclusive on both ends.
    available_ids = end_id - start_id + 1
    # Requested head-count for this category (fraction truncated).
    num_merchants = int(total_merchants * distribution)

    # Never hand out more IDs than the category's range contains.
    if num_merchants > available_ids:
        print(f"‚ö†Ô∏è  Warning: {category} needs {num_merchants} IDs but only {available_ids} available. Adjusting...")
        num_merchants = available_ids

    print(f"üìä Generating {num_merchants} merchants for {category} (IDs: {start_id}-{end_id})")

    # Sequential IDs: range offset 0..n-1 shifted to the start of the category's range.
    merchant_id_expr = F.lit(start_id) + F.col("id")

    dfs.append(
        spark.range(num_merchants).select(
            merchant_id_expr.alias("merchant_id"),
            F.lit(category).alias("merchant_category"),
            # The ID itself seeds the name; a multiple of it seeds the
            # location so the two draws use different seeds.
            merchant_id_expr.alias("seed_for_name"),
            (merchant_id_expr * 17).alias("seed_for_location"),
        )
    )

# Stack the per-category frames into a single merchants DataFrame.
df_merchants = dfs[0]
for category_df in dfs[1:]:
    df_merchants = df_merchants.union(category_df)

In [0]:
# ============================================
# Add Merchant Details
# ============================================

# Generated attributes, each derived from the per-row seed columns.
merchant_name_col = merchant_name_udf(F.col("merchant_category"), F.col("seed_for_name"))
merchant_location_col = location_udf(F.col("seed_for_location"))

# partition_date is the business date; etl_date / etl_time record when this
# job actually ran.
partition_date_col = F.lit(partition_date).cast(DateType())
etl_date_col = F.lit(etl_date).cast(DateType())
etl_time_col = F.lit(etl_time).cast(StringType())

df_merchants = (
    df_merchants
    .withColumn("merchant_name", merchant_name_col)
    .withColumn("merchant_location", merchant_location_col)
    # The region is looked up from the location generated just above.
    .withColumn("merchant_region", region_udf(F.col("merchant_location")))
    # Final projection: fixes the column order and leaves the helper seed
    # columns behind.
    .select(
        "merchant_id",
        "merchant_name",
        "merchant_category",
        "merchant_location",
        "merchant_region",
        partition_date_col.alias("partition_date"),  # Business date - partition column
        etl_date_col.alias("etl_date"),              # ETL run date
        etl_time_col.alias("etl_time"),              # ETL run time
    )
)

In [0]:
# # ============================================
# # Write to Delta Table (Managed, Partitioned)
# # ============================================

# print(f"\nüíæ Writing to Delta managed table: {bronze_schema}.{merchants_table}")
# print(f"üìç Partitioned by: partition_date")

# (
#     df_merchants
#     .write
#     .format("delta")
#     .mode("overwrite")
#     .partitionBy("partition_date")
#     .option("overwriteSchema", "true")
#     .saveAsTable(f"{bronze_schema}.{merchants_table}")
# )

# print(f"‚úÖ Successfully written {df_merchants.count()} merchants to {bronze_schema}.{merchants_table}")

In [0]:
# ============================================
# Write to Delta Table (Managed, Partitioned)
# ============================================

print(f"\nüíæ Writing to Delta managed table: {schema_name}.{merchants_table}")

# Keep only the rows that belong to the business date being (re)written.
partition_rows = df_merchants.filter(F.col("partition_date") == partition_date)

# Overwrite mode combined with replaceWhere replaces just the rows matching
# the predicate instead of the whole table.
# NOTE(review): no partitionBy is given here, so if the table does not exist
# yet it is presumably created unpartitioned despite the heading — confirm the
# table is created (partitioned) elsewhere.
delta_writer = (
    partition_rows.write
    .format("delta")
    .mode("overwrite")
    .option("replaceWhere", f"partition_date = '{partition_date}'")
)
delta_writer.saveAsTable(f"{schema_name}.{merchants_table}")

print(f"‚úÖ Successfully overwritten partition {partition_date} in {schema_name}.{merchants_table}")