## This notebook is from GitRepo

In [0]:
%sql
-- In Databricks notebooks, the SparkSession 'spark' is automatically available, so no need to import or create it.

-- MasterCard Unified Customer Schema
--
-- Target table for the harmonization pipeline. It is created under the name
-- the pipeline writes to and the analytics queries read from
-- (mastercard.unified_customers) so that the partitioning and column types
-- declared here actually apply to the data those cells touch.
-- IF NOT EXISTS keeps the cell safe to re-run.
CREATE SCHEMA IF NOT EXISTS mastercard;

CREATE TABLE IF NOT EXISTS mastercard.unified_customers (
    -- Universal identifiers
    mc_customer_id STRING,              -- MasterCard's unified ID
    source_system STRING,               -- Which acquired company
    source_customer_id STRING,          -- Original customer ID

    -- Customer information (nullable to handle variations)
    first_name STRING,
    last_name STRING,
    full_name STRING,                   -- For systems that store full name
    email STRING,
    phone STRING,

    -- Financial data
    primary_balance DECIMAL(15,2),
    currency_code STRING DEFAULT 'USD', -- Needs the allowColumnDefaults table feature (set below)

    -- Geographic
    country_code STRING,
    region STRING,

    -- Metadata
    original_signup_date TIMESTAMP,
    data_source_version STRING,
    last_updated TIMESTAMP,

    -- Flexible columns for system-specific data
    additional_attributes MAP<STRING, STRING>,  -- Key-value pairs
    raw_source_data STRING                      -- JSON string of original record
)
USING DELTA
PARTITIONED BY (source_system, country_code)
-- Delta rejects column DEFAULT clauses unless this table feature is enabled.
TBLPROPERTIES ('delta.feature.allowColumnDefaults' = 'supported');


In [0]:
# Databricks notebook for data harmonization

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import uuid

# getOrCreate() returns the already-running session when one exists (as in a
# Databricks notebook) and only builds a new one when run elsewhere.
spark = SparkSession.builder.getOrCreate()

def harmonize_company_a_data(df):
    """Project Company A's bank customer records onto the unified customer columns.

    Args:
        df: Spark DataFrame of Company A customers. The projection reads the
            columns ``customer_id``, ``first_name``, ``last_name``,
            ``balance``, ``created_date`` and ``account_number``.

    Returns:
        DataFrame with the unified columns (``mc_customer_id`` through
        ``raw_source_data``) in the order declared by the unified table DDL.
        ``email`` and ``phone`` are always NULL for this source.
    """
    return df.select(
        # Generate MasterCard unified ID
        concat(lit("MC_"), col("customer_id").cast("string")).alias("mc_customer_id"),
        lit("CompanyA_Bank").alias("source_system"),
        col("customer_id").cast("string").alias("source_customer_id"),

        # Customer info
        col("first_name"),
        col("last_name"),
        # concat() yields NULL when either name part is NULL.
        concat(col("first_name"), lit(" "), col("last_name")).alias("full_name"),
        lit(None).cast("string").alias("email"),  # Company A doesn't have email
        lit(None).cast("string").alias("phone"),

        # Financial: cast so the column type matches DECIMAL(15,2) in the
        # unified schema and in the other sources' frames it is unioned with.
        col("balance").cast("decimal(15,2)").alias("primary_balance"),
        lit("USD").alias("currency_code"),

        # Geographic (assume US for this legacy bank)
        lit("US").alias("country_code"),
        lit("North America").alias("region"),

        # Metadata
        col("created_date").cast("timestamp").alias("original_signup_date"),
        lit("v1.0").alias("data_source_version"),
        current_timestamp().alias("last_updated"),

        # Additional attributes. create_map() builds the MAP<STRING, STRING>
        # column; a bare map() would be the Python builtin, which returns an
        # iterator without .alias().
        create_map(
            lit("account_number"), col("account_number").cast("string")
        ).alias("additional_attributes"),

        # Store original record
        to_json(struct("*")).alias("raw_source_data")
    )

def harmonize_company_b_data(df):
    """Project Company B's nested JSON fintech records onto the unified customer columns.

    Args:
        df: Spark DataFrame of Company B users. The projection reads
            ``userId``, ``profile.fullName``, ``profile.email``,
            ``profile.phone``, ``walletBalance``,
            ``metadata.preferredCurrency``, ``metadata.signupTimestamp`` and
            ``transactions``.

    Returns:
        DataFrame with the unified columns (``mc_customer_id`` through
        ``raw_source_data``) in the order declared by the unified table DDL.
    """
    # Tokenize the full name once; both first_name and last_name derive from it.
    name_parts = split(col("profile.fullName"), " ")

    return df.select(
        # Generate MasterCard unified ID (userId with dashes stripped)
        concat(lit("MC_"), regexp_replace(col("userId"), "-", "")).alias("mc_customer_id"),
        lit("CompanyB_Fintech").alias("source_system"),
        col("userId").alias("source_customer_id"),

        # Parse nested JSON structure
        name_parts[0].alias("first_name"),
        # Take the LAST token so middle names are not mistaken for the
        # surname; single-token names keep a NULL last_name.
        when(size(name_parts) > 1, element_at(name_parts, -1)).alias("last_name"),
        col("profile.fullName").alias("full_name"),
        col("profile.email").alias("email"),
        col("profile.phone").alias("phone"),

        # Financial
        col("walletBalance").cast("decimal(15,2)").alias("primary_balance"),
        col("metadata.preferredCurrency").alias("currency_code"),

        # Geographic (extract from phone or default)
        # NOTE(review): "+1" is treated as US here; the prefix alone cannot
        # distinguish other countries sharing that calling code — confirm.
        when(col("profile.phone").startswith("+1"), "US")
        .otherwise("Unknown").alias("country_code"),
        lit("TBD").alias("region"),

        # Metadata
        to_timestamp(col("metadata.signupTimestamp")).alias("original_signup_date"),
        lit("v2.0").alias("data_source_version"),
        current_timestamp().alias("last_updated"),

        # Store additional fintech-specific attributes. create_map() builds
        # the MAP column; a bare map() would be the Python builtin, which
        # returns an iterator without .alias().
        create_map(
            lit("wallet_type"), lit("digital"),
            lit("transaction_count"), size(col("transactions")).cast("string")
        ).alias("additional_attributes"),

        # Original data
        to_json(struct("*")).alias("raw_source_data")
    )

def harmonize_company_c_data(df):
    """Project Company C's e-commerce accounts onto the unified customer columns.

    Args:
        df: Spark DataFrame of Company C user accounts. The projection reads
            ``user_guid``, ``customer_name``, ``email_address``,
            ``account_balance``, ``country_code``, ``registration_datetime``
            and ``payment_methods``.

    Returns:
        DataFrame with the unified columns (``mc_customer_id`` through
        ``raw_source_data``) in the order declared by the unified table DDL.
        ``phone`` is always NULL for this source.
    """
    return df.select(
        concat(lit("MC_"), col("user_guid")).alias("mc_customer_id"),
        lit("CompanyC_Ecommerce").alias("source_system"),
        col("user_guid").alias("source_customer_id"),

        # Parse full name: first token / last token of customer_name
        split(col("customer_name"), " ")[0].alias("first_name"),
        expr("substring_index(customer_name, ' ', -1)").alias("last_name"),
        col("customer_name").alias("full_name"),
        col("email_address").alias("email"),
        lit(None).cast("string").alias("phone"),

        # Financial (handle FLOAT to DECIMAL conversion)
        col("account_balance").cast("decimal(15,2)").alias("primary_balance"),
        lit("USD").alias("currency_code"),

        # Geographic. The conditional chain must start with when(); PySpark
        # has no case() function.
        col("country_code"),
        when(col("country_code").isin("US", "CA"), "North America")
        .when(col("country_code").isin("GB", "DE", "FR"), "Europe")
        .otherwise("Other").alias("region"),

        # Metadata: cast so the column is TIMESTAMP like the other sources'.
        col("registration_datetime").cast("timestamp").alias("original_signup_date"),
        lit("v1.5").alias("data_source_version"),
        current_timestamp().alias("last_updated"),

        # Store the payment_methods value as-is next to a platform tag.
        # create_map() builds the MAP column (a bare map() would be the Python
        # builtin). All map values must share one type, so payment_methods is
        # assumed to already be a string — TODO confirm against the source table.
        create_map(
            lit("payment_methods"), col("payment_methods"),
            lit("platform"), lit("ecommerce")
        ).alias("additional_attributes"),

        to_json(struct("*")).alias("raw_source_data")
    )

# Main processing pipeline
def process_all_acquisitions():
    """Load, harmonize, validate and persist customers from all acquired companies.

    Reads Company A and Company C from tables and Company B from JSON files,
    maps each onto the unified columns, keeps only rows that pass the quality
    filter, and appends them to ``mastercard.unified_customers``.

    Returns:
        The DataFrame of rows that passed the quality filter (the same rows
        that were appended to the table).

    Note:
        The write uses append mode, so every call adds the rows again; it does
        not deduplicate against rows already in the table.
    """

    # Load data from each source
    company_a_df = spark.read.table("raw_data.company_a_customers")
    company_b_df = spark.read.json("/mnt/acquisitions/company_b/customers/")
    company_c_df = spark.read.table("raw_data.company_c_user_accounts")

    # Transform each dataset
    unified_a = harmonize_company_a_data(company_a_df)
    unified_b = harmonize_company_b_data(company_b_df)
    unified_c = harmonize_company_c_data(company_c_df)

    # Combine all datasets. unionByName matches columns by name, so a change
    # in one harmonizer's column order cannot silently misalign the data the
    # way a positional union() would.
    unified_customers = unified_a.unionByName(unified_b).unionByName(unified_c)

    # Data quality checks: an ID, a balance, and at least one name field.
    quality_checked = unified_customers.filter(
        col("mc_customer_id").isNotNull() &
        col("primary_balance").isNotNull() &
        (col("first_name").isNotNull() | col("full_name").isNotNull())
    )

    # Append to the Delta table; mergeSchema allows new columns to be added
    # to the table schema (this is schema evolution, not a MERGE/upsert).
    quality_checked.write \
        .format("delta") \
        .option("mergeSchema", "true") \
        .mode("append") \
        .saveAsTable("mastercard.unified_customers")

    return quality_checked

# Execute the pipeline.
# NOTE: each run of this cell appends to mastercard.unified_customers (the
# write inside process_all_acquisitions uses mode("append")); result_df holds
# the rows that passed the quality filter.
result_df = process_all_acquisitions()

In [0]:
# Company D has new fields like 'credit_score' and 'risk_category'.
# With schema merging enabled on the write, Delta Lake adds these columns to the
# table; existing records read back NULL for them.

# Enable automatic schema evolution for this Spark session
spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled", "true")

# Append the new data; the mergeSchema option lets its extra columns be added
# to the table schema.
# NOTE(review): `new_data` is not defined in any cell visible in this notebook —
# presumably a harmonized Company D DataFrame; it must be created before this
# cell can run on a fresh kernel. TODO confirm its source.
new_data.write.format("delta").option("mergeSchema", "true").mode("append").saveAsTable("mastercard.unified_customers")

In [0]:
%sql
-- Data Science: customer count, mean balance and most recent signup for each
-- (source system, country) pair, largest groups first.
SELECT
    source_system
  , country_code
  , COUNT(*)                  AS customer_count
  , AVG(primary_balance)      AS avg_balance
  , MAX(original_signup_date) AS latest_signup
FROM mastercard.unified_customers
GROUP BY
    source_system
  , country_code
ORDER BY customer_count DESC;

-- Machine Learning: per-customer features across all sources, restricted to
-- customers whose balance exceeds 1000.
SELECT
    mc_customer_id
  , CASE
        WHEN source_system LIKE '%Fintech%' THEN 'digital_native'
        WHEN source_system LIKE '%Bank%'    THEN 'traditional'
        ELSE 'ecommerce'
    END                                            AS customer_segment
  , primary_balance
  , DATEDIFF(CURRENT_DATE(), original_signup_date) AS customer_age_days
  , additional_attributes['payment_methods']       AS payment_preferences
FROM mastercard.unified_customers
WHERE primary_balance > 1000;