In [0]:
# Databricks notebook source
# =====================================================
# SIMPLE CARE GAPS ETL - BEGINNER FRIENDLY
# Read from ADLS, Create Delta Tables
# =====================================================
#
# Overview of the cells below:
#   1. Configure Spark access to the ADLS storage account.
#   2. Define the landing-zone path and the catalog/schema names.
#   3. Read five Parquet extracts from the landing zone.
#   4. Write each extract unchanged to a Bronze Delta table.
#   5. Build the Silver tables (null filtering, priority label, patient join).
#   6. Build the Gold tables (gap summary, provider dashboard).
#   7. Print a summary of row counts and table names.
#
# The notebook relies on the `spark` session that the Databricks runtime
# provides; it is not created here.

# Progress marker so the run log shows where the ETL began.
print("Starting Care Gaps ETL...")

# COMMAND ----------

# MAGIC %md
# MAGIC ## 1. Configuration (Edit these paths for your environment)

# COMMAND ----------

# Storage account configuration - CHANGE THESE TO YOUR VALUES
STORAGE_ACCOUNT = "duse1achstdbx1"  # Storage account name, e.g. abfss://dev@duse1achstdbx1.dfs.core.windows.net/
CONTAINER = "dev"                   # Container name

# SECURITY: the storage account access key must never be written into the
# notebook source. It is fetched at run time from a Databricks secret scope,
# so it is not stored in the notebook or in version control.
#
# CHANGE THESE to the secret scope / key name that hold the access key in
# your workspace.
# NOTE(review): the access key that used to be hard-coded on this line has
# been exposed in source and should be rotated in the Azure Portal
# (Storage Account -> Access Keys).
SECRET_SCOPE = "adls-secrets"
SECRET_KEY_NAME = f"{STORAGE_ACCOUNT}-access-key"

# Raises if the scope or key does not exist, which stops the notebook before
# any read is attempted with missing credentials.
STORAGE_KEY = dbutils.secrets.get(scope=SECRET_SCOPE, key=SECRET_KEY_NAME)

# Configure Spark to access ADLS with the account key for this session.
spark.conf.set(
    f"fs.azure.account.key.{STORAGE_ACCOUNT}.dfs.core.windows.net",
    STORAGE_KEY
)

print(f"✓ Configured access to storage account: {STORAGE_ACCOUNT}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## 2. Define Paths (All in one place)

# COMMAND ----------

# Root abfss:// URI of the container configured above.
BASE_PATH = "abfss://{}@{}.dfs.core.windows.net".format(CONTAINER, STORAGE_ACCOUNT)

# Folder the Parquet extracts are read from (the landing zone ADF writes to).
LANDING_PATH = BASE_PATH + "/landing/chmca_custom/caregaps"

# Catalog and schema names for the three medallion layers.
CATALOG = "dev_kiddo"
BRONZE_SCHEMA = "bronze"
SILVER_SCHEMA = "silver"
GOLD_SCHEMA = "gold"

# Despite the *_PATH names these are "<catalog>.<schema>" prefixes, not file
# system paths: the cells below append ".<table>" and pass the result to
# saveAsTable().
BRONZE_PATH = ".".join((CATALOG, BRONZE_SCHEMA))
SILVER_PATH = ".".join((CATALOG, SILVER_SCHEMA))
GOLD_PATH = ".".join((CATALOG, GOLD_SCHEMA))

# Echo the resolved locations so a run log shows exactly what was used.
print("Paths configured:")
for _label, _location in (
    ("Landing:", LANDING_PATH),
    ("Bronze: ", BRONZE_PATH),
    ("Silver: ", SILVER_PATH),
    ("Gold:   ", GOLD_PATH),
):
    print(f"  {_label} {_location}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## 3. Read Parquet Files from Landing Zone

# COMMAND ----------

def _read_landing_parquet(file_name, label):
    """Read one Parquet extract from the landing zone and report its size.

    Args:
        file_name: File name relative to LANDING_PATH.
        label: Human-readable name used in the progress message.

    Returns:
        A ``(DataFrame, row_count)`` tuple.
    """
    frame = spark.read.parquet(f"{LANDING_PATH}/{file_name}")
    # count() is an action: it forces the file to be read now, so a missing
    # or unreadable extract fails here rather than in a later cell.
    row_total = frame.count()
    print(f"✓ {label}: {row_total:,} rows")
    return frame, row_total


_banner = "=" * 60
print("\n" + _banner)
print("STEP 1: Reading Parquet files from landing zone...")
print(_banner)

# Each call loads one extract and keeps both the DataFrame and its row count
# for the later cells and the final summary.
df_care_gaps, care_gaps_count = _read_landing_parquet(
    "dbo.STG_CareGaps_Daily.parquet", "Care Gaps"
)
df_appointments, appointments_count = _read_landing_parquet(
    "dbo.STG_Appointments_Daily.parquet", "Appointments"
)
df_patient_summary, patient_summary_count = _read_landing_parquet(
    "dbo.STG_PatientGapSummary_Daily.parquet", "Patient Summary"
)
df_provider_metrics, provider_metrics_count = _read_landing_parquet(
    "dbo.STG_ProviderMetrics_Daily.parquet", "Provider Metrics"
)
df_campaign_opportunities, campaign_opportunities_count = _read_landing_parquet(
    "dbo.STG_CampaignOpportunities_Daily.parquet", "Campaign Opportunities"
)

print("\n✓ All files read successfully!")

# COMMAND ----------

# MAGIC %md
# MAGIC ## 4. Create Bronze Delta Tables (Raw Data)

# COMMAND ----------

_banner = "=" * 60
print("\n" + _banner)
print("STEP 2: Creating Bronze Delta tables...")
print(_banner)

# Bronze holds the extracts exactly as read: no filtering or new columns.
# Each entry pairs a landing DataFrame with the Bronze table it is saved to.
_bronze_targets = (
    (df_care_gaps, "care_gaps_daily"),
    (df_appointments, "appointments_daily"),
    (df_patient_summary, "patient_summary_daily"),
    (df_provider_metrics, "provider_metrics_daily"),
    (df_campaign_opportunities, "campaign_opportunities_daily"),
)

for _frame, _table in _bronze_targets:
    # mode("overwrite") replaces the table contents on every run.
    _frame.write.format("delta").mode("overwrite").saveAsTable(
        f"{BRONZE_PATH}.{_table}"
    )
    print(f"✓ Bronze: {_table} created")

# COMMAND ----------

# MAGIC %md
# MAGIC ## 5. Create Silver Delta Tables (Cleaned Data)

# COMMAND ----------

# Import only the Spark SQL functions this notebook uses. A wildcard import
# of pyspark.sql.functions rebinds Python builtins such as min, max, sum and
# round for the rest of the notebook session, so the Spark aggregate is
# imported under an explicit alias instead.
# `count` and `countDistinct` are not used in this cell; they are imported
# here because the Gold cell further down relies on them.
from pyspark.sql.functions import col, count, countDistinct, when
from pyspark.sql.functions import min as spark_min

print("\n" + "="*60)
print("STEP 3: Creating Silver Delta tables...")
print("="*60)

# Silver: Care Gaps (cleaned)
# Drops rows missing PAT_ID or GAP_TYPE and adds a text label for
# PRIORITY_LEVEL: 1 -> "Critical", 2 -> "Important", anything else
# (including NULL) -> "Routine".
df_care_gaps_clean = (
    df_care_gaps
    .filter(col("PAT_ID").isNotNull())
    .filter(col("GAP_TYPE").isNotNull())
    .withColumn(
        "PRIORITY_NAME",
        when(col("PRIORITY_LEVEL") == 1, "Critical")
        .when(col("PRIORITY_LEVEL") == 2, "Important")
        .otherwise("Routine"),
    )
)

care_gaps_cleaned_table = f"{SILVER_PATH}.care_gaps_cleaned"
df_care_gaps_clean.write.format("delta").mode("overwrite").saveAsTable(care_gaps_cleaned_table)

# DataFrames are lazy: calling df.count() after the write would re-run the
# whole read + filter plan. Counting the saved table avoids that and reports
# what was actually stored.
silver_care_gaps_count = spark.table(care_gaps_cleaned_table).count()
print(f"✓ Silver: care_gaps_cleaned ({silver_care_gaps_count:,} rows)")

# Silver: Patient 360 (joined data)
# One aggregated row per PAT_ID from appointments (minimum APPT_DATE and
# minimum DAYS_UNTIL_APPT), left-joined onto the patient summary so patients
# without appointments are kept with NULLs in the two appointment columns.
df_first_appointment = df_appointments.groupBy("PAT_ID").agg(
    spark_min("APPT_DATE").alias("FIRST_APPT_DATE"),
    spark_min("DAYS_UNTIL_APPT").alias("DAYS_UNTIL_FIRST_APPT"),
)
df_patient_360 = df_patient_summary.join(df_first_appointment, "PAT_ID", "left")

patient_360_table = f"{SILVER_PATH}.patient_360"
df_patient_360.write.format("delta").mode("overwrite").saveAsTable(patient_360_table)

silver_patient_360_count = spark.table(patient_360_table).count()
print(f"✓ Silver: patient_360 ({silver_patient_360_count:,} rows)")

# Silver: Campaign Opportunities (cleaned)
# Drops rows missing patient_mrn or campaign_type.
df_campaign_clean = (
    df_campaign_opportunities
    .filter(col("patient_mrn").isNotNull())
    .filter(col("campaign_type").isNotNull())
)

campaign_opportunities_table = f"{SILVER_PATH}.campaign_opportunities"
df_campaign_clean.write.format("delta").mode("overwrite").saveAsTable(campaign_opportunities_table)

silver_campaign_count = spark.table(campaign_opportunities_table).count()
print(f"✓ Silver: campaign_opportunities ({silver_campaign_count:,} rows)")

print("\n✓ All Silver tables created!")

# COMMAND ----------

# MAGIC %md
# MAGIC ## 6. Create Gold Delta Tables (Analytics)

# COMMAND ----------

print("\n" + "="*60)
print("STEP 4: Creating Gold Delta tables...")
print("="*60)

# Gold: Gap Summary by Type
# One row per (GAP_TYPE, PRIORITY_NAME) with the number of gap rows and the
# number of distinct patients. The sort only affects the order rows are
# written in; queries that need an order should still use ORDER BY.
df_gap_summary = (
    df_care_gaps_clean.groupBy("GAP_TYPE", "PRIORITY_NAME")
    .agg(
        count("*").alias("TOTAL_GAPS"),
        countDistinct("PAT_ID").alias("PATIENTS_AFFECTED"),
    )
    .orderBy("TOTAL_GAPS", ascending=False)
)

gap_summary_table = f"{GOLD_PATH}.gap_summary"
df_gap_summary.write.format("delta").mode("overwrite").saveAsTable(gap_summary_table)

# Count the saved table rather than the lazy DataFrame, which would re-run
# the aggregation and sort a second time.
gold_gap_summary_count = spark.table(gap_summary_table).count()
print(f"✓ Gold: gap_summary ({gold_gap_summary_count:,} rows)")

# Gold: Provider Dashboard
# GAP_RATE = TOTAL_GAPS / TOTAL_PATIENTS_WITH_GAPS.
# The division is guarded so a provider with a zero (or NULL) denominator
# gets a NULL rate. With ANSI SQL mode enabled an unguarded division by zero
# raises an error and fails the whole write; with ANSI mode off it already
# returned NULL, so the stored values are unchanged in that case.
df_provider_dashboard = df_provider_metrics.withColumn(
    "GAP_RATE",
    when(
        col("TOTAL_PATIENTS_WITH_GAPS") != 0,
        col("TOTAL_GAPS") / col("TOTAL_PATIENTS_WITH_GAPS"),
    ),
)

provider_dashboard_table = f"{GOLD_PATH}.provider_dashboard"
df_provider_dashboard.write.format("delta").mode("overwrite").saveAsTable(provider_dashboard_table)

gold_provider_dashboard_count = spark.table(provider_dashboard_table).count()
print(f"✓ Gold: provider_dashboard ({gold_provider_dashboard_count:,} rows)")

print("\n✓ All Gold tables created!")

# COMMAND ----------

# MAGIC %md
# MAGIC ## 7. Summary

# COMMAND ----------

# Final report: source row counts, then every table written, layer by layer.
# The lines are collected first and printed in order; each entry corresponds
# to one line (or blank line + line) of the report.
_rule = "=" * 60
_summary_report = [
    "\n" + _rule,
    "ETL COMPLETE - SUMMARY",
    _rule,
    # Row counts captured when the landing files were read.
    "\nData Loaded:",
    f"  Care Gaps:      {care_gaps_count:,} rows",
    f"  Appointments:   {appointments_count:,} rows",
    f"  Patient Summary: {patient_summary_count:,} rows",
    f"  Provider Metrics: {provider_metrics_count:,} rows",
    f"  Campaign Opportunities: {campaign_opportunities_count:,} rows",
    "\nDelta Tables Created:",
    # Bronze tables are listed by name only; no counts were taken for them.
    "\nBronze Layer:",
    f"  {BRONZE_PATH}.care_gaps_daily",
    f"  {BRONZE_PATH}.appointments_daily",
    f"  {BRONZE_PATH}.patient_summary_daily",
    f"  {BRONZE_PATH}.provider_metrics_daily",
    f"  {BRONZE_PATH}.campaign_opportunities_daily",
    "\nSilver Layer:",
    f"  {SILVER_PATH}.care_gaps_cleaned ({silver_care_gaps_count:,} rows)",
    f"  {SILVER_PATH}.patient_360 ({silver_patient_360_count:,} rows)",
    f"  {SILVER_PATH}.campaign_opportunities ({silver_campaign_count:,} rows)",
    "\nGold Layer:",
    f"  {GOLD_PATH}.gap_summary ({gold_gap_summary_count:,} rows)",
    f"  {GOLD_PATH}.provider_dashboard ({gold_provider_dashboard_count:,} rows)",
    "\n" + _rule,
    "✓ SUCCESS - All data processed!",
    _rule,
]

for _line in _summary_report:
    print(_line)

# COMMAND ----------

# MAGIC %md
# MAGIC ## 8. Preview Data (Optional)

# COMMAND ----------

# Uncomment to see sample data

# print("\nSample Care Gaps:")
# display(df_care_gaps_clean.limit(10))

# print("\nSample Patient 360:")
# display(df_patient_360.limit(10))

# print("\nGap Summary:")
# display(df_gap_summary)

Starting Care Gaps ETL...
✓ Configured access to storage account: duse1achstdbx1
Paths configured:
  Landing: abfss://dev@duse1achstdbx1.dfs.core.windows.net/landing/chmca_custom/caregaps
  Bronze:  dev_kiddo.bronze
  Silver:  dev_kiddo.silver
  Gold:    dev_kiddo.gold

STEP 1: Reading Parquet files from landing zone...
✓ Care Gaps: 2,026,452 rows
✓ Appointments: 150,196 rows
✓ Patient Summary: 324,258 rows
✓ Provider Metrics: 275 rows

✓ All files read successfully!

STEP 2: Creating Bronze Delta tables...
✓ Bronze: care_gaps_daily created
✓ Bronze: appointments_daily created
✓ Bronze: patient_summary_daily created
✓ Bronze: provider_metrics_daily created

STEP 3: Creating Silver Delta tables...
✓ Silver: care_gaps_cleaned (2,026,452 rows)
✓ Silver: patient_360 (324,258 rows)

✓ All Silver tables created!

STEP 4: Creating Gold Delta tables...
✓ Gold: gap_summary (27 rows)
✓ Gold: provider_dashboard (275 rows)

✓ All Gold tables created!

ETL COMPLETE - SUMMARY

Data Loaded:
  Care Ga