In [None]:
# Databricks notebook source
# =====================================================
# SIMPLE CARE GAPS ETL - BEGINNER FRIENDLY
# Read from ADLS, Create Delta Tables
# =====================================================

# Announce the start of the run in the notebook / job output.
print("Starting Care Gaps ETL...")

# COMMAND ----------

# MAGIC %md
# MAGIC ## 1. Configuration (Edit these paths for your environment)

# COMMAND ----------

# Parameters passed from the ADF pipeline. Every widget has a default so the
# notebook can also be run interactively.
from datetime import datetime

dbutils.widgets.text("run_date", "", "Run Date (yyyy-MM-dd)")
dbutils.widgets.text("environment", "dev", "Environment")
# The *_count widgets are declared so ADF can pass its sink row counts; the
# code visible in this notebook never reads them back.
dbutils.widgets.text("care_gaps_count", "0", "Care Gaps Row Count")
dbutils.widgets.text("appointments_count", "0", "Appointments Row Count")
dbutils.widgets.text("patient_summary_count", "0", "Patient Summary Row Count")
dbutils.widgets.text("provider_metrics_count", "0", "Provider Metrics Row Count")
dbutils.widgets.text("campaign_opportunities_count", "0", "Campaign Opportunities Row Count")

RUN_DATE = dbutils.widgets.get("run_date")
ENVIRONMENT = dbutils.widgets.get("environment")

if not RUN_DATE:
    # Manual run: no date was supplied, so fall back to today's date.
    RUN_DATE = datetime.now().strftime("%Y-%m-%d")
else:
    # RUN_DATE is spliced into the landing-zone path, so fail fast with a
    # clear message instead of a confusing "path not found" later on.
    try:
        datetime.strptime(RUN_DATE, "%Y-%m-%d")
    except ValueError as err:
        raise ValueError(
            f"run_date must be in yyyy-MM-dd format, got {RUN_DATE!r}"
        ) from err

print(f"Run Date: {RUN_DATE}")
print(f"Environment: {ENVIRONMENT}")

# Storage account configuration - CHANGE THESE TO YOUR VALUES
STORAGE_ACCOUNT = "duse1achstdbx1"  # ADLS Gen2 storage account name
CONTAINER = "dev"                   # Container (filesystem) name

# SECURITY: the account key must never live in notebook source. It is read
# from a Databricks secret scope instead; secret values are redacted in
# notebook output. The key that was previously hard-coded here has been
# exposed in source control and should be rotated in the Azure Portal.
# CHANGE the scope / key names below to match your workspace.
SECRET_SCOPE = "care-gaps-etl"
SECRET_KEY_NAME = "adls-storage-account-key"
STORAGE_KEY = dbutils.secrets.get(scope=SECRET_SCOPE, key=SECRET_KEY_NAME)

# Configure Spark to access ADLS with the account key.
spark.conf.set(
    f"fs.azure.account.key.{STORAGE_ACCOUNT}.dfs.core.windows.net",
    STORAGE_KEY
)

print(f"✓ Configured access to storage account: {STORAGE_ACCOUNT}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## 2. Define Paths (All in one place)

# COMMAND ----------

# Unity Catalog names for the medallion layers.
CATALOG = "dev_kiddo"
BRONZE_SCHEMA = "bronze"
SILVER_SCHEMA = "silver"
GOLD_SCHEMA = "gold"

# Root of the ADLS container configured above.
BASE_PATH = "abfss://{}@{}.dfs.core.windows.net".format(CONTAINER, STORAGE_ACCOUNT)

# Landing zone: ADF drops the Parquet files into one folder per run date.
LANDING_PATH = "/".join(
    [BASE_PATH, "landing", "chmca_custom", "caregaps", RUN_DATE]
)

# Despite the *_PATH names these are "catalog.schema" prefixes for
# saveAsTable(), not filesystem paths.
BRONZE_PATH = ".".join((CATALOG, BRONZE_SCHEMA))
SILVER_PATH = ".".join((CATALOG, SILVER_SCHEMA))
GOLD_PATH = ".".join((CATALOG, GOLD_SCHEMA))

print("Paths configured:")
for label, location in (
    ("Landing:", LANDING_PATH),
    ("Bronze:", BRONZE_PATH),
    ("Silver:", SILVER_PATH),
    ("Gold:", GOLD_PATH),
):
    # Pad the label to 8 characters so the values line up in the output.
    print(f"  {label:<8} {location}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## 3. Read Parquet Files from Landing Zone

# COMMAND ----------

_banner = "=" * 60
print(f"\n{_banner}")
print("STEP 1: Reading Parquet files from landing zone...")
print(_banner)


def _load_landing_file(file_name, label):
    """Read one Parquet file from the landing folder and report its size.

    Returns a ``(DataFrame, row_count)`` tuple; the row count is also printed
    under the given label.
    """
    frame = spark.read.parquet(f"{LANDING_PATH}/{file_name}")
    rows = frame.count()
    print(f"✓ {label}: {rows:,} rows")
    return frame, rows


# File names match the ADF sink activity output names.
df_care_gaps, care_gaps_count = _load_landing_file(
    "CareGaps_daily.parquet", "Care Gaps"
)
df_appointments, appointments_count = _load_landing_file(
    "Appointments_daily.parquet", "Appointments"
)
df_patient_summary, patient_summary_count = _load_landing_file(
    "PatientGapsSummary_daily.parquet", "Patient Summary"
)
df_provider_metrics, provider_metrics_count = _load_landing_file(
    "ProviderMetrics_daily.parquet", "Provider Metrics"
)
df_campaign_opportunities, campaign_opportunities_count = _load_landing_file(
    "CampaignOpportunities_daily.parquet", "Campaign Opportunities"
)

print("\n✓ All files read successfully!")

# COMMAND ----------

# MAGIC %md
# MAGIC ## 4. Create Bronze Delta Tables (Raw Data)

# COMMAND ----------

_banner = "=" * 60
print(f"\n{_banner}")
print("STEP 2: Creating Bronze Delta tables...")
print(_banner)

# Bronze keeps the landing data as-is: each raw DataFrame is written to its
# own Delta table, replacing whatever a previous run left there.
_bronze_targets = (
    (df_care_gaps, "care_gaps_daily"),
    (df_appointments, "appointments_daily"),
    (df_patient_summary, "patient_summary_daily"),
    (df_provider_metrics, "provider_metrics_daily"),
    (df_campaign_opportunities, "campaign_opportunities_daily"),
)

for raw_df, table_name in _bronze_targets:
    delta_writer = raw_df.write.format("delta").mode("overwrite")
    delta_writer.saveAsTable(f"{BRONZE_PATH}.{table_name}")
    print(f"✓ Bronze: {table_name} created")

# COMMAND ----------

# MAGIC %md
# MAGIC ## 5. Create Silver Delta Tables (Cleaned Data)

# COMMAND ----------

# Import only the Spark SQL helpers this notebook uses by bare name.
# A wildcard import from pyspark.sql.functions rebinds Python builtins such as
# sum, min, max, abs and round to Spark column functions for every later cell;
# the LLM cell further down calls sum(...) on a plain Python generator and
# needs the builtin. count / countDistinct are kept in scope because the Gold
# cell uses them unqualified.
from pyspark.sql import functions as F
from pyspark.sql.functions import col, count, countDistinct, when

print("\n" + "="*60)
print("STEP 3: Creating Silver Delta tables...")
print("="*60)

# Silver: Care Gaps (cleaned)
# Drop rows without a patient id or gap type, then add a readable priority
# label. Any PRIORITY_LEVEL other than 1 or 2 (including NULL) is "Routine".
df_care_gaps_clean = (
    df_care_gaps
    .filter(col("PAT_ID").isNotNull())
    .filter(col("GAP_TYPE").isNotNull())
    .withColumn(
        "PRIORITY_NAME",
        when(col("PRIORITY_LEVEL") == 1, "Critical")
        .when(col("PRIORITY_LEVEL") == 2, "Important")
        .otherwise("Routine"),
    )
)

df_care_gaps_clean.write.format("delta").mode("overwrite").saveAsTable(f"{SILVER_PATH}.care_gaps_cleaned")

silver_care_gaps_count = df_care_gaps_clean.count()
print(f"✓ Silver: care_gaps_cleaned ({silver_care_gaps_count:,} rows)")

# Silver: Patient 360 (joined data)
# Appointments are collapsed to one row per patient (smallest APPT_DATE and
# smallest DAYS_UNTIL_APPT). The left join keeps every patient-summary row,
# with NULLs for patients that have no appointment.
df_first_appointment = df_appointments.groupBy("PAT_ID").agg(
    F.min("APPT_DATE").alias("FIRST_APPT_DATE"),
    F.min("DAYS_UNTIL_APPT").alias("DAYS_UNTIL_FIRST_APPT"),
)
df_patient_360 = df_patient_summary.join(df_first_appointment, "PAT_ID", "left")

df_patient_360.write.format("delta").mode("overwrite").saveAsTable(f"{SILVER_PATH}.patient_360")

silver_patient_360_count = df_patient_360.count()
print(f"✓ Silver: patient_360 ({silver_patient_360_count:,} rows)")

# Silver: Campaign Opportunities (cleaned, without LLM messages yet)
# Only filtered here; the table itself is written by the LLM cell that follows.
df_campaign_clean = (
    df_campaign_opportunities
    .filter(col("patient_mrn").isNotNull())
    .filter(col("campaign_type").isNotNull())
)

silver_campaign_count = df_campaign_clean.count()
print(f"✓ Silver: campaign_opportunities cleaned ({silver_campaign_count:,} rows)")

print("\n✓ All Silver tables created!")

# COMMAND ----------

# MAGIC %md
# MAGIC ## 5b. Generate LLM Messages for Campaign Opportunities
# MAGIC 
# MAGIC Takes each row's `suggested_prompt` and calls Llama 3.3 70B to generate
# MAGIC a personalized 160-character SMS message. Stores the result in `llm_message`.

# COMMAND ----------

import builtins
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
from openai import OpenAI
from pyspark.sql import functions as F

print("\n" + "="*60)
print("STEP 3b: Generating LLM messages for campaign opportunities...")
print("="*60)

# Resolve the prompt column case-insensitively and remember its real name, so
# the lookup below reads the same column this check found (a column spelled
# e.g. "Suggested_Prompt" would otherwise pass the check but yield no prompts).
prompt_column = next(
    (c for c in df_campaign_clean.columns if c.lower() == "suggested_prompt"),
    None,
)

if prompt_column is None:
    print("⚠ No 'suggested_prompt' column found — skipping LLM generation")
    print("  Writing campaign data without LLM messages...")
    df_campaign_clean.write.format("delta").mode("overwrite").saveAsTable(
        f"{SILVER_PATH}.campaign_opportunities"
    )
else:
    # Get Databricks auth token and workspace URL
    token = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get()
    workspace_url = spark.conf.get("spark.databricks.workspaceUrl")

    # The serving endpoints are called through the OpenAI client, pointed at
    # this workspace's /serving-endpoints URL.
    client = OpenAI(
        api_key=token,
        base_url=f"https://{workspace_url}/serving-endpoints"
    )

    LLM_ENDPOINT = "databricks-meta-llama-3-3-70b-instruct"

    def generate_message(prompt_text):
        """Generate one SMS message from a suggested prompt.

        Args:
            prompt_text: The prompt sent to the model as the user message.

        Returns:
            The model's reply with surrounding double quotes removed and
            truncated to 160 characters, or None if the call (or reading its
            response) raised any exception.
        """
        try:
            response = client.chat.completions.create(
                model=LLM_ENDPOINT,
                messages=[
                    {
                        "role": "system",
                        "content": (
                            "You are a message writer for Akron Children's Hospital. "
                            "Generate a single SMS message (160 characters max). "
                            "Be cheerful and professional. Output ONLY the message text, nothing else."
                        ),
                    },
                    {"role": "user", "content": prompt_text},
                ],
                max_tokens=80,
                temperature=0.7,
            )
            msg = response.choices[0].message.content.strip()
            # Remove quotes if the LLM wraps in quotes
            if msg.startswith('"') and msg.endswith('"'):
                msg = msg[1:-1]
            # Hard cap: the model is asked for 160 characters but may exceed it.
            return msg[:160]
        except Exception as e:
            # Best effort: one failed call must not abort the whole batch.
            print(f"  LLM error: {e}")
            return None

    # Convert to pandas for row-by-row LLM calls
    pdf = df_campaign_clean.toPandas()
    total_rows = len(pdf)
    print(f"  Processing {total_rows} opportunities...")

    # messages[i] holds the result for the i-th row of pdf (by position);
    # rows without a usable prompt stay None.
    messages = [None] * total_rows
    prompt_indices = []

    for position, prompt in enumerate(pdf[prompt_column].tolist()):
        if prompt and pd.notna(prompt) and str(prompt).strip():
            prompt_indices.append((position, str(prompt).strip()))

    print(f"  {len(prompt_indices)} rows have suggested_prompt to process")

    # Parallel generation with ThreadPoolExecutor
    completed = 0
    with ThreadPoolExecutor(max_workers=10) as executor:
        future_to_idx = {
            executor.submit(generate_message, prompt): idx
            for idx, prompt in prompt_indices
        }
        for future in as_completed(future_to_idx):
            idx = future_to_idx[future]
            try:
                messages[idx] = future.result()
            except Exception as e:
                print(f"  Error for row {idx}: {e}")
            completed += 1
            if completed % 100 == 0:
                print(f"  Progress: {completed}/{len(prompt_indices)} messages generated")

    # builtins.sum is used explicitly: the Silver cell's wildcard import from
    # pyspark.sql.functions rebinds the bare name `sum` to the Spark aggregate,
    # which cannot consume a Python generator.
    generated_count = builtins.sum(1 for m in messages if m is not None)
    print(f"✓ Generated {generated_count}/{len(prompt_indices)} LLM messages")

    if generated_count == 0:
        # With no message at all (or no rows) the pandas frame would carry an
        # all-None llm_message column, for which Spark cannot infer a type.
        # Add a typed NULL column to the Spark DataFrame instead.
        print("⚠ No LLM messages were generated — llm_message will be NULL for every row")
        df_campaign_with_messages = df_campaign_clean.withColumn(
            "llm_message", F.lit(None).cast("string")
        )
    else:
        # Convert back to Spark
        pdf["llm_message"] = messages
        df_campaign_with_messages = spark.createDataFrame(pdf)

    # Write to Silver
    df_campaign_with_messages.write.format("delta").mode("overwrite").saveAsTable(
        f"{SILVER_PATH}.campaign_opportunities"
    )
    print(f"✓ Silver: campaign_opportunities updated with LLM messages ({silver_campaign_count:,} rows)")

# COMMAND ----------

# MAGIC %md
# MAGIC ## 6. Create Gold Delta Tables (Analytics)

# COMMAND ----------

# Qualified import so this cell does not depend on names leaked by a wildcard
# import in an earlier cell.
from pyspark.sql import functions as F

print("\n" + "="*60)
print("STEP 4: Creating Gold Delta tables...")
print("="*60)

# Gold: Gap Summary by Type
# One row per (GAP_TYPE, PRIORITY_NAME): number of gaps and number of distinct
# patients, largest gap counts first.
df_gap_summary = (
    df_care_gaps_clean.groupBy("GAP_TYPE", "PRIORITY_NAME")
    .agg(
        F.count("*").alias("TOTAL_GAPS"),
        F.countDistinct("PAT_ID").alias("PATIENTS_AFFECTED"),
    )
    .orderBy("TOTAL_GAPS", ascending=False)
)

df_gap_summary.write.format("delta").mode("overwrite").saveAsTable(f"{GOLD_PATH}.gap_summary")

gold_gap_summary_count = df_gap_summary.count()
print(f"✓ Gold: gap_summary ({gold_gap_summary_count:,} rows)")

# Gold: Provider Dashboard
# GAP_RATE = TOTAL_GAPS / TOTAL_PATIENTS_WITH_GAPS. The division is guarded so
# a zero denominator yields NULL (when() without otherwise()) rather than a
# divide-by-zero error when Spark runs in ANSI mode; with ANSI mode off the
# unguarded division already returned NULL, so results are unchanged there.
patients_with_gaps = F.col("TOTAL_PATIENTS_WITH_GAPS")
df_provider_dashboard = df_provider_metrics.withColumn(
    "GAP_RATE",
    F.when(patients_with_gaps != 0, F.col("TOTAL_GAPS") / patients_with_gaps),
)

df_provider_dashboard.write.format("delta").mode("overwrite").saveAsTable(f"{GOLD_PATH}.provider_dashboard")

gold_provider_dashboard_count = df_provider_dashboard.count()
print(f"✓ Gold: provider_dashboard ({gold_provider_dashboard_count:,} rows)")

print("\n✓ All Gold tables created!")

# COMMAND ----------

# MAGIC %md
# MAGIC ## 7. Summary

# COMMAND ----------

_rule = "=" * 60

print(f"\n{_rule}")
print("ETL COMPLETE - SUMMARY")
print(_rule)
print(f"\nRun Date: {RUN_DATE}")

# Row counts of the files read from the landing zone. The label strings carry
# their own spacing so the printed lines stay exactly as before.
print("\nData Loaded:")
for label, rows in (
    ("  Care Gaps:      ", care_gaps_count),
    ("  Appointments:   ", appointments_count),
    ("  Patient Summary: ", patient_summary_count),
    ("  Provider Metrics: ", provider_metrics_count),
    ("  Campaign Opportunities: ", campaign_opportunities_count),
):
    print(f"{label}{rows:,} rows")

print("\nDelta Tables Created:")

# Bronze tables are listed by name only.
print("\nBronze Layer:")
for table_name in (
    "care_gaps_daily",
    "appointments_daily",
    "patient_summary_daily",
    "provider_metrics_daily",
    "campaign_opportunities_daily",
):
    print(f"  {BRONZE_PATH}.{table_name}")

# Silver and Gold tables are listed together with their row counts.
print("\nSilver Layer:")
for table_name, rows in (
    ("care_gaps_cleaned", silver_care_gaps_count),
    ("patient_360", silver_patient_360_count),
    ("campaign_opportunities", silver_campaign_count),
):
    print(f"  {SILVER_PATH}.{table_name} ({rows:,} rows)")

print("\nGold Layer:")
for table_name, rows in (
    ("gap_summary", gold_gap_summary_count),
    ("provider_dashboard", gold_provider_dashboard_count),
):
    print(f"  {GOLD_PATH}.{table_name} ({rows:,} rows)")

print(f"\n{_rule}")
print("✓ SUCCESS - All data processed!")
print(_rule)

# COMMAND ----------

# MAGIC %md
# MAGIC ## 8. Preview Data (Optional)

# COMMAND ----------

# Uncomment to see sample data

# print("\nSample Care Gaps:")
# display(df_care_gaps_clean.limit(10))

# print("\nSample Patient 360:")
# display(df_patient_360.limit(10))

# print("\nGap Summary:")
# display(df_gap_summary)