## Pipeline: Bronze to Silver

## Data Source

- **Catalog Location:**  `workspace.hospital_bronze.patients`
- **Format:** Delta Lake Table


## Destination

- **Catalog Location:** `workspace.hospital_silver.patients`
- **Format:** Delta Lake Table 

In [0]:
# Table name handled by this notebook. It is interpolated into the Bronze source
# table name, the Silver target table name, and the streaming checkpoint path.
entity = "patients"

In [0]:
# Catalog and schema names used to build table identifiers of the form
# <catalog>.<schema>.<table>.
catalog_name = "workspace"
schema_silver = "hospital_silver"
schema_bronze = "hospital_bronze"
# NOTE(review): schema_gold is not referenced in the visible cells of this notebook.
schema_gold = "hospital_gold"

# S3 path of the batch data source.
# NOTE(review): not referenced in the visible cells of this notebook.
data_source = "s3://buckethospitaldata/data_batching/"

# Checkpoint directory handed to the streaming write as `checkpointLocation`.
# The path ends with the entity name, so each entity gets its own directory.
checkpoint_location = f"s3://buckethospitaldata/pipeline_checkpoints/data_streaming/_checkpoints/silver/{entity}"

## Read Data from Bronze Layer

In [0]:
# Open the Bronze table as a streaming DataFrame; the final cell of this
# notebook consumes it through `df.writeStream`.
df = spark.readStream.table(f"{catalog_name}.{schema_bronze}.{entity}")

## Convert Data format: Gender

In [0]:
# from pyspark.sql.functions import col, to_date, when

# df = df.withColumn("Gender", when(col("Gender") == "Male", 1)
#                                         .when(col("Gender") == "Female", 0)
#                                         .otherwise(None))

## Handling Missing Data

In [0]:
from pyspark.sql.functions import col, count, when, isnan


def count_missing_values(df, sort=True, as_pandas=True):
    """
    Count missing values per column of a PySpark DataFrame.

    A value counts as missing when it is null; for ``float`` / ``double``
    columns NaN is counted as missing as well. The total row count and every
    per-column count are computed in one aggregation, so the data is scanned
    once and the percentages refer to the same rows as the counts.

    Parameters:
        df (DataFrame): The PySpark DataFrame to check.
        sort (bool): Whether to sort the result by missing count (descending).
        as_pandas (bool): Whether to return the result as a pandas DataFrame.

    Returns:
        pandas.DataFrame or list[tuple]: With ``as_pandas=True`` a DataFrame
        with the columns ``column``, ``missing_count`` and ``missing_percent``;
        otherwise a list of ``(column, missing_count, missing_percent)``
        tuples. Percentages are rounded to two decimals and are 0 when the
        DataFrame has no rows. Before sorting, float/double columns are listed
        first, followed by the remaining columns.
    """
    float_cols = [name for name, dtype in df.dtypes if dtype in ("float", "double")]
    nan_capable = set(float_cols)
    ordered_cols = float_cols + [name for name in df.columns if name not in nan_capable]

    def _missing_count(name):
        # Backtick-quote the name so that dots (or other special characters)
        # are read as part of the column name, not as struct-field access.
        column = col("`" + name.replace("`", "``") + "`")
        is_missing = column.isNull()
        if name in nan_capable:
            # isnan is only applied to float/double columns.
            is_missing = is_missing | isnan(column)
        return count(when(is_missing, name)).alias(name)

    # Single pass over the data. Values are read back by position (not by
    # name) so the total can never clash with a real column name:
    # index 0 is the row count, the rest line up with ordered_cols.
    aggregated = df.select(
        count("*"), *[_missing_count(name) for name in ordered_cols]
    ).collect()[0]
    total_rows = aggregated[0]
    missing_counts = aggregated[1:]

    results = []
    for col_name, missing_count in zip(ordered_cols, missing_counts):
        # Guard against division by zero on an empty DataFrame.
        percent = (missing_count / total_rows * 100) if total_rows > 0 else 0
        results.append((col_name, missing_count, round(percent, 2)))

    if as_pandas:
        # Imported lazily so pandas is only needed when a pandas result is requested.
        import pandas as pd

        result_df = pd.DataFrame(results, columns=['column', 'missing_count', 'missing_percent'])
        if sort:
            result_df = result_df.sort_values('missing_count', ascending=False).reset_index(drop=True)
        return result_df

    if sort:
        results = sorted(results, key=lambda x: x[1], reverse=True)
    return results


# df_batch = spark.read.table(f"{catalog_name}.{schema_bronze}.{entity}")

# missing_counts = count_missing_values(df_batch)
# print(df_batch.count())
# print(missing_counts)

## Write Data to Silver Layer

In [0]:
# Append the streamed Bronze rows to the Silver Delta table.
# The target is fully qualified with the catalog, matching the Bronze read, so
# the write does not depend on whichever catalog is current in the session.
# NOTE(review): `trigger(once=True)` is marked deprecated in newer Spark
# releases in favour of `availableNow=True`; consider switching once the
# runtime version is confirmed.
(
    df.writeStream
    .format("delta")
    .option("checkpointLocation", checkpoint_location)
    .option("mergeSchema", "true")  # let new source columns be added to the target schema
    .outputMode("append")
    .trigger(once=True)  # process what is available, then stop
    .toTable(f"{catalog_name}.{schema_silver}.{entity}")
)