# View: Revenue over time

## Data Source
- **Visits:** `workspace.hospital_silver.visits`

## Details: 
- Location: `workspace.hospital_gold.view_revenue_over_time`
- Description: Total revenue and patient count per day, with Month, Quarter and Year columns for breaking revenue down over time


In [0]:
# Name under which the result is saved in the gold schema; it is also the
# last path segment of the checkpoint location built below.
view_name = "view_revenue_over_time"

# Databricks catalog and the schema used for each layer.
catalog_name = "workspace"
schema_bronze = "hospital_bronze"
schema_silver = "hospital_silver"
schema_gold = "hospital_gold"

# S3 prefix for view data.
data_source = "s3://buckethospitaldata/view/"

# Checkpoint directory for this view, kept under the S3 prefix above.
# NOTE(review): described as "for streaming" in the original comment; none of
# the cells shown here reference it — confirm whether it is still needed.
checkpoint_location = data_source + "_checkpoints/" + view_name

## Read data from Silver Layer

In [0]:
# Batch-read the "visits" table from the silver schema
# (<catalog>.<silver schema>.visits).
silver_visits_table = f"{catalog_name}.{schema_silver}.visits"
df_visits = spark.read.table(silver_visits_table)

## Aggregate data by Date

In [0]:
from pyspark.sql.functions import col

# One row per visit date: summed Revenue_per_visit and a count of Patient_ID
# values, ordered by date.
daily_totals = (
    df_visits
    .groupBy("Date_of_Visit")
    .agg({"Revenue_per_visit": "sum", "Patient_ID": "count"})
    .sort("Date_of_Visit")
)

# The dict form of agg() names its outputs "sum(...)" / "count(...)";
# rename them (and the grouping key) to the column names used downstream.
df_group = daily_totals.select(
    col("Date_of_Visit").alias("Date"),
    col("sum(Revenue_per_visit)").alias("Revenue_per_day"),
    col("count(Patient_ID)").alias("Total_patient"),
)

## Split Date into Year, Month, Quarter and Month-Year

In [0]:
# import datetime
# # from pyspark.sql import functions as sf
# from pyspark.sql.functions import concat, col, lit, extract, typeof, monthname
# df_group = df_group.select(
#     '*',
#     extract(lit('YEAR'), 'Date').alias('year'),
#     extract(lit('month'), 'Date').alias('month'),
#     extract(lit('WEEK'), 'Date').alias('week'),
#     extract(lit('D'), df_group.Date).alias('day'),
#     extract(lit('M'), df_group.Date).alias('minute'),
#     extract(lit('S'), df_group.Date).alias('second')
# )
# df_group = df_group.select("*", typeof('Date'), monthname('Date'))
# df_group = df_group.withColumn("Month_Year", concat(col("monthname(Date)"), lit(","), col("year")))
# df_group = df_group.drop('monthname(Date)')
# df_group = df_group.drop('typeof(Date)')

# display(df_group)

In [0]:
from pyspark.sql.functions import (
    col,
    concat,
    concat_ws,
    lit,
    month,
    monthname,
    quarter,
    year,
)

# Calendar attributes derived from the daily "Date" column.
date_year = year("Date")
quarter_label = concat(lit("Q"), quarter("Date"))  # "Q" + quarter number
year_in_parens = concat(lit("("), date_year.cast("string"), lit(")"))
month_year_label = concat_ws(" ", monthname("Date"), year_in_parens)  # "<month name> (<year>)"

# Append the derived columns to the daily aggregate in a single pass.
view_revenue_over_time = df_group.withColumns(
    {
        "Year": date_year,
        "Month": month("Date"),
        "Quarter": quarter_label,
        "Month_Year": month_year_label,
    }
)

In [0]:
# from pyspark.sql.functions import countDistinct,sum,count
# revenue_over_time = df_group.groupBy("Month_Year").agg(sum("Revenue_per_day").alias("Revenue_per_day"),count("Total_Patient").alias("Total_Patient")).sort("Month_Year")
# # display(revenue_over_time)

## Write the view as a Delta table in the Gold Layer

In [0]:
# Fully qualified target: <catalog>.<gold schema>.<view name>.
gold_table = f"{catalog_name}.{schema_gold}.{view_name}"

# Overwrite mode replaces the table contents on each run; the
# overwriteSchema option lets the stored schema be replaced as well.
delta_writer = (
    view_revenue_over_time.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
delta_writer.saveAsTable(gold_table)