```markdown
To run this Jupyter notebook, use Python 3.13 or later. The recommended setup is a virtual environment or a conda environment with the necessary packages installed, as specified in the requirements.txt file. Ensure that Jupyter Notebook is installed so that you can execute the cells interactively.
```

In [31]:
# Execute the companion notebook reuse.ipynb from within this kernel.
# NOTE(review): HealthData, which a later cell uses without defining it here,
# is presumably provided by that notebook — confirm.
%run reuse.ipynb

Note: you may need to restart the kernel to use updated packages.


In [32]:
import ipywidgets as widgets
from IPython.display import display

# Environment picker: one of dev / uat / prod, starting on "dev".
env_dropdown = widgets.Dropdown(
    options=["dev", "uat", "prod"],
    value="dev",
    description="Environment:",
)

# Free-text field for the output directory name, pre-filled with "testdata".
outputdirectory = widgets.Text(value="testdata")

# Render both controls beneath this cell, dropdown first.
display(env_dropdown, outputdirectory)


Dropdown(description='Environment:', options=('dev', 'uat', 'prod'), value='dev')

Text(value='testdata')

In [33]:
# Capture the current widget selections as plain values for the cells below.
selected_env, outputdir = env_dropdown.value, outputdirectory.value

# HealthData is not defined in this notebook's own cells (hence the type-ignore).
# NOTE(review): presumably it comes from reuse.ipynb and generate_data() produces
# the parquet file that the next cell reads — confirm.
healthData = HealthData(selected_env, outputdir)  # type: ignore
healthData.generate_data()

In [34]:
import pandas as pd

# Load the parquet file for the selected environment from the chosen output
# directory into a pandas DataFrame.
healthdatadf = pd.read_parquet(
    path=f"./{outputdir}/health_data_{selected_env}.parquet"
)

In [35]:
import pyspark
from pyspark.sql import SparkSession

# Explicit imports instead of `from delta import *`: these are the two names the
# wildcard exposed, so later cells that rely on either keep working while the
# origin of each name stays visible.
from delta import DeltaTable, configure_spark_with_delta_pip  # noqa: F401

# Session builder with the Delta SQL extension and Delta catalog enabled.
builder = (
    SparkSession.builder.appName("DeltaTableCreation")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# configure_spark_with_delta_pip wires the pip-installed Delta Lake package into
# the builder, so no manual `--packages io.delta:...` launch flag is required.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

# Enable Arrow for pandas <-> Spark conversions (createDataFrame / toPandas).
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
# Raise the limit on how many fields Spark includes in debug string output.
spark.conf.set("spark.sql.debug.maxToStringFields", 1000)


In [36]:
# pandas route: keep only the rows whose BloodPressure is above 120.
filtered_healthdatadf = healthdatadf.loc[healthdatadf["BloodPressure"].gt(120)]
print(filtered_healthdatadf)

# Spark SQL route: expose the same pandas frame as a temp view, apply the same
# threshold in SQL, and bring the result back as a pandas DataFrame.
spark.createDataFrame(healthdatadf).createOrReplaceTempView("sparkhealthdatadf")
filtered_healthdatadf_sql = (
    spark.sql("SELECT * FROM sparkhealthdatadf WHERE BloodPressure > 120")
    .toPandas()
)
print(filtered_healthdatadf_sql)

    PatientID  Age  Height  Weight  BloodPressure
0           1   64     161      95            161
2           3   73     156      66            171
5           6   79     153      71            128
7           8   59     186      75            147
14         15   26     165      56            173
15         16   44     163      71            164
17         18   32     198      63            149
19         20   21     155      74            124
20         21   58     191      65            146
21         22   59     185      91            171
22         23   43     150      68            165
25         26   37     180      61            155
29         30   28     184      51            150
30         31   29     198      81            151
32         33   71     153      74            172
33         34   36     184      74            123
34         35   71     192      53            163
35         36   25     163      68            129
36         37   35     198      97            121


In [37]:
# Create a Spark DataFrame from the healthdatadf DataFrame
sparkdf = spark.createDataFrame(healthdatadf)

# Write the DataFrame as a Delta table
sparkdf.write.format("delta").mode("overwrite").save(f"./{outputdir}/health_data_{selected_env}")

24/11/30 08:55:39 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
24/11/30 08:55:39 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 84.44% for 9 writers
24/11/30 08:55:39 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 76.00% for 10 writers
24/11/30 08:55:39 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 69.09% for 11 writers
24/11/30 08:55:39 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 63.33% for 12 writers
24/11/30 08:55:39 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 69.09% for 11 writers
24/11/30 08:55:39 WARN MemoryManager: Total allocation exceeds 95.

In [38]:
# Read the Delta table
delta_table = spark.read.format("delta").load(f"./{outputdir}/health_data_{selected_env}")

filtered_data = delta_table.filter(delta_table.BloodPressure > 120)
filtered_data.show()

+---------+---+------+------+-------------+
|PatientID|Age|Height|Weight|BloodPressure|
+---------+---+------+------+-------------+
|        1| 64|   161|    95|          161|
|        3| 73|   156|    66|          171|
|        6| 79|   153|    71|          128|
|        8| 59|   186|    75|          147|
|       84| 61|   190|    71|          162|
|       85| 77|   158|    83|          148|
|       87| 58|   181|    84|          179|
|       88| 75|   158|    84|          163|
|       90| 66|   152|    83|          152|
|       91| 38|   153|    55|          141|
|       34| 36|   184|    74|          123|
|       35| 71|   192|    53|          163|
|       36| 25|   163|    68|          129|
|       37| 35|   198|    97|          121|
|       38| 67|   189|    53|          173|
|       39| 20|   171|    92|          126|
|       41| 55|   150|    88|          153|
|       51| 21|   196|    60|          143|
|       54| 52|   167|    78|          174|
|       56| 30|   164|    90|   

                                                                                

In [41]:
# Create a temporary view
delta_table.createOrReplaceTempView(f"health_data")

# Use Spark SQL to filter the data
filtered_data_sql = spark.sql("SELECT * FROM health_data WHERE BloodPressure > 120")
filtered_data_sql.show()

+---------+---+------+------+-------------+
|PatientID|Age|Height|Weight|BloodPressure|
+---------+---+------+------+-------------+
|        1| 64|   161|    95|          161|
|        3| 73|   156|    66|          171|
|        6| 79|   153|    71|          128|
|        8| 59|   186|    75|          147|
|       84| 61|   190|    71|          162|
|       85| 77|   158|    83|          148|
|       87| 58|   181|    84|          179|
|       88| 75|   158|    84|          163|
|       90| 66|   152|    83|          152|
|       91| 38|   153|    55|          141|
|       34| 36|   184|    74|          123|
|       35| 71|   192|    53|          163|
|       36| 25|   163|    68|          129|
|       37| 35|   198|    97|          121|
|       38| 67|   189|    53|          173|
|       39| 20|   171|    92|          126|
|       41| 55|   150|    88|          153|
|       51| 21|   196|    60|          143|
|       54| 52|   167|    78|          174|
|       56| 30|   164|    90|   