In [0]:
%run ./nb_utility_functions

In [0]:

# Make dev_edh.dummy the session's current catalog and schema, so the
# unqualified table names used later in this notebook resolve there.
# NOTE(review): the catalog name is hard-coded to what looks like a dev
# environment — confirm this is intended before promoting the notebook.
spark.sql('USE CATALOG dev_edh')
spark.sql('USE SCHEMA dummy')

In [0]:
# %sql
# select current_catalog() , current_database()

In [0]:
%sql
--select * from sample_lab_data where test_name is null

In [0]:
# Lab_name = dbutils.jobs.taskValues.get("silver", "Lab_name",debugValue="unknown")
# print(Lab_name)

In [0]:
# Lab to process: read from the "Lab_name" task value published by the
# "get_Lab_Name" job task. `debugValue` is the fallback returned when the
# notebook is run outside of a job (e.g. interactively).
lab = dbutils.jobs.taskValues.get(
    taskKey="get_Lab_Name",
    key="Lab_name",
    debugValue="Lab A",
)

In [0]:

from pyspark.sql.functions import col, date_format, to_timestamp
from pyspark.sql.utils import AnalysisException


# Columns that must all be populated for a record to be accepted.
# Shared by the quarantine check and the null filter so the two cannot drift.
REQUIRED_COLUMNS = [
    "member_id",
    "test_source",
    "test_name",
    "patient_city",
    "patient_state",
    "test_date",
    "patient_phone",
    "created_date",
]

# Only the rows belonging to the lab selected for this run.
df = spark.table('sample_lab_data').where(col("test_source") == lab)

try:
    # Rows with a null in any required column
    # (checking_null presumably comes from nb_utility_functions — verify).
    null_df = checking_null(df, REQUIRED_COLUMNS)

    # Save records with null values to the 'quarantine' table.
    # NOTE(review): the table is only overwritten when nulls exist, so after a
    # clean run it still holds the previous run's rows — confirm this is intended.
    if null_df.count() > 0:
        null_df.write.mode("overwrite").saveAsTable("lab_data_quarantine")

    cleaned_df = (
        # Keep only rows where every required column is populated.
        remove_null(df, REQUIRED_COLUMNS)
        # Normalise test_date to a 'yyyy-MM-dd' string (date_format returns a
        # string column, not a date).
        # NOTE(review): if test_date arrives as an 'M/d/yyyy' string like
        # created_date, it needs an explicit to_date/to_timestamp parse first — confirm.
        .withColumn("test_date", date_format(col("test_date"), "yyyy-MM-dd"))
        # created_date is parsed from 'M/d/yyyy H:mm' text, then re-rendered as
        # a 'yyyy-MM-dd HH:mm:ss' string.
        .withColumn(
            "created_date",
            date_format(
                to_timestamp(col("created_date"), "M/d/yyyy H:mm"),
                "yyyy-MM-dd HH:mm:ss",
            ),
        )
        # Drop exact duplicate records.
        .dropDuplicates()
    )

except AnalysisException as e:
    print(f"An error occurred: {e}")
    # Re-raise so the run stops here: swallowing the error would leave
    # `cleaned_df` undefined (or stale from an earlier interactive run) for
    # the cells that build and write the silver table.
    raise

In [0]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType


# Projection for the silver layer: every cleaned column is carried through
# unchanged except the phone number, which is replaced by its masked form
# (mask_phone_number_udf is presumably provided by nb_utility_functions — verify).
silver_columns = [
    "member_id",
    "test_source",
    "test_name",
    "patient_city",
    "patient_state",
    "test_date",
    mask_phone_number_udf("patient_phone").alias("masked_phone_number"),
    "created_date",
]

silver_df = cleaned_df.select(*silver_columns)

display(silver_df)

In [0]:
# Persist the masked, cleaned records as the silver table, replacing its
# current contents.
# NOTE(review): silver_df only holds the rows for the current `lab`, so an
# overwrite discards any rows previously written for other labs — confirm intended.
(
    silver_df.write
    .mode("overwrite")
    .saveAsTable("lab_data_silver")
)

In [0]:
# # Import necessary functions
# from pyspark.sql.functions import col, array
# from functools import reduce, lru_cache

# # Create an example DataFrame
# data = [
#     (1, "source1", "test1", "city1", "state1", "2025-05-24", "1234567890", "2025-05-24"),
#     (2, "source2", "test2", "city2", "state2", "2025-05-24", None, "2025-05-24"),
#     (3, None, "test3", "city3", "state3", "2025-05-24", "0987654321", "2025-05-24")
# ]
# columns = ["member_id", "test_source", "test_name", "patient_city", "patient_state", "test_date", "patient_phone", "created_date"]
# df = spark.createDataFrame(data, columns)

# # List of columns to check for non-null values
# columns_to_check = ["member_id", "test_source", "test_name", "patient_city", "patient_state", "test_date", "patient_phone", "created_date"]

# # Create a list of conditions where each column in 'columns_to_check' is checked for non-null values
# conditions = [col(c).isNotNull() for c in columns_to_check]

# print("Condition \n",conditions)

# # Combine all conditions using a logical AND operation
# combined_condition = reduce(lambda a, b: a & b, conditions)


# print("Combined Condition \n",combined_condition)

# # Filter the DataFrame using the combined condition
# filtered_df = df.filter(combined_condition)

# # Display the filtered DataFrame
# display(filtered_df)

In [0]:
%sql
-- Preview the contents of the silver table written earlier in this notebook.
SELECT * FROM lab_data_silver