In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType, DoubleType, TimestampType, BooleanType, DateType
from pyspark.sql.window import Window
from pyspark.sql.functions import when, col

In [0]:
# Fully qualified name of the silver-layer patients table; later cells read
# from it and create it when it is missing.
target_table = "fhir_data_silver.patients"

# Starting point of the pipeline: the patients table from the bronze schema.
df_patients_bronze = spark.read.table("fhir_data_bronze.patients")

In [0]:
# Build a display name from the name parts, then switch to descriptive column
# names. The concatenation has to run first because it still refers to the
# bronze names (`first`, `middle`, `last`) that are renamed right after it.
#
# NOTE(review): concat_ws skips NULL inputs and appears to return an empty
# string rather than NULL when every part is missing, so the `fullname` null
# check below likely never removes a row -- confirm whether rows with an
# empty name were meant to be dropped.
df_patients_bronze = (
    df_patients_bronze
    .withColumn(
        "fullname",
        F.concat_ws(" ", "prefix", "first", "middle", "last", "suffix"),
    )
    .withColumnRenamed("first", "first_name")
    .withColumnRenamed("last", "last_name")
    .withColumnRenamed("middle", "middle_name")
    .withColumnRenamed("ssn", "social_security_number")
    .withColumnRenamed("drivers", "drivers_license_number")
    .withColumnRenamed("passport", "passport_number")
    .withColumnRenamed("marital", "marital_status")
    # Keep only rows that carry both an identifier and a name.
    .where(F.col("patient_id").isNotNull() & F.col("fullname").isNotNull())
)

In [0]:
# Collapse duplicate rows: number the rows inside each
# (patient_id, timestamp) group and keep only the first one.
#
# NOTE(review): `timestamp` is both a partition column and the sort column,
# so every row in a group has the same timestamp and the descending sort
# cannot pick a "latest" row. Only exact (patient_id, timestamp) duplicates
# are removed, and which duplicate survives is arbitrary. Confirm whether
# partitioning by `patient_id` alone was intended.
window = Window.partitionBy("patient_id", "timestamp").orderBy(F.desc("timestamp"))

df_patients_bronze = (
    df_patients_bronze
    .withColumn("row_num", F.row_number().over(window))
    .where(F.col("row_num") == 1)
    .drop("row_num")  # helper column is not part of the output
)

In [0]:
# Global (un-partitioned) window sorted on a constant. It replaces the earlier
# `window` variable and is not used by the recoding below.
window = Window.orderBy(F.lit(1))

# Expand the single-letter codes into readable labels. Any value that is not
# one of the listed codes falls through `otherwise` and is kept unchanged.
df_patients_bronze = (
    df_patients_bronze
    .withColumn(
        "gender",
        F.when(F.col("gender") == "M", "Male")
        .when(F.col("gender") == "F", "Female")
        .otherwise(F.col("gender")),
    )
    .withColumn(
        "marital_status",
        F.when(F.col("marital_status") == "S", "Single")
        .when(F.col("marital_status") == "M", "Married")
        .when(F.col("marital_status") == "D", "Divorced")
        .otherwise(F.col("marital_status")),
    )
)

In [0]:
# Surrogate key assignment: continue numbering after the largest
# `patient_key` already stored in the target table.
#
# The existence check has to look at the same table the maximum is read from
# (`target_table`). Checking a different, hard-coded name makes the check fail,
# leaves the offset at 0 and restarts the numbering at 1 on every run, which
# collides with keys that are already stored.
if spark.catalog.tableExists(target_table):
    # max() over an empty table yields NULL (None in Python); start from 0 then.
    max_key = spark.table(target_table).agg(F.max("patient_key")).first()[0]
    if max_key is None:
        max_key = 0
else:
    max_key = 0

# The window is defined here so this cell does not depend on a variable left
# behind by an unrelated cell. Sorting on (patient_id, timestamp) instead of a
# constant gives a reproducible numbering; the earlier de-duplication keeps one
# row per (patient_id, timestamp), so the order is total.
# The window has no partitioning, so Spark evaluates it in a single partition.
key_window = Window.orderBy("patient_id", "timestamp")

df_patients_bronze = df_patients_bronze.withColumn(
    "patient_key", F.row_number().over(key_window) + max_key
)

In [0]:
# Column layout of the target patients table, declared as
# (column name, Spark type, nullable) and expanded into StructFields below.
# NOTE(review): `fips` and `zip` are typed as integers; if the source values
# can carry leading zeros those would not survive -- confirm.
target_schema = StructType([
    StructField(column_name, spark_type, is_nullable)
    for column_name, spark_type, is_nullable in [
        # keys
        ("patient_key", IntegerType(), False),
        ("patient_id", StringType(), False),
        # dates and identity documents
        ("date_of_birth", StringType(), True),
        ("date_of_death", StringType(), True),
        ("social_security_number", StringType(), True),
        ("drivers_license_number", StringType(), True),
        ("passport_number", StringType(), True),
        # name parts
        ("prefix", StringType(), True),
        ("first_name", StringType(), True),
        ("middle_name", StringType(), True),
        ("last_name", StringType(), True),
        ("suffix", StringType(), True),
        ("maiden", StringType(), True),
        # demographics
        ("marital_status", StringType(), True),
        ("race", StringType(), True),
        ("ethnicity", StringType(), True),
        ("gender", StringType(), True),
        ("birthplace", StringType(), True),
        # location
        ("address", StringType(), True),
        ("city", StringType(), True),
        ("state", StringType(), True),
        ("county", StringType(), True),
        ("fips", IntegerType(), True),
        ("zip", IntegerType(), True),
        ("lat", DoubleType(), True),
        ("lon", DoubleType(), True),
        # financials
        ("healthcare_expenses", DoubleType(), True),
        ("healthcare_coverage", DoubleType(), True),
        ("income", LongType(), True),
        # derived display name
        ("fullname", StringType(), False),
        # SCD2 columns
        ("effective_start_date", TimestampType(), False),
        ("effective_end_date", TimestampType(), True),
        ("current_flag", StringType(), False),
    ]
])

# Create the target table from `target_schema` on the first run only; an
# existing table is left untouched.
if not spark.catalog.tableExists(target_table):
    spark.catalog.createTable(target_table, schema=target_schema)