# Load merge data from Data Lake

Get raw data from Data Lake

In [10]:
# Name of the source object; it is interpolated into the Data Lake paths used below.
objectName = "Patient"

StatementMeta(SmallSparkpool, 6, 7, Finished, Available)

In [22]:

# Reader settings for the raw export: semicolon-delimited CSV with a header row,
# double-quote used both as quote and as escape character, and multiline records enabled.
# NOTE(review): in an abfss URI the part after "@" is normally the storage account name,
# not the filesystem - confirm the placeholder is filled in accordingly.
csv_read_options = {
    "delimiter": ";",
    "multiline": True,
    "quote": "\"",
    "escape": "\"",
    "header": True,
    'path': f'abfss://<your own filesystem>@<your own filesystem>.dfs.core.windows.net/{objectName}',
}

# The source location is supplied through the 'path' option, so load() takes no argument.
df = spark.read.format('csv').options(**csv_read_options).load()

StatementMeta(SmallSparkpool, 6, 19, Finished, Available)

# Handle schema
Remove the trailing part of each header name (everything from the first space onward) from the dataframe

In [23]:
from pyspark.sql.functions import col

# Keep only the part of each header before the first space.
# split(' ', 1)[0] returns the name unchanged when it contains no space, whereas
# c.index(' ') raised ValueError for such columns.
# toDF renames positionally, so the raw header text is never parsed as a column
# expression (headers containing dots or backticks would break col(c)).
df = df.toDF(*[c.split(' ', 1)[0] for c in df.columns])


StatementMeta(SmallSparkpool, 6, 20, Finished, Available)

Remove columns that are not used - if they exist

In [24]:
# Names of the columns that are dropped from the dataframe when present.
# A plain Python list is enough here: building a Spark DataFrame and calling collect()
# only to iterate over ten literal strings started a Spark job for no benefit.
ColumnnamesToDelete = [
    "booking_description",
    "last_change_date",
    "color_code_fk",
    "username",
    "password",
    "phone",
    "email",
    "sms",
    "hold_email",
    "team_sms",
]

# Keep only the names that actually occur in the dataframe (exact, case-sensitive
# match against df.columns, as before) and remove them in a single drop() call
# instead of re-planning the dataframe once per column.
columns_present = [name for name in ColumnnamesToDelete if name in df.columns]
if columns_present:
    df = df.drop(*columns_present)

StatementMeta(SmallSparkpool, 6, 21, Finished, Available)

Change column datatypes if they exist

In [26]:
from pyspark.sql.types import StringType,BooleanType,DateType

# Cast the "date" column to DateType, but only when the dataframe actually has it.
date_column = "date"
if date_column in df.columns:
    date_as_date = df[date_column].cast(DateType())
    df = df.withColumn(date_column, date_as_date)

StatementMeta(SmallSparkpool, 6, 23, Finished, Available)

Remove characters not used in data

In [27]:
from pyspark.sql.functions import regexp_replace

# Columns to clean, and the (regex pattern, replacement) pairs applied to each of them.
# These are small static lookups, so plain Python sequences are used: the previous
# Spark DataFrames had to be collect()ed back to the driver, and the replacement rules
# were collected again for every column even though they never change inside the loop.
SearchInColumns = ["start_time", "end_time"]

CharacterChanges = [
    ("\\+", ""),  # regex matching a literal "+", replaced by the empty string
]

for column_name in SearchInColumns:
    # Skip columns that the loaded data does not contain.
    if column_name not in df.columns:
        continue
    for pattern, replacement in CharacterChanges:
        df = df.withColumn(column_name, regexp_replace(col(column_name), pattern, replacement))

#display(df.limit(10))

StatementMeta(SmallSparkpool, 6, 24, Finished, Available)

# Write data to Bronze area

Write the new data to the Bronze area of the delta lake<br>
Notice the `append` mode in the write statement - it appends the new data to the data that already exists in the delta lake

In [None]:
# Target folder of the Bronze Delta data for this object.
bronze_loc = f'abfss://<your own filesystem>@<your own filesystem>.dfs.core.windows.net/Bronze/{objectName}'

# Append (rather than overwrite): every run adds its rows to whatever is already stored
# at bronze_loc, so re-running this cell writes the same rows again.
(
    df.write
    .format("delta")
    .mode("append")
    .save(bronze_loc)
)


# Write data to Silver area

Load to Silver data lake with correct partition approach

In [123]:
# silver_loc = f'abfss://<your own filesystem>@<your own filesystem>.dfs.core.windows.net/Silver/{objectName}'

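# NOTE: Bronze is written in Delta format (see the Bronze write above). Reading the folder with
# spark.read.parquet bypasses the Delta transaction log and can pick up data files that are no
# longer current - prefer spark.read.format("delta").load(bronze_loc) when enabling this cell.
# parquetFile = spark.read.parquet(bronze_loc)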

# parquetFile.repartition(1).write.parquet(silver_loc)


StatementMeta(SmallSparkpool, 4, 140, Finished, Available)

# Things and notes to keep for later

In [57]:
## managed table - hard link between objects
## df2.write.mode("overwrite").format("delta").saveAsTable("Bookings")

## unmanaged table - soft link between objects
## df2.write.mode("overwrite").format("delta").option("path",save_loc).saveAsTable("Bookings")

## statement = 'select * from Bookings'

## spark.sql(statement).createOrReplaceTempView("Temp_Bookings")

## df2 = current_file.join(df, [df.journal_pk != current_file.journal_pk], how = 'inner' )


StatementMeta(SmallSparkpool, 1, 55, Finished, Available)