In [0]:
# Make bronze_store the session's current database so that the unqualified
# "payment" table read in the next cell resolves against it.
spark.sql("USE DATABASE bronze_store")

In [0]:
# Read the "payment" table (resolved against the current database) into a
# DataFrame and preview its rows.
payment_df = spark.read.table("payment")

display(payment_df)

payment_id,date,amount,rider_id
1,2019-05-01,9.0,1000
2,2019-06-01,9.0,1000
3,2019-07-01,9.0,1000
4,2019-08-01,9.0,1000
5,2019-09-01,9.0,1000
6,2019-10-01,9.0,1000
7,2019-11-01,9.0,1000
8,2019-12-01,9.0,1000
9,2020-01-01,9.0,1000
10,2020-02-01,9.0,1000


In [0]:
# Switch the session's current database to gold_store; the unqualified
# dim_rider / dim_date reads in the following cells resolve against it.
spark.sql("USE DATABASE gold_store")

In [0]:
# Read the dim_rider dimension into a DataFrame; it is joined against the
# payment rows later in the notebook.
rider_df = spark.read.table("dim_rider")

display(rider_df)

rider_id,first_name,last_name,address,birth_dt,account_start_dt,account_end_dt,is_member,rider_age_at_acc_start
1000,Diana,Clark,1200 Alyssa Squares,1989-02-13,2019-04-23,,True,30
1001,Jennifer,Smith,397 Diana Ferry,1976-08-10,2019-11-01,2020-09-01,True,43
1002,Karen,Smith,644 Brittany Row Apt. 097,1998-08-10,2022-02-04,,True,23
1003,Bryan,Roberts,996 Dickerson Turnpike,1999-03-29,2019-08-26,,False,20
1004,Jesse,Middleton,7009 Nathan Expressway,1969-04-11,2019-09-14,,True,50
1005,Christine,Rodriguez,224 Washington Mills Apt. 467,1974-08-27,2020-03-24,,False,46
1006,Alicia,Taylor,1137 Angela Locks,2004-01-30,2020-11-27,2021-12-01,True,17
1007,Benjamin,Fernandez,979 Phillips Ways,1988-01-11,2016-12-11,,False,29
1008,John,Crawford,7691 Evans Court,1987-02-21,2021-03-28,2021-07-01,True,34
1009,Victoria,Ritter,9922 Jim Crest Apt. 319,1981-02-07,2020-06-12,2021-11-01,True,39


In [0]:
# Read the dim_date dimension into a DataFrame; it is joined against the
# payment rows later in the notebook.
date_df = spark.read.table("dim_date")

display(date_df)

date_id,date_value,month,month_name,year,quarter,day_of_week
20130201,2013-02-01,2,February,2013,1,Fri
20130202,2013-02-02,2,February,2013,1,Sat
20130203,2013-02-03,2,February,2013,1,Sun
20130204,2013-02-04,2,February,2013,1,Mon
20130205,2013-02-05,2,February,2013,1,Tue
20130206,2013-02-06,2,February,2013,1,Wed
20130207,2013-02-07,2,February,2013,1,Thu
20130208,2013-02-08,2,February,2013,1,Fri
20130209,2013-02-09,2,February,2013,1,Sat
20130210,2013-02-10,2,February,2013,1,Sun


In [0]:
# Import required functions and data types
from pyspark.sql.functions import col, regexp_replace
from pyspark.sql.types import IntegerType, FloatType

# Shape the payment rows for the fact table:
#   * payment_id and rider_id are cast to integer, amount to float;
#   * the "date" column has its dashes stripped and is cast to integer,
#     producing a yyyyMMdd-style "date_id" key.
# Referential integrity is then enforced with inner joins against the keys of
# dim_rider (rider_id) and dim_date (date_id): payments without a matching row
# in either dimension are dropped.
#
# NOTE(review): this cell rebinds payment_df, and the projection drops the
# "date" column, so re-running this cell without first re-running the load
# cell will fail. NOTE(review): FloatType is single precision - confirm that
# is acceptable for a currency amount.

# Only the key column of each dimension is needed for the integrity check.
rider_keys = rider_df.select("rider_id")
date_keys = date_df.select("date_id")

typed_payment_df = payment_df.select(
    col("payment_id").cast(IntegerType()),
    regexp_replace(col("date"), "-", "").cast(IntegerType()).alias("date_id"),
    col("amount").cast(FloatType()),
    col("rider_id").cast(IntegerType()),
)

payment_df = (
    typed_payment_df
    .join(rider_keys, on="rider_id", how="inner")
    .join(date_keys, on="date_id", how="inner")
)

payment_df.printSchema()

In [0]:
# Preview the typed, dimension-checked payment rows before they are written out.
display(payment_df)

date_id,rider_id,payment_id,amount
20190501,1000,1,9.0
20190601,1000,2,9.0
20190701,1000,3,9.0
20190801,1000,4,9.0
20190901,1000,5,9.0
20191001,1000,6,9.0
20191101,1000,7,9.0
20191201,1000,8,9.0
20200101,1000,9,9.0
20200201,1000,10,9.0


In [0]:
# Persist the payment rows as the Delta fact table in the Gold Store,
# reordering the columns to payment_id, date_id, rider_id, amount.
#
# The target is schema-qualified on purpose: mode("overwrite") replaces the
# existing table, so the destination must not depend on whichever
# "USE DATABASE" statement happened to run last in the session.
(
    payment_df
    .select("payment_id", "date_id", "rider_id", "amount")
    .write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("gold_store.fact_payment")
)