In [0]:
# Use Bronze Store DB to retrieve data from rider table
spark.sql("USE DATABASE bronze_store")

In [0]:
# Load data from rider table to dataframe
rider_df = (
spark
    .read
    .table(tableName = "rider")
)

display(rider_df)

rider_id,first_name,last_name,address,birth_dt,account_start_dt,account_end_dt,is_member
1000,Diana,Clark,1200 Alyssa Squares,1989-02-13,2019-04-23,,True
1001,Jennifer,Smith,397 Diana Ferry,1976-08-10,2019-11-01,2020-09-01,True
1002,Karen,Smith,644 Brittany Row Apt. 097,1998-08-10,2022-02-04,,True
1003,Bryan,Roberts,996 Dickerson Turnpike,1999-03-29,2019-08-26,,False
1004,Jesse,Middleton,7009 Nathan Expressway,1969-04-11,2019-09-14,,True
1005,Christine,Rodriguez,224 Washington Mills Apt. 467,1974-08-27,2020-03-24,,False
1006,Alicia,Taylor,1137 Angela Locks,2004-01-30,2020-11-27,2021-12-01,True
1007,Benjamin,Fernandez,979 Phillips Ways,1988-01-11,2016-12-11,,False
1008,John,Crawford,7691 Evans Court,1987-02-21,2021-03-28,2021-07-01,True
1009,Victoria,Ritter,9922 Jim Crest Apt. 319,1981-02-07,2020-06-12,2021-11-01,True


In [0]:
# Import required functions and data types
from pyspark.sql.functions import col, months_between, to_date, lit, round
from pyspark.sql.types import IntegerType, BooleanType

# Convert rider_id to integer; birth_dt, account_start_dt, account_end_dt to date; calculate rider_age_at_acc_start time delta (in years) between birth_dt and account_start_dt 
rider_df = (
    rider_df
        .select(
              col("rider_id").cast(IntegerType()),
              col("first_name"),
              col("last_name"),
              col("address"),
              to_date(col("birth_dt")).alias("birth_dt"),
              to_date(col("account_start_dt")).alias("account_start_dt"),
              to_date(col("account_end_dt")).alias("account_end_dt"),
              col("is_member").cast(BooleanType()),
              round(
                     months_between(
                                     to_date(col("account_start_dt")),
                                     to_date(col("birth_dt"))
                     )
                      / lit(12)
                   , 0)
                .cast(IntegerType())
                .alias("rider_age_at_acc_start")
            )
)

rider_df.printSchema()

In [0]:
display(rider_df)

rider_id,first_name,last_name,address,birth_dt,account_start_dt,account_end_dt,is_member,rider_age_at_acc_start
1000,Diana,Clark,1200 Alyssa Squares,1989-02-13,2019-04-23,,True,30
1001,Jennifer,Smith,397 Diana Ferry,1976-08-10,2019-11-01,2020-09-01,True,43
1002,Karen,Smith,644 Brittany Row Apt. 097,1998-08-10,2022-02-04,,True,23
1003,Bryan,Roberts,996 Dickerson Turnpike,1999-03-29,2019-08-26,,False,20
1004,Jesse,Middleton,7009 Nathan Expressway,1969-04-11,2019-09-14,,True,50
1005,Christine,Rodriguez,224 Washington Mills Apt. 467,1974-08-27,2020-03-24,,False,46
1006,Alicia,Taylor,1137 Angela Locks,2004-01-30,2020-11-27,2021-12-01,True,17
1007,Benjamin,Fernandez,979 Phillips Ways,1988-01-11,2016-12-11,,False,29
1008,John,Crawford,7691 Evans Court,1987-02-21,2021-03-28,2021-07-01,True,34
1009,Victoria,Ritter,9922 Jim Crest Apt. 319,1981-02-07,2020-06-12,2021-11-01,True,39


In [0]:
# Use Gold Store to load dimension tables and fact tables that are part of the designed star schema
spark.sql("USE DATABASE Gold_Store")

In [0]:
# Load dataframe content to dimension table inside the Gold Store
(
rider_df
    .write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("dim_rider")
)