# Task: Extract data from the raw CSV files into Bronze

In [0]:
# List every file in the raw source directory
source_files = dbutils.fs.ls("/FileStore/tables")
display(source_files)

path,name,size,modificationTime
dbfs:/FileStore/tables/payments.csv,payments.csv,57666115,1724313878000
dbfs:/FileStore/tables/riders.csv,riders.csv,5594949,1724313876000
dbfs:/FileStore/tables/stations.csv,stations.csv,49552,1724313876000
dbfs:/FileStore/tables/trips.csv,trips.csv,440125504,1724313887000


In [0]:
# Base path for the Bronze layer; each DataFrame below is saved in Delta
# format to its own sub-directory of this path.
delta_path = "/bronze"

In [0]:
# Load payments.csv into a DataFrame. With header=false the first line is read
# as data, so the column names are assigned positionally via toDF().
df_payments = (
    spark.read
    .option("header", "false")
    .option("sep", ",")
    .csv("/FileStore/tables/payments.csv")
    .toDF("payment_id", "date", "amount", "ride_id")
)
df_payments.printSchema()

root
 |-- payment_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- ride_id: string (nullable = true)



In [0]:
# Preview the first three payment rows as a quick sanity check
payments_preview = df_payments.limit(3)
display(payments_preview)

payment_id,date,amount,ride_id
1,2019-05-01,9.0,1000
2,2019-06-01,9.0,1000
3,2019-07-01,9.0,1000


In [0]:
# Save the payments DataFrame in Delta format under the Bronze path,
# overwriting whatever is already stored there.
(
    df_payments.write
    .format("delta")
    .mode("overwrite")
    .save(f"{delta_path}/payments")
)

In [0]:
# Load riders.csv into a DataFrame. With header=false the first line is read
# as data, so the column names are assigned positionally via toDF().
df_riders = (
    spark.read
    .option("header", "false")
    .option("sep", ",")
    .csv("/FileStore/tables/riders.csv")
    .toDF(
        "rider_id", "first", "last", "address",
        "birthday", "account_start_date", "account_end_date", "is_member",
    )
)
df_riders.printSchema()

root
 |-- rider_id: string (nullable = true)
 |-- first: string (nullable = true)
 |-- last: string (nullable = true)
 |-- address: string (nullable = true)
 |-- birthday: string (nullable = true)
 |-- account_start_date: string (nullable = true)
 |-- account_end_date: string (nullable = true)
 |-- is_member: string (nullable = true)



In [0]:
# Save the riders DataFrame in Delta format under the Bronze path,
# overwriting whatever is already stored there.
(
    df_riders.write
    .format("delta")
    .mode("overwrite")
    .save(f"{delta_path}/riders")
)

In [0]:
# Load stations.csv into a DataFrame. With header=false the first line is read
# as data, so the column names are assigned positionally via toDF().
df_stations = (
    spark.read
    .option("header", "false")
    .option("sep", ",")
    .csv("/FileStore/tables/stations.csv")
    .toDF("station_id", "name", "latitude", "longitude")
)
df_stations.printSchema()

root
 |-- station_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)



In [0]:
# Save the stations DataFrame in Delta format under the Bronze path,
# overwriting whatever is already stored there.
(
    df_stations.write
    .format("delta")
    .mode("overwrite")
    .save(f"{delta_path}/stations")
)

In [0]:
# Load trips.csv into a DataFrame. With header=false the first line is read
# as data, so the column names are assigned positionally via toDF().
df_trips = (
    spark.read
    .option("header", "false")
    .option("sep", ",")
    .csv("/FileStore/tables/trips.csv")
    .toDF(
        "trip_id", "rideable_type", "started_at", "ended_at",
        "start_station_id", "end_station_id", "rider_id",
    )
)
df_trips.printSchema()



root
 |-- trip_id: string (nullable = true)
 |-- rideable_type: string (nullable = true)
 |-- started_at: string (nullable = true)
 |-- ended_at: string (nullable = true)
 |-- start_station_id: string (nullable = true)
 |-- end_station_id: string (nullable = true)
 |-- rider_id: string (nullable = true)



In [0]:
# Save the trips DataFrame in Delta format under the Bronze path,
# overwriting whatever is already stored there.
(
    df_trips.write
    .format("delta")
    .mode("overwrite")
    .save(f"{delta_path}/trips")
)