Extract:
In this stage, we extract data from CSV files in DBFS and save them as Delta files.


In [None]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

#Load a csv file from Databricks file storage
def Load_DBFS(table_Schema, DBFS_path):
    """Read a CSV file into a Spark DataFrame using an explicit schema.

    The file is read with ``header=False``, so its first line is treated as
    data rather than as column names.

    Args:
        table_Schema: Schema passed to the reader as ``schema``.
        DBFS_path: Path of the CSV file to load.

    Returns:
        The DataFrame returned by the reader.

    Note:
        Relies on a ``spark`` session name already being in scope.
    """
    csv_reader = spark.read.format("csv")
    return csv_reader.load(DBFS_path, schema=table_Schema, header=False)

#Save a spark dataframe to Databricks in Delta format
def Save_as_Delta(df, delta_path):
    """Write ``df`` to ``delta_path`` in Delta format using overwrite mode.

    Args:
        df: DataFrame to persist.
        delta_path: Destination path for the Delta output.
    """
    delta_writer = df.write.format("delta")
    delta_writer = delta_writer.mode("overwrite")
    delta_writer.save(delta_path)

Riders :

In [None]:
# Source CSV and Delta destination for the riders data
rider_DBFS_path = '/FileStore/bikeshare_source/riders.csv'
rider_delta = '/delta/riders'

# Riders schema, declared as (column name, Spark type) pairs in file order
riderSchema = StructType([
    StructField(column_name, column_type)
    for column_name, column_type in [
        ("rider_id", IntegerType()),
        ("first", StringType()),
        ("last", StringType()),
        ("address", StringType()),
        ("birthday", DateType()),
        ("account_start_date", DateType()),
        ("account_end_date", DateType()),
        ("is_member", BooleanType()),
    ]
])

# Read the CSV into a Spark dataframe with the schema above
df = Load_DBFS(riderSchema, rider_DBFS_path)

# Show the resulting schema
df.printSchema()

# Preview the first five rows
display(df.limit(5))

# Persist the dataframe in Delta format
Save_as_Delta(df, rider_delta)

root
 |-- rider_id: integer (nullable = true)
 |-- first: string (nullable = true)
 |-- last: string (nullable = true)
 |-- address: string (nullable = true)
 |-- birthday: date (nullable = true)
 |-- account_start_date: date (nullable = true)
 |-- account_end_date: date (nullable = true)
 |-- is_member: boolean (nullable = true)



rider_id,first,last,address,birthday,account_start_date,account_end_date,is_member
1000,Diana,Clark,1200 Alyssa Squares,1989-02-13,2019-04-23,,True
1001,Jennifer,Smith,397 Diana Ferry,1976-08-10,2019-11-01,2020-09-01,True
1002,Karen,Smith,644 Brittany Row Apt. 097,1998-08-10,2022-02-04,,True
1003,Bryan,Roberts,996 Dickerson Turnpike,1999-03-29,2019-08-26,,False
1004,Jesse,Middleton,7009 Nathan Expressway,1969-04-11,2019-09-14,,True


Payments :

In [None]:
# Source CSV and Delta destination for the payments data
payment_DBFS_path = '/FileStore/bikeshare_source/payments.csv'
payment_delta = '/delta/payments'

# Payments schema, declared as (column name, Spark type) pairs in file order
paymentSchema = StructType([
    StructField(column_name, column_type)
    for column_name, column_type in [
        ("payment_id", IntegerType()),
        ("date", DateType()),
        ("amount", FloatType()),
        ("rider_id", IntegerType()),
    ]
])

# Read the CSV into a Spark dataframe with the schema above
df = Load_DBFS(paymentSchema, payment_DBFS_path)

# Show the resulting schema
df.printSchema()

# Preview the first five rows
display(df.limit(5))

# Persist the dataframe in Delta format
Save_as_Delta(df, payment_delta)

root
 |-- payment_id: integer (nullable = true)
 |-- date: date (nullable = true)
 |-- amount: float (nullable = true)
 |-- rider_id: integer (nullable = true)



payment_id,date,amount,rider_id
1,2019-05-01,9.0,1000
2,2019-06-01,9.0,1000
3,2019-07-01,9.0,1000
4,2019-08-01,9.0,1000
5,2019-09-01,9.0,1000


Stations :

In [None]:
# Source CSV and Delta destination for the stations data
station_DBFS_path = '/FileStore/bikeshare_source/stations.csv'
station_delta = '/delta/stations'

# Stations schema, declared as (column name, Spark type) pairs in file order
stationSchema = StructType([
    StructField(column_name, column_type)
    for column_name, column_type in [
        ("station_id", StringType()),
        ("name", StringType()),
        ("latitude", FloatType()),
        ("longitude", FloatType()),
    ]
])

# Read the CSV into a Spark dataframe with the schema above
df = Load_DBFS(stationSchema, station_DBFS_path)

# Show the resulting schema
df.printSchema()

# Preview the first five rows
display(df.limit(5))

# Persist the dataframe in Delta format
Save_as_Delta(df, station_delta)

root
 |-- station_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)



station_id,name,latitude,longitude
525,Glenwood Ave & Touhy Ave,42.0127,-87.66606
KA1503000012,Clark St & Lake St,41.885796,-87.6311
637,Wood St & Chicago Ave,41.895634,-87.672066
13216,State St & 33rd St,41.834732,-87.625824
18003,Fairbanks St & Superior St,41.89581,-87.620255


Trips :

In [None]:
# Source CSV and Delta destination for the trips data
trip_DBFS_path = '/FileStore/bikeshare_source/trips.csv'
trip_delta = '/delta/trips'

# Trips schema, declared as (column name, Spark type) pairs in file order
tripSchema = StructType([
    StructField(column_name, column_type)
    for column_name, column_type in [
        ("trip_id", StringType()),
        ("rideable_type", StringType()),
        ("start_at", TimestampType()),
        ("ended_at", TimestampType()),
        ("start_station_id", StringType()),
        ("end_station_id", StringType()),
        ("rider_id", IntegerType()),
    ]
])

# Read the CSV into a Spark dataframe with the schema above
df = Load_DBFS(tripSchema, trip_DBFS_path)

# Show the resulting schema
df.printSchema()

# Preview the first five rows
display(df.limit(5))

# Persist the dataframe in Delta format
Save_as_Delta(df, trip_delta)

root
 |-- trip_id: string (nullable = true)
 |-- rideable_type: string (nullable = true)
 |-- start_at: timestamp (nullable = true)
 |-- ended_at: timestamp (nullable = true)
 |-- start_station_id: string (nullable = true)
 |-- end_station_id: string (nullable = true)
 |-- rider_id: integer (nullable = true)



trip_id,rideable_type,start_at,ended_at,start_station_id,end_station_id,rider_id
89E7AA6C29227EFF,classic_bike,2021-02-12T16:14:56.000+0000,2021-02-12T16:21:43.000+0000,525,660,71934
0FEFDE2603568365,classic_bike,2021-02-14T17:52:38.000+0000,2021-02-14T18:12:09.000+0000,525,16806,47854
E6159D746B2DBB91,electric_bike,2021-02-09T19:10:18.000+0000,2021-02-09T19:19:10.000+0000,KA1503000012,TA1305000029,70870
B32D3199F1C2E75B,classic_bike,2021-02-02T17:49:41.000+0000,2021-02-02T17:54:06.000+0000,637,TA1305000034,58974
83E463F23575F4BF,electric_bike,2021-02-23T15:07:23.000+0000,2021-02-23T15:22:37.000+0000,13216,TA1309000055,39608
