In [0]:
# Make gold_store the current database; the later cells reference tables and
# views by unqualified name.
spark.sql("USE DATABASE gold_store")

In [0]:
# Time spent per ride, broken down by date and time factors
# (day of week in this cell).
# Analysis conclusion: weekend days have the longest trips on average.

# (Re)create the view: average trip_duration_minutes per dim_date.day_of_week,
# joining each trip to dim_date through its start_date_id.
spark.sql(
    """
                CREATE OR REPLACE VIEW vw_avg_time_spent_by_dow AS
                select d.day_of_week, avg(t.trip_duration_minutes) as avg_time_spent
                from fact_trip t
                inner join dim_date d on d.date_id = t.start_date_id
                group by d.day_of_week
                order by avg_time_spent desc
          """
)

# Show the rows of the view.
display(spark.sql("SELECT * FROM vw_avg_time_spent_by_dow"))

day_of_week,avg_time_spent
Sun,27.33503334146198
Sat,25.704447578635087
Fri,20.31559736646373
Mon,20.133180583400595
Thu,17.75227189535492
Tue,17.686302554495292
Wed,17.546224786480103


In [0]:
# Average trip duration by time-of-day bucket derived from fact_trip.hour_of_day.
# Analysis conclusion: trips are longest at night and around noon.
#
# The hour -> bucket CASE is evaluated once in a derived table and the outer
# query groups by the resulting time_of_day column, so the mapping is defined
# in a single place instead of being repeated in SELECT and GROUP BY.
# Hours that match no range (or are NULL) fall into a NULL bucket.
spark.sql("""
    CREATE OR REPLACE VIEW vw_avg_time_spent_by_tod AS
    select time_of_day, avg(trip_duration_minutes) as avg_time_spent
    from (
        select
            case when t.hour_of_day between 6 and 9 then 'Morning'
                 when t.hour_of_day between 10 and 11 then 'Day'
                 when t.hour_of_day between 12 and 13 then 'Noon'
                 when t.hour_of_day between 14 and 17 then 'Afternoon'
                 when t.hour_of_day between 18 and 20 then 'Evening'
                 when t.hour_of_day between 21 and 23
                      or t.hour_of_day between 0 and 5 then 'Night'
                 else null
            end as time_of_day,
            t.trip_duration_minutes
        from fact_trip t
    ) bucketed
    group by time_of_day
    order by avg_time_spent desc
""")

# Show the rows of the view.
display(spark.sql("SELECT * FROM vw_avg_time_spent_by_tod"))

time_of_day,avg_time_spent
Night,24.562721938784
Noon,23.387059396731186
Day,22.38944564814318
Afternoon,21.90289995401902
Evening,20.80050314215257
Morning,14.846905932690465


In [0]:
# Time spent per ride by starting and/or ending station (starting station here).
# Analysis conclusion: "Throop St & 52nd St" is the top starting station by
# average number of minutes per trip.

# (Re)create the view: the 5 start stations with the highest average
# trip_duration_minutes, grouped by station name.
spark.sql(
    """
            CREATE OR REPLACE VIEW vw_avg_time_spent_by_start_station AS
            select s.station_name as start_station_name, avg(t.trip_duration_minutes) as avg_time_spent
            from fact_trip t
            inner join dim_station s on s.station_id = t.start_station_id
            group by s.station_name
            order by avg_time_spent desc
            limit 5
          """
)

# Show the rows of the view.
display(spark.sql("SELECT * FROM vw_avg_time_spent_by_start_station"))

start_station_name,avg_time_spent
Throop St & 52nd St,537.5128205128206
South Chicago Ave & Elliot Ave,535.1935483870968
Wabash Ave & 83rd St,409.27272727272725
Racine Ave & 65th St,352.93333333333334
Central Ave & Harrison St,317.1478873239437


In [0]:
# Analysis conclusion: "Base - 2132 W Hubbard Warehouse" is the top ending
# station by average number of minutes per trip.
# NOTE(review): that station's average (~3750 min) is far above every other
# row in the output; it may not be a regular public station - confirm whether
# it should be part of this ranking.

# (Re)create the view: the 5 end stations with the highest average
# trip_duration_minutes, grouped by station name.
spark.sql(
    """
            CREATE OR REPLACE VIEW vw_avg_time_spent_by_end_station AS
            select s.station_name as end_station_name, avg(t.trip_duration_minutes) as avg_time_spent
            from fact_trip t
            inner join dim_station s on s.station_id = t.end_station_id
            group by s.station_name
            order by avg_time_spent desc
            limit 5
         """
)

# Show the rows of the view.
display(spark.sql("SELECT * FROM vw_avg_time_spent_by_end_station"))

end_station_name,avg_time_spent
Base - 2132 W Hubbard Warehouse,3749.974358974359
Cicero Ave & Lake St,569.2615384615384
State St & 76th St,501.5365853658537
Eberhart Ave & 91st St,376.3
Greenwood Ave & 91st St,371.3272727272727


In [0]:
# Time spent per ride by the rider's age at the time of the trip.
# Analysis conclusion: every age bucket averages roughly 21 minutes per trip
# (the '>= 70' bucket is slightly lower, at about 19.5 minutes).
#
# The age -> bucket CASE is evaluated once in a derived table and the outer
# query groups by the resulting age_bucket column, so the bucket boundaries
# are defined in a single place instead of being repeated in SELECT and
# GROUP BY. Rows whose age is NULL fall into a NULL bucket.
spark.sql("""
    CREATE OR REPLACE VIEW vw_avg_time_spent_by_rider_age_at_trip_time AS
    select age_bucket, avg(trip_duration_minutes) as avg_time_spent
    from (
        select
            case when t.rider_age_at_trip_time < 20 then '< 20'
                 when t.rider_age_at_trip_time between 20 and 29 then '20s'
                 when t.rider_age_at_trip_time between 30 and 39 then '30s'
                 when t.rider_age_at_trip_time between 40 and 49 then '40s'
                 when t.rider_age_at_trip_time between 50 and 59 then '50s'
                 when t.rider_age_at_trip_time between 60 and 69 then '60s'
                 when t.rider_age_at_trip_time >= 70 then '>= 70'
                 else null
            end as age_bucket,
            t.trip_duration_minutes
        from fact_trip t
    ) bucketed
    group by age_bucket
    order by avg_time_spent desc
""")

# Show the rows of the view.
display(spark.sql("SELECT * FROM vw_avg_time_spent_by_rider_age_at_trip_time"))

age_bucket,avg_time_spent
20s,21.47339004501732
< 20,21.42923495074372
30s,21.36845898887396
60s,21.09370605000809
40s,20.973500361673825
50s,20.78992974860892
>= 70,19.471639738741835


In [0]:
# Time spent per ride by whether the rider is a member or a casual rider.
# Analysis conclusion: members and casual riders both average about 21 minutes
# per trip.
#
# The is_member -> label CASE is evaluated once in a derived table and the
# outer query groups by the resulting Membership column, instead of repeating
# the expression in GROUP BY. Any is_member value other than 1 (including
# NULL) is labelled 'Casual Rider'.
spark.sql("""
    CREATE OR REPLACE VIEW vw_avg_time_spent_by_membership_status AS
    select Membership, avg(trip_duration_minutes) as avg_time_spent
    from (
        select
            case when r.is_member = 1 then 'Member'
                 else 'Casual Rider'
            end as Membership,
            t.trip_duration_minutes
        from fact_trip t
        inner join dim_rider r on r.rider_id = t.rider_id
    ) labelled
    group by Membership
    order by Membership desc
""")

# Show the rows of the view.
display(spark.sql("SELECT * FROM vw_avg_time_spent_by_membership_status"))

Membership,avg_time_spent
Member,21.41434539288319
Casual Rider,20.83449758603985


In [0]:
# Money spent per month, quarter and year (per month in this cell).
# Analysis conclusion: winter months are the top-selling months.
# NOTE(review): the query groups by month_name only, so each total pools that
# month across all years. If sales grow year over year, this ranking may
# reflect growth rather than seasonality (Avg_Amt is about 10 for every month
# shown) - confirm before relying on it.

# (Re)create the view: total and average payment Amount per month name.
spark.sql(
    """
            CREATE OR REPLACE VIEW vw_avg_amt_by_month AS
            SELECT month_name, sum(Amount) as Total_Amount, avg(Amount) as Avg_Amt
            from fact_payment p
            inner join dim_date d on d.date_id = p.date_id
            group by month_name
            order by total_amount desc
        """
)

# Show the rows of the view.
display(spark.sql("SELECT * FROM vw_avg_amt_by_month"))

month_name,Total_Amount,Avg_Amt
February,1907807.270002365,10.001977896973232
January,1855786.8299827576,9.999659616794233
December,1799778.4600391388,9.992107817228174
November,1747273.570006609,9.986474760559938
October,1696207.3200480938,10.002224987458025
September,1641916.4000854492,9.992370844681007
August,1592322.5799536705,10.010137485485542
July,1538960.8800587654,9.987999039847647
June,1491227.5800013542,9.992344927875488
May,1441279.190028429,9.986206254051073


In [0]:
# Analysis conclusion: the fourth quarter is the top-selling quarter by total
# amount.

# (Re)create the view: total and average payment Amount per Quarter.
spark.sql(
    """
            CREATE OR REPLACE VIEW vw_avg_amt_by_quarter AS
            SELECT Quarter, sum(Amount) as total_amount, avg(Amount) as avg_amt
            from fact_payment p
            inner join dim_date d on d.date_id = p.date_id
            group by Quarter
            order by total_amount desc
        """
)

# Show the rows of the view.
display(spark.sql("SELECT * FROM vw_avg_amt_by_quarter"))

Quarter,total_amount,avg_amt
4,5243259.350093842,9.993499400751032
1,5112376.489925623,9.999973573858064
3,4773199.860097885,9.99687908554877
2,4328269.550025225,9.9906506214343


In [0]:
# Analysis conclusion: total sales increase every year from 2013 through 2021,
# with 2021 being the top-selling year.
# NOTE(review): 2022 ranks well below 2021; presumably the data covers only
# part of that year - confirm.

# (Re)create the view: total and average payment Amount per year.
spark.sql(
    """
            CREATE OR REPLACE VIEW vw_avg_amt_by_year AS
            SELECT year, sum(Amount) as total_amount, avg(Amount) as avg_amt
            from fact_payment p
            inner join dim_date d on d.date_id = p.date_id
            group by year
            order by total_amount desc
          """
)

# Show the rows of the view.
display(spark.sql("SELECT * FROM vw_avg_amt_by_year"))

year,total_amount,avg_amt
2021,6081098.249984741,9.992766822750376
2020,4315449.399999857,9.998029330676406
2019,2978658.79005599,9.989967937403753
2018,2000105.4999730587,9.9849509763422
2017,1308372.5401260853,9.992458453947618
2022,1189970.4500157833,9.989091054217628
2016,825120.8099706173,10.003889548625333
2015,477233.2200062275,10.058026049701304
2014,227402.9500119686,10.05851689720314
2013,53693.33999824524,10.034262754297371


In [0]:
# Money spent per member, by the rider's age at account start.
# Analysis conclusion: younger members tend to spend the most; the average
# per-member total falls with each older age bucket.
#
# The derived table produces one row per member (is_member = 1) with that
# member's total payment Amount and age bucket; the outer query averages those
# totals per bucket. The age -> bucket CASE is written once, inside the
# derived table, instead of being repeated in SELECT and GROUP BY. It depends
# only on rider_age_at_acc_start, which is one of the grouping columns.
# Members whose age is NULL fall into a NULL bucket.
spark.sql("""
    CREATE OR REPLACE VIEW vw_avg_amt_by_rider_age_at_acc_start AS
    select Age_Bucket, avg(Total_Amt) as avg_amt
    from (
        select
            p.rider_id,
            case when r.rider_age_at_acc_start < 20 then '< 20'
                 when r.rider_age_at_acc_start between 20 and 29 then '20s'
                 when r.rider_age_at_acc_start between 30 and 39 then '30s'
                 when r.rider_age_at_acc_start between 40 and 49 then '40s'
                 when r.rider_age_at_acc_start between 50 and 59 then '50s'
                 when r.rider_age_at_acc_start between 60 and 69 then '60s'
                 when r.rider_age_at_acc_start >= 70 then '>= 70'
                 else null
            end as Age_Bucket,
            sum(p.Amount) as Total_Amt
        from fact_payment p
        inner join dim_rider r on r.rider_id = p.rider_id
        where r.is_member = 1
        group by p.rider_id, r.rider_age_at_acc_start
    ) a
    group by Age_Bucket
    order by avg_amt desc
""")

# Show the rows of the view.
display(spark.sql("SELECT * FROM vw_avg_amt_by_rider_age_at_acc_start"))

Age_Bucket,avg_amt
< 20,315.2741339042429
20s,238.2994039431453
30s,222.38455369810623
40s,206.29332410930476
50s,194.4537487828627
60s,169.67913669064748
>= 70,125.55
