In [1]:
from pyspark.sql.types import *

# Explicit schema for the bike-share trip CSVs so Spark does not have to infer
# column types. Every field is nullable (the StructField default).
schema = StructType([
    StructField("duration_sec", IntegerType()),
    # DateType stores a calendar date only (no time of day).
    StructField("start_time", DateType()),
    StructField("end_time", DateType()),
    StructField("start_station_id", StringType()),
    StructField("start_station_name", StringType()),
    StructField("start_station_latitude", StringType()),
    StructField("start_station_longitude", StringType()),
    StructField("end_station_id", StringType()),
    StructField("end_station_name", StringType()),
    StructField("end_station_latitude", StringType()),
    StructField("end_station_longitude", StringType()),
    StructField("bike_id", IntegerType()),
    StructField("user_type", StringType()),
    StructField("member_birth_year", IntegerType()),
    # NOTE(review): read as text rather than IntegerType. With an integer type,
    # any non-numeric gender label cannot be parsed and the CSV reader's default
    # PERMISSIVE mode would load it as null. Confirm against the raw files.
    StructField("member_gender", StringType()),
    StructField("bike_share_for_all_trip", StringType())
])

# header='true' skips the column-name row; nullValue='' loads empty fields as null.
bike = (
    sqlContext.read.format('csv')
    .options(header='true', nullValue='')
    .schema(schema)
    .load('/mnt/bike/sanfrancisco')
)

In [2]:
# Import the functions module under an alias rather than pulling `min` and
# `max` into the notebook namespace, where they would shadow Python's
# built-ins of the same name for every later cell.
from pyspark.sql import functions as F

# Mean, shortest and longest trip duration (seconds) across all trips.
bike.select(
    F.mean('duration_sec'),
    F.min('duration_sec'),
    F.max('duration_sec')
).show()

In [3]:
# Stolen blatantly from https://stackoverflow.com/a/51633111/404006
# Monthly Data
from pyspark import *
from pyspark.sql import functions as F
# Per bike and calendar month: mean trip duration and number of non-null
# durations. Rows are ordered newest year, then newest month, then highest
# count, so the first row is the top bike of the latest month in the data.
aggregatebike = (
    bike
    .groupBy(
        F.year("start_time").alias("year"),
        F.month("start_time").alias("month"),
        "bike_id"
    )
    .agg(
        F.mean("duration_sec"),
        F.count("duration_sec").alias("count")
    )
    .orderBy(F.desc("year"), F.desc("month"), F.desc("count"))
)
display(aggregatebike.take(1))

year,month,bike_id,avg(duration_sec),count
2019,2,4794,582.6335078534031,191


In [4]:
# Yearly data: only trips that started in 2018, aggregated per bike.
# The first row after ordering is the bike with the highest count that year.
aggregatebike = (
    bike
    .filter(F.year("start_time") == 2018)
    .groupBy(F.year("start_time").alias("year"), "bike_id")
    .agg(
        F.mean("duration_sec"),
        F.count("duration_sec").alias("count")
    )
    .orderBy(F.desc("year"), F.desc("count"))
)
display(aggregatebike.take(1))

year,bike_id,avg(duration_sec),count
2018,3961,974.8852459016392,1220


In [5]:
# All-time data: one row per bike over the whole dataset, highest count first.
aggregatebike = (
    bike
    .groupBy("bike_id")
    .agg(
        F.mean("duration_sec"),
        F.count("duration_sec").alias("count")
    )
    .orderBy(F.desc("count"))
)
display(aggregatebike.take(1))

bike_id,avg(duration_sec),count
4452,919.6548480463096,1382


In [6]:
def first_and_last_ride(bike_id):
    """Return the earliest and latest ride of one bike.

    Args:
        bike_id: value compared against the ``bike_id`` column of ``bike``.

    Returns:
        A ``(first, last)`` tuple of Rows carrying ``bike_id`` and
        ``start_time``. Either element is ``None`` when the bike has no rides.
    """
    rides = bike.select("bike_id", "start_time").where(F.col("bike_id") == bike_id)
    first = rides.sort("start_time", ascending=True).first()
    last = rides.sort("start_time", ascending=False).first()
    return first, last


# The bike ids are the top rows shown by the all-time, yearly and monthly
# aggregate cells. Each lookup spans the bike's whole history; it is not
# restricted to the year or month in which the bike ranked first.
df1, df2 = first_and_last_ride(4452)  # All Time Last, First Ride
df3, df4 = first_and_last_ride(3961)  # Year Last First Ride
df5, df6 = first_and_last_ride(4794)  # Month Last First Ride

display([df1, df2, df3, df4, df5, df6])



bike_id,start_time
4452,2018-08-30
4452,2019-02-28
3961,2018-04-24
3961,2019-01-27
4794,2018-12-20
4794,2019-02-28


In [7]:
from pyspark.sql import Window
# Rides of bike 4794, reduced to the two columns needed for per-day counting.
sortbike = bike.select("bike_id", "start_time").filter(F.col("bike_id") == 4794)

# Count rides per start_time value, then attach the average of those counts
# within each 30-day bucket produced by F.window.
# NOTE(review): per the PySpark docs, F.window buckets are fixed-length
# intervals offset from 1970-01-01, not calendar months, and a day without
# rides has no row here so it does not enter the average. Confirm this is the
# intended "daily average".
df = (
    sortbike
    .groupBy("bike_id", "start_time")
    .count()
    .withColumn(
        "daily_average",
        F.avg("count").over(Window.partitionBy(F.window("start_time", "30 days")))
    )
)
display(df)


bike_id,start_time,count,daily_average
4794,2019-02-24,2,5.0
4794,2019-02-25,7,5.0
4794,2019-02-26,5,5.0
4794,2019-02-17,1,5.0
4794,2019-02-28,8,5.0
4794,2019-02-27,4,5.0
4794,2019-02-15,6,5.0
4794,2019-02-22,11,5.0
4794,2019-02-19,11,5.0
4794,2019-02-18,2,5.0


In [8]:
# Estimate the distance covered by bike 4794 in February 2019 from its total
# riding time.
# NOTE(review): 7.456 is multiplied by hours to produce "totalmiles", so it is
# presumably an assumed average speed in miles per hour. The source of the
# figure is not shown; confirm it.
ASSUMED_SPEED_MPH = 7.456

sortbike = (
    bike.select("bike_id", "start_time", "duration_sec")
    .where(F.col("bike_id") == 4794)
    .where(F.year("start_time") == 2019)
    .where(F.month("start_time") == 2)
)

# The global sum is None when no rows match or every duration is null; treat
# that as zero seconds instead of failing with a TypeError in the arithmetic.
total_seconds = sortbike.select("duration_sec").groupby().sum("duration_sec").collect()[0][0] or 0

# seconds -> minutes -> hours, then hours * speed.
totalmiles = (total_seconds / 60 / 60) * ASSUMED_SPEED_MPH
totalmiles