In [1]:
from pyspark.sql.types import *

# Explicit schema for the bike-trip CSV files, declared as (column name, Spark type)
# pairs in file order so the reader does not have to infer types.
schema = StructType([
    StructField(column_name, column_type)
    for column_name, column_type in [
        ("tripduration", IntegerType()),
        ("starttime", DateType()),
        ("stoptime", DateType()),
        ("start station id", IntegerType()),
        ("start station name", StringType()),
        ("start station latitude", DecimalType(11, 9)),
        ("start station longitude", DecimalType(11, 9)),
        ("end station id", IntegerType()),
        ("end station name", StringType()),
        ("end station latitude", DecimalType(11, 9)),
        ("end station longitude", DecimalType(11, 9)),
        ("bikeid", IntegerType()),
        ("usertype", StringType()),
        ("birth year", IntegerType()),
        ("gender", IntegerType()),
    ]
])

# Read every CSV under the mount point with the schema above. The first line of
# each file is a header, and the literal text \N is read as null.
bike = (
    sqlContext.read
    .format('csv')
    .option('header', 'true')
    .option('nullValue', '\\N')
    .schema(schema)
    .load('/mnt/bike/newyork')
)

In [2]:
# NOTE: this import rebinds `min` and `max` to the Spark column functions, so the
# Python builtins of the same name are shadowed for the rest of the notebook.
from pyspark.sql.functions import mean, min, max

# Mean, minimum and maximum trip duration over the whole dataset.
duration_summary = bike.select(
    mean('tripduration'),
    min('tripduration'),
    max('tripduration'),
)
duration_summary.show()

In [3]:
# Stolen blatantly from https://stackoverflow.com/a/51633111/404006
# Monthly Data
from pyspark import *
from pyspark.sql import functions as F
# Ride statistics per bike for each (year, month) of the start date.
rides_by_month = bike.groupBy(
    F.year("starttime").alias("year"),
    F.month("starttime").alias("month"),
    "bikeid",
)
monthly_stats = rides_by_month.agg(
    F.mean("tripduration"),
    F.count("tripduration").alias("count"),
)

# All three keys sort descending, so the first row is the bike with the most
# rides in the most recent month present in the data.
aggregatebike = monthly_stats.orderBy("year", "month", "count", ascending=False)
display(aggregatebike.take(1))

year,month,bikeid,avg(tripduration),count
2019,2,35619,865.7807486631016,561


In [4]:
#Yearly Data
# Restrict to rides that started in 2018, then compute per-bike statistics.
rides_2018 = bike.filter(F.year("starttime") == 2018)
yearly_stats = rides_2018.groupBy(
    F.year("starttime").alias("year"),
    "bikeid",
).agg(
    F.mean("tripduration"),
    F.count("tripduration").alias("count"),
)

# Highest ride count first; the first row is the busiest bike of 2018.
aggregatebike = yearly_stats.orderBy("year", "count", ascending=False)
display(aggregatebike.take(1))

year,bikeid,avg(tripduration),count
2018,30657,869.991963661775,2862


In [5]:
#All Time Data
# Per-bike mean trip duration and ride count over the entire dataset.
all_time_stats = bike.groupBy("bikeid").agg(
    F.mean("tripduration"),
    F.count("tripduration").alias("count"),
)

# Highest ride count first; the first row is the most-used bike overall.
aggregatebike = all_time_stats.orderBy("count", ascending=False)
display(aggregatebike.take(1))

bikeid,avg(tripduration),count
18104,873.9685520361991,8840


In [6]:
def first_and_last_ride(bike_id):
    """Return the earliest and latest ride rows recorded for one bike.

    Args:
        bike_id: Value of the ``bikeid`` column to look up.

    Returns:
        A ``(first, last)`` tuple of ``Row(bikeid, starttime)`` objects: the row
        with the smallest ``starttime`` and the row with the largest. Either
        element is ``None`` when the bike has no rides.
    """
    rides = bike.select("bikeid", "starttime").where(F.col("bikeid") == bike_id)
    first = rides.sort("starttime", ascending=True).first()
    last = rides.sort("starttime", ascending=False).first()
    return first, last


# The bike ids below are the top rows displayed by the all-time, yearly and
# monthly aggregation cells earlier in the notebook.
# NOTE(review): the yearly and monthly lookups are not restricted to that
# year/month, so they return each bike's first and last ride in the whole
# dataset -- confirm this is the intent.

# All Time Last, First Ride
df1, df2 = first_and_last_ride(18104)

# Year Last First Ride
df3, df4 = first_and_last_ride(30657)

# Month Last First Ride
df5, df6 = first_and_last_ride(35619)

display([df1, df2, df3, df4, df5, df6])



bikeid,starttime
18104,2013-06-01
18104,2019-02-28
30657,2017-08-31
30657,2019-02-28
35619,2019-01-20
35619,2019-02-28


In [7]:
from pyspark.sql import Window
# Rides of bike 35619, reduced to the two columns needed for daily counting.
sortbike = bike.select("bikeid", "starttime").filter(F.col("bikeid") == 35619)

# One row per (bike, start date) with the number of rides on that date.
rides_per_day = sortbike.groupBy("bikeid", "starttime").count()

# Bucket the dates into 30-day windows and attach each bucket's mean daily
# count to every row in it. Dates with no rides produce no row, so they do not
# contribute to the average.
thirty_day_bucket = Window.partitionBy(F.window("starttime", "30 days"))
df = rides_per_day.withColumn(
    "daily_average",
    F.avg("count").over(thirty_day_bucket),
)
display(df)


bikeid,starttime,count,daily_average
35619,2019-02-10,31,19.75
35619,2019-01-20,19,19.75
35619,2019-02-11,24,19.75
35619,2019-01-28,11,19.75
35619,2019-02-07,32,19.75
35619,2019-02-12,14,19.75
35619,2019-01-24,9,19.75
35619,2019-01-21,13,19.75
35619,2019-01-29,14,19.75
35619,2019-02-04,22,19.75


In [8]:
# Estimated distance covered by bike 35619 during February 2019, derived from
# its total ride time.

# Factor applied to ride time in hours to obtain miles.
# NOTE(review): presumably an assumed average riding speed in mph; the origin
# of this figure is not recorded in the notebook -- confirm.
AVERAGE_SPEED_MPH = 7.456

in_feb_2019_for_bike = (
    (F.col("bikeid") == 35619)
    & (F.year("starttime") == 2019)
    & (F.month("starttime") == 2)
)
sortbike = bike.select("bikeid", "starttime", "tripduration").where(in_feb_2019_for_bike)

# A global aggregate always yields exactly one row, but SUM over zero rows is
# null (None in Python); treat that as zero ride time instead of raising
# TypeError in the arithmetic below.
total_duration = sortbike.agg(F.sum("tripduration")).first()[0] or 0

# tripduration is assumed to be in seconds (TODO confirm): / 60 / 60 -> hours.
totalmiles = (total_duration / 60 / 60) * AVERAGE_SPEED_MPH
totalmiles