In [0]:
// Load the trip DataFrame from the Parquet dataset.
// Parquet files carry their own schema, so the CSV-only "header" option is not passed here.
val df = spark.read.parquet("/user/qz2166_nyu_edu/tlc_trip_data/yellow_taxi_clean_w_loc.parquet")

In [1]:
// Number of trips per pickup location, busiest location first.
z.show(
    df.groupBy($"PULocation")
        .count()
        .orderBy($"count".desc)
)

In [2]:
// Number of trips per dropoff location, busiest location first.
z.show(
    df.groupBy($"DOLocation")
        .count()
        .orderBy($"count".desc)
)

In [3]:
// Number of trips per (pickup, dropoff) location pair, most frequent pair first.
z.show(
    df.groupBy($"PULocation", $"DOLocation")
        .count()
        .orderBy($"count".desc)
)

In [4]:
// Trips with payment_type 1 (presumably card payments -- confirm against the data dictionary),
// plus a tip_percentage column: the tip as a fraction of the charge before the tip was added.
val tipDf = {
    // Charge before the tip; this is the denominator of the tip fraction.
    val preTipAmount = $"total_amount" - $"tip_amount"
    df
        .filter($"payment_type" === 1)
        // Leave the fraction null when the pre-tip charge is zero or negative: this avoids a
        // divide-by-zero error under ANSI SQL mode and keeps meaningless ratios out of the
        // downstream aggregates (which skip nulls).
        .withColumn("tip_percentage", when(preTipAmount > 0, $"tip_amount" / preTipAmount))
}

In [5]:
// Tip statistics per pickup location, restricted to locations with at least 1000 trips
// and ordered by median tip fraction, highest first.
val pickupTipPctDf = tipDf
    .groupBy("PULocation")
    .agg(
        // The "tip percent" columns aggregate tip_percentage (tip relative to the pre-tip
        // charge), not the raw tip_amount in currency units.
        median($"tip_percentage").alias("median tip percent"),
        mean($"tip_percentage").alias("average tip percent"),
        count("*").alias("number of trips"),
        // Counts every row whose tip_amount is not exactly 0.0.
        sum(when($"tip_amount" === 0.0, 0).otherwise(1)).alias("number that tipped")
    )
    .withColumn("percentage that tipped", $"number that tipped" / $"number of trips")
    .filter($"number of trips" >= 1000)
    .sort(desc("median tip percent"))

z.show(pickupTipPctDf)

In [6]:
// Tip statistics per dropoff location, restricted to locations with at least 1000 trips
// and ordered by median tip fraction, highest first.
val dropoffTipPctDf = tipDf
    .groupBy("DOLocation")
    .agg(
        // The "tip percent" columns aggregate tip_percentage (tip relative to the pre-tip
        // charge), not the raw tip_amount in currency units.
        median($"tip_percentage").alias("median tip percent"),
        mean($"tip_percentage").alias("average tip percent"),
        count("*").alias("number of trips"),
        // Counts every row whose tip_amount is not exactly 0.0.
        sum(when($"tip_amount" === 0.0, 0).otherwise(1)).alias("number that tipped")
    )
    .withColumn("percentage that tipped", $"number that tipped" / $"number of trips")
    .filter($"number of trips" >= 1000)
    .sort(desc("median tip percent"))

z.show(dropoffTipPctDf)

In [7]:
// Tip statistics per hour of tpep_pickup_datetime, ordered by median tip fraction, highest first.
val hourTipPct = tipDf
    .withColumn("hour", hour($"tpep_pickup_datetime"))
    .groupBy("hour")
    .agg(
        // The "tip percent" columns aggregate tip_percentage (tip relative to the pre-tip
        // charge), not the raw tip_amount in currency units.
        median($"tip_percentage").alias("median tip percent"),
        mean($"tip_percentage").alias("average tip percent"),
        count("*").alias("number of trips"),
        // Counts every row whose tip_amount is not exactly 0.0.
        sum(when($"tip_amount" === 0.0, 0).otherwise(1)).alias("number that tipped")
    )
    .withColumn("percentage that tipped", $"number that tipped" / $"number of trips")
    .sort(desc("median tip percent"))

z.show(hourTipPct)

In [8]:
// Tip statistics per passenger_count, ordered by average tip fraction, highest first.
val passengerTipPct = tipDf
    .groupBy("passenger_count")
    .agg(
        // The "tip percent" columns aggregate tip_percentage (tip relative to the pre-tip
        // charge), not the raw tip_amount in currency units.
        median($"tip_percentage").alias("median tip percent"),
        mean($"tip_percentage").alias("average tip percent"),
        count("*").alias("number of trips"),
        // Counts every row whose tip_amount is not exactly 0.0.
        sum(when($"tip_amount" === 0.0, 0).otherwise(1)).alias("number that tipped")
    )
    .withColumn("percentage that tipped", $"number that tipped" / $"number of trips")
    .sort(desc("average tip percent"))

z.show(passengerTipPct)