In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Obtain (or reuse) a session against the remote endpoint at
# sc://localhost:15002; every later cell goes through this `spark` handle.
spark = (
    SparkSession.builder
    .remote("sc://localhost:15002")
    .appName("CS5305 - Big Data Example")
    .getOrCreate()
)

In [2]:
# Load only the 2015-01 FHV trip file so its schema can be inspected first.
df = spark.read.parquet("/opt/spark/data/fhv_tripdata_2015-01.parquet")

In [3]:
# Print the column names, types and nullability of the DataFrame loaded above.
df.printSchema()

root
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: timestamp_ntz (nullable = true)
 |-- dropOff_datetime: timestamp_ntz (nullable = true)
 |-- PUlocationID: double (nullable = true)
 |-- DOlocationID: double (nullable = true)
 |-- SR_Flag: void (nullable = true)
 |-- Affiliated_base_number: string (nullable = true)



In [4]:
# Rebind `df` to every file matching the 2015 glob (not just January), then
# count trips per pickup month and display the result sorted by month.
df = spark.read.parquet("/opt/spark/data/fhv_tripdata_2015*.parquet")
(
    df.groupBy(F.month("pickup_datetime").alias("month"))
    .count()
    .orderBy("month")
    .show()
)

+-----+-------+
|month|  count|
+-----+-------+
|    1|2720786|
|    2|3053183|
|    3|3245472|
+-----+-------+



In [None]:
# Preview the first 10 rows of the current `df`.
df.show(10)

In [5]:
# Store the pickup month as a named column so later cells can group on
# "month" directly instead of recomputing the expression.
df = df.withColumn(
    "month",
    F.month("pickup_datetime"),
)
df.show(n=10)

+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+-----+
|dispatching_base_num|    pickup_datetime|   dropOff_datetime|PUlocationID|DOlocationID|SR_Flag|Affiliated_base_number|month|
+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+-----+
|              B00013|2015-01-01 00:30:00|1989-01-01 00:00:00|        NULL|        NULL|   NULL|                B00013|    1|
|              B00014|2015-01-01 00:28:00|1989-01-01 00:00:00|        NULL|        NULL|   NULL|                B00014|    1|
|              B00014|2015-01-01 00:25:00|1989-01-01 00:00:00|        NULL|        NULL|   NULL|                B00014|    1|
|              B00014|2015-01-01 00:13:00|1989-01-01 00:00:00|        NULL|        NULL|   NULL|                B00014|    1|
|              B00014|2015-01-01 00:20:00|1989-01-01 00:00:00|        NULL|        NULL|   NULL|                B00014

In [6]:
# Trip count per month, using the "month" column added in the previous cell.
# Spark does not guarantee the row order of an aggregation, so sort explicitly;
# this also matches the earlier per-month count, which orders by month.
(
    df.groupBy("month")
    .count()
    .orderBy("month")
    .show()
)

+-----+-------+
|month|  count|
+-----+-------+
|    1|2720786|
|    2|3053183|
|    3|3245472|
+-----+-------+



In [7]:
# Switch datasets: load every file matching the green-taxi 2014 glob.
# NOTE: this rebinds `df`, replacing the FHV DataFrame used by the cells above.
# The "month" column added earlier is not part of this data, so the FHV cells
# must not be re-run against this `df`.
df = spark.read.parquet("/opt/spark/data/green_tripdata_2014*.parquet")

In [8]:
# Total number of rows in the current `df`; shown as the cell's output.
df.count()

15837009

In [9]:
# Ten pickup locations with the most trips, busiest first.
# The column is spelled "PULocationID" exactly as in the green-taxi schema.
# The FHV spelling "PUlocationID" only resolves here because Spark's column
# resolution is case-insensitive by default, and it would fail with
# spark.sql.caseSensitive=true. This also matches the SQL version of the query.
(
    df.groupBy("PULocationID")
    .agg(F.count("*").alias("num_pickups"))
    .orderBy(F.desc("num_pickups"))
    .show(10)
)

+------------+-----------+
|PUlocationID|num_pickups|
+------------+-----------+
|         255|     806389|
|           7|     787375|
|          74|     782289|
|          41|     762377|
|          75|     699264|
|         166|     684050|
|          82|     590298|
|          42|     522620|
|         129|     465037|
|         181|     434302|
+------------+-----------+
only showing top 10 rows


In [10]:
# Register the current `df` as a temp view so it can be queried with SQL.
df.createOrReplaceTempView("green_taxi")

# Quick preview of the view. The LIMIT now matches the number of rows shown;
# previously the query fetched 10 rows while show(5) displayed only 5.
spark.sql("SELECT * FROM green_taxi LIMIT 5").show()

# SQL equivalent of the DataFrame aggregation: ten busiest pickup locations.
spark.sql("""
    SELECT PULocationID, COUNT(*) as num_pickups 
    FROM green_taxi 
    GROUP BY PULocationID 
    ORDER BY num_pickups DESC 
    LIMIT 10
""").show()

+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|VendorID|lpep_pickup_datetime|lpep_dropoff_datetime|store_and_fwd_flag|RatecodeID|PULocationID|DOLocationID|passenger_count|trip_distance|fare_amount|extra|mta_tax|tip_amount|tolls_amount|ehail_fee|improvement_surcharge|total_amount|payment_type|trip_type|congestion_surcharge|
+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|       2| 2014-01-01 00:17:26|  2014-01-01 00:37:11|                 N|         1|          17|         225|              1|         2.28|       13.5|  0.5|    0.