In [68]:
import pyspark

from pyspark.sql import SparkSession
from datetime import datetime

# Start (or reuse) a local Spark session that uses every available core.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("test")
    .getOrCreate()
)

# Load the gzipped October 2019 FHV trip CSV; the first row is the header and
# column types are inferred from the data.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", True)
    .csv("/sparkdata/data/fhv_tripdata_2019-10.csv.gz")
)

# Display the Spark version this notebook ran against.
spark.version

'3.5.0'

In [57]:
# Total number of rows loaded from the trip CSV.
df.count()

1897493

In [59]:
# Redistribute the trips into 6 partitions and save them as Parquet,
# replacing anything already present at the target path.
(
    df.repartition(6)
    .write
    .mode("overwrite")
    .parquet("/sparkdata/data/parquet")
)

In [60]:
from pyspark.sql import functions as F

# Count the trips whose pickup timestamp falls on 15 October 2019:
# derive a date-only column from the pickup timestamp, then filter on it.
(
    df.withColumn("pickup_date", F.to_date(df["pickup_datetime"]))
    .where("pickup_date = '2019-10-15'")
    .count()
)

62610

In [61]:
# Expose the trips DataFrame to Spark SQL under the name `fhv_2019_10`.
# createOrReplaceTempView is the supported replacement for registerTempTable,
# which has been deprecated since Spark 2.0 and warns on every call. Like the
# old call, it replaces an existing view of the same name, so re-running this
# cell is safe.
df.createOrReplaceTempView("fhv_2019_10")



In [62]:
# For every pickup date, find the longest trip and show the 10 dates with the
# largest maximum. GROUP BY 1 / ORDER BY 2 refer to the select-list positions
# (pickup_date and duration respectively).
# NOTE(review): assuming both datetime columns were inferred as timestamps,
# CAST(... AS LONG) yields epoch seconds, so dividing the difference by 60
# makes `duration` a number of minutes — confirm against the inferred schema.
spark.sql(
    """
SELECT
    to_date(pickup_datetime) AS pickup_date,
    MAX((CAST(dropOff_datetime AS LONG) - CAST(pickup_datetime AS LONG)) / 60) AS duration
FROM 
    fhv_2019_10
GROUP BY
    1
ORDER BY
    2 DESC
LIMIT 10;
"""
).show()

+-----------+-----------------+
|pickup_date|         duration|
+-----------+-----------------+
| 2019-10-28|       3.786915E7|
| 2019-10-11|       3.786915E7|
| 2019-10-31|       5260346.45|
| 2019-10-01|4207681.683333334|
| 2019-10-17|         527640.0|
| 2019-10-26|         527050.0|
| 2019-10-30|87872.06666666667|
| 2019-10-25|          63409.6|
| 2019-10-02|46153.88333333333|
| 2019-10-23|          44737.0|
+-----------+-----------------+



In [67]:
# Convert the largest `duration` printed by the previous query (3.786915E7,
# computed there as a seconds difference divided by 60) into hours.
float("3.786915E7") / 60

631152.5

In [63]:
# Load the taxi zone lookup CSV (first row is the header) and expose it to
# Spark SQL as `zones`. No schema inference is requested, so every column is
# read as a string.
zones = spark.read.option("header", "true").csv("/sparkdata/data/taxi_zone_lookup.csv")
# createOrReplaceTempView is the supported replacement for registerTempTable,
# which has been deprecated since Spark 2.0 and warns on every call.
zones.createOrReplaceTempView("zones")

In [64]:
# Inspect the lookup table's column names before joining on it.
zones.columns

['LocationID', 'Borough', 'Zone', 'service_zone']

In [65]:
# Show the 5 pickup zones with the fewest trips.
# Trips are LEFT JOINed to the zone lookup on the pickup location so that trips
# with an unknown pickup location are still counted (under a NULL zone).
# The earlier version also joined the lookup a second time on DOLocationID,
# but no column from that join was ever used. It only added an extra join and
# would have inflated the counts if the lookup contained duplicate LocationIDs,
# so it has been removed.
spark.sql(
    """
SELECT
    pul.zone,
    COUNT(1)
FROM
    fhv_2019_10 fhv LEFT JOIN zones pul ON fhv.PULocationID = pul.LocationID
GROUP BY
    1
ORDER BY
    2 ASC
LIMIT 5;
"""
).show()

+--------------------+--------+
|                zone|count(1)|
+--------------------+--------+
|         Jamaica Bay|       1|
|Governor's Island...|       2|
| Green-Wood Cemetery|       5|
|       Broad Channel|       8|
|     Highbridge Park|      14|
+--------------------+--------+

