In [34]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import types
from pyspark.sql import functions as F

## Question 1

The version of spark I'm using is:

In [None]:
spark-shell

In [4]:
spark = SparkSession.builder \
    .master("local[*]") \
    .appName('test') \
    .getOrCreate()

In [5]:
!wget https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhv/fhv_tripdata_2019-10.csv.gz

--2024-03-04 19:23:09--  https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhv/fhv_tripdata_2019-10.csv.gz
Resolving github.com (github.com)... 140.82.112.4
Connecting to github.com (github.com)|140.82.112.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/513814948/efdfcf82-6d5c-44d1-a138-4e8ea3c3a3b6?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAVCODYLSA53PQK4ZA%2F20240304%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240304T192309Z&X-Amz-Expires=300&X-Amz-Signature=fec7a10b1e7cc728e75d30b60ca9ad29515e9d0590699e99a8a98d782629d3fd&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=513814948&response-content-disposition=attachment%3B%20filename%3Dfhv_tripdata_2019-10.csv.gz&response-content-type=application%2Foctet-stream [following]
--2024-03-04 19:23:09--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/513814948/efdfcf82-6d5c-

In [6]:
df = spark.read \
    .option("header", "true") \
    .csv("fhv_tripdata_2019-10.csv.gz")

                                                                                

In [7]:
df.schema

StructType([StructField('dispatching_base_num', StringType(), True), StructField('pickup_datetime', StringType(), True), StructField('dropOff_datetime', StringType(), True), StructField('PUlocationID', StringType(), True), StructField('DOlocationID', StringType(), True), StructField('SR_Flag', StringType(), True), StructField('Affiliated_base_number', StringType(), True)])

In [11]:
schema = types.StructType([
    types.StructField('dispatching_base_num', types.StringType(), True), 
    types.StructField('pickup_datetime', types.TimestampType(), True), 
    types.StructField('dropOff_datetime', types.TimestampType(), True), 
    types.StructField('PUlocationID', types.IntegerType(), True), 
    types.StructField('DOlocationID', types.IntegerType(), True), 
    types.StructField('SR_Flag', types.StringType(), True), 
    types.StructField('Affiliated_base_number', types.StringType(), True)
])

In [12]:
df = spark.read \
    .option("header", "true") \
    .schema(schema) \
    .csv("fhv_tripdata_2019-10.csv.gz")

In [14]:
df.head(5)

[Row(dispatching_base_num='B00009', pickup_datetime=datetime.datetime(2019, 10, 1, 0, 23), dropOff_datetime=datetime.datetime(2019, 10, 1, 0, 35), PUlocationID=264, DOlocationID=264, SR_Flag=None, Affiliated_base_number='B00009'),
 Row(dispatching_base_num='B00013', pickup_datetime=datetime.datetime(2019, 10, 1, 0, 11, 29), dropOff_datetime=datetime.datetime(2019, 10, 1, 0, 13, 22), PUlocationID=264, DOlocationID=264, SR_Flag=None, Affiliated_base_number='B00013'),
 Row(dispatching_base_num='B00014', pickup_datetime=datetime.datetime(2019, 10, 1, 0, 11, 43), dropOff_datetime=datetime.datetime(2019, 10, 1, 0, 37, 20), PUlocationID=264, DOlocationID=264, SR_Flag=None, Affiliated_base_number='B00014'),
 Row(dispatching_base_num='B00014', pickup_datetime=datetime.datetime(2019, 10, 1, 0, 56, 29), dropOff_datetime=datetime.datetime(2019, 10, 1, 0, 57, 47), PUlocationID=264, DOlocationID=264, SR_Flag=None, Affiliated_base_number='B00014'),
 Row(dispatching_base_num='B00014', pickup_datetime=

In [15]:
df.show()

+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+
|dispatching_base_num|    pickup_datetime|   dropOff_datetime|PUlocationID|DOlocationID|SR_Flag|Affiliated_base_number|
+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+
|              B00009|2019-10-01 00:23:00|2019-10-01 00:35:00|         264|         264|   null|                B00009|
|              B00013|2019-10-01 00:11:29|2019-10-01 00:13:22|         264|         264|   null|                B00013|
|              B00014|2019-10-01 00:11:43|2019-10-01 00:37:20|         264|         264|   null|                B00014|
|              B00014|2019-10-01 00:56:29|2019-10-01 00:57:47|         264|         264|   null|                B00014|
|              B00014|2019-10-01 00:23:09|2019-10-01 00:28:27|         264|         264|   null|                B00014|
|     B00021         |2019-10-01 00:00:4

In [17]:
df = df.repartition(6)
df = df.coalesce(6)
df.write.parquet('fhv/2019/10/')

                                                                                

## Question 2
To get the size of the files we use the following command in linux into the directory "fhv/2019/20/":

In [21]:
ls -lh 

total 19M
-rw-rw-r-- 1 jobert jobert 5.7K Mar  4 19:21 Example.ipynb
-rw-rw-r-- 1 jobert jobert  12K Mar  4 20:03 Homework_5.ipynb
drwxr-xr-x 3 jobert jobert 4.0K Mar  4 19:46 [0m[01;34mfhv[0m/
-rw-rw-r-- 1 jobert jobert  19M Dec  2  2022 [01;31mfhv_tripdata_2019-10.csv.gz[0m
-rw-rw-r-- 1 jobert jobert  13K Aug 17  2016 taxi+_zone_lookup.csv


## Question 3
The number of records in october 15th is:

In [40]:
data = spark.read.parquet('fhv/2019/10/')

In [41]:
data.show()

+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+
|dispatching_base_num|    pickup_datetime|   dropOff_datetime|PUlocationID|DOlocationID|SR_Flag|Affiliated_base_number|
+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+
|              B02784|2019-10-01 09:55:38|2019-10-01 10:05:43|          89|          85|   null|                  null|
|              B02429|2019-10-21 04:15:47|2019-10-21 04:36:04|         264|         264|   null|                B02429|
|              B01482|2019-10-19 12:00:00|2019-10-19 12:20:00|         264|         264|   null|                B01482|
|              B03015|2019-10-11 14:28:00|2019-10-11 14:32:44|         264|         216|   null|                B03015|
|              B01529|2019-10-21 18:00:26|2019-10-21 18:07:21|         264|          80|   null|                B01529|
|              B00477|2019-10-03 19:30:3

In [57]:
data.filter(
    (F.dayofmonth(data.pickup_datetime) == "15")
    &
    (F.dayofmonth(data.dropOff_datetime) == "15")
).count()

                                                                                

61851

In [60]:
data.registerTempTable('trips_hmw')

In [64]:
spark.sql("""
select count('dispatching_base_num')
FROM
    trips_hmw
WHERE
    pickup_datetime >= '2019-10-15'
    and
    pickup_datetime < '2019-10-16'
    
""").show()

+---------------------------+
|count(dispatching_base_num)|
+---------------------------+
|                      62610|
+---------------------------+



## Question 4
The longest trip can be calculated as:

In [65]:
spark.sql("""
SELECT 
    pickup_datetime,
    dropOff_datetime,
    ((unix_timestamp(dropOff_datetime) - unix_timestamp(pickup_datetime))/3600) as duration  
FROM 
    trips_hmw
ORDER BY
    duration desc
LIMIT 3
""").show()



+-------------------+-------------------+-----------------+
|    pickup_datetime|   dropOff_datetime|         duration|
+-------------------+-------------------+-----------------+
|2019-10-11 18:00:00|2091-10-11 18:30:00|         631152.5|
|2019-10-28 09:00:00|2091-10-28 09:30:00|         631152.5|
|2019-10-31 23:46:33|2029-11-01 00:13:00|87672.44083333333|
+-------------------+-------------------+-----------------+



                                                                                

## Question 5
The port for Spark’s User Interface is __4040__.


## Question 6
we get the data from zones from:

In [66]:
!wget https://github.com/DataTalksClub/nyc-tlc-data/releases/download/misc/taxi_zone_lookup.csv

--2024-03-04 21:36:24--  https://github.com/DataTalksClub/nyc-tlc-data/releases/download/misc/taxi_zone_lookup.csv
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/513814948/5a2cc2f5-b4cd-4584-9c62-a6ea97ed0e6a?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAVCODYLSA53PQK4ZA%2F20240304%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240304T213624Z&X-Amz-Expires=300&X-Amz-Signature=773aea8e960897981f7ae88550a7a9dc08fcff69fa26e40885d8889e9c49bca0&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=513814948&response-content-disposition=attachment%3B%20filename%3Dtaxi_zone_lookup.csv&response-content-type=application%2Foctet-stream [following]
--2024-03-04 21:36:24--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/513814948/5a2cc2f5-b4cd-4584-9c62-a6e

In [69]:
zones = spark.read.csv('taxi_zone_lookup.csv', header = "True")

In [70]:
zones.createOrReplaceTempView('zones')

In [71]:
zones.printSchema()

root
 |-- LocationID: string (nullable = true)
 |-- Borough: string (nullable = true)
 |-- Zone: string (nullable = true)
 |-- service_zone: string (nullable = true)



In [72]:
zones.show()

+----------+-------------+--------------------+------------+
|LocationID|      Borough|                Zone|service_zone|
+----------+-------------+--------------------+------------+
|         1|          EWR|      Newark Airport|         EWR|
|         2|       Queens|         Jamaica Bay|   Boro Zone|
|         3|        Bronx|Allerton/Pelham G...|   Boro Zone|
|         4|    Manhattan|       Alphabet City| Yellow Zone|
|         5|Staten Island|       Arden Heights|   Boro Zone|
|         6|Staten Island|Arrochar/Fort Wad...|   Boro Zone|
|         7|       Queens|             Astoria|   Boro Zone|
|         8|       Queens|        Astoria Park|   Boro Zone|
|         9|       Queens|          Auburndale|   Boro Zone|
|        10|       Queens|        Baisley Park|   Boro Zone|
|        11|     Brooklyn|          Bath Beach|   Boro Zone|
|        12|    Manhattan|        Battery Park| Yellow Zone|
|        13|    Manhattan|   Battery Park City| Yellow Zone|
|        14|     Brookly

In [78]:
spark.sql("""
SELECT 
    PUlocationID, zone, count(*) as count
FROM
    trips_hmw
INNER JOIN
    zones
ON 
    PULocationID == LocationID
GROUP BY 
    PUlocationID, Zone
ORDER BY
    count
LIMIT 7
""").show()

+------------+--------------------+-----+
|PUlocationID|                zone|count|
+------------+--------------------+-----+
|           2|         Jamaica Bay|    1|
|         105|Governor's Island...|    2|
|         111| Green-Wood Cemetery|    5|
|          30|       Broad Channel|    8|
|         120|     Highbridge Park|   14|
|          12|        Battery Park|   15|
|         207|Saint Michaels Ce...|   23|
+------------+--------------------+-----+

