# Lab Assignment 6

- Name - Aryan Gupta
- Roll No.- 230150003
- Date - 10 Sept., 2025
- Course - DA331 Big Data Analytics: Tools & Techniques

## Importing Libraries

In [9]:
import os, time, warnings
warnings.filterwarnings('ignore')

import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, to_timestamp, avg, sum as spark_sum, desc, row_number, round
from pyspark.sql.window import Window

spark = SparkSession.builder.appName("lab6").getOrCreate()

# read all the filenames available
all_files = []
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        all_files.append(os.path.join(dirname, filename))
    print(all_files)

[]
['/kaggle/input/nyc-yellow-taxi-trip-data/yellow_tripdata_2016-01.csv', '/kaggle/input/nyc-yellow-taxi-trip-data/yellow_tripdata_2016-03.csv', '/kaggle/input/nyc-yellow-taxi-trip-data/yellow_tripdata_2016-02.csv', '/kaggle/input/nyc-yellow-taxi-trip-data/yellow_tripdata_2015-01.csv']


## Load Data

In [3]:
# defining paths
TRIPS_DIR = '/kaggle/input/nyc-yellow-taxi-trip-data'

In [4]:
trips_df = spark.read.csv(TRIPS_DIR, header=True, inferSchema=True)

                                                                                

In [5]:
# basic details
n=trips_df.count()
print(trips_df.printSchema())
print(f'Number of rows: {n}')
print(len(trips_df.columns))
print(trips_df.show(5))

                                                                                

root
 |-- VendorID: string (nullable = true)
 |-- tpep_pickup_datetime: string (nullable = true)
 |-- tpep_dropoff_datetime: string (nullable = true)
 |-- passenger_count: string (nullable = true)
 |-- trip_distance: string (nullable = true)
 |-- pickup_longitude: string (nullable = true)
 |-- pickup_latitude: string (nullable = true)
 |-- RatecodeID: string (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- dropoff_longitude: string (nullable = true)
 |-- dropoff_latitude: string (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- fare_amount: string (nullable = true)
 |-- extra: string (nullable = true)
 |-- mta_tax: string (nullable = true)
 |-- tip_amount: string (nullable = true)
 |-- tolls_amount: string (nullable = true)
 |-- improvement_surcharge: string (nullable = true)
 |-- total_amount: string (nullable = true)

None
Number of rows: 47248845
19
+--------+--------------------+---------------------+---------------+-------------+-----------

47.2 million rows and 19 columns!!!!!!! Truly big data

## Pre-processing

In [6]:
# How accurate the longitudes and latitudes should be
PRECISION = 2

# rounding off coordinates for better estimate
# extracting date as well for q5
df = trips_df.withColumn("pu_lon_r", round(col("Pickup_longitude"), PRECISION)) \
           .withColumn("pu_lat_r", round(col("Pickup_latitude"), PRECISION)) \
           .withColumn("do_lon_r", round(col("Dropoff_longitude"), PRECISION)) \
           .withColumn("do_lat_r", round(col("Dropoff_latitude"), PRECISION)) \
           .withColumn("pickup_date", to_date(to_timestamp(col('tpep_pickup_datetime'))))

print('Preview rows:')
df.show(5, truncate=False)

Preview rows:
+--------+--------------------+---------------------+---------------+-------------+-------------------+------------------+----------+------------------+-------------------+------------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------+--------+--------+--------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|pickup_longitude   |pickup_latitude   |RatecodeID|store_and_fwd_flag|dropoff_longitude  |dropoff_latitude  |payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|pu_lon_r|pu_lat_r|do_lon_r|do_lat_r|pickup_date|
+--------+--------------------+---------------------+---------------+-------------+-------------------+------------------+----------+------------------+-------------------+------------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------+--------+---

In [7]:
# Removing invalid trips
trips_valid = df.filter((col('trip_distance') > 0) & (col('passenger_count') > 0) & col('pu_lat_r').isNotNull() & col('pu_lon_r').isNotNull() & col('do_lon_r').isNotNull() & col('do_lat_r').isNotNull())

n = trips_valid.count()
print(f'New number of rows: {n}')



New number of rows: 35802455


                                                                                

## Q1 Biggest average trip distance by pickup (PySpark)

In [8]:
start = time.perf_counter()

avg_by_pu = trips_valid.groupBy('pu_lat_r', 'pu_lon_r') \
    .agg(avg(col('trip_distance')).alias('avg_trip_distance'))

q1 = avg_by_pu.orderBy(desc('avg_trip_distance')).limit(20)

end = time.perf_counter()
q1_time = end - start
print(f'Q1 (PySpark) execution time: {q1_time:.2f}s')
print('Top 20 pickup locations by avg trip distance:')
q1.show()

Q1 (PySpark) execution time: 0.10s
Top 20 pickup locations by avg trip distance:




+--------+--------+------------------+
|pu_lat_r|pu_lon_r| avg_trip_distance|
+--------+--------+------------------+
|   40.75|  -73.95|1476.0947933015127|
|   40.75|   -73.9| 318.6500810997923|
|   40.74|  -73.95|224.68512795883365|
|   40.75|  -74.43|             191.2|
|    39.8|  -75.46|             139.7|
|  -77.04|    38.9|            129.46|
|   39.92|  -75.24|             121.6|
|   40.74|  -73.93|120.06613164054187|
|   40.18|  -74.25|            117.83|
|   40.85|  -73.63|             102.7|
|   39.97|  -74.58|              94.8|
|   41.42|  -73.96|              94.2|
|   40.81|  -74.17|              92.2|
|   40.83|  -73.08|              90.1|
|   40.92|  -72.67|             87.18|
|   40.35|  -74.31|              83.2|
|   40.82|  -73.95| 82.86335495378624|
|   41.32|  -72.93|              80.9|
|    41.5|   -74.1|              78.6|
|   40.54|  -74.29|              75.6|
+--------+--------+------------------+



                                                                                

## Q2 Biggest average trip distance by pickup (Pandas)

In [12]:
import pandas as pd

In [13]:
# read data using pandas
li=[]
for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

df_pd = pd.concat(li, axis=0, ignore_index=True)

In [15]:
t0 = time.perf_counter()

df_pd['pu_lat_r'] = df_pd['pickup_latitude'].round(PRECISION)
df_pd['pu_lon_r'] = df_pd['pickup_longitude'].round(PRECISION)

out = df_pd.groupby(['pu_lat_r', 'pu_lon_r'])['trip_distance'].mean().reset_index(name='avg_trip_distance')
out = out.sort_values('avg_trip_distance', ascending=False).head(20)

t1 = time.perf_counter()
q2_time = t1-t0

print(f'Q2 (Pandas grouping) execution time: {q2_time:.2f}s')
print('Top 20 pickup locations by avg trip distance:')
print(out)

Q2 (Pandas grouping) execution time: 6.46s
Top 20 pickup locations by avg trip distance:
      pu_lat_r  pu_lon_r  avg_trip_distance
2233     40.75    -73.95         662.422200
2238     40.75    -73.90         236.722246
2143     40.74    -73.95         169.562411
283      39.80    -75.46         139.700000
296      39.92    -75.24         121.600000
346      40.18    -74.25         117.830000
2145     40.74    -73.93          98.222083
2195     40.75    -74.43          95.600000
308      39.97    -74.58          94.800000
4484     41.42    -73.96          94.200000
3553     40.92    -72.67          87.180000
4451     41.32    -72.93          80.900000
4497     41.50    -74.10          78.600000
309      39.98    -75.14          73.790000
278      39.75    -75.61          72.500000
4505     41.53    -73.89          71.300000
335      40.14    -75.11          67.900000
628      40.50    -74.87          66.000000
2879     40.82    -73.95          65.636407
347      40.18    -74.02       

### Comparing Q1 and Q2 times

In [17]:
print(f'Q1 execution time: {q1_time:.2f}s')
print(f'Q2 execution time: {q2_time:.2f}s')
print(f'Q1 is {-(q1_time-q2_time):.2f}s faster than Q2')

Q1 execution time: 0.10s
Q2 execution time: 6.46s
Q1 is 6.36s faster than Q2


## Q3 Total passengers by Drop-off location

In [18]:
pass_by_do = trips_valid.groupBy('do_lat_r', 'do_lon_r').agg(spark_sum(col('passenger_count')).alias('total_passengers_arrived'))
q3 = pass_by_do.orderBy(desc('total_passengers_arrived')).limit(20)
q3.show()



+--------+--------+------------------------+
|do_lat_r|do_lon_r|total_passengers_arrived|
+--------+--------+------------------------+
|   40.75|  -73.99|               2975666.0|
|   40.76|  -73.98|               2949878.0|
|   40.76|  -73.97|               2937406.0|
|   40.75|  -73.98|               2541824.0|
|   40.76|  -73.99|               2097084.0|
|   40.74|  -73.99|               2057248.0|
|   40.77|  -73.96|               1766842.0|
|   40.73|  -73.99|               1584650.0|
|   40.74|  -73.98|               1571133.0|
|   40.77|  -73.98|               1546508.0|
|   40.78|  -73.95|               1495322.0|
|   40.73|   -74.0|               1446497.0|
|   40.74|   -74.0|               1368254.0|
|   40.78|  -73.98|               1362509.0|
|   40.78|  -73.96|               1346641.0|
|   40.75|  -73.97|               1260662.0|
|   40.71|  -74.01|               1196425.0|
|   40.72|   -74.0|               1124691.0|
|   40.77|  -73.95|               1088236.0|
|   40.72|

                                                                                

## Q4 Total passengers by Pickup location

In [19]:
pass_by_pu = trips_valid.groupBy('pu_lat_r', 'pu_lon_r').agg(spark_sum(col('passenger_count')).alias('total_passengers_started'))
q4 = pass_by_pu.orderBy(desc('total_passengers_started')).limit(20)
q4.show()



+--------+--------+------------------------+
|pu_lat_r|pu_lon_r|total_passengers_started|
+--------+--------+------------------------+
|   40.75|  -73.99|               3592782.0|
|   40.76|  -73.97|               3557477.0|
|   40.76|  -73.98|               3140117.0|
|   40.75|  -73.98|               2764544.0|
|   40.76|  -73.99|               2606077.0|
|   40.74|  -73.99|               2601030.0|
|   40.73|  -73.99|               2124200.0|
|   40.77|  -73.98|               1923318.0|
|   40.73|   -74.0|               1866839.0|
|   40.77|  -73.96|               1825984.0|
|   40.74|   -74.0|               1679518.0|
|   40.74|  -73.98|               1654037.0|
|   40.72|  -73.99|               1494092.0|
|   40.78|  -73.96|               1446796.0|
|   40.78|  -73.95|               1436291.0|
|   40.78|  -73.98|               1427774.0|
|   40.75|  -73.97|               1324093.0|
|   40.72|   -74.0|               1225956.0|
|   40.71|  -74.01|               1118059.0|
|   40.77|

                                                                                

## Q5 Top total passengers number by Pickup location daily

In [20]:
daily_pu = trips_valid.groupBy('pickup_date', 'pu_lat_r', 'pu_lon_r').agg(spark_sum(col('passenger_count')).alias('total_passengers'))
daily_ranked = daily_pu.orderBy(desc('total_passengers'))
q5 = daily_ranked.limit(20)
q5.show()



+-----------+--------+--------+----------------+
|pickup_date|pu_lat_r|pu_lon_r|total_passengers|
+-----------+--------+--------+----------------+
| 2015-01-22|   40.76|  -73.97|         38719.0|
| 2015-01-14|   40.76|  -73.97|         38169.0|
| 2015-01-15|   40.76|  -73.97|         38094.0|
| 2015-01-13|   40.76|  -73.97|         37325.0|
| 2015-01-16|   40.76|  -73.97|         36829.0|
| 2015-01-31|   40.75|  -73.99|         36732.0|
| 2015-01-10|   40.75|  -73.99|         36654.0|
| 2015-01-08|   40.76|  -73.97|         36320.0|
| 2015-01-23|   40.75|  -73.99|         36000.0|
| 2015-01-16|   40.75|  -73.99|         35930.0|
| 2015-01-17|   40.75|  -73.99|         35865.0|
| 2015-01-21|   40.76|  -73.97|         35843.0|
| 2015-01-29|   40.76|  -73.97|         35588.0|
| 2015-01-23|   40.76|  -73.97|         35399.0|
| 2016-03-05|   40.75|  -73.99|         35205.0|
| 2015-01-30|   40.76|  -73.97|         35140.0|
| 2015-01-10|   40.76|  -73.97|         35065.0|
| 2015-01-09|   40.7

                                                                                