In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.3.tar.gz (317.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 317.3/317.3 MB 4.5 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.5.3-py2.py3-none-any.whl size=317840625 sha256=4ca436cf00f65d28eb58eb472fa03086da17f81fb5977ae5d6b79f89c517821c
  Stored in directory: /root/.cache/pip/wheels/1b/3a/92/28b93e2fbfdbb07509ca4d6f50c5e407f48dce4ddbda69a4ab
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.3


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import FloatType

# Obtain the Spark session for this notebook (getOrCreate reuses an active one)
spark = (
    SparkSession.builder
    .appName("NYCTaxiEmissions")
    .getOrCreate()
)

# Read the NYC taxi trip records from Parquet into a Spark DataFrame
df = spark.read.parquet("nyc_taxi_data.parquet")

# Define UDF for calculating CO2 emissions
def calculate_emissions(distance, *, miles_per_gallon=22, kg_co2_per_gallon=8.89):
    """Estimate the CO2 emitted over a trip of the given distance.

    Args:
        distance: Trip distance in miles, or None when the value is missing.
        miles_per_gallon: Assumed fuel economy (keyword-only); defaults to 22.
        kg_co2_per_gallon: Assumed kg of CO2 released per gallon of fuel
            (keyword-only); defaults to 8.89.

    Returns:
        Estimated emissions in kg of CO2, or None when ``distance`` is None.
    """
    # A null column value reaches a Python UDF as None; dividing it would raise
    # TypeError and abort the whole Spark job, so propagate the null instead.
    if distance is None:
        return None
    fuel_consumed = distance / miles_per_gallon
    return fuel_consumed * kg_co2_per_gallon

# Wrap the Python function as a Spark UDF whose result column is FloatType
calculate_emissions_udf = udf(calculate_emissions, FloatType())

# Append a per-trip "co2_emissions" column derived from the trip distance
df_with_emissions = df.withColumn(
    "co2_emissions",
    calculate_emissions_udf(col("trip_distance")),
)

# Preview the trips together with their computed emissions
df_with_emissions.show()

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+-------------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|Airport_fee|co2_emissions|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+-------------+
|       2| 2024-01-01 00:57:55|  2024-01-01 01:17:43|              1|         1.72|         1|                 N|         186|          79|      

In [None]:
# Load the emissions CSV written by the previous step, parse the pickup dates,
# and put the rows in chronological order
df = (
    pd.read_csv("emissions_by_passenger_count.csv")
    .assign(pickup_date=lambda frame: pd.to_datetime(frame["pickup_date"]))
    .sort_values("pickup_date")
)

# Draw both passenger groups on a single set of axes
fig, ax = plt.subplots(figsize=(15, 8))
ax.plot(df["pickup_date"], df["emissions_1_2"], label="1-2 Passengers")
ax.plot(df["pickup_date"], df["emissions_3plus"], label="3+ Passengers")

ax.set_title("Total CO2 Emissions per Day: 1-2 Passengers vs 3+ Passengers")
ax.set_xlabel("Date")
ax.set_ylabel("CO2 Emissions (kg)")
ax.legend()

# Slant and align the date tick labels so they stay readable
fig.autofmt_xdate()

# Trim excess padding around the axes
fig.tight_layout()

# Write the figure to disk, then render it in the notebook
fig.savefig("emissions_comparison.png")
plt.show()
