In [1]:
import os

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit

# Create (or reuse) the Spark session shared by every cell in this notebook.
spark = (
    SparkSession.builder.appName("Taxi vs Rideshare Profitability")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .getOrCreate()
)

# Months to load, formatted the same way as the suffix of the parquet file names.
months = [
    "2024-01",
    "2024-02",
    "2024-03",
    "2024-04",
    "2024-05",
    "2024-06",
]

# One parquet file per month for each service.
yellow_files = [f"data/yellow/yellow_tripdata_{month}.parquet" for month in months]
fhvhv_files = [f"data/fhvhv/fhvhv_tripdata_{month}.parquet" for month in months]

# Tag every row with its originating service so the two sources stay
# distinguishable after they are stacked into a single frame.
df_yellow = spark.read.parquet(*yellow_files).withColumn("service_type", lit("yellow"))
df_fhvhv = spark.read.parquet(*fhvhv_files).withColumn("service_type", lit("hv_fhv"))

# Stack both services by column name; a column present in only one source is
# filled with nulls for rows coming from the other.
df = df_yellow.unionByName(df_fhvhv, allowMissingColumns=True)

# Uncomment to inspect the schema of either source:
# df_yellow.printSchema()
# df_fhvhv.printSchema()

# External reference tables: first CSV row is the header, column types are inferred.
electricity = spark.read.options(header=True, inferSchema=True).csv("data/external/electricity.csv")
fuel = spark.read.options(header=True, inferSchema=True).csv("data/external/fuel.csv")
# electricity.show(); fuel.show()



Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/08/13 11:35:38 WARN Utils: Your hostname, LAPTOP-E04ANIN1, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/08/13 11:35:38 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/13 11:35:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

In [None]:
# Preprocess data: derive per-trip expense, revenue and profitability columns.
#
# Columns are referenced by name with `col(...)` rather than `df[...]`. A
# `col(...)` reference is resolved against the frame that each `withColumn` is
# applied to, so a step can read a column created earlier in the same chain.
# `df["expenses"]` (and likewise "revenue" / "profitability") cannot be
# resolved, because those columns do not exist on `df` until the finished
# chain has been assigned back to it.
#
# NOTE(review): `cost` and `trip_time_in_secs` are not created in any cell
# visible here -- confirm both exist on `df` before this cell runs.
df = (
    df
    # Merge some of the expenses into a single per-trip total.
    .withColumn(
        "expenses",
        col("tolls_amount")
        + col("mta_tax")
        + col("tip_amount")
        + col("extra")
        + col("improvement_surcharge"),
    )
    .withColumn("revenue", col("total_amount") - col("expenses"))
    .withColumn("profitability", col("revenue") - col("cost"))
    .withColumn("cost_per_mile", col("cost") / col("trip_distance"))
    # Seconds are converted to minutes the same way for both per-minute
    # columns so the two stay directly comparable.
    .withColumn("cost_per_minute", col("cost") / (col("trip_time_in_secs") / 60))
    .withColumn("profitability_per_mile", col("profitability") / col("trip_distance"))
    .withColumn(
        "profitability_per_minute",
        col("profitability") / (col("trip_time_in_secs") / 60),
    )
)















