In [None]:
import pandas as pd
import numpy as np

# Load & clean trip data
# High Volume FHV trip data

# The load below is stubbed out in the committed notebook: un-stub it before
# running, and re-stub it before pushing to main.
# df = pd.read_csv('/workspaces/MSE446-Optimization-Ride-Pooling-For-Emissions-Reduction/.gitignore/2023_High_Volume_FHV_Trip_Data_20260227.csv')

# Only these columns are used downstream; everything else is discarded.
trip_columns = [
    "pickup_datetime", "dropoff_datetime",
    "PULocationID", "DOLocationID",
    "trip_miles", "trip_time", "shared_match_flag"
]
# Explicit copy so the column assignments below never target a view of the
# raw frame.
df = df[trip_columns].copy()

# Convert data types to appropriate formats; unparseable values become NaT/NaN
# (errors="coerce") and are removed by the dropna below.

df["pickup_datetime"] = pd.to_datetime(
    df["pickup_datetime"], format="%m/%d/%Y %I:%M:%S %p", errors="coerce"
)
df["dropoff_datetime"] = pd.to_datetime(
    df["dropoff_datetime"], format="%m/%d/%Y %I:%M:%S %p", errors="coerce"
)

# The export writes numbers with thousands separators (e.g. "2,241").
# pd.to_numeric cannot parse those, so with errors="coerce" every value of
# 1,000 or more would turn into NaN and the trip would be silently dropped.
# Strip the separators before converting.
df["trip_time"] = pd.to_numeric(
    df["trip_time"].astype(str).str.replace(",", "", regex=False),
    errors="coerce",
)
df["trip_miles"] = pd.to_numeric(
    df["trip_miles"].astype(str).str.replace(",", "", regex=False),
    errors="coerce",
)

# Remove rows with missing or invalid data

df = df.dropna(subset=["pickup_datetime", "dropoff_datetime", "trip_miles", "trip_time", "PULocationID", "DOLocationID"])
# A usable trip has positive distance, positive duration, and a drop-off
# strictly after its pickup.
df = df[df["trip_miles"] > 0]
df = df[df["trip_time"]  > 0]
df = df[df["dropoff_datetime"] > df["pickup_datetime"]]

# Load & clean EPA data
epa_columns = ["VClass", "comb08", "co2TailpipeGpm"]
epa_df = pd.read_csv(
    '/workspaces/MSE446-Optimization-Ride-Pooling-For-Emissions-Reduction/.gitignore/vehicles.csv',
    low_memory=False,
)

# Keep only the columns of interest, discard rows with any missing value or a
# non-positive CO2 / MPG figure, then switch to descriptive column names.
epa_df = (
    epa_df[epa_columns]
    .dropna()
    .loc[lambda frame: (frame["co2TailpipeGpm"] > 0) & (frame["comb08"] > 0)]
    .rename(
        columns={
            "VClass": "vehicle_class",
            "comb08": "comb_mpg",
            "co2TailpipeGpm": "co2_g_per_mile",
        }
    )
)

# Summary of the cleaned trip data: row count, first rows, and column dtypes.
print("Trip Data:")
print(f"Rows: {len(df):,}")
print(df.head())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
df.info()



# Summary of the cleaned EPA data: row count, first rows, and column dtypes.
print("EPA Data:")
print(f"Rows: {len(epa_df):,}")
print(epa_df.head())
epa_df.info()


          pickup_datetime        dropoff_datetime  PULocationID  DOLocationID  \
0  01/01/2023 12:07:15 AM  01/01/2023 12:44:36 AM            10            62   
1  01/01/2023 12:41:26 AM  01/01/2023 01:09:55 AM           210            39   
2  01/01/2023 12:49:39 AM  01/01/2023 01:28:08 AM           155           177   
3  01/01/2023 12:56:59 AM  01/01/2023 01:18:01 AM            91            35   
4  01/01/2023 12:44:00 AM  01/01/2023 01:03:58 AM            21           181   

   trip_miles trip_time shared_match_flag  
0      11.589     2,241                 Y  
1       4.699     1,709                 Y  
2       7.008     2,309                 Y  
3       3.974     1,262                 Y  
4       5.207     1,198                 Y  
<class 'pandas.DataFrame'>
RangeIndex: 2064852 entries, 0 to 2064851
Data columns (total 7 columns):
 #   Column             Dtype  
---  ------             -----  
 0   pickup_datetime    str    
 1   dropoff_datetime   str    
 2   PULocationID   

  epa_df = pd.read_csv('/workspaces/MSE446-Optimization-Ride-Pooling-For-Emissions-Reduction/vehicles.csv')
