In [29]:
import os
import numpy
import pandas

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping


In [2]:
# File Processing

In [3]:
# Walk the dataset directory tree and report every file that is found.
for dirname, _, filenames in os.walk('./dataset'):
    for filepath in (os.path.join(dirname, filename) for filename in filenames):
        print(f"Processing file: {filepath}")


Processing file: ./dataset\fhvhv_tripdata_2022-01.parquet
Processing file: ./dataset\fhvhv_tripdata_2022-02.parquet


In [4]:
# Loading Data

In [5]:
# Monthly trip DataFrames, ordered by year and then by file name.
# (Despite its name, `path_files` holds DataFrames, not paths; the name is kept
# because a later cell concatenates it.)
path_files = []

# List the directory once and ignore anything that is not a parquet file, so a
# stray file whose name happens to contain "_<year>-" cannot break read_parquet.
parquet_names = [
    name for name in os.listdir('./dataset')
    if name.lower().endswith('.parquet')
]

for year in range(2022, 2025):
    # Files are matched to a year by the "_<year>-" fragment in their name.
    year_files = sorted(name for name in parquet_names if f'_{year}-' in name)

    for file in year_files:
        path_files.append(pandas.read_parquet(os.path.join('./dataset', file)))

print(f"Loaded {len(path_files)} files.")

Loaded 2 files.


In [6]:
# Stack all loaded frames row-wise into one table with a fresh 0..n-1 index.
df = pandas.concat(path_files, axis=0, ignore_index=True)
total_rows = df.shape[0]
print(f"Total rows: {total_rows}")

Total rows: 30770874


In [7]:
# Show the available columns. A bare `df.columns` on a non-final line is
# evaluated and discarded, because a cell only displays its last expression.
print(df.columns.tolist())

# Summary statistics of the fare column (last expression, so it is displayed).
df['base_passenger_fare'].describe()

count    3.077087e+07
mean     2.041490e+01
std      1.598893e+01
min     -5.201100e+02
25%      1.029000e+01
50%      1.609000e+01
75%      2.502000e+01
max      4.995960e+03
Name: base_passenger_fare, dtype: float64

In [8]:
# Data Cleaning

In [9]:
# Keep only rows whose base fare is non-negative and strictly below 100.
fare_in_range = df['base_passenger_fare'].ge(0) & df['base_passenger_fare'].lt(100)
df = df[fare_in_range]
print(f"Rows after filtering by fare: {len(df)}")

Rows after filtering by fare: 30589303


In [10]:
# Keep only trips whose airport fee is exactly zero
# (rows with a missing airport_fee do not compare equal to 0 and are dropped too).
no_airport_fee = df['airport_fee'].eq(0)
df = df[no_airport_fee]
print(f"Rows after removing airport fee: {len(df)}")

Rows after removing airport fee: 28799598


In [11]:
# Columns carried forward into feature engineering and modelling.
selected_columns = [
    'hvfhs_license_num',
    'request_datetime',
    'pickup_datetime',
    'dropoff_datetime',
    'trip_miles',
    'trip_time',
    'base_passenger_fare',
    'tips',
]
df = df[selected_columns]
df.describe()

Unnamed: 0,request_datetime,pickup_datetime,dropoff_datetime,trip_miles,trip_time,base_passenger_fare,tips
count,28799598,28799598,28799598,28799600.0,28799600.0,28799600.0,28799600.0
mean,2022-02-01 05:06:40.889685,2022-02-01 05:11:13.478900,2022-02-01 05:27:50.942393,4.022582,1000.048,18.59486,0.7175391
min,2021-12-31 22:55:05,2022-01-01 00:00:00,2022-01-01 00:02:49,0.0,0.0,0.0,0.0
25%,2022-01-17 19:52:30,2022-01-17 19:56:07.250000,2022-01-17 20:10:11,1.48,548.0,9.97,0.0
50%,2022-02-02 08:48:51,2022-02-02 08:54:44,2022-02-02 09:16:13,2.633,845.0,15.37,0.0
75%,2022-02-15 14:59:04,2022-02-15 15:03:53,2022-02-15 15:22:44,5.045,1279.0,23.23,0.0
max,2022-03-01 00:10:00,2022-02-28 23:59:59,2022-03-01 01:38:23,361.113,99152.0,99.99,200.0
std,,,,4.030963,641.5795,11.76026,2.048696


In [12]:
# Feature Engineering

In [13]:
# Derive calendar features from the request timestamp:
# new column name -> attribute of the `.dt` accessor it is read from.
for feature_name, dt_attribute in (
    ('request_hour', 'hour'),
    ('request_day_of_week', 'dayofweek'),
):
    df[feature_name] = getattr(df['request_datetime'].dt, dt_attribute)

In [14]:
# Map each license identifier to an integer code, then discard the raw column.
encoder = LabelEncoder().fit(df['hvfhs_license_num'])
df['hvfhs_license_num_encoded'] = encoder.transform(df['hvfhs_license_num'])
df = df.drop(columns='hvfhs_license_num')

In [15]:
# Splitting Data (Train, Validation, Test)

In [16]:
# Day of month of each request, computed once and reused for all three masks.
# The split looks at the day of month only, so every loaded month contributes
# rows to each of the three partitions.
request_day = df['request_datetime'].dt.day

# Train: Days 1–20
train_data = df[request_day.le(20)]

# Validation: Days 21–25
validation_data = df[request_day.between(21, 25)]

# Test: Days 26–end of the month
test_data = df[request_day.ge(26)]

In [17]:
# Raw timestamps are not model inputs; remove them from every partition.
datetime_columns = ['request_datetime', 'pickup_datetime', 'dropoff_datetime']
train_data, validation_data, test_data = (
    partition.drop(columns=datetime_columns)
    for partition in (train_data, validation_data, test_data)
)

In [18]:
# Data Normalization

In [19]:
columns_to_scale = ['trip_miles', 'trip_time', 'base_passenger_fare', 'tips']

# Learn the scaling statistics from the training partition only.
scaler = StandardScaler().fit(train_data[columns_to_scale])


def _scaled_copy(partition):
    """Return a copy of `partition` with `columns_to_scale` standardized by `scaler`."""
    scaled = partition.copy()
    scaled[columns_to_scale] = scaler.transform(partition[columns_to_scale])
    return scaled


# Apply the same (train-fitted) transform to all three partitions.
train_data_scaled = _scaled_copy(train_data)
validation_data_scaled = _scaled_copy(validation_data)
test_data_scaled = _scaled_copy(test_data)

In [20]:
# Peek at the first five (unscaled) training rows.
print("Train Data Sample:")
print(train_data.head(5))

Train Data Sample:
   trip_miles  trip_time  base_passenger_fare  tips  request_hour  \
0        1.18        664                24.90   0.0             0   
1        0.82        460                11.97   0.0             0   
2        1.18        595                29.82   0.0             0   
3        1.65        303                 7.91   0.0             0   
4        1.65        461                 9.44   0.0             0   

   request_day_of_week  hvfhs_license_num_encoded  
0                    5                          0  
1                    5                          0  
2                    5                          0  
3                    5                          0  
4                    5                          0  


In [21]:
# Peek at the first two (unscaled) validation rows.
print("Validation Data Sample:")
print(validation_data.head(2))

Validation Data Sample:
         trip_miles  trip_time  base_passenger_fare  tips  request_hour  \
9149683        4.46        760                17.93   0.0             0   
9149807        2.38        611                12.08   0.0             0   

         request_day_of_week  hvfhs_license_num_encoded  
9149683                    4                          0  
9149807                    4                          0  


In [22]:
# Scale the full frame with the statistics already learned from the training
# partition. Calling `fit_transform` here would refit the shared `scaler` on
# every row (validation and test included), which both leaks their statistics
# and leaves `scaler` inconsistent with the already-scaled partitions.
# NOTE: this overwrites the columns in place, so re-running the cell scales
# them a second time.
df[columns_to_scale] = scaler.transform(df[columns_to_scale])


In [23]:
# Convert the pickup timestamp once and derive both calendar features from it.
# NOTE(review): these columns are added to `df` after the train/validation/test
# partitions were created, so they do not appear in those partitions; confirm
# whether they are still needed.
pickup_timestamps = pandas.to_datetime(df['pickup_datetime'])
df['hour'] = pickup_timestamps.dt.hour
df['day_of_week'] = pickup_timestamps.dt.dayofweek

In [24]:
target_column = 'base_passenger_fare'


def _features_and_target(partition):
    """Split a scaled partition into (feature frame, target series)."""
    return partition.drop(columns=[target_column]), partition[target_column]


# Extract features and targets for each partition
X_train, y_train = _features_and_target(train_data_scaled)
X_val, y_val = _features_and_target(validation_data_scaled)
X_test, y_test = _features_and_target(test_data_scaled)


In [25]:
# Define the model: three Dense -> BatchNormalization -> Dropout stages
# followed by a single linear output unit for regression.
hidden_layers = []
for units, dropout_rate in ((128, 0.3), (64, 0.3), (32, 0.2)):
    hidden_layers.extend([
        Dense(units, activation='relu'),
        BatchNormalization(),
        Dropout(dropout_rate),
    ])

model = Sequential([
    # Explicit input layer sized to the number of feature columns
    Input(shape=(X_train.shape[1],)),
    *hidden_layers,
    # Output layer: linear activation for regression
    Dense(1, activation='linear'),
])

# Compile the model: optimise mean squared error, also track mean absolute error
model.compile(optimizer='adam', loss='mse', metrics=['mae'])


In [None]:
# Stop once validation loss has not improved for 5 epochs and roll back to the
# best weights seen so far.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

training_options = {
    'validation_data': (X_val, y_val),
    'epochs': 20,
    'batch_size': 32,
    'callbacks': [early_stopping],
    'verbose': 1,
}
history = model.fit(X_train, y_train, **training_options)

Epoch 1/20
[1m604250/604250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1751s[0m 3ms/step - loss: 0.2444 - mae: 0.3203 - val_loss: 0.4241 - val_mae: 0.2553
Epoch 2/20
[1m604250/604250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1921s[0m 3ms/step - loss: 0.2266 - mae: 0.3103 - val_loss: 0.5476 - val_mae: 0.2697
Epoch 3/20
[1m604250/604250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2195s[0m 4ms/step - loss: 0.2258 - mae: 0.3099 - val_loss: 0.2837 - val_mae: 0.2793
Epoch 4/20
[1m604250/604250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2278s[0m 4ms/step - loss: 0.2243 - mae: 0.3091 - val_loss: 3.0885 - val_mae: 0.3094
Epoch 5/20
[1m604250/604250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2345s[0m 4ms/step - loss: 0.2242 - mae: 0.3090 - val_loss: 0.4354 - val_mae: 0.2764
Epoch 6/20
[1m604250/604250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2031s[0m 3ms/step - loss: 0.2240 - mae: 0.3088 - val_loss: 0.4771 - val_mae: 0.2900
Epoch 7/20
[1m604250/604250[0m 

In [None]:
# Predictions on the held-out test partition
y_pred = model.predict(X_test)

# Metrics in standardized units: the target column was scaled together with
# the features, so these numbers are multiples of the training-set fare
# standard deviation.
rmse = numpy.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

print("RMSE:", rmse)
print("MAE:", mae)

# Convert the errors back to the original fare units. StandardScaler divides by
# the population standard deviation (ddof=0) of the data it was fitted on, and
# error metrics are unaffected by the mean shift, so multiplying by the
# training-set standard deviation undoes the scaling. It is recomputed from the
# unscaled `train_data` rather than read from `scaler`, so the result does not
# depend on the scaler's current fitted state.
fare_std = train_data[target_column].std(ddof=0)
rmse_fare_units = rmse * fare_std
mae_fare_units = mae * fare_std

print("RMSE (original fare units):", rmse_fare_units)
print("MAE (original fare units):", mae_fare_units)

[1m139878/139878[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m146s[0m 1ms/step
RMSE: 0.6976405288220727
MAE: 0.38325424579034323
