In [1]:
# Parsing Data

In [2]:
import os
import pandas

In [3]:
for dirname, _, filenames in os.walk('./dataset'):
    for filename in filenames:
        filepath = os.path.join(dirname, filename)
        print(f"Processing file: {filepath}")


Processing file: ./dataset/fhvhv_tripdata_2022-01.parquet
Processing file: ./dataset/fhvhv_tripdata_2022-02.parquet


In [4]:
path_files = []

for year in range(2020, 2025):
    year_files = [files for files in os.listdir('./dataset') if f'_{year}-' in files]
    year_files.sort() 
    
    for file in year_files:
        path_files.append(pandas.read_parquet('./dataset/' + file))
print(path_files)

[         hvfhs_license_num dispatching_base_num originating_base_num  \
0                   HV0003               B03404               B03404   
1                   HV0003               B03404               B03404   
2                   HV0003               B03404               B03404   
3                   HV0003               B03404               B03404   
4                   HV0003               B03404               B03404   
...                    ...                  ...                  ...   
14751586            HV0003               B03404               B03404   
14751587            HV0003               B03404               B03404   
14751588            HV0003               B03404               B03404   
14751589            HV0003               B03404               B03404   
14751590            HV0003               B03404               B03404   

            request_datetime   on_scene_datetime     pickup_datetime  \
0        2022-01-01 00:05:31 2022-01-01 00:05:40 2022-01-01 00

In [5]:
df = pandas.concat(path_files, ignore_index=True)
print(f"Total rows: {len(df)}")

Total rows: 30770874


In [6]:
df.columns

Index(['hvfhs_license_num', 'dispatching_base_num', 'originating_base_num',
       'request_datetime', 'on_scene_datetime', 'pickup_datetime',
       'dropoff_datetime', 'PULocationID', 'DOLocationID', 'trip_miles',
       'trip_time', 'base_passenger_fare', 'tolls', 'bcf', 'sales_tax',
       'congestion_surcharge', 'airport_fee', 'tips', 'driver_pay',
       'shared_request_flag', 'shared_match_flag', 'access_a_ride_flag',
       'wav_request_flag', 'wav_match_flag'],
      dtype='object')

In [7]:
df['base_passenger_fare'].describe()

count    3.077087e+07
mean     2.041490e+01
std      1.598893e+01
min     -5.201100e+02
25%      1.029000e+01
50%      1.609000e+01
75%      2.502000e+01
max      4.995960e+03
Name: base_passenger_fare, dtype: float64

In [8]:
df['tolls'].describe()

count    3.077087e+07
mean     9.472340e-01
std      3.503236e+00
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      5.045800e+02
Name: tolls, dtype: float64

In [9]:
df = df[(df['base_passenger_fare'] >= 0) & (df['base_passenger_fare'] < 100)]

In [10]:
print(f"Total rows: {len(df)}")

Total rows: 30589303


In [11]:
df = df[(df['airport_fee'] == 0)]

In [12]:
print(f"Total rows: {len(df)}")

Total rows: 28799598


In [13]:
df = df[['hvfhs_license_num', 'request_datetime', 'pickup_datetime','dropoff_datetime','trip_miles','trip_time', 'base_passenger_fare', 'tips']]

In [14]:
df.describe()

Unnamed: 0,request_datetime,pickup_datetime,dropoff_datetime,trip_miles,trip_time,base_passenger_fare,tips
count,28799598,28799598,28799598,28799600.0,28799600.0,28799600.0,28799600.0
mean,2022-02-01 05:06:40.889685,2022-02-01 05:11:13.478900,2022-02-01 05:27:50.942393,4.022582,1000.048,18.59486,0.7175391
min,2021-12-31 22:55:05,2022-01-01 00:00:00,2022-01-01 00:02:49,0.0,0.0,0.0,0.0
25%,2022-01-17 19:52:30,2022-01-17 19:56:07.250000,2022-01-17 20:10:11,1.48,548.0,9.97,0.0
50%,2022-02-02 08:48:51,2022-02-02 08:54:44,2022-02-02 09:16:13,2.633,845.0,15.37,0.0
75%,2022-02-15 14:59:04,2022-02-15 15:03:53,2022-02-15 15:22:44,5.045,1279.0,23.23,0.0
max,2022-03-01 00:10:00,2022-02-28 23:59:59,2022-03-01 01:38:23,361.113,99152.0,99.99,200.0
std,,,,4.030963,641.5795,11.76026,2.048696


In [15]:
df.isnull().sum()

hvfhs_license_num      0
request_datetime       0
pickup_datetime        0
dropoff_datetime       0
trip_miles             0
trip_time              0
base_passenger_fare    0
tips                   0
dtype: int64

In [16]:
df['request_hour'] = df['request_datetime'].dt.hour
df['request_day_of_week'] = df['request_datetime'].dt.dayofweek

In [17]:
from sklearn.preprocessing import LabelEncoder

# Initialize the encoder
encoder = LabelEncoder()

# Apply the encoder to the 'hvfhs_license_num' column
df['hvfhs_license_num_encoded'] = encoder.fit_transform(df['hvfhs_license_num'])
df = df.drop(columns=['hvfhs_license_num'])


In [18]:
train_data = df[(df['request_datetime'].dt.day >= 1) & (df['request_datetime'].dt.day <= 20)]

# Validation: Days 21–25
validation_data = df[(df['request_datetime'].dt.day >= 21) & (df['request_datetime'].dt.day <= 25)]

# Test: Days 26–end of the month
test_data = df[(df['request_datetime'].dt.day >= 26)]

In [19]:
train_data = train_data.drop(columns=['request_datetime', 'pickup_datetime', 'dropoff_datetime'])
validation_data = validation_data.drop(columns=['request_datetime', 'pickup_datetime', 'dropoff_datetime'])
test_data = test_data.drop(columns=['request_datetime', 'pickup_datetime', 'dropoff_datetime'])

In [60]:
scaler = StandardScaler()
train_data = scaler.fit_transform(train_data)
validation_data = scaler.transform(validation_data)
test_data = scaler.transform(test_data)

In [20]:
train_data.head(20)

Unnamed: 0,trip_miles,trip_time,base_passenger_fare,tips,request_hour,request_day_of_week,hvfhs_license_num_encoded
0,1.18,664,24.9,0.0,0,5,0
1,0.82,460,11.97,0.0,0,5,0
2,1.18,595,29.82,0.0,0,5,0
3,1.65,303,7.91,0.0,0,5,0
4,1.65,461,9.44,0.0,0,5,0
5,4.51,762,17.67,0.0,0,5,0
6,3.68,931,16.68,0.0,0,5,0
7,2.77,843,14.41,4.0,0,5,0
8,2.04,710,10.64,0.0,0,5,0
10,11.29,1387,34.9,0.0,0,5,0


In [21]:
validation_data.head()

Unnamed: 0,trip_miles,trip_time,base_passenger_fare,tips,request_hour,request_day_of_week,hvfhs_license_num_encoded
9149683,4.46,760,17.93,0.0,0,4,0
9149807,2.38,611,12.08,0.0,0,4,0
9150931,14.47,2193,43.78,0.0,0,4,0
9154127,1.38,583,8.95,0.0,0,4,0
9154380,9.47,995,27.7,0.0,0,4,0


In [22]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df[['trip_miles', 'trip_time', 'base_passenger_fare', 'tips']] = scaler.fit_transform(
    df[['trip_miles', 'trip_time', 'base_passenger_fare', 'tips']]
)


In [23]:
df['hour'] = pandas.to_datetime(df['pickup_datetime']).dt.hour
df['day_of_week'] = pandas.to_datetime(df['pickup_datetime']).dt.dayofweek

In [24]:
from sklearn.model_selection import train_test_split

# Define target variable (y) and features (X)
target_column = 'base_passenger_fare'

# Training data
X_train = train_data.drop(columns=[target_column])  # Drop target column to get features
y_train = train_data[target_column]  # Extract target column

# Validation data
X_val = validation_data.drop(columns=[target_column])
y_val = validation_data[target_column]

# Test data
X_test = test_data.drop(columns=[target_column])
y_test = test_data[target_column]


In [25]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization

# Define the model
model = Sequential([
    # Explicit Input Layer
    Input(shape=(X_train.shape[1],)),

    # First Hidden Layer
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),

    # Second Hidden Layer
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),

    # Third Hidden Layer
    Dense(32, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),

    # Output Layer
    Dense(1, activation='linear')  # Linear activation for regression
])

# Compile the model
model.compile(optimizer='adam', loss='mse', metrics=['mae'])


In [26]:
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=100,  # Increase number of epochs
    batch_size=32,
    callbacks=[early_stopping],
    verbose=1
)

Epoch 1/2
[1m604250/604250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m511s[0m 844us/step - loss: 38.8143 - mae: 3.9546 - val_loss: 37.9561 - val_mae: 3.8230
Epoch 2/2
[1m604250/604250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m513s[0m 848us/step - loss: 34.0770 - mae: 3.7278 - val_loss: 34.9889 - val_mae: 3.6531


In [27]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

# Predictions
y_pred = model.predict(X_test)

# Metrics
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

print("RMSE:", rmse)
print("MAE:", mae)


[1m139878/139878[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 339us/step
RMSE: 8.482271840489252
MAE: 5.028087445398579
