In [None]:
import os
import pandas

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping


In [None]:
# File Processing

In [None]:
# Walk the dataset directory tree and echo the path of every file found.
for root, _subdirs, names in os.walk('./dataset'):
    for name in names:
        print(f"Processing file: {os.path.join(root, name)}")


In [None]:
# Loading Data

In [None]:
# Read every parquet file whose name contains "_<year>-" for 2022 through 2024.
# Files are loaded year by year, and in sorted filename order within each year.
# NOTE: despite its name, `path_files` holds the loaded DataFrames, not paths.
path_files = [
    pandas.read_parquet('./dataset/' + name)
    for yr in range(2022, 2025)
    for name in sorted(
        entry for entry in os.listdir('./dataset') if f'_{yr}-' in entry
    )
]

print(f"Loaded {len(path_files)} files.")

In [None]:
# Stack all loaded frames row-wise into one frame with a fresh 0..n-1 index.
df = pandas.concat(objs=path_files, ignore_index=True)
total_rows = len(df)
print(f"Total rows: {total_rows}")

In [None]:
# Quick look at the raw schema and two of the money columns.
# Each result is printed explicitly: a notebook cell only echoes its final
# expression, so bare `df.columns` / `describe()` lines above the last one
# would produce no output at all.
print(df.columns)
print(df['base_passenger_fare'].describe())
print(df['tolls'].describe())

In [None]:
# Data Cleaning

In [None]:
# Keep only trips whose base fare lies in the half-open range [0, 100).
fare = df['base_passenger_fare']
fare_in_range = (fare < 100) & (fare >= 0)
df = df[fare_in_range]
print(f"Rows after filtering by fare: {len(df)}")

In [None]:
# Keep only trips with an airport fee of exactly 0 (rows where the fee is
# missing compare unequal to 0 and are therefore dropped as well).
no_airport_fee = df['airport_fee'] == 0
df = df[no_airport_fee]
print(f"Rows after removing airport fee: {len(df)}")

In [None]:
# Narrow the frame to the columns used from here on: the license identifier,
# the three timestamps, trip distance/duration, the fare (target) and tips.
kept_columns = [
    'hvfhs_license_num',
    'request_datetime',
    'pickup_datetime',
    'dropoff_datetime',
    'trip_miles',
    'trip_time',
    'base_passenger_fare',
    'tips',
]
df = df[kept_columns]
print(df.describe())

In [None]:
# Feature Engineering

In [None]:
# Derive calendar features from the request timestamp.
# `assign` returns a new frame rather than writing columns into `df`, which at
# this point is the result of boolean filtering and column selection. That
# avoids pandas' SettingWithCopyWarning and keeps the cell safe to re-run.
request_ts = df['request_datetime']
df = df.assign(
    request_hour=request_ts.dt.hour,
    request_day_of_week=request_ts.dt.dayofweek,  # Monday=0 ... Sunday=6
)

In [None]:
# Map each distinct license string to an integer code, then discard the
# original string column so only the encoded version remains.
license_column = 'hvfhs_license_num'
encoder = LabelEncoder().fit(df[license_column])
df[license_column + '_encoded'] = encoder.transform(df[license_column])
df = df.drop(columns=[license_column])

In [None]:
# Splitting Data (Train, Validation, Test)

In [None]:
# Split on the day of the month of the request, applied to every month alike.
request_day = df['request_datetime'].dt.day

# Train: Days 1–20
train_data = df[request_day <= 20]

# Validation: Days 21–25
validation_data = df[(request_day >= 21) & (request_day <= 25)]

# Test: Days 26–end of the month
test_data = df[request_day >= 26]

In [None]:
# The raw timestamps were only needed to build features and to split the data;
# remove them from each split so only numeric columns remain.
datetime_columns = ['request_datetime', 'pickup_datetime', 'dropoff_datetime']
train_data, validation_data, test_data = (
    split.drop(columns=datetime_columns)
    for split in (train_data, validation_data, test_data)
)

In [None]:
# Data Normalization

In [None]:
scaler = StandardScaler()
train_data = scaler.fit_transform(train_data)
validation_data = scaler.transform(validation_data)
test_data = scaler.transform(test_data)

In [None]:
# Show the first five rows of the scaled training split under a heading.
print("Train Data Sample:", train_data[:5], sep="\n")

In [None]:
# Show the first two rows of the scaled validation split under a heading.
print("Validation Data Sample:", validation_data[:2], sep="\n")

In [None]:
# Standardize the continuous columns of the full frame `df`.
#
# A dedicated scaler is used instead of re-fitting the shared `scaler`, which
# would discard the statistics it learned from the training split. It is
# fitted on the training rows only (request day <= 20, mirroring the
# train/validation/test split) so validation and test rows do not influence
# the scaling statistics.
# NOTE: this cell rescales `df` in place, so running it twice scales twice.
numeric_columns = ['trip_miles', 'trip_time', 'base_passenger_fare', 'tips']
is_train_row = df['request_datetime'].dt.day <= 20

df_scaler = StandardScaler().fit(df.loc[is_train_row, numeric_columns])
df[numeric_columns] = df_scaler.transform(df[numeric_columns])


In [None]:
# Add hour-of-day and day-of-week columns derived from the pickup timestamp.
# The datetime conversion is done once and reused for both columns.
pickup_ts = pandas.to_datetime(df['pickup_datetime'])
df['hour'] = pickup_ts.dt.hour
df['day_of_week'] = pickup_ts.dt.dayofweek

In [None]:
target_column = 'base_passenger_fare'


def _features_and_target(frame):
    """Return (all columns except the target, the target column) of `frame`."""
    return frame.drop(columns=[target_column]), frame[target_column]


# Separate inputs (X) from the fare target (y) for each split.
X_train, y_train = _features_and_target(train_data)
X_val, y_val = _features_and_target(validation_data)
X_test, y_test = _features_and_target(test_data)


In [None]:
# Define the model
# Feed-forward regression network, assembled layer by layer.
model = Sequential()

# Explicit input: one value per feature column of X_train.
model.add(Input(shape=(X_train.shape[1],)))

# Three hidden stages of Dense(ReLU) -> BatchNormalization -> Dropout,
# narrowing from 128 to 32 units.
for units, drop_rate in ((128, 0.3), (64, 0.3), (32, 0.2)):
    model.add(Dense(units, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(drop_rate))

# Single linear output unit for the regression target.
model.add(Dense(1, activation='linear'))

# Optimize mean squared error with Adam; also report mean absolute error.
model.compile(optimizer='adam', loss='mse', metrics=['mae'])


In [None]:
# Stop once validation loss has not improved for 5 consecutive epochs and
# roll the model back to the weights from its best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

fit_options = dict(
    validation_data=(X_val, y_val),
    epochs=100,  # upper bound; early stopping normally ends training sooner
    batch_size=32,
    callbacks=[early_stopping],
    verbose=1,
)
history = model.fit(X_train, y_train, **fit_options)

In [None]:
# Predictions on the held-out test split
y_pred = model.predict(X_test)

# Metrics
# RMSE is the square root of the MSE. It is taken with `** 0.5` because this
# notebook never imports numpy, so the previous `numpy.sqrt(...)` call raised
# NameError.
# NOTE: the target column was standardized together with the features, so both
# errors are in scaled units rather than in the original fare units.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
mae = mean_absolute_error(y_test, y_pred)

print("RMSE:", rmse)
print("MAE:", mae)