# Imports

In [None]:
import os
import numpy
import pandas
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.callbacks import EarlyStopping

# File Processing

In [None]:
# Walk the dataset directory tree and report the path of every file found.
for root, _subdirs, names in os.walk('./dataset'):
    for name in names:
        print(f"Processing file: {os.path.join(root, name)}")


# Loading Data & Data Verification

In [None]:
# Read every parquet file whose name contains "_<year>-" for 2022 through 2024,
# ordered by year and then alphabetically by file name within the year.
path_files = [
    pandas.read_parquet('./dataset/' + name)
    for year in range(2022, 2025)
    for name in sorted(
        entry for entry in os.listdir('./dataset') if f'_{year}-' in entry
    )
]

print(f"Loaded {len(path_files)} files.")

In [None]:
# Stack all loaded frames into a single DataFrame with a fresh 0..n-1 index.
df = pandas.concat(path_files, ignore_index=True)
print(f"Total rows: {len(df)}")

In [None]:
# Display the column labels of the combined frame.
df.columns

In [None]:
# Summary statistics of the fare column, inspected before choosing filter bounds.
df['base_passenger_fare'].describe()

# Data Cleaning

In [None]:
# Keep only rows whose fare is non-negative and strictly below 100.
fare = df['base_passenger_fare']
fare_in_range = (fare >= 0) & (fare < 100)
df = df[fare_in_range]
print(f"Rows after filtering by fare: {len(df)}")

In [None]:
# Keep only trips with an airport fee of exactly zero.
no_airport_fee = df['airport_fee'] == 0
df = df[no_airport_fee]
print(f"Rows after removing airport fee: {len(df)}")

In [None]:
# Restrict the frame to the columns used for feature engineering and modelling.
selected_columns = [
    'hvfhs_license_num',
    'request_datetime',
    'trip_miles',
    'trip_time',
    'base_passenger_fare',
    'tips',
]
df = df[selected_columns]
df.describe()

# Feature Engineering

In [None]:
# Derive the hour of day and the day of week from the request timestamp.
requested_at = df['request_datetime']
df['request_hour'] = requested_at.dt.hour
df['request_day_of_week'] = requested_at.dt.dayofweek

In [None]:
# Replace the license identifier with an integer code.
# The fitted encoder is kept at module level so later cells can map licenses to codes.
encoder = LabelEncoder()
license_codes = encoder.fit_transform(df['hvfhs_license_num'])
df['hvfhs_license_num_encoded'] = license_codes
df = df.drop(columns=['hvfhs_license_num'])

# Feature Correlation Heatmap

In [None]:
# Pairwise correlations of the frame's columns, drawn as an annotated heatmap.
correlations = df.corr()
sns.heatmap(correlations, annot=True, cmap="coolwarm")
plt.title("Feature Correlations")
plt.show()

# Splitting Data (Train, Validation, Test)

In [None]:
# The split is driven by the day of the month of each request.
day_of_month = df['request_datetime'].dt.day

# Train: days 1–20
train_data = df[day_of_month <= 20]

# Validation: days 21–25 (both ends inclusive)
validation_data = df[day_of_month.between(21, 25)]

# Test: day 26 through the end of the month
test_data = df[day_of_month >= 26]

In [None]:
# The raw timestamp is no longer needed once the splits exist; drop it from each one.
train_data, validation_data, test_data = (
    split.drop(columns=['request_datetime'])
    for split in (train_data, validation_data, test_data)
)

# Data Normalization

In [None]:
columns_to_scale = ['trip_miles', 'trip_time', 'base_passenger_fare', 'tips']

# Learn the scaling statistics from the training split only.
scaler = StandardScaler()
scaler.fit(train_data[columns_to_scale])

# Apply the fitted transformation to a copy of each split, leaving the originals untouched.
train_data_scaled = train_data.copy()
train_data_scaled[columns_to_scale] = scaler.transform(train_data[columns_to_scale])

validation_data_scaled = validation_data.copy()
validation_data_scaled[columns_to_scale] = scaler.transform(validation_data[columns_to_scale])

test_data_scaled = test_data.copy()
test_data_scaled[columns_to_scale] = scaler.transform(test_data[columns_to_scale])

In [None]:
# Show the first ten rows of the (unscaled) training split.
print("Train Data Sample:")
train_data.head(10)

In [None]:
# Show the first ten rows of the (unscaled) validation split.
print("Validation Data Sample:")
validation_data.head(10)

In [None]:
# Scale the full frame with the scaler that was fitted on the training split.
# Using transform() rather than fit_transform() keeps the train-only statistics
# in `scaler` intact, so test and validation rows do not leak into them.
scaled_column_names = ['trip_miles', 'trip_time', 'base_passenger_fare', 'tips']
df[scaled_column_names] = scaler.transform(df[scaled_column_names])

# Prepare Data for Hyperparameter Tuning

In [None]:
def prepare_data(target_column):
    """Split the scaled train/validation/test frames into features and target.

    The scaled frames are used so that the features a model is trained on
    match the ones it is given when evaluated on ``test_data_scaled``
    elsewhere in this notebook.

    Args:
        target_column: Name of the column to predict. It is removed from the
            feature frames and returned as the target series.

    Returns:
        A tuple ``(X_train, y_train, X_val, y_val, X_test, y_test)``.

    Raises:
        KeyError: If ``target_column`` is not a column of the split frames.
    """
    X_train = train_data_scaled.drop(columns=[target_column])
    y_train = train_data_scaled[target_column]

    X_val = validation_data_scaled.drop(columns=[target_column])
    y_val = validation_data_scaled[target_column]

    X_test = test_data_scaled.drop(columns=[target_column])
    y_test = test_data_scaled[target_column]

    return X_train, y_train, X_val, y_val, X_test, y_test

# Neural Network Model with Hyperparameter Optimization

In [None]:
def build_model(neurons, optimizer, dropout_rate):
    """Build and compile a fully connected regression network.

    Args:
        neurons: Iterable of hidden-layer widths, one entry per hidden layer.
        optimizer: Optimizer passed straight to ``model.compile``.
        dropout_rate: Rate of the Dropout layer placed after each hidden layer.

    Returns:
        The compiled ``Sequential`` model (MSE loss, MAE metric).
    """
    layers = []
    for width in neurons:
        # Each hidden block is a ReLU Dense layer followed by Dropout.
        layers.extend([Dense(width, activation='relu'), Dropout(dropout_rate)])
    # Single linear unit for the regression output.
    layers.append(Dense(1))

    model = Sequential(layers)
    model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
    return model

In [None]:
def custom_grid_search(X_train, y_train, param_grid, cv=3):
    """Try every hyperparameter combination and return the best one.

    Each combination trains a fresh model from ``build_model`` and is scored
    by the validation loss of its final epoch. ``cv`` only sets the hold-out
    fraction (``validation_split=1/cv``); no k-fold cross-validation is done.

    Args:
        X_train: Training features.
        y_train: Training targets.
        param_grid: Dict with the keys ``'neurons'``, ``'optimizer'``,
            ``'dropout_rate'``, ``'batch_size'`` and ``'epochs'``, each mapping
            to a list of candidate values. Optimizers may be optimizer
            instances or string identifiers.
        cv: Divisor that determines the validation fraction.

    Returns:
        Dict describing the best combination, or ``None`` if no trial produced
        a finite-comparable validation loss. The ``'optimizer'`` entry is the
        optimizer's class name (or the string that was given), so any custom
        optimizer configuration is not carried over.
    """
    from itertools import product

    best_params = None
    best_score = float('inf')

    # product() iterates in the same order as the equivalent nested loops.
    combinations = product(
        param_grid['neurons'],
        param_grid['optimizer'],
        param_grid['dropout_rate'],
        param_grid['batch_size'],
        param_grid['epochs'],
    )

    for neurons, optimizer, dropout_rate, batch_size, epochs in combinations:
        if isinstance(optimizer, str):
            optimizer_name = optimizer
            trial_optimizer = optimizer
        else:
            optimizer_name = optimizer.__class__.__name__
            # An optimizer instance holds state tied to the model it trained,
            # so every trial gets its own copy with the same configuration.
            trial_optimizer = optimizer.__class__.from_config(optimizer.get_config())

        print(f"Training with: neurons={neurons}, optimizer={optimizer_name}, dropout_rate={dropout_rate}, "
              f"batch_size={batch_size}, epochs={epochs}")

        # Train model
        model = build_model(neurons=neurons, optimizer=trial_optimizer, dropout_rate=dropout_rate)
        history = model.fit(
            X_train, y_train,
            batch_size=batch_size,
            epochs=epochs,
            verbose=0,
            validation_split=1/cv
        )

        # Evaluate with the validation loss of the last epoch
        val_loss = history.history['val_loss'][-1]
        if val_loss < best_score:
            best_score = val_loss
            best_params = {
                'neurons': neurons,
                'optimizer': optimizer_name,
                'dropout_rate': dropout_rate,
                'batch_size': batch_size,
                'epochs': epochs
            }

    print("Best Parameters:", str(best_params))
    return best_params

# Train and Evaluate Model

In [None]:
def train_and_evaluate(target_column, neurons, optimizer, dropout_rate, batch_size, epochs):
    """Train a model for one target column and report its test-set error.

    Training uses the validation split for early stopping (patience of 5
    epochs on ``val_loss``, restoring the best weights).

    Args:
        target_column: Column to predict; passed to ``prepare_data``.
        neurons: Hidden-layer widths for ``build_model``.
        optimizer: Optimizer for ``build_model``.
        dropout_rate: Dropout rate for ``build_model``.
        batch_size: Batch size used for fitting.
        epochs: Maximum number of training epochs.

    Returns:
        Tuple ``(model, history)`` with the trained model and the object
        returned by ``model.fit``.
    """
    X_train, y_train, X_val, y_val, X_test, y_test = prepare_data(target_column)
    model = build_model(neurons=neurons, optimizer=optimizer, dropout_rate=dropout_rate)

    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True
    )

    history = model.fit(
        X_train, y_train,
        batch_size=batch_size,
        epochs=epochs,
        verbose=1,
        validation_data=(X_val, y_val),
        callbacks=[early_stopping],
    )
    y_pred = model.predict(X_test)

    # The square root of the MSE is the RMSE, so it is named and reported as such.
    rmse = numpy.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)

    print(f"Target: {target_column} | RMSE: {rmse:.2f} | MAE: {mae:.2f}")

    return model, history

# Plot Training History

In [None]:
def plot_training_history(history, name):
    """Plot training and validation curves for loss and MAE.

    Two figures are shown, one per metric, each with the training and the
    validation series taken from ``history.history``.

    Args:
        history: Object whose ``history`` dict holds the keys ``'loss'``,
            ``'val_loss'``, ``'mae'`` and ``'val_mae'``.
        name: Model name used as the prefix of each figure title.
    """
    # (history key, short label, figure title) for each figure.
    panels = (
        ('loss', 'Loss', f'{name} Model Loss'),
        ('mae', 'MAE', f'{name} Model Mean Absolute Error (MAE)'),
    )

    for key, short_label, title in panels:
        plt.figure(figsize=(12, 6))
        plt.plot(history.history[key], label=f'Training {short_label}')
        plt.plot(history.history[f'val_{key}'], label=f'Validation {short_label}')
        plt.title(title)
        plt.xlabel('Epochs')
        plt.ylabel(short_label)
        plt.legend()
        plt.grid(True)
        plt.show()

# Base Passenger Fare Model

In [None]:
# Feature/target splits with the base passenger fare as the prediction target.
X_train, y_train, X_val, y_val, X_test, y_test = prepare_data('base_passenger_fare')

In [None]:
# Candidate hyperparameters for the fare model; every combination is tried.
param_grid = dict(
    neurons=[(128, 64, 32), (64, 32)],
    optimizer=[Adam(), RMSprop()],
    dropout_rate=[0.2, 0.3, 0.4],
    batch_size=[32, 64],
    epochs=[2],
)

best_params = custom_grid_search(X_train, y_train, param_grid=param_grid)

In [None]:
# Train the fare model with the tuned settings, but with a 20-epoch budget
# instead of the epoch count used during the search.
tuned_settings = {
    key: best_params[key]
    for key in ('neurons', 'optimizer', 'dropout_rate', 'batch_size')
}
base_model, base_history = train_and_evaluate(
    target_column='base_passenger_fare',
    epochs=20,
    **tuned_settings,
)

In [None]:
# Loss and MAE curves of the fare model.
plot_training_history(base_history, 'Base Passenger Fare')

# Compare Fare Predictions For Uber/Lyft

In [None]:
license_plates = {'HV0003': 'Uber', 'HV0005': 'Lyft'}
# Map each encoded license to its company. Licenses the encoder never saw are
# left out, because LabelEncoder.transform raises on unseen labels.
license_plates_encoded = {
    encoder.transform([plate])[0]: company
    for plate, company in license_plates.items()
    if plate in encoder.classes_
}
target_column = 'base_passenger_fare'

for encoded_plate, company in license_plates_encoded.items():
    group_data = test_data_scaled[test_data_scaled['hvfhs_license_num_encoded'] == encoded_plate]

    # The metrics cannot be computed on zero rows, so skip empty groups.
    if group_data.empty:
        print(f"{company} - no test rows, skipping")
        continue

    X_group = group_data.drop(columns=[target_column])
    y_group_true = group_data[target_column]

    # Flatten the model output so it lines up with the 1-D target.
    y_group_pred = base_model.predict(X_group).ravel()

    # The square root of the MSE is the RMSE, so it is named and reported as such.
    group_rmse = numpy.sqrt(mean_squared_error(y_group_true, y_group_pred))
    group_mae = mean_absolute_error(y_group_true, y_group_pred)

    print(f"{company} - RMSE: {group_rmse:.2f}, MAE: {group_mae:.2f}")

    # Plot true vs. predicted fares, with the diagonal marking a perfect prediction
    fare_range = [y_group_true.min(), y_group_true.max()]
    plt.figure(figsize=(10, 5))
    plt.scatter(y_group_true, y_group_pred, alpha=0.5, label=f"{company} Predictions")
    plt.plot(fare_range, fare_range, 'r--', label='Ideal Prediction')
    plt.title(f"True vs. Predicted Base Passenger Fare for {company}")
    plt.xlabel('True Fare')
    plt.ylabel('Predicted Fare')
    plt.legend(loc='upper right')
    plt.grid(True)
    plt.show()

# Evaluate Accuracy by Hour

In [None]:
# Ensure the hour column holds the original (unscaled) hour values.
test_data_scaled['request_hour'] = test_data['request_hour']

# RMSE of the fare predictions for each request hour. The groups are iterated
# explicitly so each sub-frame keeps 'request_hour', which the model needs as
# an input feature; passing grouping columns through groupby.apply is deprecated.
hourly_mse = pandas.Series(
    {
        hour: numpy.sqrt(
            mean_squared_error(
                group[target_column],
                base_model.predict(group.drop(columns=[target_column])),
            )
        )
        for hour, group in test_data_scaled.groupby('request_hour')
    }
)
hourly_mse.index.name = 'request_hour'

In [None]:
# Plot the hourly error. The values in hourly_mse are square roots of the MSE,
# so the chart is labelled RMSE.
plt.figure(figsize=(12, 6))
hourly_mse.plot(kind='bar', color='skyblue', edgecolor='black')
plt.title('Hourly RMSE for Fare Predictions')
plt.xlabel('Hour of the Day')
plt.ylabel('RMSE')
plt.grid(axis='y')
plt.show()

# Tips Model

In [None]:
# Feature/target splits with the tip amount as the prediction target.
X_train, y_train, X_val, y_val, X_test, y_test = prepare_data('tips')

In [None]:
# Candidate hyperparameters for the tips model; every combination is tried.
param_grid = dict(
    neurons=[(128, 64, 32), (64, 32)],
    optimizer=[Adam(), RMSprop()],
    dropout_rate=[0.2, 0.3, 0.4],
    batch_size=[32, 64],
    epochs=[2],
)

best_params = custom_grid_search(X_train, y_train, param_grid=param_grid)

In [None]:
# Train the tips model with the tuned settings and a 20-epoch budget.
# The target must be 'tips'; using the fare column here would train a second
# fare model under the tips name.
tips_model, tips_history = train_and_evaluate(
    target_column='tips',
    neurons=best_params['neurons'],
    optimizer=best_params['optimizer'],
    dropout_rate=best_params['dropout_rate'],
    batch_size=best_params['batch_size'],
    epochs=20
)

In [None]:
# Loss and MAE curves of the tips model.
plot_training_history(tips_history, 'Tips')