In [70]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Load the data
df = pd.read_csv('dataset.csv')

# Convert 'Airport' and 'Terminal' into categorical numerical codes.
# Each column gets its own fitted encoder: refitting one shared encoder on the
# second column would discard the first column's classes, making it impossible
# to map Airport codes back to their labels with `inverse_transform`.
encoders = {}
for column in ('Airport', 'Terminal'):
    encoders[column] = LabelEncoder()
    df[column] = encoders[column].fit_transform(df[column])

# Kept so code that refers to `label_encoder` still works; it is the encoder
# fitted on 'Terminal'.
label_encoder = encoders['Terminal']

In [71]:
# Columns used as model inputs
feature_columns = [
    'Airport',
    'Terminal',
    'Month',
    'Day of Month',
    'Hour',
    'Total Passengers',
    'Flights',
]

# Columns the models are trained to predict
target_columns = [
    'Citizen Avg Wait Time',
    'Citizen Max Wait Time',
    'Non citizen Avg Wait Time',
    'Non citizen Max Wait Time',
]

X = df[feature_columns]
Y = df[target_columns]

In [72]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
splits = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train, X_test, Y_train, Y_test = splits

# Only the training targets are moved into model space: 400 * ln(y + 100) - 800.
# Y_test is left in its original units.
shifted_targets = Y_train + 100
Y_train = 400 * np.log(shifted_targets) - 800

In [73]:
# Train one XGBoost regressor per target column.
# A fresh regressor is created for every target and stored in `models`
# (target column name -> fitted regressor). Refitting a single shared instance
# would overwrite the earlier fits and leave only the last target's model.
models = {}
for column in Y.columns:
    model = xgb.XGBRegressor(objective='reg:squarederror')
    model.fit(X_train, Y_train[column])
    models[column] = model
    print(f"Trained model for {column}")

# After the loop `model` is the regressor for the last target column, so code
# that uses the single `model` name keeps working.

Trained model for Citizen Avg Wait Time
Trained model for Citizen Max Wait Time
Trained model for Non citizen Avg Wait Time
Trained model for Non citizen Max Wait Time


In [74]:
from sklearn.metrics import mean_absolute_error
import numpy as np

def inverse_wait_time_transform(transformed):
    """Map model-space predictions back to wait times, floored at zero.

    Inverts the forward transform 400 * log(x + 100) - 800 and clips any
    negative result to 0.
    """
    return np.maximum(np.exp((transformed + 800) / 400) - 100, 0)


def report_mae(features, targets):
    """Print the mean absolute error of `model` against each target column.

    Args:
        features: rows to predict on (same columns the model was trained with).
        targets: frame holding the true values, one column per entry of Y.columns.
    """
    # The prediction does not depend on the target column, so compute it once
    # instead of once per loop iteration.
    # NOTE(review): this means every target is scored against the same
    # prediction vector from the single `model` -- confirm that is intended.
    predictions = inverse_wait_time_transform(model.predict(features))
    for column in Y.columns:
        mae = mean_absolute_error(targets[column], predictions)
        print(f"Mean Absolute Error for {column}: {mae}")


# Evaluate the model on the full test set
report_mae(X_test, Y_test)

print("\n")
print("Mean Absolute Error for each target when Flights > 0:")

# Restrict to rows with at least one flight. The filter is `!= 0`, which matches
# the "Flights > 0" heading as long as Flights is never negative (TODO confirm).
has_flights = X_test['Flights'] != 0
report_mae(X_test[has_flights], Y_test[has_flights])

ValueError: y_true and y_pred have different number of output (4!=1)

Mean Absolute Error for Citizen Avg Wait Time: 11.49705275263308
Mean Absolute Error for Citizen Max Wait Time: 5.54209301803514
Mean Absolute Error for Non citizen Avg Wait Time: 8.778628715753072
Mean Absolute Error for Non citizen Max Wait Time: 5.536438235559152


Mean Absolute Error for Citizen Avg Wait Time: 29.83325615517878
Mean Absolute Error for Citizen Max Wait Time: 14.251647780401461
Mean Absolute Error for Non citizen Avg Wait Time: 22.720291395207095
Mean Absolute Error for Non citizen Max Wait Time: 14.236851608821413