In [10]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the data
df = pd.read_csv('dataset.csv')

# Convert 'Airport' and 'Terminal' into categorical numerical codes
label_encoder = LabelEncoder()
df['Airport'] = label_encoder.fit_transform(df['Airport'])
df['Terminal'] = label_encoder.fit_transform(df['Terminal'])

# Ensure date and hour fields are in a usable format
df['Date'] = pd.to_datetime(df['Date'])  # Ensure 'Date' is a datetime object
df['Hour'] = df['Hour'].str.replace(' - ', '').astype(int)  # Convert 'Hour' into a single integer

In [11]:
# Features
X = df[['Airport', 'Terminal', 'Date', 'Hour', 'Total Passengers', 'Flights']]
X['Date'] = X['Date'].view(int)  # Convert datetime to integer for model compatibility

# Targets
Y = df[['Citizen Avg Wait Time', 'Citizen Max Wait Time', 'Non citizen Avg Wait Time', 'Non citizen Max Wait Time']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Date'] = X['Date'].view(int)  # Convert datetime to integer for model compatibility


In [12]:
# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [13]:
# Initialize XGBoost regressors
model = xgb.XGBRegressor(objective='reg:squarederror')

# Train the model on each target
for column in Y.columns:
    model.fit(X_train, Y_train[column])
    print(f"Trained model for {column}")

Trained model for Citizen Avg Wait Time
Trained model for Citizen Max Wait Time
Trained model for Non citizen Avg Wait Time
Trained model for Non citizen Max Wait Time


In [15]:
from sklearn.metrics import mean_absolute_error

# Evaluate the model
for column in Y.columns:
    predictions = model.predict(X_test)
    mae = mean_absolute_error(Y_test[column], predictions)
    print(f"Mean Absolute Error for {column}: {mae}")

Mean Absolute Error for Citizen Avg Wait Time: 11.595244514001323
Mean Absolute Error for Citizen Max Wait Time: 5.640284779403384
Mean Absolute Error for Non citizen Avg Wait Time: 8.876820477121317
Mean Absolute Error for Non citizen Max Wait Time: 5.634629996927397


Mean Absolute Error for Citizen Avg Wait Time: 11.595244514001323
Mean Absolute Error for Citizen Max Wait Time: 5.640284779403384
Mean Absolute Error for Non citizen Avg Wait Time: 8.876820477121317
Mean Absolute Error for Non citizen Max Wait Time: 5.634629996927397