In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
from skopt import BayesSearchCV
from skopt.space import Integer, Categorical, Real
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import numpy as np
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.amp import GradScaler, autocast
from sklearn.neural_network import MLPClassifier

pd.set_option('display.max_columns', None)

flight_data_train = pd.read_csv(r'D:\MasterUniversity\AdvancedML\Project\DubaiData\flight_data_train_ts_wx.csv')
flight_data_test = pd.read_csv(r'D:\MasterUniversity\AdvancedML\Project\DubaiData\flight_data_test_ts_wx.csv')

print(f'Data shape: Train: {flight_data_train.shape}, Test: {flight_data_test.shape}')

print('Preprocessing')
flight_data_train['scheduledoffblocktime'] = pd.to_datetime(flight_data_train['scheduledoffblocktime'])
flight_data_test['scheduledoffblocktime'] = pd.to_datetime(flight_data_test['scheduledoffblocktime'])

flight_data_train.sort_values(by='scheduledoffblocktime', inplace=True)
flight_data_test.sort_values(by='scheduledoffblocktime', inplace=True)


departdatetime = flight_data_train['scheduledoffblocktime'].dt

flight_data_train['depart_day'] = departdatetime.day
flight_data_train['depart_month'] = departdatetime.month
flight_data_train['depart_dayofweek'] = departdatetime.dayofweek
flight_data_train['depart_minute'] = departdatetime.hour * 60 + departdatetime.minute
# Test
departdatetime = flight_data_test['scheduledoffblocktime'].dt
flight_data_test['depart_day'] = departdatetime.day
flight_data_test['depart_month'] = departdatetime.month
flight_data_test['depart_dayofweek'] = departdatetime.dayofweek
flight_data_test['depart_minute'] = departdatetime.hour * 60 + departdatetime.minute

flight_data_train.drop(columns=['scheduledoffblocktime'], axis=1, inplace=True)
flight_data_test.drop(columns=['scheduledoffblocktime'], axis=1, inplace=True)

X_train = flight_data_train.drop(columns=['delay_in_secs', 'finalflightstatus'], axis=1)
X_test = flight_data_test.drop(columns=['delay_in_secs', 'finalflightstatus'], axis=1)

y_train = flight_data_train['finalflightstatus']
y_test = flight_data_test['finalflightstatus']

y_train = y_train.map({'On-Time': 0, 'Delayed':1})
y_test = y_test.map({'On-Time': 0, 'Delayed':1})

print('Encoding')
# High cardinality columns - CatBoostEncoder
high_cardinality_cols = ['airlinecode_iata', 'destination_iata', 'aircraft_iata', 'publicgatenumber']

# One-hot encoding
one_hot_column =  ['skyc1', 'skyc2', 'traffictypecode', 'aircraftterminal', 'wxcodes'] + high_cardinality_cols
ohe = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
encoded = ohe.fit_transform(X_train[one_hot_column])
ohe_new_columns = ohe.get_feature_names_out(one_hot_column)
encoded_df = pd.DataFrame(encoded, columns=ohe_new_columns)
X_train = pd.concat([X_train.drop(columns=one_hot_column), encoded_df], axis=1)
encoded = ohe.transform(X_test[one_hot_column])
encoded_df = pd.DataFrame(encoded, columns=ohe_new_columns)
X_test = pd.concat([X_test.drop(columns=one_hot_column), encoded_df], axis=1)

Data shape: Train: (197944, 22), Test: (49487, 22)
Preprocessing
Encoding




In [2]:
# StandardScaler
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using: {device}')

Using: cuda


In [3]:
# %% Prepare sequences for LSTM
def create_sequences(features, target, sequence_length=7):
    X, y = [], []
    for i in range(len(features) - sequence_length):
        X.append(features.iloc[i:i + sequence_length].values)
        y.append(target.iloc[i + sequence_length])
    return np.array(X), np.array(y)


sequence_length = 7
X_train_seq, y_train_seq = create_sequences(X_train, y_train, sequence_length)
X_test_seq, y_test_seq = create_sequences(X_test, y_test, sequence_length)

In [4]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

n_features = X_train_seq.shape[2]

model = Sequential([
    LSTM(128, input_shape=(sequence_length, n_features), return_sequences=True),
    Dropout(0.2),
    LSTM(128, return_sequences=False),
    Dropout(0.2),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')  # Single neuron with sigmoid activation for binary classification
])

from tensorflow.keras.backend import epsilon, round, mean, cast, sum as Ksum

def f1_score(y_true, y_pred):
    """
    Custom F1 score metric for Keras.
    """
    y_true = cast(y_true, 'float32')  # Ensure y_true is float32
    y_pred = round(y_pred)  # Convert probabilities to 0 or 1
    y_pred = cast(y_pred, 'float32')  # Ensure y_pred is float32
    tp = Ksum(y_true * y_pred)  # True positives
    precision = tp / (Ksum(y_pred) + epsilon())  # Precision
    recall = tp / (Ksum(y_true) + epsilon())  # Recall
    f1 = 2 * (precision * recall) / (precision + recall + epsilon())
    return f1

# Compile the model with F1 score as a metric
model.compile(optimizer=Adam(learning_rate=0.01), loss='binary_crossentropy', metrics=['accuracy', f1_score])

# Train the model
history = model.fit(X_train_seq, y_train_seq, validation_data=(X_test_seq, y_test_seq),
                    batch_size=128, epochs=4, verbose=1)

  super().__init__(**kwargs)


Epoch 1/4
[1m1547/1547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 16ms/step - accuracy: 0.7165 - f1_score: 1.2614 - loss: 0.5938 - val_accuracy: 0.6745 - val_f1_score: 3.4952 - val_loss: 0.6302
Epoch 2/4
[1m1547/1547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 15ms/step - accuracy: 0.7193 - f1_score: 3.7664 - loss: 0.5807 - val_accuracy: 0.6783 - val_f1_score: 0.6932 - val_loss: 0.6333
Epoch 3/4
[1m1547/1547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 14ms/step - accuracy: 0.7212 - f1_score: 5.3360 - loss: 0.5762 - val_accuracy: 0.6787 - val_f1_score: 0.0000e+00 - val_loss: 0.6315
Epoch 4/4
[1m1547/1547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 14ms/step - accuracy: 0.7213 - f1_score: 6.9627 - loss: 0.5744 - val_accuracy: 0.6675 - val_f1_score: 10.5505 - val_loss: 0.6332


In [5]:
from sklearn.metrics import classification_report
# Evaluate the model
y_pred = model.predict(X_test_seq)

y_pred_classes = (y_pred > 0.5).astype(int)  # Apply threshold to convert probabilities to class labels

# Classification report
print(classification_report(y_test_seq, y_pred_classes))

[1m1547/1547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step
              precision    recall  f1-score   support

           0       0.68      0.95      0.80     33583
           1       0.40      0.07      0.11     15897

    accuracy                           0.67     49480
   macro avg       0.54      0.51      0.45     49480
weighted avg       0.59      0.67      0.58     49480



In [None]:
Accuracy: 0.67
Precision: 0.40
Recall: 0.07
F1 Score: 0.11