In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load dataset
file_path = "combined_simulated.csv"  # Update this path if needed
df = pd.read_csv(file_path)

In [3]:
# Display dataset information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 376554 entries, 0 to 376553
Data columns (total 35 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   UTC_TIME                   376554 non-null  object 
 1   FUEL_USED_2                376554 non-null  float64
 2   FUEL_USED_3                376554 non-null  float64
 3   FUEL_USED_4                376554 non-null  float64
 4   FW_GEO_ALTITUDE            376554 non-null  float64
 5   VALUE_FOB                  376554 non-null  float64
 6   VALUE_FUEL_QTY_CT          376554 non-null  float64
 7   VALUE_FUEL_QTY_FT1         376554 non-null  float64
 8   VALUE_FUEL_QTY_FT2         376554 non-null  float64
 9   VALUE_FUEL_QTY_FT3         376554 non-null  float64
 10  VALUE_FUEL_QTY_FT4         376554 non-null  float64
 11  VALUE_FUEL_QTY_LXT         376554 non-null  float64
 12  VALUE_FUEL_QTY_RXT         376554 non-null  float64
 13  FLIGHT_PHASE_COUNT         37

## Model Setups

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def split_train_test(df, leak_column="LEAK_FLOW_FLAG", test_size=0.15):
    """
    Splits the dataset into train and test sets while maintaining the chronological order.
    Ensures the same proportion of leaks in train and test sets.
    """
    df = df.sort_values(by="UTC_TIME")  # Sort by time to prevent data leakage

    split_idx = int(len(df) * (1 - test_size))  # Compute split index
    train_df = df.iloc[:split_idx]
    test_df = df.iloc[split_idx:]

    # Count the number of leaks in each dataset
    train_leak_count = train_df['LEAK_FLOW_FLAG'].sum()
    test_leak_count = test_df['LEAK_FLOW_FLAG'].sum()
    total_leak_count = df['LEAK_FLOW_FLAG'].sum()
    # Compute expected leak split (should be 75%-25%)
    expected_train_leaks = int(total_leak_count * 0.85)
    expected_test_leaks = total_leak_count - expected_train_leaks

    # Display class distributions
    print(f"Total Leak Count: {total_leak_count}")
    print(f"Train set leak rate: {100 * train_df[leak_column].mean():.4f}% == Training Leak Count: {train_leak_count} (Expected: {expected_train_leaks}) ")
    print(f"Test set leak rate: {100 * test_df[leak_column].mean():.4f}% == Testing Leak Count: {test_leak_count} (Expected: {expected_test_leaks})")
    print(f"Training Data Shape: {train_df.shape}, Testing Data Shape: {test_df.shape}")

    return train_df, test_df

# Apply split
train_df, test_df = split_train_test(df)


Total Leak Count: 33475
Train set leak rate: 8.2841% == Training Leak Count: 26515 (Expected: 28453) 
Test set leak rate: 12.3221% == Testing Leak Count: 6960 (Expected: 5022)
Training Data Shape: (320070, 35), Testing Data Shape: (56484, 35)


Autoencoder

In [6]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from keras.models import Model
from keras.layers import Input, Dense
from keras import regularizers
import numpy as np

# Filter out the entries with leaks from training data
train_data = train_df[train_df['LEAK_FLOW_FLAG'] == 0].drop(columns=['LEAK_FLOW_FLAG'])

# Identify numerical columns and scale only those
numerical_cols = train_data.select_dtypes(include=[np.number]).columns
scaler = MinMaxScaler()
train_data_scaled = scaler.fit_transform(train_data[numerical_cols])

# Define the autoencoder model architecture
input_dim = train_data_scaled.shape[1]  # Number of features
encoding_dim = 14  # Dimension of the encoded representation

# Input layer
input_layer = Input(shape=(input_dim,))
# Encoder layers
encoder = Dense(encoding_dim, activation="tanh", activity_regularizer=regularizers.l1(10e-5))(input_layer)
encoder = Dense(int(encoding_dim / 2), activation="relu")(encoder)
# Decoder layers
decoder = Dense(int(encoding_dim / 2), activation='tanh')(encoder)
decoder = Dense(input_dim, activation='relu')(decoder)

# Define the autoencoder model
autoencoder = Model(inputs=input_layer, outputs=decoder)
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

# Train the autoencoder on the scaled training data
autoencoder.fit(train_data_scaled, train_data_scaled,
                epochs=50,
                batch_size=32,
                shuffle=False,
                validation_split=0.2,
                verbose=1)

# Prepare the test data by selecting only numerical columns and scaling
test_data = test_df[numerical_cols]
test_data_scaled = scaler.transform(test_data)

# Get the reconstruction loss (MSE) on the test data
reconstructions = autoencoder.predict(test_data_scaled)
mse = np.mean(np.power(test_data_scaled - reconstructions, 2), axis=1)

# Determine the threshold for anomaly detection based on 95th percentile of MSE
threshold = np.percentile(mse, 95)

# Detect anomalies based on the threshold
test_df = test_df.copy()  # Avoid SettingWithCopyWarning
test_df['reconstruction_error'] = mse
test_df['anomaly'] = test_df['reconstruction_error'] > threshold

# Calculate evaluation metrics: Confusion Matrix, Accuracy, Precision, Recall, F1 Score
y_true = test_df['LEAK_FLOW_FLAG']  # True labels
y_pred = test_df['anomaly'].astype(int)  # Predicted labels (0 or 1)

conf_matrix = confusion_matrix(y_true, y_pred)  # Confusion Matrix
accuracy = accuracy_score(y_true, y_pred)  # Accuracy
precision = precision_score(y_true, y_pred, zero_division=0)  # Precision
recall = recall_score(y_true, y_pred, zero_division=0)  # Recall
f1 = f1_score(y_true, y_pred, zero_division=0)  # F1 Score

# Print the evaluation results
print(f"Confusion Matrix:\n{conf_matrix}")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


Epoch 1/50
[1m7339/7339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 1ms/step - loss: 0.0629 - val_loss: 0.0329
Epoch 2/50
[1m7339/7339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2ms/step - loss: 0.0233 - val_loss: 0.0065
Epoch 3/50
[1m7339/7339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2ms/step - loss: 0.0048 - val_loss: 0.0050
Epoch 4/50
[1m7339/7339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 2ms/step - loss: 0.0044 - val_loss: 0.0047
Epoch 5/50
[1m7339/7339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 1ms/step - loss: 0.0043 - val_loss: 0.0044
Epoch 6/50
[1m7339/7339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step - loss: 0.0042 - val_loss: 0.0042
Epoch 7/50
[1m7339/7339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 1ms/step - loss: 0.0039 - val_loss: 0.0039
Epoch 8/50
[1m7339/7339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 1ms/step - loss: 0.0039 - val_loss: 0.0042
Epoch 9/50
[1m7

Isolation Forest

In [7]:
from sklearn.ensemble import IsolationForest
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import numpy as np

# Remove unwanted columns from training and test data
test_df = test_df.drop(columns=['reconstruction_error'], errors='ignore')
train_df = train_df.drop(columns=['reconstruction_error'], errors='ignore')

# Extract numerical features for training and testing
train_x = train_df.select_dtypes(include=[np.number]).drop(columns=['LEAK_FLOW_FLAG'])
test_x = test_df.select_dtypes(include=[np.number]).drop(columns=['LEAK_FLOW_FLAG'])

# Create an Isolation Forest model
isolation_forest = IsolationForest(contamination=0.025, random_state=42)

# Fit the model on training data
isolation_forest.fit(train_x)

# Predict anomalies
train_x_predictions = isolation_forest.predict(train_x)
test_x_predictions = isolation_forest.predict(test_x)

# Convert predictions to binary labels (-1 indicates an anomaly)
train_x_predictions = (train_x_predictions == -1).astype(int)
test_x_predictions = (test_x_predictions == -1).astype(int)

# Print the confusion matrix for training data
cm_train = confusion_matrix(train_df['LEAK_FLOW_FLAG'], train_x_predictions)
print("Training Confusion Matrix:\n", cm_train)

# Print the confusion matrix for test data
cm_test = confusion_matrix(test_df['LEAK_FLOW_FLAG'], test_x_predictions)
print("Testing Confusion Matrix:\n", cm_test)

# Print precision, recall, accuracy, and F1 score
accuracy = accuracy_score(test_df['LEAK_FLOW_FLAG'], test_x_predictions)
recall = recall_score(test_df['LEAK_FLOW_FLAG'], test_x_predictions, zero_division=0)
precision = precision_score(test_df['LEAK_FLOW_FLAG'], test_x_predictions, zero_division=0)
f1 = f1_score(test_df['LEAK_FLOW_FLAG'], test_x_predictions, zero_division=0)

print(f'Accuracy: {accuracy}')
print(f'Recall: {recall}')
print(f'Precision: {precision}')
print(f'F1 Score: {f1}')


Training Confusion Matrix:
 [[286800   6755]
 [ 25268   1247]]
Testing Confusion Matrix:
 [[48027  1497]
 [ 6861    99]]
Accuracy: 0.8520288931378798
Recall: 0.014224137931034483
Precision: 0.06203007518796992
F1 Score: 0.02314165497896213


XGBoost

In [8]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV

# Feature selection for XGBoost
drop_columns = ['UTC_TIME', 'FLIGHT_ID', 'MSN']
selected_features = [
    'VALUE_FOB', 'EXPECTED_FOB', 'FOB_DIFFERENCE', 'FOB_CHANGE',
    'START_FOB_VS_FOB_FUELUSED', 'TOTAL_FUEL_USED', 'FLIGHT_PHASE_COUNT', 'LEAK_FLOW_FLAG'
]

# Ensure rolling features exist before selecting features
train_df['VALUE_FOB_mean'] = train_df['VALUE_FOB'].rolling(window=10).mean()
train_df['VALUE_FOB_mean'] = train_df['VALUE_FOB_mean'].fillna(train_df['VALUE_FOB_mean'].mean())
train_df['lagged'] = train_df['VALUE_FOB'].shift(1).fillna(train_df['VALUE_FOB'].mean())
test_df['VALUE_FOB_mean'] = test_df['VALUE_FOB'].rolling(window=10).mean()
test_df['VALUE_FOB_mean'] = test_df['VALUE_FOB_mean'].fillna(test_df['VALUE_FOB_mean'].mean())
test_df['lagged'] = test_df['VALUE_FOB'].shift(1).fillna(test_df['VALUE_FOB'].mean())

# Add rolling features to selected features
selected_features.extend(['VALUE_FOB_mean', 'lagged'])

xgboostdata = train_df[selected_features]

# Ensure test set contains all necessary features
X_train = xgboostdata.drop(columns=['LEAK_FLOW_FLAG'])
y_train = xgboostdata['LEAK_FLOW_FLAG']
X_test = test_df[selected_features].drop(columns=['LEAK_FLOW_FLAG'], errors='ignore')
y_test = test_df['LEAK_FLOW_FLAG']

# Normalize the data
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train XGBoost classifier
xgb = XGBClassifier()
xgb.fit(X_train_scaled, y_train)

# Predictions
train_x_predictions = xgb.predict(X_train_scaled)
test_x_predictions = xgb.predict(X_test_scaled)

# Evaluate performance
cm_train = confusion_matrix(y_train, train_x_predictions)
cm_test = confusion_matrix(y_test, test_x_predictions)

accuracy = accuracy_score(y_test, test_x_predictions)
recall = recall_score(y_test, test_x_predictions, zero_division=0)
precision = precision_score(y_test, test_x_predictions, zero_division=0)
f1 = f1_score(y_test, test_x_predictions, zero_division=0)

print("Training Confusion Matrix:\n", cm_train)
print("Testing Confusion Matrix:\n", cm_test)
print(f'Accuracy: {accuracy}')
print(f'Recall: {recall}')
print(f'Precision: {precision}')
print(f'F1 Score: {f1}')

# Feature importance
feature_importances = xgb.feature_importances_
features = X_train.columns
feature_importances_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
feature_importances_df = feature_importances_df.sort_values(by='Importance', ascending=False)
print(feature_importances_df)





Training Confusion Matrix:
 [[290314   3241]
 [ 20008   6507]]
Testing Confusion Matrix:
 [[49312   212]
 [ 6958     2]]
Accuracy: 0.8730613979179945
Recall: 0.00028735632183908046
Precision: 0.009345794392523364
F1 Score: 0.0005575689991636465
                     Feature  Importance
4  START_FOB_VS_FOB_FUELUSED    0.160377
5            TOTAL_FUEL_USED    0.157967
8                     lagged    0.145000
2             FOB_DIFFERENCE    0.141719
1               EXPECTED_FOB    0.140515
0                  VALUE_FOB    0.132797
7             VALUE_FOB_mean    0.101483
3                 FOB_CHANGE    0.020142
6         FLIGHT_PHASE_COUNT    0.000000


We tested HPT in another instance and it only upped the F1 up to 0.0134453

``` python
# Hyperparameter tuning
tuning_params = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [2, 4, 6, 8],
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'subsample': [0.5, 0.7, 0.9],
    'colsample_bytree': [0.5, 0.7, 0.9],
}

grid_search = GridSearchCV(estimator=XGBClassifier(), param_grid=tuning_params, cv=3, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Predict with the best model
train_x_predictions = grid_search.best_estimator_.predict(X_train_scaled)
test_x_predictions = grid_search.best_estimator_.predict(X_test_scaled)

# Final evaluation
cm_train = confusion_matrix(y_train, train_x_predictions)
cm_test = confusion_matrix(y_test, test_x_predictions)
accuracy = accuracy_score(y_test, test_x_predictions)
recall = recall_score(y_test, test_x_predictions, zero_division=0)
precision = precision_score(y_test, test_x_predictions, zero_division=0)
f1 = f1_score(y_test, test_x_predictions, zero_division=0)

print("Final Training Confusion Matrix:\n", cm_train)
print("Final Testing Confusion Matrix:\n", cm_test)
print(f'Final Accuracy: {accuracy}')
print(f'Final Recall: {recall}')
print(f'Final Precision: {precision}')
print(f'Final F1 Score: {f1}')

# Print final feature importance
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
feature_importances_df = feature_importances_df.sort_values(by='Importance', ascending=False)
print(feature_importances_df)


In [9]:
train_df.columns

Index(['UTC_TIME', 'FUEL_USED_2', 'FUEL_USED_3', 'FUEL_USED_4',
       'FW_GEO_ALTITUDE', 'VALUE_FOB', 'VALUE_FUEL_QTY_CT',
       'VALUE_FUEL_QTY_FT1', 'VALUE_FUEL_QTY_FT2', 'VALUE_FUEL_QTY_FT3',
       'VALUE_FUEL_QTY_FT4', 'VALUE_FUEL_QTY_LXT', 'VALUE_FUEL_QTY_RXT',
       'FLIGHT_PHASE_COUNT', 'FUEL_USED_1', 'Flight', 'MSN', 'NEW_FLIGHT',
       'FLIGHT_INSTANCE', 'FLIGHT_ID', 'START_FOB', 'TOTAL_FUEL_USED',
       'EXPECTED_FOB', 'FOB_DIFFERENCE', 'FOB_CHANGE', 'EXPECTED_FOB_CHANGE',
       'FUEL_LEAK_RATE', 'TOTAL_FUEL_LW', 'TOTAL_FUEL_RW', 'LW_RW_DIFF',
       'FUEL_IN_TANKS', 'CALC_VALUE_FOB_DIFF', 'START_FOB_VS_FOB_FUELUSED',
       'ALTITUDE_DIFF', 'LEAK_FLOW_FLAG', 'VALUE_FOB_mean', 'lagged'],
      dtype='object')

LSTM

In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, RepeatVector, TimeDistributed

# Identify numerical columns and scale only those
numerical_cols = train_df.select_dtypes(include=[np.number]).columns
scaler = MinMaxScaler()
train_data_scaled = scaler.fit_transform(train_df[numerical_cols])
test_data_scaled = scaler.transform(test_df[numerical_cols])

# Reshape data for LSTM (samples, timesteps, features)
time_steps = 10  # Number of time steps for LSTM input
train_data_reshaped = train_data_scaled.reshape((train_data_scaled.shape[0], 1, train_data_scaled.shape[1]))
test_data_reshaped = test_data_scaled.reshape((test_data_scaled.shape[0], 1, test_data_scaled.shape[1]))

# Define LSTM Autoencoder model
model = Sequential([
    LSTM(128, activation='relu', input_shape=(1, train_data_scaled.shape[1]), return_sequences=True),
    Dropout(0.2),
    LSTM(64, activation='relu', return_sequences=False),
    RepeatVector(1),
    LSTM(64, activation='relu', return_sequences=True),
    Dropout(0.2),
    LSTM(128, activation='relu', return_sequences=True),
    TimeDistributed(Dense(train_data_scaled.shape[1]))
])

model.compile(optimizer='adam', loss='mse')

# Train the LSTM Autoencoder
model.fit(train_data_reshaped, train_data_reshaped,
          epochs=50,
          batch_size=64,
          validation_split=0.2,
          shuffle=True,
          verbose=1)

# Get reconstruction loss (MSE) on the test data
reconstructions = model.predict(test_data_reshaped)
mse = np.mean(np.power(test_data_reshaped - reconstructions, 2), axis=(1, 2))

# Determine anomaly threshold based on 95th percentile
threshold = np.percentile(mse, 95)

# Detect anomalies based on the threshold
test_df = test_df.copy()
test_df['reconstruction_error'] = mse
test_df['anomaly'] = test_df['reconstruction_error'] > threshold

# Evaluation metrics
y_true = test_df['LEAK_FLOW_FLAG']
y_pred = test_df['anomaly'].astype(int)

conf_matrix = confusion_matrix(y_true, y_pred)
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, zero_division=0)
recall = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)

# Print evaluation results
print(f"Confusion Matrix:\n{conf_matrix}")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


Epoch 1/50


  super().__init__(**kwargs)


[1m4001/4001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 6ms/step - loss: 0.0226 - val_loss: 0.0014
Epoch 2/50
[1m4001/4001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 6ms/step - loss: 0.0013 - val_loss: 0.0016
Epoch 3/50
[1m4001/4001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 7ms/step - loss: 8.4629e-04 - val_loss: 0.0017
Epoch 4/50
[1m4001/4001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 7ms/step - loss: 6.9454e-04 - val_loss: 0.0017
Epoch 5/50
[1m4001/4001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 7ms/step - loss: 5.9202e-04 - val_loss: 0.0021
Epoch 6/50
[1m4001/4001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 8ms/step - loss: 5.3071e-04 - val_loss: 0.0023
Epoch 7/50
[1m4001/4001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 7ms/step - loss: 4.9703e-04 - val_loss: 0.0022
Epoch 8/50
[1m4001/4001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 8ms/step - loss: 4.6409e-04 - val_loss: 0.0023
Epo