# Imports

In [1]:
import pandas as pd
import numpy as np
from pycaret.classification import *
from sklearn.model_selection import train_test_split

# The dataset

In [88]:
df = pd.read_csv("combined_simulated.csv")

In [89]:
# Convert 'UTC_TIME' column to datetime format and sort by time
df['UTC_TIME'] = pd.to_datetime(df['UTC_TIME'])
df.sort_values(by=['FLIGHT_INSTANCE', 'UTC_TIME'], inplace=True)

In [90]:
# Dropping irrelvant columns
df = df.drop(columns=['FLIGHT_PHASE_COUNT', 'Flight','MSN', 'FLIGHT_INSTANCE', 'NEW_FLIGHT'])

In [91]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 376554 entries, 0 to 337922
Data columns (total 30 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   UTC_TIME                   376554 non-null  datetime64[ns]
 1   FUEL_USED_2                376554 non-null  float64       
 2   FUEL_USED_3                376554 non-null  float64       
 3   FUEL_USED_4                376554 non-null  float64       
 4   FW_GEO_ALTITUDE            376554 non-null  float64       
 5   VALUE_FOB                  376554 non-null  float64       
 6   VALUE_FUEL_QTY_CT          376554 non-null  float64       
 7   VALUE_FUEL_QTY_FT1         376554 non-null  float64       
 8   VALUE_FUEL_QTY_FT2         376554 non-null  float64       
 9   VALUE_FUEL_QTY_FT3         376554 non-null  float64       
 10  VALUE_FUEL_QTY_FT4         376554 non-null  float64       
 11  VALUE_FUEL_QTY_LXT         376554 non-null  float64      

In [92]:
df['LEAK_FLOW_FLAG'].value_counts()

LEAK_FLOW_FLAG
0    343079
1     33475
Name: count, dtype: int64

### Cutting out all No-Leak Flights before 2017

In [93]:
# Identify flights with leaks (Keep these)
leak_flights = df[df["LEAK_FLOW_FLAG"] == 1]["FLIGHT_ID"].unique()

# Identify no-leak flights that started before 2017
no_leak_flights = df[df["LEAK_FLOW_FLAG"] == 0].groupby("FLIGHT_ID")["UTC_TIME"].min()
no_leak_flights_to_remove = no_leak_flights[no_leak_flights < "2017-10-22"].index

In [94]:
# Step 3: Keep all leak flights + no-leak flights that started in 2017 or later
df_filtered = df[df["FLIGHT_ID"].isin(leak_flights) | ~df["FLIGHT_ID"].isin(no_leak_flights_to_remove)]

In [95]:
print(f"Original dataset size: {df.shape[0]}")
print(f"Filtered dataset size: {df_filtered.shape[0]}")

Original dataset size: 376554
Filtered dataset size: 247584


In [96]:
df_filtered['LEAK_FLOW_FLAG'].value_counts()

LEAK_FLOW_FLAG
0    214109
1     33475
Name: count, dtype: int64

In [None]:
# Get mean of leaks per flight (better than sum bc each flight may have varying flight durations)
flight_leak_counts = df_filtered.groupby("FLIGHT_ID")["LEAK_FLOW_FLAG"].mean()

# Compute the media Number of Leaks per Flight
median_leaks_per_flight = flight_leak_counts[flight_leak_counts > 0].median()
print(f"median Number of Leaks per Flight: {median_leaks_per_flight:.2f}")

# Select flights where the number of leaks is higher than the median
flights_with_leaks = flight_leak_counts[flight_leak_counts > median_leaks_per_flight].index

# EXTREME Reduction: Only keep 80% as many no-leak flights as leak flights
num_leak_flights = len(flights_with_leaks)  # Total number of flights with leaks
num_no_leak_flights = int(num_leak_flights * 0.80)  # Cut down no-leak flights 

# Sample only the reduced number of no-leak flights
flights_no_leaks = flight_leak_counts[flight_leak_counts == 0].sample(
    n=num_no_leak_flights, 
    random_state=42
).index

# Keep only selected flights (Preserves sequences!)
df_balanced = df_filtered[df_filtered["FLIGHT_ID"].isin(flights_with_leaks.union(flights_no_leaks))]

median Number of Leaks per Flight: 0.25


In [98]:
df_balanced["LEAK_FLOW_FLAG"].value_counts(normalize=True) * 100

LEAK_FLOW_FLAG
0    76.336882
1    23.663118
Name: proportion, dtype: float64

In [99]:
df_balanced.info()

<class 'pandas.core.frame.DataFrame'>
Index: 106423 entries, 153 to 337922
Data columns (total 30 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   UTC_TIME                   106423 non-null  datetime64[ns]
 1   FUEL_USED_2                106423 non-null  float64       
 2   FUEL_USED_3                106423 non-null  float64       
 3   FUEL_USED_4                106423 non-null  float64       
 4   FW_GEO_ALTITUDE            106423 non-null  float64       
 5   VALUE_FOB                  106423 non-null  float64       
 6   VALUE_FUEL_QTY_CT          106423 non-null  float64       
 7   VALUE_FUEL_QTY_FT1         106423 non-null  float64       
 8   VALUE_FUEL_QTY_FT2         106423 non-null  float64       
 9   VALUE_FUEL_QTY_FT3         106423 non-null  float64       
 10  VALUE_FUEL_QTY_FT4         106423 non-null  float64       
 11  VALUE_FUEL_QTY_LXT         106423 non-null  float64    

# Modeling

In [None]:
# Select all features (except the target)
features = df_balanced.columns.tolist()
features.remove('LEAK_FLOW_FLAG')  # Remove target column from features

# Target variable
target = 'LEAK_FLOW_FLAG'

In [101]:
# Drop rows with NaN values (due to lagging)
df_balanced.dropna(inplace=True)

In [102]:
df_balanced.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 106423 entries, 153 to 337922
Data columns (total 30 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   UTC_TIME                   106423 non-null  datetime64[ns]
 1   FUEL_USED_2                106423 non-null  float64       
 2   FUEL_USED_3                106423 non-null  float64       
 3   FUEL_USED_4                106423 non-null  float64       
 4   FW_GEO_ALTITUDE            106423 non-null  float64       
 5   VALUE_FOB                  106423 non-null  float64       
 6   VALUE_FUEL_QTY_CT          106423 non-null  float64       
 7   VALUE_FUEL_QTY_FT1         106423 non-null  float64       
 8   VALUE_FUEL_QTY_FT2         106423 non-null  float64       
 9   VALUE_FUEL_QTY_FT3         106423 non-null  float64       
 10  VALUE_FUEL_QTY_FT4         106423 non-null  float64       
 11  VALUE_FUEL_QTY_LXT         106423 non-null  float64    

### LSTM

In [None]:
# Step 1: Sort flights by first recorded timestamp
flight_start_times = df_balanced.groupby("FLIGHT_ID")["UTC_TIME"].min().sort_values()

# Step 2: Define the split point (80% train, 20% test)
split_index = int(len(flight_start_times) * 0.8)  # Get the 80% split point

# Step 3: Assign the first 80% of flights to training, last 20% to testing
train_flights = flight_start_times.index[:split_index]  # First 80% of flights
test_flights = flight_start_times.index[split_index:]  # Last 20% of flights

train_df = df_balanced[df_balanced["FLIGHT_ID"].isin(train_flights)]
test_df = df_balanced[df_balanced["FLIGHT_ID"].isin(test_flights)]

# Step 4: Reset index 
train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

In [None]:
# Select features & target
features = [col for col in train_df.columns if col not in ['FLIGHT_ID', 'UTC_TIME', 'LEAK_FLOW_FLAG']]
target = "LEAK_FLOW_FLAG"

In [105]:
train_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75861 entries, 0 to 75860
Data columns (total 30 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   UTC_TIME                   75861 non-null  datetime64[ns]
 1   FUEL_USED_2                75861 non-null  float64       
 2   FUEL_USED_3                75861 non-null  float64       
 3   FUEL_USED_4                75861 non-null  float64       
 4   FW_GEO_ALTITUDE            75861 non-null  float64       
 5   VALUE_FOB                  75861 non-null  float64       
 6   VALUE_FUEL_QTY_CT          75861 non-null  float64       
 7   VALUE_FUEL_QTY_FT1         75861 non-null  float64       
 8   VALUE_FUEL_QTY_FT2         75861 non-null  float64       
 9   VALUE_FUEL_QTY_FT3         75861 non-null  float64       
 10  VALUE_FUEL_QTY_FT4         75861 non-null  float64       
 11  VALUE_FUEL_QTY_LXT         75861 non-null  float64       
 12  VALU

In [None]:
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.utils import to_categorical

# Select numerical features only
numerical_features = train_df.select_dtypes(include=['float64', 'int64']).columns.tolist()

# Remove the target column (`LEAK_FLOW_FLAG`) from scaling
numerical_features.remove("LEAK_FLOW_FLAG")  

# Normalize only numerical features (LSTMs perform better with scaled data)
scaler = MinMaxScaler()
train_df[numerical_features] = scaler.fit_transform(train_df[numerical_features])

In [None]:
# Define time steps for LSTM (how many past steps to use for each prediction)
time_steps = 10  # Use last 10 timesteps to predict next one

In [None]:
# Function to Convert Data Into LSTM Format (3D: samples, time_steps, features)
def create_lstm_sequences(df, features, target, time_steps=10):
    X, y = [], []
    
    # Process each flight separately
    for flight_id, flight_data in df.groupby("FLIGHT_ID"):
        flight_data = flight_data.sort_values("UTC_TIME")  # Ensure correct order
        feature_values = flight_data[features].values
        target_values = flight_data[target].values

        # Create sequences
        for i in range(time_steps, len(flight_data)):
            X.append(feature_values[i - time_steps:i])  # Last `time_steps` for each row
            y.append(target_values[i])  # Next step's target value

    return np.array(X), np.array(y)

In [None]:
# Apply function to training & testing sets
X_train, y_train = create_lstm_sequences(train_df, numerical_features, "LEAK_FLOW_FLAG", time_steps=10)
X_test, y_test = create_lstm_sequences(test_df, numerical_features, "LEAK_FLOW_FLAG", time_steps=10)

# Convert target to categorical (for binary classification)
y_train = to_categorical(y_train, num_classes=2)
y_test = to_categorical(y_test, num_classes=2)

In [None]:
# Print Final Shape
print("Train Shape:", X_train.shape, y_train.shape)
print("Test Shape:", X_test.shape, y_test.shape)

Train Shape: (74041, 10, 27) (74041, 2)
Test Shape: (30102, 10, 27) (30102, 2)


In [None]:
# Need the f1 score
import tensorflow as tf
from tensorflow.keras import backend as K

# Custom Precision
def precision(y_true, y_pred):
    true_positives = K.sum(K.round(y_true * y_pred))
    predicted_positives = K.sum(K.round(y_pred))
    return true_positives / (predicted_positives + K.epsilon())

# Custom Recall
def recall(y_true, y_pred):
    true_positives = K.sum(K.round(y_true * y_pred))
    possible_positives = K.sum(K.round(y_true))
    return true_positives / (possible_positives + K.epsilon())

# Custom F1 Score
def f1_score(y_true, y_pred):
    prec = precision(y_true, y_pred)
    rec = recall(y_true, y_pred)
    return 2 * ((prec * rec) / (prec + rec + K.epsilon()))

In [None]:
from sklearn.utils.class_weight import compute_class_weight
# Compute class weights
class_weights = compute_class_weight(
    class_weight="balanced", 
    classes=np.unique(np.argmax(y_train, axis=1)),  # Get class labels from one-hot encoding
    y=np.argmax(y_train, axis=1)  # Convert one-hot encoding back to labels
)

# Convert to dictionary format for Keras
class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}

In [None]:
# Convert to dictionary format for Keras
class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

# Define Deeper LSTM Model
model = Sequential([
    LSTM(128, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])),
    Dropout(0.3),
    LSTM(64, return_sequences=True),
    Dropout(0.3),
    LSTM(32, return_sequences=False),
    Dropout(0.3),
    Dense(2, activation="softmax")  # Binary classification
])

In [None]:
# Compile with Adam Optimizer & Learning Rate Adjustment
model.compile(
    loss="categorical_crossentropy",
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),  # Reduce learning rate
    metrics=["accuracy", precision, recall, f1_score, tf.keras.metrics.AUC(name="auc")]
)


In [None]:
# Train Model
history = model.fit(
    X_train, y_train, 
    epochs=30,  # Train longer for better learning
    batch_size=64,  # Larger batch sizes for stability
    validation_data=(X_test, y_test), 
    class_weight=class_weight_dict  # Use class weighting
)

Epoch 1/30
[1m1157/1157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 16ms/step - accuracy: 0.6360 - auc: 0.7155 - f1_score: 0.6360 - loss: 0.5667 - precision: 0.6360 - recall: 0.6360 - val_accuracy: 0.7469 - val_auc: 0.7932 - val_f1_score: 0.7466 - val_loss: 0.6714 - val_precision: 0.7466 - val_recall: 0.7466
Epoch 2/30
[1m1157/1157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 14ms/step - accuracy: 0.7222 - auc: 0.8443 - f1_score: 0.7222 - loss: 0.4328 - precision: 0.7222 - recall: 0.7222 - val_accuracy: 0.6234 - val_auc: 0.7020 - val_f1_score: 0.6233 - val_loss: 0.7115 - val_precision: 0.6233 - val_recall: 0.6233
Epoch 3/30
[1m1157/1157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 14ms/step - accuracy: 0.7173 - auc: 0.8417 - f1_score: 0.7173 - loss: 0.4367 - precision: 0.7173 - recall: 0.7173 - val_accuracy: 0.7140 - val_auc: 0.7771 - val_f1_score: 0.7138 - val_loss: 0.6685 - val_precision: 0.7138 - val_recall: 0.7138
Epoch 4/30
[1m1157/1157[0m [32m

In [None]:
# Save Model
model.save("lstm_fuel_leak_model101.h5")



In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Get Predictions
y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=1)  # Convert softmax output to class labels
y_true = np.argmax(y_test, axis=1)  # Convert one-hot encoding back to labels

# Compute Classification Metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
roc_auc = roc_auc_score(y_true, y_pred)  # Added ROC-AUC

# Print Results
print(f"LSTM Model - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}, ROC-AUC: {roc_auc:.4f}")

[1m941/941[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step
LSTM Model - Accuracy: 0.5202, Precision: 0.2460, Recall: 0.6573, F1-Score: 0.3580, ROC-AUC: 0.5712
