# Imports

In [1]:
import pandas as pd
import numpy as np
from pycaret.classification import *
from sklearn.model_selection import train_test_split

# The dataset

In [2]:
df = pd.read_csv("combined_simulated.csv")

In [3]:
# Convert 'UTC_TIME' column to datetime format and sort by time
df['UTC_TIME'] = pd.to_datetime(df['UTC_TIME'])
df.sort_values(by=['FLIGHT_INSTANCE', 'UTC_TIME'], inplace=True)

In [4]:
# Dropping irrelvant columns
df = df.drop(columns=['FLIGHT_PHASE_COUNT', 'Flight','MSN', 'FLIGHT_INSTANCE', 'NEW_FLIGHT'])

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 376554 entries, 0 to 337922
Data columns (total 30 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   UTC_TIME                   376554 non-null  datetime64[ns]
 1   FUEL_USED_2                376554 non-null  float64       
 2   FUEL_USED_3                376554 non-null  float64       
 3   FUEL_USED_4                376554 non-null  float64       
 4   FW_GEO_ALTITUDE            376554 non-null  float64       
 5   VALUE_FOB                  376554 non-null  float64       
 6   VALUE_FUEL_QTY_CT          376554 non-null  float64       
 7   VALUE_FUEL_QTY_FT1         376554 non-null  float64       
 8   VALUE_FUEL_QTY_FT2         376554 non-null  float64       
 9   VALUE_FUEL_QTY_FT3         376554 non-null  float64       
 10  VALUE_FUEL_QTY_FT4         376554 non-null  float64       
 11  VALUE_FUEL_QTY_LXT         376554 non-null  float64      

In [6]:
df['LEAK_FLOW_FLAG'].value_counts()

LEAK_FLOW_FLAG
0    343079
1     33475
Name: count, dtype: int64

### Cutting out all No-Leak Flights before 2017

In [7]:
# Identify flights with leaks (Keep these)
leak_flights = df[df["LEAK_FLOW_FLAG"] == 1]["FLIGHT_ID"].unique()

# Identify no-leak flights that started before 2017
no_leak_flights = df[df["LEAK_FLOW_FLAG"] == 0].groupby("FLIGHT_ID")["UTC_TIME"].min()
no_leak_flights_to_remove = no_leak_flights[no_leak_flights < "2017-10-22"].index

In [8]:
# Step 3: Keep all leak flights + no-leak flights that started in 2017 or later
df_filtered = df[df["FLIGHT_ID"].isin(leak_flights) | ~df["FLIGHT_ID"].isin(no_leak_flights_to_remove)]

In [9]:
print(f"Original dataset size: {df.shape[0]}")
print(f"Filtered dataset size: {df_filtered.shape[0]}")

Original dataset size: 376554
Filtered dataset size: 247584


In [10]:
df_filtered['LEAK_FLOW_FLAG'].value_counts()

LEAK_FLOW_FLAG
0    214109
1     33475
Name: count, dtype: int64

In [None]:
# Get mean of leaks per flight (better than sum bc each flight may have varying flight durations)
flight_leak_counts = df_filtered.groupby("FLIGHT_ID")["LEAK_FLOW_FLAG"].mean()

# Compute the media Number of Leaks per Flight
median_leaks_per_flight = flight_leak_counts[flight_leak_counts > 0].median()
print(f"median Number of Leaks per Flight: {median_leaks_per_flight:.2f}")

# Select flights where the number of leaks is higher than the median
flights_with_leaks = flight_leak_counts[flight_leak_counts > median_leaks_per_flight].index

# EXTREME Reduction: Only keep 1% as many no-leak flights as leak flights
num_leak_flights = len(flights_with_leaks)  # Total number of flights with leaks
num_no_leak_flights = int(num_leak_flights * 0.80)  # Cut down no-leak flights aggressively

# Sample only the reduced number of no-leak flights
flights_no_leaks = flight_leak_counts[flight_leak_counts == 0].sample(
    n=num_no_leak_flights, 
    random_state=42
).index

# Keep only selected flights (Preserves sequences!)
df_balanced = df_filtered[df_filtered["FLIGHT_ID"].isin(flights_with_leaks.union(flights_no_leaks))]

median Number of Leaks per Flight: 0.25


In [12]:
df_balanced["LEAK_FLOW_FLAG"].value_counts(normalize=True) * 100

LEAK_FLOW_FLAG
0    76.336882
1    23.663118
Name: proportion, dtype: float64

In [13]:
df_balanced.info()

<class 'pandas.core.frame.DataFrame'>
Index: 106423 entries, 153 to 337922
Data columns (total 30 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   UTC_TIME                   106423 non-null  datetime64[ns]
 1   FUEL_USED_2                106423 non-null  float64       
 2   FUEL_USED_3                106423 non-null  float64       
 3   FUEL_USED_4                106423 non-null  float64       
 4   FW_GEO_ALTITUDE            106423 non-null  float64       
 5   VALUE_FOB                  106423 non-null  float64       
 6   VALUE_FUEL_QTY_CT          106423 non-null  float64       
 7   VALUE_FUEL_QTY_FT1         106423 non-null  float64       
 8   VALUE_FUEL_QTY_FT2         106423 non-null  float64       
 9   VALUE_FUEL_QTY_FT3         106423 non-null  float64       
 10  VALUE_FUEL_QTY_FT4         106423 non-null  float64       
 11  VALUE_FUEL_QTY_LXT         106423 non-null  float64    

# Modeling

In [None]:
# Select all features (except the target)
features = df_balanced.columns.tolist()
features.remove('LEAK_FLOW_FLAG')  # Remove target column from features

# Target variable
target = 'LEAK_FLOW_FLAG'

In [15]:
# Create lag features (memory for time-series)
def create_lag_features(df, features, lags=[1, 3, 5, 10]):
    """
    Adds past values (lags) as new features to help capture time dependency.
    """
    df = df.copy()
    for lag in lags:
        for feature in features:
            df[f'{feature}_lag{lag}'] = df.groupby('FLIGHT_ID')[feature].shift(lag)
    return df

df_balanced = create_lag_features(df_balanced, features, lags=[1, 3, 5, 10]) 

In [16]:
# Drop rows with NaN values (due to lagging)
df_balanced.dropna(inplace=True)

In [17]:
df_balanced.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 104143 entries, 163 to 337922
Data columns (total 146 columns):
 #    Column                           Non-Null Count   Dtype         
---   ------                           --------------   -----         
 0    UTC_TIME                         104143 non-null  datetime64[ns]
 1    FUEL_USED_2                      104143 non-null  float64       
 2    FUEL_USED_3                      104143 non-null  float64       
 3    FUEL_USED_4                      104143 non-null  float64       
 4    FW_GEO_ALTITUDE                  104143 non-null  float64       
 5    VALUE_FOB                        104143 non-null  float64       
 6    VALUE_FUEL_QTY_CT                104143 non-null  float64       
 7    VALUE_FUEL_QTY_FT1               104143 non-null  float64       
 8    VALUE_FUEL_QTY_FT2               104143 non-null  float64       
 9    VALUE_FUEL_QTY_FT3               104143 non-null  float64       
 10   VALUE_FUEL_QTY_FT4               

In [None]:
# Sort flights by their first recorded timestamp
flight_start_times = df_balanced.groupby("FLIGHT_ID")["UTC_TIME"].min().sort_values()

# Define the split point (80% train, 20% test)
split_index = int(len(flight_start_times) * 0.8)  # Get the 80% split point

# Assign the first 80% of flights to training, last 20% to testing
train_flights = flight_start_times.index[:split_index]  # First 80% of flights
test_flights = flight_start_times.index[split_index:]  # Last 20% of flights

In [19]:
train_df = df_balanced[df_balanced["FLIGHT_ID"].isin(train_flights)]
test_df = df_balanced[df_balanced["FLIGHT_ID"].isin(test_flights)]

In [20]:
train_df["LEAK_FLOW_FLAG"].value_counts(normalize=True) * 100

LEAK_FLOW_FLAG
0    74.962521
1    25.037479
Name: proportion, dtype: float64

In [21]:
# Reset index (PyCaret needs a clean index)
train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

### Pycaret Classification

In [None]:
# Use `timeseries` fold strategy while enforcing all required settings
clf_setup = setup(
    data=train_df, 
    target=target,
    train_size=0.8,
    session_id=42, 
    fold_strategy="timeseries",  # Time-based validation
    fold=5,
    data_split_shuffle=False,  # Prevents PyCaret from shuffling time order
    fold_shuffle=False,  # Ensures validation comes AFTER training
    data_split_stratify=False  # Disables stratification (not allowed with time-series)
)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,LEAK_FLOW_FLAG
2,Target type,Binary
3,Original data shape,"(74041, 146)"
4,Transformed data shape,"(74041, 156)"
5,Transformed train set shape,"(59232, 156)"
6,Transformed test set shape,"(14809, 156)"
7,Numeric features,135
8,Date features,5
9,Categorical features,5


In [None]:
# Use `timeseries` fold strategy WITHOUT re-splitting
clf_setup = setup(
    data=train_df, 
    target=target,
    session_id=42, 
    fold_strategy="timeseries",  # Time-based validation
    fold=5,
    data_split_shuffle=False,  # Prevents PyCaret from shuffling time order
    fold_shuffle=False,  # Ensures validation comes AFTER training
    data_split_stratify=False  # Disables stratification (not allowed with time-series)
)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,LEAK_FLOW_FLAG
2,Target type,Binary
3,Original data shape,"(74041, 146)"
4,Transformed data shape,"(74041, 156)"
5,Transformed train set shape,"(51828, 156)"
6,Transformed test set shape,"(22213, 156)"
7,Numeric features,135
8,Date features,5
9,Categorical features,5


In [27]:
best_model = compare_models(sort='f1')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ada,Ada Boost Classifier,0.7557,0.8166,0.6451,0.5081,0.5623,0.3725,0.3818,4.552
gbc,Gradient Boosting Classifier,0.7559,0.8171,0.6102,0.5111,0.5479,0.3601,0.369,22.584
lightgbm,Light Gradient Boosting Machine,0.7558,0.0,0.5376,0.5127,0.5233,0.3379,0.3389,0.702
dt,Decision Tree Classifier,0.7569,0.0,0.504,0.5167,0.5099,0.3269,0.3272,1.274
rf,Random Forest Classifier,0.7531,0.0,0.3497,0.5115,0.4143,0.2554,0.2619,3.25
svm,SVM - Linear Kernel,0.7169,0.7742,0.4498,0.4859,0.3733,0.1939,0.2367,1.532
qda,Quadratic Discriminant Analysis,0.4805,0.492,0.5309,0.2976,0.2924,0.0321,0.042,0.318
knn,K Neighbors Classifier,0.5798,0.0,0.3688,0.2719,0.2891,0.0296,0.0257,1.002
lr,Logistic Regression,0.7514,0.8051,0.1971,0.5066,0.2779,0.164,0.1875,2.202
nb,Naive Bayes,0.5699,0.0,0.3419,0.2967,0.254,0.061,0.0455,0.18


In [None]:
# Prepare `test_df` (drop target column)
X_test = test_df.drop(columns=[target])
y_test = test_df[target]

# Make Predictions on `test_df` (Unseen Data)
y_pred = predict_model(best_model, data=X_test)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Extract Predictions
y_pred_labels = y_pred["prediction_label"]  # Predicted classes (0 or 1)
y_pred_scores = y_pred["prediction_score"]  # Predicted probability for class 1

# Compute Final Test Metrics
accuracy = accuracy_score(y_test, y_pred_labels)
precision = precision_score(y_test, y_pred_labels)
recall = recall_score(y_test, y_pred_labels)
f1 = f1_score(y_test, y_pred_labels)
roc_auc = roc_auc_score(y_test, y_pred_scores)  # Uses probability for AUC-ROC

# Print Final Test Results
print(f"Final Test Set Results:")
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-Score:  {f1:.4f}")
print(f"ROC-AUC:   {roc_auc:.4f}")


Final Test Set Results:
Accuracy:  0.7949
Precision: 0.4952
Recall:    0.3837
F1-Score:  0.4324
ROC-AUC:   0.1287
