# Imports

In [200]:
import pandas as pd
import numpy as np
from pycaret.classification import *
from sklearn.model_selection import train_test_split

# The dataset

In [323]:
df = pd.read_csv("combined_simulated.csv")

In [324]:
# Convert 'UTC_TIME' column to datetime format and sort by time
df['UTC_TIME'] = pd.to_datetime(df['UTC_TIME'])
df.sort_values(by=['FLIGHT_INSTANCE', 'UTC_TIME'], inplace=True)

In [325]:
# Dropping irrelvant columns
df = df.drop(columns=['FLIGHT_PHASE_COUNT', 'Flight','MSN', 'FLIGHT_INSTANCE', 'NEW_FLIGHT'])

In [326]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 376554 entries, 0 to 337922
Data columns (total 30 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   UTC_TIME                   376554 non-null  datetime64[ns]
 1   FUEL_USED_2                376554 non-null  float64       
 2   FUEL_USED_3                376554 non-null  float64       
 3   FUEL_USED_4                376554 non-null  float64       
 4   FW_GEO_ALTITUDE            376554 non-null  float64       
 5   VALUE_FOB                  376554 non-null  float64       
 6   VALUE_FUEL_QTY_CT          376554 non-null  float64       
 7   VALUE_FUEL_QTY_FT1         376554 non-null  float64       
 8   VALUE_FUEL_QTY_FT2         376554 non-null  float64       
 9   VALUE_FUEL_QTY_FT3         376554 non-null  float64       
 10  VALUE_FUEL_QTY_FT4         376554 non-null  float64       
 11  VALUE_FUEL_QTY_LXT         376554 non-null  float64      

In [327]:
df['LEAK_FLOW_FLAG'].value_counts()

LEAK_FLOW_FLAG
0    343079
1     33475
Name: count, dtype: int64

### Cutting out all No-Leak Flights before 2017

In [328]:
# Identify flights with leaks (Keep these)
leak_flights = df[df["LEAK_FLOW_FLAG"] == 1]["FLIGHT_ID"].unique()

# Identify no-leak flights that started before 2017
no_leak_flights = df[df["LEAK_FLOW_FLAG"] == 0].groupby("FLIGHT_ID")["UTC_TIME"].min()
no_leak_flights_to_remove = no_leak_flights[no_leak_flights < "2017-10-22"].index

In [329]:
# Step 3: Keep all leak flights + no-leak flights that started in 2017 or later
df_filtered = df[df["FLIGHT_ID"].isin(leak_flights) | ~df["FLIGHT_ID"].isin(no_leak_flights_to_remove)]

In [330]:
print(f"Original dataset size: {df.shape[0]}")
print(f"Filtered dataset size: {df_filtered.shape[0]}")

Original dataset size: 376554
Filtered dataset size: 247584


In [331]:
df_filtered['LEAK_FLOW_FLAG'].value_counts()

LEAK_FLOW_FLAG
0    214109
1     33475
Name: count, dtype: int64

In [None]:
# Get mean of leaks per flight (better than sum bc each flight may have varying flight durations)
flight_leak_counts = df_filtered.groupby("FLIGHT_ID")["LEAK_FLOW_FLAG"].mean()

#  Compute the media Number of Leaks per Flight
median_leaks_per_flight = flight_leak_counts[flight_leak_counts > 0].median()
print(f"median Number of Leaks per Flight: {median_leaks_per_flight:.2f}")

# Select flights where the number of leaks is higher than the median
flights_with_leaks = flight_leak_counts[flight_leak_counts > median_leaks_per_flight].index

# EXTREME Reduction: Only keep 1% as many no-leak flights as leak flights
num_leak_flights = len(flights_with_leaks)  # Total number of flights with leaks
num_no_leak_flights = int(num_leak_flights * 0.80)  # Cut down no-leak flights aggressively

# Sample only the reduced number of no-leak flights
flights_no_leaks = flight_leak_counts[flight_leak_counts == 0].sample(
    n=num_no_leak_flights, 
    random_state=42
).index

# Keep only selected flights (Preserves sequences!)
df_balanced = df_filtered[df_filtered["FLIGHT_ID"].isin(flights_with_leaks.union(flights_no_leaks))]

median Number of Leaks per Flight: 0.25


In [333]:
df_balanced["LEAK_FLOW_FLAG"].value_counts(normalize=True) * 100

LEAK_FLOW_FLAG
0    76.336882
1    23.663118
Name: proportion, dtype: float64

In [334]:
df_balanced.info()

<class 'pandas.core.frame.DataFrame'>
Index: 106423 entries, 153 to 337922
Data columns (total 30 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   UTC_TIME                   106423 non-null  datetime64[ns]
 1   FUEL_USED_2                106423 non-null  float64       
 2   FUEL_USED_3                106423 non-null  float64       
 3   FUEL_USED_4                106423 non-null  float64       
 4   FW_GEO_ALTITUDE            106423 non-null  float64       
 5   VALUE_FOB                  106423 non-null  float64       
 6   VALUE_FUEL_QTY_CT          106423 non-null  float64       
 7   VALUE_FUEL_QTY_FT1         106423 non-null  float64       
 8   VALUE_FUEL_QTY_FT2         106423 non-null  float64       
 9   VALUE_FUEL_QTY_FT3         106423 non-null  float64       
 10  VALUE_FUEL_QTY_FT4         106423 non-null  float64       
 11  VALUE_FUEL_QTY_LXT         106423 non-null  float64    

# Modeling

In [None]:
# Select all features (except the target)
features = df_balanced.columns.tolist()
features.remove('LEAK_FLOW_FLAG')  # Remove target column from features

# Target variable
target = 'LEAK_FLOW_FLAG'

In [180]:
# Create lag features (memory for time-series)
def create_lag_features(df, features, lags=[1, 3, 5, 10]):
    """
    Adds past values (lags) as new features to help capture time dependency.
    """
    df = df.copy()
    for lag in lags:
        for feature in features:
            df[f'{feature}_lag{lag}'] = df.groupby('FLIGHT_ID')[feature].shift(lag)
    return df

df_balanced = create_lag_features(df_balanced, features, lags=[1, 3, 5, 10]) 

In [181]:
# Drop rows with NaN values (due to lagging)
df_balanced.dropna(inplace=True)

In [182]:
df_balanced.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 104143 entries, 163 to 337922
Data columns (total 146 columns):
 #    Column                           Non-Null Count   Dtype         
---   ------                           --------------   -----         
 0    UTC_TIME                         104143 non-null  datetime64[ns]
 1    FUEL_USED_2                      104143 non-null  float64       
 2    FUEL_USED_3                      104143 non-null  float64       
 3    FUEL_USED_4                      104143 non-null  float64       
 4    FW_GEO_ALTITUDE                  104143 non-null  float64       
 5    VALUE_FOB                        104143 non-null  float64       
 6    VALUE_FUEL_QTY_CT                104143 non-null  float64       
 7    VALUE_FUEL_QTY_FT1               104143 non-null  float64       
 8    VALUE_FUEL_QTY_FT2               104143 non-null  float64       
 9    VALUE_FUEL_QTY_FT3               104143 non-null  float64       
 10   VALUE_FUEL_QTY_FT4               

In [None]:
# Sort flights by their first recorded timestamp
flight_start_times = df_balanced.groupby("FLIGHT_ID")["UTC_TIME"].min().sort_values()

# Define the split point (80% train, 20% test)
split_index = int(len(flight_start_times) * 0.8)  # Get the 80% split point

# Assign the first 80% of flights to training, last 20% to testing
train_flights = flight_start_times.index[:split_index]  # First 80% of flights
test_flights = flight_start_times.index[split_index:]  # Last 20% of flights

In [184]:
train_df = df_balanced[df_balanced["FLIGHT_ID"].isin(train_flights)]
test_df = df_balanced[df_balanced["FLIGHT_ID"].isin(test_flights)]

In [185]:
train_df["LEAK_FLOW_FLAG"].value_counts(normalize=True) * 100

LEAK_FLOW_FLAG
0    74.962521
1    25.037479
Name: proportion, dtype: float64

In [186]:
# Reset index (PyCaret needs a clean index)
train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

### Pycaret Classification

In [None]:
# Use `timeseries` fold strategy while enforcing all required settings
clf_setup = setup(
    data=train_df, 
    target=target,
    train_size=0.8,
    session_id=42, 
    fold_strategy="timeseries",  # Time-based validation
    fold=5,
    data_split_shuffle=False,  # Prevents PyCaret from shuffling time order
    fold_shuffle=False,  # Ensures validation comes AFTER training
    data_split_stratify=False  # Disables stratification (not allowed with time-series)
)



Unnamed: 0,Description,Value
0,Session id,42
1,Target,LEAK_FLOW_FLAG
2,Target type,Binary
3,Original data shape,"(74041, 151)"
4,Transformed data shape,"(74041, 161)"
5,Transformed train set shape,"(59232, 161)"
6,Transformed test set shape,"(14809, 161)"
7,Numeric features,135
8,Date features,5
9,Categorical features,9


In [156]:
best_model = compare_models(sort='Recall')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
qda,Quadratic Discriminant Analysis,0.3612,0.4757,0.7921,0.2441,0.3623,-0.0005,0.007,0.384
gbc,Gradient Boosting Classifier,0.7607,0.8244,0.7528,0.5105,0.6051,0.4237,0.4419,26.232
ada,Ada Boost Classifier,0.7592,0.8222,0.6764,0.5114,0.5728,0.3912,0.4054,5.028
lightgbm,Light Gradient Boosting Machine,0.7614,0.0,0.5731,0.5161,0.5421,0.3651,0.3666,0.68
dt,Decision Tree Classifier,0.7589,0.0,0.4811,0.5128,0.4951,0.3216,0.3225,1.412
rf,Random Forest Classifier,0.7583,0.0,0.4082,0.5124,0.4419,0.2799,0.2887,3.604
nb,Naive Bayes,0.5937,0.0,0.4068,0.2306,0.2421,0.0556,0.0678,0.276
svm,SVM - Linear Kernel,0.7197,0.7521,0.3967,0.4974,0.3092,0.1179,0.1798,1.204
knn,K Neighbors Classifier,0.5703,0.0,0.3842,0.2722,0.2978,0.0379,0.0296,1.212
lda,Linear Discriminant Analysis,0.7539,0.8027,0.3745,0.486,0.3619,0.2189,0.2393,0.604


In [233]:
# Tune the best model for better recall
# tuned_model = tune_model(best_model, optimize='Recall')

### Pycaret Anomaly Detection

In [None]:
# Drop target variable before anomaly detection setup
train_df_anomaly = train_df.drop(columns=['LEAK_FLOW_FLAG'], errors='ignore')

# Define features to use (exclude object & datetime columns)
numeric_features = [col for col in train_df_anomaly.columns if train_df_anomaly[col].dtype in ['float64', 'int64']]
ignore_features = ['FLIGHT_ID', 'UTC_TIME'] 

In [196]:
train_df_anomaly.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74041 entries, 0 to 74040
Data columns (total 145 columns):
 #    Column                           Dtype         
---   ------                           -----         
 0    UTC_TIME                         datetime64[ns]
 1    FUEL_USED_2                      float64       
 2    FUEL_USED_3                      float64       
 3    FUEL_USED_4                      float64       
 4    FW_GEO_ALTITUDE                  float64       
 5    VALUE_FOB                        float64       
 6    VALUE_FUEL_QTY_CT                float64       
 7    VALUE_FUEL_QTY_FT1               float64       
 8    VALUE_FUEL_QTY_FT2               float64       
 9    VALUE_FUEL_QTY_FT3               float64       
 10   VALUE_FUEL_QTY_FT4               float64       
 11   VALUE_FUEL_QTY_LXT               float64       
 12   VALUE_FUEL_QTY_RXT               float64       
 13   FUEL_USED_1                      float64       
 14   FLIGHT_ID           

In [None]:

# Display the number of unique values for each column in train_df
unique_counts = train_df_anomaly.nunique().sort_values()

# Print unique values per column
print(unique_counts)

CALC_VALUE_FOB_DIFF_lag10       70
CALC_VALUE_FOB_DIFF             71
CALC_VALUE_FOB_DIFF_lag1        71
CALC_VALUE_FOB_DIFF_lag5        71
CALC_VALUE_FOB_DIFF_lag3        71
                             ...  
UTC_TIME_lag1                74041
UTC_TIME_lag10               74041
UTC_TIME_lag5                74041
UTC_TIME_lag3                74041
UTC_TIME                     74041
Length: 145, dtype: int64


In [None]:
from pycaret.anomaly import setup, create_model, assign_model

#PyCaret anomaly detection setup
ano_setup = setup(
    data=train_df_anomaly,  # Use modified dataset without LEAK_FLOW_FLAG
    session_id=42,
    numeric_features=numeric_features,  
    ignore_features=ignore_features,  
    normalize=True,  
    transformation=True,  
    remove_multicollinearity=True,  
    multicollinearity_threshold=0.95,  
    outliers_threshold=0.05,  
    profile=False  
)

In [None]:
# Train multiple anomaly detection models
iforest_model = create_model('iforest')  # Isolation Forest

In [204]:
lof_model = create_model('lof')  # Local Outlier Factor

In [None]:
# Assign anomaly predictions for each model
iforest_results = assign_model(iforest_model)
lof_results = assign_model(lof_model)

# Rename anomaly columns for clarity
iforest_results.rename(columns={"Anomaly": "Anomaly_iforest"}, inplace=True)
lof_results.rename(columns={"Anomaly": "Anomaly_lof"}, inplace=True)

In [None]:
#  Merge anomaly results back into the original dataset
df_eval = train_df[['FLIGHT_ID', 'LEAK_FLOW_FLAG']].copy()  # Ensure original dataset columns are available
df_eval = df_eval.merge(iforest_results[['Anomaly_iforest']], left_index=True, right_index=True)
df_eval = df_eval.merge(lof_results[['Anomaly_lof']], left_index=True, right_index=True)

In [None]:
#  Convert anomalies (1 = Outlier) to match `LEAK_FLOW_FLAG` (1 = Leak)
for col in ['Anomaly_iforest', 'Anomaly_lof']:
    df_eval[col] = df_eval[col].astype(int)  # Ensure integer labels

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Define a function to compute evaluation metrics
def evaluate_model(predictions, true_labels):
    accuracy = accuracy_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions, zero_division=0)
    recall = recall_score(true_labels, predictions, zero_division=0)
    f1 = f1_score(true_labels, predictions, zero_division=0)
    return accuracy, precision, recall, f1

# Compute metrics for each model
metrics = {
    "Model": [],
    "Accuracy": [],
    "Precision": [],
    "Recall": [],
    "F1-Score": []
}

for model_name in ['Anomaly_iforest', 'Anomaly_lof']:
    acc, prec, rec, f1 = evaluate_model(df_eval[model_name], df_eval["LEAK_FLOW_FLAG"])
    metrics["Model"].append(model_name)
    metrics["Accuracy"].append(acc)
    metrics["Precision"].append(prec)
    metrics["Recall"].append(rec)
    metrics["F1-Score"].append(f1)

In [None]:
# Convert results into a DataFrame
metrics_df = pd.DataFrame(metrics)

In [215]:
metrics_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score
0,Anomaly_iforest,0.725044,0.254187,0.050761,0.084622
1,Anomaly_lof,0.726854,0.272285,0.054375,0.090647


### LSTM

In [None]:
# Step 1: Sort flights by first recorded timestamp
flight_start_times = df_balanced.groupby("FLIGHT_ID")["UTC_TIME"].min().sort_values()

# Step 2: Define the split point (80% train, 20% test)
split_index = int(len(flight_start_times) * 0.8)  # Get the 80% split point

# Step 3: Assign the first 80% of flights to training, last 20% to testing
train_flights = flight_start_times.index[:split_index]  # First 80% of flights
test_flights = flight_start_times.index[split_index:]  # Last 20% of flights

train_df = df_balanced[df_balanced["FLIGHT_ID"].isin(train_flights)]
test_df = df_balanced[df_balanced["FLIGHT_ID"].isin(test_flights)]

# Step 4: Reset index (ensures PyCaret compatibility)
train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

In [None]:
# Select features & target
features = [col for col in train_df.columns if col not in ['FLIGHT_ID', 'UTC_TIME', 'LEAK_FLOW_FLAG']]
target = "LEAK_FLOW_FLAG"

In [294]:
train_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75861 entries, 0 to 75860
Data columns (total 30 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   UTC_TIME                   75861 non-null  datetime64[ns]
 1   FUEL_USED_2                75861 non-null  float64       
 2   FUEL_USED_3                75861 non-null  float64       
 3   FUEL_USED_4                75861 non-null  float64       
 4   FW_GEO_ALTITUDE            75861 non-null  float64       
 5   VALUE_FOB                  75861 non-null  float64       
 6   VALUE_FUEL_QTY_CT          75861 non-null  float64       
 7   VALUE_FUEL_QTY_FT1         75861 non-null  float64       
 8   VALUE_FUEL_QTY_FT2         75861 non-null  float64       
 9   VALUE_FUEL_QTY_FT3         75861 non-null  float64       
 10  VALUE_FUEL_QTY_FT4         75861 non-null  float64       
 11  VALUE_FUEL_QTY_LXT         75861 non-null  float64       
 12  VALU

In [None]:
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.utils import to_categorical

# Select numerical features only
numerical_features = train_df.select_dtypes(include=['float64', 'int64']).columns.tolist()

# Remove the target column (`LEAK_FLOW_FLAG`) from scaling
numerical_features.remove("LEAK_FLOW_FLAG")  

# Normalize only numerical features (LSTMs perform better with scaled data)
scaler = MinMaxScaler()
train_df[numerical_features] = scaler.fit_transform(train_df[numerical_features])

In [None]:
# Define time steps for LSTM (how many past steps to use for each prediction)
time_steps = 10  # Use last 10 timesteps to predict next one

In [None]:
# Function to Convert Data Into LSTM Format (3D: samples, time_steps, features)
def create_lstm_sequences(df, features, target, time_steps=10):
    X, y = [], []
    
    # Process each flight separately
    for flight_id, flight_data in df.groupby("FLIGHT_ID"):
        flight_data = flight_data.sort_values("UTC_TIME")  # Ensure correct order
        feature_values = flight_data[features].values
        target_values = flight_data[target].values

        # Create sequences
        for i in range(time_steps, len(flight_data)):
            X.append(feature_values[i - time_steps:i])  # Last `time_steps` for each row
            y.append(target_values[i])  # Next step's target value

    return np.array(X), np.array(y)

In [None]:
# Apply function to training & testing sets
X_train, y_train = create_lstm_sequences(train_df, numerical_features, "LEAK_FLOW_FLAG", time_steps=10)
X_test, y_test = create_lstm_sequences(test_df, numerical_features, "LEAK_FLOW_FLAG", time_steps=10)

# Convert target to categorical (for binary classification)
y_train = to_categorical(y_train, num_classes=2)
y_test = to_categorical(y_test, num_classes=2)

In [None]:
# Print Final Shape
print("Train Shape:", X_train.shape, y_train.shape)
print("Test Shape:", X_test.shape, y_test.shape)

Train Shape: (74041, 10, 27) (74041, 2)
Test Shape: (30102, 10, 27) (30102, 2)


In [302]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

model = Sequential([
    LSTM(64, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])),
    Dropout(0.2),
    LSTM(32, return_sequences=False),
    Dropout(0.2),
    Dense(2, activation="softmax")  # Binary classification (leak=1, no leak=0)
])

In [None]:
# Need the f1 score
import tensorflow as tf
from tensorflow.keras import backend as K

# Custom Precision
def precision(y_true, y_pred):
    true_positives = K.sum(K.round(y_true * y_pred))
    predicted_positives = K.sum(K.round(y_pred))
    return true_positives / (predicted_positives + K.epsilon())

# Custom Recall
def recall(y_true, y_pred):
    true_positives = K.sum(K.round(y_true * y_pred))
    possible_positives = K.sum(K.round(y_true))
    return true_positives / (possible_positives + K.epsilon())

# Custom F1 Score
def f1_score(y_true, y_pred):
    prec = precision(y_true, y_pred)
    rec = recall(y_true, y_pred)
    return 2 * ((prec * rec) / (prec + rec + K.epsilon()))

In [None]:
# Compile the model
model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=[
        "accuracy", 
        precision, 
        recall, 
        f1_score, 
        tf.keras.metrics.AUC(name="auc")  # Built-in AUC-ROC
    ]
)

In [None]:
# Print Model Summary
model.summary()

In [None]:
# Train the LSTM Model
history = model.fit(
    X_train, y_train, 
    epochs=10, 
    batch_size=32, 
    validation_data=(X_test, y_test)
)

Epoch 1/10
[1m2314/2314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 4ms/step - accuracy: 0.7516 - auc: 0.8735 - f1_score: 0.7516 - loss: 0.3797 - precision: 0.7516 - recall: 0.7516 - val_accuracy: 0.4698 - val_auc: 0.5630 - val_f1_score: 0.4699 - val_loss: 0.6679 - val_precision: 0.4699 - val_recall: 0.4699
Epoch 2/10
[1m2314/2314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - accuracy: 0.7490 - auc: 0.8715 - f1_score: 0.7490 - loss: 0.3805 - precision: 0.7490 - recall: 0.7490 - val_accuracy: 0.3106 - val_auc: 0.4136 - val_f1_score: 0.3107 - val_loss: 0.7376 - val_precision: 0.3107 - val_recall: 0.3107
Epoch 3/10
[1m2314/2314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 4ms/step - accuracy: 0.7532 - auc: 0.8756 - f1_score: 0.7532 - loss: 0.3709 - precision: 0.7532 - recall: 0.7532 - val_accuracy: 0.3103 - val_auc: 0.3830 - val_f1_score: 0.3104 - val_loss: 0.7210 - val_precision: 0.3104 - val_recall: 0.3104
Epoch 4/10
[1m2314/2314[0m [32m━━━━

In [None]:
# Save Model 
model.save("lstm_fuel_leak_model1.h5")



In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Get Predictions
y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=1)  # Convert softmax output to class labels
y_true = np.argmax(y_test, axis=1)  # Convert one-hot encoding back to labels

# Compute Classification Metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

# Print Results
print(f"LSTM Model - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")

[1m941/941[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step
LSTM Model - Accuracy: 0.4766, Precision: 0.2489, Recall: 0.7788, F1-Score: 0.3773


### Again

In [None]:
from sklearn.utils.class_weight import compute_class_weight
# Compute class weights
class_weights = compute_class_weight(
    class_weight="balanced", 
    classes=np.unique(np.argmax(y_train, axis=1)),  # Get class labels from one-hot encoding
    y=np.argmax(y_train, axis=1)  # Convert one-hot encoding back to labels
)

# Convert to dictionary format for Keras
class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}

In [None]:
# Convert to dictionary format for Keras
class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

# Define Deeper LSTM Model
model = Sequential([
    LSTM(128, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])),
    Dropout(0.3),
    LSTM(64, return_sequences=True),
    Dropout(0.3),
    LSTM(32, return_sequences=False),
    Dropout(0.3),
    Dense(2, activation="softmax")  # Binary classification
])

In [None]:
# Compile with Adam Optimizer & Learning Rate Adjustment
model.compile(
    loss="categorical_crossentropy",
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),  # Reduce learning rate
    metrics=["accuracy", precision, recall, f1_score, tf.keras.metrics.AUC(name="auc")]
)


In [None]:
# Train Model
history = model.fit(
    X_train, y_train, 
    epochs=30,  # Train longer for better learning
    batch_size=64,  # Larger batch sizes for stability
    validation_data=(X_test, y_test), 
    class_weight=class_weight_dict  # Use class weighting
)

Epoch 1/30
[1m1157/1157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 16ms/step - accuracy: 0.6267 - auc: 0.6989 - f1_score: 0.6267 - loss: 0.5861 - precision: 0.6267 - recall: 0.6267 - val_accuracy: 0.4154 - val_auc: 0.4631 - val_f1_score: 0.4155 - val_loss: 0.9042 - val_precision: 0.4155 - val_recall: 0.4155
Epoch 2/30
[1m1157/1157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 14ms/step - accuracy: 0.7237 - auc: 0.8451 - f1_score: 0.7237 - loss: 0.4313 - precision: 0.7237 - recall: 0.7237 - val_accuracy: 0.2880 - val_auc: 0.3133 - val_f1_score: 0.2884 - val_loss: 1.0577 - val_precision: 0.2884 - val_recall: 0.2884
Epoch 3/30
[1m1157/1157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 13ms/step - accuracy: 0.7200 - auc: 0.8430 - f1_score: 0.7200 - loss: 0.4325 - precision: 0.7200 - recall: 0.7200 - val_accuracy: 0.2642 - val_auc: 0.3024 - val_f1_score: 0.2646 - val_loss: 1.0528 - val_precision: 0.2646 - val_recall: 0.2646
Epoch 4/30
[1m1157/1157[0m [32m

In [None]:
# Save Model (Optional)
model.save("lstm_fuel_leak_model2.h5")



In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Get Predictions
y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=1)  # Convert softmax output to class labels
y_true = np.argmax(y_test, axis=1)  # Convert one-hot encoding back to labels

# Compute Classification Metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

# Print Results
print(f"LSTM Model - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")

[1m941/941[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step
LSTM Model - Accuracy: 0.2814, Precision: 0.2064, Recall: 0.8895, F1-Score: 0.3351


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Get Predictions
y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=1)  # Convert softmax output to class labels
y_true = np.argmax(y_test, axis=1)  # Convert one-hot encoding back to labels

# Compute Classification Metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
roc_auc = roc_auc_score(y_true, y_pred)  # Added ROC-AUC

# Print Results
print(f"LSTM Model - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}, ROC-AUC: {roc_auc:.4f}")

[1m941/941[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step
LSTM Model - Accuracy: 0.2814, Precision: 0.2064, Recall: 0.8895, F1-Score: 0.3351, ROC-AUC: 0.5078
