# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from kerastuner.tuners import RandomSearch

  from kerastuner.tuners import RandomSearch


# The dataset

In [33]:
# Load the dataset from a CSV file addressed relative to the working directory.
df = pd.read_csv("combined_simulated.csv")

In [34]:
# Parse 'UTC_TIME' into datetimes and use it as the index.
# The column check makes the cell safe to re-run: once 'UTC_TIME' has been
# moved into the index it is no longer a column, so the conversion is skipped
# instead of raising KeyError.
if 'UTC_TIME' in df.columns:
    df = (
        df
        .assign(UTC_TIME=pd.to_datetime(df['UTC_TIME']))
        .set_index('UTC_TIME')
    )

In [35]:
# Summarise the frame: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 376554 entries, 2015-07-21 03:51:31 to 2018-04-22 11:45:45
Data columns (total 34 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   FUEL_USED_2                376554 non-null  float64
 1   FUEL_USED_3                376554 non-null  float64
 2   FUEL_USED_4                376554 non-null  float64
 3   FW_GEO_ALTITUDE            376554 non-null  float64
 4   VALUE_FOB                  376554 non-null  float64
 5   VALUE_FUEL_QTY_CT          376554 non-null  float64
 6   VALUE_FUEL_QTY_FT1         376554 non-null  float64
 7   VALUE_FUEL_QTY_FT2         376554 non-null  float64
 8   VALUE_FUEL_QTY_FT3         376554 non-null  float64
 9   VALUE_FUEL_QTY_FT4         376554 non-null  float64
 10  VALUE_FUEL_QTY_LXT         376554 non-null  float64
 11  VALUE_FUEL_QTY_RXT         376554 non-null  float64
 12  FLIGHT_PHASE_COUNT         376554 non-null  float64


In [None]:
# Drop irrelevant columns.
# errors='ignore' keeps the cell re-runnable: df is rebound to the reduced
# frame, so a second execution would otherwise raise KeyError because the
# columns are already gone.
df = df.drop(columns=['FLIGHT_PHASE_COUNT', 'Flight', 'MSN'], errors='ignore')

In [28]:
# Inspect the frame again; this cell follows the column drop.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 376554 entries, 2015-07-21 03:51:31 to 2018-04-22 11:45:45
Data columns (total 30 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   FUEL_USED_2                376554 non-null  float64
 1   FUEL_USED_3                376554 non-null  float64
 2   FUEL_USED_4                376554 non-null  float64
 3   FW_GEO_ALTITUDE            376554 non-null  float64
 4   VALUE_FOB                  376554 non-null  float64
 5   VALUE_FUEL_QTY_CT          376554 non-null  float64
 6   VALUE_FUEL_QTY_FT1         376554 non-null  float64
 7   VALUE_FUEL_QTY_FT2         376554 non-null  float64
 8   VALUE_FUEL_QTY_FT3         376554 non-null  float64
 9   VALUE_FUEL_QTY_FT4         376554 non-null  float64
 10  VALUE_FUEL_QTY_LXT         376554 non-null  float64
 11  VALUE_FUEL_QTY_RXT         376554 non-null  float64
 12  FUEL_USED_1                376554 non-null  float64


In [None]:
# NOTE(review): same call as the preceding cell with nothing changed in
# between — looks like a leftover duplicate; confirm and consider removing.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 376554 entries, 2015-07-21 03:51:31 to 2018-04-22 11:45:45
Data columns (total 30 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   FUEL_USED_2                376554 non-null  float64
 1   FUEL_USED_3                376554 non-null  float64
 2   FUEL_USED_4                376554 non-null  float64
 3   FW_GEO_ALTITUDE            376554 non-null  float64
 4   VALUE_FOB                  376554 non-null  float64
 5   VALUE_FUEL_QTY_CT          376554 non-null  float64
 6   VALUE_FUEL_QTY_FT1         376554 non-null  float64
 7   VALUE_FUEL_QTY_FT2         376554 non-null  float64
 8   VALUE_FUEL_QTY_FT3         376554 non-null  float64
 9   VALUE_FUEL_QTY_FT4         376554 non-null  float64
 10  VALUE_FUEL_QTY_LXT         376554 non-null  float64
 11  VALUE_FUEL_QTY_RXT         376554 non-null  float64
 12  FUEL_USED_1                376554 non-null  float64


# Modeling

In [None]:
# Model inputs: every column except the label and the flight identifier.
# 'FLIGHT_INSTANCE' is only the grouping key; it is constant within a flight,
# so it should not be fed to the model as a feature.
# Index.drop raises KeyError if either column is missing, so a schema problem
# surfaces here rather than further downstream.
features = df.columns.drop(['LEAK_FLOW_FLAG', 'FLIGHT_INSTANCE']).tolist()

# Target variable (a one-element list, so df[target] selects a 2-D frame)
target = ['LEAK_FLOW_FLAG']

# Group by FLIGHT_INSTANCE to process each flight separately
flight_groups = df.groupby('FLIGHT_INSTANCE')

In [38]:
# Create a function to process each flight separately
def process_flight_data(flight_df):
    """Scale one flight's feature columns and extract its labels.

    Reads the module-level ``features`` and ``target`` column lists.

    Args:
        flight_df: DataFrame holding the rows of a single flight.

    Returns:
        Tuple ``(X_flight_scaled, y_flight)``: the feature values after
        min-max scaling, and the raw values of the target column(s).
    """
    labels = flight_df[target].values
    raw_inputs = flight_df[features].values

    # A new scaler is fitted on this flight only, so no scaling statistics
    # are shared between flights.
    # NOTE(review): the scaler sees the whole flight, including rows later
    # than any given window — confirm that is acceptable for this use case.
    scaled_inputs = MinMaxScaler().fit_transform(raw_inputs)

    return scaled_inputs, labels

In [39]:
# Per-flight accumulators: scaled features, labels and the matching flight ids
all_X = []
all_y = []
flight_ids = []

In [40]:
# Start from empty accumulators so that re-running this cell does not append
# every flight a second time.
all_X, all_y, flight_ids = [], [], []

# Scale each flight independently and record the order in which flights are
# seen: all_X[i] and all_y[i] belong to flight_ids[i].
for flight_id, flight_df in flight_groups:
    X_flight, y_flight = process_flight_data(flight_df)
    all_X.append(X_flight)
    all_y.append(y_flight)
    flight_ids.append(flight_id)

In [41]:
# Display the flight identifiers collected by the processing loop.
flight_ids

[0, 1, 2, 3, 4, 5, 6, 7, 9]

In [44]:
# One stratification label per flight, in the same order as flight_ids:
# the LEAK_FLOW_FLAG value found on the flight's first row.
# NOTE(review): presumably the first row is representative of the whole
# flight; if a flag can switch on mid-flight, an "any row flagged" label
# would stratify more faithfully — confirm against the data.
flight_labels = [
    df.loc[df['FLIGHT_INSTANCE'] == f, 'LEAK_FLOW_FLAG'].iloc[0]
    for f in flight_ids
]

# Split at flight level (not row level) so that a flight's rows never end up
# on both sides of the split. train_test_split comes from
# sklearn.model_selection and must be imported in the imports cell.
train_flights, test_flights = train_test_split(
    flight_ids, test_size=0.2, random_state=42, stratify=flight_labels
)

In [45]:
# Convert data into sequences for each flight
def create_flight_sequences(X_flight, y_flight, time_steps=30):
    """
    Converts data from a single flight into sequences of time steps.
    Ensures time-dependency within each flight.

    Window ``i`` holds rows ``i .. i + time_steps - 1`` of ``X_flight`` and is
    paired with the label at row ``i + time_steps`` (the step right after the
    window).

    Args:
        X_flight: Array-like of per-row features, first axis is time.
        y_flight: Array-like of per-row labels, first axis is time.
        time_steps: Number of consecutive rows in each window.

    Returns:
        Tuple ``(Xs, ys)`` of numpy arrays. ``Xs`` has shape
        ``(n_windows, time_steps, *X_flight.shape[1:])`` and ``ys`` has shape
        ``(n_windows, *y_flight.shape[1:])`` where
        ``n_windows = max(len(X_flight) - time_steps, 0)``.

    Raises:
        IndexError: If ``y_flight`` has fewer rows than ``X_flight``.
    """
    X_flight = np.asarray(X_flight)
    y_flight = np.asarray(y_flight)

    n_windows = len(X_flight) - time_steps
    if n_windows <= 0:
        # A flight too short for even one window must still return arrays with
        # the full trailing dimensions; a bare shape-(0,) array would make the
        # later np.concatenate across flights fail on a dimension mismatch.
        empty_X = np.empty((0, time_steps) + X_flight.shape[1:], dtype=X_flight.dtype)
        empty_y = np.empty((0,) + y_flight.shape[1:], dtype=y_flight.dtype)
        return empty_X, empty_y

    Xs = [X_flight[start:start + time_steps] for start in range(n_windows)]
    ys = [y_flight[start + time_steps] for start in range(n_windows)]  # Predict next step
    return np.array(Xs), np.array(ys)

In [46]:
# Empty containers for the per-flight sequence arrays, one pair per split
X_train_seq, y_train_seq = [], []
X_test_seq, y_test_seq = [], []

In [47]:
# (Re)create the accumulators here so the cell can be re-run safely: without
# this, a second run would append every flight again, and a run after the
# names have been rebound to numpy arrays would fail because ndarrays have
# no .append().
X_train_seq, y_train_seq, X_test_seq, y_test_seq = [], [], [], []

# Window each flight on its own, then route the windows to the train or test
# side according to which split the flight id was assigned to.
for i, flight_id in enumerate(flight_ids):
    X_flight_seq, y_flight_seq = create_flight_sequences(all_X[i], all_y[i], time_steps=30)

    if flight_id in train_flights:
        X_train_seq.append(X_flight_seq)
        y_train_seq.append(y_flight_seq)
    else:
        X_test_seq.append(X_flight_seq)
        y_test_seq.append(y_flight_seq)

In [48]:
# Merge the per-flight window arrays of each split along the first axis
# (np.concatenate joins along axis 0 by default).
# NOTE: the list names are rebound to the merged arrays, so re-running this
# cell on its own would operate on the already-merged arrays rather than on
# the per-flight lists.
X_train_seq, y_train_seq = np.concatenate(X_train_seq), np.concatenate(y_train_seq)
X_test_seq, y_test_seq = np.concatenate(X_test_seq), np.concatenate(y_test_seq)

In [49]:
# Define the GRU model with hyperparameter tuning
def build_model(hp):
    """Build and compile a two-layer GRU classifier for Keras Tuner.

    Reads the module-level ``features`` list to size the input layer.

    Hyperparameters registered on ``hp``:
        gru_units_1:   int, 32..128 step 16 (first GRU layer)
        dropout_1:     float, 0.1..0.5 step 0.1
        gru_units_2:   int, 16..64 step 16 (second GRU layer)
        dropout_2:     float, 0.1..0.5 step 0.1
        dense_units:   int, 8..64 step 8 (ReLU dense layer)
        learning_rate: one of 0.001, 0.0005, 0.0001 (Adam)

    Args:
        hp: Keras Tuner hyperparameter container supplied by the tuner.

    Returns:
        A compiled ``Sequential`` model with a single sigmoid output,
        binary cross-entropy loss and accuracy as metric.
    """
    # Window length of the input sequences; must match the time_steps used
    # when the training sequences are created.
    time_steps = 30

    model = Sequential()

    # Declare the input shape with an explicit Input object; passing
    # input_shape= to the first layer makes recent Keras versions emit a
    # UserWarning.
    model.add(keras.Input(shape=(time_steps, len(features))))

    # First GRU Layer (returns the full sequence for the next recurrent layer)
    model.add(GRU(units=hp.Int('gru_units_1', min_value=32, max_value=128, step=16),
                  return_sequences=True))

    # Dropout after first GRU layer
    model.add(Dropout(hp.Float('dropout_1', min_value=0.1, max_value=0.5, step=0.1)))

    # Second GRU Layer
    model.add(GRU(units=hp.Int('gru_units_2', min_value=16, max_value=64, step=16),
                  return_sequences=False))  # Last recurrent layer

    # Dropout after second GRU layer
    model.add(Dropout(hp.Float('dropout_2', min_value=0.1, max_value=0.5, step=0.1)))

    # Fully connected layers
    model.add(Dense(units=hp.Int('dense_units', min_value=8, max_value=64, step=8), activation='relu'))

    # Output layer for binary classification (sigmoid activation)
    model.add(Dense(1, activation='sigmoid'))

    # Compile the model
    model.compile(optimizer=Adam(learning_rate=hp.Choice('learning_rate', values=[0.001, 0.0005, 0.0001])),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    return model

In [None]:
# Set up Random Search using Keras Tuner.
# Each of the 10 trials samples one hyperparameter combination from the space
# defined in build_model, trains it once and is ranked by validation accuracy.
# A fixed seed makes the sampled combinations reproducible across runs; 42
# matches the random_state used for the train/test split.
tuner = RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=10,
    executions_per_trial=1,
    seed=42,
    directory='tuner_results',
    project_name='gru_fuel_leak_classification'
)

  super().__init__(**kwargs)


In [52]:
# Perform hyperparameter tuning.
# Validation data is carved out of the training sequences (Keras takes the
# last 20% of the samples, before shuffling) instead of using the test set:
# selecting hyperparameters on X_test_seq / y_test_seq would leak test
# information into the model choice and inflate any later test-set score.
tuner.search(X_train_seq, y_train_seq, epochs=10, batch_size=32,
             validation_split=0.2, verbose=1)

In [None]:
# Fetch the top-ranked hyperparameter set from the finished search
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# (label shown in the report, hyperparameter name registered in build_model)
report_fields = [
    ("GRU Units 1", "gru_units_1"),
    ("GRU Units 2", "gru_units_2"),
    ("Dropout 1", "dropout_1"),
    ("Dropout 2", "dropout_2"),
    ("Dense Units", "dense_units"),
    ("Learning Rate", "learning_rate"),
]

# The leading and trailing empty entries reproduce the blank line before the
# header and the newline after the last value.
report_lines = ["", "Best hyperparameters found:"]
report_lines.extend(f"{label}: {best_hps.get(key)}" for label, key in report_fields)
report_lines.append("")
print("\n".join(report_lines))