In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from pickle import dump, load
import scipy.stats as stats

from sklearn.preprocessing import MinMaxScaler
from tensorflow import keras
import tensorflow as tf
from keras_tuner import BayesianOptimization, Objective

import plotly.express as px
import plotly.io as pio
pio.templates.default="simple_white"

from progtools.preprocessing import RawFlightData
from progtools.modelling import gpu_check, PhysicalSensorHyperModel

In [None]:
#run the progtools GPU helper before any model work
#presumably reports whether TensorFlow can see a GPU — confirm against progtools.modelling
gpu_check()

In [None]:
#path of the N-CMAPSS DS05 archive, relative to the notebook's working directory
filename = "data/N-CMAPSS_DS05.h5"

#reader object that hands out the individual tables of the archive
raw_data_read = RawFlightData(filename=filename)

#development-set frames: flight data (W), sensor data (X_s) and auxiliary data (A)
df_W_dev, df_X_s_dev, df_A_dev = (
    raw_data_read.dev_flight_data(),
    raw_data_read.dev_sensor_data(),
    raw_data_read.dev_aux_data(),
)

In [None]:
#keep every 10th index label (1, 11, 21, ...) so model iteration is quicker
sample_index = np.arange(1, max(df_A_dev.index), 10)

#apply the same label selection to all three development frames
df_A_dev, df_W_dev, df_X_s_dev = (
    df_A_dev.loc[sample_index],
    df_W_dev.loc[sample_index],
    df_X_s_dev.loc[sample_index],
)

#rows flagged hs == 1 feed the healthy behaviour model; the flag lives in the auxiliary frame
healthy_rows = df_A_dev["hs"] == 1
df_A_healthy = df_A_dev[healthy_rows]
df_W_healthy = df_W_dev[healthy_rows]
df_X_s_healthy = df_X_s_dev[healthy_rows]

In [None]:
#restrict to healthy rows once, then report the highest healthy cycle number seen for every unit
healthy_aux = df_A_dev[df_A_dev["hs"] == 1]
for unit in df_A_dev["unit"].unique():
    cyc = healthy_aux.loc[healthy_aux["unit"] == unit, "cycle"].max()
    print(f"Maximum number of healthy cycles for unit {unit:.0f}: {cyc:.0f}")

# Splitting Training / Test Sets & Scaling

In [None]:
#boolean masks over the healthy rows: units 1-4 train the models, units above 4 are held out for testing
train_units = df_A_healthy["unit"] <= 4
test_units = df_A_healthy["unit"] > 4

#model inputs, taken from the flight-data frame
training_W = np.asarray(df_W_healthy[train_units])
test_W = np.asarray(df_W_healthy[test_units])

#model targets, taken from the sensor-data frame
training_X_s = np.asarray(df_X_s_healthy[train_units])
test_X_s = np.asarray(df_X_s_healthy[test_units])

In [None]:
#fit a min-max scaler on the training inputs only, so the held-out units do not influence the scaling
scaler_W = MinMaxScaler().fit(training_W)

#persist the fitted scaler for reuse by the evaluation cells
#the context manager flushes and closes the file as soon as the pickle is written
with open("sensor_models/scaler_W.pkl", "wb") as scaler_file:
    dump(scaler_W, scaler_file)

In [None]:
#apply the fitted scaler to both input sets
#NOTE: this rebinds training_W / test_W, so re-running this cell on its own would scale the data a second time
training_W, test_W = scaler_W.transform(training_W), scaler_W.transform(test_W)

#print the shapes so the input/target row counts can be compared by eye
print(f"Input training shape: {training_W.shape}")
print(f"Output training shape: {training_X_s.shape}")

print(f"Input test shape: {test_W.shape}")
print(f"Output test shape: {test_X_s.shape}")

# Model

In [None]:
#rows of the healthy set that belong to the training units (1-4); identical for every sensor, so computed once
sensor_train_rows = df_A_healthy["unit"] <= 4

#tune, select and store one model per sensor
for s in tqdm(df_X_s_healthy.columns):
    #single-column target for this sensor (the double brackets keep it 2-D)
    #held under its own name so the all-sensor training_X_s matrix from the split cell is not overwritten
    sensor_target = np.asarray(df_X_s_healthy[[s]][sensor_train_rows])

    #Bayesian hyper-parameter search, minimising validation MSE over at most 50 trials
    #overwrite=False reloads an existing search project of the same name from sensor_searches/ if one is found
    hyper_model_search = BayesianOptimization(PhysicalSensorHyperModel(),
                                              objective=Objective("val_mse",
                                                                  direction="min"),
                                              max_trials=50,
                                              seed=42,
                                              directory="sensor_searches",
                                              project_name=s,
                                              overwrite=False)

    #run the search; validation_split=0.2 provides the validation set the val_mse objective needs
    hyper_model_search.search(training_W,
                              sensor_target,
                              validation_split=0.2)

    #best model found by the search
    model_out = hyper_model_search.get_best_models(num_models=1)[0]

    #the predictions are discarded; presumably this call only forces the model to build before saving — TODO confirm
    model_out.predict(training_W)

    #report which sensor has just finished (only the name is printed, not a model summary)
    print(f"Summary for sensor: {s}")

    #save model for later use
    model_out.save(f"sensor_models/{s}_model.h5")

# Testing Model Quality

In [None]:
#reload the development frames at full resolution, undoing the 1-in-10 downsampling used for training
df_W_dev, df_X_s_dev, df_A_dev = (
    raw_data_read.dev_flight_data(),
    raw_data_read.dev_sensor_data(),
    raw_data_read.dev_aux_data(),
)

#keep only the healthy rows (hs == 1) of each development frame
is_healthy = df_A_dev["hs"] == 1
df_A_healthy = df_A_dev[is_healthy]
df_W_healthy = df_W_dev[is_healthy]
df_X_s_healthy = df_X_s_dev[is_healthy]

#healthy flight data as a plain array, ready for scaling
W_healthy = np.asarray(df_W_healthy)

In [None]:
#load the scaler used in model training; the context manager closes the file handle once it has been read
#NOTE: unpickling can execute arbitrary code — only point this at the file written by this notebook
with open("sensor_models/scaler_W.pkl", "rb") as scaler_file:
    scaler_W = load(scaler_file)

#scale the full healthy input set with the same scaler the training inputs went through
testing_W = scaler_W.transform(W_healthy)

In [None]:
#one column of relative absolute error per sensor, collected here and assembled into a frame once after the loop
sensor_errors = {}

#loop through each sensor
for s in tqdm(df_X_s_healthy.columns):

    #ground truth for this sensor (the double brackets keep it as a single 2-D column)
    testing_X_s = np.asarray(df_X_s_healthy[[s]])

    #load the model stored for this sensor
    X_s_model = keras.models.load_model(f"sensor_models/{s}_model.h5")

    #predict over every healthy row (training and held-out units alike)
    X_s_pred = X_s_model.predict(testing_W)

    #absolute error relative to the magnitude of the ground truth: |yhat - y| / |y|
    #dividing by |y| rather than y keeps the error non-negative even if a reading is negative
    X_s_delta_normal = np.abs(X_s_pred - testing_X_s) / np.abs(testing_X_s)

    #flatten the single-column result so it can be stored as one frame column
    sensor_errors[s] = X_s_delta_normal.ravel()

    #print progress
    print(f"Sensor: {s} complete")

#build the results frame in one go instead of joining a new single-column frame on every iteration
df_test_results = pd.DataFrame(sensor_errors, index=df_W_healthy.index)

In [None]:
#express the relative errors as percentages, then attach each row's unit and cycle from the auxiliary frame
#NOTE: this rebinds df_test_results, so it must be run exactly once after the prediction loop
df_test_results = (df_test_results * 100).join(df_A_healthy[["unit", "cycle"]])

In [None]:
#average every sensor's percentage error within each (unit, cycle) pair, keeping unit and cycle as ordinary columns
per_unit_cycle = df_test_results.groupby(["unit", "cycle"], as_index=False)
df_test_results = per_unit_cycle.agg("mean")

In [None]:
#row masks are the same for every sensor, so build them once:
#units below 5 supplied the training data, units above 4 were held out for testing
mape_train_rows = df_test_results["unit"] < 5
mape_test_rows = df_test_results["unit"] > 4

#list of per-sensor result rows, turned into a dataframe after the loop
gen = []

#loop through all sensors
for i in df_test_results.columns.drop(["unit","cycle"]):
    mape_train = df_test_results.loc[mape_train_rows, i] #per-cycle MAPE on the training units
    mape_test = df_test_results.loc[mape_test_rows, i] #per-cycle MAPE on the test units

    mean_MAPE_A = mape_train.mean() #calculate the mean MAPE for training data
    mean_MAPE_B = mape_test.mean() #calculate the mean MAPE for test data

    #Mann-Whitney U test for training versus test data
    #the alternative is stated explicitly so the p-value does not depend on the installed SciPy version's default
    test_stat, p_val = stats.mannwhitneyu(mape_train, mape_test, alternative="two-sided")

    #append this sensor's row to the results list
    gen.append([i, mean_MAPE_A, mean_MAPE_B, test_stat, p_val])

df_gen = pd.DataFrame(data=gen, columns=["sensor", "average_MAPE_train", "average_MAPE_test", "test_stat", "p-value"])

In [None]:
#display the per-(unit, cycle) mean MAPE table produced by the groupby cell
df_test_results

In [None]:
#box plot of per-cycle MAPE for the first group of sensors, one box per unit in the development set
fig1_sensors = ["T24","T30","T48","T50","Nf","Nc","Wf"]
fig1 = px.box(
    df_test_results,
    y=fig1_sensors,
    color="unit",
    title="Boxplot of Mean Absolute Percentage Error by Sensor: Individual Models",
)

#label the axes (add range=[0,12] to update_yaxes to pin the scale)
#the chained call is the cell's last expression, so the figure is rendered
fig1.update_xaxes(title="Physical Sensor").update_yaxes(title="MAPE [%]")

In [None]:
#box plot of per-cycle MAPE for the second group of sensors, one box per unit in the development set
fig2_sensors = ["P15","P2","P21","P24","Ps30","P40","P50"]
fig2 = px.box(
    df_test_results,
    y=fig2_sensors,
    color="unit",
    title="Boxplot of Mean Absolute Percentage Error by Sensor: Individual Models",
)

#label the axes (add range=[0,12] to update_yaxes to pin the scale)
#the chained call is the cell's last expression, so the figure is rendered
fig2.update_xaxes(title="Physical Sensor").update_yaxes(title="MAPE [%]")

In [None]:
#one line chart per sensor: MAPE against cycle, with a separate trace for each unit
for i in df_test_results.columns.drop(["unit","cycle"]):
    fig3 = px.line(
        df_test_results,
        x="cycle",
        y=i,
        color="unit",
        title=f"Mean Absolute Percentage Error by Cycle for Sensor: {i}",
    )

    #axis labels (add range=[0,2] to update_yaxes to pin the scale)
    fig3.update_xaxes(title="Cycle [-]")
    fig3.update_yaxes(title="MAPE [%]")

    #render explicitly, since figures created inside a loop are not displayed automatically
    fig3.show()