<a href="https://colab.research.google.com/github/arpdm/predictive-maintenance-platform/blob/main/PdM_NASA_ENGINES_002.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Colab data file preparation
# Mount Google Drive so the files stored there become visible under /content/drive.
from google.colab import drive

drive.mount("/content/drive")

# Copy the two project helper modules from Drive into the current working directory
# so the following cell can import `data_processor` and `visualizer_analyzer`.
# NOTE(review): the source paths are relative ("drive/MyDrive/..."), so this presumably
# relies on the working directory being /content — confirm.
!cp drive/MyDrive/Predictive_Maintenence_Fault_Detection/predictive-maintenance-platform/data_processor.py .
!cp drive/MyDrive/Predictive_Maintenence_Fault_Detection/predictive-maintenance-platform/visualizer_analyzer.py .

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
from data_processor import DataProcessor
from visualizer_analyzer import DataAV

import tensorflow as tf

# Paths to the N-CMAPSS HDF5 dataset files (nothing is loaded here).
# The Drive folder is defined once so that relocating the data only requires
# editing DATA_SET_DIR; every DS_* value is the same string as before.
DATA_SET_DIR = "/content/drive/MyDrive/Predictive_Maintenence_Fault_Detection/data_set"

DS_001 = f"{DATA_SET_DIR}/N-CMAPSS_DS01-005.h5"
DS_002 = f"{DATA_SET_DIR}/N-CMAPSS_DS02-006.h5"
DS_003 = f"{DATA_SET_DIR}/N-CMAPSS_DS03-012.h5"
DS_004 = f"{DATA_SET_DIR}/N-CMAPSS_DS04.h5"
DS_005 = f"{DATA_SET_DIR}/N-CMAPSS_DS05.h5"
DS_006 = f"{DATA_SET_DIR}/N-CMAPSS_DS06.h5"
DS_007 = f"{DATA_SET_DIR}/N-CMAPSS_DS07.h5"
DS_008 = f"{DATA_SET_DIR}/N-CMAPSS_DS08a-009.h5"
DS_009 = f"{DATA_SET_DIR}/N-CMAPSS_DS08c-008.h5"
DS_010 = f"{DATA_SET_DIR}/N-CMAPSS_DS08d-010.h5"

In [3]:
# Load data set and prepare data frames
# DS_004 selects which of the dataset paths defined above is read.
# The loader presumably populates the array attributes on `pros` (x_s_dev, y_rul_dev,
# aux_dev, ...) that the next cell wraps in DataFrames — verify against DataProcessor.
pros = DataProcessor()
pros.load_hdf5_to_numpy_arr(DS_004)

Process data for NASA's engines run-to-failure datasets
Operation time (sec):  7.817488355


In [4]:
# Prepare Data Frames

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Wrap every array exposed by the data processor in a DataFrame, one frame per
# variable group and split. The "*_dev" arrays become the "*_train" frames.

# RUL targets (single column named "RUL")
df_rul_train = pd.DataFrame(pros.y_rul_dev, columns=["RUL"])
df_rul_test = pd.DataFrame(pros.y_rul_test, columns=["RUL"])

# x_s variable group
df_x_s_train = pd.DataFrame(pros.x_s_dev, columns=pros.x_s_var_names)
df_x_s_test = pd.DataFrame(pros.x_s_test, columns=pros.x_s_var_names)

# x_v variable group
df_x_v_train = pd.DataFrame(pros.x_v_dev, columns=pros.x_v_var_names)
df_x_v_test = pd.DataFrame(pros.x_v_test, columns=pros.x_v_var_names)

# Auxiliary variable group (later cells read its "cycle" and "unit" columns)
df_aux_train = pd.DataFrame(pros.aux_dev, columns=pros.aux_var_names)
df_aux_test = pd.DataFrame(pros.aux_test, columns=pros.aux_var_names)

# w variable group
df_w_train = pd.DataFrame(pros.w_dev, columns=pros.w_var_names)
df_w_test = pd.DataFrame(pros.w_test, columns=pros.w_var_names)


In [None]:
# Inspect the x_s training frame as built above, before any extra columns are attached.
df_x_s_train

In [7]:
# Attach per-row bookkeeping columns from the auxiliary frame to the x_s training frame.
# `.values` makes each assignment positional instead of index-aligned.
df_x_s_train["cycle"] = df_aux_train["cycle"].values
# RUL taken from the dataset's RUL array; stored as RUL_TRUTH because a separate
# "RUL" column is derived from the cycle counter in a later cell.
df_x_s_train["RUL_TRUTH"] = df_rul_train.values
# The auxiliary "unit" column is exposed as "id" (used later to select one engine).
df_x_s_train["id"] = df_aux_train["unit"].values

# Disabled: the same bookkeeping for the test frame (a later cell prepares the test
# frame and stores the dataset RUL under "RUL" rather than "RUL_TRUTH").
# df_x_s_test["cycle"] = df_aux_test["cycle"].values
# df_x_s_test["RUL_TRUTH"] = df_rul_test.values
# df_x_s_test["id"] = df_aux_test["unit"].values

# Disabled: dropping the P15 column.
# df_x_s_train = df_x_s_train.drop(columns=['P15'])

# Only the cycle counter is attached to the x_v frame; the RUL assignment is disabled.
df_x_v_train["cycle"] = df_aux_train["cycle"].values
# df_x_v_train["RUL"] = df_rul_train.values

# Only the cycle counter is attached to the w frame; the RUL assignment is disabled.
df_w_train["cycle"] = df_aux_train["cycle"].values
# df_w_train["RUL"] = df_rul_train.values

In [None]:
# Correlation heatmap of the x_s training frame (including cycle, RUL_TRUTH and id).
# Used to decide which features to keep and which to drop.
x_s_corr = df_x_s_train.corr()
heatmap_ax = sns.heatmap(x_s_corr, annot=True, cmap='RdYlGn', linewidths=0.2)

# Enlarge the figure the heatmap was drawn on so the annotations stay readable.
fig = heatmap_ax.figure
fig.set_size_inches(20, 20)
plt.show()

In [None]:
# Correlation heatmap of the x_v training frame.
# NOTE: only "cycle" was attached to df_x_v_train (the RUL assignment is commented
# out), so unless the x_v variables themselves contain an RUL column this plot shows
# correlations among the x_v features and with cycle, not with RUL.
sns.heatmap(df_x_v_train.corr(),annot=True,cmap='RdYlGn',linewidths=0.2)
# Resize the current figure after drawing so the annotated cells are readable.
fig=plt.gcf()
fig.set_size_inches(20,20)
plt.show()

In [None]:
# Correlation heatmap of the w training frame.
# NOTE: only "cycle" was attached to df_w_train (the RUL assignment is commented
# out), so unless the w variables themselves contain an RUL column this plot shows
# correlations among the w features and with cycle, not with RUL.
sns.heatmap(df_w_train.corr(),annot=True,cmap='RdYlGn',linewidths=0.2)
# Resize the current figure after drawing so the annotated cells are readable.
fig=plt.gcf()
fig.set_size_inches(20,20)
plt.show()

In [None]:
# Order the rows by engine id and then by cycle, so each engine's rows are contiguous
# and in cycle order before per-engine windows are taken. The index is not reset here.
df_x_s_train = df_x_s_train.sort_values(['id','cycle'])
df_x_s_train

In [None]:
# Data Labeling - generate column RUL
# RUL is derived per row as (last cycle recorded for that engine) - (current cycle).

# One row per engine id holding that engine's highest cycle number.
rul = df_x_s_train.groupby('id')['cycle'].max().reset_index()
rul.columns = ['id', 'max']

# Broadcast the per-engine maximum onto every row, compute RUL, then discard the
# temporary "max" column again.
df_x_s_train = df_x_s_train.merge(rul, how='left', on='id')
df_x_s_train['RUL'] = df_x_s_train['max'] - df_x_s_train['cycle']
df_x_s_train = df_x_s_train.drop(columns='max')
df_x_s_train.head()

In [None]:
# Inspect the training frame after the derived "RUL" column has been added.
df_x_s_train

In [None]:
# Correlation heatmap of the x_s training frame, which at this point also carries the
# derived "RUL" column next to RUL_TRUTH. Used to decide which features to keep.
x_s_rul_corr = df_x_s_train.corr()
heatmap_ax = sns.heatmap(x_s_rul_corr, annot=True, cmap='RdYlGn', linewidths=0.2)

# Enlarge the figure the heatmap was drawn on so the annotations stay readable.
fig = heatmap_ax.figure
fig.set_size_inches(20, 20)
plt.show()

In [None]:
# Label columns for the training data.
# Only "label1" is used here, for binary classification: is a specific engine going
# to fail within w1 cycles?
#   label1: 1 when RUL <= w1, otherwise 0
#   label2: same as label1, but 2 when RUL <= w0
import numpy as np

w1 = 30
w0 = 15

train_rul = df_x_s_train['RUL']
df_x_s_train['label1'] = np.where(train_rul <= w1, 1, 0)
# Start from label1 and overwrite with 2 inside the tighter w0 window.
df_x_s_train['label2'] = np.where(train_rul <= w0, 2, df_x_s_train['label1'])
df_x_s_train.head()

In [None]:
# Normalize Data
# Following the Predictive Maintenance Template, the cycle counter is used for training
# as well, so a copy of it ("cycle_norm") is scaled together with the other feature
# columns while the raw "cycle" column is left untouched.
# MinMax normalization
from sklearn import preprocessing

df_x_s_train['cycle_norm'] = df_x_s_train['cycle']

# Bookkeeping, target and label columns are excluded from scaling.
unscaled_cols = ['id', 'cycle', 'RUL', 'label1', 'label2', 'RUL_TRUTH']
cols_normalize = df_x_s_train.columns.difference(unscaled_cols)

# Fit the scaler on the training data and keep the scaled values in their own frame,
# aligned to the training frame's index.
min_max_scaler = preprocessing.MinMaxScaler()
scaled_values = min_max_scaler.fit_transform(df_x_s_train[cols_normalize])
norm_train_df = pd.DataFrame(scaled_values, index=df_x_s_train.index, columns=cols_normalize)

# Stitch the untouched and the scaled columns back together, then restore the
# training frame's original column order.
passthrough_cols = df_x_s_train.columns.difference(cols_normalize)
join_df = df_x_s_train[passthrough_cols].join(norm_train_df)
df_x_s_train = join_df.reindex(columns=df_x_s_train.columns)
df_x_s_train.head()

In [18]:
## prepare test data
# Attach cycle, RUL and engine id to the test frame, then generate the w1 / w0 label
# columns the same way as for the training data.
# NOTE(review): here "RUL" is the dataset's RUL array, whereas the training labels are
# computed from the RUL derived from the cycle counter — confirm both are equivalent.
df_x_s_test["cycle"] = df_aux_test["cycle"].values
df_x_s_test["RUL"] = df_rul_test.values
df_x_s_test["id"] = df_aux_test["unit"].values

test_rul = df_x_s_test['RUL']
df_x_s_test['label1'] = np.where(test_rul <= w1, 1, 0)
# Start from label1 and overwrite with 2 inside the tighter w0 window.
df_x_s_test['label2'] = np.where(test_rul <= w0, 2, df_x_s_test['label1'])
df_x_s_test.head()

Unnamed: 0,T24,T30,T48,T50,P15,P2,P21,P24,Ps30,P40,P50,Nf,Nc,Wf,cycle,RUL,id,label1,label2
0,620.506639,1488.080832,1886.473168,1287.156838,19.372008,14.154111,19.676307,24.547359,409.28423,415.810164,16.163146,2188.171877,8751.322228,4.890535,1.0,86,7.0,0,0
1,620.478004,1487.46721,1885.286698,1288.280169,19.363287,14.15693,19.664207,24.537576,408.996413,415.692275,16.159712,2187.423272,8743.692986,4.888862,1.0,86,7.0,0,0
2,619.526099,1487.808291,1884.016137,1287.319699,19.350864,14.150683,19.643745,24.533771,408.775613,415.401764,16.158847,2188.4683,8753.865949,4.88432,1.0,86,7.0,0,0
3,620.244129,1488.433841,1885.29697,1288.064834,19.348055,14.15077,19.653264,24.530714,408.948377,415.447424,16.155031,2186.216852,8750.814968,4.886681,1.0,86,7.0,0,0
4,619.971242,1487.979131,1885.384862,1286.838059,19.345743,14.153215,19.638651,24.540065,408.531735,415.402249,16.152368,2187.986398,8745.38765,4.884689,1.0,86,7.0,0,0


# Modeling

Traditional predictive maintenance machine learning models are based on feature engineering, that is, the manual construction of the right features using domain expertise and similar methods. This usually makes these models hard to reuse, since feature engineering is specific to the problem scenario and to the available data, which varies from one business to another. Perhaps the most attractive part of applying deep learning in the predictive maintenance domain is that these networks can automatically extract the right features from the data, eliminating the need for manual feature engineering.

The idea of using LSTMs is to let the model extract abstract features out of the sequence of sensor values in the window rather than engineering those manually. The expectation is that if there is a pattern in these sensor values within the window prior to failure, the pattern should be encoded by the LSTM.

One critical advantage of LSTMs is their ability to remember from long-term sequences (window sizes), which is hard to achieve with traditional feature engineering. For example, computing rolling averages over a window size of 50 cycles may lead to loss of information due to smoothing and abstracting of values over such a long period; using all 50 values as input instead may provide better results. While feature engineering over large window sizes may not make sense, LSTMs are able to use larger window sizes and use all the information in the window as input.
This long-term memory is also what LSTMs offer over a regular RNN architecture.

In [None]:
# List the test frame's columns (sensor columns plus the cycle, RUL, id and label
# columns attached above).
df_x_s_test.columns

In [None]:
"""
Genearate the tf dataset with proper dimensionality and shape given input parameters.
This dataset will split into training and validation subsets.
"""

# Windowing / batching parameters. Despite the description at the top of this cell, no
# tf dataset is built here; the values are only defined.
# NOTE(review): horizon, train_split, batch_size and buffer_size are presumably
# consumed by later cells — confirm.
window = 50
horizon = 1
train_split = 6377452
batch_size = 256
buffer_size = 150

# Example of the kind of data the LSTM network will be fed for each engine id:
# the sensor values of engine id 3 over the window leading up to its failure point.

# All rows of engine 3 ...
engine_id3 = df_x_s_train.loc[df_x_s_train['id'] == 3]
# ... restricted to rows whose RUL lies within `window` of that engine's smallest RUL.
final_rul = engine_id3['RUL'].min()
engine_id3_50cycleWindow = engine_id3.loc[engine_id3['RUL'] <= final_rul + window]

# One subplot per column on a shared x axis. To plot only a subset of the columns,
# select them from engine_id3_50cycleWindow before calling .plot().
ax1 = engine_id3_50cycleWindow.plot(figsize=(40, 40), subplots=True, sharex=True)

## Data Sequence Generation for Model

Keras LSTM layers expect an input in the shape of a numpy array of 3 dimensions (samples, time steps, features) where samples is the number of training sequences, time steps is the look back window or sequence length and features is the number of features of each sequence at each time step.


In [28]:
# function to reshape features into (samples, time steps, features)
def gen_sequence(id_df, seq_length, seq_cols):
    """Yield consecutive sliding windows of ``seq_length`` rows from one engine's frame.

    Only sequences that meet the window length are produced; no padding is used.
    For testing this means engines with fewer rows than the window length yield
    nothing and have to be dropped. An alternative would be to pad sequences so
    that shorter ones can be used.

    Args:
        id_df: DataFrame holding the rows of a single engine, already in time order.
        seq_length: number of rows (time steps) per window.
        seq_cols: column labels to use as features, in the order they should
            appear in each window.

    Yields:
        2-D numpy arrays of shape ``(seq_length, len(seq_cols))``. Exactly
        ``len(id_df) - seq_length`` windows are produced (window ``i`` covers rows
        ``i`` to ``i + seq_length - 1``), so the window ending on the final row is
        not emitted.
    """
    # Select only the requested feature columns (previously every column of id_df
    # was used and seq_cols was silently ignored).
    data_array = id_df[seq_cols].values
    num_elements = data_array.shape[0]
    for start in range(num_elements - seq_length):
        yield data_array[start:start + seq_length, :]

In [31]:
# generator for the sequences
# Lazily yields, for every engine id, the list of sliding windows over that engine's
# rows. Nothing is computed until seq_gen is consumed, and it can be consumed only once.
# The loop variable is named engine_id so the builtin `id` is not shadowed, and the
# window length comes from `window` (50) instead of a repeated literal.
# NOTE(review): every column of df_x_s_train (including id, RUL, RUL_TRUTH and the
# label columns) is passed as a feature column — confirm this is intended.
seq_gen = (
    list(gen_sequence(df_x_s_train[df_x_s_train['id'] == engine_id], window, df_x_s_train.columns))
    for engine_id in df_x_s_train['id'].unique()
)

In [32]:
# Show the generator object itself; no sequences have been computed at this point.
seq_gen

<generator object <genexpr> at 0x7f5580a6af50>

In [None]:
# generate sequences and convert to numpy array
# Consuming seq_gen materialises every window of every engine in memory at once
# before the result is cast to float32.
# A generator can be consumed only once: re-running this cell without first re-running
# the cell that defines seq_gen passes an empty list to np.concatenate.
seq_array = np.concatenate(list(seq_gen)).astype(np.float32)
seq_array.shape