In [51]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Activation
from keras.callbacks import EarlyStopping

In [55]:
from sklearn.metrics import confusion_matrix,accuracy_score

In [2]:
####################################
# train 데이터 load
####################################
# train set load
train_dataset = pd.read_csv('./CMAPSSData/train_FD001.txt', sep=' ', encoding= 'cp949', header=None).drop([26,27], axis=1)
uniq_col = lstm_def.uniq_columns(train_dataset)  # unique_columns

# uniq columns 추출
train_dataset = train_dataset.iloc[:,uniq_col]
train_dataset.rename(columns={0:'id', 1:'cycle'}, inplace=True)
print("train_dataset.shape: ", train_dataset.shape)

train_dataset.shape:  (20631, 19)


In [3]:
train_dataset['ttf'] = train_dataset.groupby(['id'])['cycle'].transform(max) - train_dataset['cycle']
train_dataset.head()

Unnamed: 0,id,cycle,2,3,6,7,8,10,11,12,13,15,16,17,18,19,21,24,25,ttf
0,1,1,-0.0007,-0.0004,641.82,1589.7,1400.6,21.61,554.36,2388.06,9046.19,47.47,521.66,2388.02,8138.62,8.4195,392,39.06,23.419,191
1,1,2,0.0019,-0.0003,642.15,1591.82,1403.14,21.61,553.75,2388.04,9044.07,47.49,522.28,2388.07,8131.49,8.4318,392,39.0,23.4236,190
2,1,3,-0.0043,0.0003,642.35,1587.99,1404.2,21.61,554.26,2388.08,9052.94,47.27,522.42,2388.03,8133.23,8.4178,390,38.95,23.3442,189
3,1,4,0.0007,0.0,642.35,1582.79,1401.87,21.61,554.45,2388.11,9049.48,47.13,522.86,2388.08,8133.83,8.3682,392,38.88,23.3739,188
4,1,5,-0.0019,-0.0002,642.37,1582.85,1406.22,21.61,554.0,2388.06,9055.15,47.28,522.19,2388.04,8133.8,8.4294,393,38.9,23.4044,187


In [4]:
print("max: ", train_dataset['ttf'].max())
print("mean: ", train_dataset['ttf'].mean())
print("min: ", train_dataset['ttf'].min())

max:  361
mean:  107.80786195530997
min:  0


In [5]:
####################################
# test 데이터 load
####################################
# test set load
test_dataset = pd.read_csv('./CMAPSSData/test_FD001.txt', sep=' ', encoding= 'cp949', header=None).drop([26,27], axis=1)

# unique columns 추출
test_dataset = test_dataset.iloc[:, uniq_col]
test_dataset.rename(columns={0:'id', 1:'cycle'}, inplace=True)
print("test_dataset.shape: ", test_dataset.shape)

test_dataset.shape:  (13096, 19)


In [6]:
####################################
# Loading Truth table(test data)
####################################

pm_truth=pd.read_csv('./CMAPSSData/RUL_FD001.txt',sep=' ',header=None).drop([1],axis=1)
pm_truth.columns=['more']
pm_truth['id']=pm_truth.index+1
pm_truth.head()

Unnamed: 0,more,id
0,112,1
1,98,2
2,69,3
3,82,4
4,91,5


In [7]:
# generate column max for test data
rul = pd.DataFrame(test_dataset.groupby(['id'])['cycle'].max()).reset_index()
rul.columns = ['id', 'max']
rul.head()

Unnamed: 0,id,max
0,1,31
1,2,49
2,3,126
3,4,106
4,5,98


In [8]:
# run to failure
pm_truth['rtf'] = pm_truth['more'] + rul['max']
pm_truth.head()

Unnamed: 0,more,id,rtf
0,112,1,143
1,98,2,147
2,69,3,195
3,82,4,188
4,91,5,189


In [9]:
pm_truth.drop('more', axis=1, inplace=True)
test_dataset = test_dataset.merge(pm_truth, on =['id'], how='left')

In [10]:
test_dataset['ttf'] = test_dataset['rtf'] - test_dataset['cycle']
test_dataset.drop('rtf', axis=1, inplace=True)

In [11]:
print(test_dataset.shape)
test_dataset.head()

(13096, 20)


Unnamed: 0,id,cycle,2,3,6,7,8,10,11,12,13,15,16,17,18,19,21,24,25,ttf
0,1,1,0.0023,0.0003,643.02,1585.29,1398.21,21.61,553.9,2388.04,9050.17,47.2,521.72,2388.03,8125.55,8.4052,392,38.86,23.3735,142
1,1,2,-0.0027,-0.0003,641.71,1588.45,1395.42,21.61,554.85,2388.01,9054.42,47.5,522.16,2388.06,8139.62,8.3803,393,39.02,23.3916,141
2,1,3,0.0003,0.0001,642.46,1586.94,1401.34,21.61,554.11,2388.05,9056.96,47.5,521.97,2388.03,8130.1,8.4441,393,39.08,23.4166,140
3,1,4,0.0042,0.0,642.44,1584.12,1406.42,21.61,554.07,2388.03,9045.29,47.28,521.38,2388.05,8132.9,8.3917,391,39.0,23.3737,139
4,1,5,0.0014,0.0,642.51,1587.19,1401.92,21.61,554.16,2388.01,9044.55,47.31,522.15,2388.03,8129.54,8.4031,390,38.99,23.413,138


In [12]:
print("max: ", test_dataset['ttf'].max())
print("mean: ", test_dataset['ttf'].mean())
print("min: ", test_dataset['ttf'].min())

max:  340
mean:  141.23846976175932
min:  7


In [13]:
df_train = train_dataset.copy()
df_test = test_dataset.copy()
period = 30
df_train['label_bc'] = df_train['ttf'].apply(lambda x: 1 if x <= period else 0)
df_test['label_bc'] = df_test['ttf'].apply(lambda x: 1 if x <= period else 0)
df_train.head()

Unnamed: 0,id,cycle,2,3,6,7,8,10,11,12,...,15,16,17,18,19,21,24,25,ttf,label_bc
0,1,1,-0.0007,-0.0004,641.82,1589.7,1400.6,21.61,554.36,2388.06,...,47.47,521.66,2388.02,8138.62,8.4195,392,39.06,23.419,191,0
1,1,2,0.0019,-0.0003,642.15,1591.82,1403.14,21.61,553.75,2388.04,...,47.49,522.28,2388.07,8131.49,8.4318,392,39.0,23.4236,190,0
2,1,3,-0.0043,0.0003,642.35,1587.99,1404.2,21.61,554.26,2388.08,...,47.27,522.42,2388.03,8133.23,8.4178,390,38.95,23.3442,189,0
3,1,4,0.0007,0.0,642.35,1582.79,1401.87,21.61,554.45,2388.11,...,47.13,522.86,2388.08,8133.83,8.3682,392,38.88,23.3739,188,0
4,1,5,-0.0019,-0.0002,642.37,1582.85,1406.22,21.61,554.0,2388.06,...,47.28,522.19,2388.04,8133.8,8.4294,393,38.9,23.4044,187,0


In [14]:
# feature list
uniq_col.remove(0)
uniq_col.remove(1)

In [15]:
# scaling
sc = StandardScaler()
df_train[uniq_col] = sc.fit_transform(df_train[uniq_col])
df_test[uniq_col] = sc.fit_transform(df_test[uniq_col])

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [22]:
df_train

Unnamed: 0,id,cycle,2,3,6,7,8,10,11,12,...,15,16,17,18,19,21,24,25,ttf,label_bc
0,1,1,-0.315980,-1.372953,-1.721725,-0.134255,-0.925936,0.141683,1.121141,-0.516338,...,-0.266467,0.334262,-1.058890,-0.269071,-0.603816,-0.781710,1.348493,1.194427,191,0
1,1,2,0.872722,-1.031720,-1.061780,0.211528,-0.643726,0.141683,0.431930,-0.798093,...,-0.191583,1.174899,-0.363646,-0.642845,-0.275852,-0.781710,1.016528,1.236922,190,0
2,1,3,-1.961874,1.015677,-0.661813,-0.413166,-0.525953,0.141683,1.008155,-0.234584,...,-1.015303,1.364721,-0.919841,-0.551629,-0.649144,-2.073094,0.739891,0.503423,189,0
3,1,4,0.324090,-0.008022,-0.661813,-1.261314,-0.784831,0.141683,1.222827,0.188048,...,-1.539489,1.961302,-0.224597,-0.520176,-1.971665,-0.781710,0.352598,0.777792,188,0
4,1,5,-0.864611,-0.690488,-0.621816,-1.251528,-0.301518,0.141683,0.714393,-0.516338,...,-0.977861,1.052871,-0.780793,-0.521748,-0.339845,-0.136018,0.463253,1.059552,187,0
5,1,6,-1.961874,-0.349255,-1.161771,-0.987297,-1.173703,0.141683,1.471395,-1.079848,...,-1.427163,0.361379,-0.919841,-0.571550,-0.835790,-1.427402,0.905873,0.713126,186,0
6,1,7,0.461248,0.333211,-0.401834,0.293081,-1.240367,0.141683,1.098544,-1.079848,...,-0.678327,1.229134,-0.919841,-0.599334,-1.193084,-0.781710,1.569803,0.810125,185,0
7,1,8,-1.550401,1.015677,-0.241847,-1.233586,-0.884827,0.141683,0.544915,-1.361602,...,-1.127629,1.432514,-0.919841,-0.664862,-0.921114,-1.427402,0.850546,0.193025,184,0
8,1,9,0.369810,0.333211,-1.121775,0.074520,-1.570353,0.141683,0.364139,-0.657216,...,-0.940420,0.510525,-0.641744,-0.946896,-1.849012,-0.781710,1.293165,1.079876,183,0
9,1,10,-1.504681,0.333211,-1.941707,0.116927,-0.941491,0.141683,0.251154,-0.657216,...,-1.913907,0.510525,-0.502695,-0.753457,-0.361176,-0.136018,0.739891,1.660023,182,0


In [31]:
def gen_sequence(id_df, seq_length, seq_cols):
    df_zeros = pd.DataFrame(np.zeros((seq_length-1, id_df.shape[1])), columns=id_df.columns)
    id_df = df_zeros.append(id_df, ignore_index=True)
    data_array = id_df[seq_cols].values
    num_elements = data_array.shape[0]
    lstm_array = []
    for start, stop in zip(range(0, num_elements - seq_length), range(seq_length, num_elements)):
        lstm_array.append(data_array[start:stop,:])
    return np.array(lstm_array)

In [32]:
# function to generate labels
def gen_label(id_df, seq_length, seq_cols, label):
    df_zeros = pd.DataFrame(np.zeros((seq_length-1, id_df.shape[1])), columns=id_df.columns)
    id_df = df_zeros.append(id_df, ignore_index=True)
    data_array = id_df[seq_cols].values
    num_elements = data_array.shape[0]
    y_label = []
    for start, stop in zip(range(0, num_elements - seq_length), range(seq_length, num_elements)):
        y_label.append(id_df[label][stop])
    return np.array(y_label)

In [16]:
# timestamp or window size
seq_length = 50
seq_cols = uniq_col

In [34]:
# generate X_train
X_train = np.concatenate(list(list(gen_sequence(df_train[df_train['id']==id], seq_length, seq_cols)) for id in df_train['id'].unique()))
print(X_train.shape)

# generate y_train
y_train = np.concatenate(list(list(gen_label(df_train[df_train['id']==id], seq_length, seq_cols, 'label_bc')) for id in df_train['id'].unique()))
print(y_train.shape)

(20531, 50, 17)


In [46]:
# generate X_test
X_test = np.concatenate(list(list(gen_sequence(df_test[df_test['id']==id], seq_length, seq_cols)) for id in df_test['id'].unique()))
print(X_test.shape)

# generate Y_test
y_test = np.concatenate(list(list(gen_label(df_test[df_test['id']==id], seq_length, seq_cols, 'label_bc')) for id in df_test['id'].unique()))
print(y_test.shape)

(12996, 50, 17)
(12996,)


In [48]:
####################################
# LSMT Model
####################################
nb_features = X_train.shape[2]
timestamp = seq_length

model = Sequential()
model.add(LSTM(input_shape = (timestamp, nb_features), units=100, return_sequences=True))
model.add(Dropout(0.2))

model.add(LSTM(units=50, return_sequences=False))
model.add(Dropout(0.2))

model.add(Dense(units=1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 50, 100)           47200     
_________________________________________________________________
dropout_1 (Dropout)          (None, 50, 100)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 50)                30200     
_________________________________________________________________
dropout_2 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 51        
Total params: 77,451
Trainable params: 77,451
Non-trainable params: 0
_________________________________________________________________


In [53]:
# fit the network
model.fit(X_train, y_train, epochs=10, batch_size=512, validation_split=0.05, verbose=1, callbacks= [EarlyStopping(monitor='val_loss', min_delta=0, patience=0, verbose=0, mode='auto')])

Train on 19504 samples, validate on 1027 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10


<keras.callbacks.History at 0x1ad11839710>

In [54]:
# training metrics
scores =model.evaluate(X_train, y_train, verbose=1, batch_size=512)
print('Accuracy: {}'.format(scores[1]))

Accuracy: 0.9789099410879566


In [56]:
y_pred = model.predict_classes(X_test)
print('Accuracy of model on test data: ', accuracy_score(y_test, y_pred))
print('Confusion Matrix: \n', confusion_matrix(y_test, y_pred))

Accuracy of model on test data:  0.9275161588180979
Confusion Matrix: 
 [[11722   942]
 [    0   332]]


In [75]:
####################################
# Probability of Machine failure
####################################

def prob_failure(machine_id):
    machine_df = df_test[df_test.id == machine_id]
    machine_test = gen_sequence(machine_df, seq_length, seq_cols)
    m_pred = model.predict(machine_test)
    failure_prob = list(m_pred[-1]*100)[0]
    return failure_prob

In [76]:
machine_id = 13
print('Probability that machine {} will fail within 30 timesteps : ', prob_failure(machine_id))

Probability that machine {} will fail within 30 timesteps :  0.2877899
