# ADVANCE PREDICTION OF MACHINE FAILURE

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix,accuracy_score

from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Activation
from keras.callbacks import EarlyStopping

import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline  

### Loading Dataset

In [2]:
data_train=pd.read_csv('PM_train.txt',sep=' ',header=None).drop([26,27],axis=1)
col_names = ['id','cycle','setting1','setting2','setting3','s1','s2','s3','s4','s5','s6','s7','s8','s9','s10','s11','s12','s13','s14','s15','s16','s17','s18','s19','s20','s21']
data_train.columns=col_names
print('Shape of Train dataset: ',data_train.shape)
data_train.head()

Shape of Train dataset:  (20631, 26)


Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044


In [3]:
data_test=pd.read_csv('PM_test.txt',sep=' ',header=None).drop([26,27],axis=1)
data_test.columns=col_names

print('Shape of Test dataset: ',data_test.shape)
data_test.head()

Shape of Test dataset:  (13096, 26)


Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21
0,1,1,0.0023,0.0003,100.0,518.67,643.02,1585.29,1398.21,14.62,...,521.72,2388.03,8125.55,8.4052,0.03,392,2388,100.0,38.86,23.3735
1,1,2,-0.0027,-0.0003,100.0,518.67,641.71,1588.45,1395.42,14.62,...,522.16,2388.06,8139.62,8.3803,0.03,393,2388,100.0,39.02,23.3916
2,1,3,0.0003,0.0001,100.0,518.67,642.46,1586.94,1401.34,14.62,...,521.97,2388.03,8130.1,8.4441,0.03,393,2388,100.0,39.08,23.4166
3,1,4,0.0042,0.0,100.0,518.67,642.44,1584.12,1406.42,14.62,...,521.38,2388.05,8132.9,8.3917,0.03,391,2388,100.0,39.0,23.3737
4,1,5,0.0014,0.0,100.0,518.67,642.51,1587.19,1401.92,14.62,...,522.15,2388.03,8129.54,8.4031,0.03,390,2388,100.0,38.99,23.413


#### Loading Truth table

In [4]:
pm_truth_table=pd.read_csv('PM_truth.txt',sep=' ',header=None).drop([1],axis=1)
pm_truth_table.columns=['more']
pm_truth_table['id']=pm_truth_table.index+1
pm_truth_table.head()

Unnamed: 0,more,id
0,112,1
1,98,2
2,69,3
3,82,4
4,91,5


In [5]:
# generate column max for test data
rul = pd.DataFrame(data_test.groupby('id')['cycle'].max()).reset_index()
rul.columns = ['id', 'max']
rul.head()

Unnamed: 0,id,max
0,1,31
1,2,49
2,3,126
3,4,106
4,5,98


In [6]:
# run to failure
pm_truth_table['rtf']=pm_truth_table['more']+rul['max']
pm_truth_table.head()

Unnamed: 0,more,id,rtf
0,112,1,143
1,98,2,147
2,69,3,195
3,82,4,188
4,91,5,189


In [7]:
pm_truth_table.drop('more', axis=1, inplace=True)
data_test=data_test.merge(pm_truth_table,on=['id'],how='left')
data_test['ttf']=data_test['rtf'] - data_test['cycle']
data_test.drop('rtf', axis=1, inplace=True)
data_test.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s13,s14,s15,s16,s17,s18,s19,s20,s21,ttf
0,1,1,0.0023,0.0003,100.0,518.67,643.02,1585.29,1398.21,14.62,...,2388.03,8125.55,8.4052,0.03,392,2388,100.0,38.86,23.3735,142
1,1,2,-0.0027,-0.0003,100.0,518.67,641.71,1588.45,1395.42,14.62,...,2388.06,8139.62,8.3803,0.03,393,2388,100.0,39.02,23.3916,141
2,1,3,0.0003,0.0001,100.0,518.67,642.46,1586.94,1401.34,14.62,...,2388.03,8130.1,8.4441,0.03,393,2388,100.0,39.08,23.4166,140
3,1,4,0.0042,0.0,100.0,518.67,642.44,1584.12,1406.42,14.62,...,2388.05,8132.9,8.3917,0.03,391,2388,100.0,39.0,23.3737,139
4,1,5,0.0014,0.0,100.0,518.67,642.51,1587.19,1401.92,14.62,...,2388.03,8129.54,8.4031,0.03,390,2388,100.0,38.99,23.413,138


In [8]:
data_train['ttf'] = data_train.groupby(['id'])['cycle'].transform(max)-data_train['cycle']
data_train.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s13,s14,s15,s16,s17,s18,s19,s20,s21,ttf
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419,191
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236,190
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442,189
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739,188
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044,187


In [9]:
data_train_copy=data_train.copy()
data_test_copy=data_test.copy()
period=30
data_train_copy['label_bc'] = data_train_copy['ttf'].apply(lambda x: 1 if x <= period else 0)
data_test_copy['label_bc'] = data_test_copy['ttf'].apply(lambda x: 1 if x <= period else 0)
data_train_copy.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s14,s15,s16,s17,s18,s19,s20,s21,ttf,label_bc
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419,191,0
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236,190,0
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442,189,0
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739,188,0
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044,187,0


In [10]:
features_col_name=['setting1', 'setting2', 'setting3', 's1', 's2', 's3', 's4', 's5', 's6', 's7', 's8', 's9', 's10', 's11',
                   's12', 's13', 's14', 's15', 's16', 's17', 's18', 's19', 's20', 's21']
target_col_name='label_bc'

## Feature Scaling

In [11]:
sc=MinMaxScaler()
data_train_copy[features_col_name]=sc.fit_transform(data_train_copy[features_col_name])
data_test_copy[features_col_name]=sc.fit_transform(data_test_copy[features_col_name])
print(data_train_copy.head())

   id  cycle  setting1  setting2  setting3   s1        s2        s3        s4  \
0   1      1  0.459770  0.166667       0.0  0.0  0.183735  0.406802  0.309757   
1   1      2  0.609195  0.250000       0.0  0.0  0.283133  0.453019  0.352633   
2   1      3  0.252874  0.750000       0.0  0.0  0.343373  0.369523  0.370527   
3   1      4  0.540230  0.500000       0.0  0.0  0.343373  0.256159  0.331195   
4   1      5  0.390805  0.333333       0.0  0.0  0.349398  0.257467  0.404625   

    s5    ...          s14       s15  s16       s17  s18  s19       s20  \
0  0.0    ...     0.199608  0.363986  0.0  0.333333  0.0  0.0  0.713178   
1  0.0    ...     0.162813  0.411312  0.0  0.333333  0.0  0.0  0.666667   
2  0.0    ...     0.171793  0.357445  0.0  0.166667  0.0  0.0  0.627907   
3  0.0    ...     0.174889  0.166603  0.0  0.333333  0.0  0.0  0.573643   
4  0.0    ...     0.174734  0.402078  0.0  0.416667  0.0  0.0  0.589147   

        s21  ttf  label_bc  
0  0.724662  191         0  
1  0

## Function to reshape dataset as required by LSTM

In [27]:
def gen_sequence(id_df, seq_length, seq_cols):
    df_zeros=pd.DataFrame(np.zeros((seq_length-1,id_df.shape[1])),columns=id_df.columns)
    id_df=df_zeros.append(id_df,ignore_index=True)
   
   # print(id_df.shape)
    data_array = id_df[seq_cols].values
    num_elements = data_array.shape[0]
    lstm_array=[]
    for start, stop in zip(range(0, num_elements-seq_length), range(seq_length, num_elements)):
        lstm_array.append(data_array[start:stop, :])
    #print(len(lstm_array),len(lstm_array[0]),len(lstm_array[0][0]))
    return np.array(lstm_array)

# function to generate labels
def gen_label(id_df, seq_length, seq_cols,label):
    df_zeros=pd.DataFrame(np.zeros((seq_length-1,id_df.shape[1])),columns=id_df.columns)
    id_df=df_zeros.append(id_df,ignore_index=True)
    data_array = id_df[seq_cols].values
    num_elements = data_array.shape[0]
    y_label=[]
    for start, stop in zip(range(0, num_elements-seq_length), range(seq_length, num_elements)):
        y_label.append(id_df[label][stop])
    return np.array(y_label)

In [13]:
# timestamp or window size
seq_length=50
seq_cols=features_col_name

In [14]:
# generate X_train
X_train=np.concatenate(list(list(gen_sequence(data_train_copy[data_train_copy['id']==id], seq_length, seq_cols)) for id in data_train_copy['id'].unique()))
print(X_train.shape)
# generate y_train
y_train=np.concatenate(list(list(gen_label(data_train_copy[data_train_copy['id']==id], 50, seq_cols,'label_bc')) for id in data_train_copy['id'].unique()))
print(y_train.shape)

(241, 28)
191 50 24
(336, 28)
286 50 24
(228, 28)
178 50 24
(238, 28)
188 50 24
(318, 28)
268 50 24
(237, 28)
187 50 24
(308, 28)
258 50 24
(199, 28)
149 50 24
(250, 28)
200 50 24
(271, 28)
221 50 24
(289, 28)
239 50 24
(219, 28)
169 50 24
(212, 28)
162 50 24
(229, 28)
179 50 24
(256, 28)
206 50 24
(258, 28)
208 50 24
(325, 28)
275 50 24
(244, 28)
194 50 24
(207, 28)
157 50 24
(283, 28)
233 50 24
(244, 28)
194 50 24
(251, 28)
201 50 24
(217, 28)
167 50 24
(196, 28)
146 50 24
(279, 28)
229 50 24
(248, 28)
198 50 24
(205, 28)
155 50 24
(214, 28)
164 50 24
(212, 28)
162 50 24
(243, 28)
193 50 24
(283, 28)
233 50 24
(240, 28)
190 50 24
(249, 28)
199 50 24
(244, 28)
194 50 24
(230, 28)
180 50 24
(207, 28)
157 50 24
(219, 28)
169 50 24
(243, 28)
193 50 24
(177, 28)
127 50 24
(237, 28)
187 50 24
(265, 28)
215 50 24
(245, 28)
195 50 24
(256, 28)
206 50 24
(241, 28)
191 50 24
(207, 28)
157 50 24
(305, 28)
255 50 24
(263, 28)
213 50 24
(280, 28)
230 50 24
(264, 28)
214 50 24
(247, 28)
197 50 24


In [15]:
# generate X_test
X_test=np.concatenate(list(list(gen_sequence(data_test_copy[data_test_copy['id']==id], seq_length, seq_cols)) for id in data_test_copy['id'].unique()))
print(X_test.shape)
# generate y_test
y_test=np.concatenate(list(list(gen_label(data_test_copy[data_test_copy['id']==id], 50, seq_cols,'label_bc')) for id in data_test_copy['id'].unique()))
print(y_test.shape)

(80, 28)
30 50 24
(98, 28)
48 50 24
(175, 28)
125 50 24
(155, 28)
105 50 24
(147, 28)
97 50 24
(154, 28)
104 50 24
(209, 28)
159 50 24
(215, 28)
165 50 24
(104, 28)
54 50 24
(241, 28)
191 50 24
(132, 28)
82 50 24
(266, 28)
216 50 24
(244, 28)
194 50 24
(95, 28)
45 50 24
(125, 28)
75 50 24
(162, 28)
112 50 24
(214, 28)
164 50 24
(182, 28)
132 50 24
(184, 28)
134 50 24
(233, 28)
183 50 24
(197, 28)
147 50 24
(88, 28)
38 50 24
(179, 28)
129 50 24
(235, 28)
185 50 24
(97, 28)
47 50 24
(125, 28)
75 50 24
(189, 28)
139 50 24
(207, 28)
157 50 24
(220, 28)
170 50 24
(192, 28)
142 50 24
(245, 28)
195 50 24
(194, 28)
144 50 24
(99, 28)
49 50 24
(252, 28)
202 50 24
(247, 28)
197 50 24
(175, 28)
125 50 24
(170, 28)
120 50 24
(174, 28)
124 50 24
(86, 28)
36 50 24
(182, 28)
132 50 24
(172, 28)
122 50 24
(205, 28)
155 50 24
(221, 28)
171 50 24
(103, 28)
53 50 24
(201, 28)
151 50 24
(195, 28)
145 50 24
(122, 28)
72 50 24
(127, 28)
77 50 24
(352, 28)
302 50 24
(123, 28)
73 50 24
(193, 28)
143 50 24
(23

## LSTM Network

In [16]:
nb_features =X_train.shape[2]
timestamp=seq_length

model1 = Sequential()

model1.add(LSTM(
         input_shape=(timestamp, nb_features),
         units=100,
         return_sequences=True))
model1.add(Dropout(0.2))

model1.add(LSTM(
          units=50,
          return_sequences=False))
model1.add(Dropout(0.2))

model1.add(Dense(units=1, activation='sigmoid'))
model1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model1.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 50, 100)           50000     
_________________________________________________________________
dropout (Dropout)            (None, 50, 100)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                30200     
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 51        
Total params: 80,251
Trainable params: 80,251
Non-trainable params: 0
_________________________________________________________________


In [17]:
# fit the network
model1.fit(X_train, y_train, epochs=10, batch_size=200, validation_split=0.05, verbose=1,
          callbacks = [EarlyStopping(monitor='val_loss', min_delta=0, patience=0, verbose=0, mode='auto')])

Epoch 1/10
Epoch 2/10
Epoch 3/10


<tensorflow.python.keras.callbacks.History at 0x19b94a4c550>

In [18]:
# training metrics
scores = model1.evaluate(X_train, y_train, verbose=1, batch_size=200)
print('Accurracy: {}'.format(scores[1]))

Accurracy: 0.9718474745750427


In [19]:
y_pred=model1.predict_classes(X_test)
print('Accuracy of model on test data: ',accuracy_score(y_test,y_pred))
print('Confusion Matrix: \n',confusion_matrix(y_test,y_pred))

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
Accuracy of model on test data:  0.9460603262542321
Confusion Matrix: 
 [[11965   699]
 [    2   330]]


# Probability of Machine failure

In [28]:
def prob_failure(machine_id):
    machine_df=data_test_copy[data_test_copy.id==machine_id]
    machine_test=gen_sequence(machine_df,seq_length,seq_cols)
    m_pred=model1.predict(machine_test)
   # print(m_pred)
    failure_prob=list(m_pred[-1])[0]
    return failure_prob

In [31]:
machine_id=100

print('Probability that machine will fail within 30 days: ',prob_failure(machine_id))
no = prob_failure(machine_id)
if no>0.5:
    print("Maintenance ALERT")
else:
    print("Machine is alright")

Probability that machine will fail within 30 days:  0.99172217
Maintenance ALERT
