# Data Loading and visualization

In [None]:
# Path of the dataset CSV; it is read into `df_data` with pd.read_csv further down.
CSV_path = 'ai4i2020.csv'

## Pre-Processing

In [None]:
def plot_data_bar(Y_train, Y_test):
  """Draw side-by-side bar charts of the class counts in the train and test labels.

  Args:
    Y_train: pandas Series of binary training labels (0 is plotted as
      'Non-Failure', 1 as 'Failure').
    Y_test: pandas Series of binary test labels, same encoding.

  Returns:
    The `matplotlib.pyplot` module, so callers can chain e.g. `.show()`.
  """
  import matplotlib.pyplot as plt

  # Bar label -> colour; the insertion order corresponds to class ids 0 and 1.
  data = {'Non-Failure':'maroon', 'Failure':'red'}
  labels = list(data.keys())
  colors = list(data.values())
  handles = [plt.Rectangle((0,0),1,1, color=data[label]) for label in labels]

  # value_counts() is ordered by frequency, not by label, so look the counts up
  # by class id; a class that is absent is reported as 0 instead of breaking
  # the label/value pairing.
  class_ids = [0, 1]
  values_train = Y_train.value_counts().reindex(class_ids, fill_value=0).tolist()
  values_test = Y_test.value_counts().reindex(class_ids, fill_value=0).tolist()

  # NOTE: this changes the default figure size for every later figure as well.
  plt.rcParams['figure.figsize'] = 8, 8

  # One panel per split: training counts on the left, test counts on the right.
  fig, (ax1, ax2) = plt.subplots(1, 2)

  ax1.bar(labels, values_train, color = colors, width = 0.9)
  ax1.set_xlabel("Training data Machine Status")
  ax1.set_ylabel("No. of training samples")
  ax1.set_title("Training Data")
  # The legend entries are the raw counts, coloured like their bars.
  ax1.legend(handles, values_train, loc='best')

  ax2.bar(labels, values_test, color = colors, width = 0.9)
  ax2.set_xlabel("Test data Machine Status")
  ax2.set_ylabel("No. of test samples")
  ax2.set_title("Test Data")
  ax2.legend(handles, values_test, loc='best')

  plt.tight_layout()
  return plt

In [None]:
import pandas as pd
import numpy as np

# Load the raw dataset from the CSV path configured above.
df_data = pd.read_csv(CSV_path)
# Preview: a seeded random sample of 0.01% of the rows (drawn with replacement).
df_data.sample(frac=0.0001, replace=True, random_state=1)

In [None]:
# Number of distinct values in each column.
df_data.nunique()

In [None]:
# Keep everything except the first two columns and the last column.
# NOTE(review): `df_data` is overwritten in place, so re-running this cell
# slices the already-sliced frame again; re-run the load cell first.
df_data = df_data.iloc[:, 2:-1]
# Seeded 0.01% preview of the reduced frame.
df_data.sample(frac=0.0001, replace=True, random_state=1)

In [None]:
# Show the fifth-from-last column as a one-column frame. The same slice is
# appended as the last column of the scaled frame later on — presumably
# 'Machine failure'; verify against the CSV header.
df_data.iloc[:, -5:-4]

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

df_data = df_data.reset_index(drop=True)

# One-hot encode the machine 'Type' column.
# dtype=float keeps the dummies numeric: newer pandas versions return bool
# dummies by default, and a frame mixing bool and float columns converts to an
# object array with .to_numpy(), which the models further down cannot consume.
type_cols_scaled = pd.get_dummies(df_data['Type'], dtype=float)
# get_dummies emits one column per category in sorted order; they are relabelled
# by position. Assumes exactly three categories that sort as High, Low, Medium
# — TODO confirm against the data.
type_cols_scaled.columns = ['High', 'Low', 'Medium']

# Standardise the numeric feature columns (everything between 'Type' and the
# last five columns).
# NOTE(review): the scaler is fitted on the whole dataset, before any
# train/test split, so test-set statistics influence the scaling — confirm
# that this is acceptable.
scaler = StandardScaler()
num_cols = df_data.iloc[:, 1:-5]
nums_cols = scaler.fit_transform(num_cols)
# Rebuild a frame with the original column names and row index so the
# column-wise concat below lines up row by row.
num_cols_scaled = pd.DataFrame(nums_cols, columns=num_cols.columns, index=num_cols.index)

# Final layout: one-hot type columns, scaled numeric columns, then the
# fifth-from-last original column as the last (target) column.
df_data_scaled = pd.concat([type_cols_scaled, num_cols_scaled, df_data.iloc[:, -5:-4]], axis = 1)

df_data_scaled.sample(frac=0.0001, replace=True, random_state=1)

In [None]:
import numpy as np
import matplotlib.pyplot as plt 


# Extract the model inputs (every column but the last) and the target column.
x_all = df_data_scaled.iloc[:,:-1]
y_all = df_data_scaled['Machine failure']

# Bar label -> colour; the insertion order corresponds to class ids 0 and 1.
data = {'Non-Failure':'maroon', 'Failure':'red'}
labels = list(data.keys())
handles = [plt.Rectangle((0,0),1,1, color=data[label]) for label in labels]
# value_counts() is ordered by frequency, not by label; look the counts up by
# class id so every bar is guaranteed to sit under its own label.
values = y_all.value_counts().reindex([0, 1], fill_value=0).tolist()

fig = plt.figure(figsize = (5, 8))

# One bar per class; the legend entries are the raw counts.
plt.bar(labels, values, color = list(data.values()), width = 0.9)

plt.xlabel("Machine Status")
plt.ylabel("No. of samples")
plt.legend(handles, values)
plt.title("Total data present")
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt 


# Extract the model inputs (every column but the last) and the target column.
x_all = df_data_scaled.iloc[:,:-1]
y_all = df_data_scaled['Machine failure']

# Bar label -> colour; the insertion order corresponds to class ids 0 and 1.
data = {'Non-Failure':'maroon', 'Failure':'red'}
labels = list(data.keys())
handles = [plt.Rectangle((0,0),1,1, color=data[label]) for label in labels]
# Counts looked up by class id so every bar sits under its own label.
values = y_all.value_counts().reindex([0, 1], fill_value=0).tolist()

# NOTE: these rcParams are global and also affect every later figure.
plt.rcParams["figure.figsize"] = [7.50, 7.50]
plt.rcParams["figure.autolayout"] = True

fig, ax = plt.subplots()

pps = ax.bar(labels, values, color = list(data.values()), width = 0.9)

# Annotate each bar with its share of all samples. The share is computed from
# the actual total rather than by dividing the count by 100, which is only a
# percentage when the dataset has exactly 10000 rows.
total = sum(values)
for p in pps:
   height = p.get_height()
   share = 100 * height / total if total else 0.0
   ax.annotate("{:.2f}%".format(share),(p.get_x() + p.get_width()/2, height+.05),ha="center",va="bottom",fontsize=15)

# The percentages carry the information, so the count axis is hidden.
ax.yaxis.set_visible(False)
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Split into 80% training and 20% test data. The split is stratified on the
# target so both parts keep the same class ratio, and seeded for repeatability.
X_train, X_test, Y_train, Y_test = train_test_split(x_all,y_all, train_size=0.8, random_state=15, stratify=y_all)

In [None]:
# Class counts in the test split.
Y_test.value_counts()

In [None]:
# Trailing semicolon: plot_data_bar returns the pyplot module, whose repr would
# otherwise be echoed as the cell output below the figure.
plot_data_bar(Y_train, Y_test);

## Visualizing the imbalanced data

In [None]:
# Split the scaled dataset by target value.
df_class_0 = df_data_scaled[df_data_scaled['Machine failure'] == 0]
df_class_1 = df_data_scaled[df_data_scaled['Machine failure'] == 1]

# Count each class from its own frame. Unpacking y_all.value_counts() instead
# would assign by frequency order, which only matches the labels while class 0
# is the larger class.
count_class_0, count_class_1 = len(df_class_0), len(df_class_1)

In [None]:
# Natural log of the class-1 to class-0 count ratio, as a one-element array.
# NOTE(review): not referenced anywhere else in the visible notebook.
initial_bias = np.log([count_class_1/count_class_0])

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# One hex-bin joint plot per class of process temperature against rotational
# speed, both restricted to [-5, 5] on the scaled axes.
for class_frame, figure_title in ((df_class_1, "Positive distribution"),
                                  (df_class_0, "Negative distribution")):
    sns.jointplot(x=class_frame['Process temperature [K]'],
                  y=class_frame['Rotational speed [rpm]'],
                  kind='hex', xlim=(-5,5), ylim=(-5,5))
    # jointplot leaves its figure current, so the title lands on that figure.
    plt.suptitle(figure_title)

## Under Sampling

In [None]:
from sklearn.model_selection import train_test_split

# Fresh stratified 80/20 split (seed 13) as the starting point for undersampling.
X_train, X_test, Y_train, Y_test = train_test_split(x_all,y_all, train_size=0.8, random_state=13, stratify=y_all)

# Check the class balance of the training split.
# value_counts() is sorted by frequency (descending), so count_class_0 is the
# count of the larger class — presumably label 0; verify.

count_class_0, count_class_1 = Y_train.value_counts()

In [None]:
# Undersample class 0 so the training data is balanced.

train_data = pd.concat([X_train, Y_train], axis=1)

df_class_0 = train_data[train_data['Machine failure'] == 0]
df_class_1 = train_data[train_data['Machine failure'] == 1]

# Draw as many class-0 rows as there are class-1 rows. The size comes from the
# frame itself rather than from a counter set in another cell, and the draw is
# seeded (same seed as the split) so a re-run selects the same rows.
df_class_0 = df_class_0.sample(n=len(df_class_1), random_state=13)

print(df_class_0.shape)
print(df_class_1.shape)

# Balanced training frame: sampled class-0 rows followed by all class-1 rows.
df_data_under = pd.concat([df_class_0, df_class_1])

print(df_data_under.shape)

df_data_under.sample(frac=0.01, replace=True, random_state=1)

In [None]:
# Split the undersampled frame back into target (the 'Machine failure' column)
# and features (every column but the last).
Y_train = df_data_under['Machine failure']
X_train = df_data_under.iloc[:, :-1]

# Report both shapes and the resulting class balance.
for summary in (X_train.shape, Y_train.shape, Y_train.value_counts()):
    print(summary)

In [None]:
# Trailing semicolon: plot_data_bar returns the pyplot module, whose repr would
# otherwise be echoed as the cell output below the figure.
plot_data_bar(Y_train, Y_test);

## Over Sampling

In [None]:
from sklearn.model_selection import train_test_split

# Fresh stratified 80/20 split (seed 13) as the starting point for oversampling.
X_train, X_test, Y_train, Y_test = train_test_split(x_all,y_all, train_size=0.8, random_state=13, stratify=y_all)

# Check the class balance of the training split.
# value_counts() is sorted by frequency (descending), so count_class_0 is the
# count of the larger class — presumably label 0; verify.

count_class_0, count_class_1 = Y_train.value_counts()
# Display both counts.
count_class_0, count_class_1

In [None]:
# Oversample class 1 so the training data is balanced.

train_data = pd.concat([X_train, Y_train], axis=1)

df_class_0 = train_data[train_data['Machine failure'] == 0]
df_class_1 = train_data[train_data['Machine failure'] == 1]

# Resample class 1 with replacement up to the size of class 0. The size comes
# from the frame itself rather than from a counter set in another cell, and the
# draw is seeded (same seed as the split) so a re-run selects the same rows.
df_class_1 = df_class_1.sample(n=len(df_class_0), replace=True, random_state=13)

print(df_class_0.shape)
print(df_class_1.shape)

# Balanced training frame: all class-0 rows followed by the resampled class-1 rows.
df_data_over = pd.concat([df_class_0, df_class_1])

print(df_data_over.shape)

In [None]:
# Split the oversampled frame back into target (the 'Machine failure' column)
# and features (every column but the last).
Y_train = df_data_over['Machine failure']
X_train = df_data_over.iloc[:, :-1]

# Report both shapes and the resulting class balance.
for summary in (X_train.shape, Y_train.shape, Y_train.value_counts()):
    print(summary)

In [None]:
# Trailing semicolon: plot_data_bar returns the pyplot module, whose repr would
# otherwise be echoed as the cell output below the figure.
plot_data_bar(Y_train, Y_test);

## GAN

### GAN Model Definition

In [None]:
from tensorflow.keras.datasets import mnist
import visualkeras
from collections import defaultdict
from PIL import ImageFont
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.layers import Dense, Input, Flatten, Conv2D, MaxPool2D, LeakyReLU, Reshape, Dropout
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import utils

def discriminator_dense():
    """Build and compile the GAN discriminator.

    A dense network that maps an 8-feature sample to one sigmoid output
    (the training loop below labels real rows 1 and generated rows 0).

    Returns:
        A compiled Keras `Model` (Adam optimizer, binary cross-entropy loss).
    """
    # The shape is given as a tuple; a bare int is not accepted by every
    # Keras version.
    inp = Input(shape=(8,))
    x = Dense(1024, activation=LeakyReLU(alpha=0.2))(inp)
    x = Dropout(0.4)(x)
    x = Dense(512, activation=LeakyReLU(alpha=0.2))(x)
    x = Dropout(0.4)(x)
    x = Dense(512, activation=LeakyReLU(alpha=0.2))(x)

    op = Dense(1, activation="sigmoid")(x)

    model = Model(inp, op)
    model.compile(
    optimizer = Adam(learning_rate=0.0002, beta_1=0.5),
    loss="binary_crossentropy")

    return model
  
# Instantiate the discriminator and print its layer summary.
discrim  = discriminator_dense()
discrim .summary()

# Write an architecture diagram (with tensor shapes) to model_discriminator.png.
utils.plot_model(discrim , show_shapes=True, to_file="model_discriminator.png")

In [None]:
def generator(n):
    """Build the GAN generator.

    Args:
        n: Length of the noise vector fed to the generator.

    Returns:
        An uncompiled Keras `Model` that maps an n-dimensional noise vector to
        8 outputs; the final tanh activation bounds every output to [-1, 1].
    """
    # `(n)` is just the int n; the trailing comma makes it a 1-tuple shape.
    inp = Input(shape=(n,))

    x = Dense(256, activation=LeakyReLU(alpha=0.2))(inp)
    x = Dense(512, activation=LeakyReLU(alpha=0.2))(x)
    x = Dense(1024, activation=LeakyReLU(alpha=0.2))(x)
    x = Dense(256, activation=LeakyReLU(alpha=0.2))(x)
    x = Dense(64, activation=LeakyReLU(alpha=0.2))(x)
    op = Dense(8, activation='tanh')(x)

    return Model(inp, op)

# Instantiate the generator for 100-dimensional noise and print its summary.
gener = generator(100)
gener.summary()

# Write an architecture diagram (with tensor shapes) to model_generator.png.
utils.plot_model(gener, show_shapes=True, to_file="model_generator.png")

In [None]:
def gan(discrim, gener):
    """Stack generator and discriminator into the model used to train the generator.

    Args:
        discrim: The discriminator model. Its `trainable` flag is set to False
            here (a side effect on the object passed in), so training the
            stacked model does not update the discriminator weights.
        gener: The generator model.

    Returns:
        A compiled `Sequential` model (generator followed by discriminator)
        with Adam optimizer and binary cross-entropy loss.
    """
    discrim.trainable = False

    model = Sequential()

    model.add(gener)
    model.add(discrim)

    # `learning_rate` is the current keyword; `lr` is a deprecated alias that
    # newer Keras versions reject. The discriminator is compiled the same way.
    model.compile(
    optimizer=Adam(learning_rate=0.0002, beta_1=0.5),
    loss="binary_crossentropy")

    return model

# Build the stacked generator + discriminator model and print its summary.
gan_model = gan(discrim, gener)
gan_model.summary()

# Write the diagram to its own file; reusing "model_discriminator.png" would
# overwrite the discriminator diagram saved earlier.
utils.plot_model(gan_model, show_shapes=True, to_file="model_gan.png")

### GAN Model Training

In [None]:
# Split the full scaled dataset by target value.
# NOTE(review): these frames come from the whole dataset, not from a training
# split; the GAN trained on df_class_1 below therefore also sees rows that a
# later train/test split may put into the test set — confirm this is intended.
df_class_0 = df_data_scaled[df_data_scaled['Machine failure'] == 0]
df_class_1 = df_data_scaled[df_data_scaled['Machine failure'] == 1]

In [None]:
from tqdm import tqdm

# Real class-1 feature rows (target column dropped) as a float array.
# The explicit dtype guards against an object array when the frame mixes
# bool/int and float columns.
df_class1 = df_class_1.iloc[:,:-1].to_numpy(dtype='float64')

# GAN training configuration.
epochs = 3000                  # passes over the real data
batch_size = 32                # generator batch size
half_batch = batch_size//2     # real and fake share of a discriminator batch
n = 100                        # noise vector length (matches generator(100))
losses = []                    # [discriminator loss, generator loss] per step
data_len = len(df_class1)

In [None]:
import time
start = time.time()
# Defined up front so the per-epoch report still works when an epoch has no
# batches (data_len < batch_size).
dloss = gloss = None
for i in range(epochs):
    print("Epoch: ", i)
    for j in tqdm(range(data_len//batch_size)):

        # Half batch of real rows (drawn with replacement), labelled 1.
        real_idx = np.random.randint(0, data_len, half_batch)
        xreal, yreal = df_class1[real_idx].reshape(half_batch, 8), np.ones((half_batch, 1))

        # Half batch of generated rows, labelled 0. verbose=0 stops predict()
        # from printing its own progress line on every step.
        xfake = gener.predict(np.random.randn(half_batch, n), verbose=0)
        yfake = np.zeros((half_batch, 1))

        xfinal, yfinal = np.vstack((xreal, xfake)), np.vstack((yreal, yfake))

        # Discriminator step on the mixed batch.
        dloss = discrim.train_on_batch(xfinal, yfinal)

        # Generator step through the stacked model: fresh noise with target 1,
        # i.e. the generator is trained to make the discriminator output 1.
        gloss = gan_model.train_on_batch(np.random.randn(batch_size, n), np.ones((batch_size, 1)))

        losses.append([dloss, gloss])

    print("losess --> ", dloss, "  ", gloss)
end = time.time()
print('Time taken: ', end-start)

In [None]:
# Persist the stacked GAN, the generator and the discriminator to .h5 files in
# the working directory.
gan_model.save('my_gan_model.h5')

gener.save('my_gan_generator_model.h5')

discrim.save('my_gan_discriminator_model.h5')

### GAN Model application

In [None]:
import tensorflow as tf

# Reload the three saved models, replacing the in-memory `gan_model`, `gener`
# and `discrim` objects.

gan_model = tf.keras.models.load_model('my_gan_model.h5')
gener = tf.keras.models.load_model('my_gan_generator_model.h5')
discrim = tf.keras.models.load_model('my_gan_discriminator_model.h5')

In [None]:
from sklearn.model_selection import train_test_split

# Fresh stratified 80/20 split (seed 13) as the starting point for GAN augmentation.
X_train, X_test, Y_train, Y_test = train_test_split(x_all,y_all, train_size=0.8, random_state=13, stratify=y_all)

In [None]:
# Recombine features and target so the training rows can be split by class.
train_data = pd.concat([X_train, Y_train], axis=1)

df_class_0 = train_data[train_data['Machine failure'] == 0]
df_class_1 = train_data[train_data['Machine failure'] == 1]

# Number of synthetic class-1 rows needed to bring class 1 up to half the size
# of class 0 (integer division).
fake_data_len = len(df_class_0)//2 - len(df_class_1)

In [None]:
from tqdm import tqdm

x_fake = []
y_fake = []
batch_size = 16
n = 100
# Generate synthetic rows in whole batches only, so up to batch_size - 1 fewer
# rows than fake_data_len are produced. Every synthetic row is labelled 1.
for i in tqdm(range(fake_data_len//batch_size)):
  # verbose=0 stops predict() from printing its own progress line per batch.
  x_fake.append(gener.predict(np.random.randn(batch_size, n), verbose=0))
  y_fake.append(np.ones((batch_size, 1)))

# Stack the batches into (rows, 8) features and (rows, 1) labels. When no batch
# was generated, fall back to empty arrays of the right width instead of
# failing on an empty list.
x_fake_new = np.vstack(x_fake) if x_fake else np.empty((0, 8))
y_fake_new = np.vstack(y_fake) if y_fake else np.empty((0, 1))

In [None]:
# Wrap the synthetic rows in pandas objects shaped like the real training data.
x_gan_new = pd.DataFrame(x_fake_new, columns=X_train.columns)

# The labels must be a Series with the same name (and dtype) as Y_train.
# Concatenating a named Series with a DataFrame whose column is called 0 gives
# a two-column frame full of NaN instead of one longer label vector.
y_gan_new = pd.Series(y_fake_new.ravel(), name=Y_train.name).astype(Y_train.dtype)

# Append the synthetic rows to the real ones. Both concats reset the index so
# features and labels stay aligned row by row.
# NOTE: re-running this cell appends the synthetic rows again; re-run the
# split cell first.
X_train = pd.concat([X_train, x_gan_new], axis = 0, ignore_index=True)
Y_train = pd.concat([Y_train, y_gan_new], axis = 0, ignore_index=True)

In [None]:
# Trailing semicolon: plot_data_bar returns the pyplot module, whose repr would
# otherwise be echoed as the cell output below the figure.
plot_data_bar(Y_train, Y_test);

## SMOTE

In [None]:
from sklearn.model_selection import train_test_split

# Fresh stratified 80/20 split (seed 13) as the starting point for SMOTE.
X_train, X_test, Y_train, Y_test = train_test_split(x_all,y_all, train_size=0.8, random_state=13, stratify=y_all)

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample only the minority class with SMOTE. The sampler is seeded (same
# seed as the split) so the synthetic rows are the same on every run.
smote = SMOTE(sampling_strategy='minority', random_state=13)
X_train, Y_train = smote.fit_resample(X_train, Y_train)

# Class balance after resampling.
Y_train.value_counts()

In [None]:
# Trailing semicolon: plot_data_bar returns the pyplot module, whose repr would
# otherwise be echoed as the cell output below the figure.
plot_data_bar(Y_train, Y_test);

# Linear SVM

## SVM Model

In [None]:
from sklearn.svm import LinearSVC
from sklearn.preprocessing import LabelEncoder
import time


# Fit a linear SVM with default settings on the current training data and
# report the wall-clock training time.
model = LinearSVC()
print(f'Training Support Vector Machine with {len(X_train)} data')
start = time.time()
model.fit(X_train,Y_train)
end = time.time()
print("Training Over in: ", end-start)

## Linear SVM Model Statistical Analysis

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

def plot_confusion_matrix(testLabels,testPred,title,filename='Confusion_Matrix'):
    """Plot, save and show a 2x2 confusion-matrix heatmap.

    Args:
        testLabels: True binary labels (0 = Non-Failure, 1 = Failure).
        testPred: Predicted binary labels, same encoding.
        title: Title drawn above the heatmap.
        filename: Path passed to `plt.savefig`. Defaults to 'Confusion_Matrix';
            pass a different name per model to avoid overwriting earlier plots.
    """
    class_names = ['Non-Failure','Failure']
    # Fixing the label set keeps the matrix 2x2 even when one class is missing
    # from both inputs; otherwise building the labelled frame below fails.
    cm = confusion_matrix(testLabels,testPred,labels=[0, 1])
    cm_df = pd.DataFrame(cm,
                         index = class_names,
                         columns = class_names)
    # Rows are actual classes, columns are predicted classes.
    sns.heatmap(cm_df, annot=True, fmt="d", cmap = "Reds")
    plt.title(title)
    plt.ylabel('Actual Values')
    plt.xlabel('Predicted Values')
    plt.savefig(filename)
    plt.show()

In [None]:
from sklearn.metrics import recall_score, f1_score, accuracy_score, precision_score

# Predict on the held-out test split with the fitted SVM.
Y_testpred = model.predict(X_test)

print("Classification Report for Linear SVM model\n\n")
print(classification_report(Y_test,Y_testpred))

# Headline scores, computed in the order recall, precision, F1.
recall, precision, f1 = (
    score(Y_test, Y_testpred)
    for score in (recall_score, precision_score, f1_score)
)

print('recall:', recall, '\nprecision:', precision, '\nf1:',f1)

plot_confusion_matrix(Y_test,Y_testpred,"Confusion Matrix for SVM model")


# Multi Layer Perceptron (MLP)

## MLP Model

In [None]:
from keras.datasets import mnist
import visualkeras
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.layers import Dense, Input, Flatten, Conv2D, MaxPool2D, LeakyReLU, Reshape, Dropout
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import utils



def mlp_arch():
  """Build and compile the MLP binary classifier.

  Architecture: 8 inputs -> Dense 64 / 32 / 16 (ReLU, each followed by 50%
  dropout) -> one sigmoid output.

  Side effect: switches the global Keras float type to float64.

  Returns:
    A compiled Keras `Model` (Adam, binary cross-entropy) that tracks accuracy,
    recall and the false-positive count. The metric names are fixed, so the
    history and log keys are always 'recall' and 'false_positives' (plus their
    'val_' variants).
  """

  # set the precision for the network (applies to everything created afterwards)
  tf.keras.backend.set_floatx('float64')

  # Explicit names: auto-generated metric names receive a numeric suffix that
  # depends on how many metric objects were created earlier in the session,
  # which makes the resulting history keys unpredictable.
  recall = tf.keras.metrics.Recall(name='recall')
  fp = tf.keras.metrics.FalsePositives(name='false_positives')

  inp = Input(shape=(8,))
  x = Dense(64, activation='relu')(inp)
  x = Dropout(0.5)(x)
  x = Dense(32, activation='relu')(x)
  x = Dropout(0.5)(x)
  x = Dense(16, activation='relu')(x)
  x = Dropout(0.5)(x)

  op = Dense(1, activation="sigmoid")(x)

  model = Model(inp, op)
  # `learning_rate` is the current keyword; `lr` is a deprecated alias that
  # newer Keras versions reject.
  model.compile(
      optimizer = Adam(learning_rate=0.0002, beta_1=0.5),
      loss="binary_crossentropy",
      metrics=['accuracy', recall, fp])

  return model

# Instantiate the MLP and print its layer summary.
mlp_model = mlp_arch()
mlp_model.summary()

# Write an architecture diagram (with tensor shapes) to mlp_model.png.
utils.plot_model(mlp_model, show_shapes=True, to_file="mlp_model.png")

## MLP Model Training

In [None]:
# Label vectors as column arrays of shape (n_samples, 1) for the MLP.
Y_train_MLP = Y_train.to_numpy().reshape(-1, 1)
Y_test_MLP = Y_test.to_numpy().reshape(-1, 1)

# (features, labels) pairs for the training and the held-out data.
train_data = (X_train, Y_train_MLP)
valid_data = (X_test, Y_test_MLP)

In [None]:
import time
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tqdm import tqdm

# Early-stopping criterion: stop once validation recall has not improved for
# 1500 epochs. The monitored key is 'val_' plus the recall metric's name, which
# is 'recall' when the model is built once in a fresh kernel; a hard-coded
# suffix such as 'val_recall_4' only exists after several earlier builds.
es = EarlyStopping(monitor='val_recall', mode='max',
                   verbose=1, patience=1500)

start = time.time()

# Train the model.
# NOTE(review): validation_split takes the LAST 10% of the rows as validation
# data, before shuffling. If X_train is ordered by class (e.g. after a
# resampling step that appends one class at the end), the validation set may
# contain a single class — confirm, or shuffle the training data first.
history = mlp_model.fit(X_train, Y_train_MLP,
                        epochs=3000,
                        batch_size=64,
                        verbose=0,
                        workers=20,
                        validation_split=0.1,
                        callbacks=[es]
                 )

end = time.time()

print("Training Over in: ", end-start)

In [None]:
# Persist the trained MLP to mlp_gan_model.h5 in the working directory.
mlp_model.save('mlp_gan_model.h5')

## MLP Model Statistical Analysis

In [None]:
import tensorflow as tf

# Reload the saved MLP (file name suggests the GAN-augmented run), replacing
# the in-memory `mlp_model`.

mlp_model = tf.keras.models.load_model('mlp_gan_model.h5')

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

def plot_confusion_matrix(testLabels,testPred,title,filename='Confusion_Matrix'):
    """Plot, save and show a 2x2 confusion-matrix heatmap.

    Args:
        testLabels: True binary labels (0 = Non-Failure, 1 = Failure).
        testPred: Predicted binary labels, same encoding.
        title: Title drawn above the heatmap.
        filename: Path passed to `plt.savefig`. Defaults to 'Confusion_Matrix';
            pass a different name per model to avoid overwriting earlier plots.
    """
    class_names = ['Non-Failure','Failure']
    # Fixing the label set keeps the matrix 2x2 even when one class is missing
    # from both inputs; otherwise building the labelled frame below fails.
    cm = confusion_matrix(testLabels,testPred,labels=[0, 1])
    cm_df = pd.DataFrame(cm,
                         index = class_names,
                         columns = class_names)
    # Rows are actual classes, columns are predicted classes.
    sns.heatmap(cm_df, annot=True, fmt="d", cmap = "Reds")
    plt.title(title)
    plt.ylabel('Actual Values')
    plt.xlabel('Predicted Values')
    plt.savefig(filename)
    plt.show()

In [None]:
from sklearn.metrics import recall_score, f1_score, accuracy_score, precision_score

# Sigmoid outputs flattened to 1-D and thresholded at 0.5 into 0.0 / 1.0 labels.
Y_predict = (mlp_model.predict(X_test).reshape(-1) >= 0.5).astype("float")

print("Classification Report for MLP model Validation Data\n\n")
print(classification_report(Y_test_MLP,Y_predict))

# Headline scores, computed in the order recall, precision, F1.
recall, precision, f1 = (
    score(Y_test_MLP, Y_predict)
    for score in (recall_score, precision_score, f1_score)
)

print('recall:', recall, '\nprecision:', precision, '\nf1:',f1)

plot_confusion_matrix(Y_test,Y_predict,"Confusion Matrix for MLP model")

In [None]:
# Loss and compiled metrics of the MLP on the test split.
mlp_model.evaluate(X_test, Y_test_MLP)

In [None]:
# Evaluate on both splits. evaluate() is expected to return four values here;
# the last two are used as recall and false-positive count.
train_metrics = mlp_model.evaluate(X_train, Y_train_MLP, verbose=0)
test_metrics = mlp_model.evaluate(X_test, Y_test_MLP, verbose=0)
tr_recall, tr_fp = train_metrics[2:]
test_recall, test_fp = test_metrics[2:]
print(f'Train_recall: {tr_recall:.3f}, Test_recall: {test_recall:.3f}')
print(f'Train_fp: {tr_fp:.3f}, Test_fp: {test_fp:.3f}')

In [None]:
import matplotlib.pyplot as plt

# NOTE: global setting; it also affects every later figure.
plt.rcParams['figure.figsize'] = 20, 10

# Find the metric keys by prefix instead of hard-coding a suffix such as
# 'recall_4'; the suffix depends on how many metric objects were created in the
# session, so a fixed key raises KeyError on a fresh kernel.
hist = history.history
recall_key = next(key for key in hist if key.startswith('recall'))
fp_key = next(key for key in hist if key.startswith('false_positives'))

# Training and validation history: recall on top, false positives below.
fig, (ax1, ax2) = plt.subplots(2, 1)
ax1.plot(hist[recall_key], label='train_recall')
ax1.plot(hist['val_' + recall_key], label='val_recall')
ax2.plot(hist[fp_key], label='train_fp')
ax2.plot(hist['val_' + fp_key], label='val_fp')

# Fixed viewing window for the false-positive panel.
ax2.set_xlim([0, 2500])
ax2.set_ylim([0, 100])

ax1.legend(loc='upper left')
ax2.legend(loc='upper left')

plt.tight_layout();

In [None]:
# Per-epoch loss on the training data and on the validation split.
training_loss = history.history['loss']
valid_loss = history.history['val_loss']

# 1-based epoch numbers for the x axis.
epoch_count = range(1, len(training_loss) + 1)

# Training loss dashed, validation loss solid.
plt.plot(epoch_count, training_loss, '--', label='Training Loss')
plt.plot(epoch_count, valid_loss, '-', label='Validation Loss')
plt.grid()
plt.legend(loc='upper left')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.show();

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot

# Baseline "no skill" scores: a constant 0 for every test sample.
ns_probs = [0] * len(Y_test_MLP)

# Predicted probabilities from the MLP (the `lr_` prefix is historical).
lr_probs = mlp_model.predict(X_test)

# ROC AUC for the baseline and for the model.
ns_auc = roc_auc_score(Y_test_MLP, ns_probs)
lr_auc = roc_auc_score(Y_test_MLP, lr_probs)

print(f'No Skill: ROC AUC={ns_auc:.3f}')
print(f'MLP: ROC AUC={lr_auc:.3f}')

# ROC curve points; only the model curve is drawn below.
ns_fpr, ns_tpr, _ = roc_curve(Y_test_MLP, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(Y_test_MLP, lr_probs)

pyplot.plot(lr_fpr, lr_tpr, marker='.', label='MLP')
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')

# AUC value shown in a shaded text box near the lower right corner.
s = 'AUC = ' + str(lr_auc)
pyplot.text(0.9, 0, s, bbox=dict(facecolor='blue', alpha=0.5))

pyplot.grid()
pyplot.legend(loc='upper left')
pyplot.show()