## Bibliotecas

In [1]:
import os
import pickle
import math

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import tensorflow as tf

from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Activation, Flatten
from tensorflow.keras.optimizers import SGD, Adam, RMSprop
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import Sequential, load_model, model_from_json
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.metrics import Precision, Recall

from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV, ParameterGrid, train_test_split
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, roc_auc_score, roc_curve, auc, precision_score, recall_score, f1_score, accuracy_score
from sklearn.utils import class_weight
from sklearn.preprocessing import StandardScaler

from collections import Counter
from scipy import interp
from pickle import load
from random import randint
from timeit import default_timer as timer
from itertools import cycle

%run ./base_functions.ipynb

In [None]:
# Query whether this TensorFlow build has CUDA support and list the GPUs it can see.
# Only the value of the last expression is displayed as the cell output.
tf.test.is_built_with_cuda()
tf.config.list_physical_devices('GPU')

### Fixando a seed

In [2]:
# Draw a new seed for this run and print it, then feed that single value to
# every pseudo-random generator the notebook relies on.
seed_value = randint(0, 99999)
print(seed_value)

# 1. Export the seed as PYTHONHASHSEED
os.environ['PYTHONHASHSEED'] = str(seed_value)

# 2-4. Seed, in this order: the `python` built-in generator, the `numpy`
#      generator and the `tensorflow` generator
import random
for seed_rng in (random.seed, np.random.seed, tf.random.set_seed):
    seed_rng(seed_value)

## Funções

In [3]:
def df_prepro(data):
    """Split a raw frame into standardized features, labels and one-hot labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the label column ``'STATUS'`` and the column
        ``'Unnamed: 0'``; both are removed from the feature matrix.

    Returns
    -------
    x_data_norm : pandas.DataFrame
        Feature columns standardized column-wise, with the same index and
        column names as the input features.
    y_data : pandas.Series
        Copy of the ``'STATUS'`` column.
    y_ohe : numpy.ndarray
        One-hot encoding of ``'STATUS'`` as produced by ``pd.get_dummies``.
    """
    y_data = data['STATUS'].copy()
    # Name the axis through `columns=`: the positional form `drop(labels, 1)`
    # is no longer accepted by recent pandas releases.
    x_data = data.drop(columns=['Unnamed: 0', 'STATUS'])

    # Data standardization (zero mean, unit variance per column). StandardScaler
    # is the scaler this notebook already imports and uses for the main pipeline.
    scaled = StandardScaler().fit_transform(x_data)
    x_data_norm = pd.DataFrame(scaled, index=x_data.index, columns=x_data.columns)

    # One-hot encode the labels by treating them as a categorical column.
    y_df = pd.DataFrame(y_data).astype('category')
    y_ohe = pd.get_dummies(y_df).values

    return x_data_norm, y_data, y_ohe

## Parâmetros e configurações

In [7]:
# Folder paths (absolute, machine-specific Windows paths).
# data_folder is the prefix used when reading the CSV data sets;
# outputs_folder is the prefix used for the model checkpoint file.

data_folder = "D:\\4. Doutorado\\Tese\\3. Estudos de caso\\Caso 1 - Tennessee Eastman\\4. Bancos simulados - Python\\"
outputs_folder = "C:\\Users\\anaso\\Desktop\\workspace\\doutorado\\outputs\\"

In [8]:
# Identifier of this experiment: it is embedded in the checkpoint file name and
# passed to save_model as `iterator`.
TEST_CODE = 'MLP1'

## Leitura dos dados

In [9]:
# Training/validation data set. The features are every column except
# 'Unnamed: 0' (presumably a saved row index - confirm) and the label 'STATUS'.
# `columns=` is used because the positional axis argument of `drop` is no
# longer accepted by recent pandas releases.
data = pd.read_csv(data_folder + "09-python_dados-3anos.csv", sep=';')
x_data = data.drop(columns=['Unnamed: 0', 'STATUS'])
y_data = data['STATUS'].copy()

# Test data set, split into features and labels the same way.
data_teste = pd.read_csv(data_folder + "13-python_dados-1ano.csv", sep=';')
x_data_teste = data_teste.drop(columns=['Unnamed: 0', 'STATUS'])
y_test_multi = data_teste['STATUS'].copy()

In [10]:
# Shapes of the raw training and test frames as loaded from CSV
print("Treino: ", np.shape(data))
print("Teste:  ", np.shape(data_teste))

## Pré-processamento dos dados

In [11]:
# Train/validation split: 15% of the rows are held out for validation.
# shuffle=False means the rows are not shuffled before splitting.
# NOTE(review): random_state presumably has no effect while shuffle=False - confirm.

X_train, X_valid, y_train_multi, y_valid_multi = train_test_split(x_data, y_data, 
                                                                  test_size=0.15, 
                                                                  random_state=seed_value, 
                                                                  shuffle=False)

# Shapes and per-class counts of the training split
print("\nTREINO")
print("X: ", np.shape(X_train))
print("Y: ", np.shape(y_train_multi))
print("Status:", Counter(y_train_multi))

# Shapes and per-class counts of the validation split
print("\nVALIDAÇÃO")
print("X: ", np.shape(X_valid))
print("Y: ", np.shape(y_valid_multi))
print("Status:", Counter(y_valid_multi))

In [12]:
# Unique class labels present in the training split; displayed as the cell output.
STATUS = np.unique(y_train_multi)
STATUS

In [13]:
# Fit the scaler on the training features only, then apply that same fitted
# transform to the training, validation and test features.
scaler = StandardScaler()
scaler.fit(X_train)

x_train = scaler.transform(X_train)
x_valid = scaler.transform(X_valid)
x_test = scaler.transform(x_data_teste)

In [14]:
# One-hot encode the labels of every split, with one column per entry of STATUS.
# NOTE(review): assumes the labels are integers in 0..len(STATUS)-1 - confirm.
y_train = to_categorical(y_train_multi, num_classes=len(STATUS))
y_valid = to_categorical(y_valid_multi, num_classes=len(STATUS))
y_test = to_categorical(y_test_multi, num_classes=len(STATUS))

In [15]:
# Final check: shapes of the scaled inputs and one-hot outputs of each split

print("TREINO")
print("Entradas:      ", np.shape(x_train))
print("Saída:         ", np.shape(y_train))

# "\n" (not "\V", an invalid escape that printed a literal backslash) so this
# header is separated by a blank line like the test header below.
print("\nVALIDAÇÃO")
print("Entradas:      ", np.shape(x_valid))
print("Saída:         ", np.shape(y_valid))

print("\nTESTE")
print("Entradas:      ", np.shape(x_test))
print("Saída:         ", np.shape(y_test))

## Modelagem do sistema de FDD

In [16]:
# Balanced class weights for the multi-class problem, computed from the
# training labels (recovered from the one-hot matrix with argmax).
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=STATUS,
    y=y_train.argmax(axis=1),
)

# Mapping {class position: weight}, the object later handed to model.fit
# through its class_weight argument.
class_weight_dict = {position: class_weights[position] for position in range(len(STATUS))}

class_weight_dict

### Treinamento do modelo - Treino simples

In [17]:
# Number of columns of the scaled training matrix (used below as the network input size)
x_train.shape[1]

52

In [None]:
# Feed-forward classifier: two ReLU hidden layers (40 and 30 units) followed by
# a softmax output layer with one unit per entry of STATUS.
model = Sequential()

model.add(Dense(40, input_dim=x_train.shape[1], activation ='relu')) 
model.add(Dense(30, activation ='relu')) 
model.add(Dense(len(STATUS), activation ='softmax')) 

# Print the layer-by-layer summary
model.summary()

In [None]:
early_stopping = EarlyStopping(monitor='val_loss', mode='auto', patience=20, restore_best_weights=True)
model_check = ModelCheckpoint(filepath=outputs_folder+"model_cp_"+TEST_CODE+".h5", monitor="val_loss", mode="auto")

ti = timer()

# Treinamento do modelo
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001), 
              metrics=['acc', Precision(), Recall()])

history = model.fit(x_train, y_train, 
                    epochs=100, 
                    batch_size=500, 
                    validation_data=(x_valid, y_valid), 
                    verbose=1,
                    class_weight=class_weight_dict,
                    callbacks=[early_stopping, model_check]) 

save_model(model=model, iterator=TEST_CODE, train_type='simple', model_name='MLP')

tf = timer()

In [None]:
print("Tempo total: " + str(int((tf-ti)//60)) + " minutos e " + str(math.ceil((tf-ti)%60))+ " segundos")

In [None]:
# Print the current date and time
import datetime
print(datetime.datetime.now())

## Análise dos resultados

In [None]:
# Plots - training vs. validation

# Training metrics
try:
    train_acc = history.history[model.metrics_names[1]]
    train_loss = history.history[model.metrics_names[0]]
    train_precision = history.history[model.metrics_names[2]]
    train_recall = history.history[model.metrics_names[3]]
except Exception:
    # Fallback for when `history` comes from the CSV saved for the best model.
    # `except Exception` (instead of a bare `except`) keeps the fallback but no
    # longer swallows KeyboardInterrupt / SystemExit.
    # NOTE(review): best_trial is not defined in this notebook - confirm its origin.
    train_acc = history['acc']
    train_loss = history['loss']
    train_precision = history['precision_'+str(best_trial)]
    train_recall = history['recall_'+str(best_trial)]

# Validation metrics
try:
    val_acc = history.history['val_'+str(model.metrics_names[1])]
    val_loss = history.history['val_'+str(model.metrics_names[0])]
    val_precision = history.history['val_'+str(model.metrics_names[2])]
    val_recall = history.history['val_'+str(model.metrics_names[3])]
except Exception:
    val_acc = history['val_acc']
    val_loss = history['val_loss']
    val_precision = history['val_precision_'+str(best_trial)]
    val_recall = history['val_recall_'+str(best_trial)]

epochs = range(1, len(train_acc) + 1)
fig = plt.figure(figsize=(16,8))
print("Épocas: ", len(epochs))

# One panel per metric on a 2x2 grid: (panel title, legend suffix, training
# series, validation series). Training is drawn in blue, validation in black.
panels = [
    ('Accuracy', 'acc', train_acc, val_acc),
    ('Loss', 'loss', train_loss, val_loss),
    ('Precision', 'precision', train_precision, val_precision),
    ('Recall', 'recall', train_recall, val_recall),
]
for position, (panel_title, suffix, train_series, val_series) in enumerate(panels, start=1):
    plt.subplot(2, 2, position)
    plt.plot(epochs, train_series, '-bo', label='Training ' + suffix)
    plt.plot(epochs, val_series, '-ko', label='Validation ' + suffix)
    plt.title(panel_title)
    plt.legend()

plt.show()

### TREINAMENTO

In [None]:
# Confusion matrix with heatmap - training set

out_train = model.predict(x_train)

# `labels` pins one row/column per class position, so the matrix always matches
# the STATUS index even when a class is absent from this split.
df_cm_train = pd.DataFrame(
    confusion_matrix(y_train.argmax(axis=1), out_train.argmax(axis=1), labels=np.arange(len(STATUS))),
    index=list(STATUS), columns=list(STATUS))

# Row-normalize: divide every row by its total (the number of true samples of that class).
# DataFrame.div(..., axis=0) replaces `Series[:, np.newaxis]`, a multi-dimensional
# Series indexing that recent pandas releases no longer support.
df_cm_train_norm = df_cm_train.astype('float').div(df_cm_train.sum(axis=1), axis=0).round(2)

plt.figure(figsize = (13,12), dpi=600)
# seaborn is imported as `sns` in this notebook (`sn` is never defined here).
ax = sns.heatmap(df_cm_train_norm, annot=True, cmap='PuBu')
ax.set_xlabel("CLASSES PREDITAS", fontsize=12)
ax.set_ylabel("CLASSES REAIS", fontsize=12)

# Widen the y-limits by half a cell on each side.
# NOTE(review): looks like a workaround for clipped first/last heatmap rows - confirm it is still needed.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
plt.show()

In [None]:
# Further training-set metrics - precision, recall, F-score

# `metrics` is not defined in this notebook (presumably provided by base_functions.ipynb).
train_metrics = metrics(y_train, out_train, model, df_cm_train, STATUS, multi_problem=True)

# Each overall figure is the per-class column summed and divided by the number
# of classes, expressed as a percentage.
train_overall_precision = train_metrics['Precision'].sum() / len(STATUS) * 100
print("\nOverall Precision: {:.2f}%".format(train_overall_precision))

train_overall_recall = train_metrics['Recall'].sum() / len(STATUS) * 100
print("Overall Recall:    {:.2f}%".format(train_overall_recall))

train_overall_f1 = train_metrics['F-score(a=1)'].sum() / len(STATUS) * 100
print("Overall F1-Score:  {:.2f}%".format(train_overall_f1))

train_overall_auc = roc_auc_score(y_train, out_train) * 100
print("Overall AUC:       {:.2f}%".format(train_overall_auc))

train_overall_accuracy = accuracy_score(y_train.argmax(axis=1), out_train.argmax(axis=1)) * 100
print("Overall Accuracy:  {:.2f}%".format(train_overall_accuracy))

train_metrics

### VALIDAÇÃO

In [None]:
# Confusion matrix with heatmap - validation set

out_valid = model.predict(x_valid)

# `labels` pins one row/column per class position, so the matrix always matches
# the STATUS index even when a class is absent from this split.
df_cm_valid = pd.DataFrame(
    confusion_matrix(y_valid.argmax(axis=1), out_valid.argmax(axis=1), labels=np.arange(len(STATUS))),
    index=list(STATUS), columns=list(STATUS))

# Row-normalize: divide every row by its total (the number of true samples of that class).
# DataFrame.div(..., axis=0) replaces `Series[:, np.newaxis]`, a multi-dimensional
# Series indexing that recent pandas releases no longer support.
df_cm_valid_norm = df_cm_valid.astype('float').div(df_cm_valid.sum(axis=1), axis=0).round(2)

plt.figure(figsize = (13,12), dpi=600)
# seaborn is imported as `sns` in this notebook (`sn` is never defined here).
ax = sns.heatmap(df_cm_valid_norm, annot=True, cmap='PuBu')
ax.set_xlabel("CLASSES PREDITAS", fontsize=12)
ax.set_ylabel("CLASSES REAIS", fontsize=12)

# Widen the y-limits by half a cell on each side.
# NOTE(review): looks like a workaround for clipped first/last heatmap rows - confirm it is still needed.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
plt.show()

In [None]:
# Further validation-set metrics - precision, recall, F-score

# `metrics` is not defined in this notebook (presumably provided by base_functions.ipynb).
valid_metrics = metrics(y_valid, out_valid, model, df_cm_valid, STATUS, multi_problem=True)

# Each overall figure is the per-class column summed and divided by the number
# of classes, expressed as a percentage.
valid_overall_precision = valid_metrics['Precision'].sum() / len(STATUS) * 100
print("\nOverall Precision: {:.2f}%".format(valid_overall_precision))

valid_overall_recall = valid_metrics['Recall'].sum() / len(STATUS) * 100
print("Overall Recall:    {:.2f}%".format(valid_overall_recall))

valid_overall_f1 = valid_metrics['F-score(a=1)'].sum() / len(STATUS) * 100
print("Overall F1-Score:  {:.2f}%".format(valid_overall_f1))

valid_overall_auc = roc_auc_score(y_valid, out_valid) * 100
print("Overall AUC:       {:.2f}%".format(valid_overall_auc))

valid_overall_accuracy = accuracy_score(y_valid.argmax(axis=1), out_valid.argmax(axis=1)) * 100
print("Overall Accuracy:  {:.2f}%".format(valid_overall_accuracy))

valid_metrics

### TESTE

In [None]:
# Confusion matrix with heatmap - test set

out_test = model.predict(x_test)

# `labels` pins one row/column per class position, so the matrix always matches
# the STATUS index even when a class is absent from this split.
df_cm_test = pd.DataFrame(
    confusion_matrix(y_test.argmax(axis=1), out_test.argmax(axis=1), labels=np.arange(len(STATUS))),
    index=list(STATUS), columns=list(STATUS))

# Row-normalize: divide every row by its total (the number of true samples of that class).
# DataFrame.div(..., axis=0) replaces `Series[:, np.newaxis]`, a multi-dimensional
# Series indexing that recent pandas releases no longer support.
df_cm_test_norm = df_cm_test.astype('float').div(df_cm_test.sum(axis=1), axis=0).round(2)

plt.figure(figsize = (13,12), dpi=600)
# seaborn is imported as `sns` in this notebook (`sn` is never defined here).
ax = sns.heatmap(df_cm_test_norm, annot=True, cmap='PuBu')
ax.set_xlabel("CLASSES PREDITAS", fontsize=12)
ax.set_ylabel("CLASSES REAIS", fontsize=12)

# Widen the y-limits by half a cell on each side.
# NOTE(review): looks like a workaround for clipped first/last heatmap rows - confirm it is still needed.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
plt.show()

In [None]:
# Further test-set metrics - precision, recall, F-score

# `metrics` is not defined in this notebook (presumably provided by base_functions.ipynb).
test_metrics = metrics(y_test, out_test, model, df_cm_test, STATUS, multi_problem=True)

# Each overall figure is the per-class column summed and divided by the number
# of classes, expressed as a percentage.
test_overall_precision = test_metrics['Precision'].sum() / len(STATUS) * 100
print("\nOverall Precision: {:.2f}%".format(test_overall_precision))

test_overall_recall = test_metrics['Recall'].sum() / len(STATUS) * 100
print("Overall Recall:    {:.2f}%".format(test_overall_recall))

test_overall_f1 = test_metrics['F-score(a=1)'].sum() / len(STATUS) * 100
print("Overall F1-Score:  {:.2f}%".format(test_overall_f1))

test_overall_auc = roc_auc_score(y_test, out_test) * 100
print("Overall AUC:       {:.2f}%".format(test_overall_auc))

test_overall_accuracy = accuracy_score(y_test.argmax(axis=1), out_test.argmax(axis=1)) * 100
print("Overall Accuracy:  {:.2f}%".format(test_overall_accuracy))

test_metrics

In [None]:
# Per-class ROC curves on the test set; only the curve of class 0 is plotted here.
# roc_curve comes from sklearn.metrics (imported in the first cell).

# y_test is already one-hot encoded; the model outputs are used as class scores.
y_pred = out_test.copy()

n_classes = len(STATUS)

# ROC curve and AUC of every class, each treated as its own binary problem
# (column i of the one-hot labels against column i of the scores).
fpr = {}
tpr = {}
roc_auc = {}
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_pred[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Micro-average ROC curve and area: all (label, score) pairs pooled together
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_pred.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

plt.figure(figsize=(5,5), dpi=300)
plt.plot(fpr[0], tpr[0], color='darkorange', lw=2, label='Curva ROC (area = %0.3f)' % roc_auc[0])
# Diagonal reference line
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Taxa de falsos positivos')
plt.ylabel('Taxa de verdadeiros positivos')
plt.legend(loc="lower right")
plt.show()

In [None]:
# ROC curves and AUC for all classes, plus micro- and macro-averages

roc_auc_scores = []

# First aggregate all false positive rates
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

# Then interpolate every per-class ROC curve on that common grid.
# np.interp replaces the deprecated alias `scipy.interp`.
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

# Finally average it and compute AUC
mean_tpr /= n_classes

fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

# Plot all ROC curves
plt.figure(figsize=(12,12), dpi=300)
plt.plot(fpr["micro"], tpr["micro"], label='micro-average ROC curve (area = {0:0.3f})'.format(roc_auc["micro"]),
         color='deeppink', linestyle=':', linewidth=4)

plt.plot(fpr["macro"], tpr["macro"], label='macro-average ROC curve (area = {0:0.3f})'.format(roc_auc["macro"]),
         color='navy', linestyle=':', linewidth=4)

# Three colors are cycled across the classes; the per-class AUCs are also
# collected in roc_auc_scores.
colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=2, label='ROC curve of class {0} (area = {1:0.3f})'.format(i, roc_auc[i]))
    roc_auc_scores.append(roc_auc[i])

# Diagonal reference line
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Taxa de falsos positivos')
plt.ylabel('Taxa de verdadeiros positivos')
plt.legend(loc="lower right")
plt.show()