## Bibliotecas

In [None]:
import collections
import math
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sn
import tensorflow as tf

from collections import Counter
from scipy import interp

from tensorflow import keras
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Conv2D, Dense, Dropout, MaxPooling2D, BatchNormalization, Activation, Flatten, GaussianNoise
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.models import Sequential, load_model, model_from_json
from tensorflow.keras.optimizers import SGD, Adam, RMSprop
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.metrics import Precision, Recall

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV, ParameterGrid, train_test_split
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support , roc_auc_score, auc, precision_score, recall_score, f1_score, accuracy_score
from sklearn.utils import class_weight

from pickle import load
from timeit import default_timer as timer
from random import randint
from itertools import cycle

# Executes the companion notebook in this session. Helpers that are called below but
# not defined in this file (matrix_generator, save_model, display_metrics,
# calculate_roc_auc) presumably come from it — TODO confirm.
%run ./base_functions.ipynb
# %run ./config.ipynb

# Turns off pandas' chained-assignment warning for the whole session.
pd.options.mode.chained_assignment = None

In [None]:
# Report whether this TensorFlow build has CUDA support and which GPUs are visible.
# Both results are printed explicitly: a notebook cell only echoes its last
# expression, so the CUDA flag would otherwise be computed and silently dropped.
print("Built with CUDA:", tf.test.is_built_with_cuda())
print("GPUs:", tf.config.list_physical_devices('GPU'))

### Fixando a seed

In [None]:
import random

# Draw a new seed for this run and print it so the run can be reproduced later.
seed_value = randint(0, 99999)
print(seed_value)

# NOTE: the interpreter reads PYTHONHASHSEED at start-up, so setting it here is
# only visible to processes launched from this session.
os.environ['PYTHONHASHSEED'] = str(seed_value)

# Seed, in this order, Python's built-in generator, NumPy's global generator and
# TensorFlow's global generator with the same value.
for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    seed_fn(seed_value)

## Parâmetros e configurações

In [None]:
# Folder paths (absolute, machine-specific Windows paths)

# Folder the CSV datasets are read from.
data_folder = "D:\\Caso 2 - Bomba Centrífuga\\Bancos tratados\\"
# Folder prefixed to the ModelCheckpoint file path in the training cell.
outputs_folder = "C:\\Users\\anaso\\Desktop\\workspace\\doutorado\\outputs\\"

In [None]:
# Identifier of this experiment: it is part of the checkpoint file name and is
# passed to save_model as `iterator`.
TEST_CODE = 'T1'

### Funções

In [None]:
def vectorized_stride_ana(array, max_time, sub_window_size, stride_size):
    """Cut ``array`` into windows of consecutive rows taken along axis 0.

    Window ``k`` holds rows ``k * stride_size`` up to
    ``k * stride_size + sub_window_size - 1``. Only windows that fit entirely
    inside ``array`` are produced.

    The previous implementation located the last valid window by reading
    column ``array.shape[1] - 1`` of the index matrix, i.e. it used the number
    of data columns where the window length was meant, so it only worked when
    both happened to be equal. It also materialised one index row per
    candidate start before applying the stride. Here the last valid start is
    computed directly and only the strided starts are expanded.

    Args:
        array: NumPy array indexed along its first axis (any number of
            trailing dimensions).
        max_time: Largest row index allowed as a window start.
        sub_window_size: Number of rows in each window.
        stride_size: Distance, in rows, between consecutive window starts.

    Returns:
        Array of shape ``(n_windows, sub_window_size, *array.shape[1:])``.

    Raises:
        ValueError: If ``sub_window_size`` or ``stride_size`` is smaller than
            1, or if not even one full window fits.
    """
    if sub_window_size < 1 or stride_size < 1:
        raise ValueError("sub_window_size and stride_size must be >= 1")

    # Last start index at which a complete window still fits, capped by max_time.
    last_start = min(max_time, len(array) - sub_window_size)
    if last_start < 0:
        raise ValueError(
            "no window of %d rows fits (len(array)=%d, max_time=%d)"
            % (sub_window_size, len(array), max_time)
        )

    # One row of indices per window: start + [0, 1, ..., sub_window_size - 1].
    starts = np.arange(0, last_start + 1, stride_size)
    window_rows = starts[:, np.newaxis] + np.arange(sub_window_size)

    # Fancy indexing gathers all windows in a single operation.
    return array[window_rows]

## Leitura dos dados

In [None]:
# Load the labelled dataset (semicolon-separated CSV) and preview its first rows.
data = pd.read_csv(f"{data_folder}banco_labeling-v1.csv", sep=';')
data.head()

In [None]:
# Separate the time column, the two label columns and the remaining feature columns.
label_columns = ['rotulos_multi', 'rotulos_bin']
time_data = data[['Time']]
y_multi = data[['rotulos_multi']]
y_bin = data[['rotulos_bin']]
x_data = data.drop(columns=label_columns + ['Time'])

for frame in (x_data, y_multi):
    print(frame.shape)

In [None]:
# Number of rows per multi-class label.
Counter(y_multi['rotulos_multi'])

In [None]:
# Number of rows per binary label.
Counter(y_bin['rotulos_bin'])

In [None]:
# Load the parameter table stored next to the labelled dataset (semicolon-separated CSV).
params = pd.read_csv(f"{data_folder}banco_labeling_params-v1.csv", sep=';')

## Pré-processamento dos dados

In [None]:
# Shows the parameter rows from position 60 onwards; helps to identify the point
# at which the dataset has to be split.
params[60:]

In [None]:
# Remove the columns that are not needed.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas deprecated in 1.3 and no longer accepts from 2.0 on.
params.drop(columns=['status_init', 'status_end'], inplace=True)

In [None]:
# Split point between the modelling portion (training + validation) and the
# held-out test portion.
N_MODEL_PARAM_ROWS = 74
SPLIT_LABEL = 1212560

# NOTE(review): `.loc` slices include both end labels, so the row labelled
# SPLIT_LABEL ends up in the modelling data *and* in the test data, and the sum
# printed at the end is one larger than the number of rows in x_data (assuming
# unique index labels) — confirm which side should own that row.
new_params = params.iloc[:N_MODEL_PARAM_ROWS, :]
new_x_data = x_data.loc[:SPLIT_LABEL, :]
new_y_multi = y_multi.loc[:SPLIT_LABEL]

test_params = params[N_MODEL_PARAM_ROWS:]
x_test = x_data.loc[SPLIT_LABEL:, :]
y_test = y_multi.loc[SPLIT_LABEL:]

# Test-set shapes and label distribution, followed by an empty line.
print(x_test.shape, y_test.shape, Counter(y_test['rotulos_multi']), "", sep="\n")

# Sanity check: total number of rows versus the sum of both partitions.
print(x_data.shape)
print("", new_x_data.shape[0] + x_test.shape[0])

In [None]:
nlinhas = 25
ncolunas = 25
sliding_window = 5

ti = timer()
x_windows, y_windows, y_windows_ohe = matrix_generator(new_x_data, new_params, nlinhas, ncolunas, \
                                                       sliding_window)
tf = timer()

In [None]:
print("Tempo total: " + str(int((tf-ti)//60)) + " minutos e " + str(math.ceil((tf-ti)%60))+ " segundos")

In [None]:
# Random, stratified split of the windows into training and validation sets.
split_parts = train_test_split(
    x_windows,
    y_windows,
    test_size=0.15,
    shuffle=True,
    stratify=y_windows,
    random_state=seed_value,
)
X_train, X_valid, y_train_multi, y_valid_multi = split_parts

# Same report for both partitions: shapes and label distribution.
for heading, features, labels in (
    ("\nTREINO", X_train, y_train_multi),
    ("\nVALIDAÇÃO", X_valid, y_valid_multi),
):
    print(heading)
    print("X: ", np.shape(features))
    print("Y: ", np.shape(labels))
    print("Status:", Counter(labels))

In [None]:
# Collapse the multi-class labels to binary: 0 stays 0, every other label becomes 1.
y_windows_train_bin, y_windows_valid_bin = (
    np.where(labels != 0, 1, labels) for labels in (y_train_multi, y_valid_multi)
)

# One-hot encode the binary labels (two columns).
y_windows_train_ohe_bin, y_windows_valid_ohe_bin = (
    to_categorical(binary_labels, num_classes=2)
    for binary_labels in (y_windows_train_bin, y_windows_valid_bin)
)

# Names used by the training and evaluation cells.
y_train, y_valid = y_windows_train_ohe_bin, y_windows_valid_ohe_bin

# Class indices of the binary problem.
STATUS = [0, 1]

In [None]:
# Scaling

# Stack the rows of all windows into a 2-D table with `ncolunas` columns so the
# scaler learns one mean/std per column. The shape is derived from `ncolunas`
# (and -1 for the row count) instead of the literal 25, so this cell keeps
# working when the window geometry defined earlier is changed.
df_train = pd.DataFrame(X_train.reshape((-1, ncolunas)))
df_valid = pd.DataFrame(X_valid.reshape((-1, ncolunas)))

# Fit on the training rows only; the validation rows are transformed with the
# training statistics.
scaler = StandardScaler()
scaler.fit(df_train)

df_train_norm = scaler.transform(df_train)
df_valid_norm = scaler.transform(df_valid)

In [None]:
# Regroup the scaled rows into non-overlapping windows of `nlinhas` rows
# (window length == stride) and append a trailing channel axis.
back_to_4d_train = vectorized_stride_ana(
    df_train_norm,
    max_time=len(df_train_norm) - 1,
    sub_window_size=nlinhas,
    stride_size=nlinhas,
)
x_train = np.reshape(
    back_to_4d_train, (back_to_4d_train.shape[0], nlinhas, ncolunas, 1), order='C'
)

back_to_4d_valid = vectorized_stride_ana(
    df_valid_norm,
    max_time=len(df_valid_norm) - 1,
    sub_window_size=nlinhas,
    stride_size=nlinhas,
)
x_valid = np.reshape(
    back_to_4d_valid, (back_to_4d_valid.shape[0], nlinhas, ncolunas, 1), order='C'
)

# Shapes before (X_*) and after (x_*) scaling, one pair per partition.
print(X_train.shape, x_train.shape, "", X_valid.shape, x_valid.shape, sep="\n")

In [None]:
# Test set, without shuffling

# Scale the raw test rows with the scaler fitted on the training data, keeping
# the original column names.
test_norm = pd.DataFrame(scaler.transform(x_test), columns=x_test.columns)

# NOTE: from this point on `x_test` holds the generated windows instead of the
# raw DataFrame, and `y_windows` / `y_windows_ohe` hold the test-set values.
x_test, y_windows, y_windows_ohe = matrix_generator(
    test_norm, test_params, nlinhas, ncolunas, sliding_window
)

# Empty line, then the window shapes and the label distribution.
print("", x_test.shape, y_windows.shape, Counter(y_windows), sep="\n")

# Binary labels (0 stays 0, anything else becomes 1), then one-hot encoding.
y_windows_test_bin = np.where(y_windows != 0, 1, y_windows)
y_test = to_categorical(y_windows_test_bin, num_classes=2)

In [None]:
# Final shapes of the test inputs and of the one-hot test labels.
print(x_test.shape, y_test.shape, sep="\n")

## Modelagem do sistema de FDD

In [None]:
# Class counts taken from the one-hot training labels (index 0 -> neg, index 1 -> pos).
class_counts = np.bincount(y_train.argmax(axis=1))
neg, pos = class_counts
total = neg + pos
summary_template = 'Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'
print(summary_template.format(total, pos, 100 * pos / total))

# Inverse-frequency class weights.
# Scaling by total/2 helps keep the loss to a similar magnitude.
# The sum of the weights of all examples stays the same.
weight_for_0, weight_for_1 = ((1 / count) * (total) / 2.0 for count in (neg, pos))

class_weight_dict = {0: weight_for_0, 1: weight_for_1}

print('Weight for class 0: {:.2f}'.format(weight_for_0))
print('Weight for class 1: {:.2f}'.format(weight_for_1))

### Treinamento do modelo - Treino simples

In [None]:
# Model topology: two 3x3 convolutions, one 2x2 max-pooling, then a softmax
# classifier with one unit per entry of STATUS.

# Settings shared by both convolutional layers.
conv_settings = dict(
    kernel_size=(3, 3),
    strides=1,
    padding="same",
    data_format='channels_last',
    use_bias=True,
    kernel_initializer="glorot_uniform",
    bias_initializer="zeros",
    activation='relu',
)

model = Sequential([
    Conv2D(filters=10, input_shape=(nlinhas, ncolunas, 1), **conv_settings),
    Conv2D(filters=20, **conv_settings),
    MaxPooling2D(pool_size=(2, 2), strides=2, data_format='channels_last'),
    # Disabled experiment: a third Conv2D (filters=30, same settings) followed by
    # an identical MaxPooling2D used to sit here.
    Flatten(),
    Dense(units=len(STATUS), activation='softmax'),
])

model.summary()

In [None]:
# Treinamento da rede convolucional

# Definição dos callbacks usados
callbacks_list = [EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True),
                  ModelCheckpoint(filepath=outputs_folder+"model_cp_"+TEST_CODE+".h5", monitor="val_loss", mode="auto")]

model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001), 
              metrics=['acc', Precision(), Recall()])

ti = timer()

history = model.fit(x_train, y_train, 
                    epochs=10,#0, 
                    batch_size=500, 
                    validation_data=(x_valid, y_valid), 
                    verbose=1,
                    shuffle=False,
                    class_weight=class_weight_dict,
                    callbacks=callbacks_list) 

save_model(model=model, iterator=TEST_CODE, train_type='simple', model_name='CNN')

tf = timer()

In [None]:
print("Tempo total: " + str(int((tf-ti)//60)) + " minutos e " + str(math.ceil((tf-ti)%60))+ " segundos")

In [None]:
# Print the current local date and time.
import datetime
print(datetime.datetime.now())

## Análise dos resultados

In [None]:
# Plots - training vs. validation

# `history` is either the object returned by model.fit (per-epoch values under
# `.history`) or an already tabular history, e.g. the one loaded from the CSV
# saved for the best model, which is indexed directly. The two cases are told
# apart explicitly instead of through a bare `except:`, which also swallowed
# KeyboardInterrupt and hid genuine errors such as a misspelled key.
if hasattr(history, 'history'):
    logs = history.history
    # model.metrics_names is read in the order loss, acc, precision, recall.
    loss_key, acc_key, precision_key, recall_key = model.metrics_names[:4]
else:
    logs = history
    loss_key, acc_key = 'loss', 'acc'
    # `best_trial` is not defined in this file; it must already exist in the
    # session when this branch is taken.
    precision_key = 'precision_' + str(best_trial)
    recall_key = 'recall_' + str(best_trial)

# Training information
train_acc = logs[acc_key]
train_loss = logs[loss_key]
train_precision = logs[precision_key]
train_recall = logs[recall_key]

# Validation information (same keys with the 'val_' prefix)
val_acc = logs['val_' + acc_key]
val_loss = logs['val_' + loss_key]
val_precision = logs['val_' + precision_key]
val_recall = logs['val_' + recall_key]

epochs = range(1, len(train_acc) + 1)
fig = plt.figure(figsize=(16,8))
print("Épocas: ", len(epochs))

# One panel per metric on a 2x2 grid: (title, legend name, training, validation).
panels = (
    ('Accuracy', 'acc', train_acc, val_acc),
    ('Loss', 'loss', train_loss, val_loss),
    ('Precision', 'precision', train_precision, val_precision),
    ('Recall', 'recall', train_recall, val_recall),
)
for position, (title, legend_name, train_values, val_values) in enumerate(panels, start=1):
    plt.subplot(2, 2, position)
    plt.plot(epochs, train_values, '-bo', label='Training ' + legend_name)
    plt.plot(epochs, val_values, '-ko', label='Validation ' + legend_name)
    plt.title(title)
    plt.legend()

plt.show()

In [None]:
# Training set: metrics computed by the project helper display_metrics
# (defined outside this file); the result is shown as the cell output.

train_metrics = display_metrics(x_train, y_train, model, STATUS, 'Treino', multi_problem=False) 
train_metrics

In [None]:
# Validation set: metrics computed by the project helper display_metrics
# (defined outside this file); the result is shown as the cell output.

valid_metrics = display_metrics(x_valid, y_valid, model, STATUS, 'Validação', multi_problem=False) 
valid_metrics

In [None]:
# Test set: metrics computed by the project helper display_metrics
# (defined outside this file); the result is shown as the cell output.

test_metrics = display_metrics(x_test, y_test, model, STATUS, 'Teste', multi_problem=False) 
test_metrics

In [None]:
pred = model.predict(x_test)

# True class index of every test window.
plt.figure(figsize=(12,3), dpi=100)
plt.plot(y_test.argmax(axis=1))
plt.title("Real")

# Predicted class index of every test window.
plt.figure(figsize=(12,3), dpi=100)
plt.plot(pred.argmax(axis=1))
plt.title("Predito")

# Sensor signal for visual reference. It is read from `test_norm` (the scaled
# test DataFrame): `x_test` was rebound to the window array returned by
# matrix_generator, which cannot be indexed by column name. Its x axis counts
# rows, not windows.
plt.figure(figsize=(12,3), dpi=100)
plt.plot(test_norm['VT-322'])

In [None]:
# ROC/AUC on the test set through the project helper calculate_roc_auc
# (defined outside this file).
calculate_roc_auc(x_test, y_test, model, 'CNN')

In [None]:
# ROC curve for the binary case (the curve plotted is the one of class 0)

# Imported here because the import cell at the top of the file does not bring
# roc_curve into scope.
from sklearn.metrics import roc_curve

model_pred = model.predict(x_test)

n_classes = len(STATUS)

# Hard predictions, one-hot encoded with a fixed number of columns.
# pd.get_dummies only creates columns for classes that actually occur, so
# `y_pred[:, i]` failed whenever the model predicted a single class.
y_pred = to_categorical(model_pred.argmax(axis=1), num_classes=n_classes)

# ROC curve and AUC for each class.
# NOTE(review): the curves are built from thresholded labels, so each has a
# single operating point; passing the scores `model_pred[:, i]` instead would
# trace the full curve — confirm which one is wanted before changing reported
# numbers.
fpr = {}
tpr = {}
roc_auc = {}
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_pred[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Micro-average ROC curve and ROC area (all classes flattened together).
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_pred.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

plt.figure(figsize=(5,5), dpi=300)
plt.plot(fpr[0], tpr[0], color='darkorange', lw=2, label='Curva ROC (area = %0.3f)' % roc_auc[0])
# Diagonal reference line.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Taxa de falsos positivos')
plt.ylabel('Taxa de verdadeiros positivos')
plt.legend(loc="lower right")
plt.show()

In [None]:
# ROC curves and AUC for all classes, plus micro and macro averages.
# Relies on `fpr`, `tpr`, `roc_auc` and `n_classes` from the previous cell.

# First aggregate all false positive rates.
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

# Then interpolate every ROC curve at these points. np.interp is used instead
# of `scipy.interp`, a deprecated alias of it that newer SciPy releases no
# longer provide.
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

# Finally average and compute the AUC of the macro curve.
mean_tpr /= n_classes

fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

# Plot all ROC curves.
plt.figure(figsize=(12,12), dpi=300)
plt.plot(fpr["micro"], tpr["micro"],
         label='micro-average ROC curve (area = {0:0.3f})'.format(roc_auc["micro"]),
         color='deeppink', linestyle=':', linewidth=4)

plt.plot(fpr["macro"], tpr["macro"],
         label='macro-average ROC curve (area = {0:0.3f})'.format(roc_auc["macro"]),
         color='navy', linestyle=':', linewidth=4)

colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=2,
             label='ROC curve of class {0} (area = {1:0.3f})'.format(i, roc_auc[i]))

# Per-class AUC values, in class order.
roc_auc_scores = [roc_auc[i] for i in range(n_classes)]

# Diagonal reference line.
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Taxa de falsos positivos')
plt.ylabel('Taxa de verdadeiros positivos')
plt.legend(loc="lower right")
plt.show()