## Bibliotecas

In [None]:
import collections
import math
import os
import pickle
from itertools import cycle
from pickle import load
from random import randint
from timeit import default_timer as timer

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf

from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV, ParameterGrid, train_test_split
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support , roc_auc_score, auc, precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import roc_curve
from sklearn.utils import class_weight

# `scipy.interp` was only an alias of `numpy.interp` and is missing from recent
# SciPy releases; fall back to the NumPy function so this cell keeps importing.
try:
    from scipy import interp
except ImportError:
    interp = np.interp

%run ./base_functions.ipynb

In [None]:
# Report both checks in a single expression: a notebook cell only displays its
# last expression, so a bare first call would have its result silently dropped.
{
    "built_with_cuda": tf.test.is_built_with_cuda(),
    "gpus": tf.config.list_physical_devices('GPU'),
}

### Fixando a seed

In [None]:
# Draw a seed for this run and print it, so the run can be reproduced later by
# assigning the printed value to seed_value instead of calling randint.
seed_value = randint(0, 99999)
print(seed_value)

# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting
# it here presumably only affects child processes — confirm.
os.environ['PYTHONHASHSEED'] = str(seed_value)

# Seed, in this order, the built-in `random`, NumPy and TensorFlow generators
# with the same value.
import random
for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    seed_fn(seed_value)

## Funções

In [None]:
def df_prepro(data):
    """Split a raw dataframe into min-max scaled features and the target column.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the 'STATUS' target column and the 'Unnamed: 0' column
        (dropped; presumably the index written by an earlier CSV export —
        TODO confirm).

    Returns
    -------
    x_data_norm : pandas.DataFrame
        The remaining columns rescaled with ``MinMaxScaler`` (default range),
        keeping the input index and column names.
    y_data : pandas.Series
        A copy of ``data['STATUS']``.

    Notes
    -----
    The scaler is fitted on ``data`` itself, so calling this function separately
    on two datasets scales each one with its own minimum and maximum.
    """
    y_data = data['STATUS'].copy()
    # Keyword form: the positional `axis` argument of `drop` was removed in pandas 2.0.
    x_data = data.drop(columns=['Unnamed: 0', 'STATUS'])

    # Rescale every feature to the [0, 1] range
    min_max_scaler = preprocessing.MinMaxScaler()
    scaled = min_max_scaler.fit_transform(x_data)
    x_data_norm = pd.DataFrame(scaled, index=x_data.index, columns=x_data.columns)

    return x_data_norm, y_data

## Parâmetros e configurações

In [None]:
# Folder locations.
# The defaults are the original machine-specific paths; set the TEP_DATA_FOLDER /
# TEP_OUTPUTS_FOLDER environment variables to run on another machine without
# editing the notebook. Values must end with a path separator because file
# paths below are built by plain string concatenation.

data_folder = os.environ.get("TEP_DATA_FOLDER", "D:\\TEP - Python\\")
outputs_folder = os.environ.get("TEP_OUTPUTS_FOLDER",
                                "C:\\Users\\anaso\\Desktop\\workspace\\doutorado\\outputs\\")

In [None]:
# Experiment identifier; appended to the file name of the saved model
TEST_CODE = "RF28_new"

## Leitura dos dados

In [None]:
# Both files are ';'-separated CSVs. `data` feeds the train/validation split and
# `data_teste` is the held-out test set (the file names suggest 3 years and
# 1 year of records respectively — TODO confirm).
train_csv_path = data_folder + "09-python_dados-3anos.csv"
test_csv_path = data_folder + "13-python_dados-1ano.csv"

data = pd.read_csv(train_csv_path, sep=';')
data_teste = pd.read_csv(test_csv_path, sep=';')

In [None]:
# Quick sanity check of the raw dataset dimensions (rows, columns)
for split_label, frame in (("Treino: ", data), ("Teste:  ", data_teste)):
    print(split_label, np.shape(frame))

## Pré-processamento dos dados

In [None]:
# Separate features from the 'STATUS' target, then carve a validation set out of
# the training data. `drop` uses the keyword form because the positional `axis`
# argument was removed in pandas 2.0.
target_full = data['STATUS'].copy()
features_full = data.drop(columns=['Unnamed: 0', 'STATUS'])

# shuffle=False keeps the original row order, so the validation set is the final
# 15% of the rows. scikit-learn does not use random_state when shuffle=False; it
# is kept only so that enabling shuffling later stays seeded.
x_train, x_valid, y_train, y_valid = train_test_split(features_full, target_full,
                                                      test_size=0.15,
                                                      random_state=seed_value,
                                                      shuffle=False)

y_test = data_teste['STATUS'].copy()
x_test = data_teste.drop(columns=['Unnamed: 0', 'STATUS'])

In [None]:
# Partial check: shapes of the inputs and outputs of every split

print("TREINO")
print("Entradas:      ", np.shape(x_train))
print("Saída:         ", np.shape(y_train))

# "\n" (not "\V", an invalid escape sequence) so the header starts on its own
# line like the test header below.
print("\nVALIDAÇÃO")
print("Entradas:      ", np.shape(x_valid))
print("Saída:         ", np.shape(y_valid))

print("\nTESTE")
print("Entradas:      ", np.shape(x_test))
print("Saída:         ", np.shape(y_test))

In [None]:
# Sorted list of the distinct class labels found in the test target; used below
# as the axis labels of every confusion matrix and as the class count.
distinct_test_labels = y_test.unique()
STATUS = np.sort(distinct_test_labels)
STATUS

## Modelagem do sistema de FDD

In [None]:
# Balanced class weights for the multiclass problem

train_classes = np.unique(y_train)
class_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                  classes=train_classes,
                                                  y=y_train)

# Key every weight by its own class label. Indexing by position over
# range(len(STATUS)) is only right when the labels are exactly 0..n-1 and the
# test set (where STATUS comes from) holds the same classes as the training set.
class_weight_dict = dict(zip(train_classes.tolist(), class_weights.tolist()))

class_weight_dict

### Treinamento do modelo - Treino simples

In [None]:
# Definição da topologia do modelo - Random Forest

ti = timer()

model = RandomForestClassifier(n_estimators=100, 
                               criterion='gini',
                               max_depth=50,
                               max_features=15,
                               bootstrap=True,
                               oob_score=True, 
                               n_jobs=-1,
                               random_state=seed_value,
                               verbose=2, 
                               class_weight=class_weight_dict)
model.fit(x_train, y_train)

tf = timer()

In [None]:
print("Tempo total: " + str(int((tf-ti)//60)) + " minutos e " + str(math.ceil((tf-ti)%60))+ " segundos.")

In [None]:
# Out-of-bag score of the fitted forest (the model was built with oob_score=True)
model.oob_score_ 

## Análise dos resultados

### Métricas do treinamento

In [None]:
# Confusion matrix with heatmap - training split

out_train = model.predict(x_train)

# labels=STATUS pins the row/column order to the axis labels used below and keeps
# the matrix square even if this split does not contain every class.
df_cm_train = pd.DataFrame(confusion_matrix(y_train, out_train, labels=STATUS),
                           index=list(STATUS), columns=list(STATUS))

# Row-normalise: each row is divided by its number of true samples. Uses
# DataFrame.div because indexing a Series with [:, np.newaxis] was removed in
# pandas 2.0. A class with no true samples yields a NaN row.
df_cm_train_norm = df_cm_train.div(df_cm_train.sum(axis=1), axis=0).round(2)

plt.figure(figsize = (13,12), dpi=600)
# seaborn is imported as `sns` in the import cell; `sn` is not defined in this notebook
ax = sns.heatmap(df_cm_train_norm, annot=True, cmap='PuBu')
ax.set_xlabel("CLASSES PREDITAS", fontsize=12)
ax.set_ylabel("CLASSES REAIS", fontsize=12)

# NOTE(review): presumably the workaround for the matplotlib release that clipped
# the first/last heatmap rows; on other releases it just adds half a cell of
# padding — confirm whether it is still needed.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
plt.show()

In [None]:
# Per-class metrics for the training split. `metrics` is not defined in this
# notebook (presumably provided by base_functions.ipynb — TODO confirm); the
# result is indexed below by 'Precision', 'Recall' and 'F-score(a=1)'.
train_metrics = metrics(y_train, out_train, model, df_cm_train, STATUS, multi_problem=True)

# Overall figures: column sum divided by the number of classes, as a percentage
train_class_count = len(STATUS)
train_precision_pct = train_metrics['Precision'].sum() / train_class_count * 100
train_recall_pct = train_metrics['Recall'].sum() / train_class_count * 100
train_f1_pct = train_metrics['F-score(a=1)'].sum() / train_class_count * 100
train_accuracy_pct = accuracy_score(y_train, out_train) * 100

print("\nOverall Precision: {:.2f}%".format(train_precision_pct))
print("Overall Recall:    {:.2f}%".format(train_recall_pct))
print("Overall F1-Score:  {:.2f}%".format(train_f1_pct))
print("Overall Accuracy:  {:.2f}%".format(train_accuracy_pct))

train_metrics

### VALIDAÇÃO

In [None]:
# Confusion matrix with heatmap - validation split

out_valid = model.predict(x_valid)

# labels=STATUS pins the row/column order to the axis labels used below and keeps
# the matrix square even if this split does not contain every class (the
# validation set is an unshuffled tail slice, so a class may be absent).
df_cm_valid = pd.DataFrame(confusion_matrix(y_valid, out_valid, labels=STATUS),
                           index=list(STATUS), columns=list(STATUS))

# Row-normalise: each row is divided by its number of true samples. Uses
# DataFrame.div because indexing a Series with [:, np.newaxis] was removed in
# pandas 2.0. A class with no true samples yields a NaN row.
df_cm_valid_norm = df_cm_valid.div(df_cm_valid.sum(axis=1), axis=0).round(2)

plt.figure(figsize = (13,12), dpi=600)
# seaborn is imported as `sns` in the import cell; `sn` is not defined in this notebook
ax = sns.heatmap(df_cm_valid_norm, annot=True, cmap='PuBu')
ax.set_xlabel("CLASSES PREDITAS", fontsize=12)
ax.set_ylabel("CLASSES REAIS", fontsize=12)

# NOTE(review): presumably the workaround for the matplotlib release that clipped
# the first/last heatmap rows; on other releases it just adds half a cell of
# padding — confirm whether it is still needed.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
plt.show()

In [None]:
# Per-class metrics for the validation split. `metrics` is not defined in this
# notebook (presumably provided by base_functions.ipynb — TODO confirm); the
# result is indexed below by 'Precision', 'Recall' and 'F-score(a=1)'.
valid_metrics = metrics(y_valid, out_valid, model, df_cm_valid, STATUS, multi_problem=True)

# Overall figures: column sum divided by the number of classes, as a percentage
valid_class_count = len(STATUS)
valid_precision_pct = valid_metrics['Precision'].sum() / valid_class_count * 100
valid_recall_pct = valid_metrics['Recall'].sum() / valid_class_count * 100
valid_f1_pct = valid_metrics['F-score(a=1)'].sum() / valid_class_count * 100
valid_accuracy_pct = accuracy_score(y_valid, out_valid) * 100

print("\nOverall Precision: {:.2f}%".format(valid_precision_pct))
print("Overall Recall:    {:.2f}%".format(valid_recall_pct))
print("Overall F1-Score:  {:.2f}%".format(valid_f1_pct))
print("Overall Accuracy:  {:.2f}%".format(valid_accuracy_pct))

valid_metrics

### TESTE

In [None]:
# Confusion matrix with heatmap - test split

out_test = model.predict(x_test)

# labels=STATUS pins the row/column order to the axis labels used below, also
# when the model predicts a label that never occurs in the test target.
df_cm_test = pd.DataFrame(confusion_matrix(y_test, out_test, labels=STATUS),
                          index=list(STATUS), columns=list(STATUS))

# Row-normalise: each row is divided by its number of true samples. Uses
# DataFrame.div because indexing a Series with [:, np.newaxis] was removed in
# pandas 2.0. A class with no true samples yields a NaN row.
df_cm_test_norm = df_cm_test.div(df_cm_test.sum(axis=1), axis=0).round(2)

plt.figure(figsize = (13,12), dpi=600)
# seaborn is imported as `sns` in the import cell; `sn` is not defined in this notebook
ax = sns.heatmap(df_cm_test_norm, annot=True, cmap='PuBu')
ax.set_xlabel("CLASSES PREDITAS", fontsize=12)
ax.set_ylabel("CLASSES REAIS", fontsize=12)

# NOTE(review): presumably the workaround for the matplotlib release that clipped
# the first/last heatmap rows; on other releases it just adds half a cell of
# padding — confirm whether it is still needed.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
plt.show()

In [None]:
# Other test metrics - precision, recall, F-score. `metrics` is not defined in
# this notebook (presumably provided by base_functions.ipynb — TODO confirm); the
# result is indexed below by 'Precision', 'Recall' and 'F-score(a=1)'.
test_metrics = metrics(y_test, out_test, model, df_cm_test, STATUS, multi_problem=True)

# Overall figures: column sum divided by the number of classes, as a percentage
test_class_count = len(STATUS)
test_precision_pct = test_metrics['Precision'].sum() / test_class_count * 100
test_recall_pct = test_metrics['Recall'].sum() / test_class_count * 100
test_f1_pct = test_metrics['F-score(a=1)'].sum() / test_class_count * 100
test_accuracy_pct = accuracy_score(y_test, out_test) * 100

print("\nOverall Precision: {:.2f}%".format(test_precision_pct))
print("Overall Recall:    {:.2f}%".format(test_recall_pct))
print("Overall F1-Score:  {:.2f}%".format(test_f1_pct))
print("Overall Accuracy:  {:.2f}%".format(test_accuracy_pct))

test_metrics

In [None]:
# ROC curve and AUC per class (one class against the rest) plus the micro-average;
# the figure at the end of this cell shows the curve at index 0 only.

# NOTE(review): the scores passed to roc_curve are hard 0/1 predictions, not
# probabilities, so every curve has a single interior operating point;
# model.predict_proba would give full curves — confirm which one is intended.

# One-hot encode both vectors against STATUS so that column i always means class
# STATUS[i], even when the model never predicts some class. The results go into
# new names: y_test itself stays a label vector, so this cell and the cells
# above it can be re-run in any order.
y_test_ohe = (np.asarray(y_test).reshape(-1, 1) == STATUS).astype(int)
y_pred_ohe = (np.asarray(out_test).reshape(-1, 1) == STATUS).astype(int)

n_classes = len(STATUS)

# ROC curve and AUC for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_ohe[:, i], y_pred_ohe[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Micro-average ROC curve and area: all class columns flattened into one problem
fpr["micro"], tpr["micro"], _ = roc_curve(y_test_ohe.ravel(), y_pred_ohe.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

plt.figure(figsize=(5,5), dpi=300)
plt.plot(fpr[0], tpr[0], color='darkorange', lw=2, label='Curva ROC (area = %0.3f)' % roc_auc[0])
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Taxa de falsos positivos')
plt.ylabel('Taxa de verdadeiros positivos')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Macro-average ROC curve, plus one figure with the micro, macro and per-class
# curves. Relies on fpr, tpr, roc_auc and n_classes from the previous cell.

roc_auc_scores = []

# First aggregate all false positive rates
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

# Then interpolate every ROC curve at those points. np.interp is the function
# that the deprecated scipy.interp alias pointed to.
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

# Finally average it and compute the AUC
mean_tpr /= n_classes

fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

# Plot all ROC curves
plt.figure(figsize=(12,12), dpi=300)
plt.plot(fpr["micro"], tpr["micro"], label='micro-average ROC curve (area = {0:0.3f})'.format(roc_auc["micro"]),
         color='deeppink', linestyle=':', linewidth=4)

plt.plot(fpr["macro"], tpr["macro"], label='macro-average ROC curve (area = {0:0.3f})'.format(roc_auc["macro"]),
         color='navy', linestyle=':', linewidth=4)

# The three colours repeat when there are more than three classes
colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=2, label='ROC curve of class {0} (area = {1:0.3f})'.format(i, roc_auc[i]))
    roc_auc_scores.append(roc_auc[i])

plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Taxa de falsos positivos')
plt.ylabel('Taxa de verdadeiros positivos')
plt.legend(loc="lower right")
plt.show()

## Salvando o modelo

In [None]:
# Persist the fitted forest; the file name is "random_forest-" followed by TEST_CODE
model_file_path = outputs_folder + "random_forest-" + TEST_CODE
joblib.dump(model, model_file_path)

## Carregamento do modelo

In [None]:
# Disabled example: reload the model saved by the joblib.dump cell (uncomment to use).
# joblib files are pickle-based, so only load files from a trusted source.
# loaded_rf = joblib.load(outputs_folder + "random_forest-" + TEST_CODE)