<h1>Log Parsing</h1>

In [None]:
import log_parsing
import data_preprocessing
import evaluation_metrics

# Input raw log and the structured CSV that the parsing step works with.
log_file_path = 'C:/Users/Alysson/Desktop/DissertacaoUNB/datasets/BGL/BGL_2k.log'
structured_file_path = 'C:/Users/Alysson/Desktop/DissertacaoUNB/datasets/BGL/BGL_2k.log_structured.csv'
parser = 'Drain'
log_format = '<Label> <Timestamp> <Date> <Node> <Time> <NodeRepeat> <Type> <Component> <Level> <Content>'

# Regular expressions handed to the parser (presumably used to mask variable
# tokens before template extraction -- confirm in log_parsing).
rex = [
    r'core\.\d+',  # 'core.' followed by one or more digits.
    r'blk_(|-)[0-9]+',  # 'blk_' optionally followed by '-' and then digits (block id).
    r'(/|)([0-9]+\.){3}[0-9]+(:[0-9]+|)(:|)',  # IPv4 address, optionally prefixed by '/' and followed by ':port'.
    r'([0-9a-f]+[:][0-9a-f]+)',  # Two runs of hex digits joined by a colon.
    r'fpr[0-9]+[=]0x[0-9a-f]+ [0-9a-f]+ [0-9a-f]+ [0-9a-f]+',  # 'fprN=0xX X X X'.
    r'r[0-9]+[=]0x[0-9a-f]+',  # 'rN=0xX'.
    # The original pattern used a character class, '[l|c|xe|ct]', which matches a
    # single character out of {l, |, c, x, e, t}; 'xer=' and 'ctr=' were therefore
    # only matched from their second/third letter. A non-capturing alternation
    # matches the four intended prefixes as whole tokens.
    r'(?:l|c|xe|ct)r=0x[0-9a-f]+',  # 'lr=0xX', 'cr=0xX', 'xer=0xX' or 'ctr=0xX'.
    r'0x[0-9a-f]+',  # Hexadecimal literal '0xX'.
    r'(?<=[^A-Za-z0-9])(\-?\+?\d+)(?=[^A-Za-z0-9])|[0-9]+$',  # Integers delimited by non-alphanumerics, or digits at end of line.
]

outdir = 'datasets/BGL/'

# Run the log parsing step.
# NOTE(review): `config_file_path` is not defined anywhere in the visible part of
# this notebook; on a fresh kernel this call raises NameError unless it is
# defined elsewhere. Confirm the intended value before running.
structured_log_df = log_parsing.process_log_files(structured_file_path, log_file_path, parser, log_format, outdir, rex, config_file_path)

structured_log_df.head()


<h1>Cria log sequences</h1>

In [None]:
import sys
import os

# Add the project folder to sys.path so the local modules imported below
# (log_parsing, data_preprocessing) can be resolved.
module_path = r'C:\Users\Alysson\Desktop\DissertacaoUNB'
if module_path not in sys.path:
    sys.path.append(module_path)
    
from log_parsing import load_BGL_data
from data_preprocessing import sample_data


def create_sequences(nome_arquivo, Label=True):
    """Build time-window log sequences for one structured log file.

    The structured CSV is loaded with ``load_BGL_data`` and windowed with
    ``sample_data`` (``window_type='time'``). The sequence CSV path passed to
    ``sample_data`` is ``<dir>/logsequences/<base>.csv``, where ``<dir>`` is the
    folder of ``nome_arquivo`` and ``<base>`` is its file name without the
    ``.log_structured.csv`` suffix.

    Args:
        nome_arquivo: Path to a ``*.log_structured.csv`` file.
        Label: Forwarded unchanged to ``load_BGL_data`` and ``sample_data``.

    Returns:
        Whatever ``sample_data`` returns (previously the result was discarded).
    """
    diretorio_do_arquivo = os.path.dirname(nome_arquivo)

    # 'X.log_structured.csv' -> 'X.log_structured' -> 'X'
    nome_base = os.path.splitext(os.path.basename(nome_arquivo))[0]
    nome_base = nome_base.replace(".log_structured", "")

    # exist_ok=True replaces the separate exists() check and cannot fail if the
    # folder appears between the check and the creation.
    logsequences_dir = os.path.join(diretorio_do_arquivo, "logsequences")
    os.makedirs(logsequences_dir, exist_ok=True)

    config = {
        # Window and step are equal, i.e. consecutive windows do not overlap.
        # Presumably expressed in hours (0.01667 h is about 1 minute) -- confirm in sample_data.
        "window_size": 0.01667,
        "step_size": 0.01667,
        "structured_file_path": os.path.join(diretorio_do_arquivo, f"{nome_base}.log_structured.csv"),
        "sequence_path": os.path.join(logsequences_dir, f"{nome_base}.csv")
    }

    BGL_structured_data = load_BGL_data(config["structured_file_path"], create_seconds_since=True, Label=Label)
    sequenceVector = sample_data(BGL_structured_data, config["window_size"], config["step_size"], config["sequence_path"], Label=Label, window_type='time')

    print(f"Executando create_sequences para {nome_arquivo}")
    return sequenceVector
    
def processar_arquivos_csv_em_diretorio(diretorio, Label=False):
    """Run ``create_sequences`` on every ``*.log_structured.csv`` file in a folder.

    Args:
        diretorio: Folder to scan (not recursive).
        Label: Forwarded unchanged to ``create_sequences``.

    Prints a message and does nothing else when ``diretorio`` is not an
    existing directory.
    """
    # isdir() is already False for a path that does not exist.
    if not os.path.isdir(diretorio):
        print(f"O diretório '{diretorio}' não existe.")
        return

    for nome in os.listdir(diretorio):
        if not nome.endswith(".log_structured.csv"):
            continue
        create_sequences(os.path.join(diretorio, nome), Label=Label)

# Directory 1: build labelled log sequences for every structured BGL file in this folder.
diretorio1 = r'C:\Users\Alysson\Desktop\DissertacaoUNB\datasets\BGL'
processar_arquivos_csv_em_diretorio(diretorio1, Label=True)

In [None]:
import pandas as pd 

# Make the project's local modules importable.
# NOTE(review): `sys` is not imported in this cell; it comes from the import in the previous cell.
module_path = r'C:\Users\Alysson\Desktop\DissertacaoUNB'
if module_path not in sys.path:
    sys.path.append(module_path)
    
from create_train_test_dataset import create_train_test_dataset
from semantic_vectorization import generate_word_vectors_dict, process_line, map_log_sequence_to_word_vec, create_tfidf_matrix_log_sequence, create_weight_log_sequence, create_semantic_log_seq, create_semantic_vector_sequence

########## BGL LOG SEQUENCE PROCESSING ##########
# NOTE(review): the statements from here down are repeated verbatim at the start of the "Experimento 0" cell.
templatesTrain = pd.read_csv(r'C:\Users\Alysson\Desktop\DissertacaoUNB\datasets\BGL\BGL_2k.log_templates.csv', sep=',')
SequenceVecTrain = pd.read_csv(r'C:\Users\Alysson\Desktop\DissertacaoUNB\datasets\BGL\logsequences\BGL_2k.csv', sep=',')
#SequenceVecTest = pd.read_csv(r'C:\Users\Alysson\Desktop\DissertacaoUNB\datasets\BGL\XXXXXXX.csv', sep=',') # Optional
#templatesTest = pd.read_csv(r'C:\Users\Alysson\Desktop\DissertacaoUNB\datasets\BGL\XXXXXXX.csv', sep=',') # Optional
preProcess = {"stemming": False, "lemmatization": True}

# No separate test files are passed (SequenceVectorTest/templatesTest are None) and splitData=True;
# the third returned value is discarded.
train_data, test_data, _ = create_train_test_dataset(templatesTrain, SequenceVecTrain, onlyNormalDataTrain=True, shuffleTrain=True, removeEmptySeq=True, 
                                                  splitData=True, SequenceVectorTest=None, templatesTest=None, preProcessParam=preProcess)

########## BUILD THE SEMANTIC VECTOR SEQUENCES - SINGLE RUN ##########

# Apply process_line to every row of the 'ReplacedSequence' column.
train_data['splittedReplacedSequence'] = train_data['ReplacedSequence'].apply(process_line)
test_data['splittedReplacedSequence'] = test_data['ReplacedSequence'].apply(process_line) 

# The word-vector dictionary is built from the training frame only and reused for the test frame below.
word_vector_dict_train, dimension_train = generate_word_vectors_dict(train_data, model_type='glove50d') 
#word_vector_dict_test, dimension_test = generate_word_vectors_dict(test_data, model_type='glove50d')

train_data = create_semantic_vector_sequence(train_data, word_vector_dict_train, dimension_train)
test_data = create_semantic_vector_sequence(test_data, word_vector_dict_train, dimension_train) 

print('Total train sequences: ' ,len(train_data))
print('Total test sequences: ' ,len(test_data))

<h1>Experimento 0</h1>

In [None]:
import pandas as pd 
import numpy as np

# Add the project folder to sys.path so the local modules below can be imported.
# NOTE(review): `sys` is not imported in this cell; it comes from an earlier cell.
module_path = r'C:\Users\Alysson\Desktop\DissertacaoUNB'
if module_path not in sys.path:
    sys.path.append(module_path)
    
from create_train_test_dataset import create_train_test_dataset
# NOTE(review): the star imports below are where the process_line / generate_word_vectors_dict /
# detect_*_anomalies names used in this notebook presumably come from -- confirm and import explicitly.
from semantic_vectorization import *
from anomaly_detection import *

########## BGL LOG PROCESSING ##########
# NOTE(review): this setup repeats the previous cell's preprocessing verbatim.
templatesTrain = pd.read_csv(r'C:\Users\Alysson\Desktop\DissertacaoUNB\datasets\BGL\BGL_2k.log_templates.csv', sep=',')
SequenceVecTrain = pd.read_csv(r'C:\Users\Alysson\Desktop\DissertacaoUNB\datasets\BGL\logsequences\BGL_2k.csv', sep=',')
#SequenceVecTest = pd.read_csv(r'C:\Users\Alysson\Desktop\MinhaImplementacaoBGL\datasets\BGL\XXXXXXX.csv', sep=',') # Optional
#templatesTest = pd.read_csv(r'C:\Users\Alysson\Desktop\MinhaImplementacaoBGL\datasets\BGL\XXXXXXX.csv', sep=',') # Optional
preProcess = {"stemming": False, "lemmatization": True}

# No separate test files are passed (SequenceVectorTest/templatesTest are None) and splitData=True;
# the third returned value is discarded.
train_data, test_data, _ = create_train_test_dataset(templatesTrain, SequenceVecTrain, onlyNormalDataTrain=True, shuffleTrain=True, removeEmptySeq=True, 
                                                  splitData=True, SequenceVectorTest=None, templatesTest=None, preProcessParam=preProcess)

########## BUILD THE SEMANTIC VECTOR SEQUENCES - SINGLE RUN ##########

# Apply process_line to every row of the 'ReplacedSequence' column.
train_data['splittedReplacedSequence'] = train_data['ReplacedSequence'].apply(process_line)
test_data['splittedReplacedSequence'] = test_data['ReplacedSequence'].apply(process_line) 

# The word-vector dictionary is built from the training frame only and reused for the test frame below.
word_vector_dict_train, dimension_train = generate_word_vectors_dict(train_data, model_type='glove50d') 
#word_vector_dict_test, dimension_test = generate_word_vectors_dict(test_data, model_type='glove50d')

train_data = create_semantic_vector_sequence(train_data, word_vector_dict_train, dimension_train)
test_data = create_semantic_vector_sequence(test_data, word_vector_dict_train, dimension_train) 

print('Total train sequences: ' ,len(train_data))
print('Total test sequences: ' ,len(test_data))

########## MODEL TRAINING AND TESTING ##########
# Each detector below is fitted on train_data and evaluated on test_data with the
# same contamination value. Every detect_* helper returns a 5-tuple; the metrics
# element unpacks to (precision, recall, f1, fpr, tpr, roc_auc) and the timing
# element to (train_time, prediction_time).
#
# Only the LAST expression of a notebook cell is rendered automatically, so the
# intermediate "top anomalies" tables are shown with display(). display() is
# provided by the IPython/Jupyter kernel; outside a notebook it must be imported
# from IPython.display.
pd.set_option('display.max_colwidth', 150)

contamination = 0.1

# LOF
print('*'*50, ' LOF ', '*'*50)
params = {'algorithm': 'auto', 'contamination': contamination, 'leaf_size': 30, 'metric': 'minkowski', 'n_jobs': None, 'n_neighbors': 20, 'novelty': True} # Default
_, lof_df_anomalies, lof_df_prediction, lof_metricas, lof_total_time = detect_lof_anomalies(train_data, test_data, metric_params=params, label=True,
                                                                                            plot_tsne=True, gridSearch=False)
precision_lof, recall_lof, f1_lof, fpr_lof, tpr_lof, roc_auc_lof = lof_metricas
train_time_lof, prediction_time_lof = lof_total_time

# Show the first rows flagged as anomalous (class -1).
display(lof_df_anomalies[lof_df_anomalies['Anomaly_Prediction'] == -1].head(5))

# Autoencoder
print('*'*50, ' Autoencoder ', '*'*50)
params = {'contamination':contamination}
_, autoencoder_df_anomalies, _, autoencoder_metricas, autoencoder_total_time  = detect_autoencoder_anomalies(train_data, test_data, metric_params=params, label=True, plot_tsne=True)

precision_autoencoder, recall_autoencoder, f1_autoencoder, fpr_autoencoder, tpr_autoencoder, roc_auc_autoencoder = autoencoder_metricas
train_time_autoencoder, prediction_time_autoencoder = autoencoder_total_time

# Show the first rows flagged as anomalous (class -1).
display(autoencoder_df_anomalies[autoencoder_df_anomalies['Anomaly_Prediction'] == -1].head(5))

# SOM
print('*'*50, ' SOM ', '*'*50)
params = {'contamination':contamination, 'grid_size':(10, 10), 'sigma':1.0, 'learning_rate':0.5}
_, som_df_anomalies, _, som_metricas, som_total_time  = detect_som_anomalies(train_data, test_data, metric_params=params, label=True, plot_tsne=True)

precision_som, recall_som, f1_som, fpr_som, tpr_som, roc_auc_som = som_metricas
train_time_som, prediction_time_som = som_total_time

# Show the first rows flagged as anomalous (class -1).
# NOTE(review): SOM filters on 'Anomaly_Flag' while every other model filters on
# 'Anomaly_Prediction' -- confirm this matches the frame returned by detect_som_anomalies.
display(som_df_anomalies[som_df_anomalies['Anomaly_Flag'] == -1].head(5))
#export_anomalies_to_csv(som_df_anomalies, 'som_df_anomalies', r'C:\Users\Alysson\Desktop\MinhaImplementacaoBGL\datasets\Ccmeval\anomaliasEncontradas')

# Isolation Forest
print('*'*50, ' IF ', '*'*50)
params = {'contamination': contamination, 'max_features': 1.0, 'max_samples': 'auto', 'n_estimators': 100, 'n_jobs': None, 'random_state': 123} # Default
_, isolation_forest_df_anomalies, isolation_forest_prediction, isolation_forest_metricas, isolation_forest_total_time = detect_isolation_forest_anomalies(train_data, test_data,
                                                                                          metric_params=params, label=True, plot_tsne=True, gridSearch=False)

precision_isolation_forest, recall_isolation_forest, f1_isolation_forest, fpr_isolation_forest, tpr_isolation_forest, roc_auc_isolation_forest = isolation_forest_metricas
train_time_isolation_forest, prediction_time_isolation_forest = isolation_forest_total_time

# Show the first rows flagged as anomalous (class -1).
display(isolation_forest_df_anomalies[isolation_forest_df_anomalies['Anomaly_Prediction'] == -1].head(5))
#export_anomalies_to_csv(isolation_forest_df_anomalies, 'isolation_forest_df_anomalies', r'C:\Users\Alysson\Desktop\MinhaImplementacaoBGL\datasets\Ccmeval\anomaliasEncontradas')

# MCD
print('*'*50, ' MCD ', '*'*50)
params = {'contamination':contamination, 'support_fraction': 1} # Default, except support_fraction set to 1 to avoid warnings
_, mcd_df_anomalies, mcd_prediction, mcd_metricas, mcd_total_time = detect_mcd_anomalies(train_data, test_data, metric_params=params, label=True, plot_tsne=True)

precision_mcd, recall_mcd, f1_mcd, fpr_mcd, tpr_mcd, roc_auc_mcd = mcd_metricas
train_time_mcd, prediction_time_mcd = mcd_total_time

# Show the first rows flagged as anomalous (class -1).
display(mcd_df_anomalies[mcd_df_anomalies['Anomaly_Prediction'] == -1].head(5))

# HBOS
print('*'*50, ' HBOS ', '*'*50)
params = {'n_bins':10, 'alpha':0.1, 'tol':0.5, 'contamination':contamination} # Default
_, hbos_df_anomalies, hbos_prediction, hbos_metricas, hbos_total_time = detect_hbos_anomalies(train_data, test_data,
                                                                                             metric_params=params, label=True, plot_tsne=True)

precision_hbos, recall_hbos, f1_hbos, fpr_hbos, tpr_hbos, roc_auc_hbos = hbos_metricas
train_time_hbos, prediction_time_hbos = hbos_total_time

# Show the first rows flagged as anomalous (class -1).
display(hbos_df_anomalies[hbos_df_anomalies['Anomaly_Prediction'] == -1].head(5))

# CBLOF
print('*'*50, ' CBLOF ', '*'*50)
params= {'contamination':contamination } # Default
_, cblof_df_anomalies, cblof_prediction, cblof_metricas, cblof_total_time = detect_cblof_anomalies(train_data, test_data,
                                                                                             metric_params=params, label=True, plot_tsne=True)

precision_cblof, recall_cblof, f1_cblof, fpr_cblof, tpr_cblof, roc_auc_cblof = cblof_metricas
train_time_cblof, prediction_time_cblof = cblof_total_time

# Show the first rows flagged as anomalous (class -1).
display(cblof_df_anomalies[cblof_df_anomalies['Anomaly_Prediction'] == -1].head(5))

# GMM
print('*'*50, ' GMM ', '*'*50)
params = {'contamination': contamination, 'n_components': 1, 'covariance_type': 'full', 'max_iter': 100, 'init_params': 'kmeans', 'random_state': 123} # Default
_, gmm_df_anomalies, gmm_prediction, gmm_metricas, gmm_total_time = detect_gmm_anomalies(train_data, test_data,
                                                                                         metric_params=params, label=True, plot_tsne=True)

precision_gmm, recall_gmm, f1_gmm, fpr_gmm, tpr_gmm, roc_auc_gmm = gmm_metricas
train_time_gmm, prediction_time_gmm = gmm_total_time

# Show the first rows flagged as anomalous (class -1); as the cell's last
# expression this table is rendered without an explicit display() call.
gmm_df_anomalies[gmm_df_anomalies['Anomaly_Prediction'] == -1].head(5)

In [None]:
################## Experimento 0 - Resultados ##################
import matplotlib.pyplot as plt
import numpy as np

# Collect the per-model results of Experiment 0, always in the order given by `models`.
models = ['LOF', 'AE', 'SOM', 'IF', 'MCD', 'HBOS', 'CBLOF', 'GMM']
f1_scores = [f1_lof, f1_autoencoder, f1_som, f1_isolation_forest, f1_mcd, f1_hbos, f1_cblof, f1_gmm]
recall_scores = [recall_lof, recall_autoencoder, recall_som, recall_isolation_forest, recall_mcd, recall_hbos, recall_cblof, recall_gmm]
precision_scores = [precision_lof, precision_autoencoder, precision_som, precision_isolation_forest, precision_mcd, precision_hbos, precision_cblof, precision_gmm]
roc_auc_scores = [roc_auc_lof, roc_auc_autoencoder, roc_auc_som, roc_auc_isolation_forest, roc_auc_mcd, roc_auc_hbos, roc_auc_cblof, roc_auc_gmm]

# FPR, TPR and ROC AUC values for each model (inputs of the ROC panel).
fpr_models = [fpr_lof, fpr_autoencoder, fpr_som, fpr_isolation_forest, fpr_mcd, fpr_hbos, fpr_cblof, fpr_gmm]
tpr_models = [tpr_lof, tpr_autoencoder, tpr_som, tpr_isolation_forest, tpr_mcd, tpr_hbos, tpr_cblof, tpr_gmm]
roc_auc_models = [roc_auc_lof, roc_auc_autoencoder, roc_auc_som, roc_auc_isolation_forest, roc_auc_mcd, roc_auc_hbos, roc_auc_cblof, roc_auc_gmm]

# Training and prediction time for each model.
train_times = [train_time_lof, train_time_autoencoder, train_time_som, train_time_isolation_forest, train_time_mcd, train_time_hbos, train_time_cblof, train_time_gmm]
prediction_times = [prediction_time_lof, prediction_time_autoencoder, prediction_time_som, prediction_time_isolation_forest, prediction_time_mcd, prediction_time_hbos, prediction_time_cblof, prediction_time_gmm]

# Total time = training + prediction.
total_times = [train_time + prediction_time for train_time, prediction_time in zip(train_times, prediction_times)]

# Bar geometry. Axes.bar centres each bar on its x value (align='center'), so a
# tick belongs at the centre of the bar, or at the centre of the bar group.
bar_width = 0.15
index = np.arange(len(models))

# 2 x 3 grid: metrics | all timings | ROC  /  prediction | training | total time.
fig, axes = plt.subplots(2, 3, figsize=(18, 8))

# Extra spacing between subplots.
plt.subplots_adjust(wspace=0.5, hspace=0.6)

# --- Metrics: three bars per model at index, index + w and index + 2w ---
axes[0, 0].bar(index, f1_scores, bar_width, label='F1-score', alpha=0.7)
axes[0, 0].bar(index + bar_width, recall_scores, bar_width, label='Recall', alpha=0.7)
axes[0, 0].bar(index + 2 * bar_width, precision_scores, bar_width, label='Precision', alpha=0.7)

axes[0, 0].set_title('Comparação de Métricas de Desempenho', fontsize=12)
# Group centre is the middle bar (index + w); the former 1.5 * w offset put the
# tick on the edge between the second and third bars.
axes[0, 0].set_xticks(index + bar_width)
axes[0, 0].set_xticklabels(models, rotation=45)
axes[0, 0].legend(loc='upper right', fontsize='small')
axes[0, 0].grid(axis='y', linestyle='--', alpha=0.6)
axes[0, 0].set_ylim(0.0, 1.1)  # Fixed y range so the panels are comparable.

# --- Prediction time: one bar per model, centred on index ---
axes[1, 0].set_title('Tempo de Predição', fontsize=12)
axes[1, 0].set_xticks(index)
axes[1, 0].set_xticklabels(models, rotation=45)
axes[1, 0].set_ylabel('Tempo (s)')

axes[1, 0].bar(index, prediction_times, 3 * bar_width, label='Predição', alpha=0.7, color='lightcoral')

# --- Training time: one bar per model, centred on index ---
axes[1, 1].set_title('Tempo de Treinamento', fontsize=12)
axes[1, 1].set_xticks(index)
axes[1, 1].set_xticklabels(models, rotation=45)
axes[1, 1].set_ylabel('Tempo (s)')

axes[1, 1].bar(index, train_times, 3 * bar_width, label='Treinamento', alpha=0.7, color='skyblue')

# --- Total time (training + prediction): one bar per model, centred on index ---
axes[1, 2].set_title('Tempo Total', fontsize=12)
axes[1, 2].set_xticks(index)
axes[1, 2].set_xticklabels(models, rotation=45)
axes[1, 2].set_ylabel('Tempo (s)')

axes[1, 2].bar(index, total_times, 3 * bar_width, label='Total', alpha=0.7, color='lightgreen')

# --- Comparative ROC curves ---
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'purple', 'orange']  # One colour per model (one spare entry).
linestyles = ['-', '--', '-.', ':', '-', '--', '-.', ':', '-']
line_width = 2.0

for i in range(len(models)):
    axes[0, 2].plot(fpr_models[i], tpr_models[i], color=colors[i], linestyle=linestyles[i], lw=line_width,
                 label=f'{models[i]} (AUC = {roc_auc_models[i]:.2f})')

axes[0, 2].set_xlim([-0.03, 1.03])
axes[0, 2].set_ylim([-0.05, 1.05])
axes[0, 2].set_xlabel('Taxa de Falsos Positivos (FPR)')
axes[0, 2].set_ylabel('Taxa de Verdadeiros Positivos (TPR)')
axes[0, 2].set_title('Curva ROC Comparativa', fontsize=12)
# Diagonal reference line labelled as the random classifier.
axes[0, 2].plot([0, 1], [0, 1], linestyle='-', color='red', linewidth=0.5, label='Classificador Aleatório')
axes[0, 2].legend(loc="lower right", fontsize='x-small')
axes[0, 2].grid(linestyle='--', alpha=0.6)


# --- All timings side by side: bars at index - w, index and index + w ---
axes[0, 1].set_title('Tempos', fontsize=12)
axes[0, 1].set_xticks(index)  # Group centre is the middle (training) bar.
axes[0, 1].set_xticklabels(models, rotation=45)
axes[0, 1].set_ylabel('Tempo (s)')

axes[0, 1].bar(index - bar_width, prediction_times, bar_width, label='Predição', alpha=0.7, color='lightcoral')
axes[0, 1].bar(index, train_times, bar_width, label='Treinamento', alpha=0.7, color='skyblue')
axes[0, 1].bar(index + bar_width, total_times, bar_width, label='Total', alpha=0.7, color='lightgreen')

axes[0, 1].legend(loc='upper right', fontsize='small')


plt.tight_layout()

# plt.savefig('metricas_e_roc.png', dpi=300, bbox_inches='tight')  # Save the figure in high resolution
plt.show()

import matplotlib.pyplot as plt
import numpy as np

# Per-model scores of Experiment 0, in the order given by `models`.
models = ['LOF', 'AE', 'SOM', 'IF', 'MCD', 'HBOS', 'CBLOF', 'GMM']
f1_scores = [f1_lof, f1_autoencoder, f1_som, f1_isolation_forest, f1_mcd, f1_hbos, f1_cblof, f1_gmm]
recall_scores = [recall_lof, recall_autoencoder, recall_som, recall_isolation_forest, recall_mcd, recall_hbos, recall_cblof, recall_gmm]
precision_scores = [precision_lof, precision_autoencoder, precision_som, precision_isolation_forest, precision_mcd, precision_hbos, precision_cblof, precision_gmm]

# Bar positions.
bar_width = 0.5
index = np.arange(len(models))

# One row, three panels: F1-score, recall, precision.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# (panel title, legend label, values, extra bar() keyword arguments).
# The first panel passes no colour, so it keeps matplotlib's default.
panel_specs = [
    ('F1-Score', 'F1-score', f1_scores, {}),
    ('Recall', 'Recall', recall_scores, {'color': 'g'}),
    ('Precision', 'Precision', precision_scores, {'color': 'r'}),
]

bar_containers = []
for ax, (panel_title, legend_label, scores, bar_kwargs) in zip(axes, panel_specs):
    bars = ax.bar(index, scores, bar_width, label=legend_label, alpha=0.7, **bar_kwargs)
    ax.set_title(panel_title, fontsize=12)
    ax.set_xticks(index)
    ax.set_xticklabels(models, rotation=45)
    ax.yaxis.grid(True, linestyle='--', alpha=0.6)
    ax.xaxis.grid(False)  # No vertical grid lines.
    ax.set_ylim(0.0, 1.1)

    # Write each value just above its bar.
    for bar, score in zip(bars, scores):
        ax.annotate(f'{score:.2f}', xy=(bar.get_x() + bar.get_width() / 2, bar.get_height()), xytext=(0, 3),
                    textcoords="offset points", ha='center', va='bottom')

    bar_containers.append(bars)

# Keep the per-panel bar containers under their original names.
bar1, bar2, bar3 = bar_containers

plt.tight_layout()

plt.show()


import pandas as pd

# Summary table of Experiment 0: one row per model.
data = {
    'Model': models,
    'F1 Score': f1_scores,
    'Recall': recall_scores,
    'Precision': precision_scores,
    'ROC AUC': roc_auc_scores,
    'Training Time (s)': train_times,
    'Prediction Time (s)': prediction_times,
    'Total Time (s)': total_times
}

df = pd.DataFrame(data)

# Total time column, derived from the two timing columns of the frame.
training_col = df['Training Time (s)']
prediction_col = df['Prediction Time (s)']
df['Total Time (s)'] = training_col + prediction_col

# Show the frame without wrapping or truncating columns.
for option_name, option_value in (
    ('display.width', None),
    ('display.max_columns', None),
    ('display.expand_frame_repr', False),
):
    pd.set_option(option_name, option_value)
df



<h1>Experimento 1</h1>

In [None]:
################## Experimento 1 ##################

import log_parsing
import evaluation_metrics
from log_parsing import load_BGL_data
from data_preprocessing import sample_data

##### BGL log - TIME WINDOW #####
def generate_sequence(window_size, step_size, output_path,
                      structured_file_path="C:/Users/Alysson/Desktop/DissertacaoUNB/datasets/BGL/BGL_2k.log_structured.csv"):
    """Generate time-window log sequences from a structured BGL log.

    Args:
        window_size: Sequence length, as a fraction of time or a number of events.
        step_size: Step between consecutive sequences, in the same unit.
        output_path: Sequence CSV path handed to ``sample_data``.
        structured_file_path: Structured log CSV to read. Defaults to the
            BGL_2k file this notebook has always used.

    Returns:
        Whatever ``sample_data`` returns (previously the result was discarded).
    """
    bgl_structured_data = load_BGL_data(structured_file_path, create_seconds_since=True, Label=True)
    return sample_data(bgl_structured_data, window_size, step_size, output_path, Label=True, window_type='time')


# A single guard is enough; the original nested a second, identical check
# inside the first one and hid the function definition behind it.
if __name__ == "__main__":
    # Window sizes in hours: 10 s, 30 s, 1 min, 2 min, 3 min (step equals window).
    for window_size, step_size in [(0.002778, 0.002778), (0.008333, 0.008333), (0.01667, 0.01667), (0.03333, 0.03333), (0.05, 0.05)]:
        output_path = f'C:/Users/Alysson/Desktop/DissertacaoUNB/datasets/BGL/BGL_2k-{window_size}horas.csv'
        generate_sequence(window_size, step_size, output_path)
        print(50*'-')
            
########## BGL LOG PROCESSING - BATCH ########## 

# Window sizes to evaluate; the same values are used to build the sequence file names read below.
window_size_list = [0.002778, 0.008333, 0.01667, 0.03333, 0.05] # 10 s, 30 s, 1 min, 2 min, 3 min

def create_train_test_data(hours, onlyNormalDataTrain=True, shuffleTrain=True, removeEmptySeq=True):
    """Load the templates and the sequence file of one window size and split them.

    Args:
        hours: Window size; substituted into the ``BGL_2k-{hours}horas.csv`` file name.
        onlyNormalDataTrain, shuffleTrain, removeEmptySeq: Forwarded unchanged
            to ``create_train_test_dataset``.

    Returns:
        ``(train_data, test_data)``: the first two values returned by
        ``create_train_test_dataset``; the third is discarded.
    """
    templates_path = r'C:\Users\Alysson\Desktop\DissertacaoUNB\datasets\BGL\BGL_2k.log_templates.csv'
    sequences_path = r'C:\Users\Alysson\Desktop\DissertacaoUNB\datasets\BGL\BGL_2k-{}horas.csv'.format(hours)  # time window

    templatesTrain = pd.read_csv(templates_path, sep=',')
    SequenceVecTrain = pd.read_csv(sequences_path, sep=',')

    split_result = create_train_test_dataset(
        templatesTrain,
        SequenceVecTrain,
        onlyNormalDataTrain=onlyNormalDataTrain,
        shuffleTrain=shuffleTrain,
        removeEmptySeq=removeEmptySeq,
        splitData=True,
        SequenceVectorTest=None,
        templatesTest=None,
        preProcessParam={"stemming": False, "lemmatization": True},
    )
    train_data, test_data, _ = split_result
    return train_data, test_data

# Build one train/test pair per window size, in the order of window_size_list.
train_data_list, test_data_list = [], []

# NOTE(review): after this loop `train_data` / `test_data` still point at the frames of the
# LAST window size; the next cell reads `train_data` when it builds the word-vector dictionary.
for window_size in window_size_list:
    print('Time - Hours: ', window_size)
    train_data, test_data = create_train_test_data(window_size)
    train_data_list.append(train_data)
    test_data_list.append(test_data)
    print(50*'-')
     

In [None]:
################## Experimento 1 ##################

########## BUILD THE SEMANTIC VECTOR SEQUENCES - BATCH ##########

# 1) Tokenise every frame first. The original cell built the word-vector
#    dictionary BEFORE adding 'splittedReplacedSequence', whereas the
#    single-run cells of this notebook always add that column first.
for train_data in train_data_list:
    train_data['splittedReplacedSequence'] = train_data['ReplacedSequence'].apply(process_line)

for test_data in test_data_list:
    test_data['splittedReplacedSequence'] = test_data['ReplacedSequence'].apply(process_line)

# 2) Build the word-vector dictionary from an explicitly chosen frame. The
#    original read the leaked loop variable `train_data`, which after the
#    previous cell is the train frame of the last window size; the same frame
#    is used here, but named explicitly.
# NOTE(review): the dictionary only covers this one frame -- confirm that is
# sufficient for the other window sizes, or build it from all train frames.
word_vector_dict_train, dimension_train = generate_word_vectors_dict(train_data_list[-1], model_type='glove50d')

# 3) Vectorise every train and test frame with the shared training dictionary.
train_data_list = [
    create_semantic_vector_sequence(frame, word_vector_dict_train, dimension_train)
    for frame in train_data_list
]
test_data_list = [
    create_semantic_vector_sequence(frame, word_vector_dict_train, dimension_train)
    for frame in test_data_list
]

In [None]:
################## Experimento 1 - Treinamento e teste ##################

# Labels of the iterations; the run loop below executes len(iteration_labels) iterations.
iteration_labels = ['10 seg', '30 seg', '1 min', '2 min', '3 min']

import pandas as pd  # Certifique-se de que você importou a biblioteca pandas

# Detecção em lote
def detect_anomalies(index, train_data, test_data, model='lof', label=True, plot_tsne=False, plot_metrics=False,
                     contamination=0.15):
    """Run one anomaly detector on a train/test pair and flatten its results.

    Args:
        index: Iteration number; accepted for interface compatibility, unused here.
        train_data: Frame handed to the detector as training data.
        test_data: Frame handed to the detector as test data.
        model: One of 'lof', 'autoencoder', 'som', 'isolation_forest', 'mcd',
            'hbos', 'cblof', 'gmm'.
        label, plot_tsne, plot_metrics: Forwarded unchanged to the detector.
        contamination: Contamination value shared by ALL detectors. The default
            (0.15) is the value this function previously hard-coded.

    Returns:
        Tuple ``(df_anomalies, df_prediction, precision, recall, f1, fpr, tpr,
        roc_auc, train_time, prediction_time)``.

    Raises:
        ValueError: If ``model`` is not one of the supported names.
    """
    # Pick the detector and its hyper-parameters. The detector names are only
    # looked up for the selected model.
    if model == 'lof':
        detector = detect_lof_anomalies
        params = {'algorithm': 'auto', 'contamination': contamination, 'leaf_size': 30, 'metric': 'minkowski',
                  'n_jobs': None, 'n_neighbors': 20, 'novelty': True} # Default
    elif model == 'autoencoder':
        detector = detect_autoencoder_anomalies
        params = {'contamination': contamination}
    elif model == 'som':
        detector = detect_som_anomalies
        params = {'contamination': contamination, 'grid_size': (10, 10), 'sigma': 1.0, 'learning_rate': 0.5}
    elif model == 'isolation_forest':
        detector = detect_isolation_forest_anomalies
        params = {'contamination': contamination, 'n_jobs': -1, 'random_state': 123} # Default
    elif model == 'mcd':
        detector = detect_mcd_anomalies
        # Default, except support_fraction set to 1 to avoid warnings.
        params = {'contamination': contamination, 'support_fraction': 1}
    elif model == 'hbos':
        detector = detect_hbos_anomalies
        # NOTE(review): alpha is 0.15 here but 0.1 in Experiment 0, and both lines
        # are commented "Default" -- confirm which value is intended.
        params = {'n_bins': 10, 'alpha': 0.15, 'tol': 0.5, 'contamination': contamination} # Default
    elif model == 'cblof':
        detector = detect_cblof_anomalies
        params = {'contamination': contamination} # Default
    elif model == 'gmm':
        detector = detect_gmm_anomalies
        # Previously hard-coded to 0.1 while every other model used the shared
        # value; now consistent with the rest.
        params = {'contamination': contamination, 'n_components': 1, 'covariance_type': 'full', 'max_iter': 100,
                  'init_params': 'kmeans', 'random_state': 123} # Default
    else:
        raise ValueError("Invalid model specified.")

    # All detectors are called the same way and return the same 5-tuple layout.
    _, df_anomalies, df_prediction, metricas, total_time = detector(
        train_data, test_data, metric_params=params,
        label=label, plot_tsne=plot_tsne, plot_metrics=plot_metrics)

    precision, recall, f1, fpr, tpr, roc_auc = metricas
    train_time, prediction_time = total_time
    return (df_anomalies, df_prediction, precision, recall, f1, fpr, tpr, roc_auc, train_time, prediction_time)

# One flat accumulator per output column; each iteration appends 8 entries (one per model).
models = []
precision_list, recall_list, f1_list = [], [], []
fpr_list, tpr_list, roc_auc_list = [], [], []
train_time_list, prediction_time_list = [], []

# Iteration number of every appended entry.
iteration_list = []

# Semantic vector lists (aliases of the batch lists built earlier).
train_data_List = train_data_list
test_data_List = test_data_list

# Accumulators in the order of positions 2..9 of the tuple returned by detect_anomalies.
metric_accumulators = (precision_list, recall_list, f1_list, fpr_list, tpr_list,
                       roc_auc_list, train_time_list, prediction_time_list)

# Run every model on every window size and collect the results.
for i in range(len(iteration_labels)):
    current_train = train_data_List[i]
    current_test = test_data_List[i]

    lof_results = detect_anomalies(i, current_train, current_test, model='lof')
    autoencoder_results = detect_anomalies(i, current_train, current_test, model='autoencoder')
    som_results = detect_anomalies(i, current_train, current_test, model='som')
    isolation_forest_results = detect_anomalies(i, current_train, current_test, model='isolation_forest')
    mcd_results = detect_anomalies(i, current_train, current_test, model='mcd')
    hbos_results = detect_anomalies(i, current_train, current_test, model='hbos')
    cblof_results = detect_anomalies(i, current_train, current_test, model='cblof')
    gmm_results = detect_anomalies(i, current_train, current_test, model='gmm')

    # Same order as the model labels appended below.
    ordered_results = (lof_results, autoencoder_results, som_results, isolation_forest_results,
                       mcd_results, hbos_results, cblof_results, gmm_results)

    # Tag the 8 entries of this iteration with its 1-based number.
    iteration_list.extend([i+1] * 8)
    models.extend(['LOF', 'AE', 'SOM', 'IF', 'MCD', 'HBOS', 'CBLOF', 'GMM'])

    # Copy positions 2..9 of every result tuple into the matching accumulator.
    for position, accumulator in enumerate(metric_accumulators, start=2):
        accumulator.extend(model_result[position] for model_result in ordered_results)

# Assemble the results frame: one row per (iteration, model).
data = {
    'Iteration': iteration_list,
    'Model': models,
    'Precision': precision_list,
    'Recall': recall_list,
    'F1 Score': f1_list,
    'FPR': fpr_list,
    'TPR': tpr_list,
    'ROC AUC': roc_auc_list,
    'Train Time': train_time_list,
    'Prediction Time': prediction_time_list
}

df = pd.DataFrame(data)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Global seaborn theme used by every figure in this cell
sns.set(style="whitegrid")

# Folder that receives every exported figure
save_directory = "C:/Users/Alysson/Desktop/DissertacaoUNB/datasets/BGL/resultados"

# Per-model drawing styles; position k styles the k-th model plotted
colors = [
    'blue', 'green', 'red', 'darkgreen',
    'magenta', 'darkorange', 'black', 'purple',
]
linestyles = [
    '-', '--', '-.', ':',
    '-', '-', '-', '-',
]
marktypes = [
    'o', 's', '^', 'd',
    'p', '*', 'x', 'v',
]

# Tick labels for the x axis (one per experiment iteration / window size)
iteration_labels = [
    '10 sec', '30 sec', '1 min', '2 min', '3 min',
]

# Legend names for the ROC plots and the metric columns to chart
legend_labels = [
    'LOF', 'AE', 'SOM', 'IF',
    'MCD', 'HBOS', 'CBLOF', 'GMM',
]
metrics = ['F1 Score', 'Recall', 'Precision']

# Line chart of one metric per model across the experiment iterations
def plot_metric(df, metric, filename):
    """Save an SVG line chart of `metric` for every model in `df`.

    Args:
        df: DataFrame with at least the columns 'Model', 'Iteration' and `metric`.
        metric: Name of the column plotted on the y axis.
        filename: File name of the figure, written inside `save_directory`.

    One line is drawn per distinct value of df['Model'], in order of first
    appearance. Styles come from the module-level `colors`, `linestyles` and
    `marktypes` lists and wrap around when there are more models than styles.
    """
    model_list = df['Model'].unique()
    fig, ax = plt.subplots(figsize=(6, 7))

    for j, model in enumerate(model_list):
        model_data = df[df['Model'] == model]
        # All three style lists are indexed modulo their length so that a
        # ninth model reuses the first style instead of raising IndexError.
        sns.lineplot(data=model_data, x='Iteration', y=metric, ax=ax,
                     label=f'{model}', color=colors[j % len(colors)],
                     linestyle=linestyles[j % len(linestyles)],
                     marker=marktypes[j % len(marktypes)], markersize=10)

    # Ticks sit at 1..len(iteration_labels).
    # NOTE(review): assumes 'Iteration' values are 1-based integers - confirm.
    ax.set_xticks(range(1, len(iteration_labels) + 1))
    ax.set_xticklabels(iteration_labels, rotation=0, ha='center', fontsize=18)
    ax.tick_params(axis='y', labelsize=18)
    ax.set_xlabel('Window Size', fontsize=20)
    ax.set_ylabel('', fontsize=20)
    ax.set_ylim(0.0, 1.1)
    # Legend goes below the axes so it never covers the lines.
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), fancybox=True, shadow=True, ncol=4, fontsize='medium')
    ax.grid(True, alpha=0.5)
    fig.tight_layout(rect=[0, 0, 1, 0.96])

    # Save through the figure object (not the pyplot "current figure") and
    # make sure the target folder exists first.
    os.makedirs(save_directory, exist_ok=True)
    fig.savefig(os.path.join(save_directory, filename), dpi=900, bbox_inches='tight', format='svg')
    plt.close(fig)

# ROC curves of every model for a single iteration
def plot_roc(df, iteration, filename):
    """Save an SVG with one ROC curve per model for the given iteration.

    Args:
        df: DataFrame with the columns 'Iteration', 'Model', 'FPR', 'TPR'
            and 'ROC AUC'.
        iteration: Value of df['Iteration'] selecting the rows to plot.
        filename: File name of the figure, written inside `save_directory`.

    For each model the first matching row is used; its 'FPR' and 'TPR' cells
    are passed to `ax.plot` as the curve coordinates.
    NOTE(review): presumably each cell holds an array of curve points - confirm.
    A red diagonal labelled 'Random (0.50)' is added as reference.
    """
    fig, ax = plt.subplots(figsize=(6, 6))
    iteration_df = df[df['Iteration'] == iteration]

    for j, model in enumerate(iteration_df['Model'].unique()):
        model_df = iteration_df[iteration_df['Model'] == model]
        fpr, tpr, roc_auc = model_df['FPR'].values[0], model_df['TPR'].values[0], model_df['ROC AUC'].values[0]
        # Label with the model actually being drawn rather than with a
        # positional lookup, so the legend cannot drift out of sync with the
        # data. Styles wrap around when there are more models than styles.
        ax.plot(fpr, tpr, label=f'{model} ({roc_auc:.2f})',
                linestyle=linestyles[j % len(linestyles)], color=colors[j % len(colors)],
                linewidth=2.0, markersize=10)

    ax.plot([0, 1], [0, 1], linestyle='-', color='red', linewidth=0.5, label='Random (0.50)', markersize=10)
    ax.set_xlabel('False Positive Rate (FPR)', fontsize=20)
    ax.set_ylabel('True Positive Rate (TPR)', fontsize=20)
    ax.tick_params(axis='x', labelsize=18)
    ax.tick_params(axis='y', labelsize=18)
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.17), fancybox=True, shadow=True, ncol=2, fontsize='x-large')
    ax.grid(True, alpha=0.5)

    # Save through the figure object and make sure the target folder exists.
    os.makedirs(save_directory, exist_ok=True)
    fig.savefig(os.path.join(save_directory, filename), dpi=900, bbox_inches='tight', format='svg')
    plt.close(fig)

# Training / prediction time of every model across the experiment iterations
def plot_times(df, time_column, filename):
    """Save an SVG line chart of a timing column for every model in `df`.

    Args:
        df: DataFrame with the columns 'Model', 'Iteration' and `time_column`.
        time_column: Name of the column plotted on the y axis
            (the notebook uses 'Train Time' and 'Prediction Time').
        filename: File name of the figure, written inside `save_directory`.
    """
    fig, ax = plt.subplots(figsize=(6, 6))
    for i, model in enumerate(df['Model'].unique()):
        model_data = df[df['Model'] == model]
        # Styles wrap around when there are more models than styles.
        ax.plot(model_data['Iteration'], model_data[time_column], label=model,
                linestyle=linestyles[i % len(linestyles)], color=colors[i % len(colors)],
                marker=marktypes[i % len(marktypes)], markersize=10)

    # Same tick scheme as plot_metric: one tick per entry of iteration_labels.
    # Taking the ticks from the data would make set_xticklabels raise when the
    # number of distinct iterations differs from the number of labels.
    # NOTE(review): assumes 'Iteration' values are 1-based integers - confirm.
    ax.set_xticks(range(1, len(iteration_labels) + 1))
    ax.set_xticklabels(iteration_labels, fontsize=18)
    ax.tick_params(axis='y', labelsize=18)
    ax.set_xlabel('Window Size', fontsize=20)
    ax.set_ylabel('Time(s)', fontsize=20)
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), fancybox=True, shadow=True, ncol=4, fontsize='medium')
    ax.grid(True, alpha=0.5)

    # Save through the figure object and make sure the target folder exists.
    os.makedirs(save_directory, exist_ok=True)
    fig.savefig(os.path.join(save_directory, filename), dpi=900, bbox_inches='tight', format='svg')
    plt.close(fig)

# Export every Experiment 1 figure into the configured results folder.

# One line chart per metric column
for metric in metrics:
    plot_metric(df, metric, f'Experimento1-Metricas-{metric}.svg')

# One ROC figure per iteration, numbered from 0 in order of appearance
unique_iterations = df['Iteration'].unique()
for i in range(len(unique_iterations)):
    iteration = unique_iterations[i]
    plot_roc(df, iteration, f'Experimento1-CurvasRoc-Subplot_{i}.svg')

# Timing charts: training first, then prediction
for time_column, output_name in (
    ('Train Time', 'Experimento1-TempoTreinamento.svg'),
    ('Prediction Time', 'Experimento1-TempoPredicao.svg'),
):
    plot_times(df, time_column, output_name)


<h1>Experimento 2</h1>

In [None]:
################## Experiment 2 ##################
import os
# `sys` is imported here so the cell runs on its own instead of relying on an
# import made by an earlier cell.
import sys

import pandas as pd

# Add the project folder to sys.path so the local modules below can be imported
module_path = r'C:\Users\Alysson\Desktop\DissertacaoUNB'
if module_path not in sys.path:
    sys.path.append(module_path)

from create_train_test_dataset import create_train_test_dataset
from semantic_vectorization import *
import evaluation_metrics

import data_preprocessing

########## BGL LOG PROCESSING ##########
# Text pre-processing switches forwarded to create_train_test_dataset
preProcess = {"stemming": False, "lemmatization": True}

# Inputs: the template table and the sequence table
templatesTrain = pd.read_csv(
    r'C:\Users\Alysson\Desktop\DissertacaoUNB\datasets\BGL\BGL_2k.log_templates.csv',
    sep=',',
)
SequenceVecTrain = pd.read_csv(
    r'C:\Users\Alysson\Desktop\DissertacaoUNB\datasets\BGL\BGL_2k-0.01667horas.csv',
    sep=',',
)
# Optional separate test inputs (left disabled):
#SequenceVecTest = pd.read_csv(r'C:\Users\Alysson\Desktop\MinhaImplementacaoBGL\datasets\BGL\XXXXXXX.csv', sep=',')
#templatesTest = pd.read_csv(r'C:\Users\Alysson\Desktop\MinhaImplementacaoBGL\datasets\BGL\XXXXXXX.csv', sep=',')

# No external test set is supplied, so the single dataset is split
# (splitData=True); the third return value is not used.
train_data, test_data, _ = create_train_test_dataset(
    templatesTrain,
    SequenceVecTrain,
    onlyNormalDataTrain=True,
    shuffleTrain=True,
    removeEmptySeq=True,
    splitData=True,
    SequenceVectorTest=None,
    templatesTest=None,
    preProcessParam=preProcess,
)

########## SEMANTIC VECTOR SEQUENCES - SINGLE ##########

# Apply process_line to every row of 'ReplacedSequence', train first and then test
for frame in (train_data, test_data):
    frame['splittedReplacedSequence'] = frame['ReplacedSequence'].apply(process_line)

# The word-vector dictionary is built from the training data only and reused for the test data
word_vector_dict_train, dimension_train = generate_word_vectors_dict(train_data, model_type='glove50d')
#word_vector_dict_test, dimension_test = generate_word_vectors_dict(test_data, model_type='glove50d')

train_data = create_semantic_vector_sequence(train_data, word_vector_dict_train, dimension_train)
test_data = create_semantic_vector_sequence(test_data, word_vector_dict_train, dimension_train)

print('Total train sequences: ', len(train_data))
print('Total test sequences: ', len(test_data))
     

In [None]:
import matplotlib.pyplot as plt
from anomaly_detection import *

# Contamination rates swept by the experiment
contaminations = [0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]

# One entry per detector: (display name, detection function, parameters).
# 'contamination' starts as None and is overwritten before every run.
models = [
    (
        "LOF",
        detect_lof_anomalies,
        {'algorithm': 'auto', 'contamination': None, 'leaf_size': 30, 'metric': 'minkowski', 'n_jobs': None, 'n_neighbors': 20, 'novelty': True},
    ),
    (
        "Autoencoder",
        detect_autoencoder_anomalies,
        {'contamination': None},
    ),
    (
        "SOM",
        detect_som_anomalies,
        {'contamination': None, 'grid_size': (10, 10), 'sigma': 1.0, 'learning_rate': 0.5},
    ),
    (
        "Isolation Forest",
        detect_isolation_forest_anomalies,
        {'contamination': None, 'max_features': 1.0, 'max_samples': 0.5, 'n_estimators': 100, 'n_jobs': -1, 'random_state': 123},
    ),
    (
        "MCD",
        detect_mcd_anomalies,
        {'contamination': None, 'support_fraction': 1},
    ),
    (
        "HBOS",
        detect_hbos_anomalies,
        {'n_bins': 10, 'alpha': 0.1, 'tol': 0.5, 'contamination': None},
    ),
    (
        "CBLOF",
        detect_cblof_anomalies,
        {'contamination': None},
    ),
    (
        "GMM",
        detect_gmm_anomalies,
        {'contamination': None, 'n_components': 1, 'covariance_type': 'full', 'max_iter': 100, 'init_params': 'kmeans', 'random_state': 123},
    ),
]

# results[model name][metric name] maps contamination rate -> score
metrics = ["recall", "precision", "f1", "roc_auc"]
results = {
    name: {metric_name: {} for metric_name in metrics}
    for name, _func, _params in models
}

for contamination_value in contaminations:
    for model_name, model_func, params in models:
        # The parameter dict is updated in place and handed to the detector
        params['contamination'] = contamination_value
        outputs = model_func(
            train_data,
            test_data,
            metric_params=params,
            label=True,
            plot_tsne=False,
            plot_metrics=False,
        )
        # Only the fourth of the five returned values (the metrics) is kept
        _, _, _, model_metrics, _ = outputs
        precision, recall, f1, _, _, roc_auc = model_metrics

        # Record the scores of this run under the current contamination rate
        per_model = results[model_name]
        for key, value in (
            ('recall', recall),
            ('precision', precision),
            ('f1', f1),
            ('roc_auc', roc_auc),
        ):
            per_model[key][contamination_value] = value


In [None]:
import pandas as pd

# Column prefixes for the exported table, paired position by position with
# the model names used as keys of `results` by the contamination sweep.
model_prefixes = ['LOF', 'AE', 'SOM', 'IF', 'MCD', 'HBOS', 'CBLOF', 'GMM']
result_keys = ['LOF', 'Autoencoder', 'SOM', 'Isolation Forest', 'MCD', 'HBOS', 'CBLOF', 'GMM']

# Read the scores straight from `results` (contamination rate -> score per
# model and metric) instead of from separate per-model variables, so this
# cell depends only on the sweep cell.
metric_dicts = {
    'F1': [results[key]['f1'] for key in result_keys],
    'Precision': [results[key]['precision'] for key in result_keys],
    'Recall': [results[key]['recall'] for key in result_keys],
}

# Start the table with the contamination column
metrics_data = {'Contamination': contaminations}

# Add one "<prefix>_<metric>" column per model and metric. Values are looked
# up by contamination rate so each row lines up with the first column.
for metric, dict_list in metric_dicts.items():
    for prefix, metric_dict in zip(model_prefixes, dict_list):
        metrics_data[f"{prefix}_{metric}"] = [metric_dict[rate] for rate in contaminations]

# Build the DataFrame
df_metrics = pd.DataFrame(metrics_data)

# Export to CSV without the row index
df_metrics.to_csv(r'C:/Users/Alysson/Desktop/DissertacaoUNB/datasets/BGL/resultados/Experimento2-Resultados.csv', index=False, header=True)

# Display the DataFrame
df_metrics


In [None]:
import matplotlib.pyplot as plt

# Per-model plot style: (legend label, key in `results`, marker, linestyle, color)
model_styles = [
    ('LOF', 'LOF', 'o', '-', 'blue'),
    ('AE', 'Autoencoder', 's', '--', 'green'),
    ('SOM', 'SOM', '^', '-.', 'red'),
    ('IF', 'Isolation Forest', 'd', ':', 'darkgreen'),
    ('MCD', 'MCD', 'p', '-', 'magenta'),
    ('HBOS', 'HBOS', '*', '-', 'darkorange'),
    ('CBLOF', 'CBLOF', 'x', '-', 'black'),
    ('GMM', 'GMM', 'v', '-', 'purple'),
]

# Metrics and plot styles per model. The scores are read from `results`
# (filled by the contamination sweep) instead of from separate per-model
# variables, so this cell depends only on the sweep cell.
models = {
    label: {
        'f1': results[key]['f1'],
        'precision': results[key]['precision'],
        'recall': results[key]['recall'],
        'marker': marker,
        'linestyle': linestyle,
        'color': color,
    }
    for label, key, marker, linestyle, color in model_styles
}

# Metrics to plot, one subplot each
metric_titles = ['F1-Score', 'Precision', 'Recall']
metric_keys = ['f1', 'precision', 'recall']

# Create the subplots
fig, axs = plt.subplots(1, 3, figsize=(18, 7))

# Plot the metrics
for i, metric in enumerate(metric_keys):
    for model, spec in models.items():
        # Look the scores up by contamination rate so they line up with the x axis
        scores = [spec[metric][rate] for rate in contaminations]
        axs[i].plot(contaminations, scores, marker=spec['marker'], linestyle=spec['linestyle'], color=spec['color'], label=model, markersize=10)
    axs[i].set_xlabel('Contamination Rate')
    axs[i].set_title(metric_titles[i])
    axs[i].legend()
    axs[i].set_ylim(0.0, 1.1)
    axs[i].set_xticks(contaminations)
    axs[i].set_xticklabels(contaminations, fontsize=12)
    axs[i].grid(True, alpha=0.5)

# Adjust the layout to avoid overlap, leaving a small margin at the top
fig.tight_layout(rect=[0, 0, 1, 0.96])

# Export the figure
fig.savefig('C:/Users/Alysson/Desktop/DissertacaoUNB/datasets/BGL/resultados/Experimento2-Metricas.svg', dpi=900, bbox_inches='tight', format='svg')

# Show the figure
plt.show()


In [None]:
################## Experimento 2 - Resultados - subplots 

import matplotlib.pyplot as plt

# Save one metric chart (all eight models) as a standalone figure
def save_plot(x, y_lof, y_ae, y_som, y_if, y_mcd, y_hbos, y_cblof, y_gmm, title, filename):
    """Plot one series per model against `x` and save the figure as SVG.

    Args:
        x: Values for the x axis (the notebook passes the contamination rates);
            also used as the tick positions.
        y_lof, y_ae, y_som, y_if, y_mcd, y_hbos, y_cblof, y_gmm: Scores of each
            model, one per entry of `x`. Any iterable is accepted, including
            dict views such as `scores.values()`.
        title: Kept for interface compatibility; no title is drawn.
        filename: Full path of the output file.
    """
    # (series, marker, linestyle, color, legend label) for each model
    series = [
        (y_lof, 'o', '-', 'blue', 'LOF'),
        (y_ae, 's', '--', 'green', 'AE'),
        (y_som, '^', '-.', 'red', 'SOM'),
        (y_if, 'd', ':', 'darkgreen', 'IF'),
        (y_mcd, 'p', '-', 'magenta', 'MCD'),
        (y_hbos, '*', '-', 'darkorange', 'HBOS'),
        (y_cblof, 'x', '-', 'black', 'CBLOF'),
        (y_gmm, 'v', '-', 'purple', 'GMM'),
    ]
    x_values = list(x)

    fig, ax = plt.subplots(figsize=(6, 6))

    for values, marker, linestyle, color, label in series:
        # Materialize the series: callers pass dict views, which are not
        # sequences, so they are converted to lists before plotting.
        ax.plot(x_values, list(values), marker=marker, linestyle=linestyle,
                color=color, label=label, markersize=10)

    ax.set_ylim(0.0, 1.1)
    ax.grid(True, alpha=0.5)

    # Axis label and tick font sizes
    ax.set_xlabel('Contamination Rate', fontsize=17)
    ax.set_xticks(x_values)
    ax.tick_params(axis='x', labelsize=14)
    ax.tick_params(axis='y', labelsize=15)

    # Legend goes below the axes so it never covers the lines
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.14), fancybox=True, shadow=True, ncol=4, fontsize='medium')

    fig.savefig(filename, dpi=900, bbox_inches='tight', format='svg')
    plt.close(fig)

# One standalone chart per metric (F1-Score, Precision, Recall).
# The scores are read from `results` (filled by the contamination sweep)
# instead of from separate per-model variables, so this cell depends only on
# the sweep cell and on save_plot.
output_dir = 'C:/Users/Alysson/Desktop/DissertacaoUNB/datasets/BGL/resultados'

# Keys of `results`, in the order save_plot expects its series:
# LOF, AE, SOM, IF, MCD, HBOS, CBLOF, GMM
result_keys = ['LOF', 'Autoencoder', 'SOM', 'Isolation Forest', 'MCD', 'HBOS', 'CBLOF', 'GMM']

for metric_key, title, output_name in (
    ('f1', 'F1-Score', 'Experimento2-Metricas-F1Score.svg'),
    ('precision', 'Precision', 'Experimento2-Metricas-Precision.svg'),
    ('recall', 'Recall', 'Experimento2-Metricas-Recall.svg'),
):
    # Look every score up by contamination rate so it lines up with the x axis,
    # and pass plain lists rather than dict views.
    series = [
        [results[key][metric_key][rate] for rate in contaminations]
        for key in result_keys
    ]
    save_plot(contaminations, *series, title, f'{output_dir}/{output_name}')
