# Evaluación y crítica al modelo

In [1]:
# importaciones
import time
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve, auc
import plotly.express as px

## Carga de datos

In [2]:
default_threshold = 0.5

oob_size = 0.25

oob_seeds = [9603, 9306, 3096, 6039]

dataset_path = "DB/"

output_files_name = ["X_train.csv", "y_train.csv", "X_test.csv", "y_test.csv"]

In [3]:
X_train = pd.read_csv(f"{dataset_path}{output_files_name[0]}")
y_train = pd.read_csv(f"{dataset_path}{output_files_name[1]}")
X_test  = pd.read_csv(f"{dataset_path}{output_files_name[2]}")
y_test  = pd.read_csv(f"{dataset_path}{output_files_name[3]}")

## Codificación de la variable objetivo

In [4]:
X_train_encoded = X_train.to_numpy()
X_test_encoded  = X_test.to_numpy()

In [5]:
encoder = LabelBinarizer()

encoder.fit(y_train)

y_train_encoded = encoder.transform(y_train)
y_test_encoded  = encoder.transform(y_test)

## Creación de dataset_oob para cada modelo

In [6]:
# dataset_oob RandomForestClassifier
X_train_rf, X_oob_rf, y_train_rf, y_oob_rf = train_test_split(X_train_encoded, y_train_encoded, random_state=oob_seeds[0],test_size=oob_size, stratify=y_train_encoded)

# dataset_oob DecisionTreeClassifier
X_train_dt, X_oob_dt, y_train_dt, y_oob_dt = train_test_split(X_train_encoded, y_train_encoded, random_state=oob_seeds[1],test_size=oob_size, stratify=y_train_encoded)

# dataset_oob MLPClassifier
X_train_mlp, X_oob_mlp, y_train_mlp, y_oob_mlp = train_test_split(X_train_encoded, y_train_encoded, random_state=oob_seeds[2],test_size=oob_size, stratify=y_train_encoded)

# dataset_oob KNeighborsClassifier
X_train_knn, X_oob_knn, y_train_knn, y_oob_knn = train_test_split(X_train_encoded, y_train_encoded, random_state=oob_seeds[3],test_size=oob_size, stratify=y_train_encoded)

## Creación de modelos 

In [7]:
rf  = OneVsRestClassifier(RandomForestClassifier(criterion="gini", n_estimators=100,class_weight='balanced'))
dt  = OneVsRestClassifier(DecisionTreeClassifier(criterion="gini", class_weight='balanced'))
mlp = OneVsRestClassifier(MLPClassifier(hidden_layer_sizes=(10,), activation='relu', max_iter=500))
knn = OneVsRestClassifier(KNeighborsClassifier(n_neighbors=3, weights='distance'))

## Entrenamiento de modelos

In [8]:
rf.fit(X_train_rf, y_train_rf)

In [9]:
dt.fit(X_train_dt, y_train_dt)

In [10]:
mlp.fit(X_train_mlp, y_train_mlp)

In [11]:
knn.fit(X_train_knn, y_train_knn)

## Validación OOB de modelos

In [12]:
bagging_oob_start = time.time()

In [13]:
rf_oob_start = time.time()
y_pred_rf_oob = rf.predict_proba(X_test_encoded)
rf_oob_end = time.time()

In [14]:
dt_oob_start = time.time()
y_pred_dt_oob = dt.predict_proba(X_test_encoded)
dt_oob_end = time.time()

In [15]:
mlp_oob_start = time.time()
y_pred_mlp_oob = mlp.predict_proba(X_test_encoded)
mlp_oob_end = time.time()

In [16]:
knn_oob_start = time.time()
y_pred_knn_oob = knn.predict_proba(X_test_encoded)
knn_oob_end = time.time()

In [17]:
y_pred_avg_oob = (y_pred_rf_oob + y_pred_dt_oob + y_pred_mlp_oob + y_pred_knn_oob) / 4

In [18]:
bagging_oob_end = time.time()

In [19]:
# Ordenando resultados en un array
results_oob_list = [
  {
    "name" : "IDSBaggingClassifier",
    "ms" : bagging_oob_end - bagging_oob_start,
    "results" : y_pred_avg_oob
  },
  {
    "name" : "RandomForestClassifier",
    "ms" : rf_oob_end - rf_oob_start,
    "results" : y_pred_rf_oob
  },
  {
    "name" : "DecisionTreeClassifier",
    "ms" : dt_oob_end - dt_oob_start,
    "results" : y_pred_dt_oob
  },
  {
    "name" : "MLPClassifier",
    "ms" : mlp_oob_end - mlp_oob_start,
    "results" : y_pred_mlp_oob
  },
  {
    "name" : "KNeighborsClassifier",
    "ms" : knn_oob_end - knn_oob_start,
    "results" : y_pred_knn_oob
  }
]

## Testeo de modelos

In [20]:
bagging_test_start = time.time()

In [21]:
rf_test_start = time.time()
y_pred_rf_test = rf.predict_proba(X_test_encoded)
rf_test_end = time.time()

In [22]:
dt_test_start = time.time()
y_pred_dt_test = dt.predict_proba(X_test_encoded)
dt_test_end = time.time()

In [23]:
mlp_test_start = time.time()
y_pred_mlp_test = mlp.predict_proba(X_test_encoded)
mlp_test_end = time.time()

In [24]:
knn_test_start = time.time()
y_pred_knn_test = knn.predict_proba(X_test_encoded)
knn_test_end = time.time()

In [25]:
y_pred_avg_test = (y_pred_rf_test + y_pred_dt_test + y_pred_mlp_test + y_pred_knn_test) / 4

In [26]:
bagging_test_end = time.time()

In [27]:
# Ordenando resultados en un array
results_test_list = [
  {
    "name" : "IDSBaggingClassifier",
    "ms" : bagging_test_end - bagging_test_start,
    "results" : y_pred_avg_test
  },
  {
    "name" : "RandomForestClassifier",
    "ms" : rf_test_end - rf_test_start,
    "results" : y_pred_rf_test
  },
  {
    "name" : "DecisionTreeClassifier",
    "ms" : dt_test_end - dt_test_start,
    "results" : y_pred_dt_test
  },
  {
    "name" : "MLPClassifier",
    "ms" : mlp_test_end - mlp_test_start,
    "results" : y_pred_mlp_test
  },
  {
    "name" : "KNeighborsClassifier",
    "ms" : knn_test_end - knn_test_start,
    "results" : y_pred_knn_test
  }
]

## Guardado de resultados

In [28]:
# get_results() : Función para automatizar la generación de la Matriz de confusión y AUC de las curvas ROC y Precisión Recuperación
def get_results(y_test, y_pred, threshold, model_name, label_name : str) :
  y_test_new = y_test
  y_pred_new = y_pred
  y_pred_under_threshold = y_pred >= threshold
  y_pred_for_cm = y_pred_under_threshold.astype(int)
  """
  Matriz de confusión
              Predicted
              0     1
  Actual 0  [[TN,   FP],
         1   [FN,   TP]]
  """
  cm = confusion_matrix(y_test_new, y_pred_for_cm)
  auc_roc = roc_auc_score(y_test_new, y_pred_new)
  precision, recall, pr_thresholds = precision_recall_curve(y_test_new, y_pred_new)
  auc_pr = auc(recall, precision)

  print(f"""
  {model_name}
  {label_name}
  {threshold}
  {cm}
  {auc_roc}
  {auc_pr}
  """)
  
  return cm, auc_roc, auc_pr

In [29]:
# save_results() : Función para guardar resultados de los modelos
def save_results(results_list, file_name)
  with pd.ExcelWriter(f'output/{file_name}.xlsx') as writer:
    for i in results_list : 
      df_results = pd.DataFrame(columns=["label", "TP", "TN", "FP", "FN", "AUC_ROC", "AUC_PR", "ms"])
      for j in range(len(encoder.classes_)) : 
        cm, auc_roc, auc_pr = get_results(
          y_test=y_test_encoded[:,j],
          y_pred=i["results"][:,j],
          model_name=i["name"],
          label_name=encoder.classes_[j],
          threshold=default_threshold,
        )
        df_results.loc[len(df_results)] = [
          f"{encoder.classes_[j]}", 
          cm[1][1], cm[0][0], cm[0][1], cm[1][0], auc_roc, auc_pr, i["ms"]
        ]
      df_results.to_excel(writer, sheet_name=i["name"])

SyntaxError: expected ':' (870592188.py, line 2)

In [None]:
# Guardando resultados de oob
save_results(results_oob_list, "oob/results")

In [None]:
# Guardando resultados de test
save_results(results_test_list, "test/results")

In [None]:
# Guardando resultados detallados
# with pd.ExcelWriter('output/detailed_results.xlsx') as writer:
#   for i in results_list : 
#     df_results = pd.DataFrame(columns=["label", "TP", "TN", "FP", "FN", "ms"])
#     for j in range(len(encoder.classes_)) : 
#       fpr, tpr, roc_thresholds = roc_curve(y_test_encoded[:, j], i["results"][:,j])
#       precision, recall, pr_thresholds = precision_recall_curve(y_test_encoded[:, j], i["results"][:,j])
#       for threshold in roc_thresholds : 
#         cm, auc = get_results(
#           y_test=y_test_encoded[:, j],
#           y_pred=i["results"][:,j],
#           threshold=threshold,
#           model_name=i["name"],
#           label_name=encoder.classes_[j],
#           general=False if encoder.classes_[j] != "BENIGN" else True 
#         )
#         df_results.loc[len(df_results)] = [
#           f"{i["name"]}_{encoder.classes_[j] if encoder.classes_[j] != "BENIGN" else "GENERAL"}_{threshold}_thsld", 
#           cm[1][1], cm[0][0], cm[0][1], cm[1][0], i["ms"]]
#     df_results.to_excel(writer, sheet_name=i["name"])

# EXTRA


In [None]:
fig = px.scatter_3d(X_train[["component_1","component_2", "component_3"]])