# Evaluación y crítica al modelo

In [1]:
# importaciones
import time
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, classification_report

## Carga de datos

In [2]:
# Probability cut-off used when a single confusion matrix per label is reported.
default_threshold = 0.5

# Fraction of the training rows held out in each per-model split,
# and one shuffle seed per split.
oob_size = 0.25
oob_seeds = [
  9603,
  9306,
  3096,
  6039,
]

# Folder that holds the CSV splits, and the file names in loading order.
dataset_path = "DB/"
output_files_name = [
  "X_train.csv",
  "y_train.csv",
  "X_test.csv",
  "y_test.csv",
]

In [3]:
# Read the four CSV splits in the order given by output_files_name:
# train features, train labels, test features, test labels.
X_train, y_train, X_test, y_test = (
  pd.read_csv(f"{dataset_path}{file_name}") for file_name in output_files_name[:4]
)

## Codificación de la variable objetivo

In [4]:
# Hand the feature frames to the estimators as plain NumPy arrays.
X_train_encoded, X_test_encoded = (frame.to_numpy() for frame in (X_train, X_test))

In [5]:
# Learn the label -> indicator-column mapping from the training labels only,
# then apply that same mapping to both splits.
encoder = LabelBinarizer().fit(y_train)

y_train_encoded, y_test_encoded = encoder.transform(y_train), encoder.transform(y_test)

## Creación de dataset_oob para cada modelo

In [6]:
def _oob_split(seed):
  """Stratified train / hold-out split of the encoded training set for one seed.

  Returns the usual ``train_test_split`` 4-tuple:
  (X_train_part, X_oob_part, y_train_part, y_oob_part).
  """
  return train_test_split(
    X_train_encoded,
    y_train_encoded,
    random_state=seed,
    test_size=oob_size,
    stratify=y_train_encoded,
  )

# Each model gets its own split, driven by its own seed.
# RandomForestClassifier
X_train_rf, X_oob_rf, y_train_rf, y_oob_rf = _oob_split(oob_seeds[0])

# DecisionTreeClassifier
X_train_dt, X_oob_dt, y_train_dt, y_oob_dt = _oob_split(oob_seeds[1])

# MLPClassifier
X_train_mlp, X_oob_mlp, y_train_mlp, y_oob_mlp = _oob_split(oob_seeds[2])

# KNeighborsClassifier
X_train_knn, X_oob_knn, y_train_knn, y_oob_knn = _oob_split(oob_seeds[3])

## Creación de modelos 

In [7]:
# One-vs-rest wrappers: one binary estimator is fitted per label column.
# The stochastic estimators (forest, tree, MLP) are seeded with the same
# per-model seed used for their data split, so that a fresh
# "Restart & Run All" reproduces the same fitted models and metrics.
# KNeighborsClassifier takes no seed.
rf  = OneVsRestClassifier(RandomForestClassifier(criterion="gini", n_estimators=100, class_weight='balanced', random_state=oob_seeds[0]))
dt  = OneVsRestClassifier(DecisionTreeClassifier(criterion="gini", class_weight='balanced', random_state=oob_seeds[1]))
mlp = OneVsRestClassifier(MLPClassifier(hidden_layer_sizes=(10,), activation='relu', max_iter=500, random_state=oob_seeds[2]))
knn = OneVsRestClassifier(KNeighborsClassifier(n_neighbors=3, weights='distance'))

## Entrenamiento de modelos

In [8]:
# Fit the one-vs-rest random forest on its own training partition (X_oob_rf / y_oob_rf stay held out).
rf.fit(X_train_rf, y_train_rf)

In [9]:
# Fit the one-vs-rest decision tree on its own training partition (X_oob_dt / y_oob_dt stay held out).
dt.fit(X_train_dt, y_train_dt)

In [10]:
# Fit the one-vs-rest MLP on its own training partition (X_oob_mlp / y_oob_mlp stay held out).
mlp.fit(X_train_mlp, y_train_mlp)

In [11]:
# Fit the one-vs-rest k-NN on its own training partition (X_oob_knn / y_oob_knn stay held out).
knn.fit(X_train_knn, y_train_knn)

## Validación OOB de modelos

In [12]:
# TODO: validación OOB pendiente — los conjuntos X_oob_* / y_oob_* todavía no se evalúan

## Testeo de modelos

In [13]:
# Opens the wall-clock window for the averaged ensemble; it is closed by bagging_end
# after the four predict_proba calls and the averaging step.
# NOTE: start and end live in different cells, so any pause between cell
# executions is included in the measured time.
bagging_start = time.time()

In [14]:
# Time only the random forest's predict_proba call on the test set
# (time.time() differences are expressed in seconds).
rf_start = time.time()
y_pred_rf = rf.predict_proba(X_test_encoded)
rf_end = time.time()

In [15]:
# Time only the decision tree's predict_proba call on the test set
# (time.time() differences are expressed in seconds).
dt_start = time.time()
y_pred_dt = dt.predict_proba(X_test_encoded)
dt_end = time.time()

In [16]:
# Time only the MLP's predict_proba call on the test set
# (time.time() differences are expressed in seconds).
mlp_start = time.time()
y_pred_mlp = mlp.predict_proba(X_test_encoded)
mlp_end = time.time()

In [17]:
# Time only the k-NN's predict_proba call on the test set
# (time.time() differences are expressed in seconds).
knn_start = time.time()
y_pred_knn = knn.predict_proba(X_test_encoded)
knn_end = time.time()

In [18]:
# Soft-voting ensemble: unweighted mean of the four per-label score matrices.
member_probas = (y_pred_rf, y_pred_dt, y_pred_mlp, y_pred_knn)
y_pred_avg = sum(member_probas[1:], start=member_probas[0]) / len(member_probas)

In [19]:
# Closes the wall-clock window opened by bagging_start (four predict_proba calls + averaging).
bagging_end = time.time()

## Guardado de resultados

In [20]:
# Collect every model's output in one list of records:
#   "name"    -> label used for the Excel sheet and the row names
#   "ms"      -> prediction time in milliseconds
#   "results" -> score matrix returned by predict_proba (one column per label)
# The start/end stamps come from time.time(), whose differences are in
# seconds, so they are scaled by 1000 to really store milliseconds under "ms".
results_list = [
  {
    "name" : "IDSBaggingClassifier",
    "ms" : (bagging_end - bagging_start) * 1000,
    "results" : y_pred_avg
  },
  {
    "name" : "RandomForestClassifier",
    "ms" : (rf_end - rf_start) * 1000,
    "results" : y_pred_rf
  },
  {
    "name" : "DecisionTreeClassifier",
    "ms" : (dt_end - dt_start) * 1000,
    "results" : y_pred_dt
  },
  {
    "name" : "MLPClassifier",
    "ms" : (mlp_end - mlp_start) * 1000,
    "results" : y_pred_mlp
  },
  {
    "name" : "KNeighborsClassifier",
    "ms" : (knn_end - knn_start) * 1000,
    "results" : y_pred_knn
  }
]

In [None]:
# get_results(): builds the confusion matrix and the ROC AUC for one model / label pair.
def get_results(y_test, y_pred, threshold, model_name, label_name : str, general : bool, verbose : bool = True) :
  """Compute the binary confusion matrix and ROC AUC for one label column.

  Parameters
  ----------
  y_test : array-like of 0/1
      Ground-truth indicator column for ``label_name``.
  y_pred : array-like of float
      Predicted score for ``label_name`` (same length as ``y_test``).
  threshold : float
      Cut-off applied to ``y_pred``.
  model_name : str
      Only used in the printed report.
  label_name : str
      Only used in the printed report.
  general : bool
      False: the positive class is ``label_name`` itself; a sample is
      predicted positive when ``y_pred >= threshold``.
      True: the problem is flipped so that the positive class is
      "anything but ``label_name``"; a sample is predicted positive when
      ``y_pred < threshold`` and the AUC is computed on ``1 - y_pred``.
  verbose : bool, default True
      Print the model name, label, threshold, matrix and AUC.

  Returns
  -------
  (cm, auc)
      ``cm`` is always 2x2 and laid out as::

                    Predicted
                    0     1
        Actual 0  [[TN,   FP],
               1   [FN,   TP]]

      ``auc`` is the value returned by ``roc_auc_score``.
  """
  y_true = np.asarray(y_test)
  y_score = np.asarray(y_pred)

  if general :
    # Truth, hard predictions and scores must all be flipped together.
    # Flipping only the truth would compare "is not label_name" against
    # "is label_name" and report 1 - AUC.
    y_true = 1 - y_true
    y_pred_for_cm = (y_score < threshold).astype(int)
    y_score = 1 - y_score
  else :
    y_pred_for_cm = (y_score >= threshold).astype(int)

  # labels=[0, 1] keeps the matrix 2x2 even when one class never shows up,
  # so callers can always index cm[1][1], cm[0][0], cm[0][1], cm[1][0].
  cm = confusion_matrix(y_true, y_pred_for_cm, labels=[0, 1])
  auc = roc_auc_score(y_true, y_score)

  if verbose :
    print(f"""
  {model_name}
  {label_name}
  {threshold}
  {cm}
  {auc}
  """)
  
  return cm, auc

# def get_dt_results(y_test, y_pred, threshold, model_name, label_name : str, general : bool) :
#   y_test_new = y_test
#   y_pred_new = y_pred
#   y_pred_under_threshold = y_pred < threshold if general else y_pred >= threshold
#   y_pred_for_cm = y_pred_under_threshold.astype(int)

#   if general :
#     y_test_new = [1-y for y in y_test]
#     y_pred_for_cm = [1-y for y in y_pred_for_cm]
#   """
#   Matriz de confusión
#               Predicted
#               0     1
#   Actual 0  [[TN,   FP],
#          1   [FN,   TP]]
#   """
#   cm = confusion_matrix(y_test_new, y_pred_for_cm)
#   auc = roc_auc_score(y_test_new, y_pred_new)
  
#   return cm, auc

In [22]:
# Save the general results: one sheet per model, one row per label,
# all evaluated at default_threshold.
with pd.ExcelWriter('output/results.xlsx') as writer:
  for model_result in results_list :
    model_name = model_result["name"]
    # Rows are collected in a list and turned into a DataFrame once,
    # instead of growing the frame with .loc on every iteration.
    rows = []
    for j, class_name in enumerate(encoder.classes_) :
      # The "BENIGN" column is reported as the flipped, attack-vs-benign "GENERAL" row.
      is_benign = bool(class_name == "BENIGN")
      cm, auc = get_results(
        y_test=y_test_encoded[:, j],
        y_pred=model_result["results"][:, j],
        threshold=default_threshold,
        model_name=model_name,
        label_name=class_name,
        general=is_benign
      )
      # Single quotes inside the f-string keep this valid before Python 3.12.
      row_label = f"{model_name}_{'GENERAL' if is_benign else class_name}"
      rows.append([
        row_label,
        cm[1][1], cm[0][0], cm[0][1], cm[1][0], auc, model_result["ms"]
      ])
    df_results = pd.DataFrame(rows, columns=["label", "TP", "TN", "FP", "FN", "AUC", "ms"])
    df_results.to_excel(writer, sheet_name=model_name)

In [23]:
# Save the detailed results: one sheet per model, one row per
# (label, ROC threshold) pair.
with pd.ExcelWriter('output/detailed_results.xlsx') as writer:
  for model_result in results_list :
    model_name = model_result["name"]
    # Rows are collected in a list and turned into a DataFrame once; growing
    # the frame with .loc inside the threshold loop copies it on every row.
    rows = []
    for j, class_name in enumerate(encoder.classes_) :
      # The "BENIGN" column is reported as the flipped, attack-vs-benign "GENERAL" row.
      is_benign = bool(class_name == "BENIGN")
      # Only the candidate thresholds are needed from the ROC curve.
      _, _, thresholds = roc_curve(y_test_encoded[:, j], model_result["results"][:, j])
      for threshold in thresholds :
        cm, _ = get_results(
          y_test=y_test_encoded[:, j],
          y_pred=model_result["results"][:, j],
          threshold=threshold,
          model_name=model_name,
          label_name=class_name,
          general=is_benign
        )
        # Single quotes inside the f-string keep this valid before Python 3.12.
        row_label = f"{model_name}_{'GENERAL' if is_benign else class_name}_{threshold}_thsld"
        rows.append([
          row_label,
          cm[1][1], cm[0][0], cm[0][1], cm[1][0], model_result["ms"]
        ])
    df_results = pd.DataFrame(rows, columns=["label", "TP", "TN", "FP", "FN", "ms"])
    df_results.to_excel(writer, sheet_name=model_name)

KeyboardInterrupt: 

## Adicional

In [None]:
# for results in results_list : 
#   y_pred_zero_array = np.zeros_like(results["results"], dtype=int)

#   y_pred_zero_array[
#     np.arange(y_pred_zero_array.shape[0]), 
#     np.argmax(results["results"], axis=1)
#   ] = 1

#   print(results["name"])
#   print(classification_report(y_test_encoded, y_pred_zero_array, zero_division=0, target_names=encoder.classes_))

In [None]:
# for i in results_list : 
#   for j in range(1, len(results_list)) : 
#     fpr, tpr, thresholds = roc_curve(y_test_encoded[:, j], i["results"][:,j])
#     for k in thresholds :
#       y_pred_under_threshold = (i["results"][:,j] >= k).astype(int)
#       cm = confusion_matrix(y_test_encoded[:, j], y_pred_under_threshold)
#       print(cm)