# Evaluación y crítica al modelo

In [1]:
# importaciones
import time
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve, auc, accuracy_score
import plotly.express as px

## Carga de datos

In [2]:
default_threshold = 0.5

oob_size = 0.25

oob_seeds = [9603, 9306, 3096, 6039]

dataset_path = "DB/"

output_files_name = ["X_train.csv", "y_train.csv", "X_valid.csv", "y_valid.csv", "X_test.csv", "y_test.csv"]

In [65]:
X_train = pd.read_csv(f"{dataset_path}{output_files_name[0]}").values
y_train = pd.read_csv(f"{dataset_path}{output_files_name[1]}").values
X_valid = pd.read_csv(f"{dataset_path}{output_files_name[2]}").values
y_valid = pd.read_csv(f"{dataset_path}{output_files_name[3]}").values
X_test  = pd.read_csv(f"{dataset_path}{output_files_name[4]}").values
y_test  = pd.read_csv(f"{dataset_path}{output_files_name[5]}").values

In [68]:
print(f"""
X : {X_train.shape}
y : {y_train.shape}
labels: {np.unique(y_train)}
""")


X : (947746, 12)
y : (947746, 1)
labels: ['BENIGN' 'BOT' 'DDOS' 'DOS_GOLDENEYE' 'DOS_HULK' 'DOS_SLOWHTTPTEST'
 'DOS_SLOWLORIS' 'FTP_PATATOR' 'PORTSCAN' 'SSH_PATATOR'
 'WEB_ATTACK_BRUTE_FORCE' 'WEB_ATTACK_XSS']



## Transformación en array numpy

In [5]:
X_train_encoded = X_train.to_numpy()
X_valid_encoded = X_valid.to_numpy()
X_test_encoded  = X_test.to_numpy()

encoder = OneHotEncoder(sparse_output=False)

encoder.fit(y_train)

y_train_encoded = encoder.transform(y_train)
y_valid_encoded = encoder.transform(y_valid)
y_test_encoded  = encoder.transform(y_test)

y_train_encoded

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]])

## Creación de conjunto de entrenamiento y oob para cada modelo

In [6]:
# dataset_oob RandomForestClassifier
X_train_rf, X_oob_rf, y_train_rf, y_oob_rf = train_test_split(X_train_encoded, y_train_encoded, random_state=oob_seeds[0],test_size=oob_size, stratify=y_train_encoded)

# dataset_oob DecisionTreeClassifier
X_train_dt, X_oob_dt, y_train_dt, y_oob_dt = train_test_split(X_train_encoded, y_train_encoded, random_state=oob_seeds[1],test_size=oob_size, stratify=y_train_encoded)

# dataset_oob MLPClassifier
X_train_mlp, X_oob_mlp, y_train_mlp, y_oob_mlp = train_test_split(X_train_encoded, y_train_encoded, random_state=oob_seeds[2],test_size=oob_size, stratify=y_train_encoded)

# dataset_oob KNeighborsClassifier
X_train_knn, X_oob_knn, y_train_knn, y_oob_knn = train_test_split(X_train_encoded, y_train_encoded, random_state=oob_seeds[3],test_size=oob_size, stratify=y_train_encoded)

## Creación de modelos 

In [7]:
rf  = RandomForestClassifier(criterion="gini", n_estimators=100,class_weight='balanced')

In [8]:
dt  = DecisionTreeClassifier(criterion="gini", class_weight='balanced')

In [9]:
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=9603)

In [10]:
knn = knn = KNeighborsClassifier(n_neighbors=5, weights='distance')

## Entrenamiento de modelos

In [11]:
rf.fit(X_train_rf, y_train_rf)

In [12]:
dt.fit(X_train_dt, y_train_dt)

In [13]:
mlp.fit(X_train_mlp, y_train_mlp)

In [14]:
knn.fit(X_train_knn, y_train_knn)

## Validación OOB de los algoritmos

In [15]:
bagging_start_oob = time.time()
rf_start_oob = time.time()
y_pred_rf_oob = rf.predict_proba(X_oob_rf)
rf_end_oob = time.time()
dt_start_oob = time.time()
y_pred_dt_oob = dt.predict_proba(X_oob_dt)
dt_end_oob = time.time()
mlp_start_oob = time.time()
y_pred_mlp_oob = mlp.predict_proba(X_oob_mlp)
mlp_end_oob = time.time()
knn_start_oob = time.time()
y_pred_knn_oob = knn.predict_proba(X_oob_knn)
knn_end_oob = time.time()
bagging_end_oob = time.time()
rf_acc = accuracy_score(y_oob_rf, y_pred_rf_oob)
dt_acc = accuracy_score(y_oob_dt, y_pred_dt_oob)
mlp_acc = accuracy_score(y_oob_mlp, y_pred_mlp_oob) 
knn_acc = accuracy_score(y_oob_knn, y_pred_knn_oob) 
bagging_acc = (rf_acc+dt_acc+mlp_acc+knn_acc)/4
results_list_oob = [
  {
    "name" : "IDSBaggingClassifier",
    "ms" : bagging_end_oob - bagging_start_oob,
    "results" : bagging_acc
  },
  {
    "name" : "RandomForestClassifier",
    "ms" : rf_end_oob - rf_start_oob,
    "results" : rf_acc
  },
  {
    "name" : "DecisionTreeClassifier",
    "ms" : dt_end_oob - dt_start_oob,
    "results" : dt_acc
  },
  {
    "name" : "MLPClassifier",
    "ms" : mlp_end_oob - mlp_start_oob,
    "results" : mlp_acc
  },
  {
    "name" : "KNeighborsClassifier",
    "ms" : knn_end_oob - knn_start_oob,
    "results" : knn_acc
  }
]

ValueError: Found input variables with inconsistent numbers of samples: [236937, 12]

## Pruebas

### Utilidades

In [None]:
# predict_probas() : Predicción del modelo Bagging e instancias internas, devuelve un diccionario con los resultados
def predict_probas(X) : 
  bagging_start = time.time()
  rf_start = time.time()
  y_pred_rf = rf.predict_proba(X)
  rf_end = time.time()
  dt_start = time.time()
  y_pred_dt = dt.predict_proba(X)
  dt_end = time.time()
  mlp_start = time.time()
  y_pred_mlp = mlp.predict_proba(X)
  mlp_end = time.time()
  knn_start = time.time()
  y_pred_knn = knn.predict_proba(X)
  knn_end = time.time()
  bagging_end = time.time()
  y_pred_avg = (y_pred_rf + y_pred_dt + y_pred_mlp + y_pred_knn) / 4
  results_list = [
    {
      "name" : "IDSBaggingClassifier",
      "ms" : bagging_end - bagging_start,
      "results" : y_pred_avg
    },
    {
      "name" : "RandomForestClassifier",
      "ms" : rf_end - rf_start,
      "results" : y_pred_rf
    },
    {
      "name" : "DecisionTreeClassifier",
      "ms" : dt_end - dt_start,
      "results" : y_pred_dt
    },
    {
      "name" : "MLPClassifier",
      "ms" : mlp_end - mlp_start,
      "results" : y_pred_mlp
    },
    {
      "name" : "KNeighborsClassifier",
      "ms" : knn_end - knn_start,
      "results" : y_pred_knn
    }
  ]
  return results_list

In [17]:
# get_results() : Función para automatizar la generación de la Matriz de confusión y AUC de las curvas ROC y Sensibilidad Precisión
def get_results(y_test, y_pred, threshold, model_name, label_name : str) :
  y_test_new = y_test
  y_pred_new = y_pred
  y_pred_under_threshold = y_pred >= threshold
  y_pred_for_cm = y_pred_under_threshold.astype(int)
  """
  Matriz de confusión
              Predicted
              0     1
  Actual 0  [[TN,   FP],
         1   [FN,   TP]]
  """
  cm = confusion_matrix(y_test_new, y_pred_for_cm)
  auc_roc = roc_auc_score(y_test_new, y_pred_new)
  precision, recall, pr_thresholds = precision_recall_curve(y_test_new, y_pred_new)
  auc_pr = auc(recall, precision)

  print(f"""
  {model_name}
  {label_name}
  {threshold}
  {cm}
  {auc_roc}
  {auc_pr}
  """)
  
  return cm, auc_roc, auc_pr

In [None]:
# save_results() : Función para guardar resultados de los modelos en formato csv
def save_results(results_list, y_test_, file_name) :
  with pd.ExcelWriter(f'output/{file_name}.xlsx') as writer:
    for i in results_list : 
      df_results = pd.DataFrame(columns=["label", "TP", "TN", "FP", "FN", "AUC_ROC", "AUC_PR", "ms"])
      for j in range(len(label_columns)) : 
        cm, auc_roc, auc_pr = get_results(
          y_test=y_test_[:,j],
          y_pred=i["results"][:,j],
          model_name=i["name"],
          label_name=label_columns[j],
          threshold=default_threshold,
        )
        df_results.loc[len(df_results)] = [
          f"{label_columns[j]}", 
          cm[1][1], cm[0][0], cm[0][1], cm[1][0], auc_roc, auc_pr, i["ms"]
        ]
      df_results.to_excel(writer, sheet_name=i["name"])

In [69]:
y

NameError: name 'y' is not defined

In [None]:
# one_hot_encode() : Función para 
def one_hot_encode(y, classes) : 
  one_hot = np.zeros((len(y), len(classes)))
  for i, label in enumerate(y) : 
    class_index = np.where(classes == label)[0][0]
    one_hot[i, class_index] = 1
  return one_hot

In [33]:
y_train

Unnamed: 0,label
0,BENIGN
1,BENIGN
2,BENIGN
3,BENIGN
4,PORTSCAN
...,...
947741,WEB_ATTACK_XSS
947742,WEB_ATTACK_XSS
947743,WEB_ATTACK_XSS
947744,WEB_ATTACK_XSS


In [60]:
dt.fit(X_train.values, y_train.values)

In [62]:
dt.predict_proba(X_valid.values)

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [58]:
y_train_encoded = one_hot_encode(y_train.values, dt.classes_)
y_train_encoded

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [63]:
X_train.values

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        7.62393388e-04, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        8.91350715e-05, 0.00000000e+00, 0.00000000e+00],
       [9.95149691e-03, 1.13731393e-02, 2.66506190e-03, ...,
        7.46666606e-01, 0.00000000e+00, 0.00000000e+00],
       ...,
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        1.01178470e-05, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        1.02975717e-05, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        1.02812698e-05, 0.00000000e+00, 0.00000000e+00]])

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

### Validación de modelos

In [19]:
results_list_valid = predict_probas(X=X_valid_encoded)

AttributeError: 'list' object has no attribute 'shape'

In [None]:
save_results(
  results_list=results_list_valid,
  y_test_=y_valid_encoded, 
  file_name="valid/results"
)

IndexError: At least one sheet must be visible

### Testeo de modelos

In [None]:
results_list_test = predict_probas(X=X_test_encoded)

In [None]:
save_results(
  results_list=results_list_test, 
  y_test_=y_test_encoded,
  file_name="test/results"
)


  IDSBaggingClassifier
  BENIGN
  0.5
  [[ 8171   280]
 [10183 16238]]
  0.8684535368880271
  0.9587669092937364
  

  IDSBaggingClassifier
  BOT
  0.5
  [[34572   270]
 [    8    22]]
  0.9627040162256282
  0.5403856845447857
  

  IDSBaggingClassifier
  DDOS
  0.5
  [[32921    10]
 [  315  1626]]
  0.9988253271077735
  0.9924924891345227
  

  IDSBaggingClassifier
  DOS_GOLDENEYE
  0.5
  [[33888   828]
 [   22   134]]
  0.9888813921608599
  0.7614718732189543
  

  IDSBaggingClassifier
  DOS_HULK
  0.5
  [[29930  1440]
 [  918  2584]]
  0.9256506460081921
  0.7986212224369507
  

  IDSBaggingClassifier
  DOS_SLOWHTTPTEST
  0.5
  [[34770    19]
 [   34    49]]
  0.9844515317298398
  0.6229822192594479
  

  IDSBaggingClassifier
  DOS_SLOWLORIS
  0.5
  [[34506   278]
 [   23    65]]
  0.9851729439345153
  0.7171424388366175
  

  IDSBaggingClassifier
  FTP_PATATOR
  0.5
  [[34735    17]
 [   29    91]]
  0.8672678311080417
  0.7598164435478123
  

  IDSBaggingClassifier
  PORTSCAN
  0

## mhm

In [None]:
# Guardando resultados detallados
# with pd.ExcelWriter('output/detailed_results.xlsx') as writer:
#   for i in results_list : 
#     df_results = pd.DataFrame(columns=["label", "TP", "TN", "FP", "FN", "ms"])
#     for j in range(len(encoder.classes_)) : 
#       fpr, tpr, roc_thresholds = roc_curve(y_test_encoded[:, j], i["results"][:,j])
#       precision, recall, pr_thresholds = precision_recall_curve(y_test_encoded[:, j], i["results"][:,j])
#       for threshold in roc_thresholds : 
#         cm, auc = get_results(
#           y_test=y_test_encoded[:, j],
#           y_pred=i["results"][:,j],
#           threshold=threshold,
#           model_name=i["name"],
#           label_name=encoder.classes_[j],
#           general=False if encoder.classes_[j] != "BENIGN" else True 
#         )
#         df_results.loc[len(df_results)] = [
#           f"{i["name"]}_{encoder.classes_[j] if encoder.classes_[j] != "BENIGN" else "GENERAL"}_{threshold}_thsld", 
#           cm[1][1], cm[0][0], cm[0][1], cm[1][0], i["ms"]]
#     df_results.to_excel(writer, sheet_name=i["name"])