# Modelado - Ensamble Bagging

In [2]:
# Importaciones
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import time
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_recall_curve, auc

## 1. Utils

In [3]:
SEED = 9603

# Número de modelos 
N_MODELS = 4
prng = np.random.RandomState(seed=SEED)
max_int32 = np.iinfo(np.int32).max
SEEDS_POR_MODELO = prng.randint(0, max_int32, size=N_MODELS)
print(SEEDS_POR_MODELO)

DB_PATH = 'DB/model_evaluation/'

CARACTERISTICA_OBJETIVO = 'label'

[793494059 498241738 377997800 912782427]


In [4]:
bagging_model = [
  {
    'path_name' : 'rf', 
    'model_name' : 'RandomForestClassifier',
    'model' : make_pipeline(
      StandardScaler(),
      RandomForestClassifier(criterion="gini", class_weight='balanced', max_depth=20, n_estimators=100, bootstrap=True, random_state=SEEDS_POR_MODELO[0])
    )
  },
  {
    'path_name' : 'dt', 
    'model_name' : 'DecisionTreeClassifier',
    'model' : make_pipeline(
      StandardScaler(),
      DecisionTreeClassifier(criterion="gini", class_weight='balanced', max_depth=20, random_state=SEEDS_POR_MODELO[1])
    )
  },
  {
    'path_name' : 'mlp',
    'model_name' : 'MLPClassifier',
    'model' : make_pipeline(
      StandardScaler(),
      MLPClassifier(hidden_layer_sizes=(64,), max_iter=100, early_stopping=True, random_state=SEEDS_POR_MODELO[2])
    )
  },
  {
    'path_name' : 'knn',
    'model_name' : 'KNeighborsClassifier',
    'model' : make_pipeline(
      StandardScaler(),
      KNeighborsClassifier(n_neighbors=3, weights='distance', metric='manhattan')
    )
  },
]

In [5]:
df = pd.read_csv(f'{DB_PATH}/1/df_train_rf.csv')
encoder = LabelEncoder()
encoder.fit(df[CARACTERISTICA_OBJETIVO])
LABELS = encoder.classes_ 

In [6]:
# getFrequency(): Función para obtener la distribución de frecuencias de la columna label
def getFrequency(df : pd.DataFrame, caracteristica) :
  frecuencia = df[caracteristica].value_counts()
  porcentaje = df[caracteristica].value_counts(normalize=True) * 100

  tabla_frecuencia = pd.DataFrame({
    "Frecuencia": frecuencia,
    "Frecuencia(%)": porcentaje
  })
  
  tabla_frecuencia["Frecuencia(%)"].round(2)
  
  print(tabla_frecuencia)
  print(df.shape)

In [7]:
# get_results() : Función para automatizar la generación de la Matriz de confusión y AUC de las curvas ROC y Sensibilidad Precisión
def get_results(y_test, y_pred, threshold) :
  y_pred_under_threshold = y_pred >= threshold
  y_pred_for_cm = y_pred_under_threshold.astype(int)
  """
  Matriz de confusión
              Predicted
              0     1
  Actual 0  [[TN,   FP],
         1   [FN,   TP]]
  """
  cm = confusion_matrix(y_test, y_pred_for_cm)
  auc_roc = roc_auc_score(y_test, y_pred)
  precision, recall, pr_thresholds = precision_recall_curve(y_test, y_pred)
  auc_sp = auc(recall, precision)
  
  return cm, auc_roc, auc_sp

In [8]:
# one_hot_encode() : Función para codificar la variable objetivo en razón de las clases que codificó la instancia de Machine Learning
def one_hot_encode(y, classes) : 
  one_hot = np.zeros((len(y), len(classes)))
  for i, label in enumerate(y) :
    one_hot[i, int(label)] = 1
  return one_hot

In [9]:
# save_df() : Función para guardar resultados
def save_df(X_, y_, save_path) : 
  df_save = pd.DataFrame(X_, columns=CAT_SELECTED_CHARS+NUM_SELECTED_CHARS)
  df_save[CARACTERISTICA_OBJETIVO] = y_

  df_save.to_csv(save_path, index=False) 

In [None]:
df_validation = pd.DataFrame(columns=["iter", "model", "label", "TP", "TN", "FP", "FN", "ms", "AUC_PS"])
df_testing    = pd.DataFrame(columns=["iter", "model", "label", "TP", "TN", "FP", "FN", "Exac", "Prec", "Sens", "F1", "ms", "AUC_PS"])

for iter in range(10) : 
  print(f"""
    Iteración {iter+1}
  """)
  results_list = []
  for instance in bagging_model : 
    print("Entrenamiento de modelo", instance['model_name'])
    
    df_train = pd.read_csv(f"{DB_PATH}{iter+1}/df_train_{instance['path_name']}.csv")
    X_train = df_train.drop([CARACTERISTICA_OBJETIVO], axis=1).values
    y_train = encoder.transform(df_train[CARACTERISTICA_OBJETIVO].values.ravel())

    print("Distribución de conjunto de entrenamiento")
    getFrequency(df_train, 'label')

    instance['model'].fit(X_train, y_train)

  for instance in bagging_model : 
    print("Validación de modelo", instance['model_name'])

    df_valid = pd.read_csv(f"{DB_PATH}{iter+1}/df_valid_{instance['path_name']}.csv")
    X_valid = df_valid.drop([CARACTERISTICA_OBJETIVO], axis=1).values
    y_valid = one_hot_encode(
      encoder.transform(df_valid[CARACTERISTICA_OBJETIVO].values.ravel()), 
      LABELS
    )

    print("Distribución de conjunto de validación")
    getFrequency(df_valid, 'label')

    valid_start = time.time()
    y_valid_pred = instance['model'].predict_proba(X_valid)
    valid_end = time.time()

    for i in range(len(LABELS)) : 
      cm, auc_roc, auc_sp = get_results(
        y_test=y_valid[:,i],
        y_pred=y_valid_pred[:,i],
        threshold=0.5
      )
      df_validation.loc[len(df_validation)] = [
        f"Iteración {iter+1}",
        instance["model_name"], 
        LABELS[i],
        cm[1][1], 
        cm[0][0], 
        cm[0][1], 
        cm[1][0], 
        valid_end-valid_start, 
        auc_sp
      ]


  for instance in bagging_model :
    X_test = pd.read_csv(f"{DB_PATH}{iter+1}/df_test_{instance['path_name']}.csv").drop([CARACTERISTICA_OBJETIVO], axis=1).values
    instance['test_data'] = X_test
  y_test = pd.read_csv(f"{DB_PATH}{iter+1}/df_test_rf.csv")[CARACTERISTICA_OBJETIVO]
  y_test = one_hot_encode(
    encoder.transform(y_test.values.ravel()), 
    LABELS
  )
  y_pred_bagging = np.zeros_like(y_test)

  """ Prueba de los modelos """
  bagging_start = time.perf_counter()
  for instance in bagging_model :
    print("Prueba del modelo", instance['model_name'])

    pred_start = time.perf_counter()
    y_pred = instance["model"].predict_proba(instance["test_data"])
    pred_end   = time.perf_counter()

    results_list.append({
      "model" : instance["model_name"],
      "pred" : y_pred,
      "time" : (pred_end - pred_start)*1000,
    })
    y_pred_bagging += y_pred
  y_pred_bagging /= len(bagging_model)
  bagging_end = time.perf_counter()

  results_list.append({
    'model' : 'IDSBaggingClassifier',
    'pred' : y_pred_bagging,
    'time' : (bagging_end-bagging_start)*1000
  })
  print(f'results_list : {len(results_list)}')

  for result in results_list : 
    for i in range(len(LABELS)) :
      cm, auc_roc, auc_sp = get_results(
        y_test=y_test[:,i],
        y_pred=result["pred"][:,i],
        threshold=0.5
      )
      exac = (cm[1][1] + cm[0][0])/(cm[1][1]+cm[0][0]+cm[0][1]+cm[1][0])
      prec = (cm[1][1])/(cm[1][1]+cm[0][1])
      sens = (cm[1][1])/(cm[1][1]+cm[1][0])
      F1sc = (sens*prec*2)/(sens+prec)
      df_testing.loc[len(df_testing)] = [
        f"Iteración {iter+1}",
        result["model"],
        LABELS[i],
        cm[1][1], 
        cm[0][0], 
        cm[0][1], 
        cm[1][0], 
        exac,
        prec,
        sens,
        F1sc, 
        result["time"]*1000,
        auc_sp
      ]
  iter+=1


    Iteración 1
  
Entrenamiento de modelo RandomForestClassifier
Distribución de conjunto de entrenamiento
                  Frecuencia  Frecuencia(%)
label                                      
BENIGN                 77747      47.036154
PORTSCAN               13428       8.123805
DOS_HULK               13371       8.089321
DDOS                    8678       5.250103
SSH_PATATOR             8678       5.250103
DOS_SLOWLORIS           8678       5.250103
DOS_SLOWHTTPTEST        8678       5.250103
DOS_GOLDENEYE           8678       5.250103
FTP_PATATOR             8678       5.250103
BOT                     8678       5.250103
(165292, 11)
Entrenamiento de modelo DecisionTreeClassifier
Distribución de conjunto de entrenamiento
                  Frecuencia  Frecuencia(%)
label                                      
BENIGN                 77754      47.038397
PORTSCAN               13428       8.123461
DOS_HULK               13371       8.088978
DDOS                    8678       5.2498



Entrenamiento de modelo KNeighborsClassifier
Distribución de conjunto de entrenamiento
                  Frecuencia  Frecuencia(%)
label                                      
BENIGN                 77757      47.039358
PORTSCAN               13428       8.123314
DOS_HULK               13371       8.088831
DDOS                    8678       5.249785
DOS_GOLDENEYE           8678       5.249785
DOS_SLOWLORIS           8678       5.249785
FTP_PATATOR             8678       5.249785
SSH_PATATOR             8678       5.249785
BOT                     8678       5.249785
DOS_SLOWHTTPTEST        8678       5.249785
(165302, 11)
Validación de modelo RandomForestClassifier
Distribución de conjunto de validación
                  Frecuencia  Frecuencia(%)
label                                      
BENIGN                 49148      68.054113
PORTSCAN                8467      11.724062
DOS_HULK                8425      11.665905
DDOS                    5042       6.981542
DOS_GOLDENEYE            

In [11]:
df_validation.to_excel("../5_analisis_de_resultados/DB/validation_results.xlsx", index=False)
df_testing.to_excel("../5_analisis_de_resultados/DB/testing_results.xlsx", index=False)
print("Guardado")

Guardado
