Conjunto de datos utlizado: [CICIDS2017](https://www.unb.ca/cic/datasets/ids-2017.html)

In [1]:
# Importaciones
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import time
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTENC
from imblearn.under_sampling import TomekLinks
from itertools import combinations
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_recall_curve, auc
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import resample

## 1. Configuración

In [2]:
SEED = 9603

TRAIN_SIZE = 0.75

CARACTERISTICAS_VAR_INF  = ['flow_bytes/s']

CARACTERISTICAS_VAR_NULL = ['flow_iat_min', 'flow_bytes/s']

NUM_SELECTED_CHARS = [
  'bwd_iat_min', 'average_packet_size', 'flow_bytes/s', 'flow_iat_min', 
  'fwd_iat_min', 'fwd_bytes/bulk_avg', 'subflow_bwd_packets', 'active_std', 
  'bwd_bytes/bulk_avg', 'flow_duration', 'fwd_packet_length_max', 'subflow_fwd_packets', 
  'down/up_ratio', 'idle_max', 'fwd_act_data_pkts', 'rst_flag_count',
  'cwr_flag_count', 
]

CAT_SELECTED_CHARS = ['port_type_registered', 'port_type_well_known', 'protocol_6', 'protocol_17']

CARACTERISTICA_OBJETIVO = "label"

## 2. Carga y transformación de datos

### Carga de datos

In [3]:
df_dataset = pd.read_csv(f"DB/dataset.csv")

### Transformación de datos

In [4]:
# Transformación de característica dst_port
"""
  Clasificación de IANA de puertos
  0 - 1023 : Well-known ports, puertos utilizados por el sistema (o root) o programas con privilegios de usuario 
  1024 - 49151 : Registered ports, puertos utilizados en procesos o programas de usuarios ordinarios
  49152 - 65535 : Dynamic and/or Private ports, puertos privados o dinámicos
"""

dst_port_encoder = OneHotEncoder(drop="first", sparse_output=False)

bins = [-1, 1023, 49151, 65535]
port_type_tags = ["well_known", "registered", "dynamic_private"]
df_dataset["port_type"] = pd.cut(df_dataset["dst_port"], bins=bins, labels=port_type_tags)

port_type_encoded = dst_port_encoder.fit_transform(df_dataset[["port_type"]])
df_port_type_encoded = pd.DataFrame(
  columns=dst_port_encoder.get_feature_names_out(),
  data=port_type_encoded
)

df_dataset = pd.concat(
  [
    df_port_type_encoded.reset_index(drop=True), 
    df_dataset.drop(["dst_port", 'port_type'], axis=1).reset_index(drop=True)
  ],
  axis=1
)

df_dataset

Unnamed: 0,port_type_registered,port_type_well_known,protocol,flow_duration,total_fwd_packet,total_bwd_packets,total_length_of_fwd_packet,total_length_of_bwd_packet,fwd_packet_length_max,fwd_packet_length_min,...,fwd_seg_size_min,active_mean,active_std,active_max,active_min,idle_mean,idle_std,idle_max,idle_min,label
0,0.0,1.0,17,31522,2,2,72.0,234.0,36.0,36.0,...,8,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,BENIGN
1,0.0,1.0,6,118265328,34,34,1013.0,34323.0,205.0,0.0,...,32,599222.5,742745.670065,1124423.0,74022.0,58518591.0,583972.620524,58931522.0,58105660.0,BENIGN
2,0.0,1.0,6,4014099,9,5,20.0,11595.0,20.0,0.0,...,20,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,DDOS
3,0.0,1.0,6,280949,10,6,391.0,11595.0,391.0,0.0,...,32,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,DOS_HULK
4,0.0,1.0,6,72016,10,8,339.0,11606.0,339.0,0.0,...,20,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,DOS_HULK
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169923,0.0,1.0,6,3246,7,7,353.0,11595.0,353.0,0.0,...,32,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,DOS_HULK
169924,0.0,1.0,17,30774,4,2,136.0,250.0,34.0,34.0,...,8,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,BENIGN
169925,0.0,1.0,6,149954,7,7,344.0,11595.0,344.0,0.0,...,20,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,DOS_HULK
169926,0.0,1.0,6,5873867,8,6,20.0,11595.0,20.0,0.0,...,20,526011.0,0.000000,526011.0,526011.0,5347856.0,0.000000,5347856.0,5347856.0,DDOS


In [5]:
# Codificación de característica protocol
df_dataset["protocol_6"] = (df_dataset["protocol"]==6).astype(int)
df_dataset["protocol_17"] = (df_dataset["protocol"]==17).astype(int)
df_dataset = df_dataset[CAT_SELECTED_CHARS+NUM_SELECTED_CHARS+[CARACTERISTICA_OBJETIVO]]
df_dataset

Unnamed: 0,port_type_registered,port_type_well_known,protocol_6,protocol_17,bwd_iat_min,average_packet_size,flow_bytes/s,flow_iat_min,fwd_iat_min,fwd_bytes/bulk_avg,...,bwd_bytes/bulk_avg,flow_duration,fwd_packet_length_max,subflow_fwd_packets,down/up_ratio,idle_max,fwd_act_data_pkts,rst_flag_count,cwr_flag_count,label
0,0.0,1.0,0,1,3.0,76.500000,9.707506e+03,3.0,49.0,0,...,0,31522,36.0,0,1.000000,0.0,1,0,0,BENIGN
1,0.0,1.0,1,0,49.0,519.647059,2.987858e+02,1.0,83.0,0,...,10094,118265328,205.0,0,1.000000,58931522.0,9,0,0,BENIGN
2,0.0,1.0,1,0,10.0,829.642857,2.893551e+03,10.0,186.0,0,...,0,4014099,20.0,0,0.555556,0.0,1,1,0,DDOS
3,0.0,1.0,1,0,45.0,749.125000,4.266255e+04,0.0,0.0,0,...,11595,280949,391.0,0,0.600000,0.0,1,1,0,DOS_HULK
4,0.0,1.0,1,0,88.0,663.611111,1.658659e+05,4.0,4.0,0,...,11606,72016,339.0,0,0.800000,0.0,1,5,0,DOS_HULK
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169923,0.0,1.0,1,0,1.0,853.428571,3.680838e+06,1.0,4.0,0,...,11595,3246,353.0,0,1.000000,0.0,1,1,0,DOS_HULK
169924,0.0,1.0,0,1,49.0,64.333333,1.254306e+04,3.0,3.0,136,...,0,30774,34.0,0,0.500000,0.0,3,0,0,BENIGN
169925,0.0,1.0,1,0,46.0,852.785714,7.961775e+04,1.0,1.0,0,...,11595,149954,344.0,0,1.000000,0.0,1,2,0,DOS_HULK
169926,0.0,1.0,1,0,13.0,829.642857,1.977403e+03,1.0,1.0,0,...,11595,5873867,20.0,0,0.750000,5347856.0,1,1,0,DDOS


In [6]:
# Reemplazo de valores infinitos a valores nulos para imputar a la mediana
for i in CARACTERISTICAS_VAR_INF : 
  df_dataset[i] = df_dataset[i].replace([np.inf], np.nan)
df_dataset

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dataset[i] = df_dataset[i].replace([np.inf], np.nan)


Unnamed: 0,port_type_registered,port_type_well_known,protocol_6,protocol_17,bwd_iat_min,average_packet_size,flow_bytes/s,flow_iat_min,fwd_iat_min,fwd_bytes/bulk_avg,...,bwd_bytes/bulk_avg,flow_duration,fwd_packet_length_max,subflow_fwd_packets,down/up_ratio,idle_max,fwd_act_data_pkts,rst_flag_count,cwr_flag_count,label
0,0.0,1.0,0,1,3.0,76.500000,9.707506e+03,3.0,49.0,0,...,0,31522,36.0,0,1.000000,0.0,1,0,0,BENIGN
1,0.0,1.0,1,0,49.0,519.647059,2.987858e+02,1.0,83.0,0,...,10094,118265328,205.0,0,1.000000,58931522.0,9,0,0,BENIGN
2,0.0,1.0,1,0,10.0,829.642857,2.893551e+03,10.0,186.0,0,...,0,4014099,20.0,0,0.555556,0.0,1,1,0,DDOS
3,0.0,1.0,1,0,45.0,749.125000,4.266255e+04,0.0,0.0,0,...,11595,280949,391.0,0,0.600000,0.0,1,1,0,DOS_HULK
4,0.0,1.0,1,0,88.0,663.611111,1.658659e+05,4.0,4.0,0,...,11606,72016,339.0,0,0.800000,0.0,1,5,0,DOS_HULK
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169923,0.0,1.0,1,0,1.0,853.428571,3.680838e+06,1.0,4.0,0,...,11595,3246,353.0,0,1.000000,0.0,1,1,0,DOS_HULK
169924,0.0,1.0,0,1,49.0,64.333333,1.254306e+04,3.0,3.0,136,...,0,30774,34.0,0,0.500000,0.0,3,0,0,BENIGN
169925,0.0,1.0,1,0,46.0,852.785714,7.961775e+04,1.0,1.0,0,...,11595,149954,344.0,0,1.000000,0.0,1,2,0,DOS_HULK
169926,0.0,1.0,1,0,13.0,829.642857,1.977403e+03,1.0,1.0,0,...,11595,5873867,20.0,0,0.750000,5347856.0,1,1,0,DDOS


### División de conjunto en vector X y vector y

In [7]:
df_X = df_dataset.drop(["label"], axis=1)
df_y = df_dataset[["label"]]

X = df_X.values
y = df_y.values
LABELS = np.unique(y)
COLUMNS = list(df_X.columns)
print(f"""
X : {X.shape}
y : {y.shape}
labels: {LABELS}
""")


X : (169928, 21)
y : (169928, 1)
labels: ['BENIGN' 'BOT' 'DDOS' 'DOS_GOLDENEYE' 'DOS_HULK' 'DOS_SLOWHTTPTEST'
 'DOS_SLOWLORIS' 'FTP_PATATOR' 'PORTSCAN' 'SSH_PATATOR']



## 3. K iteraciones

### Configuración y encoders

In [8]:
# Modelo Bagging
bagging_model = [
  {
    "name" : "rf",
    "model_name" : "RandomForestClassifier",
  },
  {
    "name" : "dt",
    "model_name" : "DecisionTreeClassifier",
  },
  {
    "name" : "mlp",
    "model_name" : "MLPClassifier",
  },
  {
    "name" : "knn",
    "model_name": "KNeighborsClassifier",
  }
]

In [9]:
# Encoders y configuración de las k-iteraciones
num_chars_scaler = MinMaxScaler(feature_range=(0, 1), clip=True)
obj_encoder      = LabelEncoder()

# Número de iteraciones 
K_ITERS = 10

# Número de modelos 
N_MODELS = len(bagging_model)
prng = np.random.RandomState(seed=SEED)
max_int32 = np.iinfo(np.int32).max
SEEDS_POR_MODELO = prng.randint(0, max_int32, size=N_MODELS)

# Diccionario para imputar valores faltantes en 
MEDIANAS_POR_ITER = []

In [10]:
# Codificación de variable objetivo con LabelEncoder()
y = obj_encoder.fit_transform(y.ravel())

In [11]:
# Declaración de valores necesarios para K-Fold
skf = StratifiedKFold(n_splits=K_ITERS, shuffle=True, random_state=SEED)

# Contador de iteraciones
iter_cont = 0

### 4.1. Utils

In [12]:
# getFrequency(): Función para obtener la distribución de frecuencias de la columna label
def getFrequency(df : pd.DataFrame, caracteristica) :
  frecuencia = df[caracteristica].value_counts()
  porcentaje = df[caracteristica].value_counts(normalize=True) * 100

  tabla_frecuencia = pd.DataFrame({
    "Frecuencia": frecuencia,
    "Frecuencia(%)": porcentaje
  })
  
  tabla_frecuencia["Frecuencia(%)"].round(2)
  
  print(tabla_frecuencia)
  print(df.shape)

In [13]:
# bagging_resample() : Función para muestreo Bagging
def bagging_resample(X_, y_, seed) :
  indices = np.arange(len(X_)) 
  bootstrap_indices = resample(
    indices,
    replace=True,
    stratify=y_,
    random_state=seed,
    n_samples=int(round(len(X_)*TRAIN_SIZE, 0))
  )
  # np.setdiff1d permite obtener los índices que no aparecen 
  # en los indices tras el muestreo Bootstrap
  oob_indices = np.setdiff1d(indices, bootstrap_indices)

  X_train = X_[bootstrap_indices]
  y_train = y_[bootstrap_indices]
  X_valid = X_[oob_indices]
  y_valid = y_[oob_indices]

  return X_train, y_train, X_valid, y_valid

In [14]:
def obtener_medianas_de_modelo(df_ : pd.DataFrame, caracteristicas : list) : 
  medianas = []
  for caracteristica in caracteristicas : 
    medianas.append({
      'caracteristica' : caracteristica,
      'mediana' : df_[caracteristica].median()
    })
  return medianas

In [15]:
# imputacion_con_filtro_datos_nulos() : Imputación de datos nulos con la mediana, se aplican las medianas filtrando por la variable objetivo
def imputacion_con_filtro_datos_nulos(df_ : pd.DataFrame, caracteristica : str) : 
  df = df_.copy()
  mask = df[caracteristica].isna()
  # Se obtienen las etiquetas de la variable objetivo que tienen valores nulos
  null_values_obj_tags = df[mask][CARACTERISTICA_OBJETIVO].unique().tolist()
  for obj_tag in null_values_obj_tags : 
    # Obtención de mediana de la columna filtrandolo por la variable objetivo
    median = df[df[CARACTERISTICA_OBJETIVO] == obj_tag][caracteristica].median()

    # Reemplazo de valores nulos por la mediana obtenida
    df.loc[mask & (df[CARACTERISTICA_OBJETIVO] == obj_tag), caracteristica] = median

    # Cálculo de posibles características que no se hayan imputado
    rest = df[df[CARACTERISTICA_OBJETIVO] == obj_tag][caracteristica].isna().sum()

    # Cálculo de la media para reemplazar en nuevos datos

    # print(f"Imputación de valores nulos: {caracteristica}")
    # print(df[mask]["label"].value_counts())
    # print(f"Mediana de la característica {caracteristica} en la etiqueta {i}: {median}")
    # print(f"Valores nulos en {caracteristica} en {i} después de la imputación: {rest}")

  # Cálculo de la media para reemplazar en nuevos datos
  # print(f"Mediana de {caracteristica} : {MEDIANS[caracteristica]}")

  return df

In [16]:
# smote_nc_resample() : Función para generar nuevos datos con SMOTENC
def smote_nc_resample(X_, y_, categorical_features, seed) : 
  cant_benign = len(y_[y_ == 'BENIGN']) 
  strategy = {
    'BENIGN' : cant_benign
  }
  for i in range(1,10) : 
    cant_pred = int(round(cant_benign/(len(LABELS)-1), 0))
    cant_real = len(y_[y_ == LABELS[i]])
    strategy[LABELS[i]] = cant_pred if cant_pred>cant_real else cant_real
    
  smt_nc = SMOTENC(
    categorical_features=categorical_features,
    random_state=seed,
    sampling_strategy=strategy
  )

  X_resample, y_resample = smt_nc.fit_resample(X_, y_) 

  getFrequency(
    pd.DataFrame(
      y_resample,
      columns=[CARACTERISTICA_OBJETIVO]
    ),
    CARACTERISTICA_OBJETIVO
  )

  return X_resample, y_resample

In [17]:
# tomek_links_subsample() : Función para submuestrear los datos con Tomek links
def tomek_links_subsample(X_ : pd.DataFrame, y_ : pd.Series) :   
  tmk = TomekLinks(
    sampling_strategy=['BENIGN']
  )
  # y_binary = y_.map(lambda x: 1 if x == 'BENIGN' else 0)
  X_tmk_resample, y_tmk_resample = tmk.fit_resample(X_, y_)
  
  getFrequency(
    pd.DataFrame(
      y_tmk_resample,
      columns=[CARACTERISTICA_OBJETIVO]
    ),
    CARACTERISTICA_OBJETIVO
  )

  return X_tmk_resample, y_tmk_resample

In [18]:
# manageOutliersToMinMax() : Función para manejar outliers y transformarlos a los valores mínimos y máximos en un boxplot aplicando filtros en la variable objetivo
def manageOutliersToMinMax(df : pd.DataFrame, caracteristica : str) :
  df_modificado = df.copy()
  mask = df[CARACTERISTICA_OBJETIVO] == "BENIGN"
  
  subset = df[mask]
  q1 = subset[caracteristica].quantile(0.25)
  q3 = subset[caracteristica].quantile(0.75) 
  iqr = q3 - q1
  lower_bound = q1 - 1.5 * iqr
  upper_bound = q3 + 1.5 * iqr

  valid_values = subset[(subset[caracteristica] >= lower_bound) & (subset[caracteristica] <= upper_bound)][caracteristica]
  valid_min = valid_values.min()
  valid_max = valid_values.max()
  
  df_modificado.loc[mask & (df[caracteristica] < lower_bound), caracteristica] = valid_min
  df_modificado.loc[mask & (df[caracteristica] > upper_bound), caracteristica] = valid_max
  return df_modificado

In [None]:
def X_y_train_clean_resample(X_, y_, seed) : 
  df_ = pd.DataFrame(
    columns=CAT_SELECTED_CHARS+NUM_SELECTED_CHARS,
    data=X_
  )

  df_["label"] = y_

  """ Imputación de valores faltantes """
  for i in CARACTERISTICAS_VAR_NULL : 
    df_ = imputacion_con_filtro_datos_nulos(df_, i)

  df_ = df_[CAT_SELECTED_CHARS+NUM_SELECTED_CHARS+[CARACTERISTICA_OBJETIVO]]
  
  getFrequency(df_, CARACTERISTICA_OBJETIVO)

  """ Sobremuestreo con SMOTE """
  print("Sobremuestreo con SMOTE")
  X_smt, y_smt = smote_nc_resample(
    X_=df_,
    y_=obj_encoder.inverse_transform(df_[CARACTERISTICA_OBJETIVO]),
    categorical_features=list(range(len(CAT_SELECTED_CHARS))),
    seed=seed
  ) 
  print("Completado")

  """ Submuestreo con Tomek-Links """
  print("Submuestreo con Tomek-links")
  X_tmk, y_tmk = tomek_links_subsample(
    X_=pd.DataFrame(X_smt),
    y_=pd.Series(y_smt.ravel())
  )
  print("Completado")

  df_final = pd.DataFrame(X_tmk, columns=CAT_SELECTED_CHARS+NUM_SELECTED_CHARS)
  df_final[CARACTERISTICA_OBJETIVO] = y_tmk

  return df_final

def X_y_train_nrmlztn(df_ : pd.DataFrame, num_chars_scaler_) : 
  
  """ Normalización de datos """
  num_chars_encoded = num_chars_scaler_.transform(
    df_[NUM_SELECTED_CHARS]
  )

  """ Integración de datos """
  X_clean = pd.DataFrame(
    columns=NUM_SELECTED_CHARS,
    data=num_chars_encoded
  )

  X_clean[CAT_SELECTED_CHARS] = df_[CAT_SELECTED_CHARS]

  X_clean = X_clean[CAT_SELECTED_CHARS + NUM_SELECTED_CHARS]

  return X_clean, df_[CARACTERISTICA_OBJETIVO]

In [20]:
def X_test_preparation(X_, medianas, num_chars_scaler_) : 
  df_ = pd.DataFrame(
    columns=CAT_SELECTED_CHARS+NUM_SELECTED_CHARS,
    data=X_
  )

  """ Imputación de valores faltantes """
  for i in medianas : 
    df_[i['caracteristica']] = df_[i['caracteristica']].fillna(i['mediana'])

  """ Normalización de datos """
  num_chars_encoded = num_chars_scaler_.transform(df_[NUM_SELECTED_CHARS])

  """ Integración de datos """
  df_final = pd.DataFrame(
    columns=NUM_SELECTED_CHARS,
    data=num_chars_encoded
  )

  df_final[CAT_SELECTED_CHARS] = df_[CAT_SELECTED_CHARS]

  df_final = df_final[CAT_SELECTED_CHARS + NUM_SELECTED_CHARS]

  return df_final.values

In [21]:
def save_df(X_, y_, save_path) : 
  df_save = pd.DataFrame(X_, columns=CAT_SELECTED_CHARS+NUM_SELECTED_CHARS)
  df_save[CARACTERISTICA_OBJETIVO] = y_

  df_save.to_csv(save_path, index=False) 

### 4.2. Creación de conjuntos de entrenamiento, validación OOB y prueba con K-iteraciones 

In [22]:
for temp_index, test_index in skf.split(X, y) : 
  X_temp, X_test = X[temp_index], X[test_index]
  y_temp, y_test = y[temp_index], y[test_index]

  MEDIANAS_POR_ITER.append({
    'iter' : iter_cont+1,
    'medianas_por_modelo' : []
  })

  print(f"""
        Iteración {iter_cont+1}
  """)

  """ Entrenamiento y validación de los modelos """

  for i in range(len(SEEDS_POR_MODELO)) : 
    X_train, y_train, X_valid, y_valid = bagging_resample(
      X_temp, y_temp, SEEDS_POR_MODELO[i]
    )

    MEDIANAS_POR_ITER[iter_cont]['medianas_por_modelo'].append({
      'modelo' : bagging_model[i]['model_name'],
      'num_chars_scaler' : MinMaxScaler(feature_range=(0, 1), clip=True)
    })

    print(f"Modelo {bagging_model[i]["model_name"]}")

    """ Preparación del conjunto de entrenamiento """
    print("Preparación del conjunto de entrenamiento")
    df_ = X_y_train_clean_resample(X_train, y_train, seed=SEEDS_POR_MODELO[i])

    MEDIANAS_POR_ITER[iter_cont]['medianas_por_modelo'][i]['medianas'] = obtener_medianas_de_modelo(
      df_=df_,
      caracteristicas=CARACTERISTICAS_VAR_NULL
    )

    MEDIANAS_POR_ITER[iter_cont]['medianas_por_modelo'][i]['num_chars_scaler'].fit(df_[NUM_SELECTED_CHARS])
    X_train_clean, y_train_clean = X_y_train_nrmlztn(
      df_=df_,
      num_chars_scaler_=MEDIANAS_POR_ITER[iter_cont]['medianas_por_modelo'][i]['num_chars_scaler']
    )

    """ Preparación del conjunto de validación """
    print("Preparación del conjunto de validación")
    X_valid = X_test_preparation(
      X_valid, 
      MEDIANAS_POR_ITER[iter_cont]['medianas_por_modelo'][i]['medianas'],
      MEDIANAS_POR_ITER[iter_cont]['medianas_por_modelo'][i]['num_chars_scaler']
    )
    y_valid = obj_encoder.inverse_transform(y_valid)

    """ Preparación del conjunto de prueba """
    print("Preparación del conjunto de prueba")
    X_test_new = X_test_preparation(
      X_test,
      MEDIANAS_POR_ITER[iter_cont]['medianas_por_modelo'][i]['medianas'],
      MEDIANAS_POR_ITER[iter_cont]['medianas_por_modelo'][i]['num_chars_scaler']
    )
    y_test_new = obj_encoder.inverse_transform(y_test)

    save_df(
      X_=X_train_clean,
      y_=y_train_clean,
      save_path=f"../model/DB/{iter_cont+1}/df_train_{bagging_model[i]["name"]}.csv"
    )
    save_df(
      X_=X_valid,
      y_=y_valid,
      save_path=f"../model/DB/{iter_cont+1}/df_valid_{bagging_model[i]["name"]}.csv"
    )
    save_df(
      X_=X_test_new,
      y_=y_test_new,
      save_path=f"../model/DB/{iter_cont+1}/df_test_{bagging_model[i]["name"]}.csv"
    )
  iter_cont += 1


        Iteración 1
  
Modelo RandomForestClassifier
Preparación del conjunto de entrenamiento
       Frecuencia  Frecuencia(%)
label                           
0           78104      68.093565
8           13428      11.706960
4           13371      11.657265
2            8026       6.997323
3             639       0.557101
6             338       0.294679
7             335       0.292064
9             251       0.218830
5             147       0.128159
1              62       0.054054
(114701, 22)
Sobremuestreo con SMOTE
                  Frecuencia  Frecuencia(%)
label                                      
BENIGN                 78104      47.150300
PORTSCAN               13428       8.106297
DOS_HULK               13371       8.071887
DDOS                    8678       5.238788
SSH_PATATOR             8678       5.238788
DOS_SLOWLORIS           8678       5.238788
DOS_SLOWHTTPTEST        8678       5.238788
DOS_GOLDENEYE           8678       5.238788
FTP_PATATOR             8678   

[WinError 2] El sistema no puede encontrar el archivo especificado
  File "c:\Users\User\anaconda3\envs\ids_thesis\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\Users\User\anaconda3\envs\ids_thesis\Lib\subprocess.py", line 550, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\User\anaconda3\envs\ids_thesis\Lib\subprocess.py", line 1028, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\User\anaconda3\envs\ids_thesis\Lib\subprocess.py", line 1540, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


                  Frecuencia  Frecuencia(%)
label                                      
BENIGN                 77779      47.046406
PORTSCAN               13428       8.122233
DOS_HULK               13371       8.087755
DDOS                    8678       5.249087
SSH_PATATOR             8678       5.249087
DOS_SLOWLORIS           8678       5.249087
DOS_SLOWHTTPTEST        8678       5.249087
DOS_GOLDENEYE           8678       5.249087
FTP_PATATOR             8678       5.249087
BOT                     8678       5.249087
(165324, 1)
Completado
Preparación del conjunto de validación
Preparación del conjunto de prueba
Modelo DecisionTreeClassifier
Preparación del conjunto de entrenamiento
       Frecuencia  Frecuencia(%)
label                           
0           78104      68.093565
8           13428      11.706960
4           13371      11.657265
2            8026       6.997323
3             639       0.557101
6             338       0.294679
7             335       0.292064
9     

In [23]:
MEDIANAS_POR_ITER

[{'iter': 1,
  'medianas_por_modelo': [{'modelo': 'RandomForestClassifier',
    'num_chars_scaler': MinMaxScaler(clip=True),
    'medianas': [{'caracteristica': 'flow_iat_min',
      'mediana': np.float64(4.0)},
     {'caracteristica': 'flow_bytes/s',
      'mediana': np.float64(1389.8892408452489)}]},
   {'modelo': 'DecisionTreeClassifier',
    'num_chars_scaler': MinMaxScaler(clip=True),
    'medianas': [{'caracteristica': 'flow_iat_min',
      'mediana': np.float64(4.0)},
     {'caracteristica': 'flow_bytes/s',
      'mediana': np.float64(1450.1026120823822)}]},
   {'modelo': 'MLPClassifier',
    'num_chars_scaler': MinMaxScaler(clip=True),
    'medianas': [{'caracteristica': 'flow_iat_min',
      'mediana': np.float64(4.0)},
     {'caracteristica': 'flow_bytes/s',
      'mediana': np.float64(1432.127772649716)}]},
   {'modelo': 'KNeighborsClassifier',
    'num_chars_scaler': MinMaxScaler(clip=True),
    'medianas': [{'caracteristica': 'flow_iat_min',
      'mediana': np.float64(4.0

In [24]:
SEEDS_POR_MODELO

array([793494059, 498241738, 377997800, 912782427], dtype=int32)

## Guardando resultados

In [25]:
# df_valid.to_excel("output/validation_results.xlsx", index=False)
# df_test.to_excel("output/testing_results.xlsx", index=False)
# print("Guardado")