# Preparación de datos

In [41]:
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import levene, bartlett
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from factor_analyzer.factor_analyzer import calculate_kmo

## Utilidades

In [2]:
# When True, the plotting cells guarded by `if not EJECUCION_RAPIDA` are skipped.
EJECUCION_RAPIDA = True

# Random seed handed to the RandomForestClassifier in the modelling section.
seed=9603

# Path of the CSV file read with pd.read_csv.
dataset_path = "DB/ids_dataset.csv"

# Nominal columns; 'label' is the column used to group rows in the cleaning steps.
caracteristicas_nominales = ['destination_port', 'label']

# Flag columns; they are copied into the normalized frame without being scaled.
caracteristicas_binarias  = [
      'fwd_psh_flags', 'fwd_urg_flags',
      'fin_flag_count', 'syn_flag_count', 'rst_flag_count', 
      'psh_flag_count', 'ack_flag_count', 'urg_flag_count', 
      'cwe_flag_count', 'ece_flag_count'
]
      
# Numeric columns; they go through outlier handling and MinMax scaling.
caracteristicas_numericas = [
      'flow_duration', 'total_fwd_packets', 'total_backward_packets', 
      'total_length_of_fwd_packets', 'total_length_of_bwd_packets', 
      'fwd_packet_length_max', 'fwd_packet_length_min', 
      'fwd_packet_length_mean', 'fwd_packet_length_std', 
      'bwd_packet_length_max', 'bwd_packet_length_min', 
      'bwd_packet_length_mean', 'bwd_packet_length_std', 
      'flow_iat_mean', 'flow_iat_std', 'flow_iat_max', 
      'flow_iat_min', 'fwd_iat_total', 'fwd_iat_mean', 'fwd_iat_std', 
      'fwd_iat_max', 'fwd_iat_min', 'bwd_iat_total', 'bwd_iat_mean', 
      'bwd_iat_std', 'bwd_iat_max', 'bwd_iat_min', 'fwd_header_length', 
      'bwd_header_length', 'fwd_packets/s', 'bwd_packets/s',
      'min_packet_length', 'max_packet_length', 'packet_length_mean',
      'packet_length_std', 'packet_length_variance', 'down/up_ratio',
      'average_packet_size', 'avg_fwd_segment_size', 
      'avg_bwd_segment_size', 'fwd_header_length.1', 'subflow_fwd_packets', 
      'subflow_fwd_bytes', 'subflow_bwd_packets', 'subflow_bwd_bytes', 
      'init_win_bytes_forward', 'init_win_bytes_backward', 
      'act_data_pkt_fwd', 'min_seg_size_forward', 'active_mean', 
      'active_std', 'active_max', 'active_min', 'idle_mean', 'idle_std', 
      'idle_max', 'idle_min'
]

# Columns dropped from the frame in the data-selection step.
# NOTE(review): presumably constant or empty in this dataset — confirm.
caracteristicas_no_utiles = [
      'bwd_psh_flags',
      'bwd_urg_flags', 
      'fwd_avg_bytes/bulk', 'fwd_avg_packets/bulk', 'fwd_avg_bulk_rate', 
      'bwd_avg_bytes/bulk', 'bwd_avg_packets/bulk', 'bwd_avg_bulk_rate'
]

# Numeric columns that, per their name, can hold infinite values — TODO confirm.
# They are always processed together with caracteristicas_numericas.
caracteristicas_con_valor_maximo_infinito = [
      'flow_bytes/s', 'flow_packets/s'
]

In [3]:
def generateBoxplot(x, y, data, save_path) :
  """Draw a seaborn boxplot, save it to disk and display it.

  Args:
    x: Name of the column plotted on the horizontal axis.
    y: Name of the column plotted on the vertical axis.
    data: DataFrame that holds both columns.
    save_path: Destination file of the figure. Missing parent folders are created.
  """
  # Dedicated figure/axes: nothing is drawn on whichever figure happened to be current.
  fig, ax = plt.subplots()
  sns.boxplot(x=x, y=y, data=data, ax=ax)

  ax.set_xlabel(x)
  ax.set_ylabel(y)

  # savefig does not create directories; make sure the target folder exists first.
  Path(save_path).parent.mkdir(parents=True, exist_ok=True)
  fig.savefig(save_path)
  plt.show()
  # The function is called once per feature in a loop; release the figure explicitly.
  plt.close(fig)

In [4]:
def generateCorrMatrix(data, save_path, title="Matriz de correlación", method="pearson", decimals=2) :
  """Compute a correlation matrix and render it as an annotated heatmap.

  Args:
    data: DataFrame whose columns are correlated pairwise.
    save_path: Destination file of the figure. Missing parent folders are created.
    title: Title placed above the heatmap.
    method: Correlation method forwarded to ``DataFrame.corr``.
    decimals: Number of decimals the coefficients are rounded to.
  """
  # Correlation matrix, rounded so the cell annotations stay readable.
  matriz_correlacion = data.corr(method=method).round(decimals=decimals)

  # Dedicated, very large figure so every annotated cell fits.
  fig, ax = plt.subplots(figsize=(52, 39))

  sns.heatmap(matriz_correlacion, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
  ax.set_title(title)

  # savefig does not create directories; make sure the target folder exists first.
  Path(save_path).parent.mkdir(parents=True, exist_ok=True)
  fig.savefig(save_path)
  plt.show()
  # Release the figure once it has been saved and shown.
  plt.close(fig)

## Carga y transformación de datos

In [5]:
# Load the dataset from dataset_path into a DataFrame.
df = pd.read_csv(dataset_path)
print("Datos cargados")

Datos cargados


In [6]:
# Normalize column names: trim surrounding whitespace, lower-case, spaces -> underscores.
columnas_normalizadas = df.columns.str.strip()
columnas_normalizadas = columnas_normalizadas.str.lower()
columnas_normalizadas = columnas_normalizadas.str.replace(" ", "_")
df.columns = columnas_normalizadas

print(df.columns)
print(f"Total de características: {len(df.columns)}")

Index(['destination_port', 'flow_duration', 'total_fwd_packets',
       'total_backward_packets', 'total_length_of_fwd_packets',
       'total_length_of_bwd_packets', 'fwd_packet_length_max',
       'fwd_packet_length_min', 'fwd_packet_length_mean',
       'fwd_packet_length_std', 'bwd_packet_length_max',
       'bwd_packet_length_min', 'bwd_packet_length_mean',
       'bwd_packet_length_std', 'flow_bytes/s', 'flow_packets/s',
       'flow_iat_mean', 'flow_iat_std', 'flow_iat_max', 'flow_iat_min',
       'fwd_iat_total', 'fwd_iat_mean', 'fwd_iat_std', 'fwd_iat_max',
       'fwd_iat_min', 'bwd_iat_total', 'bwd_iat_mean', 'bwd_iat_std',
       'bwd_iat_max', 'bwd_iat_min', 'fwd_psh_flags', 'bwd_psh_flags',
       'fwd_urg_flags', 'bwd_urg_flags', 'fwd_header_length',
       'bwd_header_length', 'fwd_packets/s', 'bwd_packets/s',
       'min_packet_length', 'max_packet_length', 'packet_length_mean',
       'packet_length_std', 'packet_length_variance', 'fin_flag_count',
       'syn_flag_co

In [7]:
# Normalize the values of the label column: upper-case, drop the "� " sequence
# (presumably a mis-encoded dash in the raw labels — confirm), and turn spaces and
# hyphens into underscores.
df["label"] = (
  df["label"]
  .str.upper()
  .str.replace("� ", "")
  .str.replace(" ", "_")
  .str.replace("-", "_")
)

# Computed once and reused for both prints.
etiquetas = df["label"].unique()
print(etiquetas)
# Kept out of the f-string braces: reusing double quotes inside an f-string is a
# SyntaxError on Python versions older than 3.12.
print(f"Total de etiquetas: {len(etiquetas)}")

['BENIGN' 'FTP_PATATOR' 'SSH_PATATOR' 'DOS_SLOWLORIS' 'DOS_SLOWHTTPTEST'
 'DOS_HULK' 'DOS_GOLDENEYE' 'HEARTBLEED' 'WEB_ATTACK_BRUTE_FORCE'
 'WEB_ATTACK_XSS' 'WEB_ATTACK_SQL_INJECTION' 'INFILTRATION' 'BOT'
 'PORTSCAN' 'DDOS']
Total de etiquetas: 15


## Selección de datos relevantes

In [8]:
# Remove the columns listed in caracteristicas_no_utiles.
df = df.drop(columns=caracteristicas_no_utiles)

print(df.columns)
print(f"Características: {len(df.columns)}")

Index(['destination_port', 'flow_duration', 'total_fwd_packets',
       'total_backward_packets', 'total_length_of_fwd_packets',
       'total_length_of_bwd_packets', 'fwd_packet_length_max',
       'fwd_packet_length_min', 'fwd_packet_length_mean',
       'fwd_packet_length_std', 'bwd_packet_length_max',
       'bwd_packet_length_min', 'bwd_packet_length_mean',
       'bwd_packet_length_std', 'flow_bytes/s', 'flow_packets/s',
       'flow_iat_mean', 'flow_iat_std', 'flow_iat_max', 'flow_iat_min',
       'fwd_iat_total', 'fwd_iat_mean', 'fwd_iat_std', 'fwd_iat_max',
       'fwd_iat_min', 'bwd_iat_total', 'bwd_iat_mean', 'bwd_iat_std',
       'bwd_iat_max', 'bwd_iat_min', 'fwd_psh_flags', 'fwd_urg_flags',
       'fwd_header_length', 'bwd_header_length', 'fwd_packets/s',
       'bwd_packets/s', 'min_packet_length', 'max_packet_length',
       'packet_length_mean', 'packet_length_std', 'packet_length_variance',
       'fin_flag_count', 'syn_flag_count', 'rst_flag_count', 'psh_flag_count',

## Limpieza de datos

### 1. Imputación de datos faltantes

#### 1.1. Valores nulos : flow_bytes/s

In [9]:
# Impute the missing values of flow_bytes/s with a median.
#   imputacion_con_filtro = True  -> median of the rows that share the same label
#   imputacion_con_filtro = False -> median of the whole column
imputacion_con_filtro = True

if imputacion_con_filtro :
  # Rows that are missing before any imputation; labels are disjoint, so this mask
  # stays valid for every iteration of the loop below.
  filas_faltantes = df["flow_bytes/s"].isna()
  print(df.loc[filas_faltantes, "label"].value_counts())
  null_values_list = df.loc[filas_faltantes, "label"].unique().tolist()

  for etiqueta in null_values_list :
    filas_etiqueta = df["label"] == etiqueta

    median = df.loc[filas_etiqueta, "flow_bytes/s"].median()
    print(f"Mediana de la característica flow_bytes/s en la etiqueta {etiqueta}: {median}")

    df.loc[filas_faltantes & filas_etiqueta, "flow_bytes/s"] = median

    # Sanity check: nothing should remain missing for this label.
    rest = df.loc[filas_etiqueta, "flow_bytes/s"].isna().sum()

    print(f"Valores faltantes en flow_bytes/s en {etiqueta} después de la imputación: {rest}")
else :
  mediana = df["flow_bytes/s"].median()
  print(f"Mediana de la característica flow_bytes/s: {mediana}\n")
  df["flow_bytes/s"] = df["flow_bytes/s"].fillna(mediana)
  # Evaluated outside the f-string: reusing double quotes inside the braces is a
  # SyntaxError on Python versions older than 3.12.
  faltantes_restantes = df["flow_bytes/s"].isna().sum()
  print(f"Valores faltantes en flow_bytes/s: {faltantes_restantes}")

label
DOS_HULK    949
BENIGN      409
Name: count, dtype: int64
Mediana de la característica flow_bytes/s en la etiqueta BENIGN: 5158.72695
Valores faltantes en flow_bytes/s en BENIGN después de la imputación: 0
Mediana de la característica flow_bytes/s en la etiqueta DOS_HULK: 121.1692036
Valores faltantes en flow_bytes/s en DOS_HULK después de la imputación: 0


### 2. Valores infinitos y outliers

In [10]:
def manageOutliersToMinMax(df : pd.DataFrame, col_num, col_obj) :
  """Pull boxplot outliers of ``col_num`` back to the whisker values, group by group.

  For every distinct value of ``col_obj`` the 1.5 * IQR fences are computed on that
  group's rows. Values below the lower fence are replaced by the smallest in-fence
  value of the group, values above the upper fence by the largest one.

  Args:
    df: Input frame; it is not modified.
    col_num: Name of the numeric column to adjust.
    col_obj: Name of the column that defines the groups.

  Returns:
    A copy of ``df`` with the adjusted ``col_num`` column.
  """
  adjusted = df.copy()
  groups = df[col_obj]
  values = df[col_num]

  for group_value in groups.unique() :
    in_group = groups == group_value
    group_values = values[in_group]

    first_quartile = group_values.quantile(0.25)
    third_quartile = group_values.quantile(0.75)
    spread = third_quartile - first_quartile
    low_fence = first_quartile - 1.5 * spread
    high_fence = third_quartile + 1.5 * spread

    # Values lying inside the fences (both ends inclusive) provide the replacements.
    inside = group_values[group_values.between(low_fence, high_fence)]

    adjusted.loc[in_group & (values < low_fence), col_num] = inside.min()
    adjusted.loc[in_group & (values > high_fence), col_num] = inside.max()

  return adjusted

In [11]:
# Pull the outliers of every numeric column back to the whisker values of its label group.
columnas_con_outliers = caracteristicas_numericas + caracteristicas_con_valor_maximo_infinito
for columna_numerica in columnas_con_outliers :
  df = manageOutliersToMinMax(df=df, col_num=columna_numerica, col_obj="label")

In [12]:
# Boxplots of the numeric variables grouped by label (skipped when EJECUCION_RAPIDA is True).
if not EJECUCION_RAPIDA : 
  for i in caracteristicas_numericas+caracteristicas_con_valor_maximo_infinito : 
    # File-safe feature name, built outside the f-string: reusing double quotes inside
    # the braces is a SyntaxError on Python versions older than 3.12.
    nombre_archivo = i.replace("/", "_").replace(".", "_")
    generateBoxplot(
      x=i,
      y="label",
      data=df,
      save_path=f"img/boxplots/boxplot_label_{nombre_archivo}"
    )

In [13]:
# Correlation matrix of the numeric variables (skipped when EJECUCION_RAPIDA is True).
if not EJECUCION_RAPIDA : 
  # Single source of truth for the method, so the file name always states what was
  # actually computed (it previously said "pearson" while computing "spearman").
  metodo_correlacion = "spearman"
  generateCorrMatrix(
    data=df[caracteristicas_numericas+caracteristicas_con_valor_maximo_infinito],
    decimals=2,
    save_path=f"img/matriz_correlacion_{metodo_correlacion}_outliers_eliminados_no_normalizada.png",
    method=metodo_correlacion
  )

## Construcción de nuevos datos

### 1. Normalización de datos

In [None]:
# Fit a MinMaxScaler on the numeric columns and scale them in the same step.
# The result is a NumPy array whose column order follows the list passed here.
scaler = MinMaxScaler()
nrmlzd_cols = scaler.fit_transform(df[caracteristicas_numericas+caracteristicas_con_valor_maximo_infinito])

In [20]:
# Wrap the scaled array in a DataFrame. The index of df is reused on purpose: the two
# column assignments below align on index, so a fresh 0..n-1 index would yield NaN
# whenever df is not indexed 0..n-1.
df_nrmlzd = pd.DataFrame(
  nrmlzd_cols,
  columns=caracteristicas_numericas+caracteristicas_con_valor_maximo_infinito,
  index=df.index
)
# Flag columns and the target are carried over unscaled.
df_nrmlzd[caracteristicas_binarias] = df[caracteristicas_binarias]
df_nrmlzd["label"] = df["label"]
print(df_nrmlzd.columns)
print(df_nrmlzd.shape)

Index(['flow_duration', 'total_fwd_packets', 'total_backward_packets',
       'total_length_of_fwd_packets', 'total_length_of_bwd_packets',
       'fwd_packet_length_max', 'fwd_packet_length_min',
       'fwd_packet_length_mean', 'fwd_packet_length_std',
       'bwd_packet_length_max', 'bwd_packet_length_min',
       'bwd_packet_length_mean', 'bwd_packet_length_std', 'flow_iat_mean',
       'flow_iat_std', 'flow_iat_max', 'flow_iat_min', 'fwd_iat_total',
       'fwd_iat_mean', 'fwd_iat_std', 'fwd_iat_max', 'fwd_iat_min',
       'bwd_iat_total', 'bwd_iat_mean', 'bwd_iat_std', 'bwd_iat_max',
       'bwd_iat_min', 'fwd_header_length', 'bwd_header_length',
       'fwd_packets/s', 'bwd_packets/s', 'min_packet_length',
       'max_packet_length', 'packet_length_mean', 'packet_length_std',
       'packet_length_variance', 'down/up_ratio', 'average_packet_size',
       'avg_fwd_segment_size', 'avg_bwd_segment_size', 'fwd_header_length.1',
       'subflow_fwd_packets', 'subflow_fwd_bytes', 'sub

### 2. Análisis factorial

In [None]:
# Una técnica de reducción de dimensionalidad con PCA es evaluar la correlación entre las características con la variable objetivo
# Dichas características deben guardar una correlación y una varianza significativas para asegurar que PCA no causará pérdida de información
# Para evaluar dichas métricas se utilizará el Test de Bartlett y KMO 

#### 2.1. Variables numéricas a variables categóricas

##### 2.1.1. KMO

In [None]:
# KMO is computed from correlations, so only numeric columns can be passed:
# the string target column "label" is left out.
kmo_all, kmo_model = calculate_kmo(df_nrmlzd.drop(columns=["label"]))
print(f"Índice KMO general: {kmo_model:.4f}")

##### 2.1.2. Test de Bartlett

In [39]:
# scipy.stats.bartlett is applied to one feature at a time; its samples are the values
# of that feature in each label group.
# Null hypothesis: every label group has the same variance for the feature.
# Alternative hypothesis: at least one label group has a different variance.
# A p_value below the significance level (0.05) rejects the null hypothesis.
# NOTE(review): this is the equal-variance test, not Bartlett's test of sphericity
# (the correlation-based check usually reported next to KMO) — confirm which is intended.

# Group the rows by label once instead of re-filtering the frame for every feature.
grupos_por_etiqueta = df_nrmlzd.groupby("label", sort=False)

# Collect one record per feature and build the frame in one go.
resultados_bartlett = []

for i in caracteristicas_numericas+caracteristicas_con_valor_maximo_infinito : 
  data_by_col_obj = [valores for _, valores in grupos_por_etiqueta[i]]
  stat, p_value = bartlett(*data_by_col_obj)

  resultados_bartlett.append({"char": i, "stat": stat, "p_value": p_value})

df_test_bartlett = pd.DataFrame(resultados_bartlett, columns=["char", "stat", "p_value"])

  numer = (Ntot*1.0 - k) * log(spsq) - np.sum((Ni - 1.0)*log(ssq), axis=0)
  numer = (Ntot*1.0 - k) * log(spsq) - np.sum((Ni - 1.0)*log(ssq), axis=0)
  numer = (Ntot*1.0 - k) * log(spsq) - np.sum((Ni - 1.0)*log(ssq), axis=0)
  numer = (Ntot*1.0 - k) * log(spsq) - np.sum((Ni - 1.0)*log(ssq), axis=0)
  numer = (Ntot*1.0 - k) * log(spsq) - np.sum((Ni - 1.0)*log(ssq), axis=0)
  numer = (Ntot*1.0 - k) * log(spsq) - np.sum((Ni - 1.0)*log(ssq), axis=0)
  numer = (Ntot*1.0 - k) * log(spsq) - np.sum((Ni - 1.0)*log(ssq), axis=0)
  numer = (Ntot*1.0 - k) * log(spsq) - np.sum((Ni - 1.0)*log(ssq), axis=0)
  numer = (Ntot*1.0 - k) * log(spsq) - np.sum((Ni - 1.0)*log(ssq), axis=0)
  numer = (Ntot*1.0 - k) * log(spsq) - np.sum((Ni - 1.0)*log(ssq), axis=0)
  numer = (Ntot*1.0 - k) * log(spsq) - np.sum((Ni - 1.0)*log(ssq), axis=0)
  numer = (Ntot*1.0 - k) * log(spsq) - np.sum((Ni - 1.0)*log(ssq), axis=0)
  numer = (Ntot*1.0 - k) * log(spsq) - np.sum((Ni - 1.0)*log(ssq), axis=0)
  numer = (Ntot*1.0 - k) 

In [None]:
# Bartlett results restricted to the features whose p_value is below 0.05.
df_test_bartlett[df_test_bartlett["p_value"] < 0.05]

Unnamed: 0,char,stat,p_value
0,flow_duration,19119100.0,0.0
1,total_fwd_packets,inf,0.0
2,total_backward_packets,inf,0.0
3,total_length_of_fwd_packets,inf,0.0
4,total_length_of_bwd_packets,inf,0.0
5,fwd_packet_length_max,inf,0.0
6,fwd_packet_length_min,inf,0.0
7,fwd_packet_length_mean,inf,0.0
8,fwd_packet_length_std,inf,0.0
9,bwd_packet_length_max,inf,0.0


##### 2.1.3. Test de Brown-Forsythe

In [33]:
# Brown-Forsythe test: scipy.stats.levene with center="median", applied to one feature
# at a time; its samples are the values of that feature in each label group.

# Group the rows by label once instead of re-filtering the frame for every feature.
grupos_por_etiqueta = df_nrmlzd.groupby("label", sort=False)

# Collect one record per feature and build the frame in one go.
resultados_brown_forsythe = []

for i in caracteristicas_numericas+caracteristicas_con_valor_maximo_infinito : 
  data_by_col_obj = [valores for _, valores in grupos_por_etiqueta[i]]
  stat, p_value = levene(*data_by_col_obj, center="median")

  resultados_brown_forsythe.append({"char": i, "stat": stat, "p_value": p_value})

df_test_brown_forsythe = pd.DataFrame(resultados_brown_forsythe, columns=["char", "stat", "p_value"])

In [37]:
# Brown-Forsythe results restricted to the features whose p_value is below 0.05.
df_test_brown_forsythe[df_test_brown_forsythe["p_value"] < 0.05]

Unnamed: 0,char,stat,p_value
0,flow_duration,197059.2,0.0
1,total_fwd_packets,65119.13,0.0
2,total_backward_packets,87593.51,0.0
3,total_length_of_fwd_packets,85830.73,0.0
4,total_length_of_bwd_packets,85338.19,0.0
5,fwd_packet_length_max,58645.12,0.0
6,fwd_packet_length_min,39417.47,0.0
7,fwd_packet_length_mean,28380.46,0.0
8,fwd_packet_length_std,55498.95,0.0
9,bwd_packet_length_max,326062.4,0.0


# Por pasar

Para preparar los datos para el análisis, se normalizarán los datos numéricos; es decir, se escalarán al rango de 0 a 1.

In [None]:
class IDSFeatureTransformer(BaseEstimator, TransformerMixin) : 
  """Preparation pipeline for the IDS dataset: renaming, selection, imputation, scaling, PCA.

  ``fit_transform`` learns the imputation statistics, the ``MinMaxScaler`` and the
  ``PCA`` from the data it receives; ``transform`` re-applies exactly those learned
  objects, so no statistic is ever estimated from the data passed to ``transform``.

  Both methods take and return an ``(X, y)`` pair.

  Args:
    n_components: Number of principal components kept by the PCA.

  Attributes set by ``fit_transform``:
    columns: Column names of the transformed feature frame.
    variance: ``explained_variance_ratio_`` of the fitted PCA.
    imputation_stats_: Median and finite maximum learned for ``inf_null_col``.
  """

  def __init__(self, n_components=9):
    super().__init__()
    self.n_components = n_components
    self.scaler = MinMaxScaler()
    self.pca = PCA(n_components=self.n_components)
    # Columns removed during data selection.
    self.drop_cols = [
      'bwd_psh_flags',
      'bwd_urg_flags', 
      'fwd_avg_bytes/bulk', 'fwd_avg_packets/bulk', 'fwd_avg_bulk_rate', 
      'bwd_avg_bytes/bulk', 'bwd_avg_packets/bulk', 'bwd_avg_bulk_rate'
    ]
    # Columns that are scaled and then fed to the PCA.
    self.caracteristicas_numericas = [
      'flow_duration', 'total_fwd_packets', 'total_backward_packets', 
      'total_length_of_fwd_packets', 'total_length_of_bwd_packets', 
      'fwd_packet_length_max', 'fwd_packet_length_min', 
      'fwd_packet_length_mean', 'fwd_packet_length_std', 
      'bwd_packet_length_max', 'bwd_packet_length_min', 
      'bwd_packet_length_mean', 'bwd_packet_length_std', 
      'flow_iat_mean', 'flow_iat_std', 'flow_iat_max', 
      'flow_iat_min', 'fwd_iat_total', 'fwd_iat_mean', 'fwd_iat_std', 
      'fwd_iat_max', 'fwd_iat_min', 'bwd_iat_total', 'bwd_iat_mean', 
      'bwd_iat_std', 'bwd_iat_max', 'bwd_iat_min', 'fwd_header_length', 
      'bwd_header_length', 'fwd_packets/s', 'bwd_packets/s',
      'min_packet_length', 'max_packet_length', 'packet_length_mean',
      'packet_length_std', 'packet_length_variance', 'down/up_ratio',
      'average_packet_size', 'avg_fwd_segment_size', 
      'avg_bwd_segment_size', 'fwd_header_length.1', 'subflow_fwd_packets', 
      'subflow_fwd_bytes', 'subflow_bwd_packets', 'subflow_bwd_bytes', 
      'init_win_bytes_forward', 'init_win_bytes_backward', 
      'act_data_pkt_fwd', 'min_seg_size_forward', 'active_mean', 
      'active_std', 'active_max', 'active_min', 'idle_mean', 'idle_std', 
      'idle_max', 'idle_min', 'flow_bytes/s', 'flow_packets/s'
    ]
    self.caracteristica_nominal  = ['destination_port']
    self.caracteristica_objetivo = ['label']
    # Flag columns appended to the output without scaling.
    self.caracteristicas_binarias  = [
      'fwd_psh_flags', 'fwd_urg_flags',
      'fin_flag_count', 'syn_flag_count', 'rst_flag_count', 
      'psh_flag_count', 'ack_flag_count', 'urg_flag_count', 
      'cwe_flag_count', 'ece_flag_count'
    ]
    # Column whose infinite values are rebuilt as the sum of the "sum_values" columns.
    self.flow_packets_col = {
        "column_name" : "flow_packets/s",
        "sum_values" : ["fwd_packets/s", "bwd_packets/s"]
    }
    # Columns whose NaN / +inf values are imputed with the median / finite maximum.
    self.inf_null_col = [
      'flow_bytes/s'
    ]
  
  def label_transform(self, y) :
    """Return a copy of ``y`` with normalized values in its ``label`` column.

    Values are upper-cased, the "� " sequence is removed, and spaces and hyphens
    become underscores.
    """
    y_new = y.copy()
    y_new["label"] = y_new["label"].str.upper().str.replace("� ", "").str.replace(" ", "_").str.replace("-", "_")
    return y_new

  def columns_transform(self, X : pd.DataFrame) : 
    """Return a copy of ``X`` with stripped, lower-cased, underscore-separated column names."""
    X_new = X.copy()
    X_new.columns = X_new.columns.str.strip().str.lower().str.replace(" ", "_")
    return X_new
  
  def data_selection(self, X : pd.DataFrame) : 
    """Return ``X`` without the columns in ``drop_cols``; absent columns are ignored."""
    # drop() already returns a new frame, so no explicit copy is needed.
    return X.drop(columns=self.drop_cols, errors='ignore')
  
  def data_cleaning(self, X : pd.DataFrame, fit : bool = False) :
    """Impute NaN and infinite values; return a cleaned copy of ``X``.

    Args:
      X: Frame with normalized column names.
      fit: When True, the median and finite maximum of ``inf_null_col`` are computed
        from ``X`` and stored in ``imputation_stats_``. When False, the stored
        statistics are reused; if none have been learned yet they are computed from
        ``X`` without being stored.

    Returns:
      A copy of ``X`` in which NaN of ``inf_null_col`` are replaced by the median,
      +inf of ``inf_null_col`` by the finite maximum, and infinite values of
      ``flow_packets/s`` by the row-wise sum of ``fwd_packets/s`` and ``bwd_packets/s``.
    """
    X_new = X.copy()
    cols = self.inf_null_col

    # Statistics learned by an earlier fit, if any; ignored when fitting again.
    learned = None if fit else getattr(self, "imputation_stats_", None)

    # NaN -> median.
    median = X_new[cols].median() if learned is None else learned["median"]
    X_new[cols] = X_new[cols].fillna(median)

    # +inf -> finite maximum: mark the infinities as NaN first, then fill them.
    X_new[cols] = X_new[cols].replace(np.inf, np.nan)
    finite_max = X_new[cols].max() if learned is None else learned["max"]
    X_new[cols] = X_new[cols].fillna(finite_max)

    if fit :
      self.imputation_stats_ = {"median": median, "max": finite_max}

    # flow_packets/s: infinite values are rebuilt as fwd_packets/s + bwd_packets/s.
    mask = np.isinf(X_new[self.flow_packets_col["column_name"]])
    X_new.loc[mask, self.flow_packets_col["column_name"]] = X_new[self.flow_packets_col["sum_values"]].sum(axis=1)

    return X_new

  def _prepare(self, X : pd.DataFrame, y : pd.DataFrame, fit : bool) :
    """Rename columns, normalize labels, drop unused columns and clean ``X``."""
    # columns_transform copies its input, so the caller's frames are left untouched.
    X_new = self.columns_transform(X)
    y_new = self.label_transform(self.columns_transform(y))

    X_new = self.data_selection(X_new)
    X_new = self.data_cleaning(X_new, fit=fit)
    return X_new, y_new

  def _assemble(self, X_new : pd.DataFrame, components) :
    """Concatenate the PCA components with the flag columns and the nominal column."""
    X_pca = pd.DataFrame(
      components,
      columns=[f"component_{i+1}" for i in range(components.shape[1])]
    )
    # Every part gets a fresh 0..n-1 index so the concatenation lines up row by row.
    return pd.concat([
      X_pca.reset_index(drop=True), 
      X_new[self.caracteristicas_binarias].reset_index(drop=True), 
      X_new[self.caracteristica_nominal].reset_index(drop=True)
    ], axis=1)
  
  def fit_transform(self, X : pd.DataFrame, y : pd.DataFrame) : 
    """Learn the imputation statistics, scaler and PCA from ``X`` and transform it.

    Args:
      X: Raw feature frame.
      y: One-column frame holding the labels.

    Returns:
      ``(X_concat, y_new)``: the transformed features and the normalized labels.
    """
    X_new, y_new = self._prepare(X, y, fit=True)

    # Fit the scaler on the prepared data and scale it in the same step.
    X_num_scaled = self.scaler.fit_transform(X_new[self.caracteristicas_numericas])

    # Fit the PCA on the scaled data and project it.
    X_concat = self._assemble(X_new, self.pca.fit_transform(X_num_scaled))

    self.columns = X_concat.columns.tolist()
    self.variance = self.pca.explained_variance_ratio_

    return X_concat, y_new

  def transform(self, X : pd.DataFrame, y : pd.DataFrame) :
    """Transform ``X`` with the objects learned by ``fit_transform``.

    Args:
      X: Raw feature frame.
      y: One-column frame holding the labels.

    Returns:
      ``(X_concat, y_new)``: the transformed features and the normalized labels.
    """
    X_new, y_new = self._prepare(X, y, fit=False)

    # The scaler and the PCA were fitted in fit_transform(); only apply them here.
    X_num_scaled  = self.scaler.transform(X_new[self.caracteristicas_numericas])
    X_concat = self._assemble(X_new, self.pca.transform(X_num_scaled))

    return X_concat, y_new
  
transformer = IDSFeatureTransformer()

In [None]:
# Read the raw CSV again; this rebinds `df`, replacing the frame processed in the earlier cells.
df = pd.read_csv(dataset_path, sep=",")
# Last expression of the cell: displays the column names exactly as they come in the file.
df.columns

Index([' Destination Port', ' Flow Duration', ' Total Fwd Packets',
       ' Total Backward Packets', 'Total Length of Fwd Packets',
       ' Total Length of Bwd Packets', ' Fwd Packet Length Max',
       ' Fwd Packet Length Min', ' Fwd Packet Length Mean',
       ' Fwd Packet Length Std', 'Bwd Packet Length Max',
       ' Bwd Packet Length Min', ' Bwd Packet Length Mean',
       ' Bwd Packet Length Std', 'Flow Bytes/s', ' Flow Packets/s',
       ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min',
       'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max',
       ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std',
       ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags', ' Bwd PSH Flags',
       ' Fwd URG Flags', ' Bwd URG Flags', ' Fwd Header Length',
       ' Bwd Header Length', 'Fwd Packets/s', ' Bwd Packets/s',
       ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean',
       ' Packet Length Std', ' Packet Length Variance', '

In [None]:
# Separate the target from the features; y stays a one-column DataFrame.
columna_objetivo = " Label"
y = df[[columna_objetivo]]
X = df.drop(columns=[columna_objetivo])

print(f"""
Dataset
      X: {X.shape}
      y: {y.shape}
""")


Dataset
      X: (2830743, 78)
      y: (2830743, 1)



In [None]:
# Stratified 80/20 split. random_state=seed fixes the partition, so every re-run
# produces the same train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
  X, y, test_size=0.2, stratify=y, random_state=seed
)

print(f"""
Dataset entrenamiento
      X: {X_train.shape}
      y: {y_train.shape}

Dataset prueba
      X: {X_test.shape}
      y: {y_test.shape}
""")

# Learn the preparation steps on the training split only, then apply them to the test split.
X_train, y_train = transformer.fit_transform(X_train, y_train)
X_test, y_test = transformer.transform(X_test, y_test)

print(f"""
Dataset entrenamiento
      X: {X_train.shape}
      y: {y_train.shape}

Dataset prueba
      X: {X_test.shape}
      y: {y_test.shape}
""")

# The classifier expects a 1-D target: keep only the label column as a Series.
y_train = y_train["label"] 
y_test = y_test["label"] 


Dataset entrenamiento
      X: (2264594, 78)
      y: (2264594, 1)

Dataset prueba
      X: (566149, 78)
      y: (566149, 1)


Dataset entrenamiento
      X: (2264594, 20)
      y: (2264594, 1)

Dataset prueba
      X: (566149, 20)
      y: (566149, 1)



In [None]:
# Random forest with a fixed seed. n_jobs=-1 trains the trees on all available cores;
# the training split has more than two million rows.
rf_clf = RandomForestClassifier(random_state=seed, n_estimators=100, n_jobs=-1)
rf_clf.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
# Predict the labels of the test split with the fitted random forest.
y_pred = rf_clf.predict(X_test)

# Text report of precision, recall, f1-score and support for every class.
print(classification_report(y_test, y_pred))

# TODO: biplot

                          precision    recall  f1-score   support

                  BENIGN       1.00      1.00      1.00    454620
                     BOT       0.85      0.76      0.81       393
                    DDOS       1.00      1.00      1.00     25606
           DOS_GOLDENEYE       1.00      0.99      0.99      2059
                DOS_HULK       1.00      1.00      1.00     46215
        DOS_SLOWHTTPTEST       0.99      0.99      0.99      1100
           DOS_SLOWLORIS       0.99      0.99      0.99      1159
             FTP_PATATOR       1.00      1.00      1.00      1588
              HEARTBLEED       1.00      1.00      1.00         2
            INFILTRATION       1.00      0.14      0.25         7
                PORTSCAN       0.99      1.00      0.99     31786
             SSH_PATATOR       1.00      1.00      1.00      1179
  WEB_ATTACK_BRUTE_FORCE       0.73      0.78      0.76       301
WEB_ATTACK_SQL_INJECTION       1.00      0.25      0.40         4
         