# Preparación de datos

In [145]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [5]:
# Random seed reused by the stochastic steps of the notebook.
seed = 9603
# Location of the raw CSV, relative to the notebook.
dataset_path = "DB/ids_dataset.csv"

# Categorical columns, including the target label.
caracteristicas_nominales = ["destination_port", "label"]

# Flag columns that the notebook treats as binary features.
caracteristicas_binarias = [
  "fwd_psh_flags", "fwd_urg_flags",
  "fin_flag_count", "syn_flag_count", "rst_flag_count",
  "psh_flag_count", "ack_flag_count", "urg_flag_count",
  "cwe_flag_count", "ece_flag_count",
]

# Continuous columns. The two rate columns that can hold infinite values are
# listed separately in caracteristicas_con_valor_maximo_infinito.
caracteristicas_numericas = [
  # Duration and packet / byte totals
  "flow_duration", "total_fwd_packets", "total_backward_packets",
  "total_length_of_fwd_packets", "total_length_of_bwd_packets",
  # Forward packet length statistics
  "fwd_packet_length_max", "fwd_packet_length_min",
  "fwd_packet_length_mean", "fwd_packet_length_std",
  # Backward packet length statistics
  "bwd_packet_length_max", "bwd_packet_length_min",
  "bwd_packet_length_mean", "bwd_packet_length_std",
  # Inter-arrival time (IAT) statistics
  "flow_iat_mean", "flow_iat_std", "flow_iat_max", "flow_iat_min",
  "fwd_iat_total", "fwd_iat_mean", "fwd_iat_std", "fwd_iat_max", "fwd_iat_min",
  "bwd_iat_total", "bwd_iat_mean", "bwd_iat_std", "bwd_iat_max", "bwd_iat_min",
  # Header lengths and per-direction packet rates
  "fwd_header_length", "bwd_header_length",
  "fwd_packets/s", "bwd_packets/s",
  # Overall packet length statistics
  "min_packet_length", "max_packet_length", "packet_length_mean",
  "packet_length_std", "packet_length_variance",
  # Ratios and averages
  "down/up_ratio", "average_packet_size",
  "avg_fwd_segment_size", "avg_bwd_segment_size",
  "fwd_header_length.1",
  # Subflow counters
  "subflow_fwd_packets", "subflow_fwd_bytes",
  "subflow_bwd_packets", "subflow_bwd_bytes",
  # Window sizes and segment information
  "init_win_bytes_forward", "init_win_bytes_backward",
  "act_data_pkt_fwd", "min_seg_size_forward",
  # Active / idle period statistics
  "active_mean", "active_std", "active_max", "active_min",
  "idle_mean", "idle_std", "idle_max", "idle_min",
]

# Columns removed before modelling (described in the notebook as carrying no
# information for training).
caracteristicas_no_utiles = [
  "bwd_psh_flags",
  "bwd_urg_flags",
  "fwd_avg_bytes/bulk", "fwd_avg_packets/bulk", "fwd_avg_bulk_rate",
  "bwd_avg_bytes/bulk", "bwd_avg_packets/bulk", "bwd_avg_bulk_rate",
]

# Rate columns whose maximum value is infinite and therefore need imputation.
caracteristicas_con_valor_maximo_infinito = ["flow_bytes/s", "flow_packets/s"]

Cargando datos

In [11]:
# Read the raw CSV into memory; column names are normalised in the next cell.
df = pd.read_csv(filepath_or_buffer=dataset_path)

#### En primer lugar, transformamos los nombres de las columnas

In [13]:
# Normalise column names step by step: trim surrounding whitespace,
# lower-case, then turn inner spaces into underscores.
columnas_limpias = df.columns.str.strip()
columnas_limpias = columnas_limpias.str.lower()
df.columns = columnas_limpias.str.replace(" ", "_")
print(df.columns)
print(f"Total de características: {df.columns.size}")

Index(['destination_port', 'flow_duration', 'total_fwd_packets',
       'total_backward_packets', 'total_length_of_fwd_packets',
       'total_length_of_bwd_packets', 'fwd_packet_length_max',
       'fwd_packet_length_min', 'fwd_packet_length_mean',
       'fwd_packet_length_std', 'bwd_packet_length_max',
       'bwd_packet_length_min', 'bwd_packet_length_mean',
       'bwd_packet_length_std', 'flow_bytes/s', 'flow_packets/s',
       'flow_iat_mean', 'flow_iat_std', 'flow_iat_max', 'flow_iat_min',
       'fwd_iat_total', 'fwd_iat_mean', 'fwd_iat_std', 'fwd_iat_max',
       'fwd_iat_min', 'bwd_iat_total', 'bwd_iat_mean', 'bwd_iat_std',
       'bwd_iat_max', 'bwd_iat_min', 'fwd_psh_flags', 'bwd_psh_flags',
       'fwd_urg_flags', 'bwd_urg_flags', 'fwd_header_length',
       'bwd_header_length', 'fwd_packets/s', 'bwd_packets/s',
       'min_packet_length', 'max_packet_length', 'packet_length_mean',
       'packet_length_std', 'packet_length_variance', 'fin_flag_count',
       'syn_flag_co

## Selección de datos relevantes

En primera instancia, como se mencionó, existen características que no aportan información para entrenar modelos; por ello se retiran del dataset.

In [14]:
# Remove the columns listed in caracteristicas_no_utiles and report what is left.
df = df.drop(columns=caracteristicas_no_utiles)
print(df.columns)
print(f"Características: {len(df.columns)}")

Index(['destination_port', 'flow_duration', 'total_fwd_packets',
       'total_backward_packets', 'total_length_of_fwd_packets',
       'total_length_of_bwd_packets', 'fwd_packet_length_max',
       'fwd_packet_length_min', 'fwd_packet_length_mean',
       'fwd_packet_length_std', 'bwd_packet_length_max',
       'bwd_packet_length_min', 'bwd_packet_length_mean',
       'bwd_packet_length_std', 'flow_bytes/s', 'flow_packets/s',
       'flow_iat_mean', 'flow_iat_std', 'flow_iat_max', 'flow_iat_min',
       'fwd_iat_total', 'fwd_iat_mean', 'fwd_iat_std', 'fwd_iat_max',
       'fwd_iat_min', 'bwd_iat_total', 'bwd_iat_mean', 'bwd_iat_std',
       'bwd_iat_max', 'bwd_iat_min', 'fwd_psh_flags', 'fwd_urg_flags',
       'fwd_header_length', 'bwd_header_length', 'fwd_packets/s',
       'bwd_packets/s', 'min_packet_length', 'max_packet_length',
       'packet_length_mean', 'packet_length_std', 'packet_length_variance',
       'fin_flag_count', 'syn_flag_count', 'rst_flag_count', 'psh_flag_count',

## Limpieza de datos

### 1. Limpieza de etiquetas de la columna label

In [15]:
# Normalise the target labels: upper-case, strip the garbled character
# (presumably a mis-encoded dash in the raw labels) together with its trailing
# space, and use underscores as the only separator.
# regex=False keeps every pattern literal regardless of the pandas version.
df["label"] = (
  df["label"]
  .str.upper()
  .str.replace("� ", "", regex=False)
  .str.replace(" ", "_", regex=False)
  .str.replace("-", "_", regex=False)
)

etiquetas = df["label"].unique()
print(etiquetas)
# The count is taken outside the f-string: reusing double quotes inside a
# double-quoted f-string is a SyntaxError before Python 3.12.
print(f"Total de etiquetas: {len(etiquetas)}")

['BENIGN' 'FTP_PATATOR' 'SSH_PATATOR' 'DOS_SLOWLORIS' 'DOS_SLOWHTTPTEST'
 'DOS_HULK' 'DOS_GOLDENEYE' 'HEARTBLEED' 'WEB_ATTACK_BRUTE_FORCE'
 'WEB_ATTACK_XSS' 'WEB_ATTACK_SQL_INJECTION' 'INFILTRATION' 'BOT'
 'PORTSCAN' 'DDOS']
Total de etiquetas: 15


### 2. Imputación de datos faltantes

#### 2.1. flow_bytes/s

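Los valores nulos de la característica flow_bytes/s serán reemplazados por la mediana de esa característica (calculada por etiqueta cuando `imputacion_con_filtro` es `True`).

Los valores infinitos de la característica flow_bytes/s serán reemplazados por el máximo valor finito de esa característica.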

##### Valores nulos

In [None]:
# When True, nulls are imputed with the median of their own class (only
# DOS_HULK and BENIGN are handled); when False, a single global median is used.
imputacion_con_filtro = True

# By this point the columns have been renamed to snake_case and the labels
# normalised, so the cleaned names ("label", "flow_bytes/s", "DOS_HULK") must
# be used; the raw CSV spellings no longer exist in df.
es_dos_hulk = df["label"] == "DOS_HULK"
es_benign = df["label"] == "BENIGN"

if imputacion_con_filtro:
  # Per-class medians of flow_bytes/s
  mediana_dos_hulk = df.loc[es_dos_hulk, "flow_bytes/s"].median()
  mediana_benign = df.loc[es_benign, "flow_bytes/s"].median()
  print(f"""
Mediana de la característica Flow Bytes/s en la etiqueta DOS Hulk: {mediana_dos_hulk}
Mediana de la característica Flow Bytes/s en la etiqueta BENIGN: {mediana_benign}
  """)
  # Snapshot the null mask once so both assignments see the same rows.
  sin_valor = df["flow_bytes/s"].isnull()
  df.loc[sin_valor & es_dos_hulk, "flow_bytes/s"] = mediana_dos_hulk
  df.loc[sin_valor & es_benign, "flow_bytes/s"] = mediana_benign
else:
  mediana = df["flow_bytes/s"].median()
  print(f"Mediana de la característica Flow Bytes/s: {mediana}\n")
  df["flow_bytes/s"] = df["flow_bytes/s"].fillna(mediana)

# Counts are computed outside the f-strings so the cell also parses on
# Python versions older than 3.12 (no nested double quotes).
faltantes_dos_hulk = df.loc[es_dos_hulk, "flow_bytes/s"].isnull().sum()
faltantes_benign = df.loc[es_benign, "flow_bytes/s"].isnull().sum()
print(f"Valores faltantes en DOS_HULK: {faltantes_dos_hulk}")
print(f"Valores faltantes en BENIGN: {faltantes_benign}")

##### Valores infinitos

In [None]:
# Turn +inf into NaN so the maximum is taken over the remaining values,
# then fill every NaN with that maximum.
bytes_sin_infinitos = df["flow_bytes/s"].replace(np.inf, np.nan)
maximo_bytes = bytes_sin_infinitos.max()
print(maximo_bytes)
df["flow_bytes/s"] = bytes_sin_infinitos.fillna(maximo_bytes)

#### 2.2. flow_packets/s

In [None]:
# Same treatment as flow_bytes/s: +inf becomes NaN, and every NaN is then
# filled with the maximum of the remaining values.
paquetes_sin_infinitos = df["flow_packets/s"].replace(np.inf, np.nan)
maximo_paquetes = paquetes_sin_infinitos.max()
print(maximo_paquetes)
df["flow_packets/s"] = paquetes_sin_infinitos.fillna(maximo_paquetes)

## Construcción de nuevos datos

Para preparar los datos para el análisis, se normalizan las características numéricas (es decir, se escalan al rango de 0 a 1) y luego se reduce su dimensionalidad con PCA.

In [167]:
class IDSFeatureTransformer(BaseEstimator, TransformerMixin) : 
  """Prepare raw flow records for the intrusion-detection classifier.

  Pipeline applied by ``fit_transform`` / ``transform``:

  1. normalise column names (strip, lower-case, spaces -> underscores);
  2. drop the columns listed in ``drop_cols``;
  3. impute NaN / +inf in ``inf_null_col`` and +/-inf in ``flow_packets/s``;
  4. min-max scale ``caracteristicas_numericas`` and project them with PCA;
  5. concatenate the PCA components with the binary and nominal columns.

  Unlike a standard scikit-learn transformer, ``fit_transform`` and
  ``transform`` also receive the target frame and return a tuple
  ``(X_transformed, y_transformed)``. The returned X has a fresh 0..n-1 index,
  while y keeps the index it was given.

  Parameters
  ----------
  n_components : int, default=9
    Number of principal components passed to ``PCA``.
  random_state : int or None, default=None
    Seed passed to ``PCA`` so that its randomized solver (when selected) gives
    repeatable results.

  Attributes set by ``fit_transform``
  -----------------------------------
  imputation_stats_ : dict
    ``{"median": Series, "max": Series}`` learned from the training data for
    the columns in ``inf_null_col``.
  columns : list of str
    Column names of the transformed feature frame.
  variance : ndarray
    ``explained_variance_ratio_`` of the fitted PCA.
  """

  def __init__(self, n_components=9, random_state=None):
    super().__init__()
    self.n_components = n_components
    self.random_state = random_state
    self.scaler = MinMaxScaler()
    self.pca = PCA(n_components=self.n_components, random_state=self.random_state)
    # Columns removed before any other processing.
    self.drop_cols = [
      'bwd_psh_flags',
      'bwd_urg_flags', 
      'fwd_avg_bytes/bulk', 'fwd_avg_packets/bulk', 'fwd_avg_bulk_rate', 
      'bwd_avg_bytes/bulk', 'bwd_avg_packets/bulk', 'bwd_avg_bulk_rate'
    ]
    # Columns that are scaled and fed to PCA.
    self.caracteristicas_numericas = [
      'flow_duration', 'total_fwd_packets', 'total_backward_packets', 
      'total_length_of_fwd_packets', 'total_length_of_bwd_packets', 
      'fwd_packet_length_max', 'fwd_packet_length_min', 
      'fwd_packet_length_mean', 'fwd_packet_length_std', 
      'bwd_packet_length_max', 'bwd_packet_length_min', 
      'bwd_packet_length_mean', 'bwd_packet_length_std', 
      'flow_iat_mean', 'flow_iat_std', 'flow_iat_max', 
      'flow_iat_min', 'fwd_iat_total', 'fwd_iat_mean', 'fwd_iat_std', 
      'fwd_iat_max', 'fwd_iat_min', 'bwd_iat_total', 'bwd_iat_mean', 
      'bwd_iat_std', 'bwd_iat_max', 'bwd_iat_min', 'fwd_header_length', 
      'bwd_header_length', 'fwd_packets/s', 'bwd_packets/s',
      'min_packet_length', 'max_packet_length', 'packet_length_mean',
      'packet_length_std', 'packet_length_variance', 'down/up_ratio',
      'average_packet_size', 'avg_fwd_segment_size', 
      'avg_bwd_segment_size', 'fwd_header_length.1', 'subflow_fwd_packets', 
      'subflow_fwd_bytes', 'subflow_bwd_packets', 'subflow_bwd_bytes', 
      'init_win_bytes_forward', 'init_win_bytes_backward', 
      'act_data_pkt_fwd', 'min_seg_size_forward', 'active_mean', 
      'active_std', 'active_max', 'active_min', 'idle_mean', 'idle_std', 
      'idle_max', 'idle_min', 'flow_bytes/s', 'flow_packets/s'
    ]
    self.caracteristica_nominal  = ['destination_port']
    self.caracteristica_objetivo = ['label']
    # Columns passed through unchanged next to the PCA components.
    self.caracteristicas_binarias  = [
      'fwd_psh_flags', 'fwd_urg_flags',
      'fin_flag_count', 'syn_flag_count', 'rst_flag_count', 
      'psh_flag_count', 'ack_flag_count', 'urg_flag_count', 
      'cwe_flag_count', 'ece_flag_count'
    ]
    # Infinite values of "column_name" are rebuilt as the row-wise sum of
    # the "sum_values" columns.
    self.flow_packets_col = {
        "column_name" : "flow_packets/s",
        "sum_values" : ["fwd_packets/s", "bwd_packets/s"]
    }
    # Columns whose NaN / +inf values are imputed with learned statistics.
    self.inf_null_col = [
      'flow_bytes/s'
    ]
  
  def label_transform(self, y) :
    """Return a copy of ``y`` whose ``label`` column is upper-cased, stripped
    of the garbled character and separated with underscores only."""
    y_new = y.copy()
    y_new["label"] = (
      y_new["label"]
      .str.upper()
      .str.replace("� ", "", regex=False)
      .str.replace(" ", "_", regex=False)
      .str.replace("-", "_", regex=False)
    )
    return y_new

  def columns_transform(self, X : pd.DataFrame) : 
    """Return a copy of ``X`` with stripped, lower-cased, underscore-separated
    column names."""
    X_new = X.copy()
    X_new.columns = X_new.columns.str.strip().str.lower().str.replace(" ", "_")
    return X_new
  
  def data_selection(self, X : pd.DataFrame) : 
    """Return ``X`` without the columns in ``drop_cols`` (missing ones are
    ignored). The input frame is not modified."""
    return X.drop(columns=self.drop_cols, errors='ignore')
  
  def data_cleaning(self, X : pd.DataFrame, fit : bool = False) :
    """Impute missing and infinite values; the input frame is not modified.

    For each column in ``inf_null_col``: NaN -> median, +inf -> maximum, both
    taken over the non-infinite values. With ``fit=True`` the statistics are
    computed from ``X`` and stored in ``imputation_stats_``. With ``fit=False``
    the stored statistics are reused, so held-out data is imputed with what
    was learned on the training data. If nothing has been fitted yet, the
    statistics of ``X`` itself are used without being stored.

    Infinite values of ``flow_packets/s`` are replaced by the row-wise sum of
    ``fwd_packets/s`` and ``bwd_packets/s``; this step needs no fitted state.
    """
    X_new = X.copy()

    stats = getattr(self, "imputation_stats_", None)
    if fit or stats is None:
      # Exclude +inf before computing the statistics so that neither the
      # median nor the maximum is distorted by it.
      finite = X_new[self.inf_null_col].replace(np.inf, np.nan)
      stats = {"median": finite.median(), "max": finite.max()}
      if fit:
        self.imputation_stats_ = stats

    for col in self.inf_null_col:
      values = X_new[col]
      # Take both masks before modifying anything so that original NaNs and
      # original infinities are never confused with each other.
      is_missing = values.isna()
      is_pos_inf = values == np.inf
      values = values.mask(is_missing, stats["median"][col])
      values = values.mask(is_pos_inf, stats["max"][col])
      X_new[col] = values

    target = self.flow_packets_col["column_name"]
    mask = np.isinf(X_new[target])
    rebuilt = X_new[self.flow_packets_col["sum_values"]].sum(axis=1)
    X_new.loc[mask, target] = rebuilt[mask]

    return X_new

  def _prepare(self, X : pd.DataFrame, y, fit : bool) :
    """Run the whole pipeline, fitting scaler / PCA / imputation when ``fit``."""
    X_new = self.columns_transform(X)
    y_new = None
    if y is not None:
      y_new = self.label_transform(self.columns_transform(y))

    # Selection of relevant data, then cleaning
    X_new = self.data_selection(X_new)
    X_new = self.data_cleaning(X_new, fit=fit)

    # Scaling to [0, 1] followed by dimensionality reduction
    numeric = X_new[self.caracteristicas_numericas]
    if fit:
      components = self.pca.fit_transform(self.scaler.fit_transform(numeric))
    else:
      components = self.pca.transform(self.scaler.transform(numeric))

    # Name the columns from the actual output width rather than from the
    # requested n_components.
    X_pca = pd.DataFrame(
      components,
      columns=[f"component_{i+1}" for i in range(components.shape[1])]
    )

    # Integration: positional concat, hence the index reset on every part.
    X_concat = pd.concat([
      X_pca.reset_index(drop=True), 
      X_new[self.caracteristicas_binarias].reset_index(drop=True), 
      X_new[self.caracteristica_nominal].reset_index(drop=True)
    ], axis=1)

    return X_concat, y_new

  def fit(self, X : pd.DataFrame, y : pd.DataFrame = None) :
    """Learn imputation statistics, scaler and PCA from ``X``; return self."""
    self.fit_transform(X, y)
    return self
  
  def fit_transform(self, X : pd.DataFrame, y : pd.DataFrame = None) : 
    """Fit every stage on ``X`` and return ``(X_transformed, y_transformed)``.

    ``y`` may be None, in which case the second element of the tuple is None.
    """
    X_concat, y_new = self._prepare(X, y, fit=True)

    self.columns = X_concat.columns.tolist()
    self.variance = self.pca.explained_variance_ratio_

    return X_concat, y_new

  def transform(self, X : pd.DataFrame, y : pd.DataFrame = None) :
    """Apply the already-fitted stages and return
    ``(X_transformed, y_transformed)``.

    Imputation, scaling and PCA all reuse what ``fit_transform`` learned.
    ``y`` may be None (e.g. unlabeled traffic); the second element is then None.
    """
    return self._prepare(X, y, fit=False)
  
# Seeding the PCA keeps the projected features reproducible across runs.
transformer = IDSFeatureTransformer(random_state=seed)

In [136]:
# Reload the raw CSV so the transformer receives the original, untouched columns.
df = pd.read_csv(filepath_or_buffer=dataset_path, sep=",")
df.columns

Index([' Destination Port', ' Flow Duration', ' Total Fwd Packets',
       ' Total Backward Packets', 'Total Length of Fwd Packets',
       ' Total Length of Bwd Packets', ' Fwd Packet Length Max',
       ' Fwd Packet Length Min', ' Fwd Packet Length Mean',
       ' Fwd Packet Length Std', 'Bwd Packet Length Max',
       ' Bwd Packet Length Min', ' Bwd Packet Length Mean',
       ' Bwd Packet Length Std', 'Flow Bytes/s', ' Flow Packets/s',
       ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min',
       'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max',
       ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std',
       ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags', ' Bwd PSH Flags',
       ' Fwd URG Flags', ' Bwd URG Flags', ' Fwd Header Length',
       ' Bwd Header Length', 'Fwd Packets/s', ' Bwd Packets/s',
       ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean',
       ' Packet Length Std', ' Packet Length Variance', '

In [168]:
# Separate features from the target. The raw header still carries its leading
# space; y is kept as a one-column DataFrame.
columna_objetivo = " Label"
X = df.drop(columns=[columna_objetivo])
y = df[[columna_objetivo]]

print(f"""
Dataset
      X: {X.shape}
      y: {y.shape}
""")


Dataset
      X: (2830743, 78)
      y: (2830743, 1)



In [173]:
# Stratified 80/20 split. random_state=seed makes the partition (and every
# metric computed from it) reproducible; without it each run draws a new split.
X_train, X_test, y_train, y_test = train_test_split(
  X, y, test_size=0.2, stratify=y, random_state=seed
)

print(f"""
Dataset entrenamiento
      X: {X_train.shape}
      y: {y_train.shape}

Dataset prueba
      X: {X_test.shape}
      y: {y_test.shape}
""")

# Fit the preprocessing on the training portion only, then apply it to the
# held-out portion.
X_train, y_train = transformer.fit_transform(X_train, y_train)
X_test, y_test = transformer.transform(X_test, y_test)

print(f"""
Dataset entrenamiento
      X: {X_train.shape}
      y: {y_train.shape}

Dataset prueba
      X: {X_test.shape}
      y: {y_test.shape}
""")

# The classifier expects a 1-D target, so keep only the label column.
y_train = y_train["label"] 
y_test = y_test["label"] 


Dataset entrenamiento
      X: (2264594, 78)
      y: (2264594, 1)

Dataset prueba
      X: (566149, 78)
      y: (566149, 1)


Dataset entrenamiento
      X: (2264594, 20)
      y: (2264594, 1)

Dataset prueba
      X: (566149, 20)
      y: (566149, 1)



In [174]:
# Random forest with 100 trees, seeded with the notebook-wide seed.
rf_clf = RandomForestClassifier(n_estimators=100, random_state=seed)
rf_clf.fit(X=X_train, y=y_train)

In [None]:
# Predict on the held-out set and print the per-class metrics.
y_pred = rf_clf.predict(X_test)

informe = classification_report(y_test, y_pred)
print(informe)

# TODO: biplot (placeholder left by the author; nothing is plotted yet)

                          precision    recall  f1-score   support

                  BENIGN       1.00      1.00      1.00    454620
                     BOT       0.85      0.76      0.81       393
                    DDOS       1.00      1.00      1.00     25606
           DOS_GOLDENEYE       1.00      0.99      0.99      2059
                DOS_HULK       1.00      1.00      1.00     46215
        DOS_SLOWHTTPTEST       0.99      0.99      0.99      1100
           DOS_SLOWLORIS       0.99      0.99      0.99      1159
             FTP_PATATOR       1.00      1.00      1.00      1588
              HEARTBLEED       1.00      1.00      1.00         2
            INFILTRATION       1.00      0.14      0.25         7
                PORTSCAN       0.99      1.00      0.99     31786
             SSH_PATATOR       1.00      1.00      1.00      1179
  WEB_ATTACK_BRUTE_FORCE       0.73      0.78      0.76       301
WEB_ATTACK_SQL_INJECTION       1.00      0.25      0.40         4
         