# Modelado - Transformador

In [1]:
# Importaciones
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import resample
from sklearn.feature_selection import mutual_info_classif
from imblearn.over_sampling import SMOTE, SMOTENC
from imblearn.under_sampling import TomekLinks
from sklearn.base import BaseEstimator, TransformerMixin

## 1. Utils

In [2]:
SEED = 9603

K_ITERS = 10

TRAIN_SIZE = 0.75

caracteristicas_numericas = [
  'flow_duration', 'total_fwd_packet', 'total_bwd_packets', 
  'total_length_of_fwd_packet', 'total_length_of_bwd_packet', 
  'fwd_packet_length_max', 'fwd_packet_length_min', 'fwd_packet_length_mean', 
  'fwd_packet_length_std', 'bwd_packet_length_max', 'bwd_packet_length_min', 
  'bwd_packet_length_mean', 'bwd_packet_length_std', 'flow_bytes/s', 
  'flow_packets/s', 'flow_iat_mean', 'flow_iat_std', 'flow_iat_max', 
  'flow_iat_min', 'fwd_iat_total', 'fwd_iat_mean', 'fwd_iat_std', 
  'fwd_iat_max', 'fwd_iat_min', 'bwd_iat_total', 'bwd_iat_mean', 
  'bwd_iat_std', 'bwd_iat_max', 'bwd_iat_min', 'fwd_psh_flags', 
  'bwd_psh_flags', 'fwd_header_length', 'bwd_header_length', 'fwd_packets/s', 
  'bwd_packets/s', 'packet_length_min', 'packet_length_max', 'packet_length_mean', 
  'packet_length_std', 'packet_length_variance', 'fin_flag_count', 'syn_flag_count',
  'rst_flag_count', 'psh_flag_count', 'ack_flag_count', 'cwr_flag_count', 
  'ece_flag_count', 'down/up_ratio', 'average_packet_size', 'fwd_segment_size_avg', 
  'bwd_segment_size_avg', 'fwd_bytes/bulk_avg', 'fwd_packet/bulk_avg', 
  'fwd_bulk_rate_avg', 'bwd_bytes/bulk_avg', 'bwd_packet/bulk_avg', 
  'bwd_bulk_rate_avg', 'subflow_fwd_packets', 'subflow_fwd_bytes', 
  'subflow_bwd_packets', 'subflow_bwd_bytes', 'fwd_init_win_bytes', 
  'bwd_init_win_bytes', 'fwd_act_data_pkts', 'fwd_seg_size_min', 
  'active_mean', 'active_std', 'active_max', 'active_min', 
  'idle_mean', 'idle_std', 'idle_max', 'idle_min'
]

caracteristicas_var_null = ['flow_bytes/s', 'flow_iat_mean', 'flow_iat_std', 'flow_iat_max', 'flow_iat_min']

caracteristicas_var_max_inf = ['flow_bytes/s', 'flow_packets/s']

caracteristicas_nominales = ['port_type_registered', 'port_type_well_known', 'protocol_6', 'protocol_17']

caracteristica_objetivo = "label"

## 2. Creación de transformador

In [3]:
class Transformer(BaseEstimator, TransformerMixin) : 
  def __init__(self, seed, categorical_chars, continous_chars, obj_char, obj_char_majority_class, selected_chars=None, imputer_strategy='median', threshold_redundance=0.9):
    allowed_meassures = {'mean', 'median'}
    super().__init__()
    self.seed = seed
    self.categorical_chars = categorical_chars 
    self.continous_chars = continous_chars
    self.obj_char = obj_char
    self.obj_char_majority_class = obj_char_majority_class
    self.selected_chars = selected_chars
    self.threshold_redundance = threshold_redundance
    if imputer_strategy not in allowed_meassures : 
      raise ValueError(f"Valor inválido: {imputer_strategy}. Debe ser uno de {allowed_meassures}")
    self.imputer_strategy = imputer_strategy

  def inf_null_data_detection(self, X_y : pd.DataFrame) : 
    self.inf_chars  = X_y[self.continous_chars].columns[np.isinf(X_y[self.continous_chars]).any()].tolist()
    self.null_chars = X_y[self.continous_chars].columns[X_y[self.continous_chars].isnull().any()].tolist()
    
  def inf_data_imputation(self, X_y : pd.DataFrame) : 
    X_y_ = X_y.copy()
    X_y_.replace([np.inf], np.nan, inplace=True)
    return X_y_

  def null_train_data_imputation(self, X_y : pd.DataFrame) :
    X_y_ = X_y.copy()
    for char in list(set(self.inf_chars+self.null_chars)) : 
      mask = X_y_[char].isna()
      # Se obtienen las etiquetas de la variable objetivo que tienen valores nulos
      null_values_obj_classes = X_y_[mask][self.obj_char].unique().tolist()
      for obj_tag in null_values_obj_classes : 
        # Obtención de medida de la columna filtrandolo por la variable objetivo
        measure_per_obj_tag = getattr(X_y_[X_y_[self.obj_char] == obj_tag][char], self.imputer_strategy)()

        # Reemplazo de valores nulos por la columna obtenida
        X_y_.loc[mask & (X_y_[self.obj_char] == obj_tag), char] = measure_per_obj_tag
    # for char in list(set(self.inf_chars+self.null_chars)) : 
    #   mask = X_y[char].isna()
    #   # Se obtienen las etiquetas de la variable objetivo que tienen valores nulos
    #   null_values_obj_classes = X_y[mask][self.obj_char].unique().tolist()
    #   for obj_tag in null_values_obj_classes : 
    #     # Obtención de medida de la columna filtrandolo por la variable objetivo
    #     measure_per_obj_tag = getattr(X_y[X_y[self.obj_char] == obj_tag][char], self.imputer_strategy)()

    #     # Reemplazo de valores nulos por la columna obtenida
    #     X_y.loc[mask & (X_y[self.obj_char] == obj_tag), char] = measure_per_obj_tag
    return X_y_
  
  def get_measures_for_imputer(self, X_y : pd.DataFrame) : 
    measures = {}
    for label in list(set(self.inf_chars+self.null_chars)) : 
      measures[label] = getattr(X_y[label], self.imputer_strategy)()
    self.measures = measures
    
  def chars_selection(self, X_y : pd.DataFrame) : 
    def obtener_indice_grupo(corr_list, char):
      for i, group in enumerate(corr_list):
          if char in group :
              return i
      return False  # Si no se encuentra 

    all_chars = set(self.continous_chars)

    corr_matrix = X_y[self.continous_chars].corr(method='spearman').abs()
    np.fill_diagonal(corr_matrix.values, 0)

    already_added_chars = set()
    corr_groups = []

    """ Buscar grupos de características con alta correlación """
    for char in self.continous_chars : 
      correlated_chars = list(corr_matrix.index[corr_matrix[char] > self.threshold_redundance])
      correlated_chars.append(char)
      # Eliminar duplicados
      group = list(set(correlated_chars))
      # Verificamos si la característica no está incluida entre las características ya agregadas
      matches = [char for char in group if char in already_added_chars]
      if len(group) > 1 : 
        if not matches :
          already_added_chars.update(group)
          corr_groups.append(group)
        else : 
          idx = obtener_indice_grupo(corr_groups, matches[0])
          already_added_chars.update(group)
          detected_group = set(corr_groups[idx])
          detected_group.update(group)
          corr_groups[idx] = list(detected_group) 

    """ Fusión de grupos con alta correlación con las mismas características """
    merged_groups = []

    for sublist in corr_groups : 
      sublist_set = set(sublist)
      new_groups = []
      merged_group = sublist_set.copy()
      for group in merged_groups : 
        if sublist_set & set(group) : 
          merged_group.update(group)
        else : 
          new_groups.append(merged_group)
      new_groups.append(merged_group)
      merged_groups = new_groups

    correlated_chars = set().union(*merged_groups)
    no_correlated_chars = list(all_chars-correlated_chars)

    """ Selección de características por Información Mutua """
    mi_scores = mutual_info_classif(X_y[self.categorical_chars+self.continous_chars],X_y[self.obj_char])
    df_mi_scores = pd.DataFrame()
    df_mi_scores["char"] = self.categorical_chars+self.continous_chars
    df_mi_scores["score"] = mi_scores

    selected_chars = []
    for group in merged_groups : 
      mask = df_mi_scores['char'].isin(group)
      df_mi_scores_filtered = df_mi_scores[mask].sort_values(by='score', ascending=False)
      selected_char = df_mi_scores_filtered.iloc[0]
      selected_chars.append(selected_char['char'])

    selected_chars = selected_chars+no_correlated_chars+self.categorical_chars
    
    mask = df_mi_scores['char'].isin(selected_chars)
    final_selected_chars = df_mi_scores[mask].sort_values(by='score', ascending=False).head(10)['char'].to_list()

    categorical_selected_chars = [char for char in self.categorical_chars if char in final_selected_chars]
    continous_selected_chars   = [char for char in self.continous_chars if char in final_selected_chars]

    self.n_categorical_selected_chars = len(categorical_selected_chars)
    self.selected_chars = categorical_selected_chars+continous_selected_chars
    self.selected_indices_mi = df_mi_scores[mask].sort_values(by='score', ascending=False).head(10)
    return X_y[self.selected_chars+[self.obj_char]]

  def smotenc_resample(self, X_y : pd.DataFrame) : 
    minority_classes = set(X_y[self.obj_char].unique()) - set([self.obj_char_majority_class])
    cant_majority_class = len(X_y[X_y[self.obj_char] == self.obj_char_majority_class])
    strategy = {
      self.obj_char_majority_class : cant_majority_class
    }
    for class_ in minority_classes : 
      calc_q = int(round(cant_majority_class/len(minority_classes), 0))
      real_q = len(X_y[X_y[self.obj_char] == class_])

      strategy[class_] = calc_q if calc_q>real_q else real_q

    if self.n_categorical_selected_chars > 0 :
      smt = SMOTENC(
        categorical_features=list(range(self.n_categorical_selected_chars)),
        random_state=self.seed,
        sampling_strategy=strategy
      )
    else :
      smt = SMOTE(
        random_state=self.seed,
        sampling_strategy=strategy
      )

    X_resample, y_resample = smt.fit_resample(
      X_y.drop([self.obj_char], axis=1),
      X_y[self.obj_char]
    )

    df_resample = pd.DataFrame(
      X_resample,
       columns=self.selected_chars
    )

    df_resample[self.obj_char] = y_resample

    return df_resample

  def tomek_links_subsample(self, X_y : pd.DataFrame) : 
    tmk = TomekLinks(
      sampling_strategy=[self.obj_char_majority_class]
    )

    X_subsample, y_subsample = tmk.fit_resample(
      X_y.drop([self.obj_char], axis=1),
      X_y[self.obj_char]
    )
    
    df_subsample = pd.DataFrame(
      X_subsample,
       columns=self.selected_chars
    )

    df_subsample[self.obj_char] = y_subsample

    return df_subsample

  def null_test_data_imputation(self, X_y : pd.DataFrame) : 
    X_y_ = X_y.copy()
    for k, v in self.measures.items() : 
      X_y_[k] = X_y_[k].fillna(v)
    return X_y_

  def fit(self, X_y) : 
    self.inf_null_data_detection(X_y=X_y)
    X_y = self.inf_data_imputation(X_y=X_y)
    X_y = self.null_train_data_imputation(X_y=X_y)
    self.get_measures_for_imputer(X_y=X_y)

    self.chars_selection(X_y=X_y)

    X_y = self.smotenc_resample(X_y=X_y)

    X_y = self.tomek_links_subsample(X_y=X_y)

  def fit_transform(self, X : pd.DataFrame, y : pd.Series) : 
    X_y = X.copy()
    X_y[self.obj_char] = y
    print('Limpieza de datos')
    self.inf_null_data_detection(X_y=X_y)
    X_y = self.inf_data_imputation(X_y=X_y)
    X_y = self.null_train_data_imputation(X_y=X_y)
    self.get_measures_for_imputer(X_y=X_y)
    print(self.measures)

    print('Selección de características')
    X_y = self.chars_selection(X_y=X_y)
    print(self.selected_chars)

    print('Sobremuestreo con SMOTENC')
    X_y = self.smotenc_resample(X_y=X_y)

    print('SUbmuestreo con TomekLinks')
    X_y = self.tomek_links_subsample(X_y=X_y)

    return X_y[self.selected_chars], X_y[self.obj_char]
  
  def transform(self, X_y : pd.DataFrame) : 
    X_y = self.inf_data_imputation(X_y=X_y)

    X_y = self.null_test_data_imputation(X_y=X_y)

    return X_y[self.selected_chars]

## 3. K-Iteraciones

In [4]:
# bagging_resample() : Función para muestreo Bagging
def bagging_resample(X_, y_, seed) :
  indices = np.arange(len(X_)) 
  bootstrap_indices = resample(
    indices,
    replace=True,
    stratify=y_,
    random_state=seed,
    n_samples=int(round(len(X_)*TRAIN_SIZE, 0))
  )
  # np.setdiff1d permite obtener los índices que no aparecen 
  # en los indices tras el muestreo Bootstrap
  oob_indices = np.setdiff1d(indices, bootstrap_indices)

  X_train = X_.iloc[bootstrap_indices]
  y_train = y_.iloc[bootstrap_indices]
  X_valid = X_.iloc[oob_indices]
  y_valid = y_.iloc[oob_indices]

  return X_train, y_train, X_valid, y_valid

In [5]:
def drop_duplicates(X_, y_) : 
  df = pd.DataFrame(
    columns=caracteristicas_nominales+caracteristicas_numericas,
    data=X_
  )
  df[caracteristica_objetivo] = y_
  df.drop_duplicates()
  return df[caracteristicas_nominales+caracteristicas_numericas], df[caracteristica_objetivo]

In [6]:
# save_df() : Función para guardar dataframes
def save_df(X_, y_, save_path) : 
  df_save = X_.copy()
  df_save[caracteristica_objetivo] = y_

  df_save.to_csv(save_path, index=False) 

In [7]:
df_dataset = pd.read_csv('DB/dataset.csv')

X = df_dataset.drop([caracteristica_objetivo], axis=1)
y = df_dataset[caracteristica_objetivo]

In [8]:
# Declaración de valores necesarios para K-Fold
skf = StratifiedKFold(n_splits=K_ITERS, shuffle=True, random_state=SEED)

n_models = 4
prng = np.random.RandomState(seed=SEED)
max_int32 = np.iinfo(np.int32).max
seeds_per_model = prng.randint(0, max_int32, size=n_models)
print(seeds_per_model)

# Contador de iteraciones
iter_cont = 0

bagging_model = [
  'rf',
  'dt',
  'mlp',
  'knn'
]

[793494059 498241738 377997800 912782427]


In [9]:
for temp_index, test_index in skf.split(X, y) : 
  X_temp, X_test = X.iloc[temp_index], X.iloc[test_index]
  y_temp, y_test = y.iloc[temp_index], y.iloc[test_index]

  X_test, y_test = drop_duplicates(X_test, y_test)

  print(f'\n    ITERACIÓN {iter_cont+1}\n')

  for i in range(n_models) : 
    X_train, y_train, X_valid, y_valid = bagging_resample(
      X_temp, y_temp, seeds_per_model[i]
    )

    print(f'MODELO {bagging_model[i]}')

    transformer = Transformer(
      seed=seeds_per_model[i], 
      categorical_chars=caracteristicas_nominales, 
      continous_chars=caracteristicas_numericas, 
      obj_char=caracteristica_objetivo, 
      obj_char_majority_class='BENIGN', 
      selected_chars=None, 
      imputer_strategy='median', 
      threshold_redundance=0.8
    )

    print('Preparación del conjunto de entrenamiento')
    X_train, y_train = transformer.fit_transform(X_train, y_train)

    print('Preparación del conjunto de validación')
    X_valid, y_valid = drop_duplicates(X_valid, y_valid)
    X_valid = transformer.transform(X_valid)

    print('Preparación del conjunto de prueba')
    X_test_per_model = transformer.transform(X_test)

    save_df(
      X_=X_train,
      y_=y_train,
      save_path=f'DB/model_evaluation/{iter_cont+1}/df_train_{bagging_model[i]}.csv'
    )
    save_df(
      X_=X_valid,
      y_=y_valid,
      save_path=f'DB/model_evaluation/{iter_cont+1}/df_valid_{bagging_model[i]}.csv'
    )
    save_df(
      X_=X_test_per_model,
      y_=y_test,
      save_path=f'DB/model_evaluation/{iter_cont+1}/df_test_{bagging_model[i]}.csv'
    )

  iter_cont += 1


    ITERACIÓN 1

MODELO rf
Preparación del conjunto de entrenamiento
Limpieza de datos
{'flow_iat_mean': 17368.85, 'flow_iat_std': 17836.6592163443, 'flow_iat_max': 60257.0, 'flow_iat_min': 3.0, 'flow_bytes/s': 3903.935294499232, 'flow_packets/s': 75.45746085644218}
Selección de características
['flow_duration', 'fwd_packet_length_max', 'bwd_packet_length_max', 'fwd_header_length', 'bwd_header_length', 'packet_length_max', 'packet_length_mean', 'packet_length_std', 'packet_length_variance', 'average_packet_size']
Sobremuestreo con SMOTENC
SUbmuestreo con TomekLinks
Preparación del conjunto de validación
Preparación del conjunto de prueba
MODELO dt
Preparación del conjunto de entrenamiento
Limpieza de datos
{'flow_iat_mean': 17310.0, 'flow_iat_std': 17855.134424958367, 'flow_iat_max': 60301.0, 'flow_iat_min': 3.0, 'flow_bytes/s': 3917.684253765352, 'flow_packets/s': 75.45793534512573}
Selección de características
['flow_duration', 'fwd_packet_length_max', 'bwd_packet_length_max', 'fwd_