In [1]:
# Importaciones
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import time
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_recall_curve, auc
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import resample

In [2]:
CARACTERISTICAS_VAR_INF  = ['flow_bytes/s']

CARACTERISTICAS_VAR_NULL = ['flow_iat_min', 'flow_bytes/s']

NUM_SELECTED_CHARS = [
  'bwd_iat_min', 'packet_length_std', 'flow_bytes/s', 'cwr_flag_count', 'flow_iat_min', 
  'fwd_iat_min', 'fwd_bulk_rate_avg', 'subflow_bwd_packets', 'active_std', 
  'bwd_bytes/bulk_avg', 'flow_duration', 'fwd_packet_length_max', 'subflow_fwd_packets', 
  'down/up_ratio', 'idle_max', 'fwd_act_data_pkts', 'rst_flag_count'
]

CAT_SELECTED_CHARS = ['port_type_registered', 'port_type_well_known', 'protocol_6', 'protocol_17']

CARACTERISTICA_OBJETIVO = "label"

DB_PATH = "DB/"

In [3]:
# df_ = pd.read_csv("DB/1/df_train_dt.csv")

In [4]:
bagging_model = [
  {
    'path_name' : 'rf', 
    'model_name' : 'RandomForestClassifier',
    'model' : RandomForestClassifier(criterion="gini", class_weight='balanced', max_depth=20, n_estimators=100, bootstrap=True, random_state=9603)
  },
  {
    'path_name' : 'dt', 
    'model_name' : 'DecisionTreeClassifier',
    'model' : DecisionTreeClassifier(criterion="gini", class_weight='balanced', max_depth=20, random_state=9603)
  },
  {
    'path_name' : 'mlp',
    'model_name' : 'MLPClassifier',
    'model' : MLPClassifier(hidden_layer_sizes=(64,), max_iter=100, early_stopping=True, random_state=9603)
  },
  {
    'path_name' : 'knn',
    'model_name' : 'KNeighborsClassifier',
    'model' : KNeighborsClassifier(n_neighbors=3, weights='distance', metric='manhattan')
  },
]

In [5]:
# getFrequency(): Función para obtener la distribución de frecuencias de la columna label
def getFrequency(df : pd.DataFrame, caracteristica) :
  frecuencia = df[caracteristica].value_counts()
  porcentaje = df[caracteristica].value_counts(normalize=True) * 100

  tabla_frecuencia = pd.DataFrame({
    "Frecuencia": frecuencia,
    "Frecuencia(%)": porcentaje
  })
  
  tabla_frecuencia["Frecuencia(%)"].round(2)
  
  print(tabla_frecuencia)
  print(df.shape)

# A

In [6]:
def transform(df_ : pd.DataFrame) : 
  df = df_.copy()
  df['protocol_6'] = (df['protocol'] == 6).astype(int)
  df['protocol_17'] = (df['protocol'] == 17).astype(int)
  df = df[CAT_SELECTED_CHARS+NUM_SELECTED_CHARS+[CARACTERISTICA_OBJETIVO]]
  df[CARACTERISTICA_OBJETIVO] = df[CARACTERISTICA_OBJETIVO].apply(lambda x : LABELS[x])
  print(df.columns)
  getFrequency(df, CARACTERISTICA_OBJETIVO)
  return df

In [7]:
# df_test = pd.read_csv('DB/pre/1/df_test.csv')
# df_test = transform(df_test)

In [8]:
# for i in range(10) : 
#   for j in bagging_model : 
#     print("Transformación de conjunto de validación")
#     df_valid = pd.read_csv(f"DB/pre/{i+1}/df_valid_{j['path_name']}.csv")
#     df_valid = transform(df_valid)
#     df_valid.to_csv(f"DB/{i+1}/df_valid_{j['path_name']}.csv")
#   df_test = pd.read_csv(f"DB/pre/{i+1}/df_test.csv")
#   df_test = transform(df_test)
#   df_test.to_csv(f"DB/{i+1}/df_test.csv")

# B

In [9]:
df_ = pd.read_csv('DB/1/df_train_rf.csv')
LABELS = sorted(df_['label'].unique())
LABELS

['BENIGN',
 'BOT',
 'DDOS',
 'DOS_GOLDENEYE',
 'DOS_HULK',
 'DOS_SLOWHTTPTEST',
 'DOS_SLOWLORIS',
 'FTP_PATATOR',
 'PORTSCAN',
 'SSH_PATATOR']

In [10]:
encoder = LabelEncoder()

In [11]:
# get_results() : Función para automatizar la generación de la Matriz de confusión y AUC de las curvas ROC y Sensibilidad Precisión
def get_results(y_test, y_pred, threshold) :
  y_pred_under_threshold = y_pred >= threshold
  y_pred_for_cm = y_pred_under_threshold.astype(int)
  """
  Matriz de confusión
              Predicted
              0     1
  Actual 0  [[TN,   FP],
         1   [FN,   TP]]
  """
  cm = confusion_matrix(y_test, y_pred_for_cm)
  auc_roc = roc_auc_score(y_test, y_pred)
  precision, recall, pr_thresholds = precision_recall_curve(y_test, y_pred)
  auc_sp = auc(recall, precision)
  
  return cm, auc_roc, auc_sp

In [12]:
# one_hot_encode() : Función para codificar la variable objetivo en razón de las clases que codificó la instancia de Machine Learning
def one_hot_encode(y, classes) : 
  one_hot = np.zeros((len(y), len(classes)))
  for i, label in enumerate(y) :
    one_hot[i, int(label)] = 1
  return one_hot

In [13]:
def save_df(X_, y_, save_path) : 
  df_save = pd.DataFrame(X_, columns=CAT_SELECTED_CHARS+NUM_SELECTED_CHARS)
  df_save[CARACTERISTICA_OBJETIVO] = y_

  df_save.to_csv(save_path, index=False) 

In [None]:
df_validation = pd.DataFrame(columns=["iter", "model", "label", "TP", "TN", "FP", "FN", "AUC_SP", "ms"])
df_testing    = pd.DataFrame(columns=["iter", "model", "label", "TP", "TN", "FP", "FN", "Exac", "Prec", "Sens", "F1", "AUC_SP", "ms"])

for iter in range(10) : 
  print(f"""
    Iteración {iter+1}
  """)
  results_list = []
  for instance in bagging_model : 
    print("Modelo", instance['model_name'])
    df_train = pd.read_csv(f"{DB_PATH}{iter+1}/df_train_{instance['path_name']}.csv")
    X_train = df_train.drop([CARACTERISTICA_OBJETIVO], axis=1).values
    y_train = encoder.fit_transform(df_train[CARACTERISTICA_OBJETIVO].values.ravel())

    df_valid = pd.read_csv(f"{DB_PATH}{iter+1}/df_valid_{instance['path_name']}.csv")
    X_valid = df_valid.drop([CARACTERISTICA_OBJETIVO], axis=1).values
    y_valid = one_hot_encode(
      encoder.transform(df_valid[CARACTERISTICA_OBJETIVO].values.ravel()), 
      LABELS
    )
    print("Conjunto de entrenamiento")
    getFrequency(df_train, 'label')
    print("Conjunto de validación")
    getFrequency(df_valid, 'label')

    print("Entrenamiento")

    instance['model'].fit(X_train, y_train)

    print("Validación")

    valid_start = time.time()
    y_valid_pred = instance['model'].predict_proba(X_valid)
    valid_end = time.time()

    for i in range(len(LABELS)) : 
      cm, auc_roc, auc_sp = get_results(
        y_test=y_valid[:,i],
        y_pred=y_valid_pred[:,i],
        threshold=0.5
      )
      df_validation.loc[len(df_validation)] = [
        f"Iteración {iter+1}",
        instance["model_name"], 
        LABELS[i],
        cm[1][1], 
        cm[0][0], 
        cm[0][1], 
        cm[1][0], 
        auc_sp, 
        valid_end-valid_start
      ]

    print("Prueba del modelo")
    X_test = pd.read_csv(f"{DB_PATH}{iter+1}/df_test_{instance['path_name']}.csv").drop([CARACTERISTICA_OBJETIVO], axis=1).values

    """ Prueba de los modelos """
    pred_start = time.perf_counter()
    y_pred = instance["model"].predict_proba(X_test)
    pred_end   = time.perf_counter()

    print("Registro de resultados")
    results_list.append({
      "model" : instance["model_name"],
      "pred" : y_pred,
      "time_start" : pred_start,
      "time_end" : pred_end
    })

  y_test = pd.read_csv(f"{DB_PATH}{iter+1}/df_test_rf.csv")[CARACTERISTICA_OBJETIVO]
  y_test = one_hot_encode(
    encoder.transform(y_test.values.ravel()), 
    LABELS
  )

  print(f'results_list : {len(results_list)}')
  y_pred_bagging = np.zeros(y_test.shape)
  print(f"y_pred_bagging : {y_pred_bagging.shape}")

  for result in results_list : 
    print(f"y_pred_{result['model']} : {result['pred'].shape}")
    y_pred_bagging = y_pred_bagging + result['pred']

  y_pred_bagging /= 4

  results_list.append({
    'model' : 'BaggingClassifier',
    'pred' : y_pred_bagging,
    'time_start' : results_list[0]["time_start"],
    "time_end" : results_list[3]["time_end"]
  })
  print(f'results_list : {len(results_list)}')

  for result in results_list : 
    for i in range(len(LABELS)) :
      cm, auc_roc, auc_sp = get_results(
        y_test=y_test[:,i],
        y_pred=result["pred"][:,i],
        threshold=0.5
      )
      exac = (cm[1][1] + cm[0][0])/(cm[1][1]+cm[0][0]+cm[0][1]+cm[1][0])
      prec = (cm[1][1])/(cm[1][1]+cm[0][1])
      sens = (cm[1][1])/(cm[1][1]+cm[1][0])
      F1sc = (sens*prec*2)/(sens+prec)
      df_testing.loc[len(df_testing)] = [
        f"Iteración {iter+1}",
        result["model"],
        LABELS[i],
        cm[1][1], 
        cm[0][0], 
        cm[0][1], 
        cm[1][0], 
        exac,
        prec,
        sens,
        F1sc,
        auc_sp, 
        result["time_end"] - result["time_start"]
      ]
  iter+=1
      
# pd.DataFrame(columns=["iter", "model", "label", "TP", "TN", "FP", "FN", "Exac", "Prec", "Sens", "F1", "AUC_SP", "ms"])


    Iteración 1
  
Modelo RandomForestClassifier
Conjunto de entrenamiento
                  Frecuencia  Frecuencia(%)
label                                      
BENIGN                 77779      47.046406
PORTSCAN               13428       8.122233
DOS_HULK               13371       8.087755
DDOS                    8678       5.249087
SSH_PATATOR             8678       5.249087
DOS_SLOWLORIS           8678       5.249087
DOS_SLOWHTTPTEST        8678       5.249087
DOS_GOLDENEYE           8678       5.249087
FTP_PATATOR             8678       5.249087
BOT                     8678       5.249087
(165324, 22)
Conjunto de validación
                  Frecuencia  Frecuencia(%)
label                                      
BENIGN                 49148      68.054113
PORTSCAN                8467      11.724062
DOS_HULK                8425      11.665905
DDOS                    5042       6.981542
DOS_GOLDENEYE            416       0.576026
FTP_PATATOR              224       0.310168
DOS_SLOW

[WinError 2] El sistema no puede encontrar el archivo especificado
  File "c:\Users\User\anaconda3\envs\ids_thesis\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\Users\User\anaconda3\envs\ids_thesis\Lib\subprocess.py", line 550, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\User\anaconda3\envs\ids_thesis\Lib\subprocess.py", line 1028, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\User\anaconda3\envs\ids_thesis\Lib\subprocess.py", line 1540, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


Prueba del metamodelo Bagging
Registro de resultados
results_list : 4
y_pred_bagging : (16993, 10)
y_pred_RandomForestClassifier : (16993, 10)
y_pred_DecisionTreeClassifier : (16993, 10)
y_pred_MLPClassifier : (16993, 10)
y_pred_KNeighborsClassifier : (16993, 10)
results_list : 5

    Iteración 2
  
Modelo RandomForestClassifier
Conjunto de entrenamiento
                  Frecuencia  Frecuencia(%)
label                                      
BENIGN                 77789      47.049609
PORTSCAN               13428       8.121741
DOS_HULK               13371       8.087266
DDOS                    8678       5.248769
SSH_PATATOR             8678       5.248769
DOS_SLOWLORIS           8678       5.248769
DOS_SLOWHTTPTEST        8678       5.248769
DOS_GOLDENEYE           8678       5.248769
FTP_PATATOR             8678       5.248769
BOT                     8678       5.248769
(165334, 22)
Conjunto de validación
                  Frecuencia  Frecuencia(%)
label                              



Validación
Prueba del metamodelo Bagging
Registro de resultados
Modelo KNeighborsClassifier
Conjunto de entrenamiento
                  Frecuencia  Frecuencia(%)
label                                      
BENIGN                 77787      47.048968
PORTSCAN               13428       8.121840
DOS_HULK               13371       8.087364
DDOS                    8678       5.248833
DOS_GOLDENEYE           8678       5.248833
DOS_SLOWLORIS           8678       5.248833
FTP_PATATOR             8678       5.248833
SSH_PATATOR             8678       5.248833
BOT                     8678       5.248833
DOS_SLOWHTTPTEST        8678       5.248833
(165332, 22)
Conjunto de validación
                  Frecuencia  Frecuencia(%)
label                                      
BENIGN                 49138      67.991310
PORTSCAN                8479      11.732230
DOS_HULK                8438      11.675499
DDOS                    5096       7.051238
DOS_GOLDENEYE            398       0.550705
FTP_PATATO

In [15]:
df_validation

Unnamed: 0,iter,model,label,TP,TN,FP,FN,AUC_SP,ms
0,Iteración 1,RandomForestClassifier,BENIGN,49113,23050,21,35,0.999962,0.356037
1,Iteración 1,RandomForestClassifier,BOT,38,72180,0,1,1.000000,0.356037
2,Iteración 1,RandomForestClassifier,DDOS,5042,67175,2,0,0.999998,0.356037
3,Iteración 1,RandomForestClassifier,DOS_GOLDENEYE,402,71801,2,14,0.997974,0.356037
4,Iteración 1,RandomForestClassifier,DOS_HULK,8423,63794,0,2,0.999993,0.356037
...,...,...,...,...,...,...,...,...,...
395,Iteración 10,KNeighborsClassifier,DOS_SLOWHTTPTEST,91,72169,7,4,0.955305,13.982771
396,Iteración 10,KNeighborsClassifier,DOS_SLOWLORIS,203,72039,23,6,0.964386,13.982771
397,Iteración 10,KNeighborsClassifier,FTP_PATATOR,219,72051,0,1,0.997714,13.982771
398,Iteración 10,KNeighborsClassifier,PORTSCAN,8466,63761,31,13,0.998127,13.982771


In [16]:
df_testing

Unnamed: 0,iter,model,label,TP,TN,FP,FN,Exac,Prec,Sens,F1,AUC_SP,ms
0,Iteración 1,RandomForestClassifier,BENIGN,11564,5416,6,7,0.999235,0.999481,0.999395,0.999438,0.999894,0.081429
1,Iteración 1,RandomForestClassifier,BOT,9,16984,0,0,1.000000,1.000000,1.000000,1.000000,1.000000,0.081429
2,Iteración 1,RandomForestClassifier,DDOS,1189,15804,0,0,1.000000,1.000000,1.000000,1.000000,1.000000,0.081429
3,Iteración 1,RandomForestClassifier,DOS_GOLDENEYE,91,16897,2,3,0.999706,0.978495,0.968085,0.973262,0.998891,0.081429
4,Iteración 1,RandomForestClassifier,DOS_HULK,1979,15012,0,2,0.999882,1.000000,0.998990,0.999495,0.999733,0.081429
...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,Iteración 10,BaggingClassifier,DOS_SLOWHTTPTEST,21,16970,0,1,0.999941,1.000000,0.954545,0.976744,0.990079,43.372202
496,Iteración 10,BaggingClassifier,DOS_SLOWLORIS,49,16940,2,1,0.999823,0.960784,0.980000,0.970297,0.997838,43.372202
497,Iteración 10,BaggingClassifier,FTP_PATATOR,50,16942,0,0,1.000000,1.000000,1.000000,1.000000,1.000000,43.372202
498,Iteración 10,BaggingClassifier,PORTSCAN,1989,14995,7,1,0.999529,0.996493,0.999497,0.997993,0.999539,43.372202


In [17]:
df_validation.to_excel("output/validation_results.xlsx")
df_testing.to_excel("output/testing_results.xlsx")
print("Guardado")

Guardado
