In [1]:
# Importaciones
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import time
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTENC
from imblearn.under_sampling import TomekLinks
from itertools import combinations
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_recall_curve, auc
from sklearn.model_selection import StratifiedKFold
from scipy.stats import friedmanchisquare
from scipy import stats

In [2]:
bagging_model = [
  {
    'path_name' : 'rf', 
    'model_name' : 'RandomForestClassifier',
    'model' : RandomForestClassifier(criterion="gini", n_estimators=100)
  },
  {
    'path_name' : 'dt', 
    'model_name' : 'DecisionTreeClassifier',
    'model' : DecisionTreeClassifier(criterion="gini")
  },
  {
    'path_name' : 'mlp',
    'model_name' : 'MLPClassifier',
    'model' : MLPClassifier(hidden_layer_sizes=(64,), max_iter=1000, early_stopping=True, random_state=9603)
  },
  {
    'path_name' : 'knn',
    'model_name' : 'KNeighborsClassifier',
    'model' : KNeighborsClassifier(n_neighbors=3, weights='distance', metric='manhattan')
  },
]

In [52]:
df_results = pd.read_excel("output/testing_results.xlsx", index_col='Unnamed: 0')
df_results

Unnamed: 0,iter,model,label,TP,TN,FP,FN,Exac,Prec,Sens,F1,AUC_SP,ms
0,Iteración 1,RandomForestClassifier,BENIGN,11564,5416,6,7,0.999235,0.999481,0.999395,0.999438,0.999894,0.081429
1,Iteración 1,RandomForestClassifier,BOT,9,16984,0,0,1.000000,1.000000,1.000000,1.000000,1.000000,0.081429
2,Iteración 1,RandomForestClassifier,DDOS,1189,15804,0,0,1.000000,1.000000,1.000000,1.000000,1.000000,0.081429
3,Iteración 1,RandomForestClassifier,DOS_GOLDENEYE,91,16897,2,3,0.999706,0.978495,0.968085,0.973262,0.998891,0.081429
4,Iteración 1,RandomForestClassifier,DOS_HULK,1979,15012,0,2,0.999882,1.000000,0.998990,0.999495,0.999733,0.081429
...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,Iteración 10,BaggingClassifier,DOS_SLOWHTTPTEST,21,16970,0,1,0.999941,1.000000,0.954545,0.976744,0.990079,43.372202
496,Iteración 10,BaggingClassifier,DOS_SLOWLORIS,49,16940,2,1,0.999823,0.960784,0.980000,0.970297,0.997838,43.372202
497,Iteración 10,BaggingClassifier,FTP_PATATOR,50,16942,0,0,1.000000,1.000000,1.000000,1.000000,1.000000,43.372202
498,Iteración 10,BaggingClassifier,PORTSCAN,1989,14995,7,1,0.999529,0.996493,0.999497,0.997993,0.999539,43.372202


In [53]:
df_results['label'].unique()

array(['BENIGN', 'BOT', 'DDOS', 'DOS_GOLDENEYE', 'DOS_HULK',
       'DOS_SLOWHTTPTEST', 'DOS_SLOWLORIS', 'FTP_PATATOR', 'PORTSCAN',
       'SSH_PATATOR'], dtype=object)

In [55]:
df_results = df_results[["iter", "model", "label", "TP", "TN", "FP", "FN", "Exac", "Prec", "Sens", "F1", "ms", "AUC_SP"]]
df_results['ms'] = df_results['ms'] * 1000
df_results['n_iter'] = df_results['iter'].str.extract(r'(\d+)').astype(int)
df_results['model'] = pd.Categorical(df_results['model'], categories=['RandomForestClassifier', 'DecisionTreeClassifier', 'MLPClassifier', 'KNeighborsClassifier', 'BaggingClassifier'], ordered=True)
df_results['label'] = pd.Categorical(df_results['label'], categories=['BENIGN', 'BOT', 'DDOS', 'DOS_GOLDENEYE', 'DOS_HULK', 'DOS_SLOWHTTPTEST', 'DOS_SLOWLORIS', 'FTP_PATATOR', 'PORTSCAN', 'SSH_PATATOR'], ordered=True)
df_results = df_results.sort_values(by=['n_iter', 'model', 'label'])
df_results[["iter", "model", "label", "TP", "TN", "FP", "FN", "Exac", "Prec", "Sens", "F1", "ms", "AUC_SP"]].to_excel("output/df_sorted_results.xlsx")
df_results

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_results['ms'] = df_results['ms'] * 1000


Unnamed: 0,iter,model,label,TP,TN,FP,FN,Exac,Prec,Sens,F1,ms,AUC_SP,n_iter
0,Iteración 1,RandomForestClassifier,BENIGN,11564,5416,6,7,0.999235,0.999481,0.999395,0.999438,81.4288,0.999894,1
1,Iteración 1,RandomForestClassifier,BOT,9,16984,0,0,1.000000,1.000000,1.000000,1.000000,81.4288,1.000000,1
2,Iteración 1,RandomForestClassifier,DDOS,1189,15804,0,0,1.000000,1.000000,1.000000,1.000000,81.4288,1.000000,1
3,Iteración 1,RandomForestClassifier,DOS_GOLDENEYE,91,16897,2,3,0.999706,0.978495,0.968085,0.973262,81.4288,0.998891,1
4,Iteración 1,RandomForestClassifier,DOS_HULK,1979,15012,0,2,0.999882,1.000000,0.998990,0.999495,81.4288,0.999733,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,Iteración 10,BaggingClassifier,DOS_SLOWHTTPTEST,21,16970,0,1,0.999941,1.000000,0.954545,0.976744,43372.2024,0.990079,10
496,Iteración 10,BaggingClassifier,DOS_SLOWLORIS,49,16940,2,1,0.999823,0.960784,0.980000,0.970297,43372.2024,0.997838,10
497,Iteración 10,BaggingClassifier,FTP_PATATOR,50,16942,0,0,1.000000,1.000000,1.000000,1.000000,43372.2024,1.000000,10
498,Iteración 10,BaggingClassifier,PORTSCAN,1989,14995,7,1,0.999529,0.996493,0.999497,0.997993,43372.2024,0.999539,10


In [None]:
models = df_results['model'].unique()
metrics = ["Exac", "Prec", "Sens", "F1", "AUC_SP", "ms"]
df_friedmann = pd.DataFrame(columns=["label", "metric", "stadistic", "p_value", "difieren?"])
df_shapiro   = pd.DataFrame(columns=["label", "metric", "stadistic", "p_value", "es normal?"])

nivel_significancia = 0.05

def obtener_residuos(data) :
  df = pd.Series(data)
  mean = df.mean()
  df = df - mean
  return df.to_list()

for i in df_results["label"].unique() : 
  print(f"\n       Etiqueta : {i}\n")

  with pd.ExcelWriter(f'output/muestra_para_analisis/{i}.xlsx') as writer:
    for j in metrics : 
      print(f"Indicador : {j}")
      df_muestra_general = pd.DataFrame(columns=['label', 'model', 'iter_1', 'iter_2', 'iter_3', 'iter_4', 'iter_5', 'iter_6', 'iter_7', 'iter_8', 'iter_9', 'iter_10'])
      lista_shapiro_residuos   = []
      lista_friedmann = []

      """ Obtención de muestras de cada modelo por la variable objetivo """
      mask = df_results["label"] == i

      for model in models : 
        # Obtención de muestra de resultados de cada iteración de cada modelo 
        muestra = df_results[mask & (df_results["model"] == model)][j].to_list()

        # Agregando muestra al dataframe
        df_muestra_general.loc[len(df_muestra_general)] = [j, model] + muestra

        # Obtención de residuos por muestra
        residuos_muestra = obtener_residuos(muestra)

        lista_shapiro_residuos = lista_shapiro_residuos + residuos_muestra
        lista_friedmann.append(muestra)

      print(lista_shapiro_residuos)
      print(lista_friedmann)

      df_muestra_general.to_excel(writer, sheet_name=j, index=False)

      """ Test de Friedmann """
      # H0 (hipótesis nula): No hay diferencias significativas entre los grupos comparados (los modelos rinden igual).
      # H1 (hipótesis alternativa): Al menos un grupo difiere significativamente de los demás.
      # Nivel de significancia = 0.05
      # Si p <= nivel de significancia  ->  Se rechaza H0 (al menos una muestra difiere de las demás)

      """ Test de Shapiro-Wilk """
      # H0 (hipótesis nula): La muestra tiene una distribución normal
      # H1 (hipótesis alternativa): La muestre no tiene distribución normal
      # Nivel de significancia = 0.05
      # Si p <= nivel de significancia  ->  Se rechaza H0 (la distribución no es normal)

      # Guardando resultados de test shapiro sobre la muestra
      stat, p = stats.shapiro(lista_shapiro_residuos)
      df_shapiro.loc[len(df_shapiro)] = [
        i, 
        j, 
        stat, 
        p, 
        p <= nivel_significancia
      ]

      # Guardando resultados de test friedman sobre la muestra
      stat, p = stats.friedmanchisquare(*lista_friedmann)
      df_friedmann.loc[len(df_friedmann)] = [
        i,
        j,
        stat,
        p,
        p > nivel_significancia
      ]


       Etiqueta : BENIGN

Indicador : Exac
[1.1778556651798056e-05, 7.062631749443771e-05, -0.0003413080084043729, 0.00012947407833718838, -4.706920419095262e-05, 0.00012947407833718838, 7.062631749443771e-05, -4.706920419095262e-05, 0.00018828720655228448, -0.00016482013808050144, -0.0001765197323412293, -0.0012357794275098533, 0.00011771907187230202, 0.00023541459355769234, 2.355018691169164e-08, -0.0005296062973974003, 0.00035311011524308267, 0.000647348919456614, 0.0004118470516774053, 0.00017644215525558504, -0.0006295910396469928, 1.7734329622709488e-05, 0.0007827552205778021, 0.00042966865552163114, -0.00045304775711896283, -0.00045304775711896283, 0.00019427761215073946, 0.0008416029814204418, -0.0010419651999166168, 0.0003116129545089885, -0.00046486995088157457, 0.00035899870091626873, -0.00028832666835343357, -0.00011178338582540359, 0.00018245541838812773, 0.0004178464617589084, 0.00012360765754548808, 0.0004178464617589084, -0.00022961051149572587, -0.00040616418381211883

In [5]:
df_friedmann[df_friedmann["difieren?"]]

Unnamed: 0,label,metric,stadistic,p_value,difieren?
8,BOT,Sens,5.6,0.231078,True
32,DOS_SLOWHTTPTEST,Sens,4.747664,0.314182,True
38,DOS_SLOWLORIS,Sens,5.360544,0.252262,True
44,FTP_PATATOR,Sens,1.333333,0.855695,True
50,PORTSCAN,Sens,4.624277,0.328065,True
54,SSH_PATATOR,Exac,8.153846,0.086102,True
56,SSH_PATATOR,Sens,8.642857,0.070673,True
57,SSH_PATATOR,F1,7.647059,0.105397,True


In [7]:
df_shapiro[~df_shapiro["es normal?"]]

Unnamed: 0,label,metric,stadistic,p_value,es normal?
0,BENIGN,Exac,0.959989,0.088802,False
1,BENIGN,Prec,0.984516,0.750621,False
3,BENIGN,F1,0.959763,0.086879,False
24,DOS_HULK,Exac,0.965534,0.151488,False
26,DOS_HULK,Sens,0.98794,0.886738,False
27,DOS_HULK,F1,0.965533,0.151476,False
31,DOS_SLOWHTTPTEST,Prec,0.954669,0.053218,False
33,DOS_SLOWHTTPTEST,F1,0.958126,0.074195,False
51,PORTSCAN,F1,0.957114,0.067305,False
