<h1>Imports</h1>

In [None]:
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import RandomUnderSampler
from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split, KFold, cross_val_score, cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle
from xgboost import XGBClassifier

<h1>Carregando os dados</h1>

In [None]:
# Load the dataset from the CSV file sitting next to the notebook.
dataset_path = 'outReduced2.csv'
features = pd.read_csv(dataset_path)

In [None]:
# Preview the first ten rows of the raw frame.
features.head(n=10)

In [None]:
# List every distinct class name found in the 'Label' column.
distinct_labels = features['Label'].unique()
print(distinct_labels)

In [None]:
# Count how many rows belong to each class.
label_column = features['Label']
label_column.value_counts()

In [None]:
# Plot the class distribution.
# kind='pie' draws a pie chart; kind='bar' would draw a bar chart instead.
class_counts = features['Label'].value_counts()
class_counts.plot(kind='pie')

In [None]:
# Group the fine-grained attack labels into coarser attack families.
# No target name is also a source name, so a single mapping is equivalent to
# applying the replacements one after another.
attack_groups = {
    'Web Attack - Brute Force': 'Web Attack',
    'Web Attack - XSS': 'Web Attack',
    'Web Attack - Sql Injection': 'Web Attack',
    'FTP-Patator': 'FTP-Brute',
    'SSH-Patator': 'SSH-Brute',
    'DoS slowloris': 'DoS',
    'DoS Slowhttptest': 'DoS',
    'DoS Hulk': 'DoS',
    'DoS GoldenEye': 'DoS',
    'Bot': 'BotNet',
}
features['Label'] = features['Label'].replace(attack_groups)

In [None]:
# Remove the classes that have too few instances to be useful.
for rare_label in ('Heartbleed', 'Infiltration'):
    rare_rows = features[features['Label'] == rare_label].index
    features.drop(rare_rows, inplace = True)

In [None]:
# Convert the nominal class names into numeric class ids.
label_ids = {
    'BENIGN': 0,
    'DoS': 1,
    'PortScan': 2,
    'DDoS': 3,
    'FTP-Brute': 4,
    'SSH-Brute': 5,
    'Web Attack': 6,
    'BotNet': 7,
}
# replace() leaves values that are not keys untouched, so re-running this cell
# on already-encoded labels is harmless. The explicit cast gives an integer
# column regardless of whether replace() downcasts the object column, and it
# raises if any class name was left unmapped instead of passing a mixed
# str/int column downstream.
features['Label'] = features['Label'].replace(label_ids).astype('int64')

In [None]:
# Attribute selection: 23 flow attributes plus 'Label' (24 columns in total).
selected_columns = [
    'Bwd Packet Length Min',
    'Subflow Fwd Bytes',
    'Total Length of Fwd Packets',
    'Fwd Packet Length Mean',
    'Bwd Packet Length Std',
    'Flow IAT Min',
    'Fwd IAT Min',
    'Flow IAT Mean',
    'Flow Duration',
    'Flow IAT Std',
    'Active Min',
    'Active Mean',
    'Bwd IAT Mean',
    'Fwd IAT Mean',
    'ACK Flag Count',
    'Fwd PSH Flags',
    'SYN Flag Count',
    'Fwd Packets/s',
    'Init_Win_bytes_backward',
    'Bwd Packets/s',
    'Init_Win_bytes_forward',
    'PSH Flag Count',
    'Average Packet Size',
    'Label',
]
features = features[selected_columns]

In [None]:
# Handle infinite, NaN and null values.
# Both +inf and -inf are turned into NaN so that they get imputed below.
# replace() returns a new frame, which avoids a masked assignment on the
# column subset selected in the previous cell.
features = features.replace([np.inf, -np.inf], np.nan)

# Record whether anything was missing before imputation; the flag is the
# cell's last expression so the notebook actually displays it.
had_missing = features.isnull().values.any()

# Impute each attribute with its own column mean. 'Label' is excluded from the
# means so a class id can never be filled with an averaged value.
attribute_means = features.drop(columns='Label').mean()
features = features.fillna(attribute_means)

had_missing

In [None]:
# Keep the class ids in a NumPy array and remove the 'Label' column from the
# "features" dataframe, leaving only the input attributes.
label_column = features['Label']
labels = np.array(label_column)
features = features.drop(columns='Label')

In [None]:
# Undersample the classes: classes 0-3 are cut down to 20000 rows each, while
# classes 4-7 are given explicit target counts.
# NOTE(review): the targets for classes 4-7 look like their full sizes in this
# dataset — confirm against the value_counts() output above.
sampling_strategy = {0: 20000, 1: 20000, 2: 20000, 3: 20000, 4: 7938, 5: 5897, 6: 2180, 7: 1966}
# Seeded so the same rows are kept on every Restart & Run All.
rus = RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=0)
features, labels = rus.fit_resample(features, labels)

In [None]:
# Apply SMOTE + ENN to the classes; seeded so the resampled set is the same on
# every run.
# NOTE(review): this runs before the train/test split and the K-fold loop, so
# resampled rows also end up in the evaluation data — consider resampling the
# training portion only.
smote_enn = SMOTEENN(random_state=0)
features, labels = smote_enn.fit_resample(features, labels)
print(np.unique(labels, return_counts=True))

<h1>Preparando os classificadores</h1>

In [None]:
# Min-max scale the attributes so that KNN behaves well.
# NOTE(review): the scaler is fit on the full dataset before any split, so the
# held-out rows contribute to the fitted min/max — confirm this is acceptable.
scaler = MinMaxScaler()
scaler.fit(features)
features = scaler.transform(features)

In [None]:
# Hold out 30% of the rows as a test set for the hyperparameter searches.
split_parts = train_test_split(
    features,
    labels,
    test_size = 0.3,
    shuffle=True,
    random_state = 0,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Check how many instances of each class ended up in the train and test sets.
# The split above produces y_train / y_test; the names train_labels and
# test_labels are never defined in this notebook.
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True))

In [None]:
# Randomized search for the best n_estimators of the Random Forest.
# randint is scipy.stats.randint (imported in the imports cell); it samples
# integers from 100 up to, but not including, 400.
param_dist = {'n_estimators': randint(100, 400)}
# Both the forest and the search are seeded so the sampled candidates and the
# reported winner are repeatable.
rf = RandomForestClassifier(random_state=0)
rand_search = RandomizedSearchCV(rf, param_distributions = param_dist, n_iter=5, cv=5, random_state=0)
rand_search.fit(X_train, y_train)

# Best classifier found by the search
best_rf = rand_search.best_estimator_

# Show the best hyperparameter
print('Melhor hiperparametro RF:',  best_rf)

In [None]:
# Randomized search for the best n_estimators of XGBoost.
# randint is scipy.stats.randint (imported in the imports cell); it samples
# integers from 100 up to, but not including, 400.
param_dist = {'n_estimators': randint(100, 400)}
# Both the booster and the search are seeded so the sampled candidates and the
# reported winner are repeatable.
xgb = XGBClassifier(random_state=0)
rand_search = RandomizedSearchCV(xgb, param_distributions = param_dist, n_iter=5, cv=5, random_state=0)
rand_search.fit(X_train, y_train)

# Best classifier found by the search
best_xgb = rand_search.best_estimator_

# Show the best hyperparameter
print('Melhor hiperparametro XGBoost:',  best_xgb)

In [None]:
# Randomized search for the best n_neighbors of KNN.
# randint is scipy.stats.randint (imported in the imports cell); it samples
# integers from 1 up to, but not including, 17.
param_dist = {'n_neighbors': randint(1, 17)}
knn = KNeighborsClassifier()
# The search is seeded so the same candidate values are drawn on every run.
rand_search = RandomizedSearchCV(knn, param_distributions = param_dist, n_iter=5, cv=5, random_state=0)
rand_search.fit(X_train, y_train)

# Best parameter set found by the search (a dict, unlike best_rf / best_xgb,
# which hold fitted estimators)
best_knn = rand_search.best_params_

# Show the best hyperparameter
print('Melhor hiperparametro KNN:',  best_knn)

In [None]:
# Create the RF, XGBoost and KNN classifiers used for the final evaluation.
# NOTE(review): the hyperparameters are hardcoded rather than read from the
# searches above — presumably the values found in an earlier run; confirm they
# still match the search output.
# RF and XGBoost are seeded so the evaluation below is repeatable.
rf = RandomForestClassifier(n_estimators = 300, random_state = 0)
xgb = XGBClassifier(n_estimators = 300, random_state = 0)
knn = KNeighborsClassifier(n_neighbors = 11)

<h1>Treino e Teste dos modelos</h1>

In [None]:
# 5-fold evaluation: each model is refit on the training part of the fold and
# scored on the held-out part. The shuffle is seeded so the fold assignment,
# and the confusion matrices built from the last fold, are repeatable.
kFold = KFold(n_splits=5, shuffle=True, random_state=0)
for train_index, test_index in kFold.split(features):

    # These names overwrite the earlier hold-out split; after the loop they
    # hold the data of the last fold.
    X_train, X_test = features[train_index], features[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    # Fit all three models on the same training fold.
    for model in (rf, xgb, knn):
        model.fit(X_train, y_train)

    # Accuracy and macro-averaged precision on the held-out fold.
    predrf = rf.predict(X_test)
    print("Accuracy RF: ", round(accuracy_score(y_test, predrf), 4), "\n")
    print("Precision RF: ", round(precision_score(y_test, predrf, average='macro'), 4), "\n --- \n")

    predxgb = xgb.predict(X_test)
    print("Accuracy XGB: ", round(accuracy_score(y_test, predxgb), 4), "\n")
    print("Precision XGB: ", round(precision_score(y_test, predxgb, average='macro'), 4), "\n --- \n")

    predknn = knn.predict(X_test)
    print("Accuracy KNN: ", round(accuracy_score(y_test, predknn), 4), "\n")
    print("Precision KNN: ", round(precision_score(y_test, predknn, average='macro'), 4), "\n --- \n")


<h1>Construindo e plotando matrizes de confusão dos modelos</h1>
<h2>Utiliza o último resultado do K-Fold</h2>

In [None]:
# Confusion matrix of the Random Forest, using the predictions of the last fold.
cmrf = confusion_matrix(y_test, predrf)
display_rf = ConfusionMatrixDisplay(confusion_matrix=cmrf)
display_rf.plot();

In [None]:
# Confusion matrix of XGBoost, using the predictions of the last fold.
cmxgb = confusion_matrix(y_test, predxgb)
display_xgb = ConfusionMatrixDisplay(confusion_matrix=cmxgb)
display_xgb.plot();

In [None]:
# Confusion matrix of KNN, using the predictions of the last fold.
cmknn = confusion_matrix(y_test, predknn)
display_knn = ConfusionMatrixDisplay(confusion_matrix=cmknn)
display_knn.plot();

In [None]:
# Optional re-test of the models with cross_validate (CV).
# The original note says cross_validate uses StratifiedKFold here.
cv_metrics = ['accuracy', 'precision_macro']

# Same order as before: RF, then XGBoost, then KNN.
for model in (rf, xgb, knn):
    cv_result = cross_validate(model, features, labels, scoring=cv_metrics, cv=5)
    print(cv_result)