In [1]:
# Imports

import numpy as np
import pandas as pd
from shared.utils import load_data
from datasets import preprocess_dataset, datasets_types
from sklearn.neural_network import MLPClassifier
from sklearn.cluster import KMeans
from scipy.stats import beta as beta_dist
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
seed = 42

In [2]:
load_dataset = True
name = "CIC-IDS_2017_MAB"
if not load_dataset:
    # Preprocesar el dataset
    """      """
    df = load_data(
        [
            "./shared/data/CIC_2017/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv",
            "./shared/data/CIC_2017/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv",
            "./shared/data/CIC_2017/Tuesday-WorkingHours.pcap_ISCX.csv"
        ],
        seed
    )
    print("Dataset cargado")
    df_preprocessed = preprocess_dataset(
        df, save=True, dataset_type="CIC_2017", seed=seed, load=load_dataset, name_save=name, name_load=name)
    print("Dataset Preprocesado")
else:
    df_preprocessed = preprocess_dataset(
        pd.DataFrame(), save=True, dataset_type="CIC_2017", seed=seed, load=load_dataset, name_save=name, name_load=name)
    print("Dataset Preprocesado")


Dataset cargado
Loading new data
labels: {'SSH-Patator', 'FTP-Patator', 'Web Attack � Brute Force', 'Infiltration', 'Web Attack � Sql Injection', 'Web Attack � XSS'}
Dataset Preprocesado


In [3]:
mlp = MLPClassifier(
            hidden_layer_sizes=(32),
            max_iter=200,
            verbose=False,
            random_state=seed,
            batch_size=200,
            early_stopping=True,
            activation='tanh',
            solver='adam'
        )

In [4]:
class MultiArmedBanditThompsonSampling:

    def __init__(self, n_arms, n_clusters):
        self.n_arms = n_arms
        self.n_clusters = n_clusters
        self.arms = [RandomForestClassifier(), LogisticRegression(),
                     GaussianNB(), DecisionTreeClassifier(), mlp]
        self.cluster_centers = None
        self.cluster_assignments = None
        self.reward_sums = {}
        for cluster in range(n_clusters):
            self.reward_sums[cluster] = np.zeros(n_arms)
        self.alpha = np.ones(self.n_arms)
        self.beta = np.ones(self.n_arms)

    def train(self, X_train, y_train):
        kmeans = KMeans(n_clusters=self.n_clusters)
        self.cluster_assignments = kmeans.fit_predict(X_train)
        self.cluster_centers = kmeans.cluster_centers_
        # Print the number of samples in each cluster

        for i in range(self.n_clusters):
            print('Cluster {}: {}'.format(
                i, np.sum(self.cluster_assignments == i)))
            cluster_mask = self.cluster_assignments == i
            cluster_X_train = X_train[cluster_mask]
            cluster_y_train = y_train[cluster_mask]
            for arm in range(self.n_arms):
                print('Training arm {} on cluster {}'.format(arm, i))
                arm_mask = cluster_y_train == arm
                arm_X_train = cluster_X_train[arm_mask]
                arm_y_train = cluster_y_train[arm_mask]
                if len(arm_X_train) > 0 and len(np.unique(arm_y_train)) > 1:
                    self.arms[arm].fit(arm_X_train, arm_y_train)
                else:
                    self.arms[arm].fit(X_train, y_train)

        # Set the arms rewards for each cluster
        for i in range(self.n_clusters):
            cluster_mask = self.cluster_assignments == i
            cluster_X_test = X_train[cluster_mask]
            cluster_y_test = y_train[cluster_mask]
            for arm in range(self.n_arms):
                print('Setting reward_sums arm {} on cluster {}'.format(arm, i))
                arm_mask = cluster_y_test == arm
                arm_X_test = cluster_X_test[arm_mask]
                arm_y_test = cluster_y_test[arm_mask]
                if len(arm_X_test) > 0:
                    arm_y_pred = self.arms[arm].predict(arm_X_test)
                    self.reward_sums[i][arm] = np.mean(
                        arm_y_pred == arm_y_test)

    def select_arm(self, cluster):
        # Select the arm with the highest reward
        theta = np.zeros(self.n_arms)
        for arm in range(self.n_arms):
            theta[arm] = np.random.beta(self.alpha[arm] + self.reward_sums[cluster]
                                        [arm], self.beta[arm] + 1 - self.reward_sums[cluster][arm])
        return np.argmax(theta)

    def predict(self, X_test):
        # Select the arm for each sample
        arms = np.zeros(len(X_test))
        for i in range(len(X_test)):
            cluster = np.argmin(np.linalg.norm(
                self.cluster_centers - X_test[i], axis=1))
            arms[i] = self.select_arm(cluster)
        # Predict using the selected arm
        y_pred = np.zeros(len(X_test))
        for arm in range(self.n_arms):
            arm_mask = arms == arm
            arm_X_test = X_test[arm_mask]
            if len(arm_X_test) > 0:
                y_pred[arm_mask] = self.arms[arm].predict(arm_X_test)
        return y_pred, arms


In [5]:
df_preprocessed.y_train

335097    0
239304    0
372543    0
214897    0
485479    0
         ..
150160    0
205126    0
809270    0
739403    0
526654    0
Name:  Label, Length: 633413, dtype: object

In [7]:
# Train the MAB
mab = MultiArmedBanditThompsonSampling(n_arms=5, n_clusters=2)
mab.train(df_preprocessed.x_train, df_preprocessed.y_train)



Cluster 0: 153209
Training arm 0 on cluster 0


In [8]:
# Test the MAB
y_pred, selected_arms = mab.predict(df_preprocessed.x_test)
print("Accuracy:", accuracy_score(df_preprocessed.y_test, y_pred))
print(classification_report(df_preprocessed.y_test, y_pred))

Accuracy: 0.9549280798653159
              precision    recall  f1-score   support

           0       1.00      0.90      0.95     14655
           1       0.93      1.00      0.96     19202

    accuracy                           0.95     33857
   macro avg       0.96      0.95      0.95     33857
weighted avg       0.96      0.95      0.95     33857



In [9]:
# Test the MAB
y_pred2, selected_arms = mab.predict(X_test2)
print("Accuracy:", accuracy_score(y_test2, y_pred2))
print(classification_report(y_test2, y_pred2))

Accuracy: 0.9548690078861092
              precision    recall  f1-score   support

           0       1.00      0.90      0.94     14595
           1       0.93      1.00      0.96     19262

    accuracy                           0.95     33857
   macro avg       0.96      0.95      0.95     33857
weighted avg       0.96      0.95      0.95     33857



In [109]:
for i in range(y_pred.shape[0]):
    print("Selected arm:", selected_arms[i], "Predicted:", y_pred[i], "Actual:", y_test[i])

Selected arm: 2.0 Predicted: 0.0 Actual: 0
Selected arm: 0.0 Predicted: 0.0 Actual: 0
Selected arm: 0.0 Predicted: 0.0 Actual: 0
Selected arm: 0.0 Predicted: 0.0 Actual: 0
Selected arm: 1.0 Predicted: 0.0 Actual: 0
Selected arm: 0.0 Predicted: 0.0 Actual: 0
Selected arm: 0.0 Predicted: 0.0 Actual: 0
Selected arm: 0.0 Predicted: 1.0 Actual: 1
Selected arm: 0.0 Predicted: 0.0 Actual: 0
Selected arm: 0.0 Predicted: 0.0 Actual: 0
Selected arm: 0.0 Predicted: 0.0 Actual: 0
Selected arm: 0.0 Predicted: 1.0 Actual: 0
Selected arm: 0.0 Predicted: 0.0 Actual: 0
Selected arm: 1.0 Predicted: 1.0 Actual: 0
Selected arm: 2.0 Predicted: 0.0 Actual: 0
Selected arm: 2.0 Predicted: 0.0 Actual: 0
Selected arm: 2.0 Predicted: 0.0 Actual: 0
Selected arm: 0.0 Predicted: 0.0 Actual: 0
Selected arm: 0.0 Predicted: 0.0 Actual: 0
Selected arm: 0.0 Predicted: 0.0 Actual: 0
Selected arm: 0.0 Predicted: 1.0 Actual: 1
Selected arm: 2.0 Predicted: 0.0 Actual: 0
Selected arm: 1.0 Predicted: 0.0 Actual: 0
Selected ar