In [1]:
from sklearnex import patch_sklearn
patch_sklearn()

import numpy as np
from matplotlib import pyplot as plt
from tqdm.autonotebook import tqdm
import pandas as pd
import pickle
import os

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV

from sklearn.base import BaseEstimator, TransformerMixin, clone

from sklearn.feature_selection import SelectFromModel
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as skLDA

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn import set_config

from xgboost import XGBClassifier
from catboost import CatBoostClassifier

import random
from deap import base, creator, tools, algorithms

import multiprocessing

from cuml.svm import LinearSVC as cuSVC
from cuml import LogisticRegression as cuLR
from cuml.neighbors import KNeighborsClassifier as cuKNN
from cuml.ensemble import RandomForestClassifier as cuRF
from cuml.naive_bayes import GaussianNB as cuNB
from sklearn.naive_bayes import GaussianNB as skNB
from sklearn.ensemble import RandomForestClassifier as skRF
from sklearn.neighbors import KNeighborsClassifier as skKNN
from sklearn.svm import LinearSVC as skSVC
from sklearn.linear_model import LogisticRegression as skLR

import random
import shap
import time
import functools

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)
  from tqdm.autonotebook import tqdm


In [2]:
# Read the combined parquet table. The `traffic_type` column supplies the
# labels; every other column is used as a feature.
df = pd.read_parquet("data/BaIoT/danmini_combined.parquet")

# Integer-encode the labels.
y_data = LabelEncoder().fit_transform(df['traffic_type'])

# Standardize the features. Note that the scaler is fit on the full table,
# before any train/test split is made further down.
X_data = StandardScaler().fit_transform(df.drop(columns=['traffic_type']))

# The frame is no longer needed once the arrays exist; release it.
del df

In [7]:
def has_converged(series, w, eps):
    """Check whether the newest value of ``series`` has stabilised.

    The newest value is compared against the ``w`` values immediately before
    it. The series counts as converged when the mean absolute deviation of
    that window from the newest value, divided by the magnitude of the newest
    value, is at most ``eps``.

    Parameters
    ----------
    series : sequence of float
        Observed values, oldest first. Plain Python lists of ``float``, lists
        of NumPy scalars and NumPy arrays are all accepted.
    w : int
        Number of preceding values to compare against; must be at least 1.
    eps : float
        Largest mean relative error that is still regarded as converged.

    Returns
    -------
    bool
        ``False`` while fewer than ``w + 1`` values are available, otherwise
        whether the mean relative error is ``<= eps``.

    Raises
    ------
    ValueError
        If ``w`` is smaller than 1 (the comparison window would be empty).
    """
    if w < 1:
        raise ValueError("w must be a positive integer")
    if w >= len(series):
        return False

    # Convert only the tail that is actually inspected. Going through an
    # ndarray makes the subtraction work for plain Python lists of floats,
    # where ``list - float`` would otherwise raise TypeError.
    tail = np.asarray(series[-w - 1:], dtype=float)
    current = tail[-1]
    deviations = np.abs(tail[:-1] - current)

    if current == 0:
        # The relative error is undefined when the reference value is zero.
        # Treat an all-zero window as perfectly converged and anything else
        # as infinitely far away, instead of dividing by zero.
        average_error = np.inf if deviations.any() else 0.0
    else:
        average_error = np.mean(deviations / np.abs(current))
    return bool(average_error <= eps)

def bag_of_little_bootstraps(X, y, clf, scorer, gamma, r_eps=0.05, r_w=20, s_eps=0.05, s_w=3,
                             random_state=None, verbose=True):
    """Estimate a classifier score with a bag-of-little-bootstraps style loop.

    Subsamples of size ``b = round(len(X) ** gamma)`` are drawn without
    replacement. Each subsample is split once into train/test indices
    (stratified on the subsample labels, 33% test, fixed split seed 42).
    For every subsample, multinomial weights summing to ``len(X)`` are drawn
    repeatedly over its ``b`` points; a fresh clone of ``clf`` is fitted with
    the train part of those weights and scored on the test part with the
    matching weights. The inner loop stops once ``has_converged`` accepts the
    Monte Carlo scores, and the outer loop stops once it accepts the
    per-subset mean scores.

    Parameters
    ----------
    X : array
        Feature matrix; must support ``len`` and NumPy-style ``X[idx, :]``
        indexing with integer index lists/arrays.
    y : array
        Labels aligned with ``X``; must support ``y[idx]`` indexing.
    clf : estimator
        Cloned before every fit. Must accept ``fit(X=, y=, sample_weight=)``
        and provide ``predict_proba``.
    scorer : callable
        Called as ``scorer(y_true, probabilities, sample_weight=weights)``.
    gamma : float
        Exponent that determines the subsample size ``round(n ** gamma)``.
    r_eps, r_w : float, int
        Tolerance and window passed to ``has_converged`` for the inner
        (Monte Carlo resampling) loop.
    s_eps, s_w : float, int
        Tolerance and window passed to ``has_converged`` for the outer
        (subset) loop.
    random_state : int or None, default None
        ``None`` keeps the previous behaviour of drawing from the global
        ``random`` and ``np.random`` state. An integer seeds private
        generators for both the subsample indices and the multinomial
        weights, which makes a run repeatable.
    verbose : bool, default True
        When true, print the subsample size, the number of subsets used and
        the final mean score (same output as before this flag existed).

    Returns
    -------
    float
        Mean over subsets of the mean Monte Carlo score.

    Raises
    ------
    ValueError
        If the subsample size ``b`` is not in ``[1, len(X)]``.
    """
    n = len(X)
    b = round(n**gamma)
    if not 1 <= b <= n:
        raise ValueError(
            "subsample size b=" + str(b) + " (round(n**gamma)) must be between 1 and n=" + str(n)
        )

    if random_state is None:
        # Legacy path: consume the module-level RNGs, so code that seeds
        # `random` / `np.random` globally keeps producing the same draws.
        index_rng, weight_rng = random, np.random
    else:
        index_rng = random.Random(random_state)
        weight_rng = np.random.RandomState(random_state)

    if verbose:
        print(b)

    # Uniform resampling probabilities are the same for every draw.
    uniform_pvals = np.full(b, 1 / b)

    subset_scores = []
    while not has_converged(subset_scores, s_w, s_eps):
        subsample_idx = index_rng.sample(range(n), k=b)
        X_sampled = X[subsample_idx, :]
        y_sampled = y[subsample_idx]
        train, test = train_test_split(np.arange(b), test_size=0.33, stratify=y_sampled, random_state=42)

        # The split is fixed for this subsample, so slice once instead of
        # re-copying the same rows on every Monte Carlo iteration.
        X_train, y_train = X_sampled[train, :], y_sampled[train]
        X_test, y_test = X_sampled[test, :], y_sampled[test]

        monte_carlo_scores = []
        while not has_converged(monte_carlo_scores, r_w, r_eps):
            # One bootstrap resample of size n, expressed as per-point counts.
            sample_weights = weight_rng.multinomial(n=n, pvals=uniform_pvals, size=1)[0]
            model = clone(clf)
            model.fit(X=X_train,
                      y=y_train,
                      sample_weight=sample_weights[train])
            monte_carlo_scores.append(
                scorer(y_test,
                       model.predict_proba(X_test),
                       sample_weight=sample_weights[test])
            )
        subset_scores.append(np.mean(monte_carlo_scores))

    mean_score = np.mean(subset_scores)
    if verbose:
        print("num subsets taken: " + str(len(subset_scores)))
        # The label says "acc", but the value is whatever `scorer` returns.
        print("mean acc: " + str(mean_score))
    return mean_score

In [8]:
# Score Gaussian naive Bayes with weighted one-vs-rest ROC AUC; the subsample
# size is round(n ** 0.65). The call is the last expression, so its return
# value is shown as the cell output.
bag_of_little_bootstraps(
    X_data,
    y_data,
    clf=skNB(),
    scorer=functools.partial(roc_auc_score, multi_class="ovr", average="weighted"),
    gamma=0.65,
    r_eps=0.05,
    r_w=20,
    s_eps=0.03,
    s_w=5,
)

8037
num subsets taken: 6
mean acc: 0.9920372276763697


0.9920372276763697