In [26]:
%pip install pandas scikit-learn seaborn optuna matplotlib river optuna-dashboard

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
# import lightgbm as lgb
import seaborn as sns
import optuna
import os
import sys
import datetime

In [5]:
from river import anomaly
from river import stream
from river import evaluate
from river import metrics
from river import compose
from river import preprocessing
from river import evaluate

In [6]:
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils import shuffle

In [7]:
PROJECT_ROOT                     = "../"
OPTUNA_STORAGE                   = "sqlite:///db.sqlite3"

SOURCE_DATASET_ROOT              = PROJECT_ROOT + "datasets/isp3/"

SOURCE_DATASET_FILENAME          = SOURCE_DATASET_ROOT + "[dvwa.isp_xvwa.isp]_https_apache.csv"
SOURCE_DATASET_FILENAME_D        = SOURCE_DATASET_ROOT + "[dvwa.isp]_https_apache.csv"
SOURCE_DATASET_FILENAME_X        = SOURCE_DATASET_ROOT + "[xvwa.isp]_https_apache.csv"
# SOURCE_DATASET_TRAIN_FILENAME    = SOURCE_DATASET_ROOT + "DataSetN_Train.csv"
# SOURCE_DATASET_TEST_FILENAME     = SOURCE_DATASET_ROOT + "DataSetN_Test.csv"

# Фичи

In [8]:
df = pd.read_csv(SOURCE_DATASET_FILENAME)
df['Label'].value_counts()

  df = pd.read_csv(SOURCE_DATASET_FILENAME)


Label
Web Attack - DDoS                 65295
Benign                             4446
Web Attack - XSS                   2686
Web Attack - SQL Injection         1483
Web Attack - Command Injection     1400
Web Attack - Brute Force            497
Web Attack - Web Shell              353
Web Attack - CSRF                    60
Name: count, dtype: int64

In [9]:
excluded = ['Flow_ID', 'Source_IP', 'Source_Port', 'Destination_IP', 'Destination_Port', 'Protocol', 'Timestamp', "Unnamed: 0", "xvwa.isp", "Label", "dvwa.isp"]
df = df.drop(columns=excluded, errors='ignore')

In [10]:
excluded2 = ['Session_Index',  'Target','Http_Reqest','Root','GlobalLabel','Type','Tools','ToolsThreads','ToolsDelay','ToolsAdditional',
             'ChannelSpeedBefore','ChannelSpeedAfter','NetemString','Server','KeepAliveTimeout','TargetProtocol','File','SessionAnalizerMode']
df = df.drop(columns=excluded2)

In [11]:
# webattack_features = list(df.columns)
webattack_features = ['Flow_Duration',
 'Total_Fwd_Packets',
 'Fwd_Packet_Length_Max',
 'Bwd_Packet_Length_Max',
 'Flow_Bytes_s',
 'Flow_Packets_s',
 'Flow_IAT_Min',
 'Fwd_IAT_Min',
 'Bwd_IAT_Mean',
 'Bwd_IAT_Std',
 'Bwd_IAT_Min',
 'Packet_Length_Variance',
 'Fwd_FIN_Flags',
 'Fwd_SYN_Flags',
 'Fwd_RST_Flags',
 'Bwd_FIN_Flags',
 'Stream_Session_Number',
 'Stream_Was_Prev_Session',
 'Stream_Was_Next_Session',
 'Stream_Session_Intervals_Min_Duration',
 'Stream_Session_Intervals_Std_Duration',
 'Stream_Session_Durations_Min_Duration',
 'Stream_Session_Durations_Max_Duration',
 'Fwd_Bandwidth_Max',
 'Bwd_Bandwidth_Min',
 'Bwd_Bandwidth_Max']

In [12]:
df = pd.read_csv(SOURCE_DATASET_FILENAME)
df['Label'].value_counts()

  df = pd.read_csv(SOURCE_DATASET_FILENAME)


Label
Web Attack - DDoS                 65295
Benign                             4446
Web Attack - XSS                   2686
Web Attack - SQL Injection         1483
Web Attack - Command Injection     1400
Web Attack - Brute Force            497
Web Attack - Web Shell              353
Web Attack - CSRF                    60
Name: count, dtype: int64

In [13]:
label_rows = df[df['Label'] == 'Web Attack - DDoS']
rows_to_remove = label_rows.sample(frac=0.94, random_state=42)
df = df.drop(index=rows_to_remove.index)
df['Label'].value_counts()

Label
Benign                            4446
Web Attack - DDoS                 3918
Web Attack - XSS                  2686
Web Attack - SQL Injection        1483
Web Attack - Command Injection    1400
Web Attack - Brute Force           497
Web Attack - Web Shell             353
Web Attack - CSRF                   60
Name: count, dtype: int64

In [14]:
# g = df.groupby('Label')
# g = g.apply(lambda x: x.sample(g.size().min()).reset_index(drop=True))
# df = g.reset_index(drop=True)

In [15]:
attacks = {
    'Web Attack - XSS',
    'Web Attack - CSRF',
    'Web Attack - Brute Force',
    'Web Attack - Web Shell',
    'Web Attack - Command Injection',
    'Web Attack - SQL Injection',
    'Web Attack - DDoS'
}

attack_to_exclude = {'Web Attack - Web Shell'}

benign = {'Benign', 'FromInSide'}

In [16]:
df['Label'] = df['Label'].replace(to_replace=benign, value='Benign')
df_full = df[df['Label'].isin(attacks | benign)]
df_part = df_full[~df_full['Label'].isin(attack_to_exclude)]

In [17]:
def prepare_dataset(df, train_size=0.2, test_size=0.8):
    X = df[webattack_features]
    y = df['Label']
    if train_size == 0 and test_size == 1:
        return [], X, [], y
    else:
        X_train, X_test, y_train, y_test = train_test_split(X,y, stratify=y, 
                                                            train_size=train_size, test_size=test_size, shuffle=True, random_state=0)
    return X_train, X_test, y_train, y_test

In [18]:
_, X_test1, _, y_test1 = prepare_dataset(df_part, train_size=0, test_size=1)

In [19]:
_, X_test2, _, y_test2 = prepare_dataset(df_full, train_size=0, test_size=1)

In [20]:
df_part["Label"].value_counts()

Label
Benign                            4446
Web Attack - DDoS                 3918
Web Attack - XSS                  2686
Web Attack - SQL Injection        1483
Web Attack - Command Injection    1400
Web Attack - Brute Force           497
Web Attack - CSRF                   60
Name: count, dtype: int64

# Выбор шиперпараметров

In [21]:
disable_progressbar = True

In [22]:
def add_result(result, key, val):
    if key in result:
        result[key].append(val)
    else:
        result[key] = [val]

In [23]:
def test(model, df, attacks, benign, test_ood):
    df_full = df[df['Label'].isin(attacks | benign)]
    result = {"Label": []}
    for attack_to_exclude in tqdm(attacks, disable=disable_progressbar):
        report, name = test_ood(model, df_full, {attack_to_exclude})
        
        result["Label"].append(attack_to_exclude)
        # add_result(result, name, f"{report._f1s[True].recall.get()*100}/{report._f1s[False].get()*100}")
        add_result(result, "recall for anomaly", report._f1s[True].recall.get())
        add_result(result, "f1 for normal", report._f1s[False].get())
        add_result(result, "all", (result["recall for anomaly"][-1]+result["f1 for normal"][-1])/2)
    result = pd.DataFrame(result)
    result = result.set_index("Label")
    columns=[(name, c) for c in result.columns]
    result.columns=pd.MultiIndex.from_tuples(columns)
    return result["OCSVM"]["all"].mean()

In [24]:
def test_ood_OneClassSVM(model, df, attack_to_exclude):
    def comp_threshold(scores: np.ndarray):
        for i in np.linspace(0.0, 0.9, num=50):
            res = (scores<i).sum()/scores.shape[0]
            if res>0.95:
                return i
    
    def prepare_df(df, scaler=None):
        X = df[webattack_features]
        y = df['Label']
        if scaler is None:
            scaler = StandardScaler()
            X = scaler.fit_transform(X)
        else:
            X = scaler.transform(X)
        y = y.to_numpy()
        return X, y, scaler

    def fit(model, X, y):
        # X, y = prepare_df(df)
        # model = OneClassSVM(kernel='rbf', gamma='auto', nu=0.01)
        model = model.fit(X)
    
        scores = model.score_samples(X)
        th = comp_threshold(scores)
        return model, th
    
    df_part = df[~df['Label'].isin(attack_to_exclude)]
    df_target = df[df['Label'].isin(attack_to_exclude)]
    
    X, y, scaler = prepare_df(df_part)

    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, 
                                                        train_size=0.5, test_size=0.5, 
                                                        shuffle=True, random_state=2024)

    X_target, y_target, scaler = prepare_df(df_target, scaler)
    
    X_test = np.append(X_test, X_target, axis=0)
    y_test = np.append(y_test, y_target, axis=0)
    X_test, y_test = shuffle(X_test, y_test, random_state=2024)
    
    model, th = fit(model, X_train, y_train)
    
    report = metrics.ClassificationReport()
    result = {"Label": [], "Score": [], "is_anomaly": []}
    for x, yi1 in tqdm(stream.iter_array(X_test, y_test), total=y_test.shape[0], leave=False, disable=disable_progressbar):
        x = np.array(list(x.values())).reshape(1, -1)
        result["Label"].append(yi1)
        is_anomaly = model.predict(x)[0] == -1
        result["is_anomaly"].append(is_anomaly)
        report.update(yi1 in attack_to_exclude, is_anomaly)
    
    f = open(os.devnull, 'w')
    stdout_t, sys.stdout = sys.stdout, f
    print(report)
    sys.stdout = stdout_t
    return report, "OCSVM"

In [25]:
%optuna-dashboard sqlite:///db.sqlite3

UsageError: Line magic function `%optuna-dashboard` not found.


In [23]:
import optuna

import sklearn.datasets
import sklearn.ensemble
import sklearn.model_selection
import sklearn.svm


def objective(trial):
    nu = trial.suggest_float("nu", 1e-10, 0.3, log=True)
    kernel = trial.suggest_categorical("kernel", ['poly', 'rbf', 'sigmoid'])
    # kernel = trial.suggest_categorical("kernel", ['linear', 'poly', 'rbf', 'sigmoid'])
    gamma = trial.suggest_categorical("gamma", ['scale', 'auto'])
    degree = 3
    if kernel == "poly":
        degree = trial.suggest_int("degree", 1, 32, log=True)
    classifier_obj = OneClassSVM(kernel=kernel, gamma=gamma, nu=nu, degree=degree)

    accuracy = test(classifier_obj, df, attacks, benign, test_ood_OneClassSVM)
    return accuracy


study = optuna.create_study(
    storage=OPTUNA_STORAGE,
    direction="maximize",
    study_name=f"OneClassSVM_{datetime.datetime.now()}"
)
study.optimize(objective, n_trials=100, n_jobs=-1, show_progress_bar=True, timeout=3600)
print(f"Best value: {study.best_value} (params: {study.best_params})")
print(study.best_trial)

[I 2024-08-20 17:27:12,317] A new study created in RDB with name: OneClassSVM_2024-08-20 17:27:11.717702


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2024-08-20 17:27:42,487] Trial 4 finished with value: 0.4473364195269873 and parameters: {'nu': 2.0360539220701245e-08, 'kernel': 'sigmoid', 'gamma': 'scale'}. Best is trial 4 with value: 0.4473364195269873.
[I 2024-08-20 17:27:48,923] Trial 5 finished with value: 0.4778683284002109 and parameters: {'nu': 3.451995120257347e-05, 'kernel': 'sigmoid', 'gamma': 'auto'}. Best is trial 5 with value: 0.4778683284002109.
[I 2024-08-20 17:27:49,120] Trial 6 finished with value: 0.4782701561775437 and parameters: {'nu': 1.193424406470777e-06, 'kernel': 'sigmoid', 'gamma': 'auto'}. Best is trial 0 with value: 0.7048105016895708.
[I 2024-08-20 17:27:49,483] Trial 0 finished with value: 0.7048105016895708 and parameters: {'nu': 0.00038085162697265064, 'kernel': 'poly', 'gamma': 'scale', 'degree': 5}. Best is trial 0 with value: 0.7048105016895708.
[I 2024-08-20 17:28:11,341] Trial 8 finished with value: 0.5 and parameters: {'nu': 4.6770502474944275e-09, 'kernel': 'rbf', 'gamma': 'auto'}. Best is

KeyboardInterrupt: 

In [None]:
import optuna

import sklearn.datasets
import sklearn.ensemble
import sklearn.model_selection
import sklearn.svm


def objective(trial):
    n_neighbors = trial.suggest_int("n_neighbors", 5, 50, log=True)
    algorithm = trial.suggest_categorical("algorithm", ['auto', 'ball_tree', 'kd_tree', 'brute'])
    leaf_size = trial.suggest_int("leaf_size", 5, 50, log=True)
    p = trial.suggest_categorical("p", [1, 2])
    classifier_obj = LocalOutlierFactor(novelty=True, n_neighbors=n_neighbors, algorithm=algorithm, leaf_size=leaf_size, p=p)

    accuracy = test(classifier_obj, df, attacks, benign, test_ood_OneClassSVM)
    return accuracy


study = optuna.create_study(
    storage=OPTUNA_STORAGE,
    direction="maximize",
    study_name=f"LocalOutlierFactor_{datetime.datetime.now()}"
)
study.optimize(objective, n_trials=100, n_jobs=-1, show_progress_bar=True)
print(f"Best value: {study.best_value} (params: {study.best_params})")
print(study.best_trial)

[I 2024-06-03 12:36:32,533] A new study created in RDB with name: LocalOutlierFactor_2024-06-03 12:36:32.471143


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2024-06-03 12:42:16,452] Trial 8 finished with value: 0.8560441756587658 and parameters: {'n_neighbors': 22, 'algorithm': 'ball_tree', 'leaf_size': 7, 'p': 1}. Best is trial 8 with value: 0.8560441756587658.
[I 2024-06-03 12:42:19,862] Trial 9 finished with value: 0.8316963066256005 and parameters: {'n_neighbors': 11, 'algorithm': 'ball_tree', 'leaf_size': 22, 'p': 2}. Best is trial 8 with value: 0.8560441756587658.
[I 2024-06-03 12:42:24,655] Trial 10 finished with value: 0.8525430301793241 and parameters: {'n_neighbors': 12, 'algorithm': 'ball_tree', 'leaf_size': 47, 'p': 1}. Best is trial 8 with value: 0.8560441756587658.
[I 2024-06-03 12:42:25,772] Trial 4 finished with value: 0.8309157741243052 and parameters: {'n_neighbors': 37, 'algorithm': 'kd_tree', 'leaf_size': 24, 'p': 2}. Best is trial 8 with value: 0.8560441756587658.
[I 2024-06-03 12:42:26,504] Trial 3 finished with value: 0.8156394737126851 and parameters: {'n_neighbors': 30, 'algorithm': 'kd_tree', 'leaf_size': 24, '

In [None]:
import optuna

import sklearn.datasets
import sklearn.ensemble
import sklearn.model_selection
import sklearn.svm


def objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 20, 200, log=True)
    max_features = trial.suggest_float("max_features", 0.1, 1, log=True)
    classifier_obj = IsolationForest(n_estimators=n_estimators, max_features=max_features)

    accuracy = test(classifier_obj, df, attacks, benign, test_ood_OneClassSVM)
    return accuracy


study = optuna.create_study(
    storage=OPTUNA_STORAGE,
    direction="maximize",
    study_name=f"IsolationForest_{datetime.datetime.now()}"
)
study.optimize(objective, n_trials=100, n_jobs=-1, show_progress_bar=True)
print(f"Best value: {study.best_value} (params: {study.best_params})")
print(study.best_trial)

In [None]:
# import optuna

# import sklearn.datasets
# import sklearn.ensemble
# import sklearn.model_selection
# import sklearn.svm


# # FYI: Objective functions can take additional arguments
# # (https://optuna.readthedocs.io/en/stable/faq.html#objective-func-additional-args).
# def objective(trial):
#     iris = sklearn.datasets.load_iris()
#     x, y = iris.data, iris.target

#     classifier_name = trial.suggest_categorical("classifier", ["SVC", "RandomForest"])
#     if classifier_name == "SVC":
#         svc_c = trial.suggest_float("svc_c", 1e-10, 1e10, log=True)
#         classifier_obj = sklearn.svm.SVC(C=svc_c, gamma="auto")
#     else:
#         rf_max_depth = trial.suggest_int("rf_max_depth", 2, 32, log=True)
#         classifier_obj = sklearn.ensemble.RandomForestClassifier(
#             max_depth=rf_max_depth, n_estimators=10
#         )

#     score = sklearn.model_selection.cross_val_score(classifier_obj, x, y, n_jobs=-1, cv=3)
#     accuracy = score.mean()
#     return accuracy


# study = optuna.create_study(direction="maximize")
# study = optuna.create_study(
#     storage="sqlite:///db.sqlite3",  # Specify the storage URL here.
#     study_name="quadratic-simple2"
# )
# study.optimize(objective, n_trials=100)
# print(f"Best value: {study.best_value} (params: {study.best_params})")
# print(study.best_trial)

In [None]:
a = np.array([[1,2,3], [1,2,3], [1,2,3]])
b = np.array([[4,5,6], [4,5,6], [4,5,6]])

In [None]:
np.append(a,b)