In [50]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import lightgbm as lgb
import seaborn as sns

In [51]:
from river import anomaly
from river import stream
from river import evaluate
from river import metrics
from river import compose
from river import preprocessing

In [52]:
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import IsolationForest

In [53]:
# All paths are relative to the directory the notebook is run from.
PROJECT_ROOT = "../../"
SOURCE_DATASET_ROOT = PROJECT_ROOT + "datasets/isp3/"

# Capture CSVs: the file named after both hosts, then one file per host
# (_D for the dvwa.isp file, _X for the xvwa.isp file).
SOURCE_DATASET_FILENAME = SOURCE_DATASET_ROOT + "[dvwa.isp_xvwa.isp]_https_apache.csv"
SOURCE_DATASET_FILENAME_D = SOURCE_DATASET_ROOT + "[dvwa.isp]_https_apache.csv"
SOURCE_DATASET_FILENAME_X = SOURCE_DATASET_ROOT + "[xvwa.isp]_https_apache.csv"
# SOURCE_DATASET_TRAIN_FILENAME = SOURCE_DATASET_ROOT + "DataSetN_Train.csv"
# SOURCE_DATASET_TEST_FILENAME = SOURCE_DATASET_ROOT + "DataSetN_Test.csv"

# Фичи

In [54]:
# Read the capture CSV and show how many rows carry each Label value.
# NOTE(review): the cell output echoes this read_csv line, which is how Python
# prints a warning — presumably a DtypeWarning about mixed-type columns.
# Confirm, and pass explicit dtypes if so.
df = pd.read_csv(SOURCE_DATASET_FILENAME)
df['Label'].value_counts()

  df = pd.read_csv(SOURCE_DATASET_FILENAME)


Label
Web Attack - DDoS                 65295
Benign                             4446
Web Attack - XSS                   2686
Web Attack - SQL Injection         1483
Web Attack - Command Injection     1400
Web Attack - Brute Force            497
Web Attack - Web Shell              353
Web Attack - CSRF                    60
Name: count, dtype: int64

In [55]:
# Columns that must not end up in the feature list: flow identifiers, the
# per-host columns and the target column 'Label' itself.
excluded = [
    'Flow_ID',
    'Source_IP',
    'Source_Port',
    'Destination_IP',
    'Destination_Port',
    'Protocol',
    'Timestamp',
    "Unnamed: 0",
    "xvwa.isp",
    "Label",
    "dvwa.isp",
]
# errors='ignore': columns from the list that are absent are skipped silently.
df = df.drop(columns=excluded, errors='ignore')

In [56]:
# Second batch of non-feature columns (presumably capture/session metadata —
# TODO confirm against the dataset description).
excluded2 = [
    'Session_Index',
    'Target',
    'Http_Reqest',
    'Root',
    'GlobalLabel',
    'Type',
    'Tools',
    'ToolsThreads',
    'ToolsDelay',
    'ToolsAdditional',
    'ChannelSpeedBefore',
    'ChannelSpeedAfter',
    'NetemString',
    'Server',
    'KeepAliveTimeout',
    'TargetProtocol',
    'File',
    'SessionAnalizerMode',
]
# No errors='ignore' here: a column missing from the frame raises KeyError.
df = df.drop(columns=excluded2)

In [57]:
# Whatever columns survived the two drops above form the model feature list.
webattack_features = [*df.columns]

In [58]:
# Read the CSV again: the cells above dropped columns from `df` (including
# 'Label') to derive webattack_features, and the cells below need 'Label' back.
df = pd.read_csv(SOURCE_DATASET_FILENAME)
df['Label'].value_counts()

  df = pd.read_csv(SOURCE_DATASET_FILENAME)


Label
Web Attack - DDoS                 65295
Benign                             4446
Web Attack - XSS                   2686
Web Attack - SQL Injection         1483
Web Attack - Command Injection     1400
Web Attack - Brute Force            497
Web Attack - Web Shell              353
Web Attack - CSRF                    60
Name: count, dtype: int64

In [59]:
# Undersample the DDoS class: pick 94% of its rows with a fixed seed and drop
# them, then show the resulting class counts.
# NOTE(review): this cell reassigns `df`, so running it twice removes another
# 94% of the remaining DDoS rows.
label_rows = df.loc[df['Label'].eq('Web Attack - DDoS')]
rows_to_remove = label_rows.sample(frac=0.94, random_state=42)
df = df.drop(index=rows_to_remove.index)
df['Label'].value_counts()

Label
Benign                            4446
Web Attack - DDoS                 3918
Web Attack - XSS                  2686
Web Attack - SQL Injection        1483
Web Attack - Command Injection    1400
Web Attack - Brute Force           497
Web Attack - Web Shell             353
Web Attack - CSRF                   60
Name: count, dtype: int64

In [60]:
# g = df.groupby('Label')
# g = g.apply(lambda x: x.sample(g.size().min()).reset_index(drop=True))
# df = g.reset_index(drop=True)

In [61]:
# Attack labels kept in the working frames; the evaluation loop further down
# iterates over this set, holding one label out per run.
attacks = {
    'Web Attack - XSS',
    'Web Attack - CSRF',
    'Web Attack - Brute Force',
    'Web Attack - Web Shell',
    'Web Attack - Command Injection',
    'Web Attack - SQL Injection',
    'Web Attack - DDoS'
}

# Attack class that is removed from df_full to build df_part.
attack_to_exclude = {'Web Attack - Web Shell'}

# Labels treated as normal traffic; each is rewritten to 'Benign' before filtering.
benign = {'Benign', 'FromInSide'}

In [62]:
# Collapse every "normal traffic" label into the single value 'Benign'.
# replace() documents scalars, lists, dicts and Series as `to_replace`; a set is
# not among them, so hand it a list to get the list-of-values behaviour for sure.
df['Label'] = df['Label'].replace(to_replace=list(benign), value='Benign')
# df_full: rows whose label is a known attack or benign.
df_full = df[df['Label'].isin(attacks | benign)]
# df_part: df_full without the attack class(es) named in attack_to_exclude.
df_part = df_full[~df_full['Label'].isin(attack_to_exclude)]

In [63]:
def prepare_dataset(df, train_size=0.2, test_size=0.8):
    """Split ``df`` into feature/label train and test parts.

    Features are the ``webattack_features`` columns, labels the 'Label' column.

    Args:
        df: frame containing the feature columns and 'Label'.
        train_size: forwarded to ``train_test_split``.
        test_size: forwarded to ``train_test_split``.

    Returns:
        ``(X_train, X_test, y_train, y_test)``. When ``train_size == 0`` and
        ``test_size == 1`` no split is made: the whole data is returned as the
        test part and both train slots are empty lists.
    """
    features = df[webattack_features]
    labels = df['Label']

    everything_is_test = train_size == 0 and test_size == 1
    if everything_is_test:
        return [], features, [], labels

    # Stratified, shuffled split with a fixed seed.
    parts = train_test_split(
        features,
        labels,
        stratify=labels,
        train_size=train_size,
        test_size=test_size,
        shuffle=True,
        random_state=0,
    )
    return tuple(parts)

In [64]:
# No split (train_size=0, test_size=1): all of df_part comes back as the test
# part; the two train slots are empty lists and are discarded.
_, X_test1, _, y_test1 = prepare_dataset(df_part, train_size=0, test_size=1)

In [65]:
# Same as above for df_full, which still contains the excluded attack class.
_, X_test2, _, y_test2 = prepare_dataset(df_full, train_size=0, test_size=1)

In [66]:
# Class counts of df_part: the excluded attack label should no longer appear.
df_part["Label"].value_counts()

Label
Benign                            4446
Web Attack - DDoS                 3918
Web Attack - XSS                  2686
Web Attack - SQL Injection        1483
Web Attack - Command Injection    1400
Web Attack - Brute Force           497
Web Attack - CSRF                   60
Name: count, dtype: int64

# Границы

In [67]:
def add_result(result, key, val):
    """Append ``val`` to the list stored under ``key`` in ``result``.

    A missing key is created with a fresh one-element list. ``result`` is
    modified in place; nothing is returned.
    """
    result.setdefault(key, []).append(val)

In [68]:
def test(df, attacks, benign, test_ood):
    df_full = df[df['Label'].isin(attacks | benign)]
    result = {"Label": []}
    for attack_to_exclude in tqdm(attacks):
        report, name = test_ood(df_full, {attack_to_exclude})
        
        result["Label"].append(attack_to_exclude)
        # add_result(result, name, f"{report._f1s[True].recall.get()*100}/{report._f1s[False].get()*100}")
        add_result(result, "recall for anomaly", report._f1s[True].recall.get())
        add_result(result, "f1 for normal", report._f1s[False].get())
    result = pd.DataFrame(result)
    result = result.set_index("Label")
    columns=[(name, c) for c in result.columns]
    result.columns=pd.MultiIndex.from_tuples(columns)
    return result

In [69]:
def test_ood_OneClassSVM(df, attack_to_exclude):
    """Score a One-Class SVM as a detector for held-out attack labels.

    The model is fitted on the rows whose 'Label' is not in
    ``attack_to_exclude`` and then classifies every row of ``df``. A row is a
    true anomaly when its label is in ``attack_to_exclude`` and a predicted
    anomaly when the model's ``predict`` returns -1.

    Args:
        df: frame with 'Label' and the ``webattack_features`` columns.
        attack_to_exclude: collection of labels withheld from fitting.

    Returns:
        ``(report, "OCSVM")`` where ``report`` is the filled
        ``metrics.ClassificationReport``; the report is also printed.
    """
    def prepare_df(frame):
        return frame[webattack_features], frame['Label']

    # NOTE(review): the previous version built `MinMaxScaler(df[webattack_features])`
    # and discarded it — the scaler was never fitted or applied, so the model has
    # always seen unscaled features. The no-op is removed here. If scaling is
    # wanted, fit the scaler on the training rows and transform both sets; that
    # will change the reported numbers.
    # The score threshold derived from the training scores was never used either
    # (decisions come from model.predict), so it is no longer computed.
    X_known, _ = prepare_df(df[~df['Label'].isin(attack_to_exclude)])
    model = OneClassSVM(nu=2.5247696078015696e-05, kernel='poly', gamma='auto', degree=3)
    model.fit(X_known.to_numpy())

    X, y = prepare_df(df)
    # One batched predict instead of one call per row; predictions are per-row
    # independent.
    is_anomaly = model.predict(X.to_numpy()) == -1

    report = metrics.ClassificationReport()
    for label, flagged in zip(y, is_anomaly):
        report.update(label in attack_to_exclude, bool(flagged))

    print(report)
    return report, "OCSVM"

In [70]:
def test_ood_LocalOutlierFactor(df, attack_to_exclude):
    """Score a novelty-mode Local Outlier Factor on held-out attack labels.

    The model is fitted on the rows whose 'Label' is not in
    ``attack_to_exclude`` and then classifies every row of ``df``. A row is a
    true anomaly when its label is in ``attack_to_exclude`` and a predicted
    anomaly when the model's ``predict`` returns -1.

    Args:
        df: frame with 'Label' and the ``webattack_features`` columns.
        attack_to_exclude: collection of labels withheld from fitting.

    Returns:
        ``(report, "LOF")`` where ``report`` is the filled
        ``metrics.ClassificationReport``; the report is also printed.
    """
    def prepare_df(frame):
        return frame[webattack_features], frame['Label']

    X_known, _ = prepare_df(df[~df['Label'].isin(attack_to_exclude)])
    model = LocalOutlierFactor(novelty=True, n_neighbors=50, algorithm='brute', leaf_size=18, p=2)
    model.fit(X_known.to_numpy())

    X, y = prepare_df(df)
    # One batched neighbour query instead of a brute-force search per row. Each
    # row is scored independently, so outcomes match the per-row loop up to
    # floating-point rounding.
    is_anomaly = model.predict(X.to_numpy()) == -1

    report = metrics.ClassificationReport()
    for label, flagged in zip(y, is_anomaly):
        report.update(label in attack_to_exclude, bool(flagged))

    print(report)
    return report, "LOF"

In [71]:
def test_ood_IsolationForest(df, attack_to_exclude, random_state=0):
    """Score an Isolation Forest as a detector for held-out attack labels.

    The model is fitted on the rows whose 'Label' is not in
    ``attack_to_exclude`` and then classifies every row of ``df``. A row is a
    true anomaly when its label is in ``attack_to_exclude`` and a predicted
    anomaly when the model's ``predict`` returns -1.

    Args:
        df: frame with 'Label' and the ``webattack_features`` columns.
        attack_to_exclude: collection of labels withheld from fitting.
        random_state: seed for the forest. The forest was previously unseeded,
            so every run produced different numbers; pass ``None`` to get that
            behaviour back.

    Returns:
        ``(report, "IF")`` where ``report`` is the filled
        ``metrics.ClassificationReport``; the report is also printed.
    """
    def prepare_df(frame):
        return frame[webattack_features], frame['Label']

    X_known, _ = prepare_df(df[~df['Label'].isin(attack_to_exclude)])
    model = IsolationForest(
        n_estimators=35,
        max_features=0.2396517137577818,
        random_state=random_state,
    )
    model.fit(X_known.to_numpy())

    X, y = prepare_df(df)
    # One batched predict instead of one call per row; predictions are per-row
    # independent.
    is_anomaly = model.predict(X.to_numpy()) == -1

    report = metrics.ClassificationReport()
    for label, flagged in zip(y, is_anomaly):
        report.update(label in attack_to_exclude, bool(flagged))

    print(report)
    return report, "IF"

In [72]:
class AnsambleOOD:
    """Wrapper that sums the predictions of several outlier detectors.

    Every wrapped model must offer ``fit(X)`` and ``predict(X)``.
    """

    def __init__(self, models: list):
        self.models = models

    def fit(self, X):
        """Fit each wrapped model on ``X`` and return this wrapper."""
        for detector in self.models:
            detector.fit(X)
        return self

    def predict(self, X):
        """Return, for each row of ``X``, the sum of all models' predictions."""
        total = np.zeros(X.shape[0])
        for detector in self.models:
            # Accumulate in place into the float array.
            np.add(total, detector.predict(X), out=total)
        return total

In [73]:
def test_ood_AnsambleOOD(df, attack_to_exclude, random_state=0):
    """Score the three-detector vote ensemble on held-out attack labels.

    An ``AnsambleOOD`` of One-Class SVM, Local Outlier Factor and Isolation
    Forest is fitted on the rows whose 'Label' is not in ``attack_to_exclude``
    and then classifies every row of ``df``. A row is a true anomaly when its
    label is in ``attack_to_exclude`` and a predicted anomaly when the summed
    predictions of the three models are negative.

    Args:
        df: frame with 'Label' and the ``webattack_features`` columns.
        attack_to_exclude: collection of labels withheld from fitting.
        random_state: seed for the Isolation Forest member. It was previously
            unseeded, so every run produced different numbers; pass ``None``
            to get that behaviour back.

    Returns:
        ``(report, "Ansamble")`` where ``report`` is the filled
        ``metrics.ClassificationReport``; the report is also printed.
    """
    def prepare_df(frame):
        return frame[webattack_features], frame['Label']

    X_known, _ = prepare_df(df[~df['Label'].isin(attack_to_exclude)])
    model = AnsambleOOD([
        OneClassSVM(nu=0.6011613204478358, kernel='linear', gamma='scale'),
        LocalOutlierFactor(novelty=True),
        IsolationForest(random_state=random_state),
    ])
    model.fit(X_known.to_numpy())

    X, y = prepare_df(df)
    # One batched predict instead of one call per row; each member scores rows
    # independently, so outcomes match the per-row loop up to floating-point
    # rounding.
    is_anomaly = model.predict(X.to_numpy()) < 0

    report = metrics.ClassificationReport()
    for label, flagged in zip(y, is_anomaly):
        report.update(label in attack_to_exclude, bool(flagged))

    print(report)
    return report, "Ansamble"

In [None]:
# Leave-one-attack-out evaluation of the One-Class SVM detector (one fit per attack label).
result1 = test(df, attacks, benign, test_ood_OneClassSVM)

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/14843 [00:00<?, ?it/s]

           Precision   Recall   F1       Support  
                                                  
   False      73.61%   99.99%   84.80%     10925  
    True      66.67%    0.05%    0.10%      3918  
                                                  
   Macro      70.14%   50.02%   42.45%            
   Micro      73.61%   73.61%   73.61%            
Weighted      71.78%   73.61%   62.44%            

                 73.61% accuracy                  


  0%|          | 0/14843 [00:00<?, ?it/s]

           Precision   Recall   F1       Support  
                                                  
   False      93.91%   99.99%   96.86%     13360  
    True      99.84%   41.60%   58.73%      1483  
                                                  
   Macro      96.88%   70.80%   77.80%            
   Micro      94.16%   94.16%   94.16%            
Weighted      94.50%   94.16%   93.05%            

                 94.16% accuracy                  


  0%|          | 0/14843 [00:00<?, ?it/s]

           Precision   Recall   F1       Support  
                                                  
   False      99.60%   99.99%   99.79%     14783  
    True       0.00%    0.00%    0.00%        60  
                                                  
   Macro      49.80%   50.00%   49.90%            
   Micro      99.59%   99.59%   99.59%            
Weighted      99.19%   99.59%   99.39%            

                 99.59% accuracy                  


In [None]:
# Leave-one-attack-out evaluation of the Isolation Forest detector.
result2 = test(df, attacks, benign, test_ood_IsolationForest)

In [None]:
# Leave-one-attack-out evaluation of the Local Outlier Factor detector.
result3 = test(df, attacks, benign, test_ood_LocalOutlierFactor)

In [None]:
# Leave-one-attack-out evaluation of the three-detector ensemble.
result_a = test(df, attacks, benign, test_ood_AnsambleOOD)

In [None]:
# Display the ensemble's per-attack scores.
result_a

In [None]:
# The per-detector score tables, in the column order wanted for the summary.
results_list = [result1, result2, result3, result_a]

In [None]:
# Summary table: one row per held-out attack, one column per detector, each
# cell formatted as "<recall for anomaly>/<f1 for normal>" in percent.
ATTACK_PREFIX = "Web Attack - "  # stripped from labels for display
results = pd.DataFrame({"Label": [
    label[len(ATTACK_PREFIX):] if label.startswith(ATTACK_PREFIX) else label
    for label in result1.index
]})
for result in results_list:
    name = result.columns[0][0]
    print(name)
    # Align on the attack label rather than on row position, so a detector
    # table with a different row order cannot be paired with the wrong label.
    scores = result[name].reindex(result1.index)
    a = (scores['recall for anomaly']*100).round(1)
    b = (scores['f1 for normal']*100).round(1)
    results[name] = a.combine(b, lambda x, y: f"{x}/{y}").reset_index(drop=True)
results

In [None]:
# result = pd.merge(result1, result2, left_index=True, right_index=True)
# result = pd.merge(result, result3, left_index=True, right_index=True)
# result = pd.merge(result, result_a, left_index=True, right_index=True)
# display(result)
# display(pd.DataFrame({"max": result.max(), "min": result.min(), "mean": result.mean()}))

In [None]:
# result = result.round(3)

In [None]:
# result.index = result.index.map(lambda s: s[13:])

In [47]:
# Display the One-Class SVM score table (recall for the held-out attack, F1 for the rest).
result1

Unnamed: 0_level_0,OCSVM,OCSVM
Unnamed: 0_level_1,recall for anomaly,f1 for normal
Label,Unnamed: 1_level_2,Unnamed: 2_level_2
Web Attack - DDoS,0.518121,0.919327
Web Attack - SQL Injection,0.53203,0.973674
Web Attack - CSRF,0.0,0.996891
Web Attack - Brute Force,0.0,0.981892
Web Attack - Web Shell,0.0,0.986895
Web Attack - XSS,0.001489,0.89947
Web Attack - Command Injection,0.0,0.949428


In [150]:
# Save the summary table; index=False leaves out the row index, so the file
# has one header row and 'Label' as its first column.
results.to_csv('ood-results.csv', index=False)

In [48]:
# The file is written by `results.to_csv('ood-results.csv', index=False)`: a
# single header row with 'Label' as the first column. Reading it with
# header=[0, 1] consumed the first data row as a second header level, so read
# one header row and use the Label column as the index.
results = pd.read_csv('ood-results.csv', index_col=0)

In [49]:
# Display the summary table as loaded from ood-results.csv.
results

Label,OCSVM,IF,LOF,Ansamble
XSS - xsstrike,100.0/38.8,0.0/88.7,16.4/88.5,16.4/85.6
XSS,100.0/38.6,1.7/87.1,73.3/92.2,74.8/89.5
