In [2]:
import os
import sys
# Make the repository root (two levels up) importable so `ic` can be found.
module_path = os.path.abspath('../..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [3]:
from collections import deque
import time

import joblib
import pandas as pd
import river

from ic import util

In [4]:
def step_to_dataframe(evaluation_step):
    """Turn one evaluation checkpoint into a single-row DataFrame.

    Args:
        evaluation_step: Mapping with the keys ``'Accuracy'``, ``'Recall'``,
            ``'Precision'`` and ``'F1'`` (each an object exposing ``.get()``)
            plus ``'Step'``, which becomes the row's index label.

    Returns:
        pd.DataFrame: One row with the columns ``accuracy``, ``recall``,
        ``precision`` and ``f1``.
    """
    column_sources = (
        ('accuracy', 'Accuracy'),
        ('recall', 'Recall'),
        ('precision', 'Precision'),
        ('f1', 'F1'),
    )
    row = {
        column: evaluation_step[metric_key].get()
        for column, metric_key in column_sources
    }
    return pd.DataFrame(row, index=[evaluation_step['Step']])


def evaluate_prequential_delayed(
    model, stream, delay=500, eval_step=500
):
    """Run river's progressive validation and tabulate the checkpoints.

    Accuracy, recall, precision and F1 are tracked together; every checkpoint
    yielded by ``river.evaluate.iter_progressive_val_score`` becomes one row.

    Args:
        model: Estimator handed to river as ``model``.
        stream: Dataset handed to river as ``dataset``.
        delay: Forwarded to river as ``delay``.
        eval_step: Forwarded to river as ``step``.

    Returns:
        pd.DataFrame: One row per checkpoint (see ``step_to_dataframe``), or
        an empty DataFrame when no checkpoint was produced.
    """
    metrics = river.metrics.base.Metrics(
        metrics=[
            river.metrics.Accuracy(),
            river.metrics.Recall(),
            river.metrics.Precision(),
            river.metrics.F1()
        ]
    )

    step_iterator = river.evaluate.iter_progressive_val_score(
        model=model,
        dataset=stream,
        metric=metrics,
        step=eval_step,
        delay=delay
    )

    # Collect the per-step rows and concatenate once: calling pd.concat inside
    # the loop re-copies every previous row on each iteration.
    step_frames = [step_to_dataframe(step) for step in step_iterator]

    if not step_frames:
        # pd.concat rejects an empty list; keep returning an empty frame.
        return pd.DataFrame()

    return pd.concat(step_frames)


def metrics_to_dataframe(step, metrics):
    """Snapshot the current metric values as a single-row DataFrame.

    Args:
        step: Index label for the row.
        metrics: Iterable of objects exposing ``.get()``, expected in the
            order accuracy, recall, precision, F1.

    Returns:
        pd.DataFrame: One row indexed by ``step`` with one column per metric.
    """
    column_names = ['accuracy', 'recall', 'precision', 'f1']
    current_values = [metric.get() for metric in metrics]

    columns = {}
    for name, value in zip(column_names, current_values):
        columns[name] = [value]

    return pd.DataFrame(columns, index=[step])


def evaluate_prequential_delayed_1(
    model, stream, delay=500, eval_step=500
):
    """Hand-rolled prequential evaluation with delayed labels.

    Each sample is predicted and scored immediately, then queued. The model
    only learns from a sample once more than ``delay`` samples are waiting in
    the queue. Samples still queued when the stream ends are never learned.

    Args:
        model: Object exposing ``predict_one(X)`` and ``learn_one(X, y)``.
        stream: Iterable of ``(X, y)`` pairs.
        delay: Number of samples that may wait in the queue before the oldest
            one is released to the model.
        eval_step: A metrics snapshot is recorded every ``eval_step`` samples.

    Returns:
        pd.DataFrame: One row per snapshot, indexed by the 1-based sample
        count, or an empty DataFrame when no snapshot was taken.
    """
    delay_queue = deque()

    metrics = river.metrics.base.Metrics(
        metrics=[
            river.metrics.Accuracy(),
            river.metrics.Recall(),
            river.metrics.Precision(),
            river.metrics.F1()
        ]
    )

    # Snapshots are gathered in a list and concatenated once at the end;
    # concatenating inside the loop re-copies all earlier rows every time.
    snapshots = []

    for step, (X, y) in enumerate(stream, start=1):
        y_pred = model.predict_one(X)
        metrics.update(y_true=y, y_pred=y_pred)

        delay_queue.append((X, y))
        while len(delay_queue) > delay:
            # Separate names so the current sample's X/y are not clobbered.
            delayed_X, delayed_y = delay_queue.popleft()
            model.learn_one(delayed_X, delayed_y)

        if step % eval_step == 0:
            snapshots.append(metrics_to_dataframe(step, metrics))

    if not snapshots:
        # pd.concat rejects an empty list; keep returning an empty frame.
        return pd.DataFrame()

    return pd.concat(snapshots)


def warm_up_and_evaluate(model, stream, n_warm_up_rows=1000):
    """Pre-train on the head of the stream, then evaluate on the remainder.

    Args:
        model: Object exposing ``learn_one(X, y)``; also passed on to
            ``evaluate_prequential_delayed``.
        stream: Iterable or iterator of ``(X, y)`` pairs.
        n_warm_up_rows: Maximum number of leading samples used only for
            training. A shorter stream is consumed entirely by the warm-up.

    Returns:
        The result of ``evaluate_prequential_delayed`` on the samples left
        after the warm-up.
    """
    # A single shared iterator lets the evaluation resume exactly where the
    # warm-up stopped, and accepts plain iterables such as lists.
    stream = iter(stream)

    # range() goes first so zip stops without pulling an extra sample once the
    # warm-up quota is reached. Unlike a bare next(), a short stream ends the
    # loop quietly instead of leaking StopIteration to the caller.
    for _, (X, y) in zip(range(n_warm_up_rows), stream):
        model.learn_one(X, y)

    return evaluate_prequential_delayed(model, stream)

In [9]:
def yield_dataset(path, target):
    """Stream a CSV file as ``(features, label)`` pairs, one per row.

    The file is read in chunks of 1000 rows rather than loaded in one go.

    Args:
        path: Location of the CSV file, passed to ``pd.read_csv``.
        target: Name of the column holding the label.

    Yields:
        tuple: ``(features, label)`` where ``features`` is a dict of every
        column except ``target`` and ``label`` is the row's ``target`` value.

    Raises:
        KeyError: If ``target`` is not a column of the file.
    """
    # The chunked reader holds the file open; the context manager closes it
    # even when the consumer stops iterating before the end of the file.
    with pd.read_csv(path, chunksize=1000) as chunks:
        for chunk in chunks:
            for _, row in chunk.iterrows():
                label = row[target]
                # Convert the row once and drop the label, instead of building
                # a boolean mask over the index for every row.
                features = row.to_dict()
                del features[target]
                yield (features, label)
            
            
def process_evaluation(model, label, file_path):
    """Evaluate ``model`` on one CSV file and persist metrics and model.

    The file is streamed with ``'Label'`` as the target column, the model is
    warmed up and evaluated, the metrics are written as CSV, and the model is
    dumped with joblib. Start time and elapsed minutes are printed.

    Args:
        model: Estimator to warm up, evaluate and save.
        label: Short name for the model, used in output file names and logs.
        file_path: Path of the CSV file to evaluate on.
    """
    # NOTE(review): util.basename is project code; it is assumed to return the
    # same value on repeated calls, so it is called once here.
    dataset_name = util.basename(file_path)
    out_path = f'../../data/2022_08_01/metrics/{dataset_name}.{label}.metrics.csv'
    model_out_path = f'../../trained_models/2022_08_01/{label}_{dataset_name}.joblib'

    # Create the output directories before the evaluation starts, so a
    # missing folder cannot discard the results after a long run.
    for destination in (out_path, model_out_path):
        os.makedirs(os.path.dirname(destination), exist_ok=True)

    stream = yield_dataset(file_path, target='Label')

    start_time = time.time()
    h_m = time.strftime('%H:%M')
    print(f'Starting {label} - {h_m}')

    metrics = warm_up_and_evaluate(model, stream)
    metrics.to_csv(out_path, index_label='step')

    elapsed_minutes = int((time.time() - start_time) / 60)
    print(f'Took {elapsed_minutes} mins\n')

    joblib.dump(model, model_out_path)

In [5]:
# Single Hoeffding tree on the Friday afternoon DDoS capture.
process_evaluation(
    model=river.tree.HoeffdingTreeClassifier(),
    label='HoeffdingTree',
    file_path='../../data/2022_08_01/processed/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.processed.csv',
)

Starting HoeffdingTree - 19:36
Took 2 mins



In [6]:
# Bagging ensemble of Hoeffding trees on the Friday afternoon DDoS capture.
# The path previously read '2022_08_01 rocessed/' (missing '/p'), which does
# not match the 'processed' directory used by every other run.
process_evaluation(
    river.ensemble.BaggingClassifier(
        model=river.tree.HoeffdingTreeClassifier()
    ),
    'Bagging',
    '../../data/2022_08_01/processed/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.processed.csv'
)

Starting Bagging - 19:39
Took 21 mins



In [7]:
# ADWIN bagging ensemble of Hoeffding trees on the Friday afternoon DDoS capture.
process_evaluation(
    model=river.ensemble.ADWINBaggingClassifier(model=river.tree.HoeffdingTreeClassifier()),
    label='ADWINBagging',
    file_path='../../data/2022_08_01/processed/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.processed.csv',
)

Starting ADWINBagging - 20:01
Took 25 mins



---

In [8]:
# Single Hoeffding tree on the Tuesday capture.
process_evaluation(
    model=river.tree.HoeffdingTreeClassifier(),
    label='HoeffdingTree',
    file_path='../../data/2022_08_01/processed/Tuesday-WorkingHours.pcap_ISCX.processed.csv',
)

Starting HoeffdingTree - 20:45
Took 5 mins



In [9]:
# Bagging ensemble of Hoeffding trees on the Tuesday capture.
process_evaluation(
    model=river.ensemble.BaggingClassifier(model=river.tree.HoeffdingTreeClassifier()),
    label='Bagging',
    file_path='../../data/2022_08_01/processed/Tuesday-WorkingHours.pcap_ISCX.processed.csv',
)

Starting Bagging - 20:50
Took 45 mins



In [10]:
# ADWIN bagging ensemble of Hoeffding trees on the Tuesday capture.
process_evaluation(
    model=river.ensemble.ADWINBaggingClassifier(model=river.tree.HoeffdingTreeClassifier()),
    label='ADWINBagging',
    file_path='../../data/2022_08_01/processed/Tuesday-WorkingHours.pcap_ISCX.processed.csv',
)

Starting ADWINBagging - 21:36
Took 58 mins



---

In [11]:
# Single Hoeffding tree on the Wednesday capture.
process_evaluation(
    model=river.tree.HoeffdingTreeClassifier(),
    label='HoeffdingTree',
    file_path='../../data/2022_08_01/processed/Wednesday-workingHours.pcap_ISCX.processed.csv',
)

Starting HoeffdingTree - 22:34
Took 8 mins



In [12]:
# Bagging ensemble of Hoeffding trees on the Wednesday capture.
process_evaluation(
    model=river.ensemble.BaggingClassifier(model=river.tree.HoeffdingTreeClassifier()),
    label='Bagging',
    file_path='../../data/2022_08_01/processed/Wednesday-workingHours.pcap_ISCX.processed.csv',
)

Starting Bagging - 22:43
Took 72 mins



In [13]:
# ADWIN bagging ensemble of Hoeffding trees on the Wednesday capture.
process_evaluation(
    model=river.ensemble.ADWINBaggingClassifier(model=river.tree.HoeffdingTreeClassifier()),
    label='ADWINBagging',
    file_path='../../data/2022_08_01/processed/Wednesday-workingHours.pcap_ISCX.processed.csv',
)

Starting ADWINBagging - 23:56
Took 89 mins



---

In [14]:
# Single Hoeffding tree on the Thursday morning web-attacks capture.
process_evaluation(
    model=river.tree.HoeffdingTreeClassifier(),
    label='HoeffdingTree',
    file_path='../../data/2022_08_01/processed/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.processed.csv',
)

Starting HoeffdingTree - 01:26
Took 1 mins



In [15]:
# Bagging ensemble of Hoeffding trees on the Thursday morning web-attacks capture.
process_evaluation(
    model=river.ensemble.BaggingClassifier(model=river.tree.HoeffdingTreeClassifier()),
    label='Bagging',
    file_path='../../data/2022_08_01/processed/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.processed.csv',
)

Starting Bagging - 01:27
Took 14 mins



In [16]:
# ADWIN bagging ensemble of Hoeffding trees on the Thursday morning web-attacks capture.
process_evaluation(
    model=river.ensemble.ADWINBaggingClassifier(model=river.tree.HoeffdingTreeClassifier()),
    label='ADWINBagging',
    file_path='../../data/2022_08_01/processed/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.processed.csv',
)

Starting ADWINBagging - 01:42
Took 16 mins



---

In [17]:
# Single Hoeffding tree on the Friday afternoon port-scan capture.
process_evaluation(
    model=river.tree.HoeffdingTreeClassifier(),
    label='HoeffdingTree',
    file_path='../../data/2022_08_01/processed/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.processed.csv',
)

Starting HoeffdingTree - 01:59
Took 2 mins



In [18]:
# Bagging ensemble of Hoeffding trees on the Friday afternoon port-scan capture.
process_evaluation(
    model=river.ensemble.BaggingClassifier(model=river.tree.HoeffdingTreeClassifier()),
    label='Bagging',
    file_path='../../data/2022_08_01/processed/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.processed.csv',
)

Starting Bagging - 02:01
Took 19 mins



In [19]:
# ADWIN bagging ensemble of Hoeffding trees on the Friday afternoon port-scan capture.
process_evaluation(
    model=river.ensemble.ADWINBaggingClassifier(model=river.tree.HoeffdingTreeClassifier()),
    label='ADWINBagging',
    file_path='../../data/2022_08_01/processed/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.processed.csv',
)

Starting ADWINBagging - 02:21
Took 25 mins



---

In [20]:
# Single Hoeffding tree on the Friday morning capture.
process_evaluation(
    model=river.tree.HoeffdingTreeClassifier(),
    label='HoeffdingTree',
    file_path='../../data/2022_08_01/processed/Friday-WorkingHours-Morning.pcap_ISCX.processed.csv',
)

Starting HoeffdingTree - 02:46
Took 1 mins



In [21]:
# Bagging ensemble of Hoeffding trees on the Friday morning capture.
process_evaluation(
    model=river.ensemble.BaggingClassifier(model=river.tree.HoeffdingTreeClassifier()),
    label='Bagging',
    file_path='../../data/2022_08_01/processed/Friday-WorkingHours-Morning.pcap_ISCX.processed.csv',
)

Starting Bagging - 02:47
Took 9 mins



In [22]:
# ADWIN bagging ensemble of Hoeffding trees on the Friday morning capture.
process_evaluation(
    model=river.ensemble.ADWINBaggingClassifier(model=river.tree.HoeffdingTreeClassifier()),
    label='ADWINBagging',
    file_path='../../data/2022_08_01/processed/Friday-WorkingHours-Morning.pcap_ISCX.processed.csv',
)

Starting ADWINBagging - 02:57
Took 11 mins



---

In [30]:
import numpy as np
from river import base


from collections import deque
from functools import reduce
import heapq


def euclidean_distance(a, b):
    """Return the Euclidean (L2) norm of ``a - b``.

    Both arguments must support subtraction with each other; the callers in
    this notebook pass numpy arrays of equal length.
    """
    difference = a - b
    return np.linalg.norm(difference)


class MyKNN(base.Classifier):
    """Sliding-window k-nearest-neighbours classifier with inverse-distance votes.

    Stored points live in a first-in-first-out window. A new sample is stored
    only when it lies at least ``min_distance_keep`` away from every point
    already in the window.

    Args:
        n_neighbors: Number of nearest stored points that vote on a prediction.
        window_size: Maximum number of stored points; the oldest go first.
        min_distance_keep: Minimum Euclidean distance to all stored points
            required for a new sample to be stored.
    """

    def __init__(self, n_neighbors=5, window_size=1000, min_distance_keep=0.05):
        self.n_neighbors = n_neighbors
        self.window_size = window_size
        self.min_distance_keep = min_distance_keep
        # Feature names, in vector order; filled in by learn_one.
        self.fields = None
        # Window of (feature_vector, label) pairs, oldest on the left.
        self.points = deque()

    def learn_one(self, x, y):
        """Store ``(x, y)`` unless it is too close to an already stored point.

        Args:
            x: Mapping from feature name to value.
            y: Label of the sample.

        Returns:
            MyKNN: ``self``.
        """
        if not self.fields:
            self.fields = list(x)

        candidate = np.array([x[name] for name in self.fields])

        # Distance from the candidate to its closest stored point
        # (infinity while the window is empty).
        closest = float('inf')
        for stored_point, _ in self.points:
            closest = min(closest, euclidean_distance(stored_point, candidate))

        if closest >= self.min_distance_keep:
            self.points.append((candidate, y))

        # Evict the oldest points until the window fits again.
        while len(self.points) > self.window_size:
            self.points.popleft()

        return self

    def predict_one(self, x):
        """Predict the label of ``x`` from its nearest stored points.

        Each of the ``n_neighbors`` nearest points votes for its label with
        weight ``1 / distance`` (infinite for an exact match); the label with
        the largest total wins.

        Args:
            x: Mapping from feature name to value.

        Returns:
            The winning label.

        Raises:
            RuntimeError: If no point has been stored yet.
        """
        if not self.points:
            raise RuntimeError('KNN has no points yet')

        query = np.array([x[name] for name in self.fields])

        # Distances are stored negated, so the heap's smallest entry is the
        # farthest neighbour and is the one dropped when the heap overflows.
        nearest = []
        for stored_point, stored_label in self.points:
            entry = (-euclidean_distance(query, stored_point), stored_label)
            heapq.heappush(nearest, entry)

            while len(nearest) > self.n_neighbors:
                heapq.heappop(nearest)

        votes = {}
        for negated_distance, neighbour_label in nearest:
            if negated_distance == 0:
                weight = float('inf')
            else:
                # The distance is negated, so -1 / it is the positive inverse.
                weight = (-1)/negated_distance
            votes[neighbour_label] = votes.get(neighbour_label, 0) + weight

        winner = None
        top_weight = float('-inf')
        for candidate_label, total in votes.items():
            if total > top_weight:
                top_weight = total
                winner = candidate_label

        return winner

In [31]:
# Custom windowed KNN on the Friday afternoon DDoS capture.
process_evaluation(
    model=MyKNN(),
    label='KNN',
    file_path='../../data/2022_08_01/processed/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.processed.csv',
)

Starting KNN - 01:09
Took 20 mins



In [32]:
# Custom windowed KNN on the Tuesday capture.
process_evaluation(
    model=MyKNN(),
    label='KNN',
    file_path='../../data/2022_08_01/processed/Tuesday-WorkingHours.pcap_ISCX.processed.csv',
)

Starting KNN - 03:05
Took 42 mins



In [33]:
# Custom windowed KNN on the Wednesday capture.
process_evaluation(
    model=MyKNN(),
    label='KNN',
    file_path='../../data/2022_08_01/processed/Wednesday-workingHours.pcap_ISCX.processed.csv',
)

Starting KNN - 03:48
Took 65 mins



In [34]:
# Custom windowed KNN on the Thursday morning web-attacks capture.
process_evaluation(
    model=MyKNN(),
    label='KNN',
    file_path='../../data/2022_08_01/processed/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.processed.csv',
)

Starting KNN - 04:53
Took 13 mins



In [35]:
# Custom windowed KNN on the Friday afternoon port-scan capture.
process_evaluation(
    model=MyKNN(),
    label='KNN',
    file_path='../../data/2022_08_01/processed/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.processed.csv',
)

Starting KNN - 05:07
Took 19 mins



In [36]:
# Custom windowed KNN on the Friday morning capture.
process_evaluation(
    model=MyKNN(),
    label='KNN',
    file_path='../../data/2022_08_01/processed/Friday-WorkingHours-Morning.pcap_ISCX.processed.csv',
)

Starting KNN - 05:27
Took 8 mins

