In [1]:
import os
import sys
# Put the directory two levels above the current working directory on the
# import path (presumably where the project-local `ic` package lives).
# The membership guard keeps re-running this cell from adding duplicates.
module_path = os.path.abspath('../..')
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
from collections import deque
import time

import joblib
import pandas as pd
import river

from ic import util

In [3]:
def step_to_dataframe(evaluation_step):
    """Turn one evaluation step into a single-row DataFrame.

    `evaluation_step` is a mapping with the keys 'Accuracy', 'Recall',
    'Precision' and 'F1' (each an object exposing ``get()``) and 'Step',
    which becomes the row label.

    Returns a DataFrame with the columns 'accuracy', 'recall', 'precision'
    and 'f1', in that order.
    """
    # (output column, key inside the step mapping)
    column_sources = (
        ('accuracy', 'Accuracy'),
        ('recall', 'Recall'),
        ('precision', 'Precision'),
        ('f1', 'F1'),
    )
    row = {
        column: evaluation_step[source].get()
        for column, source in column_sources
    }
    row_label = evaluation_step['Step']
    return pd.DataFrame(row, index=[row_label])


def evaluate_prequential_delayed(
    model, stream, delay=500, eval_step=500
):
    """Run river's progressive validation and tabulate the reported steps.

    Parameters
    ----------
    model
        Forwarded as ``model`` to ``river.evaluate.iter_progressive_val_score``.
    stream
        Forwarded as ``dataset``.
    delay
        Forwarded as ``delay`` (label delay; see the river documentation
        for the accepted forms).
    eval_step
        Forwarded as ``step``; controls how often a result is yielded.

    Returns
    -------
    pandas.DataFrame
        One row per yielded step, indexed by the step number, with the
        columns 'accuracy', 'recall', 'precision' and 'f1'. An empty
        DataFrame when no step is yielded.
    """
    metrics = river.metrics.base.Metrics(
        metrics=[
            river.metrics.Accuracy(),
            river.metrics.Recall(),
            river.metrics.Precision(),
            river.metrics.F1(),
        ]
    )

    step_iterator = river.evaluate.iter_progressive_val_score(
        model=model,
        dataset=stream,
        metric=metrics,
        step=eval_step,
        delay=delay,
    )

    # Convert every step as soon as it is yielded so the metric values are
    # read at that point in the stream, then concatenate once at the end.
    # Growing a DataFrame with pd.concat inside the loop copies all earlier
    # rows on every iteration (quadratic in the number of steps).
    step_frames = [step_to_dataframe(step) for step in step_iterator]

    # pd.concat raises on an empty list; keep returning an empty frame.
    if not step_frames:
        return pd.DataFrame()
    return pd.concat(step_frames)


def metrics_to_dataframe(step, metrics):
    """Snapshot the current metric values as a single-row DataFrame.

    `metrics` is iterated in order and ``get()`` is called on every item;
    the values are paired positionally with the columns 'accuracy',
    'recall', 'precision' and 'f1'. `step` becomes the row label.
    """
    column_names = ['accuracy', 'recall', 'precision', 'f1']
    # One single-element list per metric, i.e. one cell per column.
    cells = []
    for metric in metrics:
        cells.append([metric.get()])
    columns = dict(zip(column_names, cells))
    return pd.DataFrame(columns, index=[step])


def evaluate_prequential_delayed_1(
    model, stream, delay=500, eval_step=500
):
    """Hand-rolled test-then-train evaluation with delayed learning.

    For every ``(X, y)`` pair in `stream` the model first predicts and the
    metrics are updated immediately; the sample is then queued, and the
    model only learns from a sample once more than `delay` samples are
    waiting in the queue. Samples still queued when the stream ends are
    never passed to ``learn_one``.

    Parameters
    ----------
    model
        Object exposing ``predict_one(X)`` and ``learn_one(X, y)``.
    stream
        Iterable of ``(X, y)`` pairs.
    delay
        Maximum number of samples held back before learning.
    eval_step
        A metrics snapshot is recorded every `eval_step` samples.

    Returns
    -------
    pandas.DataFrame
        One row per snapshot, indexed by the 1-based sample count, with the
        columns 'accuracy', 'recall', 'precision' and 'f1'. An empty
        DataFrame when no snapshot was taken.
    """
    delay_queue = deque()

    metrics = river.metrics.base.Metrics(
        metrics=[
            river.metrics.Accuracy(),
            river.metrics.Recall(),
            river.metrics.Precision(),
            river.metrics.F1(),
        ]
    )

    # Snapshots are collected here and concatenated once at the end instead
    # of re-copying a growing DataFrame on every evaluation step.
    snapshots = []

    for n_seen, (X, y) in enumerate(stream, start=1):
        y_pred = model.predict_one(X)
        metrics.update(y_true=y, y_pred=y_pred)

        delay_queue.append((X, y))
        while len(delay_queue) > delay:
            # Distinct names so the current sample's X / y are not rebound.
            delayed_X, delayed_y = delay_queue.popleft()
            model.learn_one(delayed_X, delayed_y)

        if n_seen % eval_step == 0:
            snapshots.append(metrics_to_dataframe(n_seen, metrics))

    # pd.concat raises on an empty list; keep returning an empty frame.
    if not snapshots:
        return pd.DataFrame()
    return pd.concat(snapshots)


def warm_up_and_evaluate(model, stream, n_warm_up_rows=1000):
    """Pre-train `model` on the head of `stream`, then evaluate on the rest.

    Parameters
    ----------
    model
        Object exposing ``learn_one(X, y)``; also handed to
        ``evaluate_prequential_delayed``.
    stream
        Iterable (or iterator) of ``(X, y)`` pairs. It is consumed: the
        first `n_warm_up_rows` pairs are used for warm-up only.
    n_warm_up_rows
        Maximum number of pairs to learn from before evaluating. If the
        stream is shorter, warm-up uses whatever is available and the
        evaluation then runs on an exhausted stream.

    Returns
    -------
    Whatever ``evaluate_prequential_delayed(model, stream)`` returns for the
    remaining part of the stream.
    """
    # Normalise to an iterator so warm-up and evaluation share one cursor,
    # even when a list or other re-iterable container is passed in.
    stream = iter(stream)

    # `range` comes first on purpose: zip stops as soon as the counter is
    # exhausted, without pulling (and losing) an extra sample from `stream`.
    # Unlike a bare next(), a short stream does not leak StopIteration.
    for _, (X, y) in zip(range(n_warm_up_rows), stream):
        model.learn_one(X, y)

    return evaluate_prequential_delayed(model, stream)

In [4]:
def yield_dataset(path, target, chunksize=1000):
    """Stream a CSV file as ``(features, label)`` pairs, one per row.

    Parameters
    ----------
    path
        CSV file to read; passed to ``pandas.read_csv``.
    target
        Name of the label column.
    chunksize
        Number of rows read per chunk. Must be a positive integer so that
        ``read_csv`` returns a chunked reader rather than a DataFrame.

    Yields
    ------
    tuple
        ``(row_without_target, row[target])`` where the first item is a
        pandas Series holding every column except `target`.
    """
    # Using the reader as a context manager closes the underlying file even
    # if the consumer stops iterating early or an error interrupts the run.
    with pd.read_csv(path, chunksize=chunksize) as chunks:
        for chunk in chunks:
            # The column layout is the same for every row of a chunk, so
            # build the "everything but the target" mask once per chunk.
            feature_mask = chunk.columns != target
            for _, row in chunk.iterrows():
                yield (row[feature_mask], row[target])
            
            
def process_evaluation(model, label, file_path):
    """Evaluate `model` on one CSV file and persist metrics and model.

    Parameters
    ----------
    model
        Model handed to ``warm_up_and_evaluate`` and afterwards dumped
        with joblib.
    label
        Short name for the run; used in the progress messages and in both
        output file names.
    file_path
        CSV file to stream; the column 'Label' is used as the target.

    Side effects
    ------------
    * Prints a start message and the elapsed time in whole minutes.
    * Writes the metrics to
      ``../../data/2022_08_01/metrics/<basename>.<label>.metrics.csv``.
    * Writes the model to
      ``../../trained_models/2022_08_01/<label>_<basename>.joblib``.
    """
    # NOTE(review): util.basename is assumed to be a pure helper, so it is
    # evaluated once and reused for both output paths.
    base_name = util.basename(file_path)
    out_path = f'../../data/2022_08_01/metrics/{base_name}.{label}.metrics.csv'
    model_out_path = f'../../trained_models/2022_08_01/{label}_{base_name}.joblib'

    # Create the output directories before the (potentially very long)
    # evaluation starts, so a missing folder cannot discard the results
    # at the moment they are written.
    for target_path in (out_path, model_out_path):
        os.makedirs(os.path.dirname(target_path), exist_ok=True)

    stream = yield_dataset(file_path, target='Label')

    # perf_counter is monotonic, so the elapsed time is unaffected by
    # wall-clock adjustments during a long run.
    start_time = time.perf_counter()
    h_m = time.strftime('%H:%M')
    print(f'Starting {label} - {h_m}')

    metrics = warm_up_and_evaluate(model, stream)
    metrics.to_csv(out_path, index_label='step')

    print(f'Took {int((time.perf_counter()-start_time)/60)} mins\n')

    joblib.dump(model, model_out_path)

In [5]:
# Run: a single Hoeffding tree on the Friday afternoon DDoS capture.
process_evaluation(
    model=river.tree.HoeffdingTreeClassifier(),
    label='HoeffdingTree',
    file_path='../../data/2022_08_01/processed/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.processed.csv',
)

Starting HoeffdingTree - 22:58
Took 5 mins



In [6]:
# Run: bagging ensemble with Hoeffding trees as base learners, same capture.
bagging_base_learner = river.tree.HoeffdingTreeClassifier()
process_evaluation(
    model=river.ensemble.BaggingClassifier(model=bagging_base_learner),
    label='Bagging',
    file_path='../../data/2022_08_01/processed/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.processed.csv',
)

Starting Bagging - 23:03
Took 46 mins



In [7]:
# Run: ADWIN bagging ensemble with Hoeffding trees as base learners, same capture.
process_evaluation(
    model=river.ensemble.ADWINBaggingClassifier(
        model=river.tree.HoeffdingTreeClassifier(),
    ),
    label='ADWINBagging',
    file_path='../../data/2022_08_01/processed/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.processed.csv',
)

Starting ADWINBagging - 23:50
Took 49 mins



In [None]:
# Run: k-nearest-neighbours classifier with default settings, same capture.
process_evaluation(
    model=river.neighbors.KNNClassifier(),
    label='KNN',
    file_path='../../data/2022_08_01/processed/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.processed.csv',
)

Starting KNN - 00:39
