# CIC-Darknet2020: Exploration with very Simple Models 
The dataset has no designated train and test sets, so for this demonstration we fix a random seed for the split. 
Leaving the choice of how to split the data into train, validation and test sets to the dataset users introduces more variability in the results and thus less reproducibility. 

The clean version is hosted [here on Kaggle](https://www.kaggle.com/datasets/dhoogla/cicdarknet2020) in a parquet file for fast loading and storage savings.

The models in this notebook will be very simple: One Rule per feature (OneR) and ensemble OneR. 

For VPN/NoVPN, ensemble OneR reaches a roc-auc score of 0.779, a lot weaker than the other CIC OneR ensemble models.

For Tor/NoTor, ensemble OneR reaches a roc-auc score of 0.980, extremely good for such a simple model.


In [None]:
from fastcore.basics import *
from fastcore.parallel import *
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, roc_curve, precision_score, recall_score, f1_score, accuracy_score, RocCurveDisplay, balanced_accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from os import cpu_count
import pandas as pd
import numpy as np

: 

In [None]:
# Load the dataset from a local parquet file, then display its shape.
df = pd.read_parquet('dataset/cicdarknet.parquet')
df.shape

In [None]:
# Display the column names of the loaded frame.
df.columns

In [None]:
# Name of the column holding the class to predict; later cells reuse it.
target = 'Label'
# Show how many rows each class has.
df[target].value_counts()

In [None]:
# Remove the 'Label.1' column from `df` in place.
df.drop(columns=['Label.1'], inplace=True) # for now, do not use the program label

In [None]:
# Build one subset per binary problem. Each subset is a deep copy, so the
# label rewriting applied to it later does not modify `df`.
vpn = df.loc[df.Label.isin(['VPN', 'NonVPN'])].copy(deep=True)
tor = df.loc[df.Label.isin(['Tor', 'Non-Tor'])].copy(deep=True)
# df.drop(df.index[:], inplace=True)

## In this notebook the problems are treated as binary, first VPN or noVPN
The Label column typically holds the class; for some classes with multiple attacks, it labels them individually.
In this notebook the first focus is the binary problem, so we collapse the class label to 1 (VPN) and 0 (NonVPN).

In [None]:
# Encode the VPN problem as integers: VPN -> 1, NonVPN -> 0.
# The column is first cast to object so integer codes can be written into it,
# and cast to int32 once every label has been replaced.
vpn_label_codes = {'VPN': 1, 'NonVPN': 0}
vpn['Label'] = vpn['Label'].astype(dtype='object')
for label_name, label_code in vpn_label_codes.items():
    vpn.loc[vpn['Label'] == label_name, 'Label'] = label_code
print(vpn['Label'].value_counts())
vpn['Label'] = vpn['Label'].astype(dtype=np.int32)

In [None]:
# Every column of `df` except the target is used as an input feature.
conts = df.columns.difference([target]).tolist()
len(conts)

In [None]:
def xs_y(df_, targ):
    """Split a frame into independent copies of its inputs and its target(s).

    Args:
        df_: DataFrame containing both feature and target columns.
        targ: A single target column name, or a list of target column names.

    Returns:
        A ``(xs, y)`` tuple. ``xs`` is a copy of every column not named in
        ``targ`` (in the order produced by ``Index.difference``); ``y`` is a
        copy of ``df_[targ]``.
    """
    excluded = targ if isinstance(targ, list) else [targ]
    feature_columns = df_.columns.difference(excluded)
    return df_[feature_columns].copy(), df_[targ].copy()

In [None]:
# Reproducible split (random_state=42): 20% of the VPN rows are sampled
# without replacement for training; the rows whose index labels were not
# sampled form the test set.
training_set = vpn.sample(frac=0.2, replace=False, random_state=42)
testing_set = vpn.drop(index=training_set.index)
training_set.shape, testing_set.shape

In [None]:
# Separate features from the target for both splits. These module-level names
# are read by evaluate_one_feature.
X_train, y_train = xs_y(training_set, targ=target)
X_test, y_test = xs_y(testing_set, targ=target)

### VPN recognition: OneR, one rule per feature

In [None]:
def evaluate_one_feature(feature, index='', metric=roc_auc_score):
    """Fit a depth-1 decision tree on a single feature and score it.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``
    that are current when the function is called.

    Args:
        feature: Name of the column to train on.
        index: Unused by the body; kept so the signature stays unchanged.
        metric: Callable ``metric(y_true, y_pred)``; defaults to
            ``roc_auc_score``.

    Returns:
        ``[feature, score, model, test_predictions, train_predictions]`` when
        the score (rounded to 4 decimals) is above 0.5, otherwise
        ``[feature, score, None, [], []]``.

    NOTE(review): the score is computed on the test split, so filtering
    features by it uses test labels -- confirm this is intended.
    """
    train_column = X_train[feature].array.reshape(-1, 1)
    test_column = X_test[feature].array.reshape(-1, 1)

    stump = DecisionTreeClassifier(max_depth=1, criterion='gini', class_weight='balanced')
    stump.fit(train_column, y_train)

    test_predictions = stump.predict(test_column)
    train_predictions = stump.predict(train_column)
    score = round(metric(y_test, test_predictions), 4)

    # Features that do no better than chance carry no model or predictions.
    if not score > 0.5:
        return [feature, score, None, [], []]
    return [feature, score, stump, test_predictions, train_predictions]

In [None]:
# Try the single-feature evaluation on one column before running it on all of them.
evaluate_one_feature('Flow Bytes/s')

In [None]:
# Evaluate every feature with fastcore's `parallel`, using one worker per CPU.
# NOTE(review): with threadpool=False the workers are presumably separate
# processes that must see the module-level X_train/X_test -- confirm on the
# target platform.
results = parallel(f=evaluate_one_feature, 
                  items=conts, n_workers=cpu_count(), threadpool=False, progress=True)

In [None]:
# One row per evaluated feature, best score first.
result_columns = ['feature', 'roc_auc_score', 'fitted_models', 'predictions', 'preds_train']
result_df = pd.DataFrame(data=results, columns=result_columns)
result_df = result_df.sort_values(by='roc_auc_score', ascending=False)

## VPN recognition: many features with just 1 split choice above .70 AUROC

In [None]:
# Show the 15 best-scoring features with their scores.
result_df[['feature', 'roc_auc_score']].head(15)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Features and labels side by side for plotting. Use every training row:
# `.head(9)` would keep only the first 9 samples (rows, not features), which
# is far too few to fill the 100-bin histograms drawn from this frame.
data_for_plot = pd.concat(objs=[X_train, y_train], axis=1, copy=False, sort=False)

In [None]:
# 3x3 grid: one histogram per feature for the 9 first rows of result_df,
# coloured by class label.
fig, axes = plt.subplots(3, 3, figsize=(12, 12))
axes = axes.flatten()
top_features = result_df['feature'].head(9)
for ax, tf in zip(axes, top_features):
    sns.histplot(data=data_for_plot, x=tf, stat='percent', hue='Label', bins=100, ax=ax)

## VPN Ensemble OneR: 0.780 ROC_AUC score, still a weak predictor
The simplest extension is the ensemble OneR model.

1. It takes all the features which actually have predictive power
2. It uses those single-feature models to predict all samples
3. The new predicted output class for a sample is the average (here unweighted) of the predicted output classes from each of the OneR models.

If the metric accepts scores or probabilities then you can pass them directly (which is the case for roc_auc).

In [None]:
# Keep only the features whose single-split model scored above 0.5.
scores_above_chance = result_df['roc_auc_score'] > 0.5
useful_features = result_df.loc[scores_above_chance]
print(f"{len(useful_features)} / {len(conts)} features have direct separating power (linear)")

In [None]:
# Names of the features retained for the ensemble.
useful_features['feature'].values

In [None]:
# Stack the per-feature test predictions (one row per model) and average them
# per sample, giving one unweighted ensemble score for each test row.
stacked_test_preds = np.vstack(useful_features['predictions'].to_numpy())
ensemble_preds = stacked_test_preds.mean(axis=0)
ensemble_preds.shape

In [None]:
# Same unweighted average, computed from the predictions on the training rows.
stacked_train_preds = np.vstack(useful_features['preds_train'].to_numpy())
ensemble_preds_train = stacked_train_preds.mean(axis=0)
ensemble_preds_train.shape

In [None]:
# Choose the decision threshold on the training split only: compute the ROC
# curve of the ensemble scores, then take the threshold where the gap between
# true-positive rate and false-positive rate is largest.
fpr, tpr, thresholds = roc_curve(y_train, ensemble_preds_train)
# get the best threshold
J = tpr - fpr
ix = np.argmax(J)
best_thresh = thresholds[ix]
print("Best threshold", best_thresh)

In [None]:
# ROC-AUC is computed from the averaged scores; the other metrics need hard
# class labels, so the scores are cut once at the threshold chosen on the
# training split.
vpn_hard_preds = np.where(ensemble_preds >= best_thresh, 1, 0)
print("The Ensemble OneR model (simple average)")
print("ROC-AUC", round(roc_auc_score(y_true=y_test, y_score=ensemble_preds),4))
print("Precision", round(precision_score(y_true=y_test, y_pred=vpn_hard_preds), 4))
print("Recall", round(recall_score(y_true=y_test, y_pred=vpn_hard_preds), 4))
print("F1", round(f1_score(y_true=y_test, y_pred=vpn_hard_preds), 4))

## Next Tor or No Tor? -> 0.980 AUROC with an ensemble One-Rule model
Same label collapse

In [None]:
# Encode the Tor problem as integers: Tor -> 1, Non-Tor -> 0.
# The column is first cast to object so integer codes can be written into it,
# and cast to int32 once every label has been replaced.
tor_label_codes = {'Tor': 1, 'Non-Tor': 0}
tor['Label'] = tor['Label'].astype(dtype='object')
for label_name, label_code in tor_label_codes.items():
    tor.loc[tor['Label'] == label_name, 'Label'] = label_code
print(tor['Label'].value_counts())
tor['Label'] = tor['Label'].astype(dtype=np.int32)

In [None]:
# Reproducible split (random_state=42): 20% of the Tor rows are sampled
# without replacement for training; the rows whose index labels were not
# sampled form the test set. This rebinds training_set/testing_set.
training_set = tor.sample(frac=0.2, replace=False, random_state=42)
testing_set = tor.drop(index=training_set.index)
training_set.shape, testing_set.shape

In [None]:
# Rebind the module-level train/test features and targets to the Tor split, so
# evaluate_one_feature now works on the Tor problem.
X_train, y_train = xs_y(training_set, targ=target)
X_test, y_test = xs_y(testing_set, targ=target)

In [None]:
# Evaluate every feature on the Tor split, one worker per CPU.
# `cpu_count` is called directly because the notebook imports it with
# `from os import cpu_count`; the `os` module itself is never imported, so
# `os.cpu_count()` would raise NameError.
results = parallel(f=evaluate_one_feature, 
                  items=conts, n_workers=cpu_count(), threadpool=False, progress=True)

In [None]:
# One row per evaluated feature, best score first.
result_columns = ['feature', 'roc_auc_score', 'fitted_models', 'predictions', 'preds_train']
result_df = pd.DataFrame(data=results, columns=result_columns)
result_df = result_df.sort_values(by='roc_auc_score', ascending=False)

## Tor recognition: many features with just 1 split choice above .70 AUROC, 2 above 0.89

In [None]:
# Show the 15 best-scoring features with their scores.
result_df[['feature', 'roc_auc_score']].head(15)

In [None]:
# Features and labels side by side for plotting. Use every training row:
# `.head(9)` would keep only the first 9 samples (rows, not features), which
# is far too few to fill the 100-bin histograms drawn from this frame.
data_for_plot = pd.concat(objs=[X_train, y_train], axis=1, copy=False, sort=False)

In [None]:
# 3x3 grid: one histogram per feature for the 9 first rows of result_df,
# coloured by class label.
fig, axes = plt.subplots(3, 3, figsize=(12, 12))
axes = axes.flatten()
top_features = result_df['feature'].head(9)
for ax, tf in zip(axes, top_features):
    sns.histplot(data=data_for_plot, x=tf, stat='percent', hue='Label', bins=100, ax=ax)

In [None]:
# Keep only the features whose single-split model scored above 0.5.
scores_above_chance = result_df['roc_auc_score'] > 0.5
useful_features = result_df.loc[scores_above_chance]
print(f"{len(useful_features)} / {len(conts)} features have direct separating power (linear)")

In [None]:
# Names of the features retained for the ensemble.
useful_features['feature'].values

In [None]:
# Stack the per-feature test predictions (one row per model) and average them
# per sample, giving one unweighted ensemble score for each test row.
stacked_test_preds = np.vstack(useful_features['predictions'].to_numpy())
ensemble_preds = stacked_test_preds.mean(axis=0)
ensemble_preds.shape

In [None]:
# Same unweighted average, computed from the predictions on the training rows.
stacked_train_preds = np.vstack(useful_features['preds_train'].to_numpy())
ensemble_preds_train = stacked_train_preds.mean(axis=0)
ensemble_preds_train.shape

In [None]:
# Choose the decision threshold on the training split only: compute the ROC
# curve of the ensemble scores, then take the threshold where the gap between
# true-positive rate and false-positive rate is largest.
fpr, tpr, thresholds = roc_curve(y_train, ensemble_preds_train)
# get the best threshold
J = tpr - fpr
ix = np.argmax(J)
best_thresh = thresholds[ix]
print("Best threshold", best_thresh)

In [None]:
# Plot the ROC curve of the ensemble scores on the training split
# (y_train against ensemble_preds_train), with class 1 as the positive label.
fpr, tpr, _ = roc_curve(y_train, ensemble_preds_train, pos_label=1)
roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr).plot()

In [None]:
# ROC-AUC is computed from the averaged scores; the other metrics need hard
# class labels, so the scores are cut once at the threshold chosen on the
# training split.
tor_hard_preds = np.where(ensemble_preds >= best_thresh, 1, 0)
print("The Ensemble OneR model (simple average)")
print("ROC-AUC", round(roc_auc_score(y_true=y_test, y_score=ensemble_preds),4))
print("Precision", round(precision_score(y_true=y_test, y_pred=tor_hard_preds), 4))
print("Recall", round(recall_score(y_true=y_test, y_pred=tor_hard_preds), 4))
print("F1", round(f1_score(y_true=y_test, y_pred=tor_hard_preds), 4))
print("Accuracy", round(accuracy_score(y_true=y_test, y_pred=tor_hard_preds), 4))
print("Balanced accuracy", round(balanced_accuracy_score(y_true=y_test, y_pred=tor_hard_preds), 4))

In [None]:
# Confusion matrix of the thresholded ensemble predictions on the test split,
# rendered as a plot.
cm = confusion_matrix(y_true=y_test, y_pred=np.where(ensemble_preds >= best_thresh, 1, 0))
cm_display = ConfusionMatrixDisplay(cm).plot()