In [None]:
from fastcore.basics import *
from fastcore.parallel import *
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, roc_curve, precision_score, recall_score, f1_score, accuracy_score, balanced_accuracy_score, confusion_matrix, ConfusionMatrixDisplay
import pandas as pd
import numpy as np
# import seaborn as sns
import matplotlib.pyplot as plt
import os


# Read the full dataset from disk.
df = pd.read_parquet('dataset/cicdarknet.parquet')

target = 'Label'
df.drop(columns=['Label.1'], inplace=True)  # second label column is not used here

# Restrict to the two classes of interest and work on an independent copy.
is_tor = df[target] == 'Tor'
is_non_tor = df[target] == 'Non-Tor'
tor = df.loc[is_tor | is_non_tor].copy(deep=True)

# Binary-encode the target: 'Tor' -> 1, 'Non-Tor' -> 0 (int32).
tor[target] = (tor[target] == 'Tor').astype(np.int32)

# Hold-out split: a reproducible random sample becomes the training set and
# every remaining row becomes the testing set.
# NOTE(review): frac=0.2 means 20% of rows train and 80% test — confirm intended.
training_set = tor.sample(frac=0.2, replace=False, random_state=42)
testing_set = tor.drop(index=training_set.index)

X_train = training_set.drop(columns=[target])
y_train = training_set[target]
X_test = testing_set.drop(columns=[target])
y_test = testing_set[target]

# Candidate features for the OneR evaluation: every column except the target.
conts = df.columns.difference([target]).tolist()

def evaluate_one_feature(feature, metric=roc_auc_score):
    """Fit a depth-1 decision tree on a single feature and score it.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    Args:
        feature: Name of the column to use as the only predictor.
        metric: Callable invoked as ``metric(y_test, predictions)``; defaults
            to ``roc_auc_score``.

    Returns:
        A list ``[feature, score, model, preds, preds_train]`` where ``score``
        is the metric on the test split rounded to 4 decimals. When the score
        is not greater than 0.5 the model is ``None`` and both prediction
        entries are empty lists.

    NOTE(review): the score is computed from hard class predictions on the
    test split, so downstream feature selection sees test labels — confirm
    this is acceptable for the experiment.
    """
    column = [feature]
    stump = DecisionTreeClassifier(criterion='gini', max_depth=1, class_weight='balanced')
    stump.fit(X_train[column], y_train)

    test_preds = stump.predict(X_test[column])
    train_preds = stump.predict(X_train[column])
    score = round(metric(y_test, test_preds), 4)

    # Features that do not beat 0.5 carry no model or predictions.
    if not (score > 0.5):
        return [feature, score, None, [], []]
    return [feature, score, stump, test_preds, train_preds]

# Run the single-feature evaluation for every candidate column. With
# threadpool=False fastcore uses worker processes rather than threads.
results = parallel(f=evaluate_one_feature, items=conts, n_workers=os.cpu_count(), threadpool=False, progress=True)
result_df = pd.DataFrame(
    results,
    columns=['feature', 'roc_auc_score', 'fitted_models', 'predictions', 'preds_train'],
).sort_values(by='roc_auc_score', ascending=False)

# Select useful features (same 0.5 cut-off that evaluate_one_feature applies).
useful_features = result_df.loc[result_df['roc_auc_score'] > 0.5]
if useful_features.empty:
    # np.vstack would otherwise fail with an opaque empty-sequence error.
    raise ValueError("No feature scored above 0.5; cannot build the ensemble")

# Ensemble score per row = mean of the per-feature hard predictions.
ensemble_preds = np.mean(np.vstack(useful_features['predictions'].to_numpy()), axis=0)
ensemble_preds_train = np.mean(np.vstack(useful_features['preds_train'].to_numpy()), axis=0)

# Determine best threshold on the training split by maximising
# Youden's J statistic (TPR - FPR).
fpr, tpr, thresholds = roc_curve(y_train, ensemble_preds_train)
J = tpr - fpr
ix = np.argmax(J)
best_thresh = thresholds[ix]

# Threshold the ensemble score once and reuse it for every metric below.
y_pred = np.where(ensemble_preds >= best_thresh, 1, 0)

# Evaluate model
print("Best threshold", best_thresh)
print("ROC-AUC", round(roc_auc_score(y_true=y_test, y_score=ensemble_preds), 4))
for label, score_fn in (
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1 Score", f1_score),
    ("Accuracy", accuracy_score),
    ("Balanced Accuracy", balanced_accuracy_score),
):
    print(label, round(score_fn(y_true=y_test, y_pred=y_pred), 4))

# Confusion matrix
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
ConfusionMatrixDisplay(cm).plot()
plt.show()
