In [1]:
import os
import pickle

import numpy as np
from sklearn.cluster import DBSCAN

import matplotlib.pyplot as plt
import matplotlib as mpl

%matplotlib inline
%config InlineBackend.figure_format = 'svg'

base_alpha = .4
alpha_diff = .6

# mpl.rcParams['font.size'] = 22
mpl.rcParams['grid.linestyle'] = ':'
mpl.rcParams['lines.markersize'] = 8
# font = fm.FontProperties(
#         family = 'Cambria', fname = 'C:\\Windows\\Fonts\\Cambria.ttf')

figure_format = 'svg'

In [2]:
# Seed NumPy's global random generator so every np.random draw made by the
# notebook is reproducible from a fresh kernel.
seed = 42
np.random.seed(seed)

# Directory holding the dataset. The default is the original absolute path;
# set the ECOC_DEMO_DATA_DIR environment variable to run the notebook on a
# machine where the data lives elsewhere.
folder = os.environ.get('ECOC_DEMO_DATA_DIR', '/nobackup/carda/datasets/2019-ecoc-demo')

In [3]:
# Load the dataset bundle. Despite the '.h5' extension, the file is read with
# pickle and is expected to hold a dict with the three keys used below.
# NOTE(review): pickle.load can execute arbitrary code - only open trusted files.
dataset_path = os.path.join(folder, 'scaled_final_dataset.h5')
with open(dataset_path, 'rb') as handle:
    dataset = pickle.load(handle)

scaled_final_dataframe = dataset['scaled_final_dataframe']
class_names = dataset['class_names']
class_columns = dataset['class_columns']

# Drop the wrapper dict; only the three extracted objects are kept.
del dataset

In [4]:
# Rows available per class: entry k is the number of dataframe rows whose
# 'attack' column equals k, with one entry per element of class_names.
number_samples_attack = [
    len(scaled_final_dataframe[scaled_final_dataframe['attack'] == class_id])
    for class_id, _ in enumerate(class_names)
]

In [5]:
# Number of random draws per sampling loop in the evaluation cell.
num_tests = 50

# Sizes of the window of rows with attack == 0 that are evaluated.
samples_normal_vec = [
    30, 50, 75, 100, 125, 150, 175,
    200, 225, 250, 275, 300, 400, 500,
]

# Rows drawn from each class with attack > 0 for every test set.
samples_abnormal = 30

# DBSCAN hyper-parameter grid: neighbourhood radius (eps) and min_samples.
epsilon_configurations = [0.1, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 5.0, 10.0]
min_samples_configurations = [3, 5, 8, 10, 12, 15, 20]

In [6]:
# Grid search of DBSCAN used as an unsupervised anomaly detector.
#
# For every window size in `samples_normal_vec` and every (min_samples, epsilon)
# pair, DBSCAN is fitted on test sets made of one contiguous window of rows with
# attack == 0 plus one window of `samples_abnormal` rows per class with
# attack > 0. Points that DBSCAN labels -1 (noise) count as detected anomalies.
#
# Results:
#   performance_dbscan[samples_normal][min_samples][epsilon] -> dict of mean metrics
#   highest_f1[samples_normal] -> best mean F1 found for that window size
performance_dbscan = {}
highest_f1 = {}

# Split the dataframe by class once. The original code repeated this filtering
# inside the innermost trial loop; the split does not depend on any loop
# variable, so hoisting it leaves the results unchanged.
# Assumes the last column of the dataframe is the 'attack' label (it is used as
# the ground truth below) - TODO confirm against the dataset-building notebook.
class_values = [
    scaled_final_dataframe[scaled_final_dataframe['attack'] == clazz].values
    for clazz in range(len(class_names))
]

for ids, samples_normal in enumerate(samples_normal_vec):

    # one (initially empty) result slot per (min_samples, epsilon) pair
    performance_dbscan[samples_normal] = {
        min_samples: {epsilon: {} for epsilon in epsilon_configurations}
        for min_samples in min_samples_configurations
    }
    highest_f1[samples_normal] = .0

    for min_samples in min_samples_configurations:

        for epsilon in epsilon_configurations:

            clf = DBSCAN(eps=epsilon, min_samples=min_samples, metric='euclidean')

            true_positives = []
            false_negatives = []
            true_negatives = []
            false_positives = []
            precisions = []
            recalls = []
            f1_scores = []

            # Each of the num_tests windows of normal rows is combined with
            # num_tests independent draws of attack windows, i.e. num_tests**2
            # DBSCAN fits per configuration.
            for test in range(num_tests):
                # randint's upper bound is exclusive, so the very last possible
                # window is never drawn; left as is to keep the seeded random
                # stream (and therefore the recorded results) unchanged.
                start_index = np.random.randint(number_samples_attack[0]-samples_normal)
                normal_rows = class_values[0][start_index:start_index+samples_normal]
                x_data = normal_rows[:, :-1]
                y_data = normal_rows[:, -1]

                for i in range(num_tests):
                    x_parts = [x_data]
                    y_parts = [y_data]
                    for clazz in range(1, len(class_names)):
                        start_index = np.random.randint(number_samples_attack[clazz]-samples_abnormal)
                        attack_rows = class_values[clazz][start_index:start_index+samples_abnormal]
                        x_parts.append(attack_rows[:, :-1])
                        y_parts.append(attack_rows[:, -1])
                    x_data_test = np.concatenate(x_parts, axis=0)
                    y_data_test = np.concatenate(y_parts, axis=0)

                    y_pred = clf.fit_predict(x_data_test)

                    # Boolean masks replace the per-element Python loops.
                    is_attack = y_data_test > 0
                    is_normal = y_data_test == 0
                    is_outlier = y_pred == -1    # DBSCAN noise -> flagged as anomaly
                    is_clustered = y_pred >= 0   # assigned to a cluster -> treated as normal

                    num_attack = np.sum(is_attack)
                    num_normal = np.sum(is_normal)

                    tpr = np.sum(is_outlier & is_attack) / num_attack
                    fnr = np.sum(is_clustered & is_attack) / num_attack

                    tnr = np.sum(is_clustered & is_normal) / num_normal
                    fpr = np.sum(is_outlier & is_normal) / num_normal

                    # NOTE(review): precision/recall are derived from the rates,
                    # not from raw counts, so precision is normalised by class
                    # size - confirm this is the intended definition.
                    if tpr == 0: # avoids division by zero
                        precision = 0.
                        recall = 0.
                        f1 = 0.
                    else:
                        precision = tpr / (tpr + fpr)
                        recall = tpr / (tpr + fnr)
                        f1 = 2 * precision * recall / (precision + recall)

                    true_positives.append(tpr)
                    false_negatives.append(fnr)
                    true_negatives.append(tnr)
                    false_positives.append(fpr)
                    precisions.append(precision)
                    recalls.append(recall)
                    f1_scores.append(f1)

            # Store the mean of every metric over all trials of this configuration.
            result = performance_dbscan[samples_normal][min_samples][epsilon]
            result['min_samples'] = min_samples
            result['epsilon'] = epsilon
            result['true_positive_rate'] = np.mean(true_positives)
            result['false_negative_rate'] = np.mean(false_negatives)
            result['true_negative_rate'] = np.mean(true_negatives)
            result['false_positive_rate'] = np.mean(false_positives)
            result['precision'] = np.mean(precisions)
            result['recall'] = np.mean(recalls)
            result['f1_score'] = np.mean(f1_scores)

            # Only configurations that improve the best F1 so far are printed.
            msg = ''
            if result['f1_score'] > highest_f1[samples_normal]:
                msg = '\t * highest'
                highest_f1[samples_normal] = result['f1_score']
                print(f'{samples_normal:>4}\t{min_samples:>6}\t', f'{epsilon:>6}', '\t{:.3f}\t{:.3f}\t{:.3f}\t'.format(result['false_positive_rate'],
                    result['false_negative_rate'],
                    result['f1_score']), msg)

    # progress: window sizes completed / total
    print(ids+1, ' / ', len(samples_normal_vec))

  30	     3	    0.1 	1.000	0.000	0.667	 	 * highest
  30	     5	    0.5 	0.910	0.024	0.678	 	 * highest
  30	    10	    1.0 	0.615	0.155	0.691	 	 * highest
  30	    12	    1.0 	0.658	0.098	0.711	 	 * highest
1  /  14
  50	     3	    0.1 	1.000	0.000	0.667	 	 * highest
  50	     3	    0.5 	0.651	0.107	0.704	 	 * highest
  50	     8	    1.0 	0.367	0.240	0.714	 	 * highest
  50	    10	    1.0 	0.487	0.153	0.729	 	 * highest
  50	    12	    1.0 	0.517	0.099	0.750	 	 * highest
2  /  14
  75	     3	    0.1 	1.000	0.000	0.667	 	 * highest
  75	     3	    0.5 	0.584	0.106	0.724	 	 * highest
  75	     8	    1.0 	0.261	0.244	0.750	 	 * highest
  75	    10	    1.0 	0.369	0.162	0.761	 	 * highest
  75	    12	    1.0 	0.415	0.101	0.781	 	 * highest
3  /  14
 100	     3	    0.1 	1.000	0.000	0.667	 	 * highest
 100	     3	    0.5 	0.557	0.106	0.731	 	 * highest
 100	     8	    1.0 	0.213	0.247	0.765	 	 * highest
 100	    10	    1.0 	0.249	0.160	0.804	 	 * highest
 100	    12	    1.0 	0.310	0.112	0.80

In [7]:
# Persist the grid-search results as a pickled (performance_dbscan, highest_f1)
# tuple. Despite the '.h5' extension the file is written with pickle.
results_path = '../results/results-dbscan.h5'

# Create the target directory first so the output of the long-running search
# is not lost to a FileNotFoundError when '../results' does not exist yet.
os.makedirs(os.path.dirname(results_path), exist_ok=True)

with open(results_path, 'wb') as file:
    pickle.dump((performance_dbscan, highest_f1), file)