In [None]:
import gower
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import importlib
import seaborn as sns
from sklearn.metrics import rand_score, adjusted_rand_score
import sys
import os
from sklearn.metrics import f1_score
## import tSNE

In [None]:
## import custom scripts
# Make the modules in the `scripts` folder importable; the relative path is
# resolved against the current working directory.
sys.path.append(os.path.abspath(os.path.join('scripts')))
import preprocessing
import autoencoder_detection
import isolation_forest_detection
import lof_detection
import knn_detection
import dbscan_detection
import pca_detection
import utils

# Reload every project module so that edits made on disk are picked up
# without restarting the kernel.
importlib.reload(utils)
importlib.reload(lof_detection)
importlib.reload(autoencoder_detection)
importlib.reload(preprocessing)
importlib.reload(isolation_forest_detection)
importlib.reload(knn_detection)
importlib.reload(dbscan_detection)
importlib.reload(pca_detection)

In [None]:
# Load dataset and compute Gower distance matrix
# `df` is also passed directly to the isolation forest below, while `dm` is the
# input of the detectors that take a `distance_matrix` argument.
df = preprocessing.load_dataset(scaler=None, categ=False)
dm = gower.gower_matrix(df)

In [None]:
# Run all detection methods and get the indices of the detected anomalies
# PCA-based detector on the Gower distances; the value counts show how many
# instances received each label.
# NOTE(review): the later vote count assumes -1 = outlier, 0 = inlier — confirm in pca_detection.
pca_indices = pca_detection.main(distance_matrix=dm)
pd.DataFrame(pca_indices).value_counts()

In [None]:
# DBSCAN-based detector on the Gower distances, followed by the count of each label.
dbscan_indices = dbscan_detection.main(distance_matrix=dm)
pd.DataFrame(dbscan_indices).value_counts()

In [None]:
# LOF-based detector on the Gower distances, followed by the count of each label.
lof_indices = lof_detection.main(distance_matrix=dm)
pd.DataFrame(lof_indices).value_counts()

In [None]:
# Train the autoencoder and collect its labels as a NumPy array.
# The module is reloaded first so the latest version on disk is the one trained.
importlib.reload(autoencoder_detection)
# The autoencoder gets its own copy of the dataset, loaded with the default
# `categ` setting rather than the `categ=False` frame used for the distances.
autoencoder_input = preprocessing.load_dataset(scaler=None)
autoencoder_labels = autoencoder_detection.main(
    dataframe=autoencoder_input,
    normalize=False,
)
autoencoder_indices = np.array(autoencoder_labels)


In [None]:
# Isolation forest works on the raw dataframe rather than on the distance matrix.
iso_predictions = isolation_forest_detection.main(dataframe=df)
# Map label 1 to 0 so this detector uses the same 0 / -1 coding summed later on.
iso_frame = pd.DataFrame(iso_predictions).replace({1: 0})
# Keep only the first column as a flat array.
isolation_forest_indices = iso_frame.to_numpy()[:, 0]
pd.DataFrame(isolation_forest_indices).value_counts()

In [None]:
# kNN-based detector on the Gower distances; labels are stored as a NumPy array.
knn_labels = knn_detection.main(distance_matrix=dm)
knn_indices = np.array(knn_labels)
pd.DataFrame(knn_indices).value_counts()

In [None]:
# Number of methods that classified each instance as an outlier, in [0, 6].
# The per-method labels are added element-wise in the order listed; the sum is
# then negated so that the result is a non-negative vote count.
method_labels = [
    lof_indices,
    autoencoder_indices,
    isolation_forest_indices,
    knn_indices,
    dbscan_indices,
    pca_indices,
]
# sum(rest, first) performs first + rest[0] + rest[1] + ... from left to right.
summed_labels = sum(method_labels[1:], method_labels[0])
tot_indices = -1 * summed_labels

In [None]:
# Dataframe used for the assessment of the results: the ensemble vote count
# first, followed by one column per detection method.
comparison = pd.DataFrame()
comparison_columns = {
    'tot_indices': tot_indices,
    'lof_indices': lof_indices,
    'autoencoder_indices': autoencoder_indices,
    'isolation_forest_indices': isolation_forest_indices,
    'knn_indices': knn_indices,
    'dbscan_indices': dbscan_indices,
    'pca_indices': pca_indices,
}
for column_name, column_values in comparison_columns.items():
    comparison[column_name] = column_values
# Negated column sums, skipping the first (vote count) column.
comparison.sum(axis=0).mul(-1).iloc[1:]

In [None]:
# Turn each vote count into an outlier probability with our sigmoid helper.
vote_to_prob = utils.sigmoid_to_prob(k=6)  # returns the callable applied to each count
outl_probs = [vote_to_prob(votes) for votes in tot_indices]
# Distribution of the resulting probabilities, ordered by value.
pd.DataFrame(outl_probs, columns=["Probability"]).value_counts().sort_index()

In [None]:
# Save the results by concatenating the probabilities to the original dataframe.
# NOTE: this adds the column to `df` in place. Re-running earlier cells that
# consume `df` (Gower matrix, isolation forest) after this one would treat
# `outlierProb` as a feature, so reload the dataset before doing that.
df['outlierProb'] = outl_probs
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('output', exist_ok=True)
df.to_csv('output/results.csv', index=False)

In [None]:
# Spot-check the vote count and per-method labels of one instance (row position 1077).
# NOTE(review): the position is hard-coded; this raises IndexError if the
# dataset has fewer than 1078 rows.
comparison.iloc[1077,:]

In [None]:
# t-SNE plot built from the Gower distance matrix, with the outlier
# probabilities passed as the point labels.
utils.plot_TSNE_2(dist_matrix= dm, labels=outl_probs)

In [None]:
# Boolean masks: instances flagged by at least 5 methods, and all the others.
out = tot_indices >= 5
non_out = ~out

In [None]:
def _false_flag_summary(mask):
    """Describe, for the rows selected by `mask`, the absolute row-wise sum of (binary columns - 1)."""
    binary_block = df[mask].iloc[:, utils.binary_indices]
    return (binary_block - 1).sum(axis=1).abs().describe()

# Side-by-side summary statistics for the two groups.
df_falses = pd.concat(
    {
        'non_outliers': _false_flag_summary(non_out),
        'outliers': _false_flag_summary(out),
    },
    axis=1,
)
df_falses

In [None]:
## The ensemble method classifies an instance as an outlier if at least 5 methods do.
# Single mapping from display name to label vector, so names and labels cannot
# drift apart; it is split into the two parallel lists used downstream.
# (A "true_labels" entry can be appended here when ground truth is available.)
labels_by_method = {
    "lof": lof_indices,
    "autoencoder": autoencoder_indices,
    "iso_forest": isolation_forest_indices,
    "knn": knn_indices,
    "dbscan": dbscan_indices,
    "pca": pca_indices,
    # ensemble labels use the same coding as the detectors: -1 outlier, 0 otherwise
    "Ensemble": (-1 * (np.array(tot_indices) >= 5)),
}
list_of_indices = list(labels_by_method.values())
method_names = list(labels_by_method.keys())

In [None]:
# NOTE(review): `jaccard` is only referenced from commented-out code in this
# cell; presumably the -1 argument is the label treated as "outlier" — confirm in utils.
jaccard = utils.jaccard_index(-1) ## returns a callable function

def compute_indices(label_lists, metric=None):
    """Build the symmetric matrix of pairwise agreement scores between labelings.

    Parameters
    ----------
    label_lists : sequence of array-like
        One label vector per method; all vectors describe the same instances.
    metric : callable, optional
        Function ``metric(labels_a, labels_b) -> float`` used to score a pair
        of labelings. It must be symmetric, because only the upper triangle
        (diagonal included) is evaluated and then mirrored. Defaults to
        ``adjusted_rand_score``; other options are e.g. ``rand_score`` or a
        Jaccard callable.

    Returns
    -------
    numpy.ndarray
        Float matrix of shape ``(n, n)`` with ``n = len(label_lists)``, where
        entry ``[i, j]`` is the score between labelings ``i`` and ``j``.
    """
    if metric is None:
        # Resolved at call time so the default follows the notebook's import.
        metric = adjusted_rand_score
    n = len(label_lists)
    indices = np.zeros((n, n))
    for i in range(n):
        for j in range(i, n):
            # Each unordered pair is scored once and written to both halves.
            score = metric(label_lists[i], label_lists[j])
            indices[i, j] = score
            indices[j, i] = score
    return indices

def plot_heatmap(matrix, labels, title='Adjusted Rand index heatmap',
                 output_path='output/jaccard_similarity.png'):
    """Draw an annotated heatmap of a pairwise score matrix, save it and show it.

    Parameters
    ----------
    matrix : 2-D array-like
        Square matrix of pairwise scores, one row/column per method.
    labels : sequence of str
        Tick labels used on both axes, in the same order as the matrix rows.
    title : str, optional
        Figure title. The default names the adjusted Rand index, which is the
        score this notebook plots; pass another title when plotting a
        different metric.
    output_path : str, optional
        Where the figure is written. The default keeps the historical file
        name so existing references to it keep working.
    """
    # savefig does not create missing directories.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    plt.figure(figsize=(12, 10))
    sns.set_theme(font_scale=1.7)  # Increase the font scale
    # vmin=0 pins the bottom of the colour scale at zero; values below zero
    # share the lowest colour but are still annotated with their number.
    ax = sns.heatmap(matrix, xticklabels=labels, yticklabels=labels, annot=True, cmap="viridis", vmin=0)
    ax.set_title(title, fontsize=20)  # Increase title font size
    plt.tight_layout()
    plt.savefig(output_path)
    plt.show()


In [None]:
# Pairwise agreement between all methods (ensemble included), shown as a heatmap.
agreement_matrix = compute_indices(list_of_indices)
plot_heatmap(agreement_matrix, labels=method_names)

In [None]:
# Share of instances flagged by at least 5 methods ("hard" outliers).
hard_outlier_count = np.count_nonzero(tot_indices >= 5)
hard_outlier_pct = 100 * (hard_outlier_count / len(tot_indices))
print(f"percentage of hard outlier is: {hard_outlier_pct:.2f}%")