In [None]:
# --- standard library ---
import importlib
import os
import sys

# --- third-party ---
import gower
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from kneed import KneeLocator
from sklearn.neighbors import NearestNeighbors

# Make the sibling ``scripts`` directory importable. The path is resolved
# relative to the current working directory, so the notebook is expected to be
# run from its own folder.
scripts_dir = os.path.abspath(os.path.join('..', 'scripts'))
sys.path.append(scripts_dir)

# --- project-local helpers (found through the path added above) ---
import utils
import preprocessing

# Re-import the helpers so that edits made to them on disk are picked up
# without restarting the kernel.
importlib.reload(utils)
importlib.reload(preprocessing)



In [None]:
# Load the dataset through the project's preprocessing helper.
# NOTE(review): presumably returns a pandas DataFrame (it is passed straight to
# gower.gower_matrix below) — confirm against scripts/preprocessing.py.
df = preprocessing.load_dataset()

In [None]:
# Pairwise Gower distances between all rows of the dataset.
distance_matrix = gower.gower_matrix(df)

# Sanity check: visualise only the top-left 9x9 corner of the matrix.
distance_preview = distance_matrix[:9, :9]
sns.heatmap(distance_preview, cmap='viridis')

In [None]:
# Number of flagged outliers for each candidate k, in the order the k values are tried
outliers_count = []

# Grid search over k. For every k the loop:
#   1. computes each point's distance to its k-th nearest neighbour,
#   2. locates the knee of the sorted distance curve,
#   3. flags every point whose distance exceeds the knee value as an outlier,
#   4. plots the sorted curve together with the knee position.
for k in range(4, 20):
    # fit KNN on the precomputed distance matrix
    neigh = NearestNeighbors(n_neighbors=k, metric='precomputed')
    neigh.fit(distance_matrix)

    # Query the fitted points themselves by calling kneighbors() WITHOUT an
    # argument: scikit-learn then does not count a point as its own neighbour.
    # Passing distance_matrix explicitly would return each point itself at
    # distance 0, so the last column would hold the (k-1)-th neighbour distance.
    distances, indices = neigh.kneighbors()

    # distance of every point to its k-th nearest neighbour, plus a sorted copy
    distances_kth = distances[:, -1]
    distances_kth_sorted = np.sort(distances_kth)

    # get the threshold using a knee based approach
    # (S is kneed's sensitivity parameter: larger values are more conservative)
    kl = KneeLocator(np.arange(len(distances_kth_sorted)),
                     distances_kth_sorted,
                     S=3,
                     curve='convex',
                     direction='increasing')

    # kneed leaves knee / knee_y as None when no knee is detected; comparing
    # against None or drawing a line at None would raise, so handle it explicitly.
    knee_found = kl.knee is not None

    if knee_found:
        # label outliers: points with a distance greater than the knee point
        # distance are marked -1, every other point 0
        labels = (distances_kth > kl.knee_y) * -1
    else:
        # without a threshold nothing can be flagged
        labels = np.zeros(len(distances_kth), dtype=int)
        print(f"For k={k}, no knee was detected; no point is flagged as an outlier")

    # count the number of outliers
    num_outliers = np.sum(labels == -1)
    outliers_count.append(num_outliers)
    print(f"For k={k}, the number of outliers is {num_outliers}, the plot is below")

    plt.plot(distances_kth_sorted)
    plt.xlabel('Data points')
    plt.ylabel(f"Distance to the {k}-th nearest neighbor")
    plt.title(f"Knee method on sorted distances to the {k}-th nearest neighbor")
    if knee_found:
        # vertical marker at the position of the knee on the sorted curve
        plt.axvline(kl.knee, color='red', linestyle='dashed')
    plt.show()

In [None]:
# Summary figure: how the number of flagged outliers varies with k.
# The k values must match the ones used to fill outliers_count.
k_grid = range(4, 20)
plt.plot(k_grid, outliers_count, marker='x')

# Decorate the axes that the plot call above drew on.
summary_ax = plt.gca()
summary_ax.set_title('Number of Outliers for Different k')
summary_ax.set_xlabel('Number of Neighbors (k)')
summary_ax.set_ylabel('Number of Outliers')
summary_ax.grid(True)

plt.show()