In [None]:
# -*- coding: utf-8 -*-
"""k-Nearest Neighbors Detector (kNN)"""

# Importing necessary libraries
from warnings import warn
import numpy as np
from sklearn.neighbors import BallTree, NearestNeighbors
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted
from .base import BaseDetector

# kNN class inherits from BaseDetector, used for outlier detection based on the distance to nearest neighbors
class KNN(BaseDetector):
    """kNN-based outlier detector.

    Computes an outlier score based on the distance of a point to its k
    nearest neighbors. The per-sample score is the largest, the mean or the
    median of those k distances, selected with ``method``.

    Attributes set by ``fit``:
    - neigh_: the underlying ``NearestNeighbors`` model (created in ``__init__``).
    - tree_: the tree queried by ``decision_function``.
    - decision_scores_: outlier scores of the training samples.
    ``threshold_`` and ``labels_`` are also expected after fitting; they are
    presumably set by ``BaseDetector._process_decision_scores`` (not visible
    in this file).
    """

    def __init__(self, contamination=0.1, n_neighbors=5, method='largest',
                 radius=1.0, algorithm='auto', leaf_size=30,
                 metric='minkowski', p=2, metric_params=None, n_jobs=1,
                 **kwargs):
        """
        Initialize KNN detector with various parameters.

        Parameters:
        - contamination: Proportion of outliers in the dataset
          (forwarded to ``BaseDetector``).
        - n_neighbors: Number of neighbors to use.
        - method: Method for calculating outlier score:
          'largest', 'mean' or 'median'.
        - radius: Forwarded to ``NearestNeighbors``.
        - algorithm: Nearest neighbor search algorithm. Any value other than
          'auto' or 'ball_tree' triggers a FutureWarning.
        - leaf_size: Leaf size for ``NearestNeighbors`` and the fallback BallTree.
        - metric: Distance metric for neighbor calculation.
        - p: Minkowski power parameter.
        - metric_params: Optional dict of extra keyword arguments for the metric.
        - n_jobs: Forwarded to ``NearestNeighbors``.
        - **kwargs: Any further keyword arguments for ``NearestNeighbors``.
        """
        super(KNN, self).__init__(contamination=contamination)

        # Setting class attributes
        self.n_neighbors = n_neighbors
        self.method = method
        self.radius = radius
        self.algorithm = algorithm
        self.leaf_size = leaf_size
        self.metric = metric
        self.p = p
        self.metric_params = metric_params
        self.n_jobs = n_jobs

        # Warns user if algorithm is deprecated
        if self.algorithm not in ('auto', 'ball_tree'):
            warn('algorithm parameter is deprecated and will be removed in version 0.7.6. By default, ball_tree will be used.',
                 FutureWarning)

        # Initializing NearestNeighbors model
        self.neigh_ = NearestNeighbors(n_neighbors=self.n_neighbors,
                                       radius=self.radius,
                                       algorithm=self.algorithm,
                                       leaf_size=self.leaf_size,
                                       metric=self.metric,
                                       p=self.p,
                                       metric_params=self.metric_params,
                                       n_jobs=self.n_jobs,
                                       **kwargs)

    def fit(self, X, y=None):
        """Fit the kNN detector on the dataset X.

        Parameters:
        - X: Input samples, numpy array.
        - y: Not used for fitting; only handed to ``_set_n_classes``.

        Returns:
        - self: Fitted kNN detector object.

        Raises:
        - ValueError: if ``method`` is not 'largest', 'mean' or 'median'.
        """
        # Validates input and fits the model on X
        X = check_array(X)
        self._set_n_classes(y)
        self.neigh_.fit(X)

        # Reuse the tree NearestNeighbors built, if any. ``_tree`` is a private
        # sklearn attribute; it is None when no tree was built (presumably the
        # brute-force case), so build our own for ``decision_function``.
        fitted_tree = self.neigh_._tree
        if fitted_tree is not None:
            self.tree_ = fitted_tree
        else:
            self.tree_ = self._build_fallback_tree(X)

        # Distances from every training sample to its neighbors. Calling
        # kneighbors without X queries the training set itself.
        dist_arr, _ = self.neigh_.kneighbors(n_neighbors=self.n_neighbors,
                                             return_distance=True)
        dist = self._get_dist_by_method(dist_arr)

        # Storing outlier scores and setting up decision threshold
        self.decision_scores_ = dist.ravel()
        self._process_decision_scores()

        return self

    def _build_fallback_tree(self, X):
        """Build a BallTree on X using the detector's own metric settings.

        ``p`` is forwarded for the Minkowski metric so that this tree measures
        the same distance as ``neigh_``; without it the BallTree would fall
        back to its own default and ``decision_function`` would disagree with
        the training scores whenever ``p != 2``.
        """
        tree_kwargs = dict(self.metric_params) if self.metric_params is not None else {}
        if self.metric == 'minkowski' and self.p is not None:
            # An explicit 'p' inside metric_params keeps precedence.
            tree_kwargs.setdefault('p', self.p)
        return BallTree(X, leaf_size=self.leaf_size, metric=self.metric,
                        **tree_kwargs)

    def decision_function(self, X):
        """Calculate anomaly scores for input samples X.

        Every row of X is treated as a new query point against the fitted
        tree, so a training sample passed in here finds itself at distance 0.

        Parameters:
        - X: Input samples, numpy array.

        Returns:
        - anomaly_scores: 1-D array with one anomaly score per row of X.
        """
        # Checks if model has been fitted before calling this method
        check_is_fitted(self, ['tree_', 'decision_scores_', 'threshold_', 'labels_'])
        X = check_array(X)

        # One batched query replaces the former per-row Python loop; the
        # distances returned for each row are the same.
        dist_arr, _ = self.tree_.query(X, k=self.n_neighbors)
        return np.asarray(self._get_dist_by_method(dist_arr), dtype=float).ravel()

    def _get_dist_by_method(self, dist_arr):
        """Determine outlier score based on distance calculation method.

        Parameters:
        - dist_arr: 2-D distance array, one row per sample and one column per
          neighbor.

        Returns:
        - dist: 1-D array of outlier scores, one per row of ``dist_arr``.

        Raises:
        - ValueError: if ``self.method`` is not a supported method. Previously
          this case returned None and failed later with an unrelated error.
        """
        if self.method == 'largest':
            # Last column; this is the k-th neighbor distance only if each row
            # is sorted ascending -- presumably so for these queries, TODO confirm.
            return dist_arr[:, -1]
        if self.method == 'mean':
            return np.mean(dist_arr, axis=1)
        if self.method == 'median':
            return np.median(dist_arr, axis=1)
        raise ValueError(
            "method must be one of 'largest', 'mean' or 'median', got %r"
            % (self.method,))


In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from collections import defaultdict
import random

def calculate_outlier_scores(data, n_samples=5, sample_size=0.8, n_neighbors=5,
                             random_state=None):
    """Average kNN-distance outlier score of each row over random subsamples.

    The data is standardized once, then ``n_samples`` subsamples are drawn
    without replacement. Within each subsample a row's score is the mean
    distance to its ``n_neighbors`` nearest neighbours. A row's final score is
    the average over the subsamples it appeared in.

    Parameters
    ----------
    data : array-like of shape (n_rows, n_features)
        Numeric feature matrix.
    n_samples : int, default=5
        Number of random subsamples to draw.
    sample_size : float, default=0.8
        Fraction of rows in each subsample (``int(sample_size * n_rows)`` rows).
    n_neighbors : int, default=5
        Number of neighbours averaged for the score.
    random_state : int or None, default=None
        Seed for the subsampling. With ``None`` the module-level ``random``
        generator is used, exactly as before, so results vary between runs
        unless the caller seeds ``random`` globally.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Index'`` (row position in ``data``) and
        ``'Avg_Outlier_Score'``. Rows never drawn into any subsample are
        absent. Row order is the order in which indices were first drawn.
    """
    data = np.asarray(data)
    n_rows = data.shape[0]
    n_draw = int(sample_size * n_rows)

    # A private generator makes seeded runs reproducible without touching the
    # global `random` state; unseeded calls keep using the global generator.
    rng = random if random_state is None else random.Random(random_state)

    # Standardize the data
    data_std = StandardScaler().fit_transform(data)

    # Cumulative outlier score and number of appearances for each instance
    outlier_scores = defaultdict(lambda: {'score': 0, 'count': 0})

    for _round in range(n_samples):
        # Create a random sample of the data (without replacement)
        sample_indices = rng.sample(range(n_rows), n_draw)
        sample_data = data_std[sample_indices]

        # Ask for one extra neighbour: when the fitted points are queried
        # against themselves, the first column is dropped below as the
        # self-match.
        knn = NearestNeighbors(n_neighbors=n_neighbors + 1)
        knn.fit(sample_data)

        distances, _neighbor_idx = knn.kneighbors(sample_data)
        knn_scores = np.mean(distances[:, 1:], axis=1)  # Average distance to neighbors

        # Accumulate the scores under the original row positions
        for idx, score in zip(sample_indices, knn_scores):
            entry = outlier_scores[idx]
            entry['score'] += score
            entry['count'] += 1

    # Calculate the average outlier score for each instance
    avg_outlier_scores = {idx: entry['score'] / entry['count']
                          for idx, entry in outlier_scores.items()}

    # Convert to a DataFrame
    return pd.DataFrame(list(avg_outlier_scores.items()),
                        columns=['Index', 'Avg_Outlier_Score'])

# NOTE(review): absolute, machine-specific path -- this cell only runs where
# the CSV exists at exactly this location.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Annthyroid\\Annthyroid_norm_02_v01.csv"
# Load the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

# Feature matrix: every column except the last
data = df.iloc[:, :-1].values

# Labels: the last column (read here but not used further in this cell)
labels = df.iloc[:, -1].values

# Average kNN-distance outlier score per row: 15 random subsamples of 80% of
# the rows, 20 neighbours. Only the kNN score is computed here; no LOF/ABOD
# combination takes place. No seed is set in the visible cells, so the scores
# and the row order of the result change from run to run.
outlier_scores_df = calculate_outlier_scores(data, n_samples=15, sample_size=0.8, n_neighbors=20)
print(outlier_scores_df)


      Index  Avg_Outlier_Score
0      4522           0.808209
1      1138           1.625060
2      2605           1.073420
3      5978           1.648697
4      3871           1.608492
...     ...                ...
6796   3933           0.855732
6797   6783           1.788556
6798   5766           0.579464
6799   1154           1.588354
6800   2371           1.427185

[6801 rows x 2 columns]


In [2]:
import numpy as np
import pandas as pd
from pyod.models.lof import LOF
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from collections import defaultdict
import random

def calculate_LOF_outlier_scores(data, n_samples=5, sample_size=0.8, n_neighbors=20,
                                 random_state=None):
    """Average LOF outlier score of each row over random subsamples.

    The data is standardized once, then ``n_samples`` subsamples are drawn
    without replacement. A pyod ``LOF`` detector is fitted on each subsample
    and its ``decision_scores_`` are taken as the scores of the sampled rows.
    A row's final score is the average over the subsamples it appeared in.

    Parameters
    ----------
    data : array-like of shape (n_rows, n_features)
        Numeric feature matrix.
    n_samples : int, default=5
        Number of random subsamples to draw.
    sample_size : float, default=0.8
        Fraction of rows in each subsample (``int(sample_size * n_rows)`` rows).
    n_neighbors : int, default=20
        Passed to ``LOF(n_neighbors=...)``.
    random_state : int or None, default=None
        Seed for the subsampling. With ``None`` the module-level ``random``
        generator is used, exactly as before, so results vary between runs
        unless the caller seeds ``random`` globally.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Index'`` (row position in ``data``) and
        ``'Avg_Outlier_Score'``. Rows never drawn into any subsample are
        absent. Row order is the order in which indices were first drawn.
    """
    data = np.asarray(data)
    n_rows = data.shape[0]
    n_draw = int(sample_size * n_rows)

    # A private generator makes seeded runs reproducible without touching the
    # global `random` state; unseeded calls keep using the global generator.
    rng = random if random_state is None else random.Random(random_state)

    # Standardize the data
    data_std = StandardScaler().fit_transform(data)

    # Cumulative outlier score and number of appearances for each instance
    outlier_scores = defaultdict(lambda: {'score': 0, 'count': 0})

    for _round in range(n_samples):
        # Create a random sample of the data (without replacement)
        sample_indices = rng.sample(range(n_rows), n_draw)
        sample_data = data_std[sample_indices]

        # Apply LOF from pyod
        lof = LOF(n_neighbors=n_neighbors)
        lof.fit(sample_data)

        # Scores of the sampled rows, in the order they were fitted
        sample_scores = lof.decision_scores_  # higher score -> more outlier

        # Accumulate the scores under the original row positions
        for idx, score in zip(sample_indices, sample_scores):
            entry = outlier_scores[idx]
            entry['score'] += score
            entry['count'] += 1

    # Calculate the average outlier score for each instance
    avg_outlier_scores = {idx: entry['score'] / entry['count']
                          for idx, entry in outlier_scores.items()}

    # Convert to a DataFrame
    return pd.DataFrame(list(avg_outlier_scores.items()),
                        columns=['Index', 'Avg_Outlier_Score'])

# NOTE(review): absolute, machine-specific path -- this cell only runs where
# the CSV exists at exactly this location.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Annthyroid\\Annthyroid_norm_02_v01.csv"
# Load the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

# Feature matrix: every column except the last
data = df.iloc[:, :-1].values

# Labels: the last column (read here but not used further in this cell)
labels = df.iloc[:, -1].values

# Average LOF outlier score per row: 15 random subsamples of 80% of the rows,
# 20 neighbours. Only the LOF score is computed here; no ABOD/kNN combination
# takes place. No seed is set in the visible cells, so the scores and the row
# order of the result change from run to run.
outlier_scores_df = calculate_LOF_outlier_scores(data, n_samples=15, sample_size=0.8, n_neighbors=20)
print(outlier_scores_df)


      Index  Avg_Outlier_Score
0      5192           1.005503
1      3702           1.093693
2      4677           1.207639
3      1503           1.082789
4      3684           1.078337
...     ...                ...
6796   1597           0.965093
6797   5122           1.019429
6798   2540           1.018458
6799   2896           1.088706
6800    780           1.740018

[6801 rows x 2 columns]


In [3]:
# NOTE(review): absolute, machine-specific path -- this cell only runs where
# the CSV exists at exactly this location.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Arrhythmia\\Arrhythmia_withoutdupl_norm_02_v01.csv"
# Load the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

# Feature matrix: every column except the last
data = df.iloc[:, :-1].values

# Labels: the last column (read here but not used further in this cell)
labels = df.iloc[:, -1].values

# Average LOF outlier score per row: 15 random subsamples of 80% of the rows,
# 20 neighbours. Only the LOF score is computed here; no ABOD/kNN combination
# takes place. No seed is set in the visible cells, so the scores and the row
# order of the result change from run to run.
outlier_scores_df = calculate_LOF_outlier_scores(data, n_samples=15, sample_size=0.8, n_neighbors=20)
print(outlier_scores_df)


     Index  Avg_Outlier_Score
0      210           1.736596
1      196           1.155961
2      105           1.270112
3      188           1.029650
4      193           1.089597
..     ...                ...
242    221           1.123230
243    240           1.097034
244    222           1.142997
245    165           0.995672
246    204           1.130212

[247 rows x 2 columns]


In [4]:
# NOTE(review): absolute, machine-specific path -- this cell only runs where
# the CSV exists at exactly this location.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Arrhythmia\\Arrhythmia_withoutdupl_norm_02_v01.csv"
# Load the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

# Feature matrix: every column except the last
data = df.iloc[:, :-1].values

# Labels: the last column (read here but not used further in this cell)
labels = df.iloc[:, -1].values

# Average kNN-distance outlier score per row: 15 random subsamples of 80% of
# the rows, 20 neighbours. Only the kNN score is computed here; no LOF/ABOD
# combination takes place. No seed is set in the visible cells, so the scores
# and the row order of the result change from run to run.
outlier_scores_df = calculate_outlier_scores(data, n_samples=15, sample_size=0.8, n_neighbors=20)
print(outlier_scores_df)


     Index  Avg_Outlier_Score
0      226          14.859557
1      158           9.786686
2      153          30.027396
3       50          11.663622
4      145          11.579079
..     ...                ...
242     16          10.793798
243    131          12.492385
244      1          17.041164
245    208          16.840518
246    210          22.146086

[247 rows x 2 columns]


In [8]:
# NOTE(review): absolute, machine-specific path -- this cell only runs where
# the CSV exists at exactly this location.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\HeartDisease\\HeartDisease_withoutdupl_norm_02_v01.csv"
# Load the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

# Feature matrix: every column except the last
data = df.iloc[:, :-1].values

# Labels: the last column (read here but not used further in this cell)
labels = df.iloc[:, -1].values

# Average LOF outlier score per row: 15 random subsamples of 80% of the rows,
# 20 neighbours. Only the LOF score is computed here; no ABOD/kNN combination
# takes place. No seed is set in the visible cells, so the scores and the row
# order of the result change from run to run.
outlier_scores_df = calculate_LOF_outlier_scores(data, n_samples=15, sample_size=0.8, n_neighbors=20)
print(outlier_scores_df)


     Index  Avg_Outlier_Score
0       87           0.975946
1       55           1.095323
2      134           0.980259
3       99           0.981735
4      128           1.105530
..     ...                ...
147    144           1.208109
148     94           1.157261
149     34           1.281596
150     60           0.977629
151     76           1.048944

[152 rows x 2 columns]


In [7]:
# NOTE(review): absolute, machine-specific path -- this cell only runs where
# the CSV exists at exactly this location.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\HeartDisease\\HeartDisease_withoutdupl_norm_02_v01.csv"
# Load the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

# Feature matrix: every column except the last
data = df.iloc[:, :-1].values

# Labels: the last column (read here but not used further in this cell)
labels = df.iloc[:, -1].values

# Average kNN-distance outlier score per row: 15 random subsamples of 80% of
# the rows, 20 neighbours. Only the kNN score is computed here; no LOF/ABOD
# combination takes place. No seed is set in the visible cells, so the scores
# and the row order of the result change from run to run.
outlier_scores_df = calculate_outlier_scores(data, n_samples=15, sample_size=0.8, n_neighbors=20)
print(outlier_scores_df)


     Index  Avg_Outlier_Score
0      110           3.631841
1      149           3.428687
2      123           4.399089
3      127           3.561160
4       35           4.681719
..     ...                ...
147     38           5.142043
148    140           2.900206
149    102           5.187372
150     22           2.607539
151     37           3.523377

[152 rows x 2 columns]
