In [1]:
import numpy as np
import pandas as pd
from pyod.models.knn import KNN
from sklearn.preprocessing import StandardScaler
from collections import defaultdict
import random

def calculate_outlier_scores_knn(data, n_samples: int = 5, sample_size: float = 0.8,
                                 n_neighbors: int = 5, *, method: str = 'mean',
                                 random_state=None) -> pd.DataFrame:
    """Average pyod KNN outlier scores over several random subsamples.

    The data is standardized once with ``StandardScaler``. Then, ``n_samples``
    times, a subset of rows is drawn without replacement, a pyod ``KNN``
    detector is fitted on that subset, and the resulting training scores
    (``decision_scores_``; higher score -> more outlier) are accumulated per
    row. Each row's final score is the mean over the rounds in which it was
    drawn.

    Args:
        data: 2-D array-like (NumPy array, Pandas DataFrame, nested list, ...)
            of numeric features, one row per instance.
        n_samples: Number of subsampling rounds.
        sample_size: Fraction of rows drawn in each round; the number of rows
            drawn is ``int(sample_size * n_rows)``.
        n_neighbors: ``n_neighbors`` passed to the pyod ``KNN`` detector.
        method: ``method`` passed to the pyod ``KNN`` detector. Defaults to
            ``'mean'``, the value that was previously hard-coded.
        random_state: Optional seed. When given, sampling uses a private
            ``random.Random(random_state)`` so the result is reproducible and
            the global ``random`` state is left untouched. When ``None``, the
            module-level ``random`` generator is used, as before.

    Returns:
        DataFrame with columns ``'Index'`` (positional row index into
        ``data``) and ``'Avg_Outlier_Score'``. Rows appear in the order in
        which each index was first drawn. An index that was never drawn in
        any round is absent from the result.

    Raises:
        ValueError: Propagated from ``random.sample`` when the requested
            sample is larger than the number of rows or negative.
    """
    data_std = StandardScaler().fit_transform(data)

    # Fall back to the global generator when no seed is given so that callers
    # who seed via ``random.seed(...)`` keep getting the same draws.
    rng = random if random_state is None else random.Random(random_state)

    # Both values are loop-invariant, so compute them once. Reading the row
    # count from the scaled array also lets plain nested lists through.
    n_rows = data_std.shape[0]
    draw_count = int(sample_size * n_rows)

    score_totals = defaultdict(float)  # positional index -> summed score
    times_drawn = defaultdict(int)     # positional index -> rounds it was in

    for _ in range(n_samples):
        sample_indices = rng.sample(range(n_rows), draw_count)

        knn = KNN(n_neighbors=n_neighbors, method=method)
        knn.fit(data_std[sample_indices])

        # decision_scores_ is aligned with the rows the detector was fitted
        # on, i.e. with sample_indices.
        for idx, score in zip(sample_indices, knn.decision_scores_):
            score_totals[idx] += score
            times_drawn[idx] += 1

    averaged = [(idx, total / times_drawn[idx]) for idx, total in score_totals.items()]
    return pd.DataFrame(averaged, columns=['Index', 'Avg_Outlier_Score'])

# Demo run.
# `data` stands in for a real dataset (Pandas DataFrame or NumPy array); here
# it is a synthetic matrix of 100 instances with 5 features each.
data = np.random.random_sample((100, 5))
outlier_scores_df = calculate_outlier_scores_knn(
    data,
    n_samples=5,
    sample_size=0.8,
    n_neighbors=5,
)
print(outlier_scores_df)


    Index  Avg_Outlier_Score
0      80           1.430492
1      56           1.727346
2       5           1.453611
3       6           1.022992
4      15           1.495680
..    ...                ...
95      9           1.103098
96     16           1.654183
97     74           1.305197
98     65           1.773579
99     68           1.929459

[100 rows x 2 columns]
