In [21]:
import numpy as np
from sklearn.decomposition import PCA
from pyod.models.knn import KNN
import pandas as pd
from scipy.stats import mode
from sklearn.metrics import precision_score, recall_score, roc_auc_score

def calculate_knn_outlier_scores(data, n_samples=5, sample_fraction=0.8, n_components=2, n_neighbors=5, contamination=0.1):
    """
    Performs outlier detection using PCA and KNN with voting mechanism, returning final labels and scores.
    
    Parameters:
        data (numpy.ndarray): The input dataset.
        n_samples (int): Number of random samples to generate.
        sample_fraction (float): Fraction of the data to include in each sample.
        n_components (int): Number of PCA components.
        n_neighbors (int): Number of neighbors for KNN.
        contamination (float): Contamination level for KNN.
    
    Returns:
        numpy.ndarray: A binary array indicating whether each data point is an outlier (1) or not (0).
        numpy.ndarray: Average outlier scores for each data point.
    """
    n_data_points = data.shape[0]
    sample_size = int(n_data_points * sample_fraction)
    all_labels = np.zeros((n_samples, n_data_points), dtype=int)
    all_scores = np.zeros((n_samples, n_data_points), dtype=float)
    
    for i in range(n_samples):
        # Step 1: Random sampling
        sampled_indices = np.random.choice(n_data_points, sample_size, replace=False)
        sampled_data = data[sampled_indices]
        
        # Step 2: PCA transformation
        pca = PCA(n_components=n_components)
        reduced_data = pca.fit_transform(sampled_data)
        
        # Step 3: KNN outlier detection
        knn = KNN(n_neighbors=n_neighbors, contamination=contamination)
        knn.fit(reduced_data)
        
        # Assign outlier labels and scores to the full dataset
        full_reduced_data = pca.transform(data)
        labels = knn.predict(full_reduced_data)
        scores = knn.decision_function(full_reduced_data)  # Outlier scores
        
        all_labels[i] = labels
        all_scores[i] = scores
    
    # Step 4: Voting mechanism
    final_labels = mode(all_labels, axis=0)[0].flatten()
    average_scores = all_scores.mean(axis=0)
    
    return final_labels, average_scores

def evaluate_knn_outlier_detection(y_true, y_pred, y_scores):
    """
    Evaluates outlier detection performance using Precision, Recall, and AUC.
    
    Parameters:
        y_true (numpy.ndarray): Ground truth (0 for inliers, 1 for outliers).
        y_pred (numpy.ndarray): Predicted labels (0 for inliers, 1 for outliers).
        y_scores (numpy.ndarray): Outlier scores for each data point.
    
    Returns:
        Precision, Recall, and AUC scores.
    """
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_scores)
    return  precision, recall, auc



In [22]:
# KNN
# dataset: Annthyroid_norm_02_v01.csv

file_path = "G:\\Nazanin\\B project\\code\\dataset\\Annthyroid\\Annthyroid_norm_02_v01.csv"
# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)
    
    # Separate data (all columns except the last)
data = df.iloc[:, :-1].values
    
    # Separate labels (last column)
labels = df.iloc[:, -1].values


predicted_labels, outlier_scores = calculate_knn_outlier_scores(data, n_samples=5, sample_fraction=0.8, n_components=2, n_neighbors=5, contamination=0.1)

# Evaluate the outlier detection performance
precision, recall, auc = evaluate_knn_outlier_detection(labels, predicted_labels, outlier_scores)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.0
Recall: 0.0
AUC: 0.5157307709280261


  final_labels = mode(all_labels, axis=0)[0].flatten()


In [23]:
# KNN
# dataset: Arrhythmia_withoutdupl_norm_02_v01.csv

file_path = "G:\\Nazanin\\B project\\code\\dataset\\Arrhythmia\\Arrhythmia_withoutdupl_norm_02_v01.csv"
# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)
    
    # Separate data (all columns except the last)
data = df.iloc[:, :-1].values
    
    # Separate labels (last column)
labels = df.iloc[:, -1].values


predicted_labels, outlier_scores = calculate_knn_outlier_scores(data, n_samples=5, sample_fraction=0.8, n_components=2, n_neighbors=5, contamination=0.1)

# Evaluate the outlier detection performance
precision, recall, auc = evaluate_knn_outlier_detection(labels, predicted_labels, outlier_scores)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.0
Recall: 0.0
AUC: 0.7718579234972678


  final_labels = mode(all_labels, axis=0)[0].flatten()


In [24]:
# KNN
# dataset: Cardiotocography_norm_02_v01.csv

file_path = "G:\\Nazanin\\B project\\code\\dataset\\Cardiotocography\\Cardiotocography_norm_02_v01.csv"
# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values
    
    # Separate labels (last column)
labels = df.iloc[:, -1].values


predicted_labels, outlier_scores = calculate_knn_outlier_scores(data, n_samples=5, sample_fraction=0.8, n_components=2, n_neighbors=5, contamination=0.1)

# Evaluate the outlier detection performance
precision, recall, auc = evaluate_knn_outlier_detection(labels, predicted_labels, outlier_scores)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)



Precision: 0.2926829268292683
Recall: 0.75
AUC: 0.9608383685800604


  final_labels = mode(all_labels, axis=0)[0].flatten()


In [25]:
# KNN
# dataset: HeartDisease_withoutdupl_norm_02_v01.csv

file_path = "G:\\Nazanin\\B project\\code\\dataset\\HeartDisease\\HeartDisease_withoutdupl_norm_02_v01.csv"
# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values
    
    # Separate labels (last column)
labels = df.iloc[:, -1].values


predicted_labels, outlier_scores = calculate_knn_outlier_scores(data, n_samples=5, sample_fraction=0.8, n_components=2, n_neighbors=5, contamination=0.1)

# Evaluate the outlier detection performance
precision, recall, auc = evaluate_knn_outlier_detection(labels, predicted_labels, outlier_scores)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.0
Recall: 0.0
AUC: 0.44666666666666666


  final_labels = mode(all_labels, axis=0)[0].flatten()


In [26]:
# KNN
# dataset: Hepatitis_withoutdupl_norm_05_v01.csv

file_path = "G:\\Nazanin\\B project\\code\\dataset\\Hepatitis\\Hepatitis_withoutdupl_norm_05_v01.csv"
# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values
    
    # Separate labels (last column)
labels = df.iloc[:, -1].values


predicted_labels, outlier_scores = calculate_knn_outlier_scores(data, n_samples=5, sample_fraction=0.8, n_components=2, n_neighbors=5, contamination=0.1)

# Evaluate the outlier detection performance
precision, recall, auc = evaluate_knn_outlier_detection(labels, predicted_labels, outlier_scores)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.0
Recall: 0.0
AUC: 0.6919191919191919


  final_labels = mode(all_labels, axis=0)[0].flatten()


In [27]:
# KNN
# dataset: Lymphography_withoutdupl_norm_1ofn.csv

file_path = "G:\\Nazanin\\B project\\code\\dataset\\Lymphography\\Lymphography_withoutdupl_norm_1ofn.csv"
# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-2].values
    
    # Separate labels (last column)
labels = df.iloc[:, -2].values


predicted_labels, outlier_scores = calculate_knn_outlier_scores(data, n_samples=5, sample_fraction=0.8, n_components=2, n_neighbors=5, contamination=0.1)

# Evaluate the outlier detection performance
precision, recall, auc = evaluate_knn_outlier_detection(labels, predicted_labels, outlier_scores)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.0
Recall: 0.0
AUC: 0.6276595744680851


  final_labels = mode(all_labels, axis=0)[0].flatten()


In [28]:
# KNN
# dataset: Parkinson_withoutdupl_norm_05_v01.csv

file_path = "G:\\Nazanin\\B project\\code\\dataset\\Parkinson\\Parkinson_withoutdupl_norm_05_v01.csv"
# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values
    
    # Separate labels (last column)
labels = df.iloc[:, -1].values


predicted_labels, outlier_scores = calculate_knn_outlier_scores(data, n_samples=5, sample_fraction=0.8, n_components=2, n_neighbors=5, contamination=0.1)

# Evaluate the outlier detection performance
precision, recall, auc = evaluate_knn_outlier_detection(labels, predicted_labels, outlier_scores)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.5
Recall: 1.0
AUC: 1.0


  final_labels = mode(all_labels, axis=0)[0].flatten()


In [29]:
# KNN
# dataset: Pima_withoutdupl_norm_02_v01.csv

file_path = "G:\\Nazanin\\B project\\code\\dataset\\Pima\\Pima_withoutdupl_norm_02_v01.csv"
# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values
    
    # Separate labels (last column)
labels = df.iloc[:, -1].values


predicted_labels, outlier_scores = calculate_knn_outlier_scores(data, n_samples=5, sample_fraction=0.8, n_components=2, n_neighbors=5, contamination=0.1)

# Evaluate the outlier detection performance
precision, recall, auc = evaluate_knn_outlier_detection(labels, predicted_labels, outlier_scores)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.0
Recall: 0.0
AUC: 0.5662222222222222


  final_labels = mode(all_labels, axis=0)[0].flatten()
