Recall: real outliers we said are outliers / actual number of outliers
Precision: real outliers we said are outliers / all points we said are outliers (right or wrong)

KNN

In [89]:
import numpy as np
import pandas as pd
from pyod.models.knn import KNN
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from collections import defaultdict
from ucimlrepo import fetch_ucirepo 
import random

def calculate_knn_outlier_scores(data, n_samples=5, sample_size=0.8, n_neighbors=5, random_state=None):
    """
    Average pyod KNN outlier scores over several random subsamples of the data.

    The data is standardized once; then, ``n_samples`` times, a subset of rows
    is drawn without replacement, a ``KNN(method='mean')`` detector is fitted
    on it, and the training score of every drawn row is accumulated. Rows that
    are never drawn do not appear in the result.

    Parameters:
        data (numpy array): 2-D dataset, one row per instance.
        n_samples (int): Number of random subsamples to draw.
        sample_size (float): Fraction of the rows in each subsample
            (``int(sample_size * n_rows)`` rows are drawn).
        n_neighbors (int): Number of neighbours passed to KNN.
        random_state (int or None): Seed for the subsampling. ``None`` (the
            default) keeps drawing from the global ``random`` module state,
            exactly as before; an integer makes the run reproducible.

    Returns:
        pd.DataFrame: Columns ``Index`` (row position in ``data``) and
        ``Avg_Outlier_Score``, in the order the rows were first drawn.
    """
    # Standardize the data
    data_std = StandardScaler().fit_transform(data)

    n_rows = data.shape[0]
    rows_per_sample = int(sample_size * n_rows)

    # A private generator when seeded, so the global random state is untouched
    rng = random if random_state is None else random.Random(random_state)

    # Cumulative outlier score and number of draws for each instance
    outlier_scores = defaultdict(lambda: {'score': 0, 'count': 0})

    for _ in range(n_samples):
        # Draw a random subset of row positions (without replacement)
        sample_indices = rng.sample(range(n_rows), rows_per_sample)

        # Fit KNN from pyod on the subset only
        knn = KNN(n_neighbors=n_neighbors, method='mean')
        knn.fit(data_std[sample_indices])

        # decision_scores_ follows the order of the fitted rows; higher score -> more outlier
        for idx, score in zip(sample_indices, knn.decision_scores_):
            outlier_scores[idx]['score'] += score
            outlier_scores[idx]['count'] += 1

    # Average over the subsamples in which each instance appeared
    avg_outlier_scores = {idx: acc['score'] / acc['count'] for idx, acc in outlier_scores.items()}

    return pd.DataFrame(list(avg_outlier_scores.items()), columns=['Index', 'Avg_Outlier_Score'])

def evaluate_knn_outlier_detection(outlier_scores_df, ground_truth, percentile=90):
    """
    Evaluate averaged outlier scores with precision, recall, and ROC AUC.

    Parameters:
        outlier_scores_df (pd.DataFrame): Columns ``Index`` (row position in the
            original dataset) and ``Avg_Outlier_Score``.
        ground_truth (pd.Series, numpy array or list): One label per row of the
            original dataset, in dataset order. Assumed binary with 1 = outlier
            (the default positive label of the sklearn metrics) -- confirm for
            new datasets.
        percentile (float): Scores at or above this percentile of the scored
            rows are predicted as outliers. Defaults to 90.

    Returns:
        tuple: (precision, recall, auc).
    """
    # Put the scores in dataset order
    ordered = outlier_scores_df.sort_values(by='Index').reset_index(drop=True)

    # 'Index' holds row positions, so align the labels positionally. This also
    # accepts plain arrays/lists and Series whose index is not 0..n-1.
    positions = ordered['Index'].to_numpy()
    ground_truth = np.asarray(ground_truth)[positions]

    # Extract average outlier scores
    outlier_scores = ordered['Avg_Outlier_Score'].values

    # Classify as 1 (outlier) if the score reaches the percentile threshold, else 0
    threshold = np.percentile(outlier_scores, percentile)
    predictions = (outlier_scores >= threshold).astype(int)

    precision = precision_score(ground_truth, predictions)
    recall = recall_score(ground_truth, predictions)
    auc = roc_auc_score(ground_truth, outlier_scores)

    return precision, recall, auc


In [90]:
# KNN
# dataset: Annthyroid_norm_02_v01.csv

# Read the whole CSV into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Annthyroid\\Annthyroid_norm_02_v01.csv"
df = pd.read_csv(file_path)

# Every column before the final one is feature data; the final column holds the labels.
data, labels = df.iloc[:, :-1].values, df.iloc[:, -1].values

# Average KNN scores over 15 random subsamples (80% of the rows each, 20 neighbours).
combined_scores_df = calculate_knn_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
    n_neighbors=20,
)

# Score the averaged result against the labels.
precision, recall, auc = evaluate_knn_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)

# One "<name> <value>" line per metric, formatted the way print() would.
print("\n".join(
    f"{tag} {val!s}"
    for tag, val in (("Precision:", precision), ("Recall:", recall), ("AUC:", auc))
))


Precision: 0.05433186490455213
Recall: 0.27205882352941174
AUC: 0.6857541591280174


In [91]:
# KNN
# dataset: Arrhythmia_withoutdupl_norm_02_v01.csv

# Read the whole CSV into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Arrhythmia\\Arrhythmia_withoutdupl_norm_02_v01.csv"
df = pd.read_csv(file_path)

# Every column before the final one is feature data; the final column holds the labels.
data, labels = df.iloc[:, :-1].values, df.iloc[:, -1].values

# Average KNN scores over 15 random subsamples (80% of the rows each, 20 neighbours).
combined_scores_df = calculate_knn_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
    n_neighbors=20,
)

# Score the averaged result against the labels.
precision, recall, auc = evaluate_knn_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)

# One "<name> <value>" line per metric, formatted the way print() would.
print("\n".join(
    f"{tag} {val!s}"
    for tag, val in (("Precision:", precision), ("Recall:", recall), ("AUC:", auc))
))


Precision: 0.04
Recall: 0.3333333333333333
AUC: 0.8524590163934426


In [92]:
# KNN
# dataset: Cardiotocography_norm_02_v01.csv

# Read the whole CSV into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Cardiotocography\\Cardiotocography_norm_02_v01.csv"
df = pd.read_csv(file_path)

# Every column before the final one is feature data; the final column holds the labels.
data, labels = df.iloc[:, :-1].values, df.iloc[:, -1].values

# Average KNN scores over 15 random subsamples (80% of the rows each, 20 neighbours).
combined_scores_df = calculate_knn_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
    n_neighbors=20,
)

# Score the averaged result against the labels.
precision, recall, auc = evaluate_knn_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)

# One "<name> <value>" line per metric, formatted the way print() would.
print("\n".join(
    f"{tag} {val!s}"
    for tag, val in (("Precision:", precision), ("Recall:", recall), ("AUC:", auc))
))


Precision: 0.0650887573964497
Recall: 0.34375
AUC: 0.7620468277945618


In [93]:
# KNN
# dataset: HeartDisease_withoutdupl_norm_02_v01.csv

# Read the whole CSV into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\HeartDisease\\HeartDisease_withoutdupl_norm_02_v01.csv"
df = pd.read_csv(file_path)

# Every column before the final one is feature data; the final column holds the labels.
data, labels = df.iloc[:, :-1].values, df.iloc[:, -1].values

# Average KNN scores over 15 random subsamples (80% of the rows each, 20 neighbours).
combined_scores_df = calculate_knn_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
    n_neighbors=20,
)

# Score the averaged result against the labels.
precision, recall, auc = evaluate_knn_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)

# One "<name> <value>" line per metric, formatted the way print() would.
print("\n".join(
    f"{tag} {val!s}"
    for tag, val in (("Precision:", precision), ("Recall:", recall), ("AUC:", auc))
))


Precision: 0.0
Recall: 0.0
AUC: 0.7833333333333333


In [94]:
# KNN
# dataset: Hepatitis_withoutdupl_norm_05_v01.csv

# Read the whole CSV into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Hepatitis\\Hepatitis_withoutdupl_norm_05_v01.csv"
df = pd.read_csv(file_path)

# Every column before the final one is feature data; the final column holds the labels.
data, labels = df.iloc[:, :-1].values, df.iloc[:, -1].values

# Average KNN scores over 15 random subsamples (80% of the rows each, 20 neighbours).
combined_scores_df = calculate_knn_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
    n_neighbors=20,
)

# Score the averaged result against the labels.
precision, recall, auc = evaluate_knn_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)

# One "<name> <value>" line per metric, formatted the way print() would.
print("\n".join(
    f"{tag} {val!s}"
    for tag, val in (("Precision:", precision), ("Recall:", recall), ("AUC:", auc))
))


Precision: 0.2857142857142857
Recall: 0.6666666666666666
AUC: 0.8434343434343434


In [95]:
# KNN
# dataset: Lymphography_withoutdupl_norm_1ofn.csv

# Read the whole CSV into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Lymphography\\Lymphography_withoutdupl_norm_1ofn.csv"
df = pd.read_csv(file_path)

# Unlike the other datasets, the last TWO columns are kept out of the features
# and the second-to-last column is used as the labels; the final column is not
# used here (presumably not a feature -- confirm against the CSV schema).
data, labels = df.iloc[:, :-2].values, df.iloc[:, -2].values

# Average KNN scores over 15 random subsamples (80% of the rows each, 20 neighbours).
combined_scores_df = calculate_knn_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
    n_neighbors=20,
)

# Score the averaged result against the labels.
precision, recall, auc = evaluate_knn_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)

# One "<name> <value>" line per metric, formatted the way print() would.
print("\n".join(
    f"{tag} {val!s}"
    for tag, val in (("Precision:", precision), ("Recall:", recall), ("AUC:", auc))
))


Precision: 0.4
Recall: 1.0
AUC: 0.9952718676122931


In [96]:
# KNN
# dataset: Parkinson_withoutdupl_norm_05_v01.csv

# Read the whole CSV into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Parkinson\\Parkinson_withoutdupl_norm_05_v01.csv"
df = pd.read_csv(file_path)

# Every column before the final one is feature data; the final column holds the labels.
data, labels = df.iloc[:, :-1].values, df.iloc[:, -1].values

# Average KNN scores over 15 random subsamples (80% of the rows each, 20 neighbours).
combined_scores_df = calculate_knn_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
    n_neighbors=20,
)

# Score the averaged result against the labels.
precision, recall, auc = evaluate_knn_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)

# One "<name> <value>" line per metric, formatted the way print() would.
print("\n".join(
    f"{tag} {val!s}"
    for tag, val in (("Precision:", precision), ("Recall:", recall), ("AUC:", auc))
))


Precision: 0.2
Recall: 1.0
AUC: 0.9583333333333334


In [97]:
# KNN
# dataset: Pima_withoutdupl_norm_02_v01.csv

# Read the whole CSV into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Pima\\Pima_withoutdupl_norm_02_v01.csv"
df = pd.read_csv(file_path)

# Every column before the final one is feature data; the final column holds the labels.
data, labels = df.iloc[:, :-1].values, df.iloc[:, -1].values

# Average KNN scores over 15 random subsamples (80% of the rows each, 20 neighbours).
combined_scores_df = calculate_knn_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
    n_neighbors=20,
)

# Score the averaged result against the labels.
precision, recall, auc = evaluate_knn_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)

# One "<name> <value>" line per metric, formatted the way print() would.
print("\n".join(
    f"{tag} {val!s}"
    for tag, val in (("Precision:", precision), ("Recall:", recall), ("AUC:", auc))
))


Precision: 0.0392156862745098
Recall: 0.2222222222222222
AUC: 0.666


LOF

In [98]:
import numpy as np
import pandas as pd
from pyod.models.lof import LOF
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from collections import defaultdict
import random

def calculate_LOF_outlier_scores(data, n_samples=5, sample_size=0.8, n_neighbors=20, random_state=None):
    """
    Average pyod LOF outlier scores over several random subsamples of the data.

    The data is standardized once; then, ``n_samples`` times, a subset of rows
    is drawn without replacement, a ``LOF`` detector is fitted on it, and the
    training score of every drawn row is accumulated. Rows that are never
    drawn do not appear in the result.

    Parameters:
        data (numpy array): 2-D dataset, one row per instance.
        n_samples (int): Number of random subsamples to draw.
        sample_size (float): Fraction of the rows in each subsample
            (``int(sample_size * n_rows)`` rows are drawn).
        n_neighbors (int): Number of neighbours passed to LOF.
        random_state (int or None): Seed for the subsampling. ``None`` (the
            default) keeps drawing from the global ``random`` module state,
            exactly as before; an integer makes the run reproducible.

    Returns:
        pd.DataFrame: Columns ``Index`` (row position in ``data``) and
        ``Avg_Outlier_Score``, in the order the rows were first drawn.
    """
    # Standardize the data
    data_std = StandardScaler().fit_transform(data)

    n_rows = data.shape[0]
    rows_per_sample = int(sample_size * n_rows)

    # A private generator when seeded, so the global random state is untouched
    rng = random if random_state is None else random.Random(random_state)

    # Cumulative outlier score and number of draws for each instance
    outlier_scores = defaultdict(lambda: {'score': 0, 'count': 0})

    for _ in range(n_samples):
        # Draw a random subset of row positions (without replacement)
        sample_indices = rng.sample(range(n_rows), rows_per_sample)

        # Fit LOF from pyod on the subset only
        lof = LOF(n_neighbors=n_neighbors)
        lof.fit(data_std[sample_indices])

        # decision_scores_ follows the order of the fitted rows; higher score -> more outlier
        for idx, score in zip(sample_indices, lof.decision_scores_):
            outlier_scores[idx]['score'] += score
            outlier_scores[idx]['count'] += 1

    # Average over the subsamples in which each instance appeared
    avg_outlier_scores = {idx: acc['score'] / acc['count'] for idx, acc in outlier_scores.items()}

    return pd.DataFrame(list(avg_outlier_scores.items()), columns=['Index', 'Avg_Outlier_Score'])

def evaluate_LOF_outlier_detection(outlier_scores_df, ground_truth, percentile=90):
    """
    Evaluate averaged outlier scores with precision, recall, and ROC AUC.

    Parameters:
        outlier_scores_df (pd.DataFrame): Columns ``Index`` (row position in the
            original dataset) and ``Avg_Outlier_Score``.
        ground_truth (pd.Series, numpy array or list): One label per row of the
            original dataset, in dataset order. Assumed binary with 1 = outlier
            (the default positive label of the sklearn metrics) -- confirm for
            new datasets.
        percentile (float): Scores at or above this percentile of the scored
            rows are predicted as outliers. Defaults to 90.

    Returns:
        tuple: (precision, recall, auc).
    """
    # Put the scores in dataset order
    ordered = outlier_scores_df.sort_values(by='Index').reset_index(drop=True)

    # 'Index' holds row positions, so align the labels positionally. This also
    # accepts plain arrays/lists and Series whose index is not 0..n-1.
    positions = ordered['Index'].to_numpy()
    ground_truth = np.asarray(ground_truth)[positions]

    # Extract average outlier scores
    outlier_scores = ordered['Avg_Outlier_Score'].values

    # Classify as 1 (outlier) if the score reaches the percentile threshold, else 0
    threshold = np.percentile(outlier_scores, percentile)
    predictions = (outlier_scores >= threshold).astype(int)

    precision = precision_score(ground_truth, predictions)
    recall = recall_score(ground_truth, predictions)
    auc = roc_auc_score(ground_truth, outlier_scores)

    return precision, recall, auc


In [99]:
# LOF
# dataset: Annthyroid_norm_02_v01.csv

# Read the whole CSV into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Annthyroid\\Annthyroid_norm_02_v01.csv"
df = pd.read_csv(file_path)

# Every column before the final one is feature data; the final column holds the labels.
data, labels = df.iloc[:, :-1].values, df.iloc[:, -1].values

# Average LOF scores over 15 random subsamples (80% of the rows each, 20 neighbours).
combined_scores_df = calculate_LOF_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
    n_neighbors=20,
)

# Score the averaged result against the labels.
precision, recall, auc = evaluate_LOF_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)

# One "<name> <value>" line per metric, formatted the way print() would.
print("\n".join(
    f"{tag} {val!s}"
    for tag, val in (("Precision:", precision), ("Recall:", recall), ("AUC:", auc))
))


Precision: 0.07929515418502203
Recall: 0.39705882352941174
AUC: 0.7569116543841843


In [100]:
# LOF
# dataset: Arrhythmia_withoutdupl_norm_02_v01.csv

# Read the whole CSV into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Arrhythmia\\Arrhythmia_withoutdupl_norm_02_v01.csv"
df = pd.read_csv(file_path)

# Every column before the final one is feature data; the final column holds the labels.
data, labels = df.iloc[:, :-1].values, df.iloc[:, -1].values

# Average LOF scores over 15 random subsamples (80% of the rows each, 20 neighbours).
combined_scores_df = calculate_LOF_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
    n_neighbors=20,
)

# Score the averaged result against the labels.
precision, recall, auc = evaluate_LOF_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)

# One "<name> <value>" line per metric, formatted the way print() would.
print("\n".join(
    f"{tag} {val!s}"
    for tag, val in (("Precision:", precision), ("Recall:", recall), ("AUC:", auc))
))


Precision: 0.04
Recall: 0.3333333333333333
AUC: 0.8538251366120219


In [101]:
# LOF
# dataset: Cardiotocography_norm_02_v01.csv

# Read the whole CSV into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Cardiotocography\\Cardiotocography_norm_02_v01.csv"
df = pd.read_csv(file_path)

# Every column before the final one is feature data; the final column holds the labels.
data, labels = df.iloc[:, :-1].values, df.iloc[:, -1].values

# Average LOF scores over 15 random subsamples (80% of the rows each, 20 neighbours).
combined_scores_df = calculate_LOF_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
    n_neighbors=20,
)

# Score the averaged result against the labels.
precision, recall, auc = evaluate_LOF_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)

# One "<name> <value>" line per metric, formatted the way print() would.
print("\n".join(
    f"{tag} {val!s}"
    for tag, val in (("Precision:", precision), ("Recall:", recall), ("AUC:", auc))
))


Precision: 0.0650887573964497
Recall: 0.34375
AUC: 0.7878021148036254


In [102]:
# LOF
# dataset: HeartDisease_withoutdupl_norm_02_v01.csv

# Read the whole CSV into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\HeartDisease\\HeartDisease_withoutdupl_norm_02_v01.csv"
df = pd.read_csv(file_path)

# Every column before the final one is feature data; the final column holds the labels.
data, labels = df.iloc[:, :-1].values, df.iloc[:, -1].values

# Average LOF scores over 15 random subsamples (80% of the rows each, 20 neighbours).
combined_scores_df = calculate_LOF_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
    n_neighbors=20,
)

# Score the averaged result against the labels.
precision, recall, auc = evaluate_LOF_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)

# One "<name> <value>" line per metric, formatted the way print() would.
print("\n".join(
    f"{tag} {val!s}"
    for tag, val in (("Precision:", precision), ("Recall:", recall), ("AUC:", auc))
))


Precision: 0.0
Recall: 0.0
AUC: 0.6966666666666667


In [103]:
# LOF
# dataset: Hepatitis_withoutdupl_norm_05_v01.csv

# Read the whole CSV into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Hepatitis\\Hepatitis_withoutdupl_norm_05_v01.csv"
df = pd.read_csv(file_path)

# Every column before the final one is feature data; the final column holds the labels.
data, labels = df.iloc[:, :-1].values, df.iloc[:, -1].values

# Average LOF scores over 15 random subsamples (80% of the rows each, 20 neighbours).
combined_scores_df = calculate_LOF_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
    n_neighbors=20,
)

# Score the averaged result against the labels.
precision, recall, auc = evaluate_LOF_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)

# One "<name> <value>" line per metric, formatted the way print() would.
print("\n".join(
    f"{tag} {val!s}"
    for tag, val in (("Precision:", precision), ("Recall:", recall), ("AUC:", auc))
))


Precision: 0.2857142857142857
Recall: 0.6666666666666666
AUC: 0.8181818181818181


In [104]:
# LOF
# dataset: Lymphography_withoutdupl_norm_1ofn.csv

# Read the whole CSV into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Lymphography\\Lymphography_withoutdupl_norm_1ofn.csv"
df = pd.read_csv(file_path)

# Unlike the other datasets, the last TWO columns are kept out of the features
# and the second-to-last column is used as the labels; the final column is not
# used here (presumably not a feature -- confirm against the CSV schema).
data, labels = df.iloc[:, :-2].values, df.iloc[:, -2].values

# Average LOF scores over 15 random subsamples (80% of the rows each, 20 neighbours).
combined_scores_df = calculate_LOF_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
    n_neighbors=20,
)

# Score the averaged result against the labels.
precision, recall, auc = evaluate_LOF_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)

# One "<name> <value>" line per metric, formatted the way print() would.
print("\n".join(
    f"{tag} {val!s}"
    for tag, val in (("Precision:", precision), ("Recall:", recall), ("AUC:", auc))
))


Precision: 0.4
Recall: 1.0
AUC: 0.9917257683215129


In [105]:
# LOF
# dataset: Parkinson_withoutdupl_norm_05_v01.csv

# Read the whole CSV into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Parkinson\\Parkinson_withoutdupl_norm_05_v01.csv"
df = pd.read_csv(file_path)

# Every column before the final one is feature data; the final column holds the labels.
data, labels = df.iloc[:, :-1].values, df.iloc[:, -1].values

# Average LOF scores over 15 random subsamples (80% of the rows each, 20 neighbours).
combined_scores_df = calculate_LOF_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
    n_neighbors=20,
)

# Score the averaged result against the labels.
precision, recall, auc = evaluate_LOF_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)

# One "<name> <value>" line per metric, formatted the way print() would.
print("\n".join(
    f"{tag} {val!s}"
    for tag, val in (("Precision:", precision), ("Recall:", recall), ("AUC:", auc))
))


Precision: 0.2
Recall: 1.0
AUC: 0.9583333333333334


In [106]:
# LOF
# dataset: Pima_withoutdupl_norm_02_v01.csv

# Read the whole CSV into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Pima\\Pima_withoutdupl_norm_02_v01.csv"
df = pd.read_csv(file_path)

# Every column before the final one is feature data; the final column holds the labels.
data, labels = df.iloc[:, :-1].values, df.iloc[:, -1].values

# Average LOF scores over 15 random subsamples (80% of the rows each, 20 neighbours).
combined_scores_df = calculate_LOF_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
    n_neighbors=20,
)

# Score the averaged result against the labels.
precision, recall, auc = evaluate_LOF_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)

# One "<name> <value>" line per metric, formatted the way print() would.
print("\n".join(
    f"{tag} {val!s}"
    for tag, val in (("Precision:", precision), ("Recall:", recall), ("AUC:", auc))
))


Precision: 0.0
Recall: 0.0
AUC: 0.6326666666666667


ABOD

In [2]:
import numpy as np
import pandas as pd
from pyod.models.abod import ABOD
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from collections import defaultdict
import random

def calculate_ABOD_outlier_scores(data, n_samples=5, sample_size=0.8, random_state=None):
    """
    Calculates outlier scores using ABOD on random samples and averages scores
    for data points appearing in multiple samples.

    The data is standardized once; each random sample is drawn without
    replacement and a fresh default ``ABOD()`` detector is fitted on it. Data
    points that are never drawn do not appear in the result.

    Parameters:
        data (numpy array): Input dataset of shape (n_rows, n_features).
        n_samples (int): Number of random samples to create.
        sample_size (float): Proportion of the dataset to include in each sample
            (``int(sample_size * n_rows)`` rows are drawn).
        random_state (int or None): Seed for the sampling. ``None`` (the
            default) keeps drawing from the global ``random`` module state,
            exactly as before; an integer makes the run reproducible.

    Returns:
        pd.DataFrame: A DataFrame containing the indices (``Index``, row
        position in ``data``) and averaged ABOD scores (``Avg_ABOD_Score``),
        in the order the rows were first drawn.
    """
    # Standardize the data
    data_std = StandardScaler().fit_transform(data)

    n_rows = data.shape[0]
    rows_per_sample = int(sample_size * n_rows)

    # A private generator when seeded, so the global random state is untouched
    rng = random if random_state is None else random.Random(random_state)

    # Cumulative outlier score and number of draws for each instance
    outlier_scores = defaultdict(lambda: {'score': 0, 'count': 0})

    for _ in range(n_samples):
        # Draw a random subset of row positions (without replacement)
        sample_indices = rng.sample(range(n_rows), rows_per_sample)

        # Fit ABOD from pyod on the subset only
        abod = ABOD()
        abod.fit(data_std[sample_indices])

        # decision_scores_ follows the order of the fitted rows; higher score -> more outlier
        for idx, score in zip(sample_indices, abod.decision_scores_):
            outlier_scores[idx]['score'] += score
            outlier_scores[idx]['count'] += 1

    # Average over the samples in which each instance appeared
    avg_outlier_scores = {idx: acc['score'] / acc['count'] for idx, acc in outlier_scores.items()}

    return pd.DataFrame(list(avg_outlier_scores.items()), columns=['Index', 'Avg_ABOD_Score'])

def evaluate_ABOD_outlier_detection(outlier_scores_df, ground_truth, percentile=90):
    """
    Evaluates the outlier detection performance using precision, recall, and AUC.

    Parameters:
        outlier_scores_df (pd.DataFrame): DataFrame with columns ``Index`` (row
            position in the original dataset) and ``Avg_ABOD_Score``.
        ground_truth (pd.Series, numpy array or list): Ground truth labels
            (1 = outlier, 0 = normal), one per row of the original dataset, in
            dataset order.
        percentile (float): Scores at or above this percentile of the scored
            rows are predicted as outliers. Defaults to 90.

    Returns:
        tuple: Precision, Recall, and AUC scores.
    """
    # Put the scores in dataset order
    ordered = outlier_scores_df.sort_values(by='Index').reset_index(drop=True)

    # 'Index' holds row positions, so align the labels positionally. This makes
    # the documented numpy-array input work (the previous Series-only lookup
    # followed by `.values` raised on arrays) and is also correct for Series
    # whose index is not 0..n-1.
    positions = ordered['Index'].to_numpy()
    ground_truth = np.asarray(ground_truth)[positions]

    # Extract average outlier scores
    outlier_scores = ordered['Avg_ABOD_Score'].values

    # Classify as 1 (outlier) if the score reaches the percentile threshold, else 0
    threshold = np.percentile(outlier_scores, percentile)
    predictions = (outlier_scores >= threshold).astype(int)

    precision = precision_score(ground_truth, predictions)
    recall = recall_score(ground_truth, predictions)
    auc = roc_auc_score(ground_truth, outlier_scores)

    return precision, recall, auc


In [108]:
# ABOD
# dataset: Annthyroid_norm_02_v01.csv

# Read the whole CSV into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Annthyroid\\Annthyroid_norm_02_v01.csv"
df = pd.read_csv(file_path)

# Every column before the final one is feature data; the final column holds the labels.
data, labels = df.iloc[:, :-1].values, df.iloc[:, -1].values

# Average ABOD scores over 15 random subsamples (80% of the rows each).
combined_scores_df = calculate_ABOD_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
)

# Score the averaged result against the labels.
precision, recall, auc = evaluate_ABOD_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)

# One "<name> <value>" line per metric, formatted the way print() would.
print("\n".join(
    f"{tag} {val!s}"
    for tag, val in (("Precision:", precision), ("Recall:", recall), ("AUC:", auc))
))


Precision: 0.06607929515418502
Recall: 0.33088235294117646
AUC: 0.7693107100304488


In [6]:
# ABOD
# dataset: Arrhythmia_withoutdupl_norm_02_v01.csv

# Read the whole CSV into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Arrhythmia\\Arrhythmia_withoutdupl_norm_02_v01.csv"
df = pd.read_csv(file_path)

# Every column before the final one is feature data; the final column holds the labels.
data, labels = df.iloc[:, :-1].values, df.iloc[:, -1].values

# Average ABOD scores over 15 random subsamples (80% of the rows each).
combined_scores_df = calculate_ABOD_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
)

# Score the averaged result against the labels.
precision, recall, auc = evaluate_ABOD_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)

# One "<name> <value>" line per metric, formatted the way print() would.
print("\n".join(
    f"{tag} {val!s}"
    for tag, val in (("Precision:", precision), ("Recall:", recall), ("AUC:", auc))
))


Precision: 0.04
Recall: 0.3333333333333333
AUC: 0.8060109289617485


In [5]:
# ABOD
# dataset: Cardiotocography_norm_02_v01.csv

# Read the whole CSV into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Cardiotocography\\Cardiotocography_norm_02_v01.csv"
df = pd.read_csv(file_path)

# Every column before the final one is feature data; the final column holds the labels.
data, labels = df.iloc[:, :-1].values, df.iloc[:, -1].values

# Average ABOD scores over 15 random subsamples (80% of the rows each).
combined_scores_df = calculate_ABOD_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
)

# Score the averaged result against the labels.
precision, recall, auc = evaluate_ABOD_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)

# One "<name> <value>" line per metric, formatted the way print() would.
print("\n".join(
    f"{tag} {val!s}"
    for tag, val in (("Precision:", precision), ("Recall:", recall), ("AUC:", auc))
))


Precision: 0.05917159763313609
Recall: 0.3125
AUC: 0.715879909365559


In [4]:
# ABOD
# dataset: HeartDisease_withoutdupl_norm_02_v01.csv

# Read the whole CSV into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\HeartDisease\\HeartDisease_withoutdupl_norm_02_v01.csv"
df = pd.read_csv(file_path)

# Every column before the final one is feature data; the final column holds the labels.
data, labels = df.iloc[:, :-1].values, df.iloc[:, -1].values

# Average ABOD scores over 15 random subsamples (80% of the rows each).
combined_scores_df = calculate_ABOD_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
)

# Score the averaged result against the labels.
precision, recall, auc = evaluate_ABOD_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)

# One "<name> <value>" line per metric, formatted the way print() would.
print("\n".join(
    f"{tag} {val!s}"
    for tag, val in (("Precision:", precision), ("Recall:", recall), ("AUC:", auc))
))


Precision: 0.0
Recall: 0.0
AUC: 0.7433333333333334


In [19]:
# ABOD
# dataset: Hepatitis_withoutdupl_norm_05_v01.csv

# Read the whole CSV into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Hepatitis\\Hepatitis_withoutdupl_norm_05_v01.csv"
df = pd.read_csv(file_path)

# Every column before the final one is feature data; the final column holds the labels.
data, labels = df.iloc[:, :-1].values, df.iloc[:, -1].values

# Average ABOD scores over 15 random subsamples (80% of the rows each).
combined_scores_df = calculate_ABOD_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
)

# Score the averaged result against the labels.
precision, recall, auc = evaluate_ABOD_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)

# One "<name> <value>" line per metric, formatted the way print() would.
print("\n".join(
    f"{tag} {val!s}"
    for tag, val in (("Precision:", precision), ("Recall:", recall), ("AUC:", auc))
))


Precision: 0.14285714285714285
Recall: 0.3333333333333333
AUC: 0.7424242424242424


In [113]:
# ABOD
# dataset: Lymphography_withoutdupl_norm_1ofn.csv

# Read the whole CSV into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Lymphography\\Lymphography_withoutdupl_norm_1ofn.csv"
df = pd.read_csv(file_path)

# Unlike the other datasets, the last TWO columns are kept out of the features
# and the second-to-last column is used as the labels; the final column is not
# used here (presumably not a feature -- confirm against the CSV schema).
data, labels = df.iloc[:, :-2].values, df.iloc[:, -2].values

# Average ABOD scores over 15 random subsamples (80% of the rows each).
combined_scores_df = calculate_ABOD_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
)

# Score the averaged result against the labels.
precision, recall, auc = evaluate_ABOD_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)

# One "<name> <value>" line per metric, formatted the way print() would.
print("\n".join(
    f"{tag} {val!s}"
    for tag, val in (("Precision:", precision), ("Recall:", recall), ("AUC:", auc))
))


Precision: 0.4
Recall: 1.0
AUC: 0.9763593380614658


In [114]:
# ABOD
# dataset: Parkinson_withoutdupl_norm_05_v01.csv

# Read the whole CSV into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Parkinson\\Parkinson_withoutdupl_norm_05_v01.csv"
df = pd.read_csv(file_path)

# Every column before the final one is feature data; the final column holds the labels.
data, labels = df.iloc[:, :-1].values, df.iloc[:, -1].values

# Average ABOD scores over 15 random subsamples (80% of the rows each).
combined_scores_df = calculate_ABOD_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
)

# Score the averaged result against the labels.
precision, recall, auc = evaluate_ABOD_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)

# One "<name> <value>" line per metric, formatted the way print() would.
print("\n".join(
    f"{tag} {val!s}"
    for tag, val in (("Precision:", precision), ("Recall:", recall), ("AUC:", auc))
))


Precision: 0.2
Recall: 1.0
AUC: 0.9583333333333334


In [20]:
# ABOD
# dataset: Pima_withoutdupl_norm_02_v01.csv

# Read the whole CSV into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Pima\\Pima_withoutdupl_norm_02_v01.csv"
df = pd.read_csv(file_path)

# Every column before the final one is feature data; the final column holds the labels.
data, labels = df.iloc[:, :-1].values, df.iloc[:, -1].values

# Average ABOD scores over 15 random subsamples (80% of the rows each).
combined_scores_df = calculate_ABOD_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
)

# Score the averaged result against the labels.
precision, recall, auc = evaluate_ABOD_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)

# One "<name> <value>" line per metric, formatted the way print() would.
print("\n".join(
    f"{tag} {val!s}"
    for tag, val in (("Precision:", precision), ("Recall:", recall), ("AUC:", auc))
))


Precision: 0.0392156862745098
Recall: 0.2222222222222222
AUC: 0.6877777777777778


KNN and LOF with avg

In [22]:
import numpy as np
import pandas as pd
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from collections import defaultdict
import random

def calculate_knn_LOF_avg_outlier_scores(data, n_samples=5, sample_size=0.8, n_neighbors=20, random_state=None):
    """
    Average a combined KNN + LOF outlier score over several random subsamples.

    The data is standardized once; then, ``n_samples`` times, a subset of rows
    is drawn without replacement and both a pyod ``KNN(method='mean')`` and a
    pyod ``LOF`` detector are fitted on it. Each detector's training scores
    are min-max scaled to [0, 1] within the subsample, the two scaled scores
    are averaged per row, and that combined score is accumulated. Rows that
    are never drawn do not appear in the result.

    Parameters:
        data (numpy array): 2-D dataset, one row per instance.
        n_samples (int): Number of random subsamples to draw.
        sample_size (float): Fraction of the rows in each subsample
            (``int(sample_size * n_rows)`` rows are drawn).
        n_neighbors (int): Number of neighbours passed to both KNN and LOF.
        random_state (int or None): Seed for the subsampling. ``None`` (the
            default) keeps drawing from the global ``random`` module state,
            exactly as before; an integer makes the run reproducible.

    Returns:
        pd.DataFrame: Columns ``Index`` (row position in ``data``) and
        ``Avg_Combined_Outlier_Score``, in the order the rows were first drawn.
    """
    # Standardize the data
    data_std = StandardScaler().fit_transform(data)

    n_rows = data.shape[0]
    rows_per_sample = int(sample_size * n_rows)

    # A private generator when seeded, so the global random state is untouched
    rng = random if random_state is None else random.Random(random_state)

    # Cumulative combined score and number of draws for each instance
    combined_scores = defaultdict(lambda: {'score': 0, 'count': 0})

    for _ in range(n_samples):
        # Draw a random subset of row positions (without replacement)
        sample_indices = rng.sample(range(n_rows), rows_per_sample)
        sample_data = data_std[sample_indices]

        # Fit both detectors on the same subset; higher score -> more outlier
        knn = KNN(n_neighbors=n_neighbors, method='mean')
        knn.fit(sample_data)
        knn_scores = knn.decision_scores_

        lof = LOF(n_neighbors=n_neighbors)
        lof.fit(sample_data)
        lof_scores = lof.decision_scores_

        # The two detectors score on different scales, so bring each to [0, 1]
        # (independently, within this subsample) before averaging them
        knn_scores_norm = MinMaxScaler().fit_transform(knn_scores.reshape(-1, 1)).flatten()
        lof_scores_norm = MinMaxScaler().fit_transform(lof_scores.reshape(-1, 1)).flatten()
        combined_sample_scores = (knn_scores_norm + lof_scores_norm) / 2

        # Scores follow the order of the fitted rows
        for idx, score in zip(sample_indices, combined_sample_scores):
            combined_scores[idx]['score'] += score
            combined_scores[idx]['count'] += 1

    # Average over the subsamples in which each instance appeared
    avg_combined_scores = {idx: acc['score'] / acc['count'] for idx, acc in combined_scores.items()}

    return pd.DataFrame(list(avg_combined_scores.items()), columns=['Index', 'Avg_Combined_Outlier_Score'])

def evaluate_knn_LOF_avg_outlier_detection(outlier_scores_df, ground_truth, percentile=90):
    """
    Evaluate averaged KNN + LOF outlier scores with precision, recall and AUC.

    Parameters:
        outlier_scores_df (pd.DataFrame): Needs the columns 'Index' (row
            position of each scored instance) and 'Avg_Combined_Outlier_Score'.
        ground_truth (pd.Series or array-like): One label per row of the scored
            dataset, looked up by position. Labels are expected to be
            1 = outlier, 0 = normal, matching the predictions built here.
        percentile (float): Instances whose score is at or above this
            percentile of the scored instances are predicted as outliers.
            Defaults to 90.

    Returns:
        tuple: (precision, recall, auc).
    """
    # Order by row position so scores and labels line up
    outlier_scores_df = outlier_scores_df.sort_values(by='Index').reset_index(drop=True)

    # 'Index' holds row positions, so labels are looked up positionally. Going
    # through a plain array also accepts numpy arrays / lists and a Series
    # whose index is not 0..n-1.
    positions = outlier_scores_df['Index'].to_numpy(dtype=int)
    ground_truth = np.asarray(ground_truth)[positions]

    # Extract average outlier scores
    outlier_scores = outlier_scores_df['Avg_Combined_Outlier_Score'].values

    # Threshold the scores to get binary predictions (1 = outlier)
    threshold = np.percentile(outlier_scores, percentile)
    predictions = (outlier_scores >= threshold).astype(int)

    # Precision / recall use the thresholded predictions; AUC uses the raw scores
    precision = precision_score(ground_truth, predictions)
    recall = recall_score(ground_truth, predictions)
    auc = roc_auc_score(ground_truth, outlier_scores)

    return precision, recall, auc


In [117]:
# Ensemble under test: KNN + LOF, scaled scores averaged
# Input file: Annthyroid_norm_02_v01.csv

file_path = "G:\\Nazanin\\B project\\code\\dataset\\Annthyroid\\Annthyroid_norm_02_v01.csv"
df = pd.read_csv(file_path)  # full CSV as a DataFrame

# Final column -> labels; every column before it -> feature matrix
labels = df.iloc[:, -1].values
data = df.iloc[:, :-1].values

# Ensemble scores from 15 subsamples of 80% each, 20 neighbours
combined_scores_df = calculate_knn_LOF_avg_outlier_scores(
    data, n_samples=15, sample_size=0.8, n_neighbors=20
)

# Compare the scores with the labels
precision, recall, auc = evaluate_knn_LOF_avg_outlier_detection(
    combined_scores_df, pd.Series(labels)
)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.07342143906020558
Recall: 0.36764705882352944
AUC: 0.7323639733462778


In [118]:
# Ensemble under test: KNN + LOF, scaled scores averaged
# Input file: Arrhythmia_withoutdupl_norm_02_v01.csv

file_path = "G:\\Nazanin\\B project\\code\\dataset\\Arrhythmia\\Arrhythmia_withoutdupl_norm_02_v01.csv"
df = pd.read_csv(file_path)  # full CSV as a DataFrame

# Final column -> labels; every column before it -> feature matrix
labels = df.iloc[:, -1].values
data = df.iloc[:, :-1].values

# Ensemble scores from 15 subsamples of 80% each, 20 neighbours
combined_scores_df = calculate_knn_LOF_avg_outlier_scores(
    data, n_samples=15, sample_size=0.8, n_neighbors=20
)

# Compare the scores with the labels
precision, recall, auc = evaluate_knn_LOF_avg_outlier_detection(
    combined_scores_df, pd.Series(labels)
)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.04
Recall: 0.3333333333333333
AUC: 0.8565573770491803


In [24]:
# Ensemble under test: KNN + LOF, scaled scores averaged
# Input file: Cardiotocography_norm_02_v01.csv

file_path = "G:\\Nazanin\\B project\\code\\dataset\\Cardiotocography\\Cardiotocography_norm_02_v01.csv"
df = pd.read_csv(file_path)  # full CSV as a DataFrame

# Final column -> labels; every column before it -> feature matrix
labels = df.iloc[:, -1].values
data = df.iloc[:, :-1].values

# Ensemble scores from 15 subsamples of 80% each, 20 neighbours
combined_scores_df = calculate_knn_LOF_avg_outlier_scores(
    data, n_samples=15, sample_size=0.8, n_neighbors=20
)

# Compare the scores with the labels
precision, recall, auc = evaluate_knn_LOF_avg_outlier_detection(
    combined_scores_df, pd.Series(labels)
)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.07692307692307693
Recall: 0.40625
AUC: 0.7818542296072507


In [27]:
# Ensemble under test: KNN + LOF, scaled scores averaged
# Input file: HeartDisease_withoutdupl_norm_02_v01.csv

file_path = "G:\\Nazanin\\B project\\code\\dataset\\HeartDisease\\HeartDisease_withoutdupl_norm_02_v01.csv"
df = pd.read_csv(file_path)  # full CSV as a DataFrame

# Final column -> labels; every column before it -> feature matrix
labels = df.iloc[:, -1].values
data = df.iloc[:, :-1].values

# Ensemble scores from 15 subsamples of 80% each, 20 neighbours
combined_scores_df = calculate_knn_LOF_avg_outlier_scores(
    data, n_samples=15, sample_size=0.8, n_neighbors=20
)

# Compare the scores with the labels
precision, recall, auc = evaluate_knn_LOF_avg_outlier_detection(
    combined_scores_df, pd.Series(labels)
)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.0
Recall: 0.0
AUC: 0.75


In [121]:
# Ensemble under test: KNN + LOF, scaled scores averaged
# Input file: Hepatitis_withoutdupl_norm_05_v01.csv

file_path = "G:\\Nazanin\\B project\\code\\dataset\\Hepatitis\\Hepatitis_withoutdupl_norm_05_v01.csv"
df = pd.read_csv(file_path)  # full CSV as a DataFrame

# Final column -> labels; every column before it -> feature matrix
labels = df.iloc[:, -1].values
data = df.iloc[:, :-1].values

# Ensemble scores from 15 subsamples of 80% each, 20 neighbours
combined_scores_df = calculate_knn_LOF_avg_outlier_scores(
    data, n_samples=15, sample_size=0.8, n_neighbors=20
)

# Compare the scores with the labels
precision, recall, auc = evaluate_knn_LOF_avg_outlier_detection(
    combined_scores_df, pd.Series(labels)
)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.2857142857142857
Recall: 0.6666666666666666
AUC: 0.8181818181818181


In [122]:
# Ensemble under test: KNN + LOF, scaled scores averaged
# Input file: Lymphography_withoutdupl_norm_1ofn.csv

file_path = "G:\\Nazanin\\B project\\code\\dataset\\Lymphography\\Lymphography_withoutdupl_norm_1ofn.csv"
df = pd.read_csv(file_path)  # full CSV as a DataFrame

# For this file the labels are read from the second-to-last column and the
# features are every column before it; the final column is not used.
labels = df.iloc[:, -2].values
data = df.iloc[:, :-2].values

# Ensemble scores from 15 subsamples of 80% each, 20 neighbours
combined_scores_df = calculate_knn_LOF_avg_outlier_scores(
    data, n_samples=15, sample_size=0.8, n_neighbors=20
)

# Compare the scores with the labels
precision, recall, auc = evaluate_knn_LOF_avg_outlier_detection(
    combined_scores_df, pd.Series(labels)
)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.4
Recall: 1.0
AUC: 0.9929078014184397


In [123]:
# Ensemble under test: KNN + LOF, scaled scores averaged
# Input file: Parkinson_withoutdupl_norm_05_v01.csv

file_path = "G:\\Nazanin\\B project\\code\\dataset\\Parkinson\\Parkinson_withoutdupl_norm_05_v01.csv"
df = pd.read_csv(file_path)  # full CSV as a DataFrame

# Final column -> labels; every column before it -> feature matrix
labels = df.iloc[:, -1].values
data = df.iloc[:, :-1].values

# Ensemble scores from 15 subsamples of 80% each, 20 neighbours
combined_scores_df = calculate_knn_LOF_avg_outlier_scores(
    data, n_samples=15, sample_size=0.8, n_neighbors=20
)

# Compare the scores with the labels
precision, recall, auc = evaluate_knn_LOF_avg_outlier_detection(
    combined_scores_df, pd.Series(labels)
)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.2
Recall: 1.0
AUC: 0.9583333333333334


In [29]:
# Ensemble under test: KNN + LOF, scaled scores averaged
# Input file: Pima_withoutdupl_norm_02_v01.csv

file_path = "G:\\Nazanin\\B project\\code\\dataset\\Pima\\Pima_withoutdupl_norm_02_v01.csv"
df = pd.read_csv(file_path)  # full CSV as a DataFrame

# Final column -> labels; every column before it -> feature matrix
labels = df.iloc[:, -1].values
data = df.iloc[:, :-1].values

# Ensemble scores from 15 subsamples of 80% each, 20 neighbours
combined_scores_df = calculate_knn_LOF_avg_outlier_scores(
    data, n_samples=15, sample_size=0.8, n_neighbors=20
)

# Compare the scores with the labels
precision, recall, auc = evaluate_knn_LOF_avg_outlier_detection(
    combined_scores_df, pd.Series(labels)
)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.0
Recall: 0.0
AUC: 0.6497777777777778


KNN and LOF with max

In [31]:
import numpy as np
import pandas as pd
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from collections import defaultdict
import random

def calculate_knn_LOF_max_outlier_scores(data, n_samples=5, sample_size=0.8, n_neighbors=20, random_state=None):
    """
    Score instances with a KNN + LOF ensemble (maximum) over random subsamples.

    The data is standardized once. In each of `n_samples` rounds a subsample is
    drawn without replacement, KNN and LOF are fitted on it, each detector's
    training scores are min-max scaled to [0, 1] within the round, and the
    element-wise maximum of the two is kept. The final score of an instance is
    the mean of its per-round scores over the rounds in which it was drawn.

    Parameters:
        data (numpy array): Input dataset, one row per instance.
        n_samples (int): Number of random subsamples (rounds) to draw.
        sample_size (float): Fraction of the instances drawn in each round.
        n_neighbors (int): Number of neighbors for both KNN and LOF.
        random_state (int or None): Seed for the subsampling. With None (the
            default) indices are drawn from the global `random` state, exactly
            as before this parameter existed.

    Returns:
        pd.DataFrame: Columns 'Index' (row position in `data`) and
        'Avg_Max_Outlier_Score'. Instances that were never drawn are absent;
        rows are in order of first appearance in a subsample.
    """
    # Standardize the data
    scaler = StandardScaler()
    data_std = scaler.fit_transform(data)

    # A seed gets its own generator so the run is repeatable without touching
    # global state; without a seed the module-level generator is used.
    rng = random if random_state is None else random.Random(random_state)

    n_rows = data.shape[0]
    n_drawn = int(sample_size * n_rows)

    # Cumulative score and number of rounds drawn, keyed by row position
    combined_scores = defaultdict(lambda: {'score': 0, 'count': 0})

    for _ in range(n_samples):
        # Draw row positions without replacement
        sample_indices = rng.sample(range(n_rows), n_drawn)
        sample_data = data_std[sample_indices]

        # Apply KNN from pyod
        knn = KNN(n_neighbors=n_neighbors, method='mean')
        knn.fit(sample_data)
        knn_scores = knn.decision_scores_  # higher score -> more outlier

        # Apply LOF from pyod
        lof = LOF(n_neighbors=n_neighbors)
        lof.fit(sample_data)
        lof_scores = lof.decision_scores_  # higher score -> more outlier

        # Bring both detectors onto [0, 1] (per round) so they are comparable
        knn_scores_norm = MinMaxScaler().fit_transform(knn_scores.reshape(-1, 1)).flatten()
        lof_scores_norm = MinMaxScaler().fit_transform(lof_scores.reshape(-1, 1)).flatten()

        # Combined score: element-wise maximum of the two scaled scores
        combined_sample_scores = np.maximum(knn_scores_norm, lof_scores_norm)

        # decision_scores_ follows the order of the fitted rows, so zip pairs
        # each score with the row position it belongs to
        for idx, score in zip(sample_indices, combined_sample_scores):
            combined_scores[idx]['score'] += score
            combined_scores[idx]['count'] += 1

    # Average over the rounds in which each instance was drawn
    avg_combined_scores = {idx: scores['score'] / scores['count'] for idx, scores in combined_scores.items()}

    # Convert to a DataFrame
    combined_scores_df = pd.DataFrame(list(avg_combined_scores.items()), columns=['Index', 'Avg_Max_Outlier_Score'])

    return combined_scores_df

def evaluate_knn_LOF_max_outlier_detection(outlier_scores_df, ground_truth, percentile=90):
    """
    Evaluate max-combined KNN + LOF outlier scores with precision, recall and AUC.

    Parameters:
        outlier_scores_df (pd.DataFrame): Needs the columns 'Index' (row
            position of each scored instance) and 'Avg_Max_Outlier_Score'.
        ground_truth (pd.Series or array-like): One label per row of the scored
            dataset, looked up by position. Labels are expected to be
            1 = outlier, 0 = normal, matching the predictions built here.
        percentile (float): Instances whose score is at or above this
            percentile of the scored instances are predicted as outliers.
            Defaults to 90.

    Returns:
        tuple: (precision, recall, auc).
    """
    # Order by row position so scores and labels line up
    outlier_scores_df = outlier_scores_df.sort_values(by='Index').reset_index(drop=True)

    # 'Index' holds row positions, so labels are looked up positionally. Going
    # through a plain array also accepts numpy arrays / lists and a Series
    # whose index is not 0..n-1.
    positions = outlier_scores_df['Index'].to_numpy(dtype=int)
    ground_truth = np.asarray(ground_truth)[positions]

    # Extract average outlier scores
    outlier_scores = outlier_scores_df['Avg_Max_Outlier_Score'].values

    # Threshold the scores to get binary predictions (1 = outlier)
    threshold = np.percentile(outlier_scores, percentile)
    predictions = (outlier_scores >= threshold).astype(int)

    # Precision / recall use the thresholded predictions; AUC uses the raw scores
    precision = precision_score(ground_truth, predictions)
    recall = recall_score(ground_truth, predictions)
    auc = roc_auc_score(ground_truth, outlier_scores)

    return precision, recall, auc


In [126]:
# Ensemble under test: KNN + LOF, maximum of the scaled scores
# Input file: Annthyroid_norm_02_v01.csv

file_path = "G:\\Nazanin\\B project\\code\\dataset\\Annthyroid\\Annthyroid_norm_02_v01.csv"
df = pd.read_csv(file_path)  # full CSV as a DataFrame

# Final column -> labels; every column before it -> feature matrix
labels = df.iloc[:, -1].values
data = df.iloc[:, :-1].values

# Ensemble scores from 15 subsamples of 80% each, 20 neighbours
combined_scores_df = calculate_knn_LOF_max_outlier_scores(
    data, n_samples=15, sample_size=0.8, n_neighbors=20
)

# Compare the scores with the labels
precision, recall, auc = evaluate_knn_LOF_max_outlier_detection(
    combined_scores_df, pd.Series(labels)
)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.07048458149779736
Recall: 0.35294117647058826
AUC: 0.7282092581969021


In [127]:
# Ensemble under test: KNN + LOF, maximum of the scaled scores
# Input file: Arrhythmia_withoutdupl_norm_02_v01.csv

file_path = "G:\\Nazanin\\B project\\code\\dataset\\Arrhythmia\\Arrhythmia_withoutdupl_norm_02_v01.csv"
df = pd.read_csv(file_path)  # full CSV as a DataFrame

# Final column -> labels; every column before it -> feature matrix
labels = df.iloc[:, -1].values
data = df.iloc[:, :-1].values

# Ensemble scores from 15 subsamples of 80% each, 20 neighbours
combined_scores_df = calculate_knn_LOF_max_outlier_scores(
    data, n_samples=15, sample_size=0.8, n_neighbors=20
)

# Compare the scores with the labels
precision, recall, auc = evaluate_knn_LOF_max_outlier_detection(
    combined_scores_df, pd.Series(labels)
)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.04
Recall: 0.3333333333333333
AUC: 0.8524590163934426


In [128]:
# Ensemble under test: KNN + LOF, maximum of the scaled scores
# Input file: Cardiotocography_norm_02_v01.csv

file_path = "G:\\Nazanin\\B project\\code\\dataset\\Cardiotocography\\Cardiotocography_norm_02_v01.csv"
df = pd.read_csv(file_path)  # full CSV as a DataFrame

# Final column -> labels; every column before it -> feature matrix
labels = df.iloc[:, -1].values
data = df.iloc[:, :-1].values

# Ensemble scores from 15 subsamples of 80% each, 20 neighbours
combined_scores_df = calculate_knn_LOF_max_outlier_scores(
    data, n_samples=15, sample_size=0.8, n_neighbors=20
)

# Compare the scores with the labels
precision, recall, auc = evaluate_knn_LOF_max_outlier_detection(
    combined_scores_df, pd.Series(labels)
)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.0650887573964497
Recall: 0.34375
AUC: 0.756797583081571


In [129]:
# Ensemble under test: KNN + LOF, maximum of the scaled scores
# Input file: HeartDisease_withoutdupl_norm_02_v01.csv

file_path = "G:\\Nazanin\\B project\\code\\dataset\\HeartDisease\\HeartDisease_withoutdupl_norm_02_v01.csv"
df = pd.read_csv(file_path)  # full CSV as a DataFrame

# Final column -> labels; every column before it -> feature matrix
labels = df.iloc[:, -1].values
data = df.iloc[:, :-1].values

# Ensemble scores from 15 subsamples of 80% each, 20 neighbours
combined_scores_df = calculate_knn_LOF_max_outlier_scores(
    data, n_samples=15, sample_size=0.8, n_neighbors=20
)

# Compare the scores with the labels
precision, recall, auc = evaluate_knn_LOF_max_outlier_detection(
    combined_scores_df, pd.Series(labels)
)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.0
Recall: 0.0
AUC: 0.7933333333333333


In [130]:
# Ensemble under test: KNN + LOF, maximum of the scaled scores
# Input file: Hepatitis_withoutdupl_norm_05_v01.csv

file_path = "G:\\Nazanin\\B project\\code\\dataset\\Hepatitis\\Hepatitis_withoutdupl_norm_05_v01.csv"
df = pd.read_csv(file_path)  # full CSV as a DataFrame

# Final column -> labels; every column before it -> feature matrix
labels = df.iloc[:, -1].values
data = df.iloc[:, :-1].values

# Ensemble scores from 15 subsamples of 80% each, 20 neighbours
combined_scores_df = calculate_knn_LOF_max_outlier_scores(
    data, n_samples=15, sample_size=0.8, n_neighbors=20
)

# Compare the scores with the labels
precision, recall, auc = evaluate_knn_LOF_max_outlier_detection(
    combined_scores_df, pd.Series(labels)
)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.2857142857142857
Recall: 0.6666666666666666
AUC: 0.8282828282828283


In [131]:
# Ensemble under test: KNN + LOF, maximum of the scaled scores
# Input file: Lymphography_withoutdupl_norm_1ofn.csv

file_path = "G:\\Nazanin\\B project\\code\\dataset\\Lymphography\\Lymphography_withoutdupl_norm_1ofn.csv"
df = pd.read_csv(file_path)  # full CSV as a DataFrame

# For this file the labels are read from the second-to-last column and the
# features are every column before it; the final column is not used.
labels = df.iloc[:, -2].values
data = df.iloc[:, :-2].values

# Ensemble scores from 15 subsamples of 80% each, 20 neighbours
combined_scores_df = calculate_knn_LOF_max_outlier_scores(
    data, n_samples=15, sample_size=0.8, n_neighbors=20
)

# Compare the scores with the labels
precision, recall, auc = evaluate_knn_LOF_max_outlier_detection(
    combined_scores_df, pd.Series(labels)
)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.4
Recall: 1.0
AUC: 0.9976359338061466


In [132]:
# Ensemble under test: KNN + LOF, maximum of the scaled scores
# Input file: Parkinson_withoutdupl_norm_05_v01.csv

file_path = "G:\\Nazanin\\B project\\code\\dataset\\Parkinson\\Parkinson_withoutdupl_norm_05_v01.csv"
df = pd.read_csv(file_path)  # full CSV as a DataFrame

# Final column -> labels; every column before it -> feature matrix
labels = df.iloc[:, -1].values
data = df.iloc[:, :-1].values

# Ensemble scores from 15 subsamples of 80% each, 20 neighbours
combined_scores_df = calculate_knn_LOF_max_outlier_scores(
    data, n_samples=15, sample_size=0.8, n_neighbors=20
)

# Compare the scores with the labels
precision, recall, auc = evaluate_knn_LOF_max_outlier_detection(
    combined_scores_df, pd.Series(labels)
)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.2
Recall: 1.0
AUC: 0.9583333333333334


In [33]:
# Ensemble under test: KNN + LOF, maximum of the scaled scores
# Input file: Pima_withoutdupl_norm_02_v01.csv

file_path = "G:\\Nazanin\\B project\\code\\dataset\\Pima\\Pima_withoutdupl_norm_02_v01.csv"
df = pd.read_csv(file_path)  # full CSV as a DataFrame

# Final column -> labels; every column before it -> feature matrix
labels = df.iloc[:, -1].values
data = df.iloc[:, :-1].values

# Ensemble scores from 15 subsamples of 80% each, 20 neighbours
combined_scores_df = calculate_knn_LOF_max_outlier_scores(
    data, n_samples=15, sample_size=0.8, n_neighbors=20
)

# Compare the scores with the labels
precision, recall, auc = evaluate_knn_LOF_max_outlier_detection(
    combined_scores_df, pd.Series(labels)
)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.0
Recall: 0.0
AUC: 0.6557777777777778


KNN and LOF with min

In [34]:
import numpy as np
import pandas as pd
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from collections import defaultdict
import random

def calculate_knn_LOF_min_outlier_scores(data, n_samples=5, sample_size=0.8, n_neighbors=20, random_state=None):
    """
    Score instances with a KNN + LOF ensemble (minimum) over random subsamples.

    The data is standardized once. In each of `n_samples` rounds a subsample is
    drawn without replacement, KNN and LOF are fitted on it, each detector's
    training scores are min-max scaled to [0, 1] within the round, and the
    element-wise minimum of the two is kept. The final score of an instance is
    the mean of its per-round scores over the rounds in which it was drawn.

    Parameters:
        data (numpy array): Input dataset, one row per instance.
        n_samples (int): Number of random subsamples (rounds) to draw.
        sample_size (float): Fraction of the instances drawn in each round.
        n_neighbors (int): Number of neighbors for both KNN and LOF.
        random_state (int or None): Seed for the subsampling. With None (the
            default) indices are drawn from the global `random` state, exactly
            as before this parameter existed.

    Returns:
        pd.DataFrame: Columns 'Index' (row position in `data`) and
        'Avg_Combined_Outlier_Score'. Instances that were never drawn are
        absent; rows are in order of first appearance in a subsample.
    """
    # Standardize the data
    scaler = StandardScaler()
    data_std = scaler.fit_transform(data)

    # A seed gets its own generator so the run is repeatable without touching
    # global state; without a seed the module-level generator is used.
    rng = random if random_state is None else random.Random(random_state)

    n_rows = data.shape[0]
    n_drawn = int(sample_size * n_rows)

    # Cumulative score and number of rounds drawn, keyed by row position
    combined_scores = defaultdict(lambda: {'score': 0, 'count': 0})

    for _ in range(n_samples):
        # Draw row positions without replacement
        sample_indices = rng.sample(range(n_rows), n_drawn)
        sample_data = data_std[sample_indices]

        # Apply KNN from pyod
        knn = KNN(n_neighbors=n_neighbors, method='mean')
        knn.fit(sample_data)
        knn_scores = knn.decision_scores_  # higher score -> more outlier

        # Apply LOF from pyod
        lof = LOF(n_neighbors=n_neighbors)
        lof.fit(sample_data)
        lof_scores = lof.decision_scores_  # higher score -> more outlier

        # Bring both detectors onto [0, 1] (per round) so they are comparable
        knn_scores_norm = MinMaxScaler().fit_transform(knn_scores.reshape(-1, 1)).flatten()
        lof_scores_norm = MinMaxScaler().fit_transform(lof_scores.reshape(-1, 1)).flatten()

        # Combined score: element-wise minimum of the two scaled scores
        combined_sample_scores = np.minimum(knn_scores_norm, lof_scores_norm)

        # decision_scores_ follows the order of the fitted rows, so zip pairs
        # each score with the row position it belongs to
        for idx, score in zip(sample_indices, combined_sample_scores):
            combined_scores[idx]['score'] += score
            combined_scores[idx]['count'] += 1

    # Average over the rounds in which each instance was drawn
    avg_combined_scores = {idx: scores['score'] / scores['count'] for idx, scores in combined_scores.items()}

    # Convert to a DataFrame
    combined_scores_df = pd.DataFrame(list(avg_combined_scores.items()), columns=['Index', 'Avg_Combined_Outlier_Score'])

    return combined_scores_df

def evaluate_knn_LOF_min_outlier_detection(outlier_scores_df, ground_truth, percentile=90):
    """
    Evaluate min-combined KNN + LOF outlier scores with precision, recall and AUC.

    Parameters:
        outlier_scores_df (pd.DataFrame): Needs the columns 'Index' (row
            position of each scored instance) and 'Avg_Combined_Outlier_Score'.
        ground_truth (pd.Series or array-like): One label per row of the scored
            dataset, looked up by position. Labels are expected to be
            1 = outlier, 0 = normal, matching the predictions built here.
        percentile (float): Instances whose score is at or above this
            percentile of the scored instances are predicted as outliers.
            Defaults to 90.

    Returns:
        tuple: (precision, recall, auc).
    """
    # Order by row position so scores and labels line up
    outlier_scores_df = outlier_scores_df.sort_values(by='Index').reset_index(drop=True)

    # 'Index' holds row positions, so labels are looked up positionally. Going
    # through a plain array also accepts numpy arrays / lists and a Series
    # whose index is not 0..n-1.
    positions = outlier_scores_df['Index'].to_numpy(dtype=int)
    ground_truth = np.asarray(ground_truth)[positions]

    # Extract average outlier scores
    outlier_scores = outlier_scores_df['Avg_Combined_Outlier_Score'].values

    # Threshold the scores to get binary predictions (1 = outlier)
    threshold = np.percentile(outlier_scores, percentile)
    predictions = (outlier_scores >= threshold).astype(int)

    # Precision / recall use the thresholded predictions; AUC uses the raw scores
    precision = precision_score(ground_truth, predictions)
    recall = recall_score(ground_truth, predictions)
    auc = roc_auc_score(ground_truth, outlier_scores)

    return precision, recall, auc



In [35]:
# Ensemble under test: KNN + LOF, minimum of the scaled scores
# Input file: Annthyroid_norm_02_v01.csv

file_path = "G:\\Nazanin\\B project\\code\\dataset\\Annthyroid\\Annthyroid_norm_02_v01.csv"
df = pd.read_csv(file_path)  # full CSV as a DataFrame

# Final column -> labels; every column before it -> feature matrix
labels = df.iloc[:, -1].values
data = df.iloc[:, :-1].values

# Ensemble scores from 15 subsamples of 80% each, 20 neighbours
combined_scores_df = calculate_knn_LOF_min_outlier_scores(
    data, n_samples=15, sample_size=0.8, n_neighbors=20
)

# Compare the scores with the labels
precision, recall, auc = evaluate_knn_LOF_min_outlier_detection(
    combined_scores_df, pd.Series(labels)
)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.07195301027900147
Recall: 0.3602941176470588
AUC: 0.7426459556065488


In [136]:
# Ensemble under test: KNN + LOF, minimum of the scaled scores
# Input file: Arrhythmia_withoutdupl_norm_02_v01.csv

file_path = "G:\\Nazanin\\B project\\code\\dataset\\Arrhythmia\\Arrhythmia_withoutdupl_norm_02_v01.csv"
df = pd.read_csv(file_path)  # full CSV as a DataFrame

# Final column -> labels; every column before it -> feature matrix
labels = df.iloc[:, -1].values
data = df.iloc[:, :-1].values

# Ensemble scores from 15 subsamples of 80% each, 20 neighbours
combined_scores_df = calculate_knn_LOF_min_outlier_scores(
    data, n_samples=15, sample_size=0.8, n_neighbors=20
)

# Compare the scores with the labels
precision, recall, auc = evaluate_knn_LOF_min_outlier_detection(
    combined_scores_df, pd.Series(labels)
)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.04
Recall: 0.3333333333333333
AUC: 0.8524590163934427


In [137]:
# Ensemble under test: KNN + LOF, minimum of the scaled scores
# Input file: Cardiotocography_norm_02_v01.csv

file_path = "G:\\Nazanin\\B project\\code\\dataset\\Cardiotocography\\Cardiotocography_norm_02_v01.csv"
df = pd.read_csv(file_path)  # full CSV as a DataFrame

# Final column -> labels; every column before it -> feature matrix
labels = df.iloc[:, -1].values
data = df.iloc[:, :-1].values

# Ensemble scores from 15 subsamples of 80% each, 20 neighbours
combined_scores_df = calculate_knn_LOF_min_outlier_scores(
    data, n_samples=15, sample_size=0.8, n_neighbors=20
)

# Compare the scores with the labels
precision, recall, auc = evaluate_knn_LOF_min_outlier_detection(
    combined_scores_df, pd.Series(labels)
)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.07692307692307693
Recall: 0.40625
AUC: 0.7793429003021148


In [36]:
# Ensemble under test: KNN + LOF, minimum of the scaled scores
# Input file: HeartDisease_withoutdupl_norm_02_v01.csv

file_path = "G:\\Nazanin\\B project\\code\\dataset\\HeartDisease\\HeartDisease_withoutdupl_norm_02_v01.csv"
df = pd.read_csv(file_path)  # full CSV as a DataFrame

# Final column -> labels; every column before it -> feature matrix
labels = df.iloc[:, -1].values
data = df.iloc[:, :-1].values

# Ensemble scores from 15 subsamples of 80% each, 20 neighbours
combined_scores_df = calculate_knn_LOF_min_outlier_scores(
    data, n_samples=15, sample_size=0.8, n_neighbors=20
)

# Compare the scores with the labels
precision, recall, auc = evaluate_knn_LOF_min_outlier_detection(
    combined_scores_df, pd.Series(labels)
)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.0
Recall: 0.0
AUC: 0.7066666666666667


In [37]:
# Ensemble under test: KNN + LOF, minimum of the scaled scores
# Input file: Hepatitis_withoutdupl_norm_05_v01.csv

file_path = "G:\\Nazanin\\B project\\code\\dataset\\Hepatitis\\Hepatitis_withoutdupl_norm_05_v01.csv"
df = pd.read_csv(file_path)  # full CSV as a DataFrame

# Final column -> labels; every column before it -> feature matrix
labels = df.iloc[:, -1].values
data = df.iloc[:, :-1].values

# Ensemble scores from 15 subsamples of 80% each, 20 neighbours
combined_scores_df = calculate_knn_LOF_min_outlier_scores(
    data, n_samples=15, sample_size=0.8, n_neighbors=20
)

# Compare the scores with the labels
precision, recall, auc = evaluate_knn_LOF_min_outlier_detection(
    combined_scores_df, pd.Series(labels)
)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.2857142857142857
Recall: 0.6666666666666666
AUC: 0.8282828282828282


In [140]:
# Ensemble under test: KNN + LOF, minimum of the scaled scores
# Input file: Lymphography_withoutdupl_norm_1ofn.csv

file_path = "G:\\Nazanin\\B project\\code\\dataset\\Lymphography\\Lymphography_withoutdupl_norm_1ofn.csv"
df = pd.read_csv(file_path)  # full CSV as a DataFrame

# For this file the labels are read from the second-to-last column and the
# features are every column before it; the final column is not used.
labels = df.iloc[:, -2].values
data = df.iloc[:, :-2].values

# Ensemble scores from 15 subsamples of 80% each, 20 neighbours
combined_scores_df = calculate_knn_LOF_min_outlier_scores(
    data, n_samples=15, sample_size=0.8, n_neighbors=20
)

# Compare the scores with the labels
precision, recall, auc = evaluate_knn_LOF_min_outlier_detection(
    combined_scores_df, pd.Series(labels)
)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.4
Recall: 1.0
AUC: 0.9917257683215129


In [141]:
# Ensemble under test: KNN + LOF, minimum of the scaled scores
# Input file: Parkinson_withoutdupl_norm_05_v01.csv

file_path = "G:\\Nazanin\\B project\\code\\dataset\\Parkinson\\Parkinson_withoutdupl_norm_05_v01.csv"
df = pd.read_csv(file_path)  # full CSV as a DataFrame

# Final column -> labels; every column before it -> feature matrix
labels = df.iloc[:, -1].values
data = df.iloc[:, :-1].values

# Ensemble scores from 15 subsamples of 80% each, 20 neighbours
combined_scores_df = calculate_knn_LOF_min_outlier_scores(
    data, n_samples=15, sample_size=0.8, n_neighbors=20
)

# Compare the scores with the labels
precision, recall, auc = evaluate_knn_LOF_min_outlier_detection(
    combined_scores_df, pd.Series(labels)
)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.2
Recall: 1.0
AUC: 0.9583333333333334


In [38]:
# Ensemble under test: KNN + LOF, minimum of the scaled scores
# Input file: Pima_withoutdupl_norm_02_v01.csv

file_path = "G:\\Nazanin\\B project\\code\\dataset\\Pima\\Pima_withoutdupl_norm_02_v01.csv"
df = pd.read_csv(file_path)  # full CSV as a DataFrame

# Final column -> labels; every column before it -> feature matrix
labels = df.iloc[:, -1].values
data = df.iloc[:, :-1].values

# Ensemble scores from 15 subsamples of 80% each, 20 neighbours
combined_scores_df = calculate_knn_LOF_min_outlier_scores(
    data, n_samples=15, sample_size=0.8, n_neighbors=20
)

# Compare the scores with the labels
precision, recall, auc = evaluate_knn_LOF_min_outlier_detection(
    combined_scores_df, pd.Series(labels)
)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.0
Recall: 0.0
AUC: 0.6348888888888888


KNN LOF ABOD with avg

In [39]:
import numpy as np
import pandas as pd
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.models.abod import ABOD
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from collections import defaultdict
import random

def calculate_knn_LOF_ABOD_avg_outlier_scores(data, n_samples=5, sample_size=0.8, n_neighbors=20, random_state=None):
    """
    Calculates combined outlier scores using LOF, ABOD, and KNN on random samples
    and averages scores for data points appearing in multiple samples.

    The data is standardized once. In each round a subsample is drawn without
    replacement, the three detectors are fitted on it, each detector's training
    scores are min-max scaled to [0, 1] within the round, and the three scaled
    scores are averaged.

    Parameters:
        data (numpy array): Input dataset, one row per instance
            (shape (n_instances, n_features)).
        n_samples (int): Number of random samples (rounds) to create.
        sample_size (float): Proportion of the dataset to include in each sample.
        n_neighbors (int): Number of neighbors for LOF and KNN. ABOD is built
            with its own defaults and does not receive this value.
        random_state (int or None): Seed for the subsampling. With None (the
            default) indices are drawn from the global `random` state, exactly
            as before this parameter existed.

    Returns:
        pd.DataFrame: Columns 'Index' (row position in `data`) and
        'Avg_Combined_Outlier_Score'. Instances that were never drawn are
        absent; rows are in order of first appearance in a sample.
    """
    # Standardize the data
    scaler = StandardScaler()
    data_std = scaler.fit_transform(data)

    # A seed gets its own generator so the run is repeatable without touching
    # global state; without a seed the module-level generator is used.
    rng = random if random_state is None else random.Random(random_state)

    n_rows = data.shape[0]
    n_drawn = int(sample_size * n_rows)

    # Cumulative score and number of rounds drawn, keyed by row position
    combined_scores = defaultdict(lambda: {'score': 0, 'count': 0})

    for _ in range(n_samples):
        # Draw row positions without replacement
        sample_indices = rng.sample(range(n_rows), n_drawn)
        sample_data = data_std[sample_indices]

        # Apply KNN from pyod
        knn = KNN(n_neighbors=n_neighbors, method='mean')
        knn.fit(sample_data)
        knn_scores = knn.decision_scores_  # higher score -> more outlier

        # Apply LOF from pyod
        lof = LOF(n_neighbors=n_neighbors)
        lof.fit(sample_data)
        lof_scores = lof.decision_scores_  # higher score -> more outlier

        # Apply ABOD from pyod (default settings)
        abod = ABOD()
        abod.fit(sample_data)
        abod_scores = abod.decision_scores_  # higher score -> more outlier

        # Bring all detectors onto [0, 1] (per round) so they can be averaged
        knn_scores_norm = MinMaxScaler().fit_transform(knn_scores.reshape(-1, 1)).flatten()
        lof_scores_norm = MinMaxScaler().fit_transform(lof_scores.reshape(-1, 1)).flatten()
        abod_scores_norm = MinMaxScaler().fit_transform(abod_scores.reshape(-1, 1)).flatten()

        # Combined score: average of the three scaled scores
        combined_sample_scores = (knn_scores_norm + lof_scores_norm + abod_scores_norm) / 3

        # decision_scores_ follows the order of the fitted rows, so zip pairs
        # each score with the row position it belongs to
        for idx, score in zip(sample_indices, combined_sample_scores):
            combined_scores[idx]['score'] += score
            combined_scores[idx]['count'] += 1

    # Average over the rounds in which each instance was drawn
    avg_combined_scores = {idx: scores['score'] / scores['count'] for idx, scores in combined_scores.items()}

    # Convert to a DataFrame
    combined_scores_df = pd.DataFrame(list(avg_combined_scores.items()), columns=['Index', 'Avg_Combined_Outlier_Score'])

    return combined_scores_df

def evaluate_knn_LOF_ABOD_avg_outlier_detection(outlier_scores_df, ground_truth, threshold_percentile=90):
    """
    Evaluates the outlier detection performance using precision, recall, and AUC.

    Instances whose score is at or above the ``threshold_percentile``-th percentile of all
    scores are predicted as outliers; AUC is computed from the raw scores.

    Parameters:
        outlier_scores_df (pd.DataFrame): DataFrame with an 'Index' column (positional row
            number of each scored instance) and an 'Avg_Combined_Outlier_Score' column.
        ground_truth (pd.Series or numpy array): Ground truth labels (1 = outlier, 0 = normal),
            ordered like the rows of the dataset that was scored.
        threshold_percentile (float): Percentile of the scores used as decision threshold.
            Defaults to 90.

    Returns:
        tuple: Precision, Recall, and AUC scores.
    """
    # Sort by instance index so scores and labels line up row for row
    outlier_scores_df = outlier_scores_df.sort_values(by='Index').reset_index(drop=True)

    # 'Index' holds positional row numbers, so labels are looked up by position.
    # np.asarray makes this work for Series and plain arrays alike (an ndarray has
    # no `.values`) and avoids label-based lookup on a Series with a custom index.
    row_positions = outlier_scores_df['Index'].to_numpy(dtype=int)
    ground_truth = np.asarray(ground_truth)[row_positions]

    # Extract average outlier scores
    outlier_scores = outlier_scores_df['Avg_Combined_Outlier_Score'].values

    # Everything at or above the chosen percentile is classified as an outlier (1), the rest as normal (0)
    threshold = np.percentile(outlier_scores, threshold_percentile)
    predictions = (outlier_scores >= threshold).astype(int)

    # Precision/recall use the thresholded predictions; AUC uses the continuous scores
    precision = precision_score(ground_truth, predictions)
    recall = recall_score(ground_truth, predictions)
    auc = roc_auc_score(ground_truth, outlier_scores)

    return precision, recall, auc


In [42]:
# KNN LOF ABOD with avg
# dataset: Annthyroid_norm_02_v01.csv

# Read the dataset into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Annthyroid\\Annthyroid_norm_02_v01.csv"
df = pd.read_csv(file_path)

# The final column holds the labels; every column before it is feature data.
labels = df.iloc[:, -1].values
data = df.iloc[:, :-1].values

# Score the instances with the averaged KNN/LOF/ABOD combination (15 random samples, 80% of the rows each).
combined_scores_df = calculate_knn_LOF_ABOD_avg_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
    n_neighbors=20,
)

# Compare the scores with the labels and report the metrics.
precision, recall, auc = evaluate_knn_LOF_ABOD_avg_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)
print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.07195301027900147
Recall: 0.3602941176470588
AUC: 0.7318487710162835


In [145]:
# KNN LOF ABOD with avg
# dataset: Arrhythmia_withoutdupl_norm_02_v01.csv

# Read the dataset into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Arrhythmia\\Arrhythmia_withoutdupl_norm_02_v01.csv"
df = pd.read_csv(file_path)

# The final column holds the labels; every column before it is feature data.
labels = df.iloc[:, -1].values
data = df.iloc[:, :-1].values

# Score the instances with the averaged KNN/LOF/ABOD combination (15 random samples, 80% of the rows each).
combined_scores_df = calculate_knn_LOF_ABOD_avg_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
    n_neighbors=20,
)

# Compare the scores with the labels and report the metrics.
precision, recall, auc = evaluate_knn_LOF_ABOD_avg_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)
print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.04
Recall: 0.3333333333333333
AUC: 0.8592896174863388


In [146]:
# KNN LOF ABOD with avg
# dataset: Cardiotocography_norm_02_v01.csv

# Read the dataset into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Cardiotocography\\Cardiotocography_norm_02_v01.csv"
df = pd.read_csv(file_path)

# The final column holds the labels; every column before it is feature data.
labels = df.iloc[:, -1].values
data = df.iloc[:, :-1].values

# Score the instances with the averaged KNN/LOF/ABOD combination (15 random samples, 80% of the rows each).
combined_scores_df = calculate_knn_LOF_ABOD_avg_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
    n_neighbors=20,
)

# Compare the scores with the labels and report the metrics.
precision, recall, auc = evaluate_knn_LOF_ABOD_avg_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)
print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.07100591715976332
Recall: 0.375
AUC: 0.7831004531722054


In [147]:
# KNN LOF ABOD with avg
# dataset: HeartDisease_withoutdupl_norm_02_v01.csv

# Read the dataset into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\HeartDisease\\HeartDisease_withoutdupl_norm_02_v01.csv"
df = pd.read_csv(file_path)

# The final column holds the labels; every column before it is feature data.
labels = df.iloc[:, -1].values
data = df.iloc[:, :-1].values

# Score the instances with the averaged KNN/LOF/ABOD combination (15 random samples, 80% of the rows each).
combined_scores_df = calculate_knn_LOF_ABOD_avg_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
    n_neighbors=20,
)

# Compare the scores with the labels and report the metrics.
precision, recall, auc = evaluate_knn_LOF_ABOD_avg_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)
print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.0
Recall: 0.0
AUC: 0.75


In [148]:
# KNN LOF ABOD with avg
# dataset: Hepatitis_withoutdupl_norm_05_v01.csv

# Read the dataset into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Hepatitis\\Hepatitis_withoutdupl_norm_05_v01.csv"
df = pd.read_csv(file_path)

# The final column holds the labels; every column before it is feature data.
labels = df.iloc[:, -1].values
data = df.iloc[:, :-1].values

# Score the instances with the averaged KNN/LOF/ABOD combination (15 random samples, 80% of the rows each).
combined_scores_df = calculate_knn_LOF_ABOD_avg_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
    n_neighbors=20,
)

# Compare the scores with the labels and report the metrics.
precision, recall, auc = evaluate_knn_LOF_ABOD_avg_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)
print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.2857142857142857
Recall: 0.6666666666666666
AUC: 0.8282828282828283


In [149]:
# KNN LOF ABOD with avg
# dataset: Lymphography_withoutdupl_norm_1ofn.csv

# Read the dataset into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Lymphography\\Lymphography_withoutdupl_norm_1ofn.csv"
df = pd.read_csv(file_path)

# For this file the labels are taken from the second-to-last column, and only the
# columns before it are used as feature data (the very last column is left out).
labels = df.iloc[:, -2].values
data = df.iloc[:, :-2].values

# Score the instances with the averaged KNN/LOF/ABOD combination (15 random samples, 80% of the rows each).
combined_scores_df = calculate_knn_LOF_ABOD_avg_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
    n_neighbors=20,
)

# Compare the scores with the labels and report the metrics.
precision, recall, auc = evaluate_knn_LOF_ABOD_avg_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)
print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.4
Recall: 1.0
AUC: 0.9952718676122931


In [150]:
# KNN LOF ABOD with avg
# dataset: Parkinson_withoutdupl_norm_05_v01.csv

# Read the dataset into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Parkinson\\Parkinson_withoutdupl_norm_05_v01.csv"
df = pd.read_csv(file_path)

# The final column holds the labels; every column before it is feature data.
labels = df.iloc[:, -1].values
data = df.iloc[:, :-1].values

# Score the instances with the averaged KNN/LOF/ABOD combination (15 random samples, 80% of the rows each).
combined_scores_df = calculate_knn_LOF_ABOD_avg_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
    n_neighbors=20,
)

# Compare the scores with the labels and report the metrics.
precision, recall, auc = evaluate_knn_LOF_ABOD_avg_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)
print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.2
Recall: 1.0
AUC: 0.9583333333333334


In [151]:
# KNN LOF ABOD with avg
# dataset: Pima_withoutdupl_norm_02_v01.csv

# Read the dataset into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Pima\\Pima_withoutdupl_norm_02_v01.csv"
df = pd.read_csv(file_path)

# The final column holds the labels; every column before it is feature data.
labels = df.iloc[:, -1].values
data = df.iloc[:, :-1].values

# Score the instances with the averaged KNN/LOF/ABOD combination (15 random samples, 80% of the rows each).
combined_scores_df = calculate_knn_LOF_ABOD_avg_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
    n_neighbors=20,
)

# Compare the scores with the labels and report the metrics.
precision, recall, auc = evaluate_knn_LOF_ABOD_avg_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)
print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.0
Recall: 0.0
AUC: 0.6657777777777778


KNN LOF ABOD with max

In [2]:
import numpy as np
import pandas as pd
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.models.abod import ABOD
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from collections import defaultdict
import random

def calculate_knn_LOF_ABOD_max_outlier_scores(data, n_samples=5, sample_size=0.8, n_neighbors=20, random_state=None):
    """
    Calculates combined outlier scores using the maximum score from LOF, ABOD, and KNN on random samples
    and averages scores for data points appearing in multiple samples.

    Within each sample every detector's scores are min-max normalized before the
    element-wise maximum is taken, so the three detectors are compared on a common scale.

    Parameters:
        data (numpy array): Input dataset of shape (n_samples, n_features).
        n_samples (int): Number of random samples to create.
        sample_size (float): Proportion of the dataset to include in each sample.
        n_neighbors (int): Number of neighbors for LOF and KNN algorithms
            (ABOD is constructed with its default settings).
        random_state (int or None): Seed for the row sampling. With the default ``None``
            the module-level ``random`` generator is used, exactly as before; pass an
            int to make repeated calls draw the same samples.

    Returns:
        pd.DataFrame: A DataFrame containing the indices and averaged combined scores
        (columns 'Index' and 'Avg_Combined_Outlier_Score'). Only instances drawn in at
        least one sample get a row.
    """
    # Standardize the data once, on the full dataset
    data_std = StandardScaler().fit_transform(data)

    n_rows = data.shape[0]
    rows_per_sample = int(sample_size * n_rows)

    # A private seeded generator gives reproducible draws without touching global state;
    # without a seed we fall back to the shared `random` module.
    rng = random if random_state is None else random.Random(random_state)

    # Cumulative outlier score and number of appearances for each instance
    combined_scores = defaultdict(lambda: {'score': 0, 'count': 0})

    # fit_transform refits on every call, so one scaler can be reused throughout
    minmax_scaler = MinMaxScaler()

    for _ in range(n_samples):
        # Draw a random subset of rows (without replacement)
        sample_indices = rng.sample(range(n_rows), rows_per_sample)
        sample_data = data_std[sample_indices]

        # Fresh detectors per sample; for all three, higher score -> more outlier
        detectors = (
            KNN(n_neighbors=n_neighbors, method='mean'),
            LOF(n_neighbors=n_neighbors),
            ABOD(),
        )

        normalized_scores = []
        for detector in detectors:
            detector.fit(sample_data)
            raw_scores = detector.decision_scores_
            # Rescale to a common range so no single detector dominates by scale alone
            normalized_scores.append(
                minmax_scaler.fit_transform(raw_scores.reshape(-1, 1)).flatten()
            )

        # Combined score = element-wise maximum over the three normalized score vectors
        combined_sample_scores = np.maximum.reduce(normalized_scores)

        # Accumulate per original row index
        for idx, score in zip(sample_indices, combined_sample_scores):
            combined_scores[idx]['score'] += score
            combined_scores[idx]['count'] += 1

    # Average over the samples each instance appeared in
    avg_combined_scores = {idx: entry['score'] / entry['count'] for idx, entry in combined_scores.items()}

    combined_scores_df = pd.DataFrame(list(avg_combined_scores.items()), columns=['Index', 'Avg_Combined_Outlier_Score'])

    return combined_scores_df

def evaluate_knn_LOF_ABOD_max_outlier_detection(outlier_scores_df, ground_truth, threshold_percentile=90):
    """
    Evaluates the outlier detection performance using precision, recall, and AUC.

    Instances whose score is at or above the ``threshold_percentile``-th percentile of all
    scores are predicted as outliers; AUC is computed from the raw scores.

    Parameters:
        outlier_scores_df (pd.DataFrame): DataFrame with an 'Index' column (positional row
            number of each scored instance) and an 'Avg_Combined_Outlier_Score' column.
        ground_truth (pd.Series or numpy array): Ground truth labels (1 = outlier, 0 = normal),
            ordered like the rows of the dataset that was scored.
        threshold_percentile (float): Percentile of the scores used as decision threshold.
            Defaults to 90.

    Returns:
        tuple: Precision, Recall, and AUC scores.
    """
    # Sort by instance index so scores and labels line up row for row
    outlier_scores_df = outlier_scores_df.sort_values(by='Index').reset_index(drop=True)

    # 'Index' holds positional row numbers, so labels are looked up by position.
    # np.asarray makes this work for Series and plain arrays alike (an ndarray has
    # no `.values`) and avoids label-based lookup on a Series with a custom index.
    row_positions = outlier_scores_df['Index'].to_numpy(dtype=int)
    ground_truth = np.asarray(ground_truth)[row_positions]

    # Extract average outlier scores
    outlier_scores = outlier_scores_df['Avg_Combined_Outlier_Score'].values

    # Everything at or above the chosen percentile is classified as an outlier (1), the rest as normal (0)
    threshold = np.percentile(outlier_scores, threshold_percentile)
    predictions = (outlier_scores >= threshold).astype(int)

    # Precision/recall use the thresholded predictions; AUC uses the continuous scores
    precision = precision_score(ground_truth, predictions)
    recall = recall_score(ground_truth, predictions)
    auc = roc_auc_score(ground_truth, outlier_scores)

    return precision, recall, auc



In [153]:
# KNN LOF ABOD with max
# dataset: Annthyroid_norm_02_v01.csv

# Read the dataset into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Annthyroid\\Annthyroid_norm_02_v01.csv"
df = pd.read_csv(file_path)

# The final column holds the labels; every column before it is feature data.
labels = df.iloc[:, -1].values
data = df.iloc[:, :-1].values

# Score the instances with the max-combined KNN/LOF/ABOD scores (15 random samples, 80% of the rows each).
combined_scores_df = calculate_knn_LOF_ABOD_max_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
    n_neighbors=20,
)

# Compare the scores with the labels and report the metrics.
precision, recall, auc = evaluate_knn_LOF_ABOD_max_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)
print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.06461086637298091
Recall: 0.3235294117647059
AUC: 0.7663551917391114


In [154]:
# KNN LOF ABOD with max
# dataset: Arrhythmia_withoutdupl_norm_02_v01.csv

# Read the dataset into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Arrhythmia\\Arrhythmia_withoutdupl_norm_02_v01.csv"
df = pd.read_csv(file_path)

# The final column holds the labels; every column before it is feature data.
labels = df.iloc[:, -1].values
data = df.iloc[:, :-1].values

# Score the instances with the max-combined KNN/LOF/ABOD scores (15 random samples, 80% of the rows each).
combined_scores_df = calculate_knn_LOF_ABOD_max_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
    n_neighbors=20,
)

# Compare the scores with the labels and report the metrics.
precision, recall, auc = evaluate_knn_LOF_ABOD_max_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)
print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.04
Recall: 0.3333333333333333
AUC: 0.7978142076502732


In [175]:
# KNN LOF ABOD with max
# dataset: Cardiotocography_norm_02_v01.csv

# Read the dataset into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Cardiotocography\\Cardiotocography_norm_02_v01.csv"
df = pd.read_csv(file_path)

# The final column holds the labels; every column before it is feature data.
labels = df.iloc[:, -1].values
data = df.iloc[:, :-1].values

# Score the instances with the max-combined KNN/LOF/ABOD scores (15 random samples, 80% of the rows each).
combined_scores_df = calculate_knn_LOF_ABOD_max_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
    n_neighbors=20,
)

# Compare the scores with the labels and report the metrics.
precision, recall, auc = evaluate_knn_LOF_ABOD_max_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)
print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.05917159763313609
Recall: 0.3125
AUC: 0.7126321752265861


In [156]:
# KNN LOF ABOD with max
# dataset: HeartDisease_withoutdupl_norm_02_v01.csv

# Read the dataset into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\HeartDisease\\HeartDisease_withoutdupl_norm_02_v01.csv"
df = pd.read_csv(file_path)

# The final column holds the labels; every column before it is feature data.
labels = df.iloc[:, -1].values
data = df.iloc[:, :-1].values

# Score the instances with the max-combined KNN/LOF/ABOD scores (15 random samples, 80% of the rows each).
combined_scores_df = calculate_knn_LOF_ABOD_max_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
    n_neighbors=20,
)

# Compare the scores with the labels and report the metrics.
precision, recall, auc = evaluate_knn_LOF_ABOD_max_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)
print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.0
Recall: 0.0
AUC: 0.7433333333333334


In [20]:
# KNN LOF ABOD with max
# dataset: Hepatitis_withoutdupl_norm_05_v01.csv

# Read the dataset into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Hepatitis\\Hepatitis_withoutdupl_norm_05_v01.csv"
df = pd.read_csv(file_path)

# The final column holds the labels; every column before it is feature data.
labels = df.iloc[:, -1].values
data = df.iloc[:, :-1].values

# Score the instances with the max-combined KNN/LOF/ABOD scores (15 random samples, 80% of the rows each).
combined_scores_df = calculate_knn_LOF_ABOD_max_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
    n_neighbors=20,
)

# Compare the scores with the labels and report the metrics.
precision, recall, auc = evaluate_knn_LOF_ABOD_max_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)
print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.0
Recall: 0.0
AUC: 0.7373737373737373


In [158]:
# KNN LOF ABOD with max
# dataset: Lymphography_withoutdupl_norm_1ofn.csv

# Read the dataset into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Lymphography\\Lymphography_withoutdupl_norm_1ofn.csv"
df = pd.read_csv(file_path)

# For this file the labels are taken from the second-to-last column, and only the
# columns before it are used as feature data (the very last column is left out).
labels = df.iloc[:, -2].values
data = df.iloc[:, :-2].values

# Score the instances with the max-combined KNN/LOF/ABOD scores (15 random samples, 80% of the rows each).
combined_scores_df = calculate_knn_LOF_ABOD_max_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
    n_neighbors=20,
)

# Compare the scores with the labels and report the metrics.
precision, recall, auc = evaluate_knn_LOF_ABOD_max_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)
print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.4
Recall: 1.0
AUC: 0.9763593380614658


In [159]:
# KNN LOF ABOD with max
# dataset: Parkinson_withoutdupl_norm_05_v01.csv

# Read the dataset into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Parkinson\\Parkinson_withoutdupl_norm_05_v01.csv"
df = pd.read_csv(file_path)

# The final column holds the labels; every column before it is feature data.
labels = df.iloc[:, -1].values
data = df.iloc[:, :-1].values

# Score the instances with the max-combined KNN/LOF/ABOD scores (15 random samples, 80% of the rows each).
combined_scores_df = calculate_knn_LOF_ABOD_max_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
    n_neighbors=20,
)

# Compare the scores with the labels and report the metrics.
precision, recall, auc = evaluate_knn_LOF_ABOD_max_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)
print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.2
Recall: 1.0
AUC: 0.9583333333333334


In [160]:
# KNN LOF ABOD with max
# dataset: Pima_withoutdupl_norm_02_v01.csv

# Read the dataset into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Pima\\Pima_withoutdupl_norm_02_v01.csv"
df = pd.read_csv(file_path)

# The final column holds the labels; every column before it is feature data.
labels = df.iloc[:, -1].values
data = df.iloc[:, :-1].values

# Score the instances with the max-combined KNN/LOF/ABOD scores (15 random samples, 80% of the rows each).
combined_scores_df = calculate_knn_LOF_ABOD_max_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
    n_neighbors=20,
)

# Compare the scores with the labels and report the metrics.
precision, recall, auc = evaluate_knn_LOF_ABOD_max_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)
print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.0392156862745098
Recall: 0.2222222222222222
AUC: 0.6866666666666666


KNN LOF ABOD with min

In [161]:
import numpy as np
import pandas as pd
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.models.abod import ABOD
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from collections import defaultdict
import random

def calculate_knn_LOF_ABOD_min_outlier_scores(data, n_samples=5, sample_size=0.8, n_neighbors=20, random_state=None):
    """
    Calculates combined outlier scores using the minimum score from LOF, ABOD, and KNN on random samples
    and averages scores for data points appearing in multiple samples.

    Within each sample every detector's scores are min-max normalized before the
    element-wise minimum is taken, so the three detectors are compared on a common scale.

    Parameters:
        data (numpy array): Input dataset of shape (n_samples, n_features).
        n_samples (int): Number of random samples to create.
        sample_size (float): Proportion of the dataset to include in each sample.
        n_neighbors (int): Number of neighbors for LOF and KNN algorithms
            (ABOD is constructed with its default settings).
        random_state (int or None): Seed for the row sampling. With the default ``None``
            the module-level ``random`` generator is used, exactly as before; pass an
            int to make repeated calls draw the same samples.

    Returns:
        pd.DataFrame: A DataFrame containing the indices and averaged combined scores
        (columns 'Index' and 'Avg_Combined_Outlier_Score'). Only instances drawn in at
        least one sample get a row.
    """
    # Standardize the data once, on the full dataset
    data_std = StandardScaler().fit_transform(data)

    n_rows = data.shape[0]
    rows_per_sample = int(sample_size * n_rows)

    # A private seeded generator gives reproducible draws without touching global state;
    # without a seed we fall back to the shared `random` module.
    rng = random if random_state is None else random.Random(random_state)

    # Cumulative outlier score and number of appearances for each instance
    combined_scores = defaultdict(lambda: {'score': 0, 'count': 0})

    # fit_transform refits on every call, so one scaler can be reused throughout
    minmax_scaler = MinMaxScaler()

    for _ in range(n_samples):
        # Draw a random subset of rows (without replacement)
        sample_indices = rng.sample(range(n_rows), rows_per_sample)
        sample_data = data_std[sample_indices]

        # Fresh detectors per sample; for all three, higher score -> more outlier
        detectors = (
            KNN(n_neighbors=n_neighbors, method='mean'),
            LOF(n_neighbors=n_neighbors),
            ABOD(),
        )

        normalized_scores = []
        for detector in detectors:
            detector.fit(sample_data)
            raw_scores = detector.decision_scores_
            # Rescale to a common range so no single detector dominates by scale alone
            normalized_scores.append(
                minmax_scaler.fit_transform(raw_scores.reshape(-1, 1)).flatten()
            )

        # Combined score = element-wise minimum over the three normalized score vectors
        combined_sample_scores = np.minimum.reduce(normalized_scores)

        # Accumulate per original row index
        for idx, score in zip(sample_indices, combined_sample_scores):
            combined_scores[idx]['score'] += score
            combined_scores[idx]['count'] += 1

    # Average over the samples each instance appeared in
    avg_combined_scores = {idx: entry['score'] / entry['count'] for idx, entry in combined_scores.items()}

    combined_scores_df = pd.DataFrame(list(avg_combined_scores.items()), columns=['Index', 'Avg_Combined_Outlier_Score'])

    return combined_scores_df

def evaluate_knn_LOF_ABOD_min_outlier_detection(outlier_scores_df, ground_truth, threshold_percentile=90):
    """
    Evaluates the outlier detection performance using precision, recall, and AUC.

    Instances whose score is at or above the ``threshold_percentile``-th percentile of all
    scores are predicted as outliers; AUC is computed from the raw scores.

    Parameters:
        outlier_scores_df (pd.DataFrame): DataFrame with an 'Index' column (positional row
            number of each scored instance) and an 'Avg_Combined_Outlier_Score' column.
        ground_truth (pd.Series or numpy array): Ground truth labels (1 = outlier, 0 = normal),
            ordered like the rows of the dataset that was scored.
        threshold_percentile (float): Percentile of the scores used as decision threshold.
            Defaults to 90.

    Returns:
        tuple: Precision, Recall, and AUC scores.
    """
    # Sort by instance index so scores and labels line up row for row
    outlier_scores_df = outlier_scores_df.sort_values(by='Index').reset_index(drop=True)

    # 'Index' holds positional row numbers, so labels are looked up by position.
    # np.asarray makes this work for Series and plain arrays alike (an ndarray has
    # no `.values`) and avoids label-based lookup on a Series with a custom index.
    row_positions = outlier_scores_df['Index'].to_numpy(dtype=int)
    ground_truth = np.asarray(ground_truth)[row_positions]

    # Extract average outlier scores
    outlier_scores = outlier_scores_df['Avg_Combined_Outlier_Score'].values

    # Everything at or above the chosen percentile is classified as an outlier (1), the rest as normal (0)
    threshold = np.percentile(outlier_scores, threshold_percentile)
    predictions = (outlier_scores >= threshold).astype(int)

    # Precision/recall use the thresholded predictions; AUC uses the continuous scores
    precision = precision_score(ground_truth, predictions)
    recall = recall_score(ground_truth, predictions)
    auc = roc_auc_score(ground_truth, outlier_scores)

    return precision, recall, auc




In [162]:
# KNN LOF ABOD with min
# dataset: Annthyroid_norm_02_v01.csv

# Read the dataset into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Annthyroid\\Annthyroid_norm_02_v01.csv"
df = pd.read_csv(file_path)

# The final column holds the labels; every column before it is feature data.
labels = df.iloc[:, -1].values
data = df.iloc[:, :-1].values

# Score the instances with the min-combined KNN/LOF/ABOD scores (15 random samples, 80% of the rows each).
combined_scores_df = calculate_knn_LOF_ABOD_min_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
    n_neighbors=20,
)

# Compare the scores with the labels and report the metrics.
precision, recall, auc = evaluate_knn_LOF_ABOD_min_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)
print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.07195301027900147
Recall: 0.3602941176470588
AUC: 0.7485437535854552


In [163]:
# KNN LOF ABOD with min
# dataset: Arrhythmia_withoutdupl_norm_02_v01.csv

# Read the dataset into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Arrhythmia\\Arrhythmia_withoutdupl_norm_02_v01.csv"
df = pd.read_csv(file_path)

# The final column holds the labels; every column before it is feature data.
labels = df.iloc[:, -1].values
data = df.iloc[:, :-1].values

# Score the instances with the min-combined KNN/LOF/ABOD scores (15 random samples, 80% of the rows each).
combined_scores_df = calculate_knn_LOF_ABOD_min_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
    n_neighbors=20,
)

# Compare the scores with the labels and report the metrics.
precision, recall, auc = evaluate_knn_LOF_ABOD_min_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)
print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.04
Recall: 0.3333333333333333
AUC: 0.8579234972677596


In [164]:
# KNN LOF ABOD with min
# dataset: Cardiotocography_norm_02_v01.csv

# Read the dataset into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Cardiotocography\\Cardiotocography_norm_02_v01.csv"
df = pd.read_csv(file_path)

# The final column holds the labels; every column before it is feature data.
labels = df.iloc[:, -1].values
data = df.iloc[:, :-1].values

# Score the instances with the min-combined KNN/LOF/ABOD scores (15 random samples, 80% of the rows each).
combined_scores_df = calculate_knn_LOF_ABOD_min_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
    n_neighbors=20,
)

# Compare the scores with the labels and report the metrics.
precision, recall, auc = evaluate_knn_LOF_ABOD_min_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)
print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.0650887573964497
Recall: 0.34375
AUC: 0.7973942598187311


In [165]:
# KNN LOF ABOD with min
# dataset: HeartDisease_withoutdupl_norm_02_v01.csv

# Read the dataset into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\HeartDisease\\HeartDisease_withoutdupl_norm_02_v01.csv"
df = pd.read_csv(file_path)

# The final column holds the labels; every column before it is feature data.
labels = df.iloc[:, -1].values
data = df.iloc[:, :-1].values

# Score the instances with the min-combined KNN/LOF/ABOD scores (15 random samples, 80% of the rows each).
combined_scores_df = calculate_knn_LOF_ABOD_min_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
    n_neighbors=20,
)

# Compare the scores with the labels and report the metrics.
precision, recall, auc = evaluate_knn_LOF_ABOD_min_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)
print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.0
Recall: 0.0
AUC: 0.7066666666666667


In [166]:
# KNN LOF ABOD with min
# dataset: Hepatitis_withoutdupl_norm_05_v01.csv

# Read the dataset into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Hepatitis\\Hepatitis_withoutdupl_norm_05_v01.csv"
df = pd.read_csv(file_path)

# The final column holds the labels; every column before it is feature data.
labels = df.iloc[:, -1].values
data = df.iloc[:, :-1].values

# Score the instances with the min-combined KNN/LOF/ABOD scores (15 random samples, 80% of the rows each).
combined_scores_df = calculate_knn_LOF_ABOD_min_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
    n_neighbors=20,
)

# Compare the scores with the labels and report the metrics.
precision, recall, auc = evaluate_knn_LOF_ABOD_min_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)
print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.2857142857142857
Recall: 0.6666666666666666
AUC: 0.8282828282828283


In [167]:
# KNN LOF ABOD with min
# dataset: Lymphography_withoutdupl_norm_1ofn.csv

# Read the dataset into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Lymphography\\Lymphography_withoutdupl_norm_1ofn.csv"
df = pd.read_csv(file_path)

# For this file the labels are taken from the second-to-last column, and only the
# columns before it are used as feature data (the very last column is left out).
labels = df.iloc[:, -2].values
data = df.iloc[:, :-2].values

# Score the instances with the min-combined KNN/LOF/ABOD scores (15 random samples, 80% of the rows each).
combined_scores_df = calculate_knn_LOF_ABOD_min_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
    n_neighbors=20,
)

# Compare the scores with the labels and report the metrics.
precision, recall, auc = evaluate_knn_LOF_ABOD_min_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)
print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.4
Recall: 1.0
AUC: 0.9917257683215129


In [168]:
# KNN LOF ABOD with min
# dataset: Parkinson_withoutdupl_norm_05_v01.csv

# Read the dataset into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Parkinson\\Parkinson_withoutdupl_norm_05_v01.csv"
df = pd.read_csv(file_path)

# The final column holds the labels; every column before it is feature data.
labels = df.iloc[:, -1].values
data = df.iloc[:, :-1].values

# Score the instances with the min-combined KNN/LOF/ABOD scores (15 random samples, 80% of the rows each).
combined_scores_df = calculate_knn_LOF_ABOD_min_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
    n_neighbors=20,
)

# Compare the scores with the labels and report the metrics.
precision, recall, auc = evaluate_knn_LOF_ABOD_min_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)
print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.2
Recall: 1.0
AUC: 0.9583333333333334


In [169]:
# KNN LOF ABOD with min
# dataset: Pima_withoutdupl_norm_02_v01.csv

# Read the dataset into a DataFrame.
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Pima\\Pima_withoutdupl_norm_02_v01.csv"
df = pd.read_csv(file_path)

# The final column holds the labels; every column before it is feature data.
labels = df.iloc[:, -1].values
data = df.iloc[:, :-1].values

# Score the instances with the min-combined KNN/LOF/ABOD scores (15 random samples, 80% of the rows each).
combined_scores_df = calculate_knn_LOF_ABOD_min_outlier_scores(
    data,
    n_samples=15,
    sample_size=0.8,
    n_neighbors=20,
)

# Compare the scores with the labels and report the metrics.
precision, recall, auc = evaluate_knn_LOF_ABOD_min_outlier_detection(
    combined_scores_df,
    pd.Series(labels),
)
print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.0
Recall: 0.0
AUC: 0.6355555555555555


dataset transformation

In [170]:
import csv

def process_arff_to_csv(input_file_path, output_file_path, label_index=-1):
    """
    Converts the @DATA section of an ARFF file into a headerless CSV file.

    Everything up to and including the @DATA marker (matched
    case-insensitively) is discarded. Each following non-empty line is split
    on commas and written as one CSV row. The field at `label_index` is
    rewritten to 1 when it equals 'yes' and to 0 when it equals 'no' (single
    quotes included, surrounding whitespace ignored); any other value is
    written unchanged. ARFF comment lines (starting with '%') are skipped.

    Notes:
        - No header row is written. pandas' default `read_csv` treats the
          first row of a file as column names, so pass `header=None` when
          reading the result if the first instance must be kept as data.
        - Rows are split with a plain `str.split(",")`; quoted ARFF values
          that themselves contain commas are not supported.

    Parameters:
        input_file_path (str): Path to the input .arff file.
        output_file_path (str): Path to save the output .csv file.
        label_index (int): Position of the yes/no label within each row.
            Defaults to -1 (the last column).
    """
    label_codes = {"'yes'": 1, "'no'": 0}
    data_section = False  # Flag to start processing after @DATA

    with open(input_file_path, 'r') as infile, open(output_file_path, 'w', newline='') as outfile:
        csv_writer = csv.writer(outfile)

        for line in infile:
            line = line.strip()

            # Check for @DATA to start processing
            if line.upper() == "@DATA":
                data_section = True
                continue

            # Ignore the header, blank lines, and ARFF comments; a '%' line is
            # never an instance and must not end up as a CSV row.
            if not data_section or not line or line.startswith("%"):
                continue

            values = line.split(",")

            # Map the yes/no label to 1/0; leave anything else untouched.
            label = values[label_index].strip()
            if label in label_codes:
                values[label_index] = label_codes[label]

            csv_writer.writerow(values)

# ARFF source and CSV destination for each dataset. Number 6 is not listed
# here: the Lymphography file is converted separately with
# process_arff_to_csv_Lym.

# Annthyroid
input_file1 = "G:\\Nazanin\\B project\\code\\dataset\\Annthyroid\\Annthyroid\\Annthyroid_norm_02_v01.arff"
output_file1 = "G:\\Nazanin\\B project\\code\\dataset\\Annthyroid\\Annthyroid_norm_02_v01.csv"

# Arrhythmia
input_file2 = "G:\\Nazanin\\B project\\code\\dataset\\Arrhythmia\\Arrhythmia\\Arrhythmia_withoutdupl_norm_02_v01.arff"
output_file2 = "G:\\Nazanin\\B project\\code\\dataset\\Arrhythmia\\Arrhythmia_withoutdupl_norm_02_v01.csv"

# Cardiotocography
input_file3 = "G:\\Nazanin\\B project\\code\\dataset\\Cardiotocography\\Cardiotocography\\Cardiotocography_norm_02_v01.arff"
output_file3 = "G:\\Nazanin\\B project\\code\\dataset\\Cardiotocography\\Cardiotocography_norm_02_v01.csv"

# HeartDisease
input_file4 = "G:\\Nazanin\\B project\\code\\dataset\\HeartDisease\\HeartDisease\\HeartDisease_withoutdupl_norm_02_v01.arff"
output_file4 = "G:\\Nazanin\\B project\\code\\dataset\\HeartDisease\\HeartDisease_withoutdupl_norm_02_v01.csv"

# Hepatitis
input_file5 = "G:\\Nazanin\\B project\\code\\dataset\\Hepatitis\\Hepatitis\\Hepatitis_withoutdupl_norm_05_v01.arff"
output_file5 = "G:\\Nazanin\\B project\\code\\dataset\\Hepatitis\\Hepatitis_withoutdupl_norm_05_v01.csv"

# Parkinson
input_file7 = "G:\\Nazanin\\B project\\code\\dataset\\Parkinson\\Parkinson\\Parkinson_withoutdupl_norm_05_v01.arff"
output_file7 = "G:\\Nazanin\\B project\\code\\dataset\\Parkinson\\Parkinson_withoutdupl_norm_05_v01.csv"

# Pima
input_file8 = "G:\\Nazanin\\B project\\code\\dataset\\Pima\\Pima\\Pima_withoutdupl_norm_02_v01.arff"
output_file8 = "G:\\Nazanin\\B project\\code\\dataset\\Pima\\Pima_withoutdupl_norm_02_v01.csv"

# Convert every ARFF file to CSV, in the order the paths are declared above.
for _arff_path, _csv_path in (
    (input_file1, output_file1),
    (input_file2, output_file2),
    (input_file3, output_file3),
    (input_file4, output_file4),
    (input_file5, output_file5),
    (input_file7, output_file7),
    (input_file8, output_file8),
):
    process_arff_to_csv(_arff_path, _csv_path)

print("successfully converted all datasets to csv file.")


successfully converted all datasets to csv file.


In [171]:
def process_arff_to_csv_Lym(input_file_path, output_file_path, label_index=-2):
    """
    Converts the @DATA section of an ARFF file into a headerless CSV file,
    for files whose yes/no label is the second-to-last field of each row.

    Everything up to and including the @DATA marker (matched
    case-insensitively) is discarded. Each following non-empty line is split
    on commas and written as one CSV row. The field at `label_index` is
    rewritten to 1 when it equals 'yes' and to 0 when it equals 'no' (single
    quotes included, surrounding whitespace ignored); any other value is
    written unchanged. ARFF comment lines (starting with '%') are skipped.

    Notes:
        - No header row is written. pandas' default `read_csv` treats the
          first row of a file as column names, so pass `header=None` when
          reading the result if the first instance must be kept as data.
        - Rows are split with a plain `str.split(",")`; quoted ARFF values
          that themselves contain commas are not supported.

    Parameters:
        input_file_path (str): Path to the input .arff file.
        output_file_path (str): Path to save the output .csv file.
        label_index (int): Position of the yes/no label within each row.
            Defaults to -2 (the second-to-last column).

    Raises:
        IndexError: If a data row has too few fields for `label_index`.
    """
    label_codes = {"'yes'": 1, "'no'": 0}
    data_section = False  # Flag to start processing after @DATA

    with open(input_file_path, 'r') as infile, open(output_file_path, 'w', newline='') as outfile:
        csv_writer = csv.writer(outfile)

        for line in infile:
            line = line.strip()

            # Check for @DATA to start processing
            if line.upper() == "@DATA":
                data_section = True
                continue

            # Ignore the header, blank lines, and ARFF comments; a '%' line is
            # never an instance and must not end up as a CSV row.
            if not data_section or not line or line.startswith("%"):
                continue

            values = line.split(",")

            # Map the yes/no label (second-to-last field by default) to 1/0;
            # leave anything else untouched.
            label = values[label_index].strip()
            if label in label_codes:
                values[label_index] = label_codes[label]

            csv_writer.writerow(values)


# File paths for the Lymphography dataset (ARFF input, CSV output)
input_file6 = "G:\\Nazanin\\B project\\code\\dataset\\Lymphography\\Lymphography\\Lymphography_withoutdupl_norm_1ofn.arff"
output_file6 = "G:\\Nazanin\\B project\\code\\dataset\\Lymphography\\Lymphography_withoutdupl_norm_1ofn.csv"

# Converted with the _Lym variant, which maps the yes/no value found in the
# second-to-last field of each row rather than the last one.
process_arff_to_csv_Lym(input_file6, output_file6)
