In [35]:
#ECOD 2022
import numpy as np
import pandas as pd
from pyod.models.ecod import ECOD
from sklearn.metrics import precision_score, recall_score, roc_auc_score

def calculate_ecod_outlier_scores(data):
    """
    Calculate outlier scores using the ECOD model.

    Args:
        data: Input dataset (numpy array or pandas DataFrame).

    Returns:
        outlier_scores: Array of outlier scores.
    """
    # Apply ECOD from pyod
    ecod = ECOD()
    ecod.fit(data)

    # Get the outlier scores
    outlier_scores = ecod.decision_scores_  # higher score -> more outlier

    return outlier_scores

def evaluate_ecod_outlier_detection(outlier_scores, ground_truth):
    """
    Evaluate ECOD outlier detection results using precision, recall, and AUC.

    Args:
        outlier_scores: Array of outlier scores.
        ground_truth: Ground truth labels (numpy array or pandas Series).

    Returns:
        precision: Precision of the model.
        recall: Recall of the model.
        auc: AUC of the model.
    """
    # Calculate precision, recall, and AUC
    threshold = np.percentile(outlier_scores, 90)  # Setting threshold at the 90th percentile for demonstration
    predictions = (outlier_scores >= threshold).astype(int)  # Classify as 1 if above threshold (outlier), else 0

    precision = precision_score(ground_truth, predictions)
    recall = recall_score(ground_truth, predictions)
    auc = roc_auc_score(ground_truth, outlier_scores)

    return precision, recall, auc



In [36]:
# ECOD
# dataset: Annthyroid_norm_02_v01.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# File path to the dataset
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Annthyroid\\Annthyroid_norm_02_v01.csv"

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values

# Separate labels (last column)
labels = df.iloc[:, -1].values

# Split the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=42)

# Calculate outlier scores on the training set
predicted_outlier_scores = calculate_ecod_outlier_scores(X_train)
        
# Evaluate the performance on the validation set
val_outlier_scores = calculate_ecod_outlier_scores(X_val)
        
# Evaluate the performance
precision, recall, auc = evaluate_ecod_outlier_detection(val_outlier_scores, y_val)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)

Precision: 0.051094890510948905
Recall: 0.25925925925925924
AUC: 0.738047642845244


In [37]:
# ECOD
# dataset: Arrhythmia_withoutdupl_norm_02_v01.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# File path to the dataset
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Arrhythmia\\Arrhythmia_withoutdupl_norm_02_v01.csv"

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values

# Separate labels (last column)
labels = df.iloc[:, -1].values

# Split the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=42)

# Calculate outlier scores on the training set
predicted_outlier_scores = calculate_ecod_outlier_scores(X_train)
        
# Evaluate the performance on the validation set
val_outlier_scores = calculate_ecod_outlier_scores(X_val)
        
# Evaluate the performance
precision, recall, auc = evaluate_ecod_outlier_detection(val_outlier_scores, y_val)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.0
Recall: 0.0
AUC: 0.26530612244897955


In [38]:
# ECOD
# dataset: Cardiotocography_norm_02_v01.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# File path to the dataset
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Cardiotocography\\Cardiotocography_norm_02_v01.csv"

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values

# Separate labels (last column)
labels = df.iloc[:, -1].values

# Split the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=42)

# Calculate outlier scores on the training set
predicted_outlier_scores = calculate_ecod_outlier_scores(X_train)
        
# Evaluate the performance on the validation set
val_outlier_scores = calculate_ecod_outlier_scores(X_val)
        
# Evaluate the performance
precision, recall, auc = evaluate_ecod_outlier_detection(val_outlier_scores, y_val)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.058823529411764705
Recall: 0.3333333333333333
AUC: 0.8679718875502008


In [39]:
# ECOD
# dataset: HeartDisease_withoutdupl_norm_02_v01.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# File path to the dataset
file_path = "G:\\Nazanin\\B project\\code\\dataset\\HeartDisease\\HeartDisease_withoutdupl_norm_02_v01.csv"

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values

# Separate labels (last column)
labels = df.iloc[:, -1].values

# Split the data into training and validation sets (70% train, 30% validation)
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.3, stratify=labels, random_state=42)

# Calculate outlier scores on the training set
predicted_outlier_scores = calculate_ecod_outlier_scores(X_train)
        
# Evaluate the performance on the validation set
val_outlier_scores = calculate_ecod_outlier_scores(X_val)
        
# Evaluate the performance
precision, recall, auc = evaluate_ecod_outlier_detection(val_outlier_scores, y_val)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)

Precision: 0.2
Recall: 1.0
AUC: 0.9111111111111111


In [40]:
# ECOD
# dataset: Hepatitis_withoutdupl_norm_05_v01.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# File path to the dataset
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Hepatitis\\Hepatitis_withoutdupl_norm_05_v01.csv"

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values

# Separate labels (last column)
labels = df.iloc[:, -1].values

# Split the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=42)

# Calculate outlier scores on the training set
predicted_outlier_scores = calculate_ecod_outlier_scores(X_train)
        
# Evaluate the performance on the validation set
val_outlier_scores = calculate_ecod_outlier_scores(X_val)
        
# Evaluate the performance
precision, recall, auc = evaluate_ecod_outlier_detection(val_outlier_scores, y_val)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)

Precision: 0.5
Recall: 1.0
AUC: 1.0


In [41]:
# ECOD
# dataset: Lymphography_withoutdupl_norm_1ofn.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# File path to the dataset
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Lymphography\\Lymphography_withoutdupl_norm_1ofn.csv"

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-2].values

# Separate labels (last column)
labels = df.iloc[:, -2].values

# Split the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=42)

# Calculate outlier scores on the training set
predicted_outlier_scores = calculate_ecod_outlier_scores(X_train)
        
# Evaluate the performance on the validation set
val_outlier_scores = calculate_ecod_outlier_scores(X_val)
        
# Evaluate the performance
precision, recall, auc = evaluate_ecod_outlier_detection(val_outlier_scores, y_val)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.3333333333333333
Recall: 1.0
AUC: 1.0


In [42]:
# ECOD
# dataset: Parkinson_withoutdupl_norm_05_v01.csv

file_path = "G:\\Nazanin\\B project\\code\\dataset\\Parkinson\\Parkinson_withoutdupl_norm_05_v01.csv"
# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values
    
    # Separate labels (last column)
labels = df.iloc[:, -1].values


outlier_scores = calculate_ecod_outlier_scores(data)

# Evaluate the outlier detection performance
precision, recall, auc = evaluate_ecod_outlier_detection(outlier_scores, labels)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.2
Recall: 1.0
AUC: 0.9583333333333334


In [43]:
# ECOD
# dataset: Pima_withoutdupl_norm_02_v01.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# File path to the dataset
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Pima\\Pima_withoutdupl_norm_02_v01.csv"

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values

# Separate labels (last column)
labels = df.iloc[:, -1].values

# Split the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=42)

# Calculate outlier scores on the training set
predicted_outlier_scores = calculate_ecod_outlier_scores(X_train)
        
# Evaluate the performance on the validation set
val_outlier_scores = calculate_ecod_outlier_scores(X_val)
        
# Evaluate the performance
precision, recall, auc = evaluate_ecod_outlier_detection(val_outlier_scores, y_val)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.0
Recall: 0.0
AUC: 0.5


In [44]:
#COPOD 2020
import numpy as np
import pandas as pd
from pyod.models.copod import COPOD
from sklearn.metrics import precision_score, recall_score, roc_auc_score

def calculate_copod_outlier_scores(data):
    """
    Calculate outlier scores using the COPOD model.

    Args:
        data: Input dataset (numpy array or pandas DataFrame).

    Returns:
        outlier_scores: Array of outlier scores.
    """
    # Apply COPOD from pyod
    copod = COPOD()
    copod.fit(data)

    # Get the outlier scores
    outlier_scores = copod.decision_scores_  # higher score -> more outlier

    return outlier_scores

def evaluate_copod_outlier_detection(outlier_scores, ground_truth):
    """
    Evaluate ECOD outlier detection results using precision, recall, and AUC.

    Args:
        outlier_scores: Array of outlier scores.
        ground_truth: Ground truth labels (numpy array or pandas Series).

    Returns:
        precision: Precision of the model.
        recall: Recall of the model.
        auc: AUC of the model.
    """
    # Calculate precision, recall, and AUC
    threshold = np.percentile(outlier_scores, 90)  # Setting threshold at the 90th percentile for demonstration
    predictions = (outlier_scores >= threshold).astype(int)  # Classify as 1 if above threshold (outlier), else 0

    precision = precision_score(ground_truth, predictions)
    recall = recall_score(ground_truth, predictions)
    auc = roc_auc_score(ground_truth, outlier_scores)

    return precision, recall, auc



In [50]:
# COPOD
# dataset: Annthyroid_norm_02_v01.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# File path to the dataset
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Annthyroid\\Annthyroid_norm_02_v01.csv"

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values

# Separate labels (last column)
labels = df.iloc[:, -1].values

# Split the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=42)

# Calculate outlier scores on the training set
predicted_outlier_scores = calculate_copod_outlier_scores(X_train)
        
# Evaluate the performance on the validation set
val_outlier_scores = calculate_copod_outlier_scores(X_val)
        
# Evaluate the performance
precision, recall, auc = evaluate_copod_outlier_detection(val_outlier_scores, y_val)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)



Precision: 0.051094890510948905
Recall: 0.25925925925925924
AUC: 0.6914598256427342


In [51]:
# COPOD
# dataset: Arrhythmia_withoutdupl_norm_02_v01.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# File path to the dataset
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Arrhythmia\\Arrhythmia_withoutdupl_norm_02_v01.csv"

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values

# Separate labels (last column)
labels = df.iloc[:, -1].values

# Split the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=42)

# Calculate outlier scores on the training set
predicted_outlier_scores = calculate_copod_outlier_scores(X_train)
        
# Evaluate the performance on the validation set
val_outlier_scores = calculate_copod_outlier_scores(X_val)
        
# Evaluate the performance
precision, recall, auc = evaluate_copod_outlier_detection(val_outlier_scores, y_val)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.0
Recall: 0.0
AUC: 0.40816326530612246


In [52]:
# COPOD
# dataset: Cardiotocography_norm_02_v01.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# File path to the dataset
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Cardiotocography\\Cardiotocography_norm_02_v01.csv"

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values

# Separate labels (last column)
labels = df.iloc[:, -1].values

# Split the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=42)

# Calculate outlier scores on the training set
predicted_outlier_scores = calculate_copod_outlier_scores(X_train)
        
# Evaluate the performance on the validation set
val_outlier_scores = calculate_copod_outlier_scores(X_val)
        
# Evaluate the performance
precision, recall, auc = evaluate_copod_outlier_detection(val_outlier_scores, y_val)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.058823529411764705
Recall: 0.3333333333333333
AUC: 0.7966867469879517


In [53]:
# COPOD
# dataset: HeartDisease_withoutdupl_norm_02_v01.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# File path to the dataset
file_path = "G:\\Nazanin\\B project\\code\\dataset\\HeartDisease\\HeartDisease_withoutdupl_norm_02_v01.csv"

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values

# Separate labels (last column)
labels = df.iloc[:, -1].values

# Split the data into training and validation sets (70% train, 30% validation)
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.3, stratify=labels, random_state=42)

# Calculate outlier scores on the training set
predicted_outlier_scores = calculate_copod_outlier_scores(X_train)
        
# Evaluate the performance on the validation set
val_outlier_scores = calculate_copod_outlier_scores(X_val)
        
# Evaluate the performance
precision, recall, auc = evaluate_copod_outlier_detection(val_outlier_scores, y_val)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)

Precision: 0.2
Recall: 1.0
AUC: 0.9555555555555556


In [54]:
# COPOD
# dataset: Hepatitis_withoutdupl_norm_05_v01.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# File path to the dataset
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Hepatitis\\Hepatitis_withoutdupl_norm_05_v01.csv"

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values

# Separate labels (last column)
labels = df.iloc[:, -1].values

# Split the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=42)

# Calculate outlier scores on the training set
predicted_outlier_scores = calculate_copod_outlier_scores(X_train)
        
# Evaluate the performance on the validation set
val_outlier_scores = calculate_copod_outlier_scores(X_val)
        
# Evaluate the performance
precision, recall, auc = evaluate_copod_outlier_detection(val_outlier_scores, y_val)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)

Precision: 0.5
Recall: 1.0
AUC: 1.0


In [55]:
# COPOD
# dataset: Lymphography_withoutdupl_norm_1ofn.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# File path to the dataset
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Lymphography\\Lymphography_withoutdupl_norm_1ofn.csv"

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-2].values

# Separate labels (last column)
labels = df.iloc[:, -2].values

# Split the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=42)

# Calculate outlier scores on the training set
predicted_outlier_scores = calculate_copod_outlier_scores(X_train)
        
# Evaluate the performance on the validation set
val_outlier_scores = calculate_copod_outlier_scores(X_val)
        
# Evaluate the performance
precision, recall, auc = evaluate_copod_outlier_detection(val_outlier_scores, y_val)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.3333333333333333
Recall: 1.0
AUC: 1.0


In [56]:
# COPOD
# dataset: Parkinson_withoutdupl_norm_05_v01.csv

file_path = "G:\\Nazanin\\B project\\code\\dataset\\Parkinson\\Parkinson_withoutdupl_norm_05_v01.csv"
# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values
    
    # Separate labels (last column)
labels = df.iloc[:, -1].values


outlier_scores = calculate_copod_outlier_scores(data)

# Evaluate the outlier detection performance
precision, recall, auc = evaluate_copod_outlier_detection(outlier_scores, labels)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.2
Recall: 1.0
AUC: 1.0


In [57]:
# COPOD
# dataset: Pima_withoutdupl_norm_02_v01.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# File path to the dataset
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Pima\\Pima_withoutdupl_norm_02_v01.csv"

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values

# Separate labels (last column)
labels = df.iloc[:, -1].values

# Split the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=42)

# Calculate outlier scores on the training set
predicted_outlier_scores = calculate_copod_outlier_scores(X_train)
        
# Evaluate the performance on the validation set
val_outlier_scores = calculate_copod_outlier_scores(X_val)
        
# Evaluate the performance
precision, recall, auc = evaluate_copod_outlier_detection(val_outlier_scores, y_val)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.09090909090909091
Recall: 0.5
AUC: 0.605


In [1]:
#ROD 2020
import numpy as np
import pandas as pd
from pyod.models.rod import ROD
from sklearn.metrics import precision_score, recall_score, roc_auc_score

def calculate_rod_outlier_scores(data):
    """
    Calculate outlier scores using the ROD model.

    Args:
        data: Input dataset (numpy array or pandas DataFrame).

    Returns:
        outlier_scores: Array of outlier scores.
    """
    # Apply ROD from pyod
    rod = ROD()
    rod.fit(data)

    # Get the outlier scores
    outlier_scores = rod.decision_scores_  # higher score -> more outlier

    return outlier_scores

def evaluate_rod_outlier_detection(outlier_scores, ground_truth):
    """
    Evaluate ECOD outlier detection results using precision, recall, and AUC.

    Args:
        outlier_scores: Array of outlier scores.
        ground_truth: Ground truth labels (numpy array or pandas Series).

    Returns:
        precision: Precision of the model.
        recall: Recall of the model.
        auc: AUC of the model.
    """
    # Calculate precision, recall, and AUC
    threshold = np.percentile(outlier_scores, 90)  # Setting threshold at the 90th percentile for demonstration
    predictions = (outlier_scores >= threshold).astype(int)  # Classify as 1 if above threshold (outlier), else 0

    precision = precision_score(ground_truth, predictions)
    recall = recall_score(ground_truth, predictions)
    auc = roc_auc_score(ground_truth, outlier_scores)

    return precision, recall, auc



In [3]:
# ROD
# dataset: Annthyroid_norm_02_v01.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# File path to the dataset
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Annthyroid\\Annthyroid_norm_02_v01.csv"

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values

# Separate labels (last column)
labels = df.iloc[:, -1].values

# Split the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=42)

# Calculate outlier scores on the training set
predicted_outlier_scores = calculate_rod_outlier_scores(X_train)
        
# Evaluate the performance on the validation set
val_outlier_scores = calculate_rod_outlier_scores(X_val)
        
# Evaluate the performance
precision, recall, auc = evaluate_rod_outlier_detection(val_outlier_scores, y_val)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)

Precision: 0.072992700729927
Recall: 0.37037037037037035
AUC: 0.7775556666111388


In [3]:
# ROD
# dataset: Arrhythmia_withoutdupl_norm_02_v01.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

# File path to the dataset
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Arrhythmia\\Arrhythmia_withoutdupl_norm_02_v01.csv"

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values

# Separate labels (last column)
labels = df.iloc[:, -1].values

# Split the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=42)

# Reduce to 50 dimensions for faster processing
pca = PCA(n_components=100, random_state=42)
X_train_pca = pca.fit_transform(X_train)
X_val_pca = pca.transform(X_val)

# Calculate outlier scores on the training set
predicted_outlier_scores = calculate_rod_outlier_scores(X_train_pca)
        
# Evaluate the performance on the validation set
val_outlier_scores = calculate_rod_outlier_scores(X_val_pca)
        
# Evaluate the performance
precision, recall, auc = evaluate_rod_outlier_detection(val_outlier_scores, y_val)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.0
Recall: 0.0
AUC: 0.6734693877551021


In [3]:
# ROD
# dataset: Cardiotocography_norm_02_v01.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# File path to the dataset
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Cardiotocography\\Cardiotocography_norm_02_v01.csv"

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values

# Separate labels (last column)
labels = df.iloc[:, -1].values

# Split the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=42)

# Calculate outlier scores on the training set
predicted_outlier_scores = calculate_rod_outlier_scores(X_train)
        
# Evaluate the performance on the validation set
val_outlier_scores = calculate_rod_outlier_scores(X_val)
        
# Evaluate the performance
precision, recall, auc = evaluate_rod_outlier_detection(val_outlier_scores, y_val)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


  np.arccos(np.clip(np.dot(_x, gm) / (v_norm * norm_), -1, 1)),


Precision: 0.058823529411764705
Recall: 0.3333333333333333
AUC: 0.8719879518072289


In [5]:
# ROD
# dataset: HeartDisease_withoutdupl_norm_02_v01.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# File path to the dataset
file_path = "G:\\Nazanin\\B project\\code\\dataset\\HeartDisease\\HeartDisease_withoutdupl_norm_02_v01.csv"

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values

# Separate labels (last column)
labels = df.iloc[:, -1].values

# Split the data into training and validation sets (70% train, 30% validation)
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.3, stratify=labels, random_state=42)

# Calculate outlier scores on the training set
predicted_outlier_scores = calculate_rod_outlier_scores(X_train)
        
# Evaluate the performance on the validation set
val_outlier_scores = calculate_rod_outlier_scores(X_val)
        
# Evaluate the performance
precision, recall, auc = evaluate_rod_outlier_detection(val_outlier_scores, y_val)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)

Precision: 0.2
Recall: 1.0
AUC: 0.9333333333333333


In [6]:
# ROD
# dataset: Hepatitis_withoutdupl_norm_05_v01.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# File path to the dataset
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Hepatitis\\Hepatitis_withoutdupl_norm_05_v01.csv"

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values

# Separate labels (last column)
labels = df.iloc[:, -1].values

# Split the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=42)

# Calculate outlier scores on the training set
predicted_outlier_scores = calculate_rod_outlier_scores(X_train)
        
# Evaluate the performance on the validation set
val_outlier_scores = calculate_rod_outlier_scores(X_val)
        
# Evaluate the performance
precision, recall, auc = evaluate_rod_outlier_detection(val_outlier_scores, y_val)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)

Precision: 0.0
Recall: 0.0
AUC: 0.8461538461538461


In [7]:
# ROD
# dataset: Lymphography_withoutdupl_norm_1ofn.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# File path to the dataset
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Lymphography\\Lymphography_withoutdupl_norm_1ofn.csv"

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-2].values

# Separate labels (last column)
labels = df.iloc[:, -2].values

# Split the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=42)

# Calculate outlier scores on the training set
predicted_outlier_scores = calculate_rod_outlier_scores(X_train)
        
# Evaluate the performance on the validation set
val_outlier_scores = calculate_rod_outlier_scores(X_val)
        
# Evaluate the performance
precision, recall, auc = evaluate_rod_outlier_detection(val_outlier_scores, y_val)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


  np.arccos(np.clip(np.dot(_x, gm) / (v_norm * norm_), -1, 1)),
  np.arccos(np.clip(np.dot(_x, gm) / (v_norm * norm_), -1, 1)),
  np.arccos(np.clip(np.dot(_x, gm) / (v_norm * norm_), -1, 1)),
  np.arccos(np.clip(np.dot(_x, gm) / (v_norm * norm_), -1, 1)),
  np.arccos(np.clip(np.dot(_x, gm) / (v_norm * norm_), -1, 1)),
  np.arccos(np.clip(np.dot(_x, gm) / (v_norm * norm_), -1, 1)),
  np.arccos(np.clip(np.dot(_x, gm) / (v_norm * norm_), -1, 1)),


Precision: 0.3333333333333333
Recall: 1.0
AUC: 1.0


In [8]:
# ROD
# dataset: Parkinson_withoutdupl_norm_05_v01.csv

file_path = "G:\\Nazanin\\B project\\code\\dataset\\Parkinson\\Parkinson_withoutdupl_norm_05_v01.csv"
# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values
    
    # Separate labels (last column)
labels = df.iloc[:, -1].values


outlier_scores = calculate_rod_outlier_scores(data)

# Evaluate the outlier detection performance
precision, recall, auc = evaluate_rod_outlier_detection(outlier_scores, labels)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.2
Recall: 1.0
AUC: 1.0


In [9]:
# ROD
# dataset: Pima_withoutdupl_norm_02_v01.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# File path to the dataset
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Pima\\Pima_withoutdupl_norm_02_v01.csv"

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values

# Separate labels (last column)
labels = df.iloc[:, -1].values

# Split the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=42)

# Calculate outlier scores on the training set
predicted_outlier_scores = calculate_rod_outlier_scores(X_train)
        
# Evaluate the performance on the validation set
val_outlier_scores = calculate_rod_outlier_scores(X_val)
        
# Evaluate the performance
precision, recall, auc = evaluate_rod_outlier_detection(val_outlier_scores, y_val)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.0
Recall: 0.0
AUC: 0.45


In [10]:
#DIF 2023
import numpy as np
import pandas as pd
from pyod.models.iforest import IForest
from sklearn.metrics import precision_score, recall_score, roc_auc_score

def calculate_dif_outlier_scores(data):
    """
    Calculate outlier scores using the DIF model.

    Args:
        data: Input dataset (numpy array or pandas DataFrame).

    Returns:
        outlier_scores: Array of outlier scores.
    """
    # Apply DIF from pyod
    dif = IForest()
    dif.fit(data)

    # Get the outlier scores
    outlier_scores = dif.decision_scores_  # higher score -> more outlier

    return outlier_scores

def evaluate_dif_outlier_detection(outlier_scores, ground_truth):
    """
    Evaluate ECOD outlier detection results using precision, recall, and AUC.

    Args:
        outlier_scores: Array of outlier scores.
        ground_truth: Ground truth labels (numpy array or pandas Series).

    Returns:
        precision: Precision of the model.
        recall: Recall of the model.
        auc: AUC of the model.
    """
    # Calculate precision, recall, and AUC
    threshold = np.percentile(outlier_scores, 90)  # Setting threshold at the 90th percentile for demonstration
    predictions = (outlier_scores >= threshold).astype(int)  # Classify as 1 if above threshold (outlier), else 0

    precision = precision_score(ground_truth, predictions)
    recall = recall_score(ground_truth, predictions)
    auc = roc_auc_score(ground_truth, outlier_scores)

    return precision, recall, auc



In [11]:
# DIF
# dataset: Annthyroid_norm_02_v01.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# File path to the dataset
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Annthyroid\\Annthyroid_norm_02_v01.csv"

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values

# Separate labels (last column)
labels = df.iloc[:, -1].values

# Split the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=42)

# Calculate outlier scores on the training set
predicted_outlier_scores = calculate_dif_outlier_scores(X_train)
        
# Evaluate the performance on the validation set
val_outlier_scores = calculate_dif_outlier_scores(X_val)
        
# Evaluate the performance
precision, recall, auc = evaluate_dif_outlier_detection(val_outlier_scores, y_val)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)

Precision: 0.043795620437956206
Recall: 0.2222222222222222
AUC: 0.6220500860680771


In [12]:
# DIF
# dataset: Arrhythmia_withoutdupl_norm_02_v01.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# File path to the dataset
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Arrhythmia\\Arrhythmia_withoutdupl_norm_02_v01.csv"

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values

# Separate labels (last column)
labels = df.iloc[:, -1].values

# Split the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=42)

# Calculate outlier scores on the training set
predicted_outlier_scores = calculate_dif_outlier_scores(X_train)
        
# Evaluate the performance on the validation set
val_outlier_scores = calculate_dif_outlier_scores(X_val)
        
# Evaluate the performance
precision, recall, auc = evaluate_dif_outlier_detection(val_outlier_scores, y_val)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.0
Recall: 0.0
AUC: 0.24489795918367352


In [13]:
# DIF
# dataset: Cardiotocography_norm_02_v01.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# File path to the dataset
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Cardiotocography\\Cardiotocography_norm_02_v01.csv"

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values

# Separate labels (last column)
labels = df.iloc[:, -1].values

# Split the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=42)

# Calculate outlier scores on the training set
predicted_outlier_scores = calculate_dif_outlier_scores(X_train)
        
# Evaluate the performance on the validation set
val_outlier_scores = calculate_dif_outlier_scores(X_val)
        
# Evaluate the performance
precision, recall, auc = evaluate_dif_outlier_detection(val_outlier_scores, y_val)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.058823529411764705
Recall: 0.3333333333333333
AUC: 0.8323293172690763


In [14]:
# DIF
# dataset: HeartDisease_withoutdupl_norm_02_v01.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# File path to the dataset
file_path = "G:\\Nazanin\\B project\\code\\dataset\\HeartDisease\\HeartDisease_withoutdupl_norm_02_v01.csv"

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values

# Separate labels (last column)
labels = df.iloc[:, -1].values

# Split the data into training and validation sets (70% train, 30% validation)
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.3, stratify=labels, random_state=42)

# Calculate outlier scores on the training set
predicted_outlier_scores = calculate_dif_outlier_scores(X_train)
        
# Evaluate the performance on the validation set
val_outlier_scores = calculate_dif_outlier_scores(X_val)
        
# Evaluate the performance
precision, recall, auc = evaluate_dif_outlier_detection(val_outlier_scores, y_val)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)

Precision: 0.0
Recall: 0.0
AUC: 0.8666666666666667


In [15]:
# DIF
# dataset: Hepatitis_withoutdupl_norm_05_v01.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# File path to the dataset
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Hepatitis\\Hepatitis_withoutdupl_norm_05_v01.csv"

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values

# Separate labels (last column)
labels = df.iloc[:, -1].values

# Split the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=42)

# Calculate outlier scores on the training set
predicted_outlier_scores = calculate_dif_outlier_scores(X_train)
        
# Evaluate the performance on the validation set
val_outlier_scores = calculate_dif_outlier_scores(X_val)
        
# Evaluate the performance
precision, recall, auc = evaluate_dif_outlier_detection(val_outlier_scores, y_val)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)

Precision: 0.5
Recall: 1.0
AUC: 1.0


In [16]:
# DIF
# dataset: Lymphography_withoutdupl_norm_1ofn.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# File path to the dataset
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Lymphography\\Lymphography_withoutdupl_norm_1ofn.csv"

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-2].values

# Separate labels (last column)
labels = df.iloc[:, -2].values

# Split the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=42)

# Calculate outlier scores on the training set
predicted_outlier_scores = calculate_dif_outlier_scores(X_train)
        
# Evaluate the performance on the validation set
val_outlier_scores = calculate_dif_outlier_scores(X_val)
        
# Evaluate the performance
precision, recall, auc = evaluate_dif_outlier_detection(val_outlier_scores, y_val)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.3333333333333333
Recall: 1.0
AUC: 1.0


In [17]:
# DIF
# dataset: Parkinson_withoutdupl_norm_05_v01.csv

file_path = "G:\\Nazanin\\B project\\code\\dataset\\Parkinson\\Parkinson_withoutdupl_norm_05_v01.csv"
# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values
    
    # Separate labels (last column)
labels = df.iloc[:, -1].values


outlier_scores = calculate_dif_outlier_scores(data)

# Evaluate the outlier detection performance
precision, recall, auc = evaluate_dif_outlier_detection(outlier_scores, labels)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.2
Recall: 1.0
AUC: 1.0


In [18]:
# DIF
# dataset: Pima_withoutdupl_norm_02_v01.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# File path to the dataset
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Pima\\Pima_withoutdupl_norm_02_v01.csv"

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values

# Separate labels (last column)
labels = df.iloc[:, -1].values

# Split the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=42)

# Calculate outlier scores on the training set
predicted_outlier_scores = calculate_dif_outlier_scores(X_train)
        
# Evaluate the performance on the validation set
val_outlier_scores = calculate_dif_outlier_scores(X_val)
        
# Evaluate the performance
precision, recall, auc = evaluate_dif_outlier_detection(val_outlier_scores, y_val)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.0
Recall: 0.0
AUC: 0.54


In [19]:
#SUOD 2021
import numpy as np
import pandas as pd
from pyod.models.suod import SUOD
from sklearn.metrics import precision_score, recall_score, roc_auc_score

def calculate_suod_outlier_scores(data):
    """
    Calculate outlier scores using the SUOD model.

    Args:
        data: Input dataset (numpy array or pandas DataFrame).

    Returns:
        outlier_scores: Array of outlier scores.
    """
    # Apply SUOD from pyod
    suod = SUOD()
    suod.fit(data)

    # Get the outlier scores
    outlier_scores = suod.decision_scores_  # higher score -> more outlier

    return outlier_scores

def evaluate_suod_outlier_detection(outlier_scores, ground_truth):
    """
    Evaluate outlier detection results using precision, recall, and AUC.

    Args:
        outlier_scores: Array of outlier scores.
        ground_truth: Ground truth labels (numpy array or pandas Series).

    Returns:
        precision: Precision of the model.
        recall: Recall of the model.
        auc: AUC of the model.
    """
    # Calculate precision, recall, and AUC
    threshold = np.percentile(outlier_scores, 90)  # Setting threshold at the 90th percentile for demonstration
    predictions = (outlier_scores >= threshold).astype(int)  # Classify as 1 if above threshold (outlier), else 0

    precision = precision_score(ground_truth, predictions)
    recall = recall_score(ground_truth, predictions)
    auc = roc_auc_score(ground_truth, outlier_scores)

    return precision, recall, auc



In [20]:
# SUOD
# dataset: Annthyroid_norm_02_v01.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# File path to the dataset
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Annthyroid\\Annthyroid_norm_02_v01.csv"

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values

# Separate labels (last column)
labels = df.iloc[:, -1].values

# Split the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=42)

# Calculate outlier scores on the training set
predicted_outlier_scores = calculate_suod_outlier_scores(X_train)
        
# Evaluate the performance on the validation set
val_outlier_scores = calculate_suod_outlier_scores(X_val)
        
# Evaluate the performance
precision, recall, auc = evaluate_suod_outlier_detection(val_outlier_scores, y_val)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)

RandomForestRegressor()

RandomForestRegressor()

Precision: 0.043795620437956206
Recall: 0.2222222222222222
AUC: 0.6160253206729969


In [21]:
# SUOD
# dataset: Arrhythmia_withoutdupl_norm_02_v01.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# File path to the dataset
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Arrhythmia\\Arrhythmia_withoutdupl_norm_02_v01.csv"

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values

# Separate labels (last column)
labels = df.iloc[:, -1].values

# Split the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=42)

# Calculate outlier scores on the training set
predicted_outlier_scores = calculate_suod_outlier_scores(X_train)
        
# Evaluate the performance on the validation set
val_outlier_scores = calculate_suod_outlier_scores(X_val)
        
# Evaluate the performance
precision, recall, auc = evaluate_suod_outlier_detection(val_outlier_scores, y_val)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


RandomForestRegressor()

RandomForestRegressor()

Precision: 0.0
Recall: 0.0
AUC: 0.6938775510204082


In [22]:
# SUOD
# dataset: Cardiotocography_norm_02_v01.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# File path to the dataset
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Cardiotocography\\Cardiotocography_norm_02_v01.csv"

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values

# Separate labels (last column)
labels = df.iloc[:, -1].values

# Split the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=42)

# Calculate outlier scores on the training set
predicted_outlier_scores = calculate_suod_outlier_scores(X_train)
        
# Evaluate the performance on the validation set
val_outlier_scores = calculate_suod_outlier_scores(X_val)
        
# Evaluate the performance
precision, recall, auc = evaluate_suod_outlier_detection(val_outlier_scores, y_val)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


RandomForestRegressor()

RandomForestRegressor()

Precision: 0.058823529411764705
Recall: 0.3333333333333333
AUC: 0.8042168674698795


In [23]:
# SUOD
# dataset: HeartDisease_withoutdupl_norm_02_v01.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# File path to the dataset
file_path = "G:\\Nazanin\\B project\\code\\dataset\\HeartDisease\\HeartDisease_withoutdupl_norm_02_v01.csv"

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values

# Separate labels (last column)
labels = df.iloc[:, -1].values

# Split the data into training and validation sets (70% train, 30% validation)
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.3, stratify=labels, random_state=42)

# Calculate outlier scores on the training set
predicted_outlier_scores = calculate_suod_outlier_scores(X_train)
        
# Evaluate the performance on the validation set
val_outlier_scores = calculate_suod_outlier_scores(X_val)
        
# Evaluate the performance
precision, recall, auc = evaluate_suod_outlier_detection(val_outlier_scores, y_val)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)

RandomForestRegressor()

RandomForestRegressor()

Precision: 0.2
Recall: 1.0
AUC: 0.9555555555555556


In [24]:
# SUOD
# dataset: Hepatitis_withoutdupl_norm_05_v01.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# File path to the dataset
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Hepatitis\\Hepatitis_withoutdupl_norm_05_v01.csv"

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values

# Separate labels (last column)
labels = df.iloc[:, -1].values

# Split the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=42)

# Calculate outlier scores on the training set
predicted_outlier_scores = calculate_suod_outlier_scores(X_train)
        
# Evaluate the performance on the validation set
val_outlier_scores = calculate_suod_outlier_scores(X_val)
        
# Evaluate the performance
precision, recall, auc = evaluate_suod_outlier_detection(val_outlier_scores, y_val)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)

RandomForestRegressor()

RandomForestRegressor()

Precision: 0.5
Recall: 1.0
AUC: 1.0


In [25]:
# SUOD
# dataset: Lymphography_withoutdupl_norm_1ofn.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# File path to the dataset
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Lymphography\\Lymphography_withoutdupl_norm_1ofn.csv"

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-2].values

# Separate labels (last column)
labels = df.iloc[:, -2].values

# Split the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=42)

# Calculate outlier scores on the training set
predicted_outlier_scores = calculate_suod_outlier_scores(X_train)
        
# Evaluate the performance on the validation set
val_outlier_scores = calculate_suod_outlier_scores(X_val)
        
# Evaluate the performance
precision, recall, auc = evaluate_suod_outlier_detection(val_outlier_scores, y_val)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


RandomForestRegressor()

RandomForestRegressor()

Precision: 0.3333333333333333
Recall: 1.0
AUC: 1.0


In [26]:
# SUOD
# dataset: Parkinson_withoutdupl_norm_05_v01.csv

file_path = "G:\\Nazanin\\B project\\code\\dataset\\Parkinson\\Parkinson_withoutdupl_norm_05_v01.csv"
# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values
    
    # Separate labels (last column)
labels = df.iloc[:, -1].values


outlier_scores = calculate_suod_outlier_scores(data)

# Evaluate the outlier detection performance
precision, recall, auc = evaluate_suod_outlier_detection(outlier_scores, labels)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


RandomForestRegressor()

Precision: 0.2
Recall: 1.0
AUC: 1.0


In [27]:
# SUOD
# dataset: Pima_withoutdupl_norm_02_v01.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# File path to the dataset
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Pima\\Pima_withoutdupl_norm_02_v01.csv"

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values

# Separate labels (last column)
labels = df.iloc[:, -1].values

# Split the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=42)

# Calculate outlier scores on the training set
predicted_outlier_scores = calculate_suod_outlier_scores(X_train)
        
# Evaluate the performance on the validation set
val_outlier_scores = calculate_suod_outlier_scores(X_val)
        
# Evaluate the performance
precision, recall, auc = evaluate_suod_outlier_detection(val_outlier_scores, y_val)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


RandomForestRegressor()

RandomForestRegressor()

Precision: 0.09090909090909091
Recall: 0.5
AUC: 0.655


In [28]:
#LUNAR 2022
import numpy as np
import pandas as pd
from pyod.models.lunar import LUNAR
from sklearn.metrics import precision_score, recall_score, roc_auc_score

def calculate_lunar_outlier_scores(data):
    """
    Calculate outlier scores using the lunar model.

    Args:
        data: Input dataset (numpy array or pandas DataFrame).

    Returns:
        outlier_scores: Array of outlier scores.
    """
    # Apply LUNAR from pyod
    lunar = LUNAR()
    lunar.fit(data)

    # Get the outlier scores
    outlier_scores = lunar.decision_scores_  # higher score -> more outlier

    return outlier_scores

def evaluate_lunar_outlier_detection(outlier_scores, ground_truth):
    """
    Evaluate outlier detection results using precision, recall, and AUC.

    Args:
        outlier_scores: Array of outlier scores.
        ground_truth: Ground truth labels (numpy array or pandas Series).

    Returns:
        precision: Precision of the model.
        recall: Recall of the model.
        auc: AUC of the model.
    """
    # Calculate precision, recall, and AUC
    threshold = np.percentile(outlier_scores, 90)  # Setting threshold at the 90th percentile for demonstration
    predictions = (outlier_scores >= threshold).astype(int)  # Classify as 1 if above threshold (outlier), else 0

    precision = precision_score(ground_truth, predictions)
    recall = recall_score(ground_truth, predictions)
    auc = roc_auc_score(ground_truth, outlier_scores)

    return precision, recall, auc



In [29]:
# LUNAR
# dataset: Annthyroid_norm_02_v01.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# File path to the dataset
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Annthyroid\\Annthyroid_norm_02_v01.csv"

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values

# Separate labels (last column)
labels = df.iloc[:, -1].values

# Split the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=42)

# Calculate outlier scores on the training set
predicted_outlier_scores = calculate_lunar_outlier_scores(X_train)
        
# Evaluate the performance on the validation set
val_outlier_scores = calculate_lunar_outlier_scores(X_val)
        
# Evaluate the performance
precision, recall, auc = evaluate_lunar_outlier_detection(val_outlier_scores, y_val)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)

Precision: 0.029197080291970802
Recall: 0.14814814814814814
AUC: 0.4962518740629685


In [30]:
# LUNAR
# dataset: Arrhythmia_withoutdupl_norm_02_v01.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# File path to the dataset
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Arrhythmia\\Arrhythmia_withoutdupl_norm_02_v01.csv"

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values

# Separate labels (last column)
labels = df.iloc[:, -1].values

# Split the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=42)

# Calculate outlier scores on the training set
predicted_outlier_scores = calculate_lunar_outlier_scores(X_train)
        
# Evaluate the performance on the validation set
val_outlier_scores = calculate_lunar_outlier_scores(X_val)
        
# Evaluate the performance
precision, recall, auc = evaluate_lunar_outlier_detection(val_outlier_scores, y_val)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.0
Recall: 0.0
AUC: 0.5102040816326531


In [31]:
# LUNAR
# dataset: Cardiotocography_norm_02_v01.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# File path to the dataset
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Cardiotocography\\Cardiotocography_norm_02_v01.csv"

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values

# Separate labels (last column)
labels = df.iloc[:, -1].values

# Split the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=42)

# Calculate outlier scores on the training set
predicted_outlier_scores = calculate_lunar_outlier_scores(X_train)
        
# Evaluate the performance on the validation set
val_outlier_scores = calculate_lunar_outlier_scores(X_val)
        
# Evaluate the performance
precision, recall, auc = evaluate_lunar_outlier_detection(val_outlier_scores, y_val)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.029411764705882353
Recall: 0.16666666666666666
AUC: 0.7439759036144578


In [32]:
# LUNAR
# dataset: HeartDisease_withoutdupl_norm_02_v01.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# File path to the dataset
file_path = "G:\\Nazanin\\B project\\code\\dataset\\HeartDisease\\HeartDisease_withoutdupl_norm_02_v01.csv"

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values

# Separate labels (last column)
labels = df.iloc[:, -1].values

# Split the data into training and validation sets (70% train, 30% validation)
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.3, stratify=labels, random_state=42)

# Calculate outlier scores on the training set
predicted_outlier_scores = calculate_lunar_outlier_scores(X_train)
        
# Evaluate the performance on the validation set
val_outlier_scores = calculate_lunar_outlier_scores(X_val)
        
# Evaluate the performance
precision, recall, auc = evaluate_lunar_outlier_detection(val_outlier_scores, y_val)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)

Precision: 0.0
Recall: 0.0
AUC: 0.6222222222222222


In [33]:
# LUNAR
# dataset: Hepatitis_withoutdupl_norm_05_v01.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# File path to the dataset
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Hepatitis\\Hepatitis_withoutdupl_norm_05_v01.csv"

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values

# Separate labels (last column)
labels = df.iloc[:, -1].values

# Split the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=42)

# Calculate outlier scores on the training set
predicted_outlier_scores = calculate_lunar_outlier_scores(X_train)
        
# Evaluate the performance on the validation set
val_outlier_scores = calculate_lunar_outlier_scores(X_val)
        
# Evaluate the performance
precision, recall, auc = evaluate_lunar_outlier_detection(val_outlier_scores, y_val)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)

Precision: 0.5
Recall: 1.0
AUC: 1.0


In [34]:
# LUNAR
# dataset: Lymphography_withoutdupl_norm_1ofn.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# File path to the dataset
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Lymphography\\Lymphography_withoutdupl_norm_1ofn.csv"

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-2].values

# Separate labels (last column)
labels = df.iloc[:, -2].values

# Split the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=42)

# Calculate outlier scores on the training set
predicted_outlier_scores = calculate_lunar_outlier_scores(X_train)
        
# Evaluate the performance on the validation set
val_outlier_scores = calculate_lunar_outlier_scores(X_val)
        
# Evaluate the performance
precision, recall, auc = evaluate_lunar_outlier_detection(val_outlier_scores, y_val)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.3333333333333333
Recall: 1.0
AUC: 1.0


In [35]:
# LUNAR
# dataset: Parkinson_withoutdupl_norm_05_v01.csv

file_path = "G:\\Nazanin\\B project\\code\\dataset\\Parkinson\\Parkinson_withoutdupl_norm_05_v01.csv"
# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values
    
    # Separate labels (last column)
labels = df.iloc[:, -1].values


outlier_scores = calculate_lunar_outlier_scores(data)

# Evaluate the outlier detection performance
precision, recall, auc = evaluate_lunar_outlier_detection(outlier_scores, labels)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.2
Recall: 1.0
AUC: 0.9791666666666666


In [36]:
# LUNAR
# dataset: Pima_withoutdupl_norm_02_v01.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# File path to the dataset
file_path = "G:\\Nazanin\\B project\\code\\dataset\\Pima\\Pima_withoutdupl_norm_02_v01.csv"

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Separate data (all columns except the last)
data = df.iloc[:, :-1].values

# Separate labels (last column)
labels = df.iloc[:, -1].values

# Split the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=42)

# Calculate outlier scores on the training set
predicted_outlier_scores = calculate_lunar_outlier_scores(X_train)
        
# Evaluate the performance on the validation set
val_outlier_scores = calculate_lunar_outlier_scores(X_val)
        
# Evaluate the performance
precision, recall, auc = evaluate_lunar_outlier_detection(val_outlier_scores, y_val)

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.0
Recall: 0.0
AUC: 0.605
