Recall: real outliers we said are outliers / all points we said are outliers(right or wrong)
Precision: real outliers we said are outliers / actual amount of outliers

KNN

In [4]:
import numpy as np
import pandas as pd
from pyod.models.knn import KNN
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from collections import defaultdict
from ucimlrepo import fetch_ucirepo 
import random

def calculate_outlier_scores_knn(data, n_samples=5, sample_size=0.8, n_neighbors=5):
    # Standardize the data
    scaler = StandardScaler()
    data_std = scaler.fit_transform(data)

    # Dictionary to store cumulative outlier scores and counts for each instance
    outlier_scores = defaultdict(lambda: {'score': 0, 'count': 0})
    
    for i in range(n_samples):
        # Create a random sample of the data
        sample_indices = random.sample(range(data.shape[0]), int(sample_size * data.shape[0]))
        sample_data = data_std[sample_indices]
        
        # Apply KNN from pyod
        knn = KNN(n_neighbors=n_neighbors, method='mean')
        knn.fit(sample_data)
        
        # Get the outlier scores for the sampled data
        sample_scores = knn.decision_scores_  # higher score -> more outlier
        
        # Store the outlier score in the dictionary
        for idx, score in zip(sample_indices, sample_scores):
            outlier_scores[idx]['score'] += score
            outlier_scores[idx]['count'] += 1

    # Calculate the average outlier score for each instance
    avg_outlier_scores = {idx: scores['score'] / scores['count'] for idx, scores in outlier_scores.items()}
    
    # Convert to a DataFrame
    outlier_scores_df = pd.DataFrame(list(avg_outlier_scores.items()), columns=['Index', 'Avg_Outlier_Score'])
    
    return outlier_scores_df

def evaluate_outlier_detection(outlier_scores_df, ground_truth):
    # Convert outlier scores and ground truth labels to the same order
    outlier_scores_df = outlier_scores_df.sort_values(by='Index').reset_index(drop=True)
    ground_truth = ground_truth[outlier_scores_df['Index']].values

    # Extract average outlier scores
    outlier_scores = outlier_scores_df['Avg_Outlier_Score'].values

    # Calculate precision, recall, and AUC
    threshold = np.percentile(outlier_scores, 90)  # Setting threshold at the 90th percentile for demonstration
    predictions = (outlier_scores >= threshold).astype(int)  # Classify as 1 if above threshold (outlier), else 0

    precision = precision_score(ground_truth, predictions)
    recall = recall_score(ground_truth, predictions)
    auc = roc_auc_score(ground_truth, outlier_scores)

    return precision, recall, auc

# Example usage
# Assuming 'data' is your dataset and 'labels' contains ground truth labels where 1 = outlier and 0 = normal
data = np.random.rand(1000, 50)  # Random dataset with 100 samples and 5 features
labels = np.random.choice([0, 1], size=1000, p=[0.9, 0.1])  # Random labels (10% outliers)


# Calculate outlier scores
outlier_scores_df = calculate_outlier_scores_knn(data, n_samples=5, sample_size=0.8, n_neighbors=5)

# Evaluate the outlier detection
precision, recall, auc = evaluate_outlier_detection(outlier_scores_df, pd.Series(labels))

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.17
Recall: 0.16831683168316833
AUC: 0.5079571360918071


KNN 
dataset: Annthyroid_norm_02_v01

LOF

In [5]:
import numpy as np
import pandas as pd
from pyod.models.lof import LOF
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from collections import defaultdict
import random

def calculate_outlier_scores_lof_pyod(data, n_samples=5, sample_size=0.8, n_neighbors=20):
    # Standardize the data
    scaler = StandardScaler()
    data_std = scaler.fit_transform(data)

    # Dictionary to store cumulative outlier scores and counts for each instance
    outlier_scores = defaultdict(lambda: {'score': 0, 'count': 0})
    
    for i in range(n_samples):
        # Create a random sample of the data
        sample_indices = random.sample(range(data.shape[0]), int(sample_size * data.shape[0]))
        sample_data = data_std[sample_indices]
        
        # Apply LOF from pyod
        lof = LOF(n_neighbors=n_neighbors)
        lof.fit(sample_data)
        
        # Get the outlier scores for the sampled data
        sample_scores = lof.decision_scores_  # higher score -> more outlier
        
        # Store the outlier score in the dictionary
        for idx, score in zip(sample_indices, sample_scores):
            outlier_scores[idx]['score'] += score
            outlier_scores[idx]['count'] += 1

    # Calculate the average outlier score for each instance
    avg_outlier_scores = {idx: scores['score'] / scores['count'] for idx, scores in outlier_scores.items()}
    
    # Convert to a DataFrame
    outlier_scores_df = pd.DataFrame(list(avg_outlier_scores.items()), columns=['Index', 'Avg_Outlier_Score'])
    
    return outlier_scores_df

def evaluate_outlier_detection(outlier_scores_df, ground_truth):
    # Sort outlier scores and ground truth labels by index for alignment
    outlier_scores_df = outlier_scores_df.sort_values(by='Index').reset_index(drop=True)
    ground_truth = ground_truth[outlier_scores_df['Index']].values

    # Extract average outlier scores
    outlier_scores = outlier_scores_df['Avg_Outlier_Score'].values

    # Set a threshold to classify outliers and normal points
    threshold = np.percentile(outlier_scores, 90)  # Setting threshold at the 90th percentile
    predictions = (outlier_scores >= threshold).astype(int)  # Classify as 1 if above threshold (outlier), else 0

    # Calculate precision, recall, and AUC
    precision = precision_score(ground_truth, predictions)
    recall = recall_score(ground_truth, predictions)
    auc = roc_auc_score(ground_truth, outlier_scores)

    return precision, recall, auc

# Example usage
# Assuming 'data' is your dataset and 'labels' contains ground truth labels where 1 = outlier and 0 = normal
data = np.random.rand(1000, 5)  # Generate a random dataset of 1000 samples and 5 features
labels = np.random.choice([0, 1], size=1000, p=[0.9, 0.1])  # Random labels with 10% outliers

# Calculate outlier scores using LOF
outlier_scores_df = calculate_outlier_scores_lof_pyod(data, n_samples=5, sample_size=0.8, n_neighbors=20)

# Evaluate the outlier detection performance
precision, recall, auc = evaluate_outlier_detection(outlier_scores_df, pd.Series(labels))

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)



Precision: 0.1
Recall: 0.09433962264150944
AUC: 0.47587691528428516


ABOD

In [3]:
import numpy as np
import pandas as pd
from pyod.models.abod import ABOD
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from collections import defaultdict
import random

def calculate_abod_outlier_scores(data, n_samples=5, sample_size=0.8):
    """
    Calculates outlier scores using ABOD on random samples and averages scores
    for data points appearing in multiple samples.
    
    Parameters:
        data (numpy array): Input dataset of shape (n_samples, n_features).
        n_samples (int): Number of random samples to create.
        sample_size (float): Proportion of the dataset to include in each sample.
        
    Returns:
        pd.DataFrame: A DataFrame containing the indices and averaged ABOD scores.
    """
    # Standardize the data
    scaler = StandardScaler()
    data_std = scaler.fit_transform(data)

    # Dictionary to store cumulative outlier scores and counts for each instance
    outlier_scores = defaultdict(lambda: {'score': 0, 'count': 0})
    
    for i in range(n_samples):
        # Create a random sample of the data
        sample_indices = random.sample(range(data.shape[0]), int(sample_size * data.shape[0]))
        sample_data = data_std[sample_indices]
        
        # Apply ABOD from pyod
        abod = ABOD()
        abod.fit(sample_data)
        sample_scores = abod.decision_scores_  # higher score -> more outlier
        
        # Store the outlier scores in the dictionary
        for idx, score in zip(sample_indices, sample_scores):
            outlier_scores[idx]['score'] += score
            outlier_scores[idx]['count'] += 1

    # Calculate the average outlier score for each instance
    avg_outlier_scores = {idx: scores['score'] / scores['count'] for idx, scores in outlier_scores.items()}
    
    # Convert to a DataFrame
    outlier_scores_df = pd.DataFrame(list(avg_outlier_scores.items()), columns=['Index', 'Avg_ABOD_Score'])
    
    return outlier_scores_df

def evaluate_outlier_detection(outlier_scores_df, ground_truth):
    """
    Evaluates the outlier detection performance using precision, recall, and AUC.

    Parameters:
        outlier_scores_df (pd.DataFrame): DataFrame with outlier scores for each instance.
        ground_truth (pd.Series or numpy array): Ground truth labels (1 = outlier, 0 = normal).
        
    Returns:
        tuple: Precision, Recall, and AUC scores.
    """
    # Sort outlier scores and ground truth labels by index for alignment
    outlier_scores_df = outlier_scores_df.sort_values(by='Index').reset_index(drop=True)
    ground_truth = ground_truth[outlier_scores_df['Index']].values

    # Extract average outlier scores
    outlier_scores = outlier_scores_df['Avg_ABOD_Score'].values

    # Set a threshold to classify outliers and normal points
    threshold = np.percentile(outlier_scores, 90)  # Setting threshold at the 90th percentile
    predictions = (outlier_scores >= threshold).astype(int)  # Classify as 1 if above threshold (outlier), else 0

    # Calculate precision, recall, and AUC
    precision = precision_score(ground_truth, predictions)
    recall = recall_score(ground_truth, predictions)
    auc = roc_auc_score(ground_truth, outlier_scores)

    return precision, recall, auc

# Example usage
# Generate a random dataset and ground truth labels
data = np.random.rand(100, 5)  # Random dataset with 100 samples and 5 features
labels = np.random.choice([0, 1], size=100, p=[0.9, 0.1])  # Random labels with 10% outliers

# Calculate ABOD outlier scores
abod_scores_df = calculate_abod_outlier_scores(data, n_samples=5, sample_size=0.8)

# Evaluate the outlier detection performance
precision, recall, auc = evaluate_outlier_detection(abod_scores_df, pd.Series(labels))

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.1
Recall: 0.1111111111111111
AUC: 0.6007326007326007


KNN and LOF with mean

In [2]:
import numpy as np
import pandas as pd
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from collections import defaultdict
import random

def calculate_combined_outlier_scores(data, n_samples=5, sample_size=0.8, n_neighbors=20):
    # Standardize the data
    scaler = StandardScaler()
    data_std = scaler.fit_transform(data)

    # Dictionary to store cumulative outlier scores and counts for each instance
    combined_scores = defaultdict(lambda: {'score': 0, 'count': 0})
    
    for i in range(n_samples):
        # Create a random sample of the data
        sample_indices = random.sample(range(data.shape[0]), int(sample_size * data.shape[0]))
        sample_data = data_std[sample_indices]
        
        # Apply KNN from pyod
        knn = KNN(n_neighbors=n_neighbors, method='mean')
        knn.fit(sample_data)
        knn_scores = knn.decision_scores_  # higher score -> more outlier
        
        # Apply LOF from pyod
        lof = LOF(n_neighbors=n_neighbors)
        lof.fit(sample_data)
        lof_scores = lof.decision_scores_  # higher score -> more outlier

        # Calculate the combined score as the average of KNN and LOF scores
        combined_sample_scores = (knn_scores + lof_scores) / 2

        # Store the combined score in the dictionary for each data point in the sample
        for idx, score in zip(sample_indices, combined_sample_scores):
            combined_scores[idx]['score'] += score
            combined_scores[idx]['count'] += 1

    # Calculate the average combined score for each instance
    avg_combined_scores = {idx: scores['score'] / scores['count'] for idx, scores in combined_scores.items()}
    
    # Convert to a DataFrame
    combined_scores_df = pd.DataFrame(list(avg_combined_scores.items()), columns=['Index', 'Avg_Combined_Outlier_Score'])
    
    return combined_scores_df

def evaluate_outlier_detection(outlier_scores_df, ground_truth):
    # Sort outlier scores and ground truth labels by index for alignment
    outlier_scores_df = outlier_scores_df.sort_values(by='Index').reset_index(drop=True)
    ground_truth = ground_truth[outlier_scores_df['Index']].values

    # Extract average outlier scores
    outlier_scores = outlier_scores_df['Avg_Combined_Outlier_Score'].values

    # Set a threshold to classify outliers and normal points
    threshold = np.percentile(outlier_scores, 90)  # Setting threshold at the 90th percentile
    predictions = (outlier_scores >= threshold).astype(int)  # Classify as 1 if above threshold (outlier), else 0

    # Calculate precision, recall, and AUC
    precision = precision_score(ground_truth, predictions)
    recall = recall_score(ground_truth, predictions)
    auc = roc_auc_score(ground_truth, outlier_scores)

    return precision, recall, auc

# Example usage
# Assuming 'data' is your dataset and 'labels' contains ground truth labels where 1 = outlier and 0 = normal
data = np.random.rand(1000, 50)  # Generate a random dataset of 100 samples and 5 features
labels = np.random.choice([0, 1], size=1000, p=[0.9, 0.1])  # Random labels with 10% outliers

# Calculate combined outlier scores using both LOF and KNN
combined_scores_df = calculate_combined_outlier_scores(data, n_samples=5, sample_size=0.8, n_neighbors=20)

# Evaluate the outlier detection performance
precision, recall, auc = evaluate_outlier_detection(combined_scores_df, pd.Series(labels))

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.08
Recall: 0.0851063829787234
AUC: 0.5179770795171669


KNN and LOF with max

In [14]:
import numpy as np
import pandas as pd
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from collections import defaultdict
import random

def calculate_combined_outlier_scores_max(data, n_samples=5, sample_size=0.8, n_neighbors=20):
    # Standardize the data
    scaler = StandardScaler()
    data_std = scaler.fit_transform(data)

    # Dictionary to store cumulative outlier scores and counts for each instance
    combined_scores = defaultdict(lambda: {'score': 0, 'count': 0})
    
    for i in range(n_samples):
        # Create a random sample of the data
        sample_indices = random.sample(range(data.shape[0]), int(sample_size * data.shape[0]))
        sample_data = data_std[sample_indices]
        
        # Apply KNN from pyod
        knn = KNN(n_neighbors=n_neighbors, method='mean')
        knn.fit(sample_data)
        knn_scores = knn.decision_scores_  # higher score -> more outlier
        
        # Apply LOF from pyod
        lof = LOF(n_neighbors=n_neighbors)
        lof.fit(sample_data)
        lof_scores = lof.decision_scores_  # higher score -> more outlier

        # Calculate the combined score as the max of KNN and LOF scores
        combined_sample_scores = np.maximum(knn_scores, lof_scores)

        # Store the combined score in the dictionary for each data point in the sample
        for idx, score in zip(sample_indices, combined_sample_scores):
            combined_scores[idx]['score'] += score
            combined_scores[idx]['count'] += 1

    # Calculate the average combined score for each instance across samples
    avg_combined_scores = {idx: scores['score'] / scores['count'] for idx, scores in combined_scores.items()}
    
    # Convert to a DataFrame
    combined_scores_df = pd.DataFrame(list(avg_combined_scores.items()), columns=['Index', 'Avg_Max_Outlier_Score'])
    
    return combined_scores_df

def evaluate_outlier_detection(outlier_scores_df, ground_truth):
    # Sort outlier scores and ground truth labels by index for alignment
    outlier_scores_df = outlier_scores_df.sort_values(by='Index').reset_index(drop=True)
    ground_truth = ground_truth[outlier_scores_df['Index']].values

    # Extract average outlier scores
    outlier_scores = outlier_scores_df['Avg_Max_Outlier_Score'].values

    # Set a threshold to classify outliers and normal points
    threshold = np.percentile(outlier_scores, 90)  # Setting threshold at the 90th percentile
    predictions = (outlier_scores >= threshold).astype(int)  # Classify as 1 if above threshold (outlier), else 0

    # Calculate precision, recall, and AUC
    precision = precision_score(ground_truth, predictions)
    recall = recall_score(ground_truth, predictions)
    auc = roc_auc_score(ground_truth, outlier_scores)

    return precision, recall, auc

# Example usage
# Assuming 'data' is your dataset and 'labels' contains ground truth labels where 1 = outlier and 0 = normal
data = np.random.rand(100, 5)  # Generate a random dataset of 100 samples and 5 features
labels = np.random.choice([0, 1], size=100, p=[0.9, 0.1])  # Random labels with 10% outliers

# Calculate combined outlier scores using the max of LOF and KNN
combined_scores_df = calculate_combined_outlier_scores_max(data, n_samples=5, sample_size=0.8, n_neighbors=20)

# Evaluate the outlier detection performance
precision, recall, auc = evaluate_outlier_detection(combined_scores_df, pd.Series(labels))

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.2
Recall: 0.18181818181818182
AUC: 0.5801838610827375


KNN and LOF with min

In [1]:
import numpy as np
import pandas as pd
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from collections import defaultdict
import random

def calculate_combined_outlier_scores(data, n_samples=5, sample_size=0.8, n_neighbors=20):
    # Standardize the data
    scaler = StandardScaler()
    data_std = scaler.fit_transform(data)

    # Dictionary to store cumulative outlier scores and counts for each instance
    combined_scores = defaultdict(lambda: {'score': 0, 'count': 0})
    
    for i in range(n_samples):
        # Create a random sample of the data
        sample_indices = random.sample(range(data.shape[0]), int(sample_size * data.shape[0]))
        sample_data = data_std[sample_indices]
        
        # Apply KNN from pyod
        knn = KNN(n_neighbors=n_neighbors, method='mean')
        knn.fit(sample_data)
        knn_scores = knn.decision_scores_  # higher score -> more outlier
        
        # Apply LOF from pyod
        lof = LOF(n_neighbors=n_neighbors)
        lof.fit(sample_data)
        lof_scores = lof.decision_scores_  # higher score -> more outlier

        # Calculate the combined score as the average of KNN and LOF scores
        combined_sample_scores = np.minimum(knn_scores , lof_scores)

        # Store the combined score in the dictionary for each data point in the sample
        for idx, score in zip(sample_indices, combined_sample_scores):
            combined_scores[idx]['score'] += score
            combined_scores[idx]['count'] += 1

    # Calculate the average combined score for each instance
    avg_combined_scores = {idx: scores['score'] / scores['count'] for idx, scores in combined_scores.items()}
    
    # Convert to a DataFrame
    combined_scores_df = pd.DataFrame(list(avg_combined_scores.items()), columns=['Index', 'Avg_Combined_Outlier_Score'])
    
    return combined_scores_df

def evaluate_outlier_detection(outlier_scores_df, ground_truth):
    # Sort outlier scores and ground truth labels by index for alignment
    outlier_scores_df = outlier_scores_df.sort_values(by='Index').reset_index(drop=True)
    ground_truth = ground_truth[outlier_scores_df['Index']].values

    # Extract average outlier scores
    outlier_scores = outlier_scores_df['Avg_Combined_Outlier_Score'].values

    # Set a threshold to classify outliers and normal points
    threshold = np.percentile(outlier_scores, 90)  # Setting threshold at the 90th percentile
    predictions = (outlier_scores >= threshold).astype(int)  # Classify as 1 if above threshold (outlier), else 0

    # Calculate precision, recall, and AUC
    precision = precision_score(ground_truth, predictions)
    recall = recall_score(ground_truth, predictions)
    auc = roc_auc_score(ground_truth, outlier_scores)

    return precision, recall, auc

# Example usage
# Assuming 'data' is your dataset and 'labels' contains ground truth labels where 1 = outlier and 0 = normal
data = np.random.rand(1000, 50)  # Generate a random dataset of 100 samples and 5 features
labels = np.random.choice([0, 1], size=1000, p=[0.9, 0.1])  # Random labels with 10% outliers

# Calculate combined outlier scores using both LOF and KNN
combined_scores_df = calculate_combined_outlier_scores(data, n_samples=5, sample_size=0.8, n_neighbors=20)

# Evaluate the outlier detection performance
precision, recall, auc = evaluate_outlier_detection(combined_scores_df, pd.Series(labels))

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.1
Recall: 0.0970873786407767
AUC: 0.4915807038834951


KNN LOF ABOD with avg

In [10]:
import numpy as np
import pandas as pd
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.models.abod import ABOD
from sklearn.preprocessing import StandardScaler
from collections import defaultdict
import random

def calculate_combined_outlier_scores(data, n_samples=5, sample_size=0.8, n_neighbors=20):
    """
    Calculates combined outlier scores using LOF, ABOD, and KNN on random samples
    and averages scores for data points appearing in multiple samples.
    
    Parameters:
        data (numpy array): Input dataset of shape (n_samples, n_features).
        n_samples (int): Number of random samples to create.
        sample_size (float): Proportion of the dataset to include in each sample.
        n_neighbors (int): Number of neighbors for LOF and KNN algorithms.
        
    Returns:
        pd.DataFrame: A DataFrame containing the indices and averaged combined scores.
    """
    # Standardize the data
    scaler = StandardScaler()
    data_std = scaler.fit_transform(data)

    # Dictionary to store cumulative outlier scores and counts for each instance
    combined_scores = defaultdict(lambda: {'score': 0, 'count': 0})
    
    for i in range(n_samples):
        # Create a random sample of the data
        sample_indices = random.sample(range(data.shape[0]), int(sample_size * data.shape[0]))
        sample_data = data_std[sample_indices]
        
        # Apply KNN from pyod
        knn = KNN(n_neighbors=n_neighbors, method='mean')
        knn.fit(sample_data)
        knn_scores = knn.decision_scores_  # higher score -> more outlier
        
        # Apply LOF from pyod
        lof = LOF(n_neighbors=n_neighbors)
        lof.fit(sample_data)
        lof_scores = lof.decision_scores_  # higher score -> more outlier
        
        # Apply ABOD from pyod
        abod = ABOD()
        abod.fit(sample_data)
        abod_scores = abod.decision_scores_  # higher score -> more outlier
        
        # Calculate the combined score as the average of KNN, LOF, and ABOD scores
        combined_sample_scores = (knn_scores + lof_scores + abod_scores) / 3

        # Store the combined score in the dictionary for each data point in the sample
        for idx, score in zip(sample_indices, combined_sample_scores):
            combined_scores[idx]['score'] += score
            combined_scores[idx]['count'] += 1

    # Calculate the average combined score for each instance
    avg_combined_scores = {idx: scores['score'] / scores['count'] for idx, scores in combined_scores.items()}
    
    # Convert to a DataFrame
    combined_scores_df = pd.DataFrame(list(avg_combined_scores.items()), columns=['Index', 'Avg_Combined_Outlier_Score'])
    
    return combined_scores_df

def evaluate_outlier_detection(outlier_scores_df, ground_truth):
    """
    Evaluates the outlier detection performance using precision, recall, and AUC.

    Parameters:
        outlier_scores_df (pd.DataFrame): DataFrame with outlier scores for each instance.
        ground_truth (pd.Series or numpy array): Ground truth labels (1 = outlier, 0 = normal).
        
    Returns:
        tuple: Precision, Recall, and AUC scores.
    """
    # Sort outlier scores and ground truth labels by index for alignment
    outlier_scores_df = outlier_scores_df.sort_values(by='Index').reset_index(drop=True)
    ground_truth = ground_truth[outlier_scores_df['Index']].values

    # Extract average outlier scores
    outlier_scores = outlier_scores_df['Avg_Combined_Outlier_Score'].values

    # Set a threshold to classify outliers and normal points
    threshold = np.percentile(outlier_scores, 90)  # Setting threshold at the 90th percentile
    predictions = (outlier_scores >= threshold).astype(int)  # Classify as 1 if above threshold (outlier), else 0

    # Calculate precision, recall, and AUC
    precision = precision_score(ground_truth, predictions)
    recall = recall_score(ground_truth, predictions)
    auc = roc_auc_score(ground_truth, outlier_scores)

    return precision, recall, auc

# Example usage
# Generate a random dataset and ground truth labels
data = np.random.rand(100, 5)  # Random dataset with 100 samples and 5 features
labels = np.random.choice([0, 1], size=100, p=[0.9, 0.1])  # Random labels with 10% outliers

# Calculate combined outlier scores
combined_scores_df = calculate_combined_outlier_scores(data, n_samples=5, sample_size=0.8, n_neighbors=20)

# Evaluate the outlier detection performance
precision, recall, auc = evaluate_outlier_detection(combined_scores_df, pd.Series(labels))

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.1
Recall: 0.14285714285714285
AUC: 0.6113671274961597


KNN LOF ABOD with max

In [13]:
import numpy as np
import pandas as pd
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.models.abod import ABOD
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from collections import defaultdict
import random

def calculate_combined_outlier_scores_max(data, n_samples=5, sample_size=0.8, n_neighbors=20):
    """
    Calculates combined outlier scores using the maximum score from LOF, ABOD, and KNN on random samples
    and averages scores for data points appearing in multiple samples.
    
    Parameters:
        data (numpy array): Input dataset of shape (n_samples, n_features).
        n_samples (int): Number of random samples to create.
        sample_size (float): Proportion of the dataset to include in each sample.
        n_neighbors (int): Number of neighbors for LOF and KNN algorithms.
        
    Returns:
        pd.DataFrame: A DataFrame containing the indices and averaged combined scores.
    """
    # Standardize the data
    scaler = StandardScaler()
    data_std = scaler.fit_transform(data)

    # Dictionary to store cumulative outlier scores and counts for each instance
    combined_scores = defaultdict(lambda: {'score': 0, 'count': 0})
    
    for i in range(n_samples):
        # Create a random sample of the data
        sample_indices = random.sample(range(data.shape[0]), int(sample_size * data.shape[0]))
        sample_data = data_std[sample_indices]
        
        # Apply KNN from pyod
        knn = KNN(n_neighbors=n_neighbors, method='mean')
        knn.fit(sample_data)
        knn_scores = knn.decision_scores_  # higher score -> more outlier
        
        # Apply LOF from pyod
        lof = LOF(n_neighbors=n_neighbors)
        lof.fit(sample_data)
        lof_scores = lof.decision_scores_  # higher score -> more outlier
        
        # Apply ABOD from pyod
        abod = ABOD()
        abod.fit(sample_data)
        abod_scores = abod.decision_scores_  # higher score -> more outlier
        
        # Calculate the combined score as the maximum of KNN, LOF, and ABOD scores
        combined_sample_scores = np.maximum.reduce([knn_scores, lof_scores, abod_scores])

        # Store the combined score in the dictionary for each data point in the sample
        for idx, score in zip(sample_indices, combined_sample_scores):
            combined_scores[idx]['score'] += score
            combined_scores[idx]['count'] += 1

    # Calculate the average combined score for each instance
    avg_combined_scores = {idx: scores['score'] / scores['count'] for idx, scores in combined_scores.items()}
    
    # Convert to a DataFrame
    combined_scores_df = pd.DataFrame(list(avg_combined_scores.items()), columns=['Index', 'Avg_Combined_Outlier_Score'])
    
    return combined_scores_df

def evaluate_outlier_detection(outlier_scores_df, ground_truth):
    """
    Evaluates the outlier detection performance using precision, recall, and AUC.

    Parameters:
        outlier_scores_df (pd.DataFrame): DataFrame with outlier scores for each instance.
        ground_truth (pd.Series or numpy array): Ground truth labels (1 = outlier, 0 = normal).
        
    Returns:
        tuple: Precision, Recall, and AUC scores.
    """
    # Sort outlier scores and ground truth labels by index for alignment
    outlier_scores_df = outlier_scores_df.sort_values(by='Index').reset_index(drop=True)
    ground_truth = ground_truth[outlier_scores_df['Index']].values

    # Extract average outlier scores
    outlier_scores = outlier_scores_df['Avg_Combined_Outlier_Score'].values

    # Set a threshold to classify outliers and normal points
    threshold = np.percentile(outlier_scores, 90)  # Setting threshold at the 90th percentile
    predictions = (outlier_scores >= threshold).astype(int)  # Classify as 1 if above threshold (outlier), else 0

    # Calculate precision, recall, and AUC
    precision = precision_score(ground_truth, predictions)
    recall = recall_score(ground_truth, predictions)
    auc = roc_auc_score(ground_truth, outlier_scores)

    return precision, recall, auc

# Example usage
# Generate a random dataset and ground truth labels
data = np.random.rand(100, 5)  # Random dataset with 100 samples and 5 features
labels = np.random.choice([0, 1], size=100, p=[0.9, 0.1])  # Random labels with 10% outliers

# Calculate combined outlier scores using max of LOF, ABOD, and KNN
combined_scores_df = calculate_combined_outlier_scores_max(data, n_samples=5, sample_size=0.8, n_neighbors=20)

# Evaluate the outlier detection performance
precision, recall, auc = evaluate_outlier_detection(combined_scores_df, pd.Series(labels))

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.1
Recall: 0.06666666666666667
AUC: 0.4752941176470588


KNN LOF ABOD with min

In [6]:
import numpy as np
import pandas as pd
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.models.abod import ABOD
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from collections import defaultdict
import random

def calculate_combined_outlier_scores_max(data, n_samples=5, sample_size=0.8, n_neighbors=20):
    """
    Calculates combined outlier scores using the maximum score from LOF, ABOD, and KNN on random samples
    and averages scores for data points appearing in multiple samples.
    
    Parameters:
        data (numpy array): Input dataset of shape (n_samples, n_features).
        n_samples (int): Number of random samples to create.
        sample_size (float): Proportion of the dataset to include in each sample.
        n_neighbors (int): Number of neighbors for LOF and KNN algorithms.
        
    Returns:
        pd.DataFrame: A DataFrame containing the indices and averaged combined scores.
    """
    # Standardize the data
    scaler = StandardScaler()
    data_std = scaler.fit_transform(data)

    # Dictionary to store cumulative outlier scores and counts for each instance
    combined_scores = defaultdict(lambda: {'score': 0, 'count': 0})
    
    for i in range(n_samples):
        # Create a random sample of the data
        sample_indices = random.sample(range(data.shape[0]), int(sample_size * data.shape[0]))
        sample_data = data_std[sample_indices]
        
        # Apply KNN from pyod
        knn = KNN(n_neighbors=n_neighbors, method='mean')
        knn.fit(sample_data)
        knn_scores = knn.decision_scores_  # higher score -> more outlier
        
        # Apply LOF from pyod
        lof = LOF(n_neighbors=n_neighbors)
        lof.fit(sample_data)
        lof_scores = lof.decision_scores_  # higher score -> more outlier
        
        # Apply ABOD from pyod
        abod = ABOD()
        abod.fit(sample_data)
        abod_scores = abod.decision_scores_  # higher score -> more outlier
        
        # Calculate the combined score as the minimum of KNN, LOF, and ABOD scores
        combined_sample_scores = np.minimum.reduce([knn_scores, lof_scores, abod_scores])

        # Store the combined score in the dictionary for each data point in the sample
        for idx, score in zip(sample_indices, combined_sample_scores):
            combined_scores[idx]['score'] += score
            combined_scores[idx]['count'] += 1

    # Calculate the average combined score for each instance
    avg_combined_scores = {idx: scores['score'] / scores['count'] for idx, scores in combined_scores.items()}
    
    # Convert to a DataFrame
    combined_scores_df = pd.DataFrame(list(avg_combined_scores.items()), columns=['Index', 'Avg_Combined_Outlier_Score'])
    
    return combined_scores_df

def evaluate_outlier_detection(outlier_scores_df, ground_truth):
    """
    Evaluates the outlier detection performance using precision, recall, and AUC.

    Parameters:
        outlier_scores_df (pd.DataFrame): DataFrame with outlier scores for each instance.
        ground_truth (pd.Series or numpy array): Ground truth labels (1 = outlier, 0 = normal).
        
    Returns:
        tuple: Precision, Recall, and AUC scores.
    """
    # Sort outlier scores and ground truth labels by index for alignment
    outlier_scores_df = outlier_scores_df.sort_values(by='Index').reset_index(drop=True)
    ground_truth = ground_truth[outlier_scores_df['Index']].values

    # Extract average outlier scores
    outlier_scores = outlier_scores_df['Avg_Combined_Outlier_Score'].values

    # Set a threshold to classify outliers and normal points
    threshold = np.percentile(outlier_scores, 90)  # Setting threshold at the 90th percentile
    predictions = (outlier_scores >= threshold).astype(int)  # Classify as 1 if above threshold (outlier), else 0

    # Calculate precision, recall, and AUC
    precision = precision_score(ground_truth, predictions)
    recall = recall_score(ground_truth, predictions)
    auc = roc_auc_score(ground_truth, outlier_scores)

    return precision, recall, auc

# Example usage
# Generate a random dataset and ground truth labels
#data = np.random.rand(100, 5)  # Random dataset with 100 samples and 5 features
#labels = np.random.choice([0, 1], size=100, p=[0.9, 0.1])  # Random labels with 10% outliers

file_path = "G:\\Nazanin\\B project\\code\\dataset\\Annthyroid\\Annthyroid_norm_02_v01.csv"
# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)
    
    # Separate data (all columns except the last)
data = df.iloc[:, :-1].values
    
    # Separate labels (last column)
labels = df.iloc[:, -1].values

# Calculate combined outlier scores using min of LOF, ABOD, and KNN
combined_scores_df = calculate_combined_outlier_scores_max(data, n_samples=5, sample_size=0.8, n_neighbors=20)

# Evaluate the outlier detection performance
precision, recall, auc = evaluate_outlier_detection(combined_scores_df, pd.Series(labels))

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.06764705882352941
Recall: 0.3382352941176471
AUC: 0.7694735378012486


dataset transformation

In [3]:
import csv

def process_arff_to_csv(input_file_path, output_file_path):
    """
    Processes an ARFF file starting from @DATA section and converts it to a CSV file.
    
    Parameters:
        input_file_path (str): Path to the input .arff file.
        output_file_path (str): Path to save the output .csv file.
    """
    data_section = False  # Flag to start processing after @DATA
    with open(input_file_path, 'r') as infile, open(output_file_path, 'w', newline='') as outfile:
        csv_writer = csv.writer(outfile)
        
        for line in infile:
            line = line.strip()
            
            # Check for @DATA to start processing
            if line.upper() == "@DATA":
                data_section = True
                continue
            
            if data_section and line:  # Process only after @DATA and ignore empty lines
                # Split the line by commas
                values = line.split(",")
                
                # Convert last column: 'yes' -> 1, 'no' -> 0
                if values[-1].strip() == "'yes'":
                    values[-1] = 1
                elif values[-1].strip() == "'no'":
                    values[-1] = 0
                
                # Write the processed line to CSV
                csv_writer.writerow(values)

# File paths
input_file = "G:\\Nazanin\\B project\\code\\dataset\\Annthyroid\\Annthyroid\\Annthyroid_norm_02_v01.arff"
output_file = "G:\\Nazanin\\B project\\code\\dataset\\Annthyroid\\Annthyroid_norm_02_v01.csv"

# Convert ARFF to CSV
process_arff_to_csv(input_file, output_file)

print(f"Processed CSV saved to: {output_file}")


Processed CSV saved to: G:\Nazanin\B project\code\dataset\Annthyroid\Annthyroid_norm_02_v01.csv
