Recall: real outliers we said are outliers / all points we said are outliers(right or wrong)
Precision: real outliers we said are outliers / actual amount of outliers

KNN

In [36]:
import numpy as np
import pandas as pd
from pyod.models.knn import KNN
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from collections import defaultdict
from ucimlrepo import fetch_ucirepo 
import random

def calculate_outlier_scores_knn(data, n_samples=5, sample_size=0.8, n_neighbors=5):
    # Standardize the data
    scaler = StandardScaler()
    data_std = scaler.fit_transform(data)

    # Dictionary to store cumulative outlier scores and counts for each instance
    outlier_scores = defaultdict(lambda: {'score': 0, 'count': 0})
    
    for i in range(n_samples):
        # Create a random sample of the data
        sample_indices = random.sample(range(data.shape[0]), int(sample_size * data.shape[0]))
        sample_data = data_std[sample_indices]
        
        # Apply KNN from pyod
        knn = KNN(n_neighbors=n_neighbors, method='mean')
        knn.fit(sample_data)
        
        # Get the outlier scores for the sampled data
        sample_scores = knn.decision_scores_  # higher score -> more outlier
        
        # Store the outlier score in the dictionary
        for idx, score in zip(sample_indices, sample_scores):
            outlier_scores[idx]['score'] += score
            outlier_scores[idx]['count'] += 1

    # Calculate the average outlier score for each instance
    avg_outlier_scores = {idx: scores['score'] / scores['count'] for idx, scores in outlier_scores.items()}
    
    # Convert to a DataFrame
    outlier_scores_df = pd.DataFrame(list(avg_outlier_scores.items()), columns=['Index', 'Avg_Outlier_Score'])
    
    return outlier_scores_df

def evaluate_outlier_detection(outlier_scores_df, ground_truth):
    # Convert outlier scores and ground truth labels to the same order
    outlier_scores_df = outlier_scores_df.sort_values(by='Index').reset_index(drop=True)
    ground_truth = ground_truth[outlier_scores_df['Index']].values

    # Extract average outlier scores
    outlier_scores = outlier_scores_df['Avg_Outlier_Score'].values

    # Calculate precision, recall, and AUC
    threshold = np.percentile(outlier_scores, 90)  # Setting threshold at the 90th percentile for demonstration
    predictions = (outlier_scores >= threshold).astype(int)  # Classify as 1 if above threshold (outlier), else 0

    precision = precision_score(ground_truth, predictions)
    recall = recall_score(ground_truth, predictions)
    auc = roc_auc_score(ground_truth, outlier_scores)

    return precision, recall, auc

# Example usage
# Assuming 'data' is your dataset and 'labels' contains ground truth labels where 1 = outlier and 0 = normal
data = np.random.rand(1000, 50)  # Random dataset with 100 samples and 5 features
labels = np.random.choice([0, 1], size=1000, p=[0.9, 0.1])  # Random labels (10% outliers)

#statlog_shuttle = fetch_ucirepo(id=148) 
  
# data (as pandas dataframes) 
#data = statlog_shuttle.data.features 
#labels = statlog_shuttle.data.targets 

# Calculate outlier scores
outlier_scores_df = calculate_outlier_scores_knn(data, n_samples=5, sample_size=0.8, n_neighbors=5)

# Evaluate the outlier detection
precision, recall, auc = evaluate_outlier_detection(outlier_scores_df, pd.Series(labels))

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.1
Recall: 0.09174311926605505
AUC: 0.46731329605947347


LOF

In [14]:
import numpy as np
import pandas as pd
from pyod.models.lof import LOF
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from collections import defaultdict
import random

def calculate_outlier_scores_lof_pyod(data, n_samples=5, sample_size=0.8, n_neighbors=20):
    # Standardize the data
    scaler = StandardScaler()
    data_std = scaler.fit_transform(data)

    # Dictionary to store cumulative outlier scores and counts for each instance
    outlier_scores = defaultdict(lambda: {'score': 0, 'count': 0})
    
    for i in range(n_samples):
        # Create a random sample of the data
        sample_indices = random.sample(range(data.shape[0]), int(sample_size * data.shape[0]))
        sample_data = data_std[sample_indices]
        
        # Apply LOF from pyod
        lof = LOF(n_neighbors=n_neighbors)
        lof.fit(sample_data)
        
        # Get the outlier scores for the sampled data
        sample_scores = lof.decision_scores_  # higher score -> more outlier
        
        # Store the outlier score in the dictionary
        for idx, score in zip(sample_indices, sample_scores):
            outlier_scores[idx]['score'] += score
            outlier_scores[idx]['count'] += 1

    # Calculate the average outlier score for each instance
    avg_outlier_scores = {idx: scores['score'] / scores['count'] for idx, scores in outlier_scores.items()}
    
    # Convert to a DataFrame
    outlier_scores_df = pd.DataFrame(list(avg_outlier_scores.items()), columns=['Index', 'Avg_Outlier_Score'])
    
    return outlier_scores_df

def evaluate_outlier_detection(outlier_scores_df, ground_truth):
    # Sort outlier scores and ground truth labels by index for alignment
    outlier_scores_df = outlier_scores_df.sort_values(by='Index').reset_index(drop=True)
    ground_truth = ground_truth[outlier_scores_df['Index']].values

    # Extract average outlier scores
    outlier_scores = outlier_scores_df['Avg_Outlier_Score'].values

    # Set a threshold to classify outliers and normal points
    threshold = np.percentile(outlier_scores, 90)  # Setting threshold at the 90th percentile
    predictions = (outlier_scores >= threshold).astype(int)  # Classify as 1 if above threshold (outlier), else 0

    # Calculate precision, recall, and AUC
    precision = precision_score(ground_truth, predictions)
    recall = recall_score(ground_truth, predictions)
    auc = roc_auc_score(ground_truth, outlier_scores)

    return precision, recall, auc

# Example usage
# Assuming 'data' is your dataset and 'labels' contains ground truth labels where 1 = outlier and 0 = normal
data = np.random.rand(1000, 5)  # Generate a random dataset of 1000 samples and 5 features
labels = np.random.choice([0, 1], size=1000, p=[0.9, 0.1])  # Random labels with 10% outliers

# Calculate outlier scores using LOF
outlier_scores_df = calculate_outlier_scores_lof_pyod(data, n_samples=5, sample_size=0.8, n_neighbors=20)

# Evaluate the outlier detection performance
precision, recall, auc = evaluate_outlier_detection(outlier_scores_df, pd.Series(labels))

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)



Precision: 0.07
Recall: 0.07446808510638298
AUC: 0.4985146305950871


In [19]:
#X =  [[-1.1, 1, 5, 33, 4], [-1.5, 2, 4, 8, 3], [0.3, 111, 89, 46, 23], [0.5, 15, 11, 2, -3]]
X =  [[-1.1, 1, 5, 33, 4], [-1.5, 2, 4, 8, 3],[-1.5, 2, 4, 8, 3], [-28, 2, 49, 7, 23], [18, 2, 4, 1, 13], [0.3, 111, 89, 46, 23], [0.5, 15, 11, 2, -3]]
clf = LOF().fit(X)
print(clf.predict([[0.1, 1, 5, 2, 1.5], [0.5, 2, 7, 1, 0.8], [90, 100, 45, 7, 31]]))


[0 0 1]




KNN and LOF with mean

In [1]:
import numpy as np
import pandas as pd
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from collections import defaultdict
import random

def calculate_combined_outlier_scores(data, n_samples=5, sample_size=0.8, n_neighbors=20):
    # Standardize the data
    scaler = StandardScaler()
    data_std = scaler.fit_transform(data)

    # Dictionary to store cumulative outlier scores and counts for each instance
    combined_scores = defaultdict(lambda: {'score': 0, 'count': 0})
    
    for i in range(n_samples):
        # Create a random sample of the data
        sample_indices = random.sample(range(data.shape[0]), int(sample_size * data.shape[0]))
        sample_data = data_std[sample_indices]
        
        # Apply KNN from pyod
        knn = KNN(n_neighbors=n_neighbors, method='mean')
        knn.fit(sample_data)
        knn_scores = knn.decision_scores_  # higher score -> more outlier
        
        # Apply LOF from pyod
        lof = LOF(n_neighbors=n_neighbors)
        lof.fit(sample_data)
        lof_scores = lof.decision_scores_  # higher score -> more outlier

        # Calculate the combined score as the average of KNN and LOF scores
        combined_sample_scores = (knn_scores + lof_scores) / 2

        # Store the combined score in the dictionary for each data point in the sample
        for idx, score in zip(sample_indices, combined_sample_scores):
            combined_scores[idx]['score'] += score
            combined_scores[idx]['count'] += 1

    # Calculate the average combined score for each instance
    avg_combined_scores = {idx: scores['score'] / scores['count'] for idx, scores in combined_scores.items()}
    
    # Convert to a DataFrame
    combined_scores_df = pd.DataFrame(list(avg_combined_scores.items()), columns=['Index', 'Avg_Combined_Outlier_Score'])
    
    return combined_scores_df

def evaluate_outlier_detection(outlier_scores_df, ground_truth):
    # Sort outlier scores and ground truth labels by index for alignment
    outlier_scores_df = outlier_scores_df.sort_values(by='Index').reset_index(drop=True)
    ground_truth = ground_truth[outlier_scores_df['Index']].values

    # Extract average outlier scores
    outlier_scores = outlier_scores_df['Avg_Combined_Outlier_Score'].values

    # Set a threshold to classify outliers and normal points
    threshold = np.percentile(outlier_scores, 90)  # Setting threshold at the 90th percentile
    predictions = (outlier_scores >= threshold).astype(int)  # Classify as 1 if above threshold (outlier), else 0

    # Calculate precision, recall, and AUC
    precision = precision_score(ground_truth, predictions)
    recall = recall_score(ground_truth, predictions)
    auc = roc_auc_score(ground_truth, outlier_scores)

    return precision, recall, auc

# Example usage
# Assuming 'data' is your dataset and 'labels' contains ground truth labels where 1 = outlier and 0 = normal
data = np.random.rand(1000, 50)  # Generate a random dataset of 100 samples and 5 features
labels = np.random.choice([0, 1], size=1000, p=[0.9, 0.1])  # Random labels with 10% outliers

# Calculate combined outlier scores using both LOF and KNN
combined_scores_df = calculate_combined_outlier_scores(data, n_samples=5, sample_size=0.8, n_neighbors=20)

# Evaluate the outlier detection performance
precision, recall, auc = evaluate_outlier_detection(combined_scores_df, pd.Series(labels))

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.11
Recall: 0.12087912087912088
AUC: 0.46427060288446426


KNN and LOF with max

In [14]:
import numpy as np
import pandas as pd
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from collections import defaultdict
import random

def calculate_combined_outlier_scores_max(data, n_samples=5, sample_size=0.8, n_neighbors=20):
    # Standardize the data
    scaler = StandardScaler()
    data_std = scaler.fit_transform(data)

    # Dictionary to store cumulative outlier scores and counts for each instance
    combined_scores = defaultdict(lambda: {'score': 0, 'count': 0})
    
    for i in range(n_samples):
        # Create a random sample of the data
        sample_indices = random.sample(range(data.shape[0]), int(sample_size * data.shape[0]))
        sample_data = data_std[sample_indices]
        
        # Apply KNN from pyod
        knn = KNN(n_neighbors=n_neighbors, method='mean')
        knn.fit(sample_data)
        knn_scores = knn.decision_scores_  # higher score -> more outlier
        
        # Apply LOF from pyod
        lof = LOF(n_neighbors=n_neighbors)
        lof.fit(sample_data)
        lof_scores = lof.decision_scores_  # higher score -> more outlier

        # Calculate the combined score as the max of KNN and LOF scores
        combined_sample_scores = np.maximum(knn_scores, lof_scores)

        # Store the combined score in the dictionary for each data point in the sample
        for idx, score in zip(sample_indices, combined_sample_scores):
            combined_scores[idx]['score'] += score
            combined_scores[idx]['count'] += 1

    # Calculate the average combined score for each instance across samples
    avg_combined_scores = {idx: scores['score'] / scores['count'] for idx, scores in combined_scores.items()}
    
    # Convert to a DataFrame
    combined_scores_df = pd.DataFrame(list(avg_combined_scores.items()), columns=['Index', 'Avg_Max_Outlier_Score'])
    
    return combined_scores_df

def evaluate_outlier_detection(outlier_scores_df, ground_truth):
    # Sort outlier scores and ground truth labels by index for alignment
    outlier_scores_df = outlier_scores_df.sort_values(by='Index').reset_index(drop=True)
    ground_truth = ground_truth[outlier_scores_df['Index']].values

    # Extract average outlier scores
    outlier_scores = outlier_scores_df['Avg_Max_Outlier_Score'].values

    # Set a threshold to classify outliers and normal points
    threshold = np.percentile(outlier_scores, 90)  # Setting threshold at the 90th percentile
    predictions = (outlier_scores >= threshold).astype(int)  # Classify as 1 if above threshold (outlier), else 0

    # Calculate precision, recall, and AUC
    precision = precision_score(ground_truth, predictions)
    recall = recall_score(ground_truth, predictions)
    auc = roc_auc_score(ground_truth, outlier_scores)

    return precision, recall, auc

# Example usage
# Assuming 'data' is your dataset and 'labels' contains ground truth labels where 1 = outlier and 0 = normal
data = np.random.rand(100, 5)  # Generate a random dataset of 100 samples and 5 features
labels = np.random.choice([0, 1], size=100, p=[0.9, 0.1])  # Random labels with 10% outliers

# Calculate combined outlier scores using the max of LOF and KNN
combined_scores_df = calculate_combined_outlier_scores_max(data, n_samples=5, sample_size=0.8, n_neighbors=20)

# Evaluate the outlier detection performance
precision, recall, auc = evaluate_outlier_detection(combined_scores_df, pd.Series(labels))

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.2
Recall: 0.18181818181818182
AUC: 0.5801838610827375


KNN and LOF with min

In [1]:
import numpy as np
import pandas as pd
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from collections import defaultdict
import random

def calculate_combined_outlier_scores(data, n_samples=5, sample_size=0.8, n_neighbors=20):
    # Standardize the data
    scaler = StandardScaler()
    data_std = scaler.fit_transform(data)

    # Dictionary to store cumulative outlier scores and counts for each instance
    combined_scores = defaultdict(lambda: {'score': 0, 'count': 0})
    
    for i in range(n_samples):
        # Create a random sample of the data
        sample_indices = random.sample(range(data.shape[0]), int(sample_size * data.shape[0]))
        sample_data = data_std[sample_indices]
        
        # Apply KNN from pyod
        knn = KNN(n_neighbors=n_neighbors, method='mean')
        knn.fit(sample_data)
        knn_scores = knn.decision_scores_  # higher score -> more outlier
        
        # Apply LOF from pyod
        lof = LOF(n_neighbors=n_neighbors)
        lof.fit(sample_data)
        lof_scores = lof.decision_scores_  # higher score -> more outlier

        # Calculate the combined score as the average of KNN and LOF scores
        combined_sample_scores = np.minimum(knn_scores , lof_scores)

        # Store the combined score in the dictionary for each data point in the sample
        for idx, score in zip(sample_indices, combined_sample_scores):
            combined_scores[idx]['score'] += score
            combined_scores[idx]['count'] += 1

    # Calculate the average combined score for each instance
    avg_combined_scores = {idx: scores['score'] / scores['count'] for idx, scores in combined_scores.items()}
    
    # Convert to a DataFrame
    combined_scores_df = pd.DataFrame(list(avg_combined_scores.items()), columns=['Index', 'Avg_Combined_Outlier_Score'])
    
    return combined_scores_df

def evaluate_outlier_detection(outlier_scores_df, ground_truth):
    # Sort outlier scores and ground truth labels by index for alignment
    outlier_scores_df = outlier_scores_df.sort_values(by='Index').reset_index(drop=True)
    ground_truth = ground_truth[outlier_scores_df['Index']].values

    # Extract average outlier scores
    outlier_scores = outlier_scores_df['Avg_Combined_Outlier_Score'].values

    # Set a threshold to classify outliers and normal points
    threshold = np.percentile(outlier_scores, 90)  # Setting threshold at the 90th percentile
    predictions = (outlier_scores >= threshold).astype(int)  # Classify as 1 if above threshold (outlier), else 0

    # Calculate precision, recall, and AUC
    precision = precision_score(ground_truth, predictions)
    recall = recall_score(ground_truth, predictions)
    auc = roc_auc_score(ground_truth, outlier_scores)

    return precision, recall, auc

# Example usage
# Assuming 'data' is your dataset and 'labels' contains ground truth labels where 1 = outlier and 0 = normal
data = np.random.rand(1000, 50)  # Generate a random dataset of 100 samples and 5 features
labels = np.random.choice([0, 1], size=1000, p=[0.9, 0.1])  # Random labels with 10% outliers

# Calculate combined outlier scores using both LOF and KNN
combined_scores_df = calculate_combined_outlier_scores(data, n_samples=5, sample_size=0.8, n_neighbors=20)

# Evaluate the outlier detection performance
precision, recall, auc = evaluate_outlier_detection(combined_scores_df, pd.Series(labels))

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


Precision: 0.1
Recall: 0.0970873786407767
AUC: 0.4915807038834951


In [23]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
covertype = fetch_ucirepo(id=31) 
  
# data (as pandas dataframes) 
X = covertype.data.features 
y = covertype.data.targets 
  


In [None]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
statlog_shuttle = fetch_ucirepo(id=148) 
  
# data (as pandas dataframes) 
X = statlog_shuttle.data.features 
y = statlog_shuttle.data.targets 
  
# metadata 
print(statlog_shuttle.metadata) 
  
# variable information 
print(statlog_shuttle.variables) 
