## Sample Dataset

In [None]:
import numpy as np

np.random.seed(42)

# Build a two-cluster (bimodal) spending sample: a large low-spend group
# followed by a small high-spend group that plays the role of outliers.
low_spending = np.random.normal(loc=50, scale=10, size=225)   # 225 of 250 points (90%)
high_spending = np.random.normal(loc=200, scale=20, size=25)  # 25 of 250 points (10%)
spending = np.concatenate((low_spending, high_spending))

# The detectors below take a plain 1-D NumPy array, so alias it as `data`.
data = spending


## 1. Z-Score Method

In [None]:
def detect_outliers_zscore(data, threshold=3):
    """
    Detects outliers whose absolute Z-score exceeds a threshold.

    Args:
        data (array-like): 1D sequence of numerical data (list or NumPy array).
        threshold (float): Absolute Z-score above which a point is an outlier.

    Returns:
        tuple: (indices, values) of the detected outliers as NumPy arrays.
        Both are empty when the input is empty or has zero spread.
    """
    # Accept lists/tuples as well as arrays; fancy indexing below needs an ndarray
    values = np.asarray(data)

    if values.size == 0:
        # Nothing to score
        is_outlier = np.zeros(values.shape, dtype=bool)
    else:
        mean = np.mean(values)
        std = np.std(values)
        if std == 0:
            # All points are identical: dividing by zero would only produce
            # NaN Z-scores (and a RuntimeWarning), so report no outliers
            is_outlier = np.zeros(values.shape, dtype=bool)
        else:
            z_scores = (values - mean) / std
            is_outlier = np.abs(z_scores) > threshold

    # np.where returns a tuple of index arrays; element 0 holds the positions
    outlier_indices = np.where(is_outlier)
    outlier_values = values[outlier_indices]
    return outlier_indices[0], outlier_values

# Run the Z-score detector on the spending array and list every hit
indices, values = detect_outliers_zscore(data)
print("Z-Score Outliers (Index, Value):")
for position, amount in zip(indices, values):
    print(f"Index: {position}, Value: {amount}")

Z-Score Outliers (Index, Value):
Index: 225, Value: 221.77901193934733
Index: 229, Value: 213.5919549786935
Index: 231, Value: 204.3291717916395
Index: 234, Value: 242.8788817865065
Index: 235, Value: 212.67838044636022
Index: 239, Value: 217.04866669592448
Index: 242, Value: 210.09974557960913
Index: 243, Value: 217.31510388340243
Index: 248, Value: 235.30908480562192
Index: 249, Value: 208.0996342192191


## 2. Interquartile Range (IQR) Method

In [None]:
import numpy as np
def detect_outliers_iqr(data, multiplier=1.5):
    """
    Detects outliers lying outside the interquartile-range (IQR) fences.

    A point is an outlier when it is below ``Q1 - multiplier * IQR`` or
    above ``Q3 + multiplier * IQR``.

    Args:
        data (array-like): 1D sequence of numerical data (list or NumPy array).
        multiplier (float): Width of the fences in units of IQR (default 1.5).

    Returns:
        tuple: (indices, values) of the detected outliers as NumPy arrays.
        Both are empty when the input is empty.
    """
    # Accept lists/tuples as well as arrays; fancy indexing below needs an ndarray
    values = np.asarray(data)

    if values.size == 0:
        # Quartiles are undefined for an empty sample, so skip np.percentile
        is_outlier = np.zeros(values.shape, dtype=bool)
    else:
        q1, q3 = np.percentile(values, [25, 75])
        iqr = q3 - q1
        lower_bound = q1 - multiplier * iqr
        upper_bound = q3 + multiplier * iqr
        is_outlier = (values < lower_bound) | (values > upper_bound)

    # np.where returns a tuple of index arrays; element 0 holds the positions
    outlier_indices = np.where(is_outlier)
    outlier_values = values[outlier_indices]
    return outlier_indices[0], outlier_values

# Run the IQR detector on the spending array and list every hit
indices, values = detect_outliers_iqr(data)
print("IQR Outliers (Index, Value):")
for position, amount in zip(indices, values):
    print(f"Index: {position}, Value: {amount}")


IQR Outliers (Index, Value):
Index: 209, Value: 88.52731490654722
Index: 225, Value: 221.77901193934733
Index: 226, Value: 201.28560038190926
Index: 227, Value: 178.4451044414139
Index: 228, Value: 185.69392581480065
Index: 229, Value: 213.5919549786935
Index: 230, Value: 185.39266736565727
Index: 231, Value: 204.3291717916395
Index: 232, Value: 200.91143679807627
Index: 233, Value: 186.96799304788365
Index: 234, Value: 242.8788817865065
Index: 235, Value: 212.67838044636022
Index: 236, Value: 159.49714826684786
Index: 237, Value: 203.72908629538856
Index: 238, Value: 186.76427070463225
Index: 239, Value: 217.04866669592448
Index: 240, Value: 184.14958523134598
Index: 241, Value: 197.70527117066203
Index: 242, Value: 210.09974557960913
Index: 243, Value: 217.31510388340243
Index: 244, Value: 175.99407185888447
Index: 245, Value: 193.30997528318102
Index: 246, Value: 190.50109377678086
Index: 247, Value: 186.93341534852576
Index: 248, Value: 235.30908480562192
Index: 249, Value: 208.0996342192191

## 3. Local Outlier Factor (LOF)

In [None]:
from sklearn.neighbors import LocalOutlierFactor

def detect_outliers_lof(data, n_neighbors=20, contamination=0.1):
    """
    Detects outliers using the Local Outlier Factor (LOF) method.

    Args:
        data (array-like): 1D sequence of numerical data (list or NumPy array).
        n_neighbors (int): Number of neighbors to consider for LOF.
        contamination (float): Proportion of the dataset assumed to be outliers;
            passed straight to ``LocalOutlierFactor``, so it also controls
            roughly what share of the points ends up flagged.

    Returns:
        tuple: Indices and values of detected outliers (NumPy arrays).
    """
    # Accept lists/tuples as well as arrays; .reshape and fancy indexing need an ndarray
    values = np.asarray(data)
    # LOF expects a 2D (n_samples, n_features) array: one column for 1D data
    samples = values.reshape(-1, 1)
    # Configure the model (fitting happens in fit_predict below)
    lof = LocalOutlierFactor(n_neighbors=n_neighbors, contamination=contamination)
    # Fit and label every sample: -1 for outliers, 1 for inliers
    labels = lof.fit_predict(samples)
    # Get indices and values of outliers
    outlier_indices = np.where(labels == -1)[0]
    outlier_values = values[outlier_indices]
    return outlier_indices, outlier_values

# Run the LOF detector on the 1D spending array and list every hit
indices, values = detect_outliers_lof(data)
print("LOF Outliers (Index, Value):")
for position, amount in zip(indices, values):
    print(f"Index: {position}, Value: {amount}")

LOF Outliers (Index, Value):
Index: 6, Value: 65.79212815507391
Index: 13, Value: 30.86719755342202
Index: 14, Value: 32.75082167486967
Index: 31, Value: 68.52278184508938
Index: 37, Value: 30.403298761202244
Index: 49, Value: 32.36959844637266
Index: 71, Value: 65.38036566465969
Index: 73, Value: 65.64643655814007
Index: 74, Value: 23.802548959102555
Index: 79, Value: 30.124310853991073
Index: 106, Value: 68.8618590121053
Index: 110, Value: 30.812287847009586
Index: 113, Value: 74.63242112485287
Index: 122, Value: 64.02794310936099
Index: 125, Value: 71.90455625809979
Index: 135, Value: 65.4993440501754
Index: 156, Value: 68.65774511144757
Index: 167, Value: 68.96792982653947
Index: 179, Value: 77.20169166589619
Index: 209, Value: 88.52731490654722
Index: 211, Value: 61.35565640180599
Index: 220, Value: 73.14658566673509
Index: 221, Value: 31.32734807408252
Index: 234, Value: 242.8788817865065
Index: 236, Value: 159.49714826684786


## 4. Isolation Forest

In [None]:
from sklearn.ensemble import IsolationForest

def detect_outliers_isolation_forest(data, contamination=0.1, random_state=42):
    """
    Detects outliers using an Isolation Forest.

    Args:
        data (array-like): 1D sequence of numerical data (list or NumPy array).
        contamination (float): Proportion of the dataset assumed to be outliers;
            passed straight to ``IsolationForest``.
        random_state (int): Seed forwarded to ``IsolationForest`` so repeated
            runs give the same result (default 42).

    Returns:
        tuple: Indices and values of detected outliers (NumPy arrays).
    """
    # Accept lists/tuples as well as arrays; .reshape and fancy indexing need an ndarray
    values = np.asarray(data)
    iso = IsolationForest(contamination=contamination, random_state=random_state)
    # The estimator expects a 2D (n_samples, n_features) array;
    # fit_predict labels outliers as -1 and inliers as 1
    labels = iso.fit_predict(values.reshape(-1, 1))
    outlier_indices = np.where(labels == -1)[0]
    outlier_values = values[outlier_indices]
    return outlier_indices, outlier_values

# Run the Isolation Forest detector on the spending array and list every hit
indices, values = detect_outliers_isolation_forest(data)
print("Isolation Forest Outliers (Index, Value):")
for position, amount in zip(indices, values):
    print(f"Index: {position}, Value: {amount}")

Isolation Forest Outliers (Index, Value):
Index: 74, Value: 23.802548959102555
Index: 79, Value: 30.124310853991073
Index: 113, Value: 74.63242112485287
Index: 125, Value: 71.90455625809979
Index: 179, Value: 77.20169166589619
Index: 209, Value: 88.52731490654722
Index: 220, Value: 73.14658566673509
Index: 225, Value: 221.77901193934733
Index: 227, Value: 178.4451044414139
Index: 229, Value: 213.5919549786935
Index: 231, Value: 204.3291717916395
Index: 234, Value: 242.8788817865065
Index: 235, Value: 212.67838044636022
Index: 236, Value: 159.49714826684786
Index: 237, Value: 203.72908629538856
Index: 239, Value: 217.04866669592448
Index: 240, Value: 184.14958523134598
Index: 241, Value: 197.70527117066203
Index: 242, Value: 210.09974557960913
Index: 243, Value: 217.31510388340243
Index: 244, Value: 175.99407185888447
Index: 245, Value: 193.30997528318102
Index: 246, Value: 190.50109377678086
Index: 248, Value: 235.30908480562192
Index: 249, Value: 208.0996342192191


## 5. Mahalanobis Distance

In [None]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import mahalanobis

# Synthetic two-feature sample: Age and Annual Income.
# Re-seed so this cell gives the same numbers regardless of earlier cells.
np.random.seed(42)
age = np.random.normal(loc=30, scale=5, size=200)            # typical ages
income = np.random.normal(loc=50000, scale=15000, size=200)  # typical incomes

# Append five deliberately extreme observations (older, much higher income)
age = np.append(age, np.random.normal(loc=60, scale=5, size=5))
income = np.append(income, np.random.normal(loc=200000, scale=50000, size=5))

# One row per person, one column per feature
df = pd.DataFrame({'Age': age, 'Annual Income': income})

# Function to detect outliers using Mahalanobis distance
def detect_outliers_mahalanobis_multivariate(data, percentile=97.5):
    """
    Detects multivariate outliers by their Mahalanobis distance from the mean.

    Each row's distance to the sample mean is measured with the inverse of the
    sample covariance matrix; rows whose distance is strictly above the given
    percentile of all distances are reported.

    Args:
        data (array-like): Observations in rows and features in columns
            (n_samples, n_features). A 1D input is treated as one feature.
        percentile (float): Percentile of the distances used as the cut-off
            (default 97.5).

    Returns:
        tuple: (indices, values) of the detected outliers; ``values`` holds
        the matching rows of ``data``.

    Raises:
        ValueError: If the covariance matrix cannot be inverted.
    """
    # Accept lists/tuples as well as arrays; fancy indexing below needs an ndarray
    points = np.asarray(data)
    # Work on a float (n_samples, n_features) view; 1D input becomes one column
    features = points.reshape(len(points), -1).astype(float)

    mean = features.mean(axis=0)
    # np.cov collapses to a 0-d array for a single feature; np.linalg.inv
    # needs a square matrix, so restore the (1, 1) shape in that case
    cov_matrix = np.atleast_2d(np.cov(features, rowvar=False))

    try:
        inv_cov_matrix = np.linalg.inv(cov_matrix)
    except np.linalg.LinAlgError as exc:
        # Keep the original LinAlgError attached as the cause for debugging
        raise ValueError("Covariance matrix is singular and cannot be inverted.") from exc

    # Vectorised sqrt((x - mean)^T * inv_cov * (x - mean)) for every row at once
    centered = features - mean
    distances = np.sqrt(np.sum((centered @ inv_cov_matrix) * centered, axis=1))

    # Rows strictly beyond the chosen percentile of distances are outliers
    threshold = np.percentile(distances, percentile)
    outlier_indices = np.where(distances > threshold)[0]
    outlier_values = points[outlier_indices]

    return outlier_indices, outlier_values

# Apply the Mahalanobis detector to the two feature columns (Age, Annual Income)
feature_matrix = df[['Age', 'Annual Income']].to_numpy()
indices, values = detect_outliers_mahalanobis_multivariate(feature_matrix)

print("Mahalanobis Outliers (Index, Age, Annual Income):")
for position, (outlier_age, outlier_income) in zip(indices, values):
    print(f"Index: {position}, Age: {outlier_age}, Annual Income: {outlier_income}")


Mahalanobis Outliers (Index, Age, Annual Income):
Index: 179, Age: 43.600845832948096, Annual Income: 37875.525719672725
Index: 200, Age: 52.02786170602816, Annual Income: 231142.49661737494
Index: 201, Age: 57.00312488523114, Annual Income: 146618.97853087028
Index: 202, Age: 60.026218498590914, Annual Income: 192881.02574893532
Index: 203, Age: 60.23490296882371, Annual Income: 206014.78158559493
Index: 204, Age: 57.749672642603784, Annual Income: 225721.94170293745
