In [1]:
# Question: Multivariate Outlier Detection Using Mahalanobis Distance
# Description: Implement Mahalanobis distance to detect multivariate outliers in a dataset.

import numpy as np
import pandas as pd
from scipy.stats import chi2
from scipy.spatial.distance import mahalanobis

# Sample DataFrame
# NOTE: every Y value equals X - 1, so the two columns are perfectly
# collinear and their 2x2 covariance matrix is singular (rank 1).
data = {
    'X': [2, 4, 6, 8, 10, 100],  # Last value is an outlier
    'Y': [1, 3, 5, 7, 9, 99]     # Last value is an outlier
}
df = pd.DataFrame(data)

# Step 1: Calculate the mean and covariance matrix
mean_vector = df.mean().values
# np.cov expects one variable per row, hence the transpose.
cov_matrix = np.cov(df.T)
# np.linalg.inv raises "LinAlgError: Singular matrix" on this data.  The
# Moore-Penrose pseudo-inverse coincides with the ordinary inverse (up to
# rounding) when the covariance matrix is invertible and stays defined when
# it is not.  hermitian=True is valid because a covariance matrix is
# symmetric.
inv_cov_matrix = np.linalg.pinv(cov_matrix, hermitian=True)

# Step 2: Calculate Mahalanobis distance for each observation
def mahalanobis_distance(x, mean, inv_cov):
    """Return the Mahalanobis distance between ``x`` and ``mean``.

    Delegates to ``scipy.spatial.distance.mahalanobis``.

    Args:
        x: Observation vector.
        mean: Reference (centre) vector to measure the distance from.
        inv_cov: Inverse of the covariance matrix.

    Returns:
        The distance as computed by SciPy.
    """
    distance = mahalanobis(x, mean, inv_cov)
    return distance

# Evaluate the distance row by row, using the feature columns only so that
# any other column in the DataFrame is never fed into the distance.
df['Mahalanobis_Distance'] = df[['X', 'Y']].apply(
    lambda row: mahalanobis_distance(row, mean_vector, inv_cov_matrix),
    axis=1,
)

# Step 3: Determine threshold using Chi-squared distribution
# The degrees of freedom are the number of features (the length of the mean
# vector), not a value derived from how many columns the DataFrame currently
# has.  The square root puts the cutoff on the same scale as the
# (non-squared) distances above.
# NOTE(review): with only six rows, a distance based on the sample mean and
# sample covariance cannot exceed (n - 1) / sqrt(n) ~= 2.04, which is below
# this cutoff (~2.72), so no row can be flagged -- confirm the intended
# sample size / quantile.
threshold = np.sqrt(chi2.ppf(0.975, df=mean_vector.size))  # 97.5% chi-squared quantile

# Step 4: Flag outliers
df['Outlier'] = df['Mahalanobis_Distance'] > threshold

# Display results
print(df)


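LinAlgError: Singular matrix   (raised by np.linalg.inv(cov_matrix) in Step 1: Y = X - 1 for every row, so the covariance matrix has no inverse)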