In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from scipy.spatial.distance import mahalanobis
from scipy.stats import chi2

# --- Mahalanobis Distance Method for Outlier Detection ---

# When to use this method:
# - For multivariate data where features are correlated.
# - When outliers are unusual combinations of values.
# - When the data is roughly elliptically distributed.



In [2]:
# 1. Load the dataset
housing = fetch_california_housing()
df = pd.DataFrame(housing.data, columns=housing.feature_names)

# We'll use a subset of features to demonstrate.
# This method shines when features are correlated.
features = ['MedInc', 'HouseAge', 'AveRooms', 'AveOccup']
data = df[features]



In [3]:
# 2. Calculate the Mahalanobis Distance
# First, calculate the mean and inverse covariance matrix
mean = data.mean().values
cov = data.cov().values
inv_cov_matrix = np.linalg.inv(cov)

# Calculate Mahalanobis distance for each data point
mahal_distances = []
for i in range(len(data)):
    dist = mahalanobis(data.iloc[i], mean, inv_cov_matrix)
    mahal_distances.append(dist)

df['Mahalanobis_Distance'] = mahal_distances



In [4]:
# 3. Determine the outlier threshold
# The squared Mahalanobis distance follows a Chi-squared distribution
# with degrees of freedom equal to the number of variables (p).
p = len(features) # Number of features
alpha = 0.02 # Significance level (e.g., 1% of data as outliers)
threshold = chi2.ppf(1 - alpha, df=p)




In [5]:
# 4. Identify outliers
outlier_indices = np.where(df['Mahalanobis_Distance'] > threshold)[0]




In [6]:
# 5. Create a new DataFrame containing only the outliers
outliers_df = df.loc[outlier_indices].copy()




In [7]:
# --- 6. Display Results ---
print(f"Analyzing features: {features}")
print(f"Chi-squared threshold (p={p}, alpha={alpha}): {threshold:.4f}")
print("-" * 30)
print(f"Found {len(outliers_df)} outliers.")
print("\nDataFrame containing the identified outliers (first 5 rows):")
# We display the selected features plus the calculated distance
print(outliers_df[features + ['Mahalanobis_Distance']].head())


# Example of what the output will look like:
# Analyzing features: ['MedInc', 'HouseAge', 'AveRooms', 'AveOccup']
# Chi-squared threshold (p=4, alpha=0.01): 13.2767
# ------------------------------
# Found 244 outliers.
#
# DataFrame containing the identified outliers (first 5 rows):
#        MedInc  HouseAge   AveRooms    AveOccup  Mahalanobis_Distance
# 1914   1.8333      23.0  14.619048    2.285714              4.062013
# 1979   4.5000      16.0  25.363636    3.090909              6.046313
# 2011   1.6250      16.0  28.000000    2.800000              7.124614
# 2799   2.5357      35.0  41.923077    2.153846             10.638531
# 3033  15.0001      25.0  15.818182    3.045455              4.288283

Analyzing features: ['MedInc', 'HouseAge', 'AveRooms', 'AveOccup']
Chi-squared threshold (p=4, alpha=0.02): 11.6678
------------------------------
Found 26 outliers.

DataFrame containing the identified outliers (first 5 rows):
      MedInc  HouseAge   AveRooms  AveOccup  Mahalanobis_Distance
1239  3.0750      13.0  33.989691  2.752577             12.368836
1240  3.1250      11.0  47.515152  2.484848             18.155487
1872  2.4917      27.0  38.840909  2.204545             14.642641
1912  4.9750      16.0  56.269231  2.076923             21.616693
1913  4.0714      19.0  61.812500  2.333333             24.177527
