In [1]:
# Question: Multivariate Outlier Detection Using Mahalanobis Distance
# Description: Implement Mahalanobis distance to detect multivariate outliers in a dataset.

import numpy as np
import pandas as pd
from scipy.spatial import distance

def mahalanobis_distance(data):
    """
    Calculates the Mahalanobis distance of every observation from the sample mean.

    The sample covariance (``np.cov`` with its default normalisation) is inverted
    with the Moore-Penrose pseudo-inverse, so a singular covariance matrix
    (for example a constant column) still produces finite distances; directions
    with zero variance simply contribute nothing. For a well-conditioned
    covariance the result matches the ordinary inverse up to floating-point
    rounding.

    Args:
        data (numpy.ndarray, pandas.DataFrame or nested sequence): The multivariate dataset.
            Rows represent observations, and columns represent variables.

    Returns:
        numpy.ndarray: A 1-D float array with one Mahalanobis distance per row.
            Every entry is NaN when the covariance cannot be estimated
            (fewer than two observations, or non-finite values in the data).

    Raises:
        ValueError: If ``data`` is not two-dimensional.
    """
    # np.asarray accepts DataFrames, ndarrays and nested lists alike.
    data = np.asarray(data, dtype=float)
    if data.ndim != 2:
        raise ValueError(
            "data must be 2-dimensional (observations x variables), "
            f"got {data.ndim} dimension(s)"
        )

    n_observations = data.shape[0]
    if n_observations < 2:
        # The sample covariance is undefined with fewer than two observations.
        return np.full(n_observations, np.nan)

    centered = data - data.mean(axis=0)

    # np.cov collapses a single-variable dataset to a 0-d array; restore the
    # (variables x variables) matrix shape so the linear algebra below works.
    covariance = np.atleast_2d(np.cov(data, rowvar=False))
    if not np.isfinite(covariance).all():
        # NaN/inf in the input cannot yield meaningful distances; report NaN
        # for every observation rather than failing inside the SVD.
        return np.full(n_observations, np.nan)

    inv_covariance = np.linalg.pinv(covariance)

    # Row-wise quadratic form d_i^T * VI * d_i, computed for all rows at once.
    squared = np.einsum('ij,jk,ik->i', centered, inv_covariance, centered)

    # Rounding can push an exact zero slightly negative; clamp before the sqrt.
    return np.sqrt(np.maximum(squared, 0.0))

def detect_multivariate_outliers_mahalanobis(data, threshold_multiplier=3):
    """
    Flags observations whose Mahalanobis distance is unusually large.

    An observation is marked as an outlier when its distance is strictly greater
    than ``mean(distances) + threshold_multiplier * std(distances)``, where both
    statistics are taken over the distances of all rows.

    Args:
        data (numpy.ndarray or pandas.DataFrame): The multivariate dataset.
            Rows represent observations, and columns represent variables.
        threshold_multiplier (float, optional): Number of standard deviations above
            the mean distance at which the cut-off is placed. Defaults to 3.

    Returns:
        pandas.DataFrame: A copy of the input (converted to a DataFrame if needed)
            with two extra columns: 'Mahalanobis_Distance' holding each row's
            distance and 'Is_Outlier' holding the boolean verdict.
    """
    distances = mahalanobis_distance(data)

    # Cut-off is relative to the spread of the distances themselves.
    cutoff = distances.mean() + threshold_multiplier * distances.std()

    # Work on a copy so the caller's DataFrame is never modified.
    result = data.copy() if isinstance(data, pd.DataFrame) else pd.DataFrame(data)
    result['Mahalanobis_Distance'] = distances
    result['Is_Outlier'] = distances > cutoff
    return result

if __name__ == '__main__':
    # Demo 1: a 2-D array wrapped in a DataFrame with named feature columns.
    # The last two rows ([10, 15] and [-5, -3]) are the candidate outliers.
    data = np.array([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6], [10, 15], [-5, -3]])
    df = pd.DataFrame(data, columns=['Feature_1', 'Feature_2'])

    # Run with the default threshold multiplier.
    outlier_results = detect_multivariate_outliers_mahalanobis(df)
    print("Outlier Detection using Mahalanobis Distance:\n", outlier_results)

    print("\n---")

    # Demo 2: a DataFrame built column by column, with a tighter threshold.
    x_values = [1, 2, 3, 4, 5, 10, -1]
    y_values = [2, 3, 4, 5, 6, 15, -2]
    data_df = pd.DataFrame({'X': x_values, 'Y': y_values})

    outlier_results_df = detect_multivariate_outliers_mahalanobis(
        data_df,
        threshold_multiplier=2,
    )
    print("Outlier Detection on DataFrame:\n", outlier_results_df)

Outlier Detection using Mahalanobis Distance:
    Feature_1  Feature_2  Mahalanobis_Distance  Is_Outlier
0          1          2              0.515382       False
1          2          3              0.481576       False
2          3          4              0.574646       False
3          4          5              0.748684       False
4          5          6              0.960648       False
5         10         15              2.260033       False
6         -5         -3              2.140356       False

---
Outlier Detection on DataFrame:
     X   Y  Mahalanobis_Distance  Is_Outlier
0   1   2              1.478626       False
1   2   3              0.718867       False
2   3   4              0.172666       False
3   4   5              0.851743       False
4   5   6              1.613607       False
5  10  15              2.067194       False
6  -1  -2              1.290192       False
