In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing


In [2]:
# --- Modified Z-Score Calculation ---

# The modified z-score is a good fit:
# - for small datasets, where the mean/std are easily skewed;
# - when the data is skewed rather than normally distributed;
# - when extreme outliers should have less influence on the detection itself.

# Load the California Housing dataset into a DataFrame for this example.
housing = fetch_california_housing()
df = pd.DataFrame(
    data=housing.data,
    columns=housing.feature_names,
)

# 'MedInc' (Median Income) is the feature that gets screened for outliers.
data = df.loc[:, 'MedInc'].values



In [3]:
# 2. Robust centre and spread: the median and the Median Absolute Deviation (MAD)
median_val = np.median(data)
# Distance of every observation from the median.
abs_deviation = np.absolute(data - median_val)
# MAD is the median of those distances.
mad = np.median(abs_deviation)



In [4]:
# 3. Calculate the Modified Z-scores
centered = data - median_val
if mad != 0:
    # The constant 0.6745 makes the MAD a consistent estimator for the standard deviation.
    mod_z_scores = 0.6745 * centered / mad
else:
    # MAD is 0 when more than half of the values equal the median. Scoring every
    # point as 0 in that case would silently hide all outliers, so fall back to the
    # mean absolute deviation (MeanAD). 1.253314 (~sqrt(pi/2)) scales MeanAD to be
    # comparable to the standard deviation for normally distributed data.
    mean_abs_dev = np.mean(np.abs(centered))
    if mean_abs_dev != 0:
        mod_z_scores = centered / (1.253314 * mean_abs_dev)
    else:
        # Every value is identical, so there is nothing to flag.
        # dtype=float keeps the scores floating-point even for integer input.
        mod_z_scores = np.zeros_like(data, dtype=float)



In [5]:
# 4. Flag the observations whose absolute modified z-score exceeds the cutoff
outlier_threshold = 3.0
# np.nonzero on a boolean mask returns a 1-tuple holding the positions of the True entries.
outlier_indices = np.nonzero(np.absolute(mod_z_scores) > outlier_threshold)
# Raw feature values at those positions.
outlier_values = data[outlier_indices]




In [6]:
# 5. Create a new DataFrame containing only the outliers
# outlier_indices holds integer *positions* wrapped in a 1-tuple (the shape returned by
# np.where / np.nonzero), not index labels. Unpack the tuple and select rows
# positionally with .iloc; label-based .loc would only pick the same rows while df
# keeps its default RangeIndex.
outlier_positions = outlier_indices[0]
outliers_df = df.iloc[outlier_positions].copy()
outliers_df['Modified_Z_Score'] = mod_z_scores[outlier_positions]

In [7]:
# --- Display Results ---
# Prints a short summary of the run followed by the first five flagged rows.
print("Analyzing feature: 'MedInc' from California Housing dataset")
print(f"Number of data points: {len(data)}")
print(f"Found {len(outliers_df)} outliers with a threshold of {outlier_threshold}.")
print("-" * 30)
print("\nDataFrame containing the identified outliers (first 5 rows):")
print(outliers_df.head())


# Example of what the output looks like (recorded run with outlier_threshold = 3.0;
# pandas may wrap the wide table across several lines, shown unwrapped here):
# Analyzing feature: 'MedInc' from California Housing dataset
# Number of data points: 20640
# Found 583 outliers with a threshold of 3.0.
# ------------------------------
#
# DataFrame containing the identified outliers (first 5 rows):
#       MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  Longitude  Modified_Z_Score
# 0     8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88    -122.23          3.036201
# 1     8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86    -122.22          3.021116
# 131  11.6017      18.0  8.335052   1.082474       533.0  2.747423     37.84    -122.19          5.112877
# 135   8.4010      26.0  7.530806   1.056872       542.0  2.568720     37.83    -122.20          3.084243
# 137   8.3170      32.0  6.977186   1.003802       635.0  2.414449     37.82    -122.19          3.031003

Analyzing feature: 'MedInc' from California Housing dataset
Number of data points: 20640
Found 583 outliers with a threshold of 3.0.
------------------------------

DataFrame containing the identified outliers (first 5 rows):
      MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0     8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1     8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
131  11.6017      18.0  8.335052   1.082474       533.0  2.747423     37.84   
135   8.4010      26.0  7.530806   1.056872       542.0  2.568720     37.83   
137   8.3170      32.0  6.977186   1.003802       635.0  2.414449     37.82   

     Longitude  Modified_Z_Score  
0      -122.23          3.036201  
1      -122.22          3.021116  
131    -122.19          5.112877  
135    -122.20          3.084243  
137    -122.19          3.031003  


In [8]:
# Summary statistics of the modified z-scores among the flagged rows.
outliers_df["Modified_Z_Score"].describe()

count    583.000000
mean       4.388262
std        1.254496
min        3.003560
25%        3.338813
50%        4.147839
75%        4.836885
max        7.266815
Name: Modified_Z_Score, dtype: float64