In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing

In [2]:
# --- Standard Z-Score Calculation with a Real Dataset ---

# When to use this method:
# - When your data is approximately normally distributed (bell-shaped).
# - For larger datasets where the mean is a reliable measure.
# - It's the most common and straightforward method for outlier detection.



In [3]:
# 1. Load the California Housing dataset and wrap its `data` array in a
#    DataFrame whose columns are named after the dataset's features.
housing = fetch_california_housing()
df = pd.DataFrame(data=housing.data, columns=housing.feature_names)

# The outlier analysis targets the 'MedInc' (Median Income) column,
# extracted here as a plain NumPy array.
data = df['MedInc'].to_numpy()



In [4]:
# 2. Summary statistics needed for the Z-score: the mean and the standard
#    deviation (NumPy's default, i.e. the population form with ddof=0).
mean_val, std_val = data.mean(), data.std()



In [5]:
# 3. Standardize every value into a Z-score.
# A standard deviation of exactly 0 would make the division undefined, so in
# that case every Z-score is set to 0 instead.
z_scores = (
    np.zeros_like(data)
    if std_val == 0
    else (data - mean_val) / std_val
)



In [6]:
# 4. Flag outliers: values whose absolute Z-score exceeds the cutoff.
outlier_threshold = 3.0  # 3.0 is a commonly used cutoff for the standard Z-score
# Integer positions (1-D array) of the flagged values within `z_scores`.
outlier_indices = np.nonzero(np.abs(z_scores) > outlier_threshold)[0]



In [7]:
# 5. Create a new DataFrame containing only the outliers.
# `outlier_indices` holds integer *positions* into `z_scores` (and therefore
# into the rows of `df`), so rows are selected positionally with .iloc.
# Label-based .loc only returns the same rows while `df` keeps its default
# 0..n-1 index; after any filtering, sorting, or re-indexing of `df` it would
# pick the wrong rows or raise a KeyError.
outliers_df = df.iloc[outlier_indices].copy()

# Attach each outlier's Z-score for context. Both sides are aligned by
# position: row i of `outliers_df` corresponds to `outlier_indices[i]`.
outliers_df['Z_Score'] = z_scores[outlier_indices]




In [8]:
# --- Display Results ---
# Emit the textual summary in a single call; each argument becomes its own
# output line because of sep="\n".
print(
    "Analyzing feature: 'MedInc' from California Housing dataset using Standard Z-Score",
    f"Number of data points: {len(data)}",
    f"Found {len(outliers_df)} outliers with a threshold of {outlier_threshold}.",
    "-" * 30,
    "\nDataFrame containing the identified outliers (first 5 rows):",
    sep="\n",
)
# Preview only the first five outlier rows rather than the whole frame.
print(outliers_df.head())

# Example of what the output will look like (values taken from an actual run;
# pandas may wrap the wide table across several lines when printing):
# Analyzing feature: 'MedInc' from California Housing dataset using Standard Z-Score
# Number of data points: 20640
# Found 345 outliers with a threshold of 3.0.
# ------------------------------
#
# DataFrame containing the identified outliers (first 5 rows):
#       MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  Longitude   Z_Score
# 131  11.6017      18.0  8.335052   1.082474       533.0  2.747423     37.84    -122.19  4.069443
# 409  10.0825      52.0  8.209016   1.024590       658.0  2.696721     37.90    -122.28  3.269770
# 510  11.8603      39.0  7.911111   0.984127       808.0  2.565079     37.82    -122.22  4.205564
# 511  13.4990      42.0  8.928358   1.000000      1018.0  3.038806     37.82    -122.22  5.068140
# 512  12.2138      52.0  9.210227   1.039773      1001.0  2.843750     37.82    -122.23  4.391639

Analyzing feature: 'MedInc' from California Housing dataset using Standard Z-Score
Number of data points: 20640
Found 345 outliers with a threshold of 3.0.
------------------------------

DataFrame containing the identified outliers (first 5 rows):
      MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
131  11.6017      18.0  8.335052   1.082474       533.0  2.747423     37.84   
409  10.0825      52.0  8.209016   1.024590       658.0  2.696721     37.90   
510  11.8603      39.0  7.911111   0.984127       808.0  2.565079     37.82   
511  13.4990      42.0  8.928358   1.000000      1018.0  3.038806     37.82   
512  12.2138      52.0  9.210227   1.039773      1001.0  2.843750     37.82   

     Longitude   Z_Score  
131    -122.19  4.069443  
409    -122.28  3.269770  
510    -122.22  4.205564  
511    -122.22  5.068140  
512    -122.23  4.391639  


In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) of the outliers' Z-scores.
outliers_df.loc[:, 'Z_Score'].describe()

count    345.000000
mean       4.085177
std        0.935690
min        3.006475
25%        3.346831
50%        3.689819
75%        4.690885
max        5.858286
Name: Z_Score, dtype: float64