In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing



In [2]:
# --- IQR (Interquartile Range) Method for Outlier Detection ---

# When to use this method:
# - When the data is skewed (not normally distributed).
# - When you need a method that is robust to extreme outliers.
# - It's the basis for outlier detection in standard box plots.


In [11]:
# NOTE(review): this cell carries execution count In [11] but sits above the
# In [3] cell that creates `df`; as far as this notebook shows, it would raise
# NameError on Restart Kernel -> Run All. Move it below the data-loading cell.
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [3]:
# 1. Load the California Housing dataset and wrap its feature matrix in a
#    DataFrame whose columns are the dataset's feature names.
housing = fetch_california_housing()
df = pd.DataFrame(data=housing.data, columns=housing.feature_names)

# Pull out the 'MedInc' (Median Income) column as a NumPy array; this is the
# feature that the following cells screen for outliers.
data = df['MedInc'].to_numpy()


In [4]:
# 2. Quartiles and the Interquartile Range (IQR)
# One percentile call yields both quartiles; the IQR is the distance between them.
Q1, Q3 = np.percentile(data, [25, 75])
IQR = Q3 - Q1



In [5]:
# 3. Outlier fences
# Values further than 1.5 * IQR below Q1 or above Q3 count as outliers
# (the 1.5 factor is a standard convention).
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR



In [6]:
# 4. Identify outliers
# Flag every value lying below the lower bound or above the upper bound, then
# collect the integer positions of the flagged values within `data`.
outlier_indices = np.flatnonzero(np.logical_or(data < lower_bound, data > upper_bound))



In [7]:
# 5. Create a new DataFrame containing only the outliers
# `outlier_indices` holds integer positions into `data`, so rows are selected with
# position-based .iloc rather than label-based .loc; the two only coincide while
# df keeps its default 0..n-1 index. .copy() makes the result independent of df,
# so the column added below does not touch the original frame.
outliers_df = df.iloc[outlier_indices].copy()

# Label each outlier as 'High' (above the upper bound) or 'Low' (otherwise) for context.
outliers_df['Outlier_Type'] = np.where(outliers_df['MedInc'] > upper_bound, 'High', 'Low')




In [8]:
# --- Display Results ---
# Print the quartile statistics, the outlier bounds, the number of outliers found,
# and a preview of the first five outlier rows.
print("Analyzing feature: 'MedInc' from California Housing dataset using IQR Method")
print(f"Q1: {Q1:.4f}, Q3: {Q3:.4f}, IQR: {IQR:.4f}")
print(f"Lower Bound for Outliers: {lower_bound:.4f}")
print(f"Upper Bound for Outliers: {upper_bound:.4f}")
print("-" * 30)
print(f"Found {len(outliers_df)} outliers.")
print("\nDataFrame containing the identified outliers (first 5 rows):")
print(outliers_df.head())

# Summary figures from the recorded run of this cell:
#   Q1: 2.5634, Q3: 4.7432, IQR: 2.1799
#   Lower Bound for Outliers: -0.7064
#   Upper Bound for Outliers: 8.0130
#   Found 681 outliers.
# The five previewed rows in that run are rows 0, 1, 131, 134 and 135, all
# labelled 'High'.

Analyzing feature: 'MedInc' from California Housing dataset using IQR Method
Q1: 2.5634, Q3: 4.7432, IQR: 2.1799
Lower Bound for Outliers: -0.7064
Upper Bound for Outliers: 8.0130
------------------------------
Found 681 outliers.

DataFrame containing the identified outliers (first 5 rows):
      MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0     8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1     8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
131  11.6017      18.0  8.335052   1.082474       533.0  2.747423     37.84   
134   8.2049      28.0  6.978947   0.968421       463.0  2.436842     37.83   
135   8.4010      26.0  7.530806   1.056872       542.0  2.568720     37.83   

     Longitude Outlier_Type  
0      -122.23         High  
1      -122.22         High  
131    -122.19         High  
134    -122.19         High  
135    -122.20         High  


In [9]:
# Summarise the outlier labels: count, number of distinct labels, the most
# common label and its frequency.
outliers_df.loc[:, 'Outlier_Type'].describe()

count      681
unique       1
top       High
freq       681
Name: Outlier_Type, dtype: object