In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import IsolationForest



In [2]:
# --- Isolation Forest Method for Outlier Detection ---

# When to use this method:
# - For high-dimensional datasets (multivariate outlier detection).
# - When you don't know the distribution of the data.
# - It's efficient and works well on large datasets.



In [3]:
# 1. We'll use the California Housing dataset for this example.
housing = fetch_california_housing()
df = pd.DataFrame(housing.data, columns=housing.feature_names)

# We will analyze the 'MedInc' (Median Income) feature for outliers.
# Isolation Forest expects a 2D array, so we reshape the data.
data = df['MedInc'].values.reshape(-1, 1)



In [4]:
# --- 2. Hyperparameter Tuning ---
# We will test different combinations of hyperparameters to see how they affect the outcome.
# This is a manual form of "grid search".

# Define the grid of parameters to test
param_grid = {
    'n_estimators': [50, 100, 200],
    'contamination': [0.01, 0.05, 0.1]
}

# Store the results
tuning_results = []

print("Running hyperparameter tuning...")
for n_est in param_grid['n_estimators']:
    for cont in param_grid['contamination']:
        # Create and fit the model with the current parameters
        iso_forest = IsolationForest(
            n_estimators=n_est,
            contamination=cont,
            random_state=42
        )
        
        # Fit the model and get predictions
        predictions = iso_forest.fit_predict(data)
        
        # Count the number of outliers (-1)
        num_outliers = np.sum(predictions == -1)
        
        # Store the results
        tuning_results.append({
            'n_estimators': n_est,
            'contamination': cont,
            'num_outliers': num_outliers
        })

# Convert results to a DataFrame for easy viewing
results_df = pd.DataFrame(tuning_results)

Running hyperparameter tuning...


In [5]:
print("\nTuning complete. Here are the results:")
print(results_df)

print("\n--- Interpretation ---")
print("You can now analyze this table to choose the best parameters.")
print("For example, if you have domain knowledge suggesting around 5% of your data might be anomalous,")
print("you would look at the results for contamination=0.05 and choose the n_estimators value you prefer.")


Tuning complete. Here are the results:
   n_estimators  contamination  num_outliers
0            50           0.01           197
1            50           0.05          1026
2            50           0.10          2064
3           100           0.01           204
4           100           0.05          1029
5           100           0.10          2063
6           200           0.01           206
7           200           0.05          1030
8           200           0.10          2064

--- Interpretation ---
You can now analyze this table to choose the best parameters.
For example, if you have domain knowledge suggesting around 5% of your data might be anomalous,
you would look at the results for contamination=0.05 and choose the n_estimators value you prefer.


In [11]:
# 2.1 Create and fit the Isolation Forest model
# 'contamination' is the expected proportion of outliers. 'auto' is a good default.
# We set a random_state for reproducibility.
iso_forest = IsolationForest(contamination=0.02,n_estimators=100, random_state=42)
predictions = iso_forest.fit_predict(data)

# The model returns -1 for outliers and 1 for inliers.
# We can also get the anomaly scores for more context (lower scores are more abnormal).
anomaly_scores = iso_forest.decision_function(data)



In [12]:
# 3. Identify outliers
# Find indices where the prediction is -1.
outlier_indices = np.where(predictions == -1)[0]



In [13]:
# 4. Create a new DataFrame containing only the outliers
# We use .loc to select rows from the original DataFrame based on the outlier indices.
outliers_df = df.loc[outlier_indices].copy()

# Add the anomaly score to the outliers DataFrame for context
outliers_df['Anomaly_Score'] = anomaly_scores[outlier_indices]


# --- Display Results ---
print(f"Analyzing feature: 'MedInc' from California Housing dataset using Isolation Forest")
print(f"Contamination parameter set to 'auto'")
print("-" * 30)
print(f"Found {len(outliers_df)} outliers.")
print("\nDataFrame containing the identified outliers (first 5 rows):")
print(outliers_df.head())

# Example of what the output will look like:
# Analyzing feature: 'MedInc' from California Housing dataset using Isolation Forest
# Contamination parameter set to 'auto'
# ------------------------------
# Found 1032 outliers.
#
# DataFrame containing the identified outliers (first 5 rows):
#      MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  Longitude  Anomaly_Score
# 9   15.0001      52.0  8.288136   1.076271       592.0  2.508475     37.85    -122.25      -0.129339
# 10  15.0001      52.0  8.288136   1.076271       592.0  2.508475     37.85    -122.25      -0.129339
# 11  15.0001      52.0  8.288136   1.076271       592.0  2.508475     37.85    -122.25      -0.129339
# 12  15.0001      52.0  8.288136   1.076271       592.0  2.508475     37.85    -122.25      -0.129339
# 13  15.0001      52.0  8.288136   1.076271       592.0  2.508475     37.85    -122.25      -0.129339

Analyzing feature: 'MedInc' from California Housing dataset using Isolation Forest
Contamination parameter set to 'auto'
------------------------------
Found 409 outliers.

DataFrame containing the identified outliers (first 5 rows):
      MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
71    0.7286      46.0  3.375451   1.072202       582.0  2.101083     37.81   
73    0.4999      46.0  1.714286   0.571429        18.0  2.571429     37.81   
87    0.7600      10.0  2.651515   1.054545       546.0  1.654545     37.81   
92    0.7500      52.0  2.823529   0.911765       191.0  5.617647     37.80   
131  11.6017      18.0  8.335052   1.082474       533.0  2.747423     37.84   

     Longitude  Anomaly_Score  
71     -122.29      -0.004149  
73     -122.29      -0.025379  
87     -122.27      -0.000920  
92     -122.28      -0.001380  
131    -122.19      -0.073428  


In [14]:
outliers_df['Anomaly_Score'].describe()

count    409.000000
mean      -0.055741
std        0.042035
min       -0.132964
25%       -0.089751
50%       -0.042890
75%       -0.023450
max       -0.000920
Name: Anomaly_Score, dtype: float64