In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.neighbors import LocalOutlierFactor



In [2]:
# --- Local Outlier Factor (LOF) Method for Outlier Detection ---

# When to use this method:
# - When outliers are defined by local density rather than global position.
# - For datasets where the density of clusters varies.
# - It's effective at finding outliers that are close to other clusters.



In [3]:
# 1. We'll use the California Housing dataset for this example.
housing = fetch_california_housing()
df = pd.DataFrame(housing.data, columns=housing.feature_names)

# We will analyze the 'MedInc' (Median Income) feature for outliers.
# LOF expects a 2D array, so we reshape the data.
data = df['MedInc'].values.reshape(-1, 1)





In [4]:
# --- 2. Hyperparameter Tuning ---
# We will test different combinations of hyperparameters to see how they affect the outcome.

# Define the grid of parameters to test for LOF
param_grid = {
    'n_neighbors': [10, 20, 35, 50],
    'contamination': [0.01, 0.05, 0.1]
}

# Store the results
tuning_results = []

print("Running hyperparameter tuning for Local Outlier Factor...")
for n_neigh in param_grid['n_neighbors']:
    for cont in param_grid['contamination']:
        lof = LocalOutlierFactor(n_neighbors=n_neigh, contamination=cont)
        predictions = lof.fit_predict(data)
        num_outliers = np.sum(predictions == -1)
        tuning_results.append({
            'n_neighbors': n_neigh,
            'contamination': cont,
            'num_outliers': num_outliers
        })

# Convert results to a DataFrame for easy viewing
results_df = pd.DataFrame(tuning_results)




Running hyperparameter tuning for Local Outlier Factor...




In [5]:
# --- 3. Display Tuning Results ---
print("\nTuning complete. Here are the results:")
print(results_df)
print("\n--- Interpretation ---")
print("Analyze this table to choose the best parameters. For this example, we'll proceed with n_neighbors=20 and contamination=0.05.")





Tuning complete. Here are the results:
    n_neighbors  contamination  num_outliers
0            10           0.01           207
1            10           0.05          1032
2            10           0.10          2064
3            20           0.01           207
4            20           0.05          1031
5            20           0.10          2064
6            35           0.01           206
7            35           0.05          1032
8            35           0.10          2064
9            50           0.01           207
10           50           0.05          1032
11           50           0.10          2064

--- Interpretation ---
Analyze this table to choose the best parameters. For this example, we'll proceed with n_neighbors=20 and contamination=0.05.


In [30]:
# --- 4. Apply the Final Model ---
# Based on the tuning results and domain knowledge, we select our final parameters.
best_n_neighbors = 20
best_contamination = 0.02

# Create and fit the final model
final_lof = LocalOutlierFactor(n_neighbors=best_n_neighbors, contamination=best_contamination)
predictions = final_lof.fit_predict(data)

# Get the indices of the outliers
outlier_indices = np.where(predictions == -1)[0]

# Get the outlier scores (negative_outlier_factor_)
# Lower scores mean more abnormal.
outlier_scores = final_lof.negative_outlier_factor_[outlier_indices]





In [31]:
# 5. Create a DataFrame of the outliers
outliers_df = df.loc[outlier_indices].copy()
outliers_df['LOF_Score'] = outlier_scores




In [32]:
# --- 6. Display Final Outliers ---
print(f"\n--- Applying Final Model (n_neighbors={best_n_neighbors}, contamination={best_contamination}) ---")
print(f"Found {len(outliers_df)} outliers.")
print("\nDataFrame containing the identified outliers (first 5 rows):")
print(outliers_df.head())


--- Applying Final Model (n_neighbors=20, contamination=0.02) ---
Found 413 outliers.

DataFrame containing the identified outliers (first 5 rows):
     MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
52   1.1108      41.0  4.473611   1.184722      1959.0  2.720833     37.82   
77   1.1111      19.0  5.830918   1.173913       721.0  3.483092     37.81   
107  3.1691      52.0  4.165877   1.090047       686.0  1.625592     37.81   
144  3.0812      38.0  4.628337   1.098563       951.0  1.952772     37.80   
159  2.7477      52.0  4.333333   1.138716       918.0  1.900621     37.81   

     Longitude     LOF_Score  
52     -122.27 -1.306733e+00  
77     -122.28 -1.292857e+00  
107    -122.25 -1.252870e+00  
144    -122.21 -1.262398e+00  
159    -122.24 -1.002000e+07  


In [33]:
outliers_df['LOF_Score'].describe()

count    4.130000e+02
mean    -4.409963e+07
std      3.262592e+08
min     -3.266671e+09
25%     -1.200000e+07
50%     -4.220001e+06
75%     -1.359197e+00
max     -1.244730e+00
Name: LOF_Score, dtype: float64