In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler



In [2]:
# --- One-Class SVM Method for Outlier Detection ---

# When to use this method:
# - For high-dimensional data.
# - When you want to define a "boundary" around normal data points.
# - It's effective for "novelty detection" (identifying new, unseen patterns).
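# Note: One-Class SVM is sensitive to outliers that are already present in the training data
# (it works best when trained on mostly "clean" data), and the features usually need to be scaled first.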



In [3]:
# 1. Load and Scale the Data
# Wrap the California housing feature matrix in a DataFrame labelled by feature name.
housing = fetch_california_housing()
df = pd.DataFrame(data=housing.data, columns=housing.feature_names)

# Only median income is modelled. The estimators expect a 2-D
# (n_samples, n_features) array, so add a trailing axis to the single column.
data_feature = df['MedInc'].to_numpy()[:, np.newaxis]

# SVMs are scale-sensitive: standardize the feature before fitting.
scaler = StandardScaler()
data = scaler.fit(data_feature).transform(data_feature)




In [4]:
# --- 2. Hyperparameter Tuning ---
# Every (nu, gamma) combination below is fitted once and the number of points
# labelled as outliers (-1) is recorded, so the combinations can be compared.
#
# nu: a number in (0, 1] that plays a role similar to the contamination
#     parameter of other outlier detectors. scikit-learn documents it as an
#     upper bound on the fraction of training errors and a lower bound on the
#     fraction of support vectors.
#       - A small nu (e.g., 0.01) assumes very few points are outliers.
#       - A larger nu (e.g., 0.1) lets the model flag a larger share of the data.
#
# gamma: controls the "reach" of a single data point; only relevant for the
#     rbf kernel used here.
#       - 'scale' (the default): 1 / (n_features * X.var()).
#       - 'auto': 1 / n_features.
#       - A small value (e.g., 0.01): far-reaching influence, giving a smoother,
#         more general boundary around the normal data.
#       - A large value (e.g., 10): very local influence, giving a tighter, more
#         complex boundary that may overfit the training data.
#     NOTE: `data` is a single standardized feature (variance 1), so 'scale'
#     and 'auto' both resolve to gamma = 1.0 and produce identical rows.

# Grid of parameters to test for One-Class SVM
param_grid = {
    'nu': [0.01, 0.05, 0.1],
    'gamma': ['scale', 'auto', 0.1]
}


def _count_flagged(nu_val, gamma_val):
    """Fit an rbf One-Class SVM on `data` and return how many points it labels -1."""
    labels = OneClassSVM(nu=nu_val, kernel='rbf', gamma=gamma_val).fit_predict(data)
    return (labels == -1).sum()


print("Running hyperparameter tuning for One-Class SVM...")

# One record per combination, with nu as the outer loop and gamma as the inner loop.
tuning_results = [
    {
        'nu': nu_val,
        'gamma': gamma_val,
        'num_outliers': _count_flagged(nu_val, gamma_val),
    }
    for nu_val in param_grid['nu']
    for gamma_val in param_grid['gamma']
]

# Tabulate the records for easy viewing
results_df = pd.DataFrame(tuning_results)




Running hyperparameter tuning for One-Class SVM...


In [5]:
# --- 3. Display Tuning Results ---
# Header, results table and interpretation note, printed one after another.
tuning_report = (
    "\nTuning complete. Here are the results:",
    results_df,
    "\n--- Interpretation ---",
    "Analyze this table to choose the best parameters. For this example, we'll proceed with nu=0.05 and gamma='scale'.",
)
for report_item in tuning_report:
    print(report_item)






Tuning complete. Here are the results:
     nu  gamma  num_outliers
0  0.01  scale           380
1  0.01   auto           380
2  0.01    0.1           203
3  0.05  scale          1116
4  0.05   auto          1116
5  0.05    0.1          1033
6  0.10  scale          2078
7  0.10   auto          2078
8  0.10    0.1          2064

--- Interpretation ---
Analyze this table to choose the best parameters. For this example, we'll proceed with nu=0.05 and gamma='scale'.


In [6]:
# --- 4. Apply the Final Model ---
# Parameters picked from the tuning table.
best_nu = 0.05
best_gamma = 'scale'

# Fit on the scaled feature and label every point (+1 inlier, -1 outlier).
final_svm = OneClassSVM(kernel='rbf', gamma=best_gamma, nu=best_nu)
predictions = final_svm.fit_predict(data)

# Row positions of the points labelled as outliers.
outlier_indices = np.flatnonzero(predictions == -1)

# Decision-function value of each outlier (the lower, the more abnormal).
all_scores = final_svm.decision_function(data)
outlier_scores = all_scores[outlier_indices]



In [7]:
# 5. Create a DataFrame of the outliers
# `outlier_indices` holds row *positions* in the prediction array, so rows are
# selected positionally with .iloc. Label-based .loc gives the same rows only
# while `df` keeps its default 0..n-1 index, and would select the wrong rows
# (or raise) if `df` were ever filtered or re-indexed.
outliers_df = df.iloc[outlier_indices].copy()

# Scores are in the same order as `outlier_indices`, so they line up row by row.
outliers_df['SVM_Score'] = outlier_scores




In [8]:
# --- 6. Display Final Outliers ---
# Banner with the chosen parameters, the outlier count, and a preview of the rows.
final_report = (
    f"\n--- Applying Final Model (nu={best_nu}, gamma='{best_gamma}') ---",
    f"Found {len(outliers_df)} outliers.",
    "\nDataFrame containing the identified outliers (first 5 rows):",
    outliers_df.head(),
)
for report_item in final_report:
    print(report_item)


--- Applying Final Model (nu=0.05, gamma='scale') ---
Found 1116 outliers.

DataFrame containing the identified outliers (first 5 rows):
    MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0   8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1   8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
11  3.2705      52.0  4.772480   1.024523      1504.0  2.049046     37.85   
42  1.0250      49.0  3.772487   1.068783       462.0  2.444444     37.84   
48  0.9506      40.0  3.900000   1.218750       423.0  2.643750     37.82   

    Longitude  SVM_Score  
0     -122.23  -0.029071  
1     -122.22  -0.026198  
11    -122.26  -0.000253  
42    -122.26  -1.328459  
48    -122.26  -3.898390  


In [9]:
# Show the decision-function score of every flagged row (negative = outlier side).
outliers_df.loc[:, 'SVM_Score']

0       -0.029071
1       -0.026198
11      -0.000253
42      -1.328459
48      -3.898390
           ...   
20465   -0.000469
20495   -0.000085
20503   -0.022914
20537   -0.152227
20592   -6.160399
Name: SVM_Score, Length: 1116, dtype: float64