# Hospital Anomalies: Isolation Forest Baseline Model

This notebook demonstrates the baseline Isolation Forest anomaly detection model.

In [None]:
import sys
from pathlib import Path

# Put the directory three levels above the working directory first on the import path.
# NOTE(review): presumably this is the repository root, needed so the `publicdata_ca` and
# `case_studies` imports below resolve; it assumes the kernel's working directory is this
# notebook's folder — confirm.
sys.path.insert(0, str(Path.cwd().parent.parent.parent))

import pandas as pd
import matplotlib.pyplot as plt
from publicdata_ca.utils.config import load_config
from case_studies.hospital_anomalies.src.ingest import load_ingested_data
from case_studies.hospital_anomalies.src.features import engineer_features
from case_studies.hospital_anomalies.src.models.isolation_forest import IsolationForestDetector
from case_studies.hospital_anomalies.src.visualize import plot_time_series_with_anomalies

%matplotlib inline

## 1. Load Data and Configuration

In [None]:
# Resolve the configuration file relative to the working directory.
# NOTE(review): assumes the kernel was started in this notebook's folder so that
# `../config/default.yaml` exists — confirm.
config_path = Path.cwd().parent / 'config' / 'default.yaml'
config = load_config(config_path)
config_dict = config.to_dict()

# Hand the configuration's 'datasets' entry to the loader; the result is used as a mapping.
datasets = load_ingested_data(config_dict['datasets'])

# Fail with an actionable message instead of a bare IndexError when nothing was loaded.
loaded_frames = list(datasets.values())
if not loaded_frames:
    raise ValueError(
        f"No datasets were loaded using {config_path}; "
        "check the 'datasets' section of the configuration."
    )
df = loaded_frames[0]  # Use first dataset

## 2. Feature Engineering

In [None]:
# Build the feature table from the selected dataset and the loaded configuration.
features_df = engineer_features(df, config_dict)

# Report how many columns the resulting table has, then preview its first five rows.
print(f"Engineered {len(features_df.columns)} features")
display(features_df.head(n=5))

## 3. Train Isolation Forest

In [None]:
# Model inputs: every engineered column except the identifier and calendar fields.
feature_cols = [
    name
    for name in features_df.columns
    if name not in {'date', 'region', 'hospital_id', 'year', 'month', 'day'}
]
# Rows with any missing model input are dropped; X keeps the surviving rows' index labels.
X = features_df.loc[:, feature_cols].dropna()

# Hyperparameters are read from the 'isolation_forest' section of the configuration.
if_config = config_dict['isolation_forest']
detector = IsolationForestDetector(
    **{key: if_config[key] for key in ('n_estimators', 'contamination', 'random_state')}
)
detector.fit(X)

print(f"Model trained on {len(X)} samples")

## 4. Detect Anomalies

In [None]:
# Run the fitted detector over the same rows it was trained on.
predictions = detector.get_anomalies(X)

# Summarise the flags: the count of marked rows and their share of all scored rows.
print(
    f"Total anomalies: {predictions['is_anomaly'].sum()}",
    f"Anomaly rate: {predictions['is_anomaly'].mean():.2%}",
    sep="\n",
)

## 5. Visualize Results

In [None]:
# Combine predictions with the engineered rows that were actually scored
# (X.index holds the labels of the rows that survived dropna()).
results_df = features_df.loc[X.index].copy()
# NOTE(review): if `predictions` is index-labelled pandas data, these assignments align on
# index rather than position — confirm get_anomalies() preserves X's index, otherwise flags
# and scores could land on the wrong rows or become NaN.
results_df['is_anomaly'] = predictions['is_anomaly']
results_df['anomaly_score'] = predictions['anomaly_score']

# Plot time series with anomalies, using the first non-identifier column of the raw dataset.
value_col = [c for c in df.columns if c not in ['date', 'region', 'hospital_id']][0]
if value_col in results_df.columns:
    plot_time_series_with_anomalies(
        results_df, 'date', value_col, 'is_anomaly',
        title=f'{value_col} with Detected Anomalies'
    )
    plt.show()
else:
    # Say why there is no figure instead of leaving an empty cell output.
    print(f"Skipping plot: column {value_col!r} is not present in the engineered features")

## 6. Top Anomalies

In [None]:
# Keep only the flagged rows and take the ten with the smallest anomaly_score.
# NOTE(review): this assumes a lower anomaly_score means "more anomalous" — confirm against
# IsolationForestDetector.
top_anomalies = results_df[results_df['is_anomaly']].nsmallest(10, 'anomaly_score')

# Show only the columns that exist: the plotting cell already allows for value_col being
# absent from the engineered frame, so avoid a KeyError here in that same situation.
display_cols = [c for c in ['date', value_col, 'anomaly_score'] if c in top_anomalies.columns]
display(top_anomalies[display_cols])