# Model Simulation

In [1]:
## Standard imports
import itertools

import numpy as np
import pandas as pd
## Script imports
import simuFlares
from STL_IF import STLIF
import detectFlare
from sigma_clip import sigma_clip
## Simulation status
from IPython.display import clear_output

## Setup
# Reproducibility: seed NumPy's global RNG once so that Restart & Run All gives
# the same simulated flares and metrics.
# NOTE(review): this only takes effect if simuFlares / STLIF draw from NumPy's
# global generator - TODO confirm; otherwise pass a seed/random_state to them.
SEED = 42
np.random.seed(SEED)

# Load data: read the light curve indexed by time, keep only the PDCSAP flux
# column and drop rows where it is missing.
pdcsap = pd.read_csv("../0.Data/031381302.csv", index_col = 'time').loc[:, ["pdcsap_flux"]].dropna()
# Calm interval: restrict to index (time) values in [1442, 1449]
pdcsap = pdcsap.query("1442 <= index <= 1449")
# Integer sample positions 0..N-1, passed to the flare simulator as its time array
inds = np.arange(pdcsap.shape[0])

## Flare parameters
num_flares = 5
# Base half-peak timescale: larger values => all flares last longer (relative to their amplitudes)
# NOTE(review): a previous value of 2.5 and the remark "e.g. 10 minutes (2-min cadence)"
# were left next to this line; the units of t_half relative to `inds` are not evident
# from this notebook - confirm against simuFlares.kepler_flare.
t_half = 4.32/120
# Flare amplitude (Pareto) parameters, expressed relative to the mean flux of the calm interval
mean_flux = pdcsap['pdcsap_flux'].mean()
xm = mean_flux * 0.02        # Scale (~ x_min): Baseline amplitude (values will rarely be smaller than this)
alpha = 2                    # Shape: smaller => heavier tail = more large flares
offset = 0                   # Offset amplitudes (shift)
upper = mean_flux * 0.1      # Amplitude cap
print(upper)
# Alternative absolute-valued parameter set tried previously:
# xm=10, alpha=1, offset=30, upper=100

156.44160040416168


In [None]:
## Isolation Forest parameters
contamination = 0.001  # Expected share of anomalous points
n_estimators = 100     # Trees in the forest
sample_size = 256      # Samples used to train each tree

## Simulate
n = 100  # How many independent simulations to run
stlif_metrics = []  # one (precision, recall, F1) tuple per run for STLIF
sigma_metrics = []  # one (precision, recall, F1) tuple per run for sigma clipping

# Keyword arguments that are identical on every iteration
flare_kwargs = dict(
    flux_dist=simuFlares.rpareto,   # amplitude distribution
    xm=xm, alpha=alpha, offset=offset, upper=upper,
)
forest_kwargs = dict(contamination=contamination, n_estimators=n_estimators, sample_size=sample_size)


def _print_scores(label, scores):
    """Print `label` followed by the averaged precision, recall and F1 in `scores`."""
    mean_prec, mean_rec, mean_f1 = scores
    print(label)
    print(f"  Avg Precision: {mean_prec:.3f}")
    print(f"  Avg Recall:    {mean_rec:.3f}")
    print(f"  Avg F1 Score:  {mean_f1:.3f}")


for i in range(n):
    # Progress indicator: overwrite the previous counter with the current run number
    clear_output(wait=True)
    print(i+1)

    # Draw a fresh set of synthetic flares (time array, base half-peak width, flare count)
    flare_lightcurve, flare_times = simuFlares.kepler_flare(inds, t_half, num_flares, **flare_kwargs)

    # Add the flares to a copy so the calm light curve stays untouched between runs
    data = pdcsap.copy()
    data["pdcsap_flux"] += flare_lightcurve

    # Detector 1: STLIF; its output supplies the "anomaly" and "resid" columns used below
    data = STLIF(data, **forest_kwargs)
    prec, rec, f1 = detectFlare.event_level_scores(real_flares=flare_times, y_pred=data["anomaly"].values)
    stlif_metrics.append((prec, rec, f1))

    # Detector 2: sigma clipping (sigma=3.0, consecutive_pts=3) applied to the
    # detrended series taken from the STLIF output
    anomalies = sigma_clip(data['resid'], sigma=3.0, consecutive_pts=3).ravel()
    prec, rec, f1 = detectFlare.event_level_scores(real_flares=flare_times, y_pred=anomalies)
    sigma_metrics.append((prec, rec, f1))

## Average each metric column over all runs and report
avg_prec, avg_rec, avg_f1 = np.array(stlif_metrics).mean(axis=0)
print(f"After {n} runs:")
_print_scores("STLIF:", (avg_prec, avg_rec, avg_f1))

avg_prec, avg_rec, avg_f1 = np.array(sigma_metrics).mean(axis=0)
_print_scores("3-3sigma:", (avg_prec, avg_rec, avg_f1))

99
After 100 runs:
STLIF:
  Avg Precision: 0.980
  Avg Recall:    0.830
  Avg F1 Score:  0.886
3-3sigma:
  Avg Precision: 0.850
  Avg Recall:    0.292
  Avg F1 Score:  0.420


## Hyperparameter Tuning

In [11]:
## Isolation Forest parameters
# Expected proportion of anomalies
contamination_values = [0.001, 0.0009, 0.0008, 0.0011, 0.0012]
# Number of trees
n_estimators_values = [200]
# Number of samples used to train each tree
max_samples_values = ["auto"]

## Simulate
n_runs = 25  # Number of simulations per parameter combination (previously 10)
results = []  # One summary dict per parameter combination

# Build the parameter grid once: the Cartesian product yields
# (contamination, n_estimators, max_samples) tuples with the last list varying
# fastest, i.e. the same order as nested for-loops over the three lists.
param_grid = list(itertools.product(contamination_values, n_estimators_values, max_samples_values))
total_k = len(param_grid)  # Total parameter combinations

for k, (contamination, n_est, m_samp) in enumerate(param_grid, start=1):
    ## Simulation status: replace the previous status line with the current combination
    clear_output(wait=True)
    print("Combination: ", k, "/", total_k, " (contamination=", contamination, ", n_est=", n_est, ", m_samp=", m_samp, ")", sep="")

    ## Setup
    run_metrics = []  # (precision, recall, F1) for each run of this combination

    for run_i in range(n_runs):
        ## Simulate flares
        flare_lightcurve, flare_times = simuFlares.kepler_flare(
            inds,                           # time array
            t_half,                         # base half-peak width
            num_flares,                     # number of flares
            flux_dist=simuFlares.rpareto,   # amplitude distribution
            xm=xm, alpha=alpha, offset=offset, upper=upper
        )
        # Inject flares into a copy so the calm light curve is reused unchanged
        data = pdcsap.copy()
        data["pdcsap_flux"] += flare_lightcurve

        ## Run model: STLIF
        data = STLIF(data, contamination=contamination, n_estimators=n_est, sample_size=m_samp)

        # Calculate metrics
        prec, rec, f1 = detectFlare.event_level_scores(real_flares=flare_times, y_pred=data["anomaly"].values)
        run_metrics.append((prec, rec, f1))

    # Average performance over n_runs (column-wise mean: precision, recall, F1)
    avg_prf = np.mean(run_metrics, axis=0)
    results.append({
        "contamination": contamination,
        "n_estimators": n_est,
        "max_samples": m_samp,
        "avg_precision": avg_prf[0],
        "avg_recall":    avg_prf[1],
        "avg_f1_score":  avg_prf[2],
    })

# Sort results by F1, best first
results.sort(key=lambda x: x["avg_f1_score"], reverse=True)

Combination: 5/5 (contamination=0.0012, n_est=200, m_samp=auto)


In [10]:
# Show the leading entries of `results` (at most five), one dict per line
print("Top 5 Hyperparam Combos (by F1):")
top_k = 5
for combo in results[:top_k]:
    print(combo)

Top 5 Hyperparam Combos (by F1):
{'contamination': 0.001, 'n_estimators': 200, 'max_samples': 'auto', 'avg_precision': 1.0, 'avg_recall': 0.96, 'avg_f1_score': 0.9777777777777779}
{'contamination': 0.0011, 'n_estimators': 200, 'max_samples': 'auto', 'avg_precision': 1.0, 'avg_recall': 0.9400000000000001, 'avg_f1_score': 0.9638888888888889}
{'contamination': 0.0012, 'n_estimators': 200, 'max_samples': 'auto', 'avg_precision': 0.9833333333333334, 'avg_recall': 0.9399999999999998, 'avg_f1_score': 0.9575757575757577}
{'contamination': 0.0009, 'n_estimators': 200, 'max_samples': 'auto', 'avg_precision': 1.0, 'avg_recall': 0.9199999999999999, 'avg_f1_score': 0.9555555555555557}
{'contamination': 0.0008, 'n_estimators': 200, 'max_samples': 'auto', 'avg_precision': 1.0, 'avg_recall': 0.7999999999999999, 'avg_f1_score': 0.8888888888888891}
