# EDA: Light Curve Events Results

Exploratory data analysis of `lc_events_results_12_12.5.csv`, the output of the malca `events.py` pipeline.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Notebook-wide plot styling: matplotlib's 'seaborn-v0_8-darkgrid' style sheet
# and seaborn's 'husl' colour palette.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
# IPython magic: render figures inline in the notebook output.
%matplotlib inline

## 1. Load Data

In [None]:
# Locate the pipeline output under the current user's home directory so the
# same cell works on Linux (/home/<user>) and macOS (/Users/<user>).
# os.name cannot be used to tell the two apart: it is 'posix' on both, so a
# comparison against "Linux"/"Darwin" never matches and `df` is never created.
output_dir = Path.home() / 'code' / 'malca' / 'output'
results_path = output_dir / 'lc_events_results_12_12.5.csv'

# One row per light curve; every later cell reads from `df`.
df = pd.read_csv(results_path)

print(f"Loaded {len(df)} light curves")
df.head()

In [None]:
# Column names, dtypes and non-null counts of the loaded table.
df.info()

## 2. Basic Statistics

In [None]:
# Headline counts: light curves flagged as a dip, as a jump, as either,
# and as both, with dip/jump also shown as a percentage of all rows.
print("=== Detection Summary ===")
n_total = len(df)
print(f"Total light curves: {n_total}")

is_dip = df['dip_significant']
n_dip_flagged = is_dip.sum()
print(f"Dip detections: {n_dip_flagged} ({n_dip_flagged/n_total*100:.2f}%)")

is_jump = df['jump_significant']
n_jump_flagged = is_jump.sum()
print(f"Jump detections: {n_jump_flagged} ({n_jump_flagged/n_total*100:.2f}%)")

print(f"Either dip or jump: {(is_dip | is_jump).sum()}")
print(f"Both dip and jump: {(is_dip & is_jump).sum()}")

In [None]:
# pandas' default summary statistics for the table's columns.
df.describe()

## 3. Light Curve Quality Metrics

In [None]:
def _hist_with_median(ax, values, bins, xlabel, title, median_label):
    """Histogram `values` on `ax` and mark their median with a dashed red line.

    `median_label` is a str.format template that receives the median, e.g.
    'Median: {:.0f}'; the result is used as the legend entry.
    """
    ax.hist(values, bins=bins, edgecolor='black')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Count')
    ax.set_title(title)
    median = values.median()
    ax.axvline(median, color='red', linestyle='--', label=median_label.format(median))
    ax.legend()


fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# Top row: one histogram per sampling property.
_hist_with_median(
    axes[0, 0], df['n_points'], 50,
    'Number of Points', 'Distribution of Light Curve Length', 'Median: {:.0f}',
)

# Difference between the last and first JD columns of each light curve.
time_span = df['jd_last'] - df['jd_first']
_hist_with_median(
    axes[0, 1], time_span, 50,
    'Time Span (days)', 'Distribution of Observing Baseline', 'Median: {:.0f} days',
)

_hist_with_median(
    axes[0, 2], df['cadence_median_days'], 50,
    'Median Cadence (days)', 'Distribution of Observing Cadence', 'Median: {:.2f} days',
)

# Bottom-left: camera counts, binned with integer edges from 0 to max + 1.
camera_bins = range(0, int(df['n_cameras'].max())+2)
_hist_with_median(
    axes[1, 0], df['n_cameras'], camera_bins,
    'Number of Cameras', 'Distribution of Camera Count', 'Median: {:.0f}',
)

# Remaining bottom panels: pairwise scatter plots.
scatter_style = {'alpha': 0.3, 's': 10}

ax_span = axes[1, 1]
ax_span.scatter(df['n_points'], time_span, **scatter_style)
ax_span.set_xlabel('Number of Points')
ax_span.set_ylabel('Time Span (days)')
ax_span.set_title('Points vs Time Span')

ax_cams = axes[1, 2]
ax_cams.scatter(df['n_cameras'], df['n_points'], **scatter_style)
ax_cams.set_xlabel('Number of Cameras')
ax_cams.set_ylabel('Number of Points')
ax_cams.set_title('Cameras vs Points')

plt.tight_layout()
plt.show()

## 4. Detection Metrics

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
(ax_bf, ax_prob), (ax_runs, ax_run_pts) = axes

# Each panel overlays the dip and the jump version of the same statistic.
event_kinds = (('dip', 'Dip'), ('jump', 'Jump'))
overlay_style = {'alpha': 0.6, 'edgecolor': 'black'}

# Bayes factors on a log10 axis; exact zeros are dropped before the log.
for prefix, legend_name in event_kinds:
    nonzero_bf = df[f'{prefix}_bayes_factor'].replace(0, np.nan).dropna()
    ax_bf.hist(np.log10(nonzero_bf), bins=50, label=legend_name, **overlay_style)
ax_bf.set_xlabel('log10(Bayes Factor)')
ax_bf.set_ylabel('Count')
ax_bf.set_title('Bayes Factor Distribution')
ax_bf.axvline(np.log10(10), color='red', linestyle='--', label='BF=10 threshold')
ax_bf.legend()

# Maximum event probability, with a reference line at 0.5.
for prefix, legend_name in event_kinds:
    ax_prob.hist(df[f'{prefix}_max_event_prob'], bins=50, label=legend_name, **overlay_style)
ax_prob.set_xlabel('Max Event Probability')
ax_prob.set_ylabel('Count')
ax_prob.set_title('Event Probability Distribution')
ax_prob.axvline(0.5, color='red', linestyle='--', label='p=0.5 threshold')
ax_prob.legend()

# Run counts use integer bin edges from 0 to that column's max + 1.
for prefix, legend_name in event_kinds:
    run_counts = df[f'{prefix}_run_count']
    integer_bins = range(0, int(run_counts.max())+2)
    ax_runs.hist(run_counts, bins=integer_bins, label=legend_name, **overlay_style)
ax_runs.set_xlabel('Run Count')
ax_runs.set_ylabel('Count')
ax_runs.set_title('Number of Runs per Light Curve')
ax_runs.legend()

# Number of points in the longest run.
for prefix, legend_name in event_kinds:
    ax_run_pts.hist(df[f'{prefix}_max_run_points'], bins=50, label=legend_name, **overlay_style)
ax_run_pts.set_xlabel('Max Run Points')
ax_run_pts.set_ylabel('Count')
ax_run_pts.set_title('Points in Longest Run')
ax_run_pts.legend()

plt.tight_layout()
plt.show()

## 5. Morphology Analysis

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Count the best-fit morphology labels per event type, leaving out rows
# whose label is the string 'none'.
dip_morphs = df.loc[df['dip_best_morph'] != 'none', 'dip_best_morph'].value_counts()
jump_morphs = df.loc[df['jump_best_morph'] != 'none', 'jump_best_morph'].value_counts()

# One bar chart per event type; the title carries the number of rows counted.
for ax, morph_counts, kind in zip(axes, (dip_morphs, jump_morphs), ('Dip', 'Jump')):
    bar_positions = range(len(morph_counts))
    ax.bar(bar_positions, morph_counts.values, tick_label=morph_counts.index)
    ax.set_ylabel('Count')
    ax.set_title(f'{kind} Morphology (n={morph_counts.sum()})')
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Rows that have a morphology label other than the string 'none'.
dip_with_morph = df.loc[df['dip_best_morph'] != 'none']
jump_with_morph = df.loc[df['jump_best_morph'] != 'none']

panels = (
    (axes[0], dip_with_morph, 'dip_best_delta_bic', 'Dip'),
    (axes[1], jump_with_morph, 'jump_best_delta_bic', 'Jump'),
)

# Delta-BIC histogram per event type with a reference line at 10.
for ax, subset, bic_col, kind in panels:
    ax.hist(subset[bic_col], bins=50, edgecolor='black')
    ax.set_xlabel('Delta BIC')
    ax.set_ylabel('Count')
    ax.set_title(f'{kind} Delta BIC (n={len(subset)})')
    ax.axvline(10, color='red', linestyle='--', label='BIC=10 threshold')
    ax.legend()

plt.tight_layout()
plt.show()

## 6. Dipper Score Analysis

In [None]:
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
ax_score, ax_n_dips, ax_valid = axes

# Keep only finite scores (drops -inf, inf and NaN) before histogramming.
finite_mask = np.isfinite(df['dipper_score'])
clean_score = df.loc[finite_mask, 'dipper_score']
score_median = clean_score.median()

ax_score.hist(clean_score, bins=50, edgecolor='black')
ax_score.set_xlabel('Dipper Score')
ax_score.set_ylabel('Count')
ax_score.set_title('Dipper Score Distribution')
ax_score.axvline(score_median, color='red', linestyle='--', label=f'Median: {score_median:.2f}')
ax_score.legend()

# Dip-count panels share a layout: integer bin edges from 0 to max + 1.
count_panels = (
    (ax_n_dips, 'dipper_n_dips', 'Number of Dips', 'Dipper: Number of Dips'),
    (ax_valid, 'dipper_n_valid_dips', 'Number of Valid Dips', 'Dipper: Valid Dips'),
)
for ax, count_col, xlabel, title in count_panels:
    dip_counts = df[count_col]
    ax.hist(dip_counts, bins=range(0, int(dip_counts.max())+2), edgecolor='black')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Count')
    ax.set_title(title)

plt.tight_layout()
plt.show()

## 7. Correlation Analysis

In [None]:
# Metrics included in the pairwise correlation matrix.
numeric_cols = [
    'n_points', 'cadence_median_days', 'n_cameras',
    'dip_bayes_factor', 'jump_bayes_factor',
    'dip_max_event_prob', 'jump_max_event_prob',
    'dip_run_count', 'jump_run_count',
    'dip_max_run_points', 'jump_max_run_points',
    'dipper_score', 'dipper_n_dips'
]

# dipper_score contains non-finite values (the dipper-score cell filters out
# -inf/inf before plotting). DataFrame.corr only skips NaN, so a single inf
# would turn every correlation involving that column into NaN. Map +/-inf to
# NaN first so those rows are excluded pairwise instead.
finite_metrics = df[numeric_cols].replace([np.inf, -np.inf], np.nan)
corr = finite_metrics.corr()

plt.figure(figsize=(12, 10))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', center=0, square=True)
plt.title('Correlation Matrix of Key Metrics')
plt.tight_layout()
plt.show()

## 8. Filter Performance Analysis

In [None]:
def _print_threshold_row(row_label, dip_pass, jump_pass):
    """Print one table row: dip / jump / either pass counts with percentages.

    `dip_pass` and `jump_pass` are boolean Series over `df`; percentages are
    relative to len(df).
    """
    n_rows = len(df)
    tallies = (
        ('dip', dip_pass.sum()),
        ('jump', jump_pass.sum()),
        ('either', (dip_pass | jump_pass).sum()),
    )
    cells = ", ".join(
        f"{name}={count:4d} ({count/n_rows*100:5.2f}%)" for name, count in tallies
    )
    print(f"{row_label}: {cells}")


print("=== Filter Thresholds Analysis ===")
print()

# How many light curves exceed each Bayes-factor cut.
bf_thresholds = [1, 3, 10, 30, 100]
print("Bayes Factor Thresholds:")
for bf_cut in bf_thresholds:
    _print_threshold_row(
        f"  BF > {bf_cut:3d}",
        df['dip_bayes_factor'] > bf_cut,
        df['jump_bayes_factor'] > bf_cut,
    )

print()
# Same table for cuts on the maximum event probability.
prob_thresholds = [0.5, 0.7, 0.9, 0.95, 0.99]
print("Event Probability Thresholds:")
for prob_cut in prob_thresholds:
    _print_threshold_row(
        f"  P > {prob_cut:.2f}",
        df['dip_max_event_prob'] > prob_cut,
        df['jump_max_event_prob'] > prob_cut,
    )

print()
# And for a minimum number of runs (inclusive).
print("Run Count Requirements:")
for min_runs in (1, 2, 3):
    _print_threshold_row(
        f"  Runs >= {min_runs}",
        df['dip_run_count'] >= min_runs,
        df['jump_run_count'] >= min_runs,
    )

## 9. Combined Filter Analysis

In [None]:
print("=== Combined Filtering Scenarios ===")
print()

# Individual cuts; each passes when either the dip or the jump statistic does.
mask_bf10 = (df['dip_bayes_factor'] > 10) | (df['jump_bayes_factor'] > 10)
mask_prob50 = (df['dip_max_event_prob'] > 0.5) | (df['jump_max_event_prob'] > 0.5)
mask_runs1 = (df['dip_run_count'] >= 1) | (df['jump_run_count'] >= 1)
mask_runpoints2 = (df['dip_max_run_points'] >= 2) | (df['jump_max_run_points'] >= 2)
mask_cameras2 = (df['dip_max_run_cameras'] >= 2) | (df['jump_max_run_cameras'] >= 2)

print(f"Starting: {len(df)} light curves")
print()

n = len(df)

# Apply the cuts one after another: every scenario is the AND of its own cut
# with all the cuts listed before it.
filter_stages = [
    ("BF > 10", mask_bf10),
    ("+ Event prob > 0.5", mask_prob50),
    ("+ Run count >= 1", mask_runs1),
    ("+ Run points >= 2", mask_runpoints2),
    ("+ Run cameras >= 2", mask_cameras2),
]
scenarios = []
running_mask = None
for stage_label, stage_mask in filter_stages:
    running_mask = stage_mask if running_mask is None else running_mask & stage_mask
    scenarios.append((stage_label, running_mask))

for label, mask in scenarios:
    n_pass = mask.sum()
    print(f"{label:25s}: {n_pass:5d} ({n_pass/n*100:5.2f}%)")

## 10. Top Candidates

In [None]:
# Add per-row maxima over the dip and jump variants of each statistic.
# These columns are written onto `df` itself, so later cells can read them.
bayes_factor_cols = ['dip_bayes_factor', 'jump_bayes_factor']
event_prob_cols = ['dip_max_event_prob', 'jump_max_event_prob']
df['max_bayes_factor'] = df[bayes_factor_cols].max(axis='columns')
df['max_event_prob'] = df[event_prob_cols].max(axis='columns')

# Columns shown for each candidate in the printed table.
report_cols = [
    'path', 'dip_significant', 'jump_significant',
    'dip_bayes_factor', 'jump_bayes_factor',
    'dip_max_event_prob', 'jump_max_event_prob',
    'dip_run_count', 'jump_run_count',
    'n_points', 'n_cameras',
]

# The 20 rows with the largest combined Bayes factor.
top_by_bf = df.nlargest(n=20, columns='max_bayes_factor').loc[:, report_cols]

print("=== Top 20 Candidates by Bayes Factor ===")
print(top_by_bf.to_string())

## 11. Baseline Source Analysis

In [None]:
# Frequency of each value in the baseline_source column.
baseline_counts = df['baseline_source'].value_counts()

plt.figure(figsize=(8, 6))
bar_positions = range(len(baseline_counts))
plt.bar(bar_positions, baseline_counts.values, tick_label=baseline_counts.index)
plt.ylabel('Count')
plt.title('Baseline Source Distribution')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# Same counts as text, each with its share of all light curves.
n_light_curves = len(df)
print("\nBaseline Source Counts:")
for source, count in baseline_counts.items():
    share = count/n_light_curves*100
    print(f"  {source}: {count} ({share:.2f}%)")

## 12. Camera Coverage Analysis

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
ax_points, ax_strength = axes
marker_style = {'alpha': 0.3, 's': 10}

# Left: number of points against number of cameras.
ax_points.scatter(df['n_cameras'], df['n_points'], **marker_style)
ax_points.set(
    xlabel='Number of Cameras',
    ylabel='Number of Points',
    title='Camera Count vs Light Curve Points',
)

# Right: combined Bayes factor against number of cameras, on a log y-axis.
# df['max_bayes_factor'] is not created in this cell; it must already exist.
ax_strength.scatter(df['n_cameras'], df['max_bayes_factor'], **marker_style)
ax_strength.set(
    xlabel='Number of Cameras',
    ylabel='Max Bayes Factor',
    yscale='log',
    title='Camera Count vs Detection Strength',
)

plt.tight_layout()
plt.show()

## 13. Summary Statistics by Detection Type

In [None]:
def _describe_detections(kind, heading):
    """Print `heading`, then describe() for rows where `<kind>_significant` is set.

    `kind` is 'dip' or 'jump'. When no row is flagged, a "No <kind> detections"
    line is printed instead. Returns the flagged subset of `df`.
    """
    print(heading)
    flagged = df[df[f'{kind}_significant']]
    if len(flagged) == 0:
        print(f"No {kind} detections")
        return flagged
    summary_cols = [
        'n_points', 'n_cameras', f'{kind}_bayes_factor',
        f'{kind}_max_event_prob', f'{kind}_run_count', f'{kind}_max_run_points',
    ]
    print(flagged[summary_cols].describe())
    return flagged


dip_detections = _describe_detections('dip', "=== Dip Detections ===")
jump_detections = _describe_detections('jump', "\n=== Jump Detections ===")

## 14. Export Filtered Results

In [None]:
    strong_candidates = df[
    ((df['dip_bayes_factor'] > 10) | (df['jump_bayes_factor'] > 10)) &
    ((df['dip_max_event_prob'] > 0.5) | (df['jump_max_event_prob'] > 0.5)) &
    ((df['dip_run_count'] >= 1) | (df['jump_run_count'] >= 1))
]

print(f"Found {len(strong_candidates)} strong candidates")
print(f"({len(strong_candidates)/len(df)*100:.2f}% of total)")

# add platform options again

if platform.system() == 'Linux':
    output_path = '/home/calder/code/malca/output/strong_candidates_12_12.5.csv'
elif platform.system() == 'Darwin':
    output_path = '/Users/calder/code/malca/output/strong_candidates_12_12.5.csv'

strong_candidates.to_csv(output_path, index=False)
print(f"\nSaved to: {output_path}")