# 07_bayesian_network — Probabilistic Decision Support
A compact **Bayesian Network (BN)** that fuses hydrology + text flags + GP forecast to infer **spill / shortfall / high‑load** risks and a suggested **decision**.

**Inputs**
- `master_with_topics.csv` (features + flags)
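- `predictions_test.csv` (optional; from 05_modeling; provides `gpr_mean`, `gpr_std` — if it is missing or lacks these columns, a 7‑day moving‑average fallback forecast is computed from the load column)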

**Outputs**
- `bn_inference.csv` with per‑day marginal posteriors for each risk and the suggested decision


### Cell 1 — Imports & configuration

In [None]:
import pandas as pd, numpy as np
from pathlib import Path
import matplotlib.pyplot as plt

# BN library
try:
    from pgmpy.models import BayesianNetwork
    from pgmpy.estimators import ParameterEstimator, MaximumLikelihoodEstimator, K2Score, HillClimbSearch, BayesianEstimator
    from pgmpy.inference import VariableElimination
except Exception as e:
    # Best-effort import: keep the cell running so the path checks below still
    # execute, but always show the real cause. A failure here is not necessarily
    # a missing package; it can also be an installed pgmpy release that does not
    # export one of the names requested above.
    print("If pgmpy is missing, run:  %pip install pgmpy")
    print(f"pgmpy import failed: {e!r}")

# Input files, resolved relative to the current working directory.
FEATURES_PATH = "master_with_topics.csv"
PRED_PATH     = "predictions_test.csv"

# Echo the absolute locations so a wrong working directory is visible up front.
for _input_path in (FEATURES_PATH, PRED_PATH):
    print(Path(_input_path).resolve())

### Cell 2 — Load data, join forecasts, discretize

In [None]:
# Feature table, ordered chronologically.
df = pd.read_csv(FEATURES_PATH, parse_dates=['date'])
df = df.sort_values('date')

# The forecast table is optional: `pred` stays None when the file is absent.
if Path(PRED_PATH).exists():
    pred = pd.read_csv(PRED_PATH, parse_dates=['date'])
else:
    pred = None

# Detect typical columns
def first_existing(cols, columns=None):
    """Return the first name in `cols` that is an available column, else None.

    Parameters
    ----------
    cols : iterable of str
        Candidate column names, in order of preference.
    columns : collection of str, optional
        Column names to search. Defaults to the columns of the module-level
        feature table `df`, which is what existing one-argument calls rely on.

    Returns
    -------
    str or None
        The first candidate found in `columns`, or None if none is present.
    """
    # Resolve the default lazily so the function always sees the current `df`.
    available = df.columns if columns is None else columns
    return next((c for c in cols if c in available), None)

# Resolve which of the known column spellings exist in the feature table
# (each is None when no candidate is present).
COL_LOAD = first_existing(['load_MW','peak_load_mw','avg_load_mw'])
COL_RAIN = first_existing(['rainfall_mm'])
COL_Q    = first_existing(['discharge_m3s','discharge_cms'])
COL_RES  = first_existing(['reservoir_m','gauge_m'])

if pred is not None and {'date','gpr_mean','gpr_std'}.issubset(pred.columns):
    # Left join keeps every feature row; days without a forecast get NaN.
    data = df.merge(pred[['date','gpr_mean','gpr_std']], on='date', how='left')
else:
    # fallback forecast: 7d MA + residual std
    if COL_LOAD is None:
        # Without a forecast file AND without a load column there is nothing to
        # build the fallback from; fail with an explicit message instead of an
        # opaque KeyError on df[['date', None]].
        raise ValueError(
            "No usable forecast (gpr_mean/gpr_std) and no load column found; "
            "cannot build the fallback forecast."
        )
    tmp = df[['date', COL_LOAD]].set_index('date').sort_index().asfreq('D')
    gpr_mean = tmp[COL_LOAD].rolling(7, min_periods=3).mean()
    # Rolling std of the residuals; early windows fall back to the overall std of the MA.
    gpr_std  = (tmp[COL_LOAD] - gpr_mean).rolling(30, min_periods=7).std().fillna(gpr_mean.std())
    fb = pd.DataFrame({'date': gpr_mean.index, 'gpr_mean': gpr_mean.values, 'gpr_std': gpr_std.values})
    data = df.merge(fb, on='date', how='left')

data = data.sort_values('date').reset_index(drop=True)

# Capacity/min bound for derived binary nodes
cap_mw = data[COL_LOAD].quantile(0.95) if COL_LOAD else data['gpr_mean'].quantile(0.95)
min_mw = data[COL_LOAD].quantile(0.10) if COL_LOAD else data['gpr_mean'].quantile(0.10)

# Derived probabilities from GP (using Normal tail)
from scipy.stats import norm
# A zero std would divide by zero; turn it into NaN so the probability is NaN too.
safe_std = data['gpr_std'].replace(0, np.nan)
# norm.cdf returns a plain NumPy array, so wrap the results back into Series
# aligned with `data`; later code calls .fillna() on them.
prob_highload  = pd.Series(np.asarray(1 - norm.cdf((cap_mw - data['gpr_mean'])/safe_std)), index=data.index)
prob_shortfall = pd.Series(np.asarray(1 - norm.cdf((data['gpr_mean'] - min_mw)/safe_std)), index=data.index)

# Discretize into categorical states {0,1}
def binarize_prob(p, thr=0.5):
    """Map probabilities to 0/1 integers: 1 where `p >= thr`, otherwise 0."""
    at_or_above = p >= thr
    return at_or_above.astype(int)

def z_monthly(s):
    """Standardise `s` within each calendar month of `data['date']`.

    Each value has its month's mean subtracted and is divided by its month's
    standard deviation, with months taken from the module-level `data` frame.
    """
    month = data['date'].dt.month
    month_mean = s.groupby(month).transform('mean')
    month_std = s.groupby(month).transform('std')
    deviation = s - month_mean
    return deviation / month_std

def _series_or_zeros(name):
    """Return `data[name]` if that column exists, else an all-zero Series aligned to `data`."""
    if name is not None and name in data.columns:
        return data[name]
    # A full-length fallback keeps the index aligned; a length-1 default would
    # leave NaN in every other row and dropna() below would discard them.
    return pd.Series(0, index=data.index)

def _monthly_high(col, z_thr=0.7):
    """1 where the month-standardised value of `col` exceeds `z_thr`; all 0 if the column is absent."""
    if col is None or col not in data.columns:
        return pd.Series(0, index=data.index, dtype=int)
    return (z_monthly(data[col]) > z_thr).astype(int)

def _int_flag(name):
    """Integer flag column; missing column or missing values count as 0 (not raised)."""
    return _series_or_zeros(name).fillna(0).astype(int)

data['RainHigh']      = _monthly_high(COL_RAIN)
data['DischargeHigh'] = _monthly_high(COL_Q)
data['ReservoirHigh'] = _monthly_high(COL_RES)
data['FloodFlag']     = _int_flag('flood_flag')
data['WeatherFlag']   = _int_flag('weather_flag')
# Maintenance OR outage: sum the two flags and clip back to {0,1}.
data['MaintFlag']     = (_series_or_zeros('maintenance_flag').fillna(0)
                         + _series_or_zeros('outage_flag').fillna(0)).clip(0,1).astype(int)

# The tail probabilities may arrive as NumPy arrays (scipy returns arrays), so
# re-wrap them as Series before fillna; days without a forecast count as 0.
data['GP_HighLoad']   = binarize_prob(pd.Series(np.asarray(prob_highload), index=data.index).fillna(0.0), 0.5)
data['GP_Shortfall']  = binarize_prob(pd.Series(np.asarray(prob_shortfall), index=data.index).fillna(0.0), 0.5)

# Targets we want to infer (latent risks as categories)
# NOTE(review): these targets are deterministic functions of their parent
# columns, so the learned CPDs will mirror these rules — confirm this is intended.
data['RiskSpill']      = ((data['RainHigh']|data['DischargeHigh']|data['FloodFlag']|data['WeatherFlag'])>0).astype(int)
data['RiskShortfall']  = data['GP_Shortfall']
data['RiskHighLoad']   = data['GP_HighLoad']

keep_cols = ['date','RainHigh','DischargeHigh','ReservoirHigh','FloodFlag','WeatherFlag','MaintFlag',
             'GP_HighLoad','GP_Shortfall','RiskSpill','RiskShortfall','RiskHighLoad']
bn_df = data[keep_cols].dropna().reset_index(drop=True)
bn_df.head()

### Cell 3 — Define Bayesian Network structure

In [None]:
# Structure (expert prior): every edge points from an observed indicator to a risk node.
# Rain/Discharge/Weather/Flood → RiskSpill
# GP_* → corresponding risks
# MaintFlag → RiskHighLoad and RiskShortfall
# NOTE: the graph has no Decision node; the decision is derived afterwards from the
# risk posteriors by decide(). ReservoirHigh is computed earlier but has no edge here.
# The edge order below also fixes the node insertion order of the graph.
edges = [
    ('RainHigh','RiskSpill'),
    ('DischargeHigh','RiskSpill'),
    ('WeatherFlag','RiskSpill'),
    ('FloodFlag','RiskSpill'),
    ('GP_Shortfall','RiskShortfall'),
    ('GP_HighLoad','RiskHighLoad'),
    ('MaintFlag','RiskHighLoad'),
    ('MaintFlag','RiskShortfall'),
]

model = BayesianNetwork(edges)
model

### Cell 4 — Learn CPDs (Maximum Likelihood)

In [None]:
# All variables are binary 0/1 here
# Fit CPDs via MLE (you can switch to BayesianEstimator with Dirichlet priors if data is sparse)

# Train only on the columns that are nodes of the graph (drops 'date' and any
# derived column that has no edge).
train_df = bn_df[list(model.nodes())]

# Declare both states for every node. If a column happens to be constant in the
# data, the estimator would otherwise learn a single-state CPD and reading
# index 1 of a posterior later would fail. Any extra observed value is kept.
state_names = {
    col: sorted({0, 1} | {int(v) for v in train_df[col].unique()})
    for col in train_df.columns
}

model.fit(train_df, estimator=MaximumLikelihoodEstimator, state_names=state_names)
print("CPDs learned for nodes:", [cpd.variable for cpd in model.get_cpds()])
for cpd in model.get_cpds()[:4]:
    print(cpd)

### Cell 5 — Inference: compute daily marginals

In [None]:
infer = VariableElimination(model)

# Evidence per-day: treat observed parents as evidence, infer posteriors for risks
EVIDENCE_VARS = ['RainHigh', 'DischargeHigh', 'WeatherFlag', 'FloodFlag',
                 'GP_Shortfall', 'GP_HighLoad', 'MaintFlag']
RISK_VARS = ['RiskSpill', 'RiskShortfall', 'RiskHighLoad']
OUT_COLUMNS = ['date', 'P(RiskSpill=1)', 'P(RiskShortfall=1)', 'P(RiskHighLoad=1)']

def _prob_state_one(factor, var):
    """Return P(var=1) from a single-variable query result.

    The position of state 1 is looked up by name instead of assuming index 1;
    if the factor has no state 1 at all the probability is reported as 0.0.
    """
    states = list(factor.state_names[var])
    return float(factor.values[states.index(1)]) if 1 in states else 0.0

# The evidence is seven binary flags, so many days share the same combination.
# Run variable elimination once per distinct combination and reuse the result.
_posterior_cache = {}
out_rows = []
evidence_rows = bn_df[EVIDENCE_VARS].itertuples(index=False, name=None)
for day, raw_values in zip(bn_df['date'], evidence_rows):
    key = tuple(int(v) for v in raw_values)
    if key not in _posterior_cache:
        evidence = dict(zip(EVIDENCE_VARS, key))
        _posterior_cache[key] = {
            risk: _prob_state_one(
                infer.query([risk], evidence=evidence, show_progress=False), risk
            )
            for risk in RISK_VARS
        }
    post = _posterior_cache[key]

    out_rows.append({
        'date': day,
        'P(RiskSpill=1)': post['RiskSpill'],
        'P(RiskShortfall=1)': post['RiskShortfall'],
        'P(RiskHighLoad=1)': post['RiskHighLoad'],
    })

# Fix the column set so an empty input still yields the expected columns.
bn_post = pd.DataFrame(out_rows, columns=OUT_COLUMNS)
bn_post.head()

### Cell 6 — Map BN posteriors → decision

In [None]:
THR_HIGH, THR_MED = 0.6, 0.35

def decide(p_spill, p_short, p_high):
    """Turn the three risk posteriors into one action label.

    The high tier (THR_HIGH) is checked first, then the medium tier (THR_MED);
    within a tier the order is spill, shortfall, high load. The first risk
    that reaches the tier's threshold wins; otherwise 'Normal ops'.
    """
    risks = (p_spill, p_short, p_high)
    # Built per call so the thresholds are read at call time.
    tiers = (
        (THR_HIGH, ('Spill advisory',
                    'Shortfall mitigation (imports/DR)',
                    'Peak support readiness')),
        # medium levels
        (THR_MED, ('Monitor spill readiness',
                   'Monitor shortfall',
                   'Monitor peak load')),
    )
    for threshold, actions in tiers:
        for prob, action in zip(risks, actions):
            if prob >= threshold:
                return action
    return 'Normal ops'

# One decision label per day, from that day's three posteriors.
_risk_columns = ['P(RiskSpill=1)','P(RiskShortfall=1)','P(RiskHighLoad=1)']
_labels = []
for _probs in bn_post[_risk_columns].values:
    _labels.append(decide(*_probs))
bn_post['decision'] = _labels

# Persist posteriors plus decision, then preview the first rows.
bn_post.to_csv('bn_inference.csv', index=False)
print("Saved bn_inference.csv | rows:", len(bn_post))
bn_post.head(10)

### Cell 7 — Visualize posteriors

In [None]:
plt.figure(figsize=(12,4))
for _column, _label in (
    ('P(RiskSpill=1)', 'P(Spill)'),
    ('P(RiskShortfall=1)', 'P(Shortfall)'),
    ('P(RiskHighLoad=1)', 'P(HighLoad)'),
):
    plt.plot(bn_post['date'], bn_post[_column], label=_label)

# Draw the reference lines from the decision thresholds themselves so the plot
# cannot drift from the rule used by decide() (dashed = high, dotted = medium).
plt.axhline(THR_HIGH, color='k', ls='--', alpha=0.4)
plt.axhline(THR_MED, color='k', ls=':', alpha=0.4)
plt.legend()
plt.title('BN posterior risks')
plt.tight_layout()
plt.show()