# 02 — Explore results and what-ifs

This exploratory notebook evaluates the trained model's behaviour, visualises distributional trends, attaches bootstrap confidence intervals to recent predictions, and applies scenario analyses using the synthetic mart rebuilt via `make ingest && make marts && make analyze`.

💡 **Refresh the marts before exploring**

```bash
make ingest && make marts && make analyze
```

This sequence regenerates the synthetic raw inputs, rebuilds `data/marts/fact_day.parquet`, and updates the scenario adjustments consumed below.

In [None]:
import copy
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

## Load marts

In [None]:
# Resolve where the marts live; both locations can be overridden through
# environment variables, otherwise they fall back to <cwd>/data/marts.
PROJECT_ROOT = Path(os.getenv('OPAL_OCEAN_PROJECT_ROOT', Path.cwd()))
MARTS_DIR = Path(os.getenv('OPAL_OCEAN_MARTS_DIR', PROJECT_ROOT.joinpath('data', 'marts')))
FACT_DAY_PATH = MARTS_DIR.joinpath('fact_day.parquet')
SCENARIO_PATH = MARTS_DIR.joinpath('what_if_scenarios.csv')

# Fail early, with the make target that produces each artefact, when an input is absent.
if not FACT_DAY_PATH.exists():
    raise FileNotFoundError(
        'Run `make ingest && make marts && make analyze` to create data/marts/fact_day.parquet.'
    )
if not SCENARIO_PATH.exists():
    raise FileNotFoundError(
        'Run `make analyze` to refresh data/marts/what_if_scenarios.csv.'
    )

# Daily fact table: parse the date column and, when present, cast beach_ok to int.
df = pd.read_parquet(FACT_DAY_PATH).assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)
if 'beach_ok' in df:
    df = df.astype({'beach_ok': int})

# Scenario table: every delta column used by the sensitivity analysis must be present.
scenarios = pd.read_csv(SCENARIO_PATH)
expected_columns = set([
    'scenario',
    'description',
    'delta_commute_minutes',
    'delta_reliability',
    'delta_pm25_mean',
    'delta_rain_24h_mm',
    'delta_steps',
    'delta_sleep_hours',
    'delta_caffeine_mg',
])
missing = expected_columns - set(scenarios.columns)
if missing:
    raise ValueError(f'Scenario file missing columns: {sorted(missing)}')

df.head()

## Daily trends

In [None]:
# Mean of the mood, commute, environment and activity columns per weekday.
# `weekday` is cast to a categorical before grouping; `observed=False` is passed
# explicitly so the result keeps one row per category (the historical default)
# and pandas does not emit a FutureWarning about the changing `observed` default.
daily_summary = (
    df.assign(weekday=df['weekday'].astype('category'))
      .groupby('weekday', observed=False)
      [['mood_1_5', 'commute_minutes', 'opal_cost', 'reliability', 'pm25_mean', 'rain_24h_mm', 'steps', 'sleep_hours', 'caffeine_mg']]
      .mean()
      .sort_index()
)
daily_summary

The averages show how commute, environment, and activity markers align with the weekday rhythm.

## Refit the pipeline for inference

In [None]:
# Column roles: the single categorical input is one-hot encoded, every other
# input is standardised. `features` keeps the categorical column first.
target = 'mood_1_5'
categorical_features = ['weekday']
numeric_features = [
    'commute_minutes',
    'opal_cost',
    'reliability',
    'pm25_mean',
    'rain_24h_mm',
    'beach_ok',
    'steps',
    'sleep_hours',
    'caffeine_mg',
]
features = categorical_features + numeric_features

X, y = df[features], df[target]

# Preprocessing and linear regression live in one pipeline so that refits and
# predictions always apply the same encoding and scaling.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', StandardScaler(), numeric_features),
    ]
)
model = Pipeline(
    steps=[
        ('prep', preprocessor),
        ('regressor', LinearRegression()),
    ]
)
model.fit(X, y)

## Confidence intervals for recent days

In [None]:
def bootstrap_interval(pipeline, features, target, samples=300, alpha=0.05, random_state=13):
    """Percentile-bootstrap intervals for the pipeline's predictions on ``features``.

    Rows are resampled with replacement ``samples`` times; a copy of the
    pipeline is refit on each resample and asked to predict every row of
    ``features``. The interval bounds are the ``alpha / 2`` and
    ``1 - alpha / 2`` percentiles of those predictions, row by row.

    Parameters
    ----------
    pipeline : estimator exposing ``fit(X, y)`` (returning itself) and ``predict(X)``
        Refit exactly once, on the full data, to produce the point predictions.
        The bootstrap refits run on a deep copy, so an interrupted or failed
        run never leaves the caller's estimator fitted to a random resample.
    features : pandas.DataFrame
        Model inputs; indexed positionally via ``.iloc``.
    target : pandas.Series
        Model target with the same number of rows as ``features``.
    samples : int, default 300
        Number of bootstrap resamples; must be at least 1.
    alpha : float, default 0.05
        Total tail mass excluded from the interval (0.05 gives a 95% interval).
    random_state : int, default 13
        Seed for ``numpy.random.default_rng``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(point, lower, upper)``, each with one entry per row of ``features``.

    Raises
    ------
    ValueError
        If ``features`` and ``target`` differ in length, or ``samples < 1``.
    """
    n_rows = len(features)
    if len(target) != n_rows:
        raise ValueError(
            f'features and target must have the same length, got {n_rows} and {len(target)}'
        )
    if samples < 1:
        raise ValueError(f'samples must be at least 1, got {samples}')

    rng = np.random.default_rng(random_state)
    # Refit a private copy so the caller's pipeline is untouched by the resampling loop.
    resample_model = copy.deepcopy(pipeline)
    preds = []
    for _ in range(samples):
        idx = rng.integers(0, n_rows, n_rows)  # row positions drawn with replacement
        resample_model.fit(features.iloc[idx], target.iloc[idx])
        preds.append(resample_model.predict(features))
    preds = np.vstack(preds)  # shape: (samples, n_rows)

    lower = np.percentile(preds, 100 * alpha / 2, axis=0)
    upper = np.percentile(preds, 100 * (1 - alpha / 2), axis=0)
    # The caller's pipeline ends up fitted on the full data, as before.
    point = pipeline.fit(features, target).predict(features)
    return point, lower, upper

# Intervals are bootstrapped over the full history; only the trailing rows
# (at most seven) are reported next to the observed mood.
window = min(7, len(df))
point, lower, upper = bootstrap_interval(model, X, y)
recent = df.tail(window).assign(
    prediction=point[-window:],
    lower_ci=lower[-window:],
    upper_ci=upper[-window:],
)
recent.loc[:, ['date', 'weekday', 'mood_1_5', 'prediction', 'lower_ci', 'upper_ci']]

## Scenario-based sensitivity analysis

In [None]:
# Baseline day: the mean of every numeric feature plus the most frequent weekday.
baseline = {
    **df[features].mean(numeric_only=True).to_dict(),
    'weekday': df['weekday'].mode().iat[0],
}
baseline_df = pd.DataFrame([baseline])
baseline_prediction = model.predict(baseline_df)[0]

# Feature -> scenario column holding the shift that is simply added to it.
# Reliability is handled separately because it is clipped to [0, 1].
additive_deltas = {
    'commute_minutes': 'delta_commute_minutes',
    'pm25_mean': 'delta_pm25_mean',
    'rain_24h_mm': 'delta_rain_24h_mm',
    'steps': 'delta_steps',
    'sleep_hours': 'delta_sleep_hours',
    'caffeine_mg': 'delta_caffeine_mg',
}

scenario_rows = []
for _, scenario in scenarios.iterrows():
    scenario_features = baseline_df.copy()
    for feature_name, delta_name in additive_deltas.items():
        scenario_features[feature_name] = scenario_features[feature_name] + scenario[delta_name]
    shifted_reliability = scenario_features['reliability'] + scenario['delta_reliability']
    scenario_features['reliability'] = shifted_reliability.clip(0.0, 1.0)

    prediction = model.predict(scenario_features)[0]
    scenario_rows.append(
        dict(
            scenario=scenario['scenario'],
            description=scenario['description'],
            predicted_mood=prediction,
            delta_vs_baseline=prediction - baseline_prediction,
        )
    )

scenario_results = pd.DataFrame(scenario_rows)
scenario_results

Scenario deltas summarise how commute, reliability, air quality, and personal routines influence predicted mood.