# PROMPT_1D_SCAN — SEGMENTATION & EFFECT SIZE

High-priority hypotheses (from `docs/hypotheses.md`):
- **H1**: `ffpi_food` level and MoM % = `(ffpi_food / ffpi_food.shift(1) - 1) * 100` — Regime means: 2018–2019 vs 2020–2022 vs 2023–2025.
- **H2**: `ffpi_veg_oils` vs `ffpi_food` levels (corr/β) — Overall and by regime.
- **H4**: Lead–lag: `bdi_price` (lead 1–2 months) vs `ffpi_food` MoM % — 2018–2019 (pre-COVID) vs 2020–2022 vs 2023–2025.
- **H5**: Import pass-through: corr/β of `ipi_food` level vs `ffpi_food` level — Overall and by regime.
- **H6**: Local demand dampening: `rs_dairy_products` and `rs_fresh` vs `ffpi_food` (corr, slopes) — Compare high-FFPI months vs others.

The cells below load the cleaned panel (`data/clean/data_clean.parquet`), derive KPIs, compute segment summaries/comparisons, and export CSVs (`data/derived/segment_summary.csv`, `data/derived/segment_scoreboard.csv`).


In [None]:

from pathlib import Path
from typing import List, Dict, Tuple, Any

import numpy as np
import pandas as pd
from scipy import stats

pd.set_option('display.float_format', '{:,.2f}'.format)

# Candidate (parquet, csv-fallback) locations, tried in order so the same cell
# works when the working directory is the repo root, one level below it, or
# the enclosing workspace folder.
path_candidates = [
    (Path('data/clean/data_clean.parquet'), Path('data/cleaned.csv')),
    (Path('..') / 'data' / 'clean' / 'data_clean.parquet', Path('..') / 'data' / 'cleaned.csv'),
    (Path('Final Project/Final Project Repo/data/clean/data_clean.parquet'), Path('Final Project/Final Project Repo/data/cleaned.csv')),
]
# First candidate for which at least one of the two files is present.
selected = next(((p, f) for p, f in path_candidates if p.exists() or f.exists()), None)
if selected is None:
    raise FileNotFoundError('No cleaned dataset found under data/clean/ or data/.')
DATA_PATH, FALLBACK_CSV = selected
# Anchor the output folder to the data root of the candidate that was actually
# selected (the CSV sits directly under that root). Deriving it from the
# working directory instead could send exports to a different tree than the
# one the dataset was read from.
DERIVED_DIR = FALLBACK_CSV.parent / 'derived'
DERIVED_DIR.mkdir(parents=True, exist_ok=True)


In [None]:


def load_panel(path: Path, fallback: Path) -> pd.DataFrame:
    """Load the cleaned panel, preferring Parquet but falling back to CSV.

    Args:
        path: Location of the Parquet file; used when it exists and is readable.
        fallback: Location of the CSV copy, read when ``path`` is missing or
            cannot be read.

    Returns:
        The panel with ``date`` converted via ``pd.to_datetime``, rows sorted
        by ``date`` ascending, and the index reset to 0..n-1.

    Raises:
        FileNotFoundError: If neither ``path`` nor ``fallback`` exists.
        Exception: The original Parquet read error, when reading ``path``
            fails and no CSV fallback exists.
        KeyError: If the loaded table has no ``date`` column.
    """
    import warnings

    if path.exists():
        try:
            # Let pandas choose whichever Parquet engine is installed rather
            # than requiring one specific engine up front.
            df = pd.read_parquet(path)
        except Exception as exc:
            # Deliberate best-effort: any Parquet failure (missing engine,
            # corrupt file, ...) falls back to the CSV copy when one exists.
            if not fallback.exists():
                raise
            # Surface the fallback so a stale CSV is never used unnoticed.
            warnings.warn(
                f'Could not read {path} ({exc!r}); falling back to {fallback}.',
                RuntimeWarning,
                stacklevel=2,
            )
            df = pd.read_csv(fallback)
    elif fallback.exists():
        df = pd.read_csv(fallback)
    else:
        raise FileNotFoundError(f'No cleaned dataset found at {path} or {fallback}.')
    df['date'] = pd.to_datetime(df['date'])
    df = df.sort_values('date').reset_index(drop=True)
    return df


In [None]:
# Segment pairings and per-hypothesis KPI specifications
regime_pairs = [
    ('2018-2019', '2020-2022'),
    ('2020-2022', '2023-2025'),
    ('2018-2019', '2023-2025'),
]
high_ffpi_pairs = [(True, False)]


def _series_metric(kpi: str, label: str) -> Dict[str, Any]:
    """Build the spec dict for a single-column ('series') metric."""
    return {'type': 'series', 'kpi': kpi, 'label': label}


def _correlation_metric(x: str, y: str, label: str) -> Dict[str, Any]:
    """Build the spec dict for a two-column ('correlation') metric."""
    return {'type': 'correlation', 'x': x, 'y': y, 'label': label}


def _plan_entry(hypothesis_id: str, description: str, segment: str,
                pairs: List[Tuple[Any, Any]], metrics: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Bundle one hypothesis: its id, segment column, segment pairs and metrics."""
    return {
        'id': hypothesis_id,
        'description': description,
        'segment': segment,
        'pairs': pairs,
        'metrics': metrics,
    }


hypothesis_plan = [
    _plan_entry(
        'H1', 'ffpi_food level and MoM% by regime', 'regime', regime_pairs,
        [
            _series_metric('ffpi_food', 'FFPI Food level'),
            _series_metric('ffpi_food_mom_pct', 'FFPI Food MoM %'),
        ],
    ),
    _plan_entry(
        'H2', 'Veg oils vs food (levels + correlation)', 'regime', regime_pairs,
        [
            _series_metric('ffpi_veg_oils', 'Veg oils level'),
            _series_metric('ffpi_food', 'FFPI Food level'),
            _series_metric('veg_food_ratio', 'Veg/food level ratio'),
            _correlation_metric('ffpi_veg_oils', 'ffpi_food', 'Corr(veg oils, food)'),
        ],
    ),
    _plan_entry(
        'H4', 'BDI lead (1–2m) vs FFPI MoM%', 'regime', regime_pairs,
        [
            _correlation_metric('bdi_price_lead1', 'ffpi_food_mom_pct', 'Corr(BDI lead1, FFPI MoM%)'),
            _correlation_metric('bdi_price_lead2', 'ffpi_food_mom_pct', 'Corr(BDI lead2, FFPI MoM%)'),
        ],
    ),
    _plan_entry(
        'H5', 'Import pass-through: IPI food vs FFPI food', 'regime', regime_pairs,
        [
            _series_metric('ipi_food', 'IPI food level'),
            _correlation_metric('ipi_food', 'ffpi_food', 'Corr(IPI food, FFPI food)'),
        ],
    ),
    _plan_entry(
        'H6', 'Retail demand dampening vs FFPI food', 'high_ffpi_flag', high_ffpi_pairs,
        [
            _correlation_metric('rs_dairy_products', 'ffpi_food', 'Corr(RS dairy, FFPI food)'),
            _correlation_metric('rs_fresh', 'ffpi_food', 'Corr(RS fresh, FFPI food)'),
        ],
    ),
]

# Derived ratios added here to keep transforms centralized
def attach_transforms(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with a ``veg_food_ratio`` column.

    The ratio is ``ffpi_veg_oils / ffpi_food`` computed row by row; the
    input frame itself is left unmodified.
    """
    ratio = df['ffpi_veg_oils'] / df['ffpi_food']
    return df.assign(veg_food_ratio=ratio)


In [None]:
# Execute the scan
# Load the panel, then layer on derived columns. `add_derived_features` is not
# defined in this part of the notebook — presumably it creates the MoM %,
# lead, regime and high-FFPI columns that hypothesis_plan refers to; TODO confirm.
panel = add_derived_features(load_panel(DATA_PATH, FALLBACK_CSV))
panel = attach_transforms(panel)

# One frame per (hypothesis, metric) is collected in each list.
segment_summaries: List[pd.DataFrame] = []
segment_comparisons: List[pd.DataFrame] = []

for hypothesis in hypothesis_plan:
    segment = hypothesis['segment']
    pairs = hypothesis['pairs']
    for metric in hypothesis['metrics']:
        # Per-segment summary for this metric, tagged with the hypothesis id.
        summary = summarize_metric(panel, metric, segment)
        summary['hypothesis'] = hypothesis['id']
        segment_summaries.append(summary)
        # Comparisons for the configured segment pairs, tagged the same way.
        comparisons = compare_segments(panel, metric, segment, pairs)
        comparisons['hypothesis'] = hypothesis['id']
        segment_comparisons.append(comparisons)

# Stack the per-metric frames into single tables.
segment_summary_df = pd.concat(segment_summaries, ignore_index=True)
comparison_df = pd.concat(segment_comparisons, ignore_index=True)
# NOTE(review): build_scoreboard is given the list of per-metric frames, not
# the concatenated comparison_df — confirm this matches its expected input.
scoreboard_df = build_scoreboard(segment_comparisons)

# Export the summary and the scoreboard as CSVs under DERIVED_DIR.
segment_summary_path = DERIVED_DIR / 'segment_summary.csv'
scoreboard_path = DERIVED_DIR / 'segment_scoreboard.csv'
segment_summary_df.to_csv(segment_summary_path, index=False)
scoreboard_df.to_csv(scoreboard_path, index=False)

# Trailing expression: a preview of the first rows of each result table.
segment_summary_df.head(), comparison_df.head(), scoreboard_df.head()
