# PROMPT_1D_SCAN — SEGMENTATION & EFFECT SIZE

High-priority hypotheses (from `docs/hypotheses.md`):
- **H1**: `ffpi_food` level and MoM % = `(ffpi_food / ffpi_food.shift(1) - 1) * 100` — Regime means: 2018–2019 vs 2020–2022 vs 2023–2025.
- **H2**: `ffpi_veg_oils` vs `ffpi_food` levels (corr/β) — Overall and by regime.
- **H4**: Lead–lag: `bdi_price` (lead 1–2 months) vs `ffpi_food` MoM % — 2018–2019 (pre-COVID) vs 2020–2022 vs 2023–2025.
- **H5**: Import pass-through: corr/β of `ipi_food` level vs `ffpi_food` level — Overall and by regime.
- **H6**: Local demand dampening: `rs_dairy_products` and `rs_fresh` vs `ffpi_food` (corr, slopes) — Compare high-FFPI months vs others.

The cells below load the cleaned panel (`data/clean/data_clean.parquet`), derive KPIs, compute segment summaries/comparisons, and export CSVs (`data/derived/segment_summary.csv`, `data/derived/segment_scoreboard.csv`).


In [None]:

from pathlib import Path
from typing import List, Dict, Tuple, Any

import numpy as np
import pandas as pd
from scipy import stats

# Render floats with thousands separators and two decimals in displayed frames.
pd.set_option('display.float_format', '{:,.2f}'.format)

# Resolve cleaned dataset paths for both notebook and repo-root execution.
# Each candidate is a (parquet, csv-fallback) pair; the first pair for which
# at least one of the two files exists is used.
path_candidates = [
    (Path('data/clean/data_clean.parquet'), Path('data/cleaned.csv')),
    (Path('..') / 'data' / 'clean' / 'data_clean.parquet', Path('..') / 'data' / 'cleaned.csv'),
    (Path('Final Project/Final Project Repo/data/clean/data_clean.parquet'), Path('Final Project/Final Project Repo/data/cleaned.csv')),
]
selected = next(((p, f) for p, f in path_candidates if p.exists() or f.exists()), None)
if selected is None:
    raise FileNotFoundError('No cleaned dataset found under data/clean/ or data/.')
DATA_PATH, FALLBACK_CSV = selected

# Anchor the export folder to the data directory that was actually selected.
# In every candidate pair the CSV fallback sits directly inside that `data`
# directory, so its parent is the data root. Guessing from whether ./data
# exists could place exports outside the repository when the dataset was
# found under a different candidate.
DERIVED_DIR = FALLBACK_CSV.parent / 'derived'
DERIVED_DIR.mkdir(parents=True, exist_ok=True)


In [None]:
def load_panel(path: Path, fallback: Path) -> pd.DataFrame:
    """Read the cleaned panel and return it ordered by date.

    The Parquet file at ``path`` is used when it exists; otherwise the CSV at
    ``fallback`` is read. The ``date`` column is parsed to datetimes, rows are
    sorted by it, and the index is reset to 0..n-1.

    Raises:
        FileNotFoundError: if neither ``path`` nor ``fallback`` exists.
    """
    if path.exists():
        raw = pd.read_parquet(path)
    else:
        if not fallback.exists():
            raise FileNotFoundError(f'No cleaned dataset found at {path} or {fallback}.')
        raw = pd.read_csv(fallback)

    raw['date'] = pd.to_datetime(raw['date'])
    ordered = raw.sort_values('date')
    return ordered.reset_index(drop=True)

def add_derived_features(df: pd.DataFrame) -> pd.DataFrame:
    """Create KPIs and segmentation helpers used in the high-priority hypotheses.

    Works on a copy, so ``df`` is not modified. Reads the ``date``,
    ``ffpi_food`` and ``bdi_price`` columns and adds:

    - ``ffpi_food_mom_pct`` / ``ffpi_food_yoy_pct``: percent change of
      ``ffpi_food`` over 1 and 12 rows (assumes one row per month, sorted by
      date — TODO confirm against the cleaned panel).
    - ``bdi_price_lead1`` / ``bdi_price_lead2``: ``bdi_price`` taken from 1 and
      2 rows later.
    - ``regime``: categorical year bucket (2018-2019, 2020-2022, 2023-2025).
    - ``high_ffpi_flag``: True where ``ffpi_food`` is at or above its
      75th percentile over the whole frame.

    Returns:
        A new DataFrame with the columns above appended.
    """
    enriched = df.copy()

    # fill_method=None keeps gaps as gaps. The legacy default forward-filled
    # missing values first, which reports a spurious 0% change for a missing
    # month and compares the following month against a stale value.
    enriched['ffpi_food_mom_pct'] = enriched['ffpi_food'].pct_change(fill_method=None) * 100
    enriched['ffpi_food_yoy_pct'] = enriched['ffpi_food'].pct_change(12, fill_method=None) * 100

    # NOTE(review): shift(-k) aligns each row with the BDI value from k rows
    # *later*. If H4 means "earlier BDI moves precede FFPI changes", the
    # pairing would need shift(+k) instead — confirm the intended direction.
    enriched['bdi_price_lead1'] = enriched['bdi_price'].shift(-1)
    enriched['bdi_price_lead2'] = enriched['bdi_price'].shift(-2)

    # Regime buckets for pre-/during-/post-stress comparisons.
    # Bins are right-closed: (2017, 2019], (2019, 2022], (2022, 2025].
    # Years outside that range get a missing regime.
    enriched['regime'] = pd.cut(
        enriched['date'].dt.year,
        bins=[2017, 2019, 2022, 2025],
        labels=['2018-2019', '2020-2022', '2023-2025']
    )
    # Rows with a missing ffpi_food compare as False here.
    enriched['high_ffpi_flag'] = enriched['ffpi_food'] >= enriched['ffpi_food'].quantile(0.75)
    return enriched

def correlation_metric(data: pd.DataFrame, x: str, y: str) -> Tuple[float, int]:
    """Pearson correlation between columns ``x`` and ``y`` on complete rows.

    Rows where either column is missing are dropped first.

    Returns:
        ``(correlation, n)`` where ``n`` is the number of complete rows.
        The correlation is NaN when fewer than two complete rows remain.
    """
    complete = data[[x, y]].dropna()
    n_obs = len(complete)
    if n_obs >= 2:
        return complete[x].corr(complete[y]), n_obs
    return np.nan, n_obs

def welch_t_pvalue(a: pd.Series, b: pd.Series) -> float:
    """Two-sample Welch t-test p-value (variances not assumed equal).

    Missing values are dropped from each sample first. Returns NaN when either
    sample has fewer than two observations left.
    """
    samples = (a.dropna(), b.dropna())
    if min(len(sample) for sample in samples) < 2:
        return np.nan
    return stats.ttest_ind(*samples, equal_var=False).pvalue

def summarize_metric(df: pd.DataFrame, metric: Dict[str, Any], segment_col: str) -> pd.DataFrame:
    """Summarize one KPI for every observed segment value plus an 'Overall' row.

    ``metric['type']`` selects the computation:

    - ``'series'``: mean, median and non-missing count of column ``metric['kpi']``.
    - ``'correlation'``: correlation of ``metric['x']`` with ``metric['y']``,
      reported in both the ``mean`` and ``median`` columns, with the usable
      sample size as ``N``.

    Any other type contributes no rows. Segment values appear in order of
    first occurrence (missing values excluded), followed by 'Overall'.
    """
    observed = list(df[segment_col].dropna().unique())
    records = []
    for label in [*observed, 'Overall']:
        subset = df if label == 'Overall' else df[df[segment_col] == label]
        kind = metric['type']
        if kind == 'series':
            values = subset[metric['kpi']].dropna()
            center, middle, count = values.mean(), values.median(), len(values)
        elif kind == 'correlation':
            corr, count = correlation_metric(subset, metric['x'], metric['y'])
            # A correlation has no separate median; the same value fills both columns.
            center = middle = corr
        else:
            continue
        records.append({
            'KPI_name': metric['label'],
            'segment_dimension': segment_col,
            'segment_value': label,
            'mean': center,
            'median': middle,
            'N': count,
        })
    return pd.DataFrame(records)

def compare_segments(df: pd.DataFrame, metric: Dict[str, Any], segment_col: str, pairs: List[Tuple[Any, Any]], min_n: int = 8) -> pd.DataFrame:
    """Compare a KPI between pairs of segments.

    For each ``(segment_A, segment_B)`` pair the result holds the difference
    and ratio (A relative to B) of the KPI, both sample sizes, and a p-value.
    The label 'Overall' selects the whole frame instead of one segment.

    - ``'series'`` metrics compare segment means of ``metric['kpi']``; a Welch
      t-test p-value is reported only when both samples have at least
      ``min_n`` observations.
    - ``'correlation'`` metrics compare the correlations of ``metric['x']``
      with ``metric['y']``; no p-value is computed for them.

    The ratio is NaN when either statistic is missing or B's statistic is 0.
    Metrics of any other type contribute no rows.
    """
    def pick(segment: Any) -> pd.DataFrame:
        # 'Overall' means the unfiltered frame.
        return df if segment == 'Overall' else df[df[segment_col] == segment]

    records = []
    for seg_a, seg_b in pairs:
        left, right = pick(seg_a), pick(seg_b)
        kind = metric['type']
        if kind == 'series':
            obs_a = left[metric['kpi']].dropna()
            obs_b = right[metric['kpi']].dropna()
            stat_a, stat_b = obs_a.mean(), obs_b.mean()
            n_a, n_b = len(obs_a), len(obs_b)
            if n_a >= min_n and n_b >= min_n:
                p_value = welch_t_pvalue(obs_a, obs_b)
            else:
                p_value = np.nan
        elif kind == 'correlation':
            stat_a, n_a = correlation_metric(left, metric['x'], metric['y'])
            stat_b, n_b = correlation_metric(right, metric['x'], metric['y'])
            p_value = np.nan
        else:
            continue

        both_defined = pd.notnull(stat_a) and pd.notnull(stat_b)
        diff = stat_a - stat_b if both_defined else np.nan
        ratio = stat_a / stat_b if both_defined and stat_b != 0 else np.nan
        records.append({
            'KPI': metric['label'],
            'segment_dimension': segment_col,
            'segment_A': seg_a,
            'segment_B': seg_b,
            'diff': diff,
            'ratio': ratio,
            'N_A': n_a,
            'N_B': n_b,
            'p_value': p_value,
        })
    return pd.DataFrame(records)

def build_scoreboard(comparisons: List[pd.DataFrame]) -> pd.DataFrame:
    """Stack comparison tables and order them from largest to smallest effect.

    Adds three ranking columns:

    - ``abs_diff``: absolute value of ``diff``.
    - ``ratio_deviation``: distance of ``ratio`` from 1 (NaN stays NaN).
    - ``n_min``: the smaller of ``N_A`` and ``N_B``.

    Rows are sorted descending by those columns in that order. The row index
    comes from the stacked table, so it shows each row's pre-sort position.
    """
    ranking_keys = ['abs_diff', 'ratio_deviation', 'n_min']
    stacked = pd.concat(comparisons, ignore_index=True).assign(
        abs_diff=lambda t: t['diff'].abs(),
        ratio_deviation=lambda t: (t['ratio'] - 1).abs(),
        n_min=lambda t: t[['N_A', 'N_B']].min(axis=1),
    )
    return stacked.sort_values(ranking_keys, ascending=[False, False, False])


In [None]:
# Hypothesis-specific KPI configurations
def _series_kpi(kpi: str, label: str) -> Dict[str, Any]:
    """Metric spec summarizing a single column (mean/median/N)."""
    return {'type': 'series', 'kpi': kpi, 'label': label}


def _corr_kpi(x: str, y: str, label: str) -> Dict[str, Any]:
    """Metric spec for the correlation between two columns."""
    return {'type': 'correlation', 'x': x, 'y': y, 'label': label}


# Segment pairs compared for each segmentation column.
regime_pairs = [
    ('2018-2019', '2020-2022'),
    ('2020-2022', '2023-2025'),
    ('2018-2019', '2023-2025'),
]
high_ffpi_pairs = [(True, False)]

# One entry per hypothesis: the segmentation column, the segment pairs to
# compare, and the metrics evaluated for that hypothesis.
hypothesis_plan = [
    {
        'id': 'H1',
        'description': 'ffpi_food level and MoM% by regime',
        'segment': 'regime',
        'pairs': regime_pairs,
        'metrics': [
            _series_kpi('ffpi_food', 'FFPI Food level'),
            _series_kpi('ffpi_food_mom_pct', 'FFPI Food MoM %'),
        ],
    },
    {
        'id': 'H2',
        'description': 'Veg oils vs food (levels + correlation)',
        'segment': 'regime',
        'pairs': regime_pairs,
        'metrics': [
            _series_kpi('ffpi_veg_oils', 'Veg oils level'),
            _series_kpi('ffpi_food', 'FFPI Food level'),
            _series_kpi('veg_food_ratio', 'Veg/food level ratio'),
            _corr_kpi('ffpi_veg_oils', 'ffpi_food', 'Corr(veg oils, food)'),
        ],
    },
    {
        'id': 'H4',
        'description': 'BDI lead (1–2m) vs FFPI MoM%',
        'segment': 'regime',
        'pairs': regime_pairs,
        'metrics': [
            _corr_kpi('bdi_price_lead1', 'ffpi_food_mom_pct', 'Corr(BDI lead1, FFPI MoM%)'),
            _corr_kpi('bdi_price_lead2', 'ffpi_food_mom_pct', 'Corr(BDI lead2, FFPI MoM%)'),
        ],
    },
    {
        'id': 'H5',
        'description': 'Import pass-through: IPI food vs FFPI food',
        'segment': 'regime',
        'pairs': regime_pairs,
        'metrics': [
            _series_kpi('ipi_food', 'IPI food level'),
            _corr_kpi('ipi_food', 'ffpi_food', 'Corr(IPI food, FFPI food)'),
        ],
    },
    {
        'id': 'H6',
        'description': 'Retail demand dampening vs FFPI food',
        'segment': 'high_ffpi_flag',
        'pairs': high_ffpi_pairs,
        'metrics': [
            _corr_kpi('rs_dairy_products', 'ffpi_food', 'Corr(RS dairy, FFPI food)'),
            _corr_kpi('rs_fresh', 'ffpi_food', 'Corr(RS fresh, FFPI food)'),
        ],
    },
]

# Ratio-style transforms live in this one helper so they stay centralized
def attach_transforms(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with ``veg_food_ratio`` added.

    ``veg_food_ratio`` is ``ffpi_veg_oils`` divided by ``ffpi_food``, row by
    row. The input frame is left unchanged.
    """
    ratio = df['ffpi_veg_oils'] / df['ffpi_food']
    return df.assign(veg_food_ratio=ratio)


In [None]:
# Execute the scan: load the panel, add derived columns, then evaluate every
# metric of every hypothesis against its segmentation column.
panel = attach_transforms(add_derived_features(load_panel(DATA_PATH, FALLBACK_CSV)))

segment_summaries: List[pd.DataFrame] = []
segment_comparisons: List[pd.DataFrame] = []

for hypothesis in hypothesis_plan:
    segment = hypothesis['segment']
    pairs = hypothesis['pairs']
    for metric in hypothesis['metrics']:
        # Tag each table with its hypothesis id so the stacked output stays traceable.
        summary = summarize_metric(panel, metric, segment).assign(hypothesis=hypothesis['id'])
        comparisons = compare_segments(panel, metric, segment, pairs).assign(hypothesis=hypothesis['id'])
        segment_summaries.append(summary)
        segment_comparisons.append(comparisons)

segment_summary_df = pd.concat(segment_summaries, ignore_index=True)
comparison_df = pd.concat(segment_comparisons, ignore_index=True)
scoreboard_df = build_scoreboard(segment_comparisons)

# Export the summary and the ranked scoreboard next to the other derived data.
segment_summary_path = DERIVED_DIR / 'segment_summary.csv'
scoreboard_path = DERIVED_DIR / 'segment_scoreboard.csv'
for frame, target in (
    (segment_summary_df, segment_summary_path),
    (scoreboard_df, scoreboard_path),
):
    frame.to_csv(target, index=False)

# Last expression of the cell: previews shown as notebook output.
segment_summary_df.head(), comparison_df.head(), scoreboard_df.head()
