# Insight validation (Step 1E)

For each high-scoring candidate insight (relevance_score ≥ 4), recompute the KPI differences, chart the pattern, and capture caveats.

In [None]:

import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Apply one plotting theme to every chart in the notebook.
plt.style.use('seaborn-v0_8')

# The data file is addressed relative to the directory two levels up.
project_root = Path('..', '..')
DATA_PATH = project_root.joinpath('Final Project/Final Project Repo/data/clean/data_clean.csv')

# Read the CSV, then convert the 'date' column to datetimes in the same step.
df = pd.read_csv(DATA_PATH).assign(date=lambda frame: pd.to_datetime(frame['date']))

# Regime flags for 2018–2019 (pre), 2020–2022 (stress), 2023–2025 (recent)
def assign_regime(ts):
    """Return the regime label for a timestamp, based on its calendar year.

    Parameters
    ----------
    ts
        Any object exposing a ``year`` attribute (e.g. ``pd.Timestamp`` or
        ``datetime.date``), or a missing value such as ``pd.NaT``.

    Returns
    -------
    str or None
        ``'2018-2019'`` for years up to and including 2019,
        ``'2020-2022'`` for 2020 through 2022, ``'2023-2025'`` for any later
        year, and ``None`` when ``ts`` is missing.

    Notes
    -----
    The first and last buckets are open-ended: a year before 2018 still maps
    to ``'2018-2019'`` and a year after 2025 still maps to ``'2023-2025'``.
    """
    # NaT.year is NaN, which fails both comparisons below and would otherwise
    # be silently labelled as the most recent regime.
    if pd.isna(ts):
        return None

    year = ts.year
    if year <= 2019:
        return '2018-2019'
    if year <= 2022:
        return '2020-2022'
    return '2023-2025'

# Label every row with its regime, then store the labels as an ordered
# categorical so groupby output and plots follow chronological order.
df['regime'] = df['date'].apply(assign_regime)
regime_order = ['2018-2019', '2020-2022', '2023-2025']
df['regime'] = pd.Categorical(df['regime'], categories=regime_order, ordered=True)

# Per-regime date window and row count (count of non-null dates).
# observed=False is passed explicitly: it keeps one row per regime category
# even if a regime has no data, and avoids the FutureWarning newer pandas
# raises when `observed` is omitted for a categorical grouper.
regime_windows = (
    df.groupby('regime', observed=False)['date']
    .agg(['min', 'max', 'count'])
    .rename(columns={'count': 'rows'})
)
regime_windows


In [None]:

# Candidate insights table (manually curated for relevance).
# Each entry carries: an id, the headline claim, the KPI column it refers to,
# its relevance score, and the (baseline, comparison) regime pairs to report.
MIN_RELEVANCE_SCORE = 4

candidate_insights = [
    {
        'id': 'CI1',
        'title': 'FFPI food level remains ~30 points higher post-2019',
        'kpi': 'ffpi_food',
        'relevance_score': 5,
        'pairs': [('2018-2019', '2020-2022'), ('2018-2019', '2023-2025')],
    },
    {
        'id': 'CI2',
        'title': 'Veg oils index surged during the 2020–2022 stress window',
        'kpi': 'ffpi_veg_oils',
        'relevance_score': 5,
        'pairs': [('2018-2019', '2020-2022'), ('2018-2019', '2023-2025')],
    },
    {
        'id': 'CI3',
        'title': 'Import price pressure (IPI food) stayed elevated through 2025',
        'kpi': 'ipi_food',
        'relevance_score': 4,
        'pairs': [('2018-2019', '2023-2025'), ('2020-2022', '2023-2025')],
    },
]

# Only keep those with relevance_score >= MIN_RELEVANCE_SCORE. The filter is
# applied explicitly so a lower-scored candidate added later is dropped
# instead of silently appearing in the table.
candidate_insights = [
    insight
    for insight in candidate_insights
    if insight['relevance_score'] >= MIN_RELEVANCE_SCORE
]
pd.DataFrame(candidate_insights)


## CI1: FFPI food level remains ~30 points higher post-2019  

- KPI: **ffpi_food**  
- Relevance_score: **5**  
- Validation focus: Validate the step-change in the FAO food price index level after 2019.

In [None]:

# Per-regime mean, median and non-null count (N) for the CI1 KPI, plus the
# absolute and relative gap of each regime's mean against 2018-2019.
ci1_kpi = 'ffpi_food'

# observed=False is explicit so every regime category keeps a row and the
# result does not depend on the pandas default for categorical groupers.
ci1_summary = (
    df.groupby('regime', observed=False)[ci1_kpi]
    .agg(mean='mean', median='median', N='count')
    .reset_index()
)

# Look the 2018-2019 baseline mean up once and reuse it for both columns.
ci1_baseline_mean = ci1_summary.loc[ci1_summary['regime'] == '2018-2019', 'mean'].iloc[0]
ci1_summary['diff_vs_2018_2019'] = ci1_summary['mean'] - ci1_baseline_mean
ci1_summary['ratio_vs_2018_2019'] = ci1_summary['mean'] / ci1_baseline_mean
ci1_summary


In [None]:

# Time-series chart of ffpi_food with each regime's date window shaded.
fig, ax = plt.subplots(figsize=(9, 4))
ax.plot(df['date'], df['ffpi_food'], color='#1f77b4', linewidth=1.5)
ax.set_title('FFPI food level remains ~30 points higher post-2019')
ax.set_ylabel('ffpi_food')
ax.set_xlabel('Date')

# Shade regimes for context
regime_colors = {'2018-2019': '#d9ead3', '2020-2022': '#fce5cd', '2023-2025': '#d9d2e9'}
# observed=True restricts the loop to regimes that actually have rows, so
# min()/max() are always real dates; it also makes the behaviour independent
# of the pandas default for categorical groupers.
for reg, group in df.groupby('regime', observed=True):
    ax.axvspan(
        group['date'].min(),
        group['date'].max(),
        color=regime_colors.get(reg, '#f0f0f0'),
        alpha=0.2,
        label=reg,
    )

# Each regime adds exactly one labelled span, so the default legend already
# has one entry per regime (the unlabelled KPI line is left out).
ax.legend(title='Regime')
ax.grid(True, linestyle='--', alpha=0.4)
plt.tight_layout()


Caveats and context:
- Sample sizes by regime: 2018-2019 n=24, 2020-2022 n=36, 2023-2025 n=34.
- Date coverage: 2018-2019: 2018-01-01 to 2019-12-01, 2020-2022: 2020-01-01 to 2022-12-01, 2023-2025: 2023-01-01 to 2025-11-01.
- Key differences: 2020-2022 vs 2018-2019: Δ = 27.44, ratio = 1.29; 2023-2025 vs 2018-2019: Δ = 29.21, ratio = 1.31.
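- Data caveats: No missing values were flagged for this KPI. Note, however, that 2023-01 through 2025-11 spans 35 months while the 2023-2025 sample has n=34, so one recent observation appears to be missing; verify this before treating the monthly cadence as intact.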

## CI2: Veg oils index surged during the 2020–2022 stress window  

- KPI: **ffpi_veg_oils**  
- Relevance_score: **5**  
- Validation focus: Check the magnitude of the veg-oils spike relative to the pre-COVID baseline.

In [None]:

# Per-regime mean, median and non-null count (N) for the CI2 KPI, plus the
# absolute and relative gap of each regime's mean against 2018-2019.
ci2_kpi = 'ffpi_veg_oils'

# observed=False is explicit so every regime category keeps a row and the
# result does not depend on the pandas default for categorical groupers.
ci2_summary = (
    df.groupby('regime', observed=False)[ci2_kpi]
    .agg(mean='mean', median='median', N='count')
    .reset_index()
)

# Look the 2018-2019 baseline mean up once and reuse it for both columns.
ci2_baseline_mean = ci2_summary.loc[ci2_summary['regime'] == '2018-2019', 'mean'].iloc[0]
ci2_summary['diff_vs_2018_2019'] = ci2_summary['mean'] - ci2_baseline_mean
ci2_summary['ratio_vs_2018_2019'] = ci2_summary['mean'] / ci2_baseline_mean
ci2_summary


In [None]:

# Time-series chart of ffpi_veg_oils with each regime's date window shaded.
fig, ax = plt.subplots(figsize=(9, 4))
ax.plot(df['date'], df['ffpi_veg_oils'], color='#1f77b4', linewidth=1.5)
ax.set_title('Veg oils index surged during the 2020–2022 stress window')
ax.set_ylabel('ffpi_veg_oils')
ax.set_xlabel('Date')

# Shade regimes for context
regime_colors = {'2018-2019': '#d9ead3', '2020-2022': '#fce5cd', '2023-2025': '#d9d2e9'}
# observed=True restricts the loop to regimes that actually have rows, so
# min()/max() are always real dates; it also makes the behaviour independent
# of the pandas default for categorical groupers.
for reg, group in df.groupby('regime', observed=True):
    ax.axvspan(
        group['date'].min(),
        group['date'].max(),
        color=regime_colors.get(reg, '#f0f0f0'),
        alpha=0.2,
        label=reg,
    )

# Each regime adds exactly one labelled span, so the default legend already
# has one entry per regime (the unlabelled KPI line is left out).
ax.legend(title='Regime')
ax.grid(True, linestyle='--', alpha=0.4)
plt.tight_layout()


Caveats and context:
- Sample sizes by regime: 2018-2019 n=24, 2020-2022 n=36, 2023-2025 n=34.
- Date coverage: 2018-2019: 2018-01-01 to 2019-12-01, 2020-2022: 2020-01-01 to 2022-12-01, 2023-2025: 2023-01-01 to 2025-11-01.
- Key differences: 2020-2022 vs 2018-2019: Δ = 65.19, ratio = 1.76; 2023-2025 vs 2018-2019: Δ = 55.16, ratio = 1.65.
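- Data caveats: No missing values were flagged for this KPI. Note, however, that 2023-01 through 2025-11 spans 35 months while the 2023-2025 sample has n=34, so one recent observation appears to be missing; verify this before treating the monthly cadence as intact.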

## CI3: Import price pressure (IPI food) stayed elevated through 2025  

- KPI: **ipi_food**  
- Relevance_score: **4**  
- Validation focus: Confirm the upward shift in the food import price index versus 2018–2019.

In [None]:

# Per-regime mean, median and non-null count (N) for the CI3 KPI, plus the
# absolute and relative gap of each regime's mean against each baseline.
ci3_kpi = 'ipi_food'

# observed=False is explicit so every regime category keeps a row and the
# result does not depend on the pandas default for categorical groupers.
ci3_summary = (
    df.groupby('regime', observed=False)[ci3_kpi]
    .agg(mean='mean', median='median', N='count')
    .reset_index()
)

# CI3 is reported against both 2018-2019 and 2020-2022, so diff/ratio columns
# are computed for each baseline; the original *_vs_2018_2019 columns keep
# their names and values.
for baseline_regime in ('2018-2019', '2020-2022'):
    baseline_mean = ci3_summary.loc[ci3_summary['regime'] == baseline_regime, 'mean'].iloc[0]
    suffix = baseline_regime.replace('-', '_')
    ci3_summary[f'diff_vs_{suffix}'] = ci3_summary['mean'] - baseline_mean
    ci3_summary[f'ratio_vs_{suffix}'] = ci3_summary['mean'] / baseline_mean
ci3_summary


In [None]:

# Time-series chart of ipi_food with each regime's date window shaded.
fig, ax = plt.subplots(figsize=(9, 4))
ax.plot(df['date'], df['ipi_food'], color='#1f77b4', linewidth=1.5)
ax.set_title('Import price pressure (IPI food) stayed elevated through 2025')
ax.set_ylabel('ipi_food')
ax.set_xlabel('Date')

# Shade regimes for context
regime_colors = {'2018-2019': '#d9ead3', '2020-2022': '#fce5cd', '2023-2025': '#d9d2e9'}
# observed=True restricts the loop to regimes that actually have rows, so
# min()/max() are always real dates; it also makes the behaviour independent
# of the pandas default for categorical groupers.
for reg, group in df.groupby('regime', observed=True):
    ax.axvspan(
        group['date'].min(),
        group['date'].max(),
        color=regime_colors.get(reg, '#f0f0f0'),
        alpha=0.2,
        label=reg,
    )

# Each regime adds exactly one labelled span, so the default legend already
# has one entry per regime (the unlabelled KPI line is left out).
ax.legend(title='Regime')
ax.grid(True, linestyle='--', alpha=0.4)
plt.tight_layout()


Caveats and context:
- Sample sizes by regime: 2018-2019 n=24, 2020-2022 n=36, 2023-2025 n=33.
- Date coverage: 2018-2019: 2018-01-01 to 2019-12-01, 2020-2022: 2020-01-01 to 2022-12-01, 2023-2025: 2023-01-01 to 2025-11-01.
- Key differences: 2023-2025 vs 2018-2019: Δ = 12.26, ratio = 1.14; 2023-2025 vs 2020-2022: Δ = 7.61, ratio = 1.08.
- Data caveats: Missing IPI observations for 2025-10 and 2025-11 reduce the recent-sample N to 33.