#### 1. Imports & load raw data

In [1]:
import pandas as pd
import numpy as np

# Load the strongest version you just generated
DATA_PATH = "../data/processed/crop_risk_insurance_v2.csv"
df = pd.read_csv(DATA_PATH)

print("Loaded shape:", df.shape)
print("Columns:", df.columns.tolist())
print("\nRisk distribution:\n", df['risk_class'].value_counts(normalize=True).round(3))

Loaded shape: (18000, 17)
Columns: ['country', 'crop', 'season_year', 'rainfall_mm', 'avg_temp_c', 'heat_stress_days', 'ndvi_peak', 'soil_ph', 'soc_percent', 'fertilizer_n_kg_ha', 'pest_disease_level', 'irrigated', 'actual_yield_t_ha', 'expected_yield_t_ha', 'yield_loss_pct', 'risk_class', 'payout_usd_per_ha']

Risk distribution:
 risk_class
High      0.553
Low       0.383
Medium    0.064
Name: proportion, dtype: float64


#### 2 – Create engineered features

In [2]:
# 1. Rainfall-based drought severity (categorical + interaction)
df['drought_severity'] = pd.cut(
    df['rainfall_mm'],
    bins=[-np.inf, 250, 400, 600, np.inf],
    labels=[3, 2, 1, 0],  # 3 = extreme drought, 0 = no drought
    include_lowest=True
).astype(int)

# Rainfall × NDVI interaction (very low rain + low NDVI = very high risk)
df['rain_ndvi_interaction'] = df['rainfall_mm'] * df['ndvi_peak']

# 2. Combined environmental stress (rain + heat + NDVI)
df['env_stress_index'] = (
    (3 - df['drought_severity']) * 0.4 +
    (df['heat_stress_days'] / 50) * 0.3 +
    ((0.95 - df['ndvi_peak']) / 0.95) * 0.3
).clip(0, 1)

# 3. Pest × heat interaction (pests worse in hot conditions)
df['pest_heat_interaction'] = df['pest_disease_level'] * (df['avg_temp_c'] / 30)

# 4. Management quality score (fertilizer + irrigation + SOC)
df['management_score'] = (
    (df['fertilizer_n_kg_ha'] / 150) * 0.4 +
    df['irrigated'] * 0.4 +
    (df['soc_percent'] / 3.5) * 0.2
).clip(0, 1)

# 5. NDVI stress flag + deviation squared (non-linear penalty)
df['ndvi_stress_flag'] = (df['ndvi_peak'] < 0.55).astype(int)
df['ndvi_penalty'] = (0.7 - df['ndvi_peak']).clip(0, 0.65) ** 2

print("New engineered features added:")
print(df.columns[-8:].tolist())

New engineered features added:
['payout_usd_per_ha', 'drought_severity', 'rain_ndvi_interaction', 'env_stress_index', 'pest_heat_interaction', 'management_score', 'ndvi_stress_flag', 'ndvi_penalty']


#### 3 – One-hot encode country & crop

In [3]:
# One-hot encode categorical variables
df = pd.get_dummies(df, columns=['country', 'crop', 'drought_severity'],
                    drop_first=True, prefix=['country', 'crop', 'drought'])

# Drop columns we don't want in modeling (targets + non-predictive)
drop_cols = [
    'actual_yield_t_ha',
    'expected_yield_t_ha',
    'yield_loss_pct',
    'payout_usd_per_ha',
    'risk_class',
    'season_year',
    'yield_potential_t_ha'
]

df_model = df.drop(columns=drop_cols, errors='ignore')

# Ensure all columns are numeric
df_model = df_model.astype(float)

print("Final modeling-ready shape:", df_model.shape)
print("All columns numeric?", not df_model.select_dtypes(exclude='number').columns.any())

Final modeling-ready shape: (18000, 28)
All columns numeric? True


#### 4 – Save engineered & encoded dataset

In [4]:
SAVE_PATH = "../data/processed/crop_risk_insurance_ready_for_model_v2.csv"
df_model.to_csv(SAVE_PATH, index=False)
print(f"Ready-for-modeling dataset saved to:\n{SAVE_PATH}")

print("\nFinal columns (first 15 + last 5):")
print(df_model.columns[:15].tolist() + df_model.columns[-5:].tolist())

Ready-for-modeling dataset saved to:
../data/processed/crop_risk_insurance_ready_for_model_v2.csv

Final columns (first 15 + last 5):
['rainfall_mm', 'avg_temp_c', 'heat_stress_days', 'ndvi_peak', 'soil_ph', 'soc_percent', 'fertilizer_n_kg_ha', 'pest_disease_level', 'irrigated', 'rain_ndvi_interaction', 'env_stress_index', 'pest_heat_interaction', 'management_score', 'ndvi_stress_flag', 'ndvi_penalty', 'crop_Millet', 'crop_Sorghum', 'drought_1', 'drought_2', 'drought_3']
