#### 1 – Imports

In [1]:
# Third-party imports for the whole notebook, grouped by package and
# alphabetised within each group.
import joblib
import numpy as np
import pandas as pd

# scikit-learn: model, metrics, splitting and label-encoding utilities.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from xgboost import XGBRegressor

# Reaching this line means every import above resolved.
print("All imports successful")

All imports successful


#### 2 – Load the data & quick inspection

In [2]:
# Location of the processed dataset, relative to the notebook's working directory.
DATA_PATH = "../data/processed/crop_risk_insurance_v2.csv"

df = pd.read_csv(DATA_PATH)

# Basic shape / schema overview.
print("Loaded shape:", df.shape)
print("\nColumns:", df.columns.tolist())

# Both target columns must be present for the modelling cells further down.
has_risk_class = 'risk_class' in df.columns
has_yield_loss = 'yield_loss_pct' in df.columns
print("\nHas targets?", has_risk_class and has_yield_loss)

# Relative class frequencies, or a placeholder message when the column is absent.
if has_risk_class:
    risk_distribution = df['risk_class'].value_counts(normalize=True).round(3)
else:
    risk_distribution = "No risk_class column"
print("\nRisk class distribution:\n", risk_distribution)

Loaded shape: (18000, 17)

Columns: ['country', 'crop', 'season_year', 'rainfall_mm', 'avg_temp_c', 'heat_stress_days', 'ndvi_peak', 'soil_ph', 'soc_percent', 'fertilizer_n_kg_ha', 'pest_disease_level', 'irrigated', 'actual_yield_t_ha', 'expected_yield_t_ha', 'yield_loss_pct', 'risk_class', 'payout_usd_per_ha']

Has targets? True

Risk class distribution:
 risk_class
High      0.553
Low       0.383
Medium    0.064
Name: proportion, dtype: float64


#### 3 – One-hot encode categorical features

In [3]:
# One-hot encode the categorical columns, dropping the first level of each
# (drop_first=True) so the dummy columns are not perfectly collinear.
#
# The cell rebinds `df`, so it must be safe to re-run: after the first run the
# raw 'country' / 'crop' columns no longer exist and asking get_dummies to
# encode them again would raise a KeyError. Only encode what is still present.
categorical_cols = ['country', 'crop']
pending_cols = [col for col in categorical_cols if col in df.columns]

if pending_cols:
    # prefix mirrors the column names, which keeps the 'country_*' / 'crop_*'
    # naming relied on by the filter below.
    df = pd.get_dummies(df, columns=pending_cols, drop_first=True, prefix=pending_cols)

print("\nAfter one-hot encoding – new columns added:")
print(df.filter(regex='^country_|^crop_').columns.tolist())
print("\nTotal columns now:", len(df.columns))


After one-hot encoding – new columns added:
['country_Kenya', 'country_Malawi', 'country_Tanzania', 'country_Uganda', 'country_Zambia', 'crop_Cassava', 'crop_Groundnut', 'crop_Maize', 'crop_Millet', 'crop_Sorghum']

Total columns now: 25


#### 4 – Define features (X) and targets (y)

In [4]:
# Columns kept out of the feature matrix: the two targets plus columns that
# are presumably outcome-derived or otherwise unwanted as predictors
# (NOTE(review): confirm the rationale for each, e.g. 'season_year').
exclude = [
    'actual_yield_t_ha',
    'expected_yield_t_ha',
    'yield_loss_pct',
    'payout_usd_per_ha',
    'risk_class',
    'season_year',
    'yield_potential_t_ha'
]

# Surface excluded names that are not in the frame instead of silently
# ignoring them: a misspelled target column here would leak into X unnoticed.
absent = [col for col in exclude if col not in df.columns]
if absent:
    print("Note: excluded columns not present in df (skipped):", absent)

# Create X
X = df.drop(columns=[col for col in exclude if col in df.columns])

# Targets (separate): class label for the classifier, loss percentage for regression
y_class_raw = df['risk_class']
y_loss      = df['yield_loss_pct']

# Encode classification target (LabelEncoder comes from the notebook's import cell)
le = LabelEncoder()
y_class = le.fit_transform(y_class_raw)

# The encoded value of each class is its position in le.classes_.
print("Classification target encoded. Mapping:")
for code, cls in enumerate(le.classes_):
    print(f"  {cls:6} → {code}")

# Final safety checks: any column that is neither numeric nor boolean still
# needs encoding. 'number' covers every numeric width (int32, float32, ...),
# not only the 64-bit ones.
non_numeric = X.select_dtypes(exclude=['number', 'bool']).columns.tolist()

print("\nX shape:", X.shape)
print("All X numeric now?", not non_numeric)

if non_numeric:
    print("WARNING: Still non-numeric in X:", non_numeric)
else:
    print("SUCCESS: X is fully numeric – ready for modeling!")

# Quick preview
print("\nX first 5 columns:", X.columns[:5].tolist())
print("X last 5 columns:", X.columns[-5:].tolist())
print("y_class sample:\n", y_class[:5])

Classification target encoded. Mapping:
  High   → 0
  Low    → 1
  Medium → 2

X shape: (18000, 19)
All X numeric now? True
SUCCESS: X is fully numeric – ready for modeling!

X first 5 columns: ['rainfall_mm', 'avg_temp_c', 'heat_stress_days', 'ndvi_peak', 'soil_ph']
X last 5 columns: ['crop_Cassava', 'crop_Groundnut', 'crop_Maize', 'crop_Millet', 'crop_Sorghum']
y_class sample:
 [0 0 0 0 1]


In [5]:
# Split features and BOTH targets in a single call so every output refers to
# the same rows. Splitting y_loss in a separate, non-stratified call selects a
# different set of rows even with the same random_state, which would leave
# y_loss_train / y_loss_test misaligned with X_train / X_test.
X_train, X_test, y_class_train, y_class_test, y_loss_train, y_loss_test = train_test_split(
    X, y_class, y_loss,
    test_size=0.20,
    random_state=42,
    stratify=y_class      # keep the class proportions equal in train and test
)

# Guard against the misalignment described above.
assert y_loss_train.index.equals(X_train.index), "y_loss_train is not aligned with X_train"
assert y_loss_test.index.equals(X_test.index), "y_loss_test is not aligned with X_test"

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)
print("\nClass distribution in train (encoded):")
print(pd.Series(y_class_train).value_counts(normalize=True).round(3))

Train shape: (14400, 19)
Test shape: (3600, 19)

Class distribution in train (encoded):
0    0.553
1    0.383
2    0.064
Name: proportion, dtype: float64


#### 5 – Random Forest Classifier (with balanced weights)

In [16]:
# Train, evaluate and persist a compact Random Forest risk classifier together
# with the label encoder needed to decode its predictions.
#
# RandomForestClassifier, classification_report, LabelEncoder, train_test_split
# and joblib come from the notebook's import cell; `os` is not imported there,
# so it is brought in here.
import os

encoder_path = "../models/risk_class_encoder.joblib"
model_path = "../models/risk_classifier_rf_small.joblib"

# joblib.dump does not create missing directories.
os.makedirs(os.path.dirname(encoder_path), exist_ok=True)

# Reuse a previously saved encoder only when it still describes the labels in
# the current data; a stale encoder would make le.transform fail on unseen
# labels. NOTE(security): joblib.load unpickles the file, so only load
# artifacts produced by this project.
le = None
if os.path.exists(encoder_path):
    cached_encoder = joblib.load(encoder_path)
    cached_classes = set(getattr(cached_encoder, "classes_", []))
    if cached_classes == set(df['risk_class'].unique()):
        le = cached_encoder
        print("Loaded existing label encoder")
    else:
        print("Saved label encoder does not match current labels – refitting")

if le is None:
    le = LabelEncoder()
    le.fit(df['risk_class'])  # fit on full original labels
    joblib.dump(le, encoder_path)
    print("Created and saved new label encoder")

# Encode the string labels with the encoder chosen above
y_class_encoded = le.transform(df['risk_class'])

# Train/test split (use encoded y for training)
X_train, X_test, y_class_train, y_class_test = train_test_split(
    X, y_class_encoded,
    test_size=0.20,
    random_state=42,
    stratify=y_class_encoded
)

# Retrain SMALLER model for deployment & GitHub push
rf_small = RandomForestClassifier(
    n_estimators=100,               # small enough for GitHub
    max_depth=8,
    min_samples_split=10,
    min_samples_leaf=5,
    class_weight='balanced_subsample',
    random_state=42,
    n_jobs=-1
)

rf_small.fit(X_train, y_class_train)

# Predict & evaluate; decode back to the string labels so the report is readable
y_pred_encoded = rf_small.predict(X_test)
y_pred_str = le.inverse_transform(y_pred_encoded)
y_test_str = le.inverse_transform(y_class_test)

print("Classification Report (Small Random Forest):")
print(classification_report(y_test_str, y_pred_str, digits=3))

# Save both
joblib.dump(rf_small, model_path)
joblib.dump(le, encoder_path)  # always save encoder too

print("Small RF model & encoder saved")
print("Model file size (approx):", round(os.path.getsize(model_path) / (1024 * 1024), 2), "MB")
print("Encoder file size (approx):", round(os.path.getsize(encoder_path) / (1024 * 1024), 2), "MB")

Loaded existing label encoder
Classification Report (Small Random Forest):
              precision    recall  f1-score   support

        High      0.985     0.890     0.935      1992
         Low      0.978     0.832     0.899      1378
      Medium      0.293     0.800     0.429       230

    accuracy                          0.862      3600
   macro avg      0.752     0.841     0.755      3600
weighted avg      0.938     0.862     0.889      3600

Small RF model & encoder saved
Model file size (approx): 1.96 MB
Encoder file size (approx): 0.0 MB
