In [3]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier, XGBRegressor

In [4]:
# Load the preprocessed dataset that every later cell slices into features and targets.
df = pd.read_csv(filepath_or_buffer="../data/preproc.csv")

In [5]:
# Classification target: used as-is, no imputation.
y_risk = df["Flood Risk"]

# Regression targets: gaps in each column are filled with that column's own median.
y_area = df["Area affected in (m.ha)"].pipe(lambda col: col.fillna(col.median()))
y_pop = df["Population affected in (million)"].pipe(lambda col: col.fillna(col.median()))
y_crops = df["Damage to Crops"].pipe(lambda col: col.fillna(col.median()))
y_houses = df["Damage to Houses"].pipe(lambda col: col.fillna(col.median()))

# Feature matrix: everything left once the columns below (including all five
# targets) have been removed.
X = df.drop(
    columns=[
        'Year-Month',
        'Flood Risk',
        'Area affected in (m.ha)',
        'Population affected in (million)',
        'Damage to Crops',
        'Damage to Houses',
        'Flood Occurred',
    ]
)

In [6]:
# Regression targets are modelled on a log1p scale; expm1 is the matching
# inverse used later to map predictions back to the original units.
# log1p has no learned parameters, so one transformer instance is shared by
# all four targets.
log_transformer = FunctionTransformer(np.log1p, inverse_func=np.expm1, validate=True)

scalers = {
    "area": log_transformer,
    "population": log_transformer,
    "crops": log_transformer,
    "houses": log_transformer
}

# validate=True requires 2-D input, hence the reshape to a single column.
y_area_scaled = scalers["area"].fit_transform(y_area.to_numpy().reshape(-1, 1))
y_pop_scaled = scalers["population"].fit_transform(y_pop.to_numpy().reshape(-1, 1))
y_crops_scaled = scalers["crops"].fit_transform(y_crops.to_numpy().reshape(-1, 1))
y_houses_scaled = scalers["houses"].fit_transform(y_houses.to_numpy().reshape(-1, 1))

# Create the output directory first: joblib.dump does not create missing
# parent directories, so a fresh checkout without ../models would fail here.
os.makedirs("../models", exist_ok=True)
for name, scaler in scalers.items():
    joblib.dump(scaler, f"../models/scaler_{name}.pkl")


In [7]:
# A single call splits the features and all five targets together, so every
# array shares the same 80/20 row partition (fixed seed).
(
    X_train, X_test,
    y_risk_train, y_risk_test,
    y_area_train, y_area_test,
    y_pop_train, y_pop_test,
    y_crops_train, y_crops_test,
    y_houses_train, y_houses_test,
) = train_test_split(
    X,
    y_risk,
    y_area_scaled,
    y_pop_scaled,
    y_crops_scaled,
    y_houses_scaled,
    random_state=42,
    test_size=0.2,
)

In [8]:
# Hyper-parameter grid shared by the classifier and every regressor search:
# 3 x 2 x 2 x 1 x 1 = 12 candidate combinations.
param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.05, 0.1],
    max_depth=[3, 6],
    subsample=[0.7],
    colsample_bytree=[0.7],
)

In [9]:
# 3-fold grid search over param_grid, selecting the flood-risk classifier by accuracy.
# fit() returns the search object itself, so the call can be chained.
clf = GridSearchCV(
    estimator=XGBClassifier(random_state=42),
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
).fit(X_train, y_risk_train)

# Best estimator found by the search, then scored once on the held-out split.
best_clf = clf.best_estimator_
y_pred_risk = best_clf.predict(X_test)
print(f"Flood Risk Accuracy: {accuracy_score(y_risk_test, y_pred_risk)}")

Flood Risk Accuracy: 0.8754950495049505


In [10]:
# Persist the tuned flood-risk classifier; the cell displays the list of written paths.
joblib.dump(value=best_clf, filename="../models/xgb_flood_risk_model.pkl")

['../models/xgb_flood_risk_model.pkl']

In [11]:
def train_and_save_regressor(X_train, y_train, X_test, y_test, target_name):
    """Tune, evaluate and persist an XGBoost regressor for one target.

    Runs a 3-fold grid search over the notebook-level ``param_grid`` (scored by
    negative mean squared error), evaluates the best estimator on the held-out
    split, prints the RMSE in the target's original units and saves the model
    to ``../models/xgb_{target_name}_model.pkl``.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and held-out feature matrices.
    y_train, y_test : array-like
        Target values on the transformed scale of ``scalers[target_name]``.
        Either 1-D or a single column; pandas objects are accepted.
    target_name : str
        Key into the notebook-level ``scalers`` dict; also used in the printed
        messages and in the output file name.

    Raises
    ------
    KeyError
        If ``target_name`` has no entry in ``scalers``.
    """
    # Resolve the scaler first so an unknown target_name fails immediately
    # instead of after the whole grid search has run.
    scaler = scalers[target_name]

    print(f"Tuning {target_name}...")
    reg = GridSearchCV(XGBRegressor(random_state=42), param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
    # Flatten the single-column target to the 1-D shape estimators expect.
    reg.fit(X_train, np.asarray(y_train).ravel())
    best_reg = reg.best_estimator_

    # The scaler works on 2-D input, so predictions and truth are reshaped to
    # one column before being mapped back to original units.
    y_pred = np.asarray(best_reg.predict(X_test)).reshape(-1, 1)
    # Clip at zero: the inverse transform can yield small negative values.
    y_pred_unscaled = np.clip(scaler.inverse_transform(y_pred), 0, None)

    # np.asarray keeps this working when y_test is a pandas Series/DataFrame.
    y_test_unscaled = scaler.inverse_transform(np.asarray(y_test).reshape(-1, 1))
    rmse = np.sqrt(mean_squared_error(y_test_unscaled, y_pred_unscaled))
    print(f"{target_name.capitalize()} RMSE: {rmse}")

    # joblib.dump does not create missing parent directories.
    os.makedirs("../models", exist_ok=True)
    joblib.dump(best_reg, f"../models/xgb_{target_name}_model.pkl")


In [12]:
# Fit, score and save the regressor for the "area" target.
train_and_save_regressor(
    X_train=X_train,
    y_train=y_area_train,
    X_test=X_test,
    y_test=y_area_test,
    target_name="area",
)

Tuning area...
Area RMSE: 9.840557025325266


In [13]:
# Fit, score and save the regressor for the "population" target.
train_and_save_regressor(
    X_train=X_train,
    y_train=y_pop_train,
    X_test=X_test,
    y_test=y_pop_test,
    target_name="population",
)

Tuning population...
Population RMSE: 35.081503267547454


In [14]:
# Fit, score and save the regressor for the "crops" target.
train_and_save_regressor(
    X_train=X_train,
    y_train=y_crops_train,
    X_test=X_test,
    y_test=y_crops_test,
    target_name="crops",
)

Tuning crops...
Crops RMSE: 47.97465974734991


In [15]:
# Fit, score and save the regressor for the "houses" target.
train_and_save_regressor(
    X_train=X_train,
    y_train=y_houses_train,
    X_test=X_test,
    y_test=y_houses_test,
    target_name="houses",
)

Tuning houses...
Houses RMSE: 856.3776284060484
