# 🌴🌊 SawitFlood Lab - Risk Classification Modeling

Training dan Evaluasi Model Klasifikasi Risiko Banjir


In [None]:
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    roc_curve,
)
import xgboost as xgb

# The notebook is expected to run from a sub-directory of the project, so the
# parent of the working directory is treated as the project root.
PROJECT_ROOT = Path.cwd().parent

# Make project-local packages importable. The membership check keeps the cell
# idempotent: re-running it no longer piles duplicate entries onto sys.path.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

In [None]:
# Load the analysis dataset: use the cached parquet file when present, then the
# cached CSV, and only rebuild it through DatasetBuilder when neither exists.
processed_dir = PROJECT_ROOT / "data" / "processed"
parquet_path = processed_dir / "analysis_dataset.parquet"
csv_path = processed_dir / "analysis_dataset.csv"

if parquet_path.exists():
    df = pd.read_parquet(parquet_path)
elif csv_path.exists():
    df = pd.read_csv(csv_path)
else:
    # Imported lazily so the project package is only needed for a rebuild.
    from src.data.build_dataset import DatasetBuilder

    builder = DatasetBuilder()
    gdf = builder.build_analysis_dataset()
    # The geometry column, when present, is dropped from the modelling table.
    df = gdf if "geometry" not in gdf.columns else gdf.drop(columns=["geometry"])
print(f"Dataset shape: {df.shape}")

In [None]:
# Prepare features and target
# Identifier, geometry and label columns must never be used as predictors.
exclude_cols = {
    "geometry",
    "kabupaten_id",
    "id",
    "name",
    "province",
    "kabupaten",
    "flood_risk_label",
}

# Accept every numeric dtype (float32, int32, ... as well as float64/int64) so
# that narrower numeric columns are not silently dropped. Boolean columns are
# still skipped, exactly as they were with the float64/int64-only check.
feature_cols = [
    col
    for col in df.columns
    if col not in exclude_cols
    and pd.api.types.is_numeric_dtype(df[col].dtype)
    and not pd.api.types.is_bool_dtype(df[col].dtype)
]

X_raw = df[feature_cols]
y = df["flood_risk_label"].astype(int)

# Split BEFORE imputing: the fill values must come from the training rows only,
# otherwise statistics of the held-out rows leak into the training features.
X_train, X_test, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.2, random_state=42, stratify=y
)

train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Full imputed matrix, kept under its original name and filled with the same
# train-only medians so it stays consistent with X_train / X_test.
X = X_raw.fillna(train_medians)
print(f"Training: {len(X_train)}, Test: {len(X_test)}")

## Train XGBoost Model


In [None]:
# Hyperparameters are collected in one mapping so they can be inspected or
# logged as a unit before being handed to the classifier.
xgb_params = {
    "n_estimators": 200,
    "max_depth": 6,
    "learning_rate": 0.1,
    "random_state": 42,
    "use_label_encoder": False,
    "eval_metric": "logloss",
}
xgb_model = xgb.XGBClassifier(**xgb_params)
xgb_model.fit(X_train, y_train)

# Hard class predictions feed the F1 score; the second predict_proba column
# (presumably the positive class -- confirm the label encoding) feeds ROC-AUC.
y_pred = xgb_model.predict(X_test)
y_pred_proba = xgb_model.predict_proba(X_test)[:, 1]

# Report hold-out performance on the test split.
print("XGBoost Results:")
print(f"  F1-Score:  {f1_score(y_test, y_pred):.4f}")
print(f"  ROC-AUC:   {roc_auc_score(y_test, y_pred_proba):.4f}")

In [None]:
# Feature Importance
# Pair each feature name with the importance reported by the fitted model,
# order the table from least to most important and keep the last ten rows.
ranked_features = pd.DataFrame(
    {"Feature": feature_cols, "Importance": xgb_model.feature_importances_}
).sort_values("Importance", ascending=True)
importance_df = ranked_features.iloc[-10:]

# Horizontal bars: with ascending order the largest importance is drawn last.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(importance_df["Feature"], importance_df["Importance"], color="steelblue")
ax.set_title("Top 10 Feature Importance")
ax.set_xlabel("Importance")
fig.tight_layout()
plt.show()

In [None]:
# Save model
# Persist the fitted classifier as a pickle and write a JSON side-car file
# holding the feature list, creation time and test-set metrics.
import json
import pickle
from datetime import datetime

models_dir = PROJECT_ROOT / "models"
models_dir.mkdir(exist_ok=True)

# A second-resolution timestamp in the file name keeps successive runs apart.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
model_name = f"flood_risk_xgboost_{timestamp}"
model_path = models_dir / f"{model_name}.pkl"
metadata_path = models_dir / f"{model_name}_metadata.json"

with model_path.open("wb") as model_file:
    pickle.dump(xgb_model, model_file)

# Built in steps; the key order matches the layout of the written JSON file.
metadata = {
    "model_type": "xgboost",
    "feature_names": feature_cols,
    "created_at": datetime.now().isoformat(),
}
metadata["training_history"] = {
    "metrics": {
        "f1_score": f1_score(y_test, y_pred),
        "roc_auc": roc_auc_score(y_test, y_pred_proba),
    }
}

with metadata_path.open("w") as metadata_file:
    json.dump(metadata, metadata_file, indent=2)
print(f"Model saved to: {models_dir / model_name}.pkl")