Train-Test Split

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split


# Read the flood dataset from disk.
# NOTE(review): the file name suggests the features are already scaled — confirm
# the scaling was not fitted on rows that end up in the test split.
file_path = "../data/scaled_flood_data.csv"
df = pd.read_csv(file_path)

# Columns excluded from the feature matrix: the target itself plus the other
# columns listed here (presumably post-event outcomes / identifiers — confirm).
features_to_remove = [
    'Flood Occurred',
    'Damage to Crops',
    'Damage to Houses',
    'Area affected in (m.ha)',
    'Population affected in (million)',
    'Flood Risk',
    'River',
]

# Target vector and feature matrix.
y = df['Flood Occurred']
X = df.drop(features_to_remove, axis=1)

# 80/20 hold-out split with a fixed seed; stratifying on y keeps the class
# proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)
print(f"Training Set: {X_train.shape}, Testing Set: {X_test.shape}")


Training Set: (26280, 6), Testing Set: (6571, 6)


Random Forest Model

In [27]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest hyperparameters, gathered in one place so they are easy to tune.
rf_params = {
    "n_estimators": 100,
    "max_depth": 10,
    "min_samples_split": 5,
    "min_samples_leaf": 2,
    "class_weight": "balanced",
    "random_state": 42,  # fixed seed so the fitted forest is reproducible
}
rf_model = RandomForestClassifier(**rf_params)

# Fit on the training split only; the test split is kept for evaluation.
rf_model.fit(X_train, y_train)

In [32]:
# Pair every feature column with the importance reported by the fitted forest,
# then rank the features from most to least important.
importances = rf_model.feature_importances_
feature_names = X.columns
feat_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
    # NOTE(review): this percentage column is stored but not printed below.
    .assign(**{'Importance (%)': lambda frame: frame['Importance'] * 100})
)

# Show the five highest-ranked features.
print("\nTop Features Influencing Flood Prediction:\n")
print(feat_importance_df[['Feature', 'Importance']].head(5))


Top Features Influencing Flood Prediction:

         Feature  Importance
2    River Level    0.496774
1  Rainfall (mm)    0.241156
5      Month_cos    0.092057
4      Month_sin    0.067506
3           Year    0.065660


In [28]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score


# Hold-out evaluation: predict on the test split and compare against y_test.
y_pred_rf = rf_model.predict(X_test)

accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {accuracy_rf:.4f}")

report_rf = classification_report(y_test, y_pred_rf)
print("\nRandom Forest Classification Report:\n", report_rf)

# 5-fold cross-validated accuracy, computed on the training split only so the
# test split stays untouched by this estimate.
cv_scores_rf = cross_val_score(
    rf_model, X_train, y_train, scoring='accuracy', cv=5
)
print(f"\nCross-Validation Accuracy: {cv_scores_rf.mean():.4f} ± {cv_scores_rf.std():.4f}")

Random Forest Accuracy: 0.8299

Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.89      0.86      3769
           1       0.84      0.75      0.79      2802

    accuracy                           0.83      6571
   macro avg       0.83      0.82      0.82      6571
weighted avg       0.83      0.83      0.83      6571


Cross-Validation Accuracy: 0.8317 ± 0.0029


XGBoost Model

In [56]:
import xgboost as xgb


# XGBoost hyperparameters, gathered in one place so they are easy to tune.
xgb_params = {
    "n_estimators": 100,
    "max_depth": 5,
    "learning_rate": 0.1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    # NOTE(review): 1.5 is hard-coded; the test-set support printed by the
    # evaluation cells (3769 negatives vs 2802 positives) gives a ratio of
    # about 1.35 — confirm this value is an intentional choice.
    "scale_pos_weight": 1.5,
    "random_state": 42,  # fixed seed for reproducibility
}
xgb_model = xgb.XGBClassifier(**xgb_params)

# Fit on the training split only; the test split is kept for evaluation.
xgb_model.fit(X_train, y_train)

In [57]:
# Pair every feature column with the importance reported by the fitted XGBoost
# model, then rank the features from most to least important.
# NOTE(review): this reuses (and overwrites) the `importances`, `feature_names`
# and `feat_importance_df` names assigned by the Random Forest importance cell.
importances = xgb_model.feature_importances_
feature_names = X.columns
feat_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
    # NOTE(review): this percentage column is stored but not printed below.
    .assign(**{'Importance (%)': lambda frame: frame['Importance'] * 100})
)

# Show the five highest-ranked features.
print("\nTop Features Influencing Flood Prediction:\n")
print(feat_importance_df[['Feature', 'Importance']].head(5))


Top Features Influencing Flood Prediction:

         Feature  Importance
2    River Level    0.424312
1  Rainfall (mm)    0.205232
5      Month_cos    0.121763
4      Month_sin    0.108947
3           Year    0.080527


In [59]:
# Hold-out evaluation: predict on the test split and compare against y_test.
# NOTE(review): accuracy_score, classification_report and cross_val_score are
# imported in the Random Forest evaluation cell, so that cell must run first.
y_pred_xgb = xgb_model.predict(X_test)

accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"\nXGBoost Accuracy: {accuracy_xgb:.4f}")

report_xgb = classification_report(y_test, y_pred_xgb)
print("\nXGBoost Classification Report:\n", report_xgb)

# 5-fold cross-validated accuracy, computed on the training split only so the
# test split stays untouched by this estimate.
cv_scores_xgb = cross_val_score(
    xgb_model, X_train, y_train, scoring='accuracy', cv=5
)
print(f"\nCross-Validation Accuracy: {cv_scores_xgb.mean():.4f} ± {cv_scores_xgb.std():.4f}")


XGBoost Accuracy: 0.8337

XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.85      0.85      3769
           1       0.80      0.81      0.81      2802

    accuracy                           0.83      6571
   macro avg       0.83      0.83      0.83      6571
weighted avg       0.83      0.83      0.83      6571


Cross-Validation Accuracy: 0.8363 ± 0.0035
