In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the final processed (scaled) flood dataset from disk
file_path = "../data/scaled_flood_data.csv"
df = pd.read_csv(file_path)

# Columns excluded from the feature matrix: the target itself plus the
# "effect" variables
features_to_remove = [
    'Flood Occurred',
    'Damage to Crops',
    'Damage to Houses',
    'Area affected in (m.ha)',
    'Population affected in (million)',
]

# Target first (0 = No, 1 = Yes), then the feature matrix made of every
# column that is not in the exclusion list
y = df['Flood Occurred']
X = df.drop(features_to_remove, axis=1)

# Hold out 20% of the rows for testing; the split is stratified on the
# target and seeded so it is repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

# Report the shape of each partition
print(f"Training Set: {X_train.shape}, Testing Set: {X_test.shape}")

Training Set: (26280, 8), Testing Set: (6571, 8)


In [26]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score

# Read the scaled flood dataset
file_path = "../data/scaled_flood_data.csv"
df = pd.read_csv(file_path)

# Columns kept out of the feature matrix: the target, the post-flood effect
# variables, and the highly correlated 'Flood Risk' (dropped to prevent leakage)
features_to_remove = [
    'Flood Occurred',
    'Damage to Crops',
    'Damage to Houses',
    'Area affected in (m.ha)',
    'Population affected in (million)',
    'Flood Risk',
]

# Target vector and feature matrix
y = df['Flood Occurred']
X = df.drop(features_to_remove, axis=1)

# Stratified, seeded hold-out split (80% train, 20% test)
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

# Random Forest hyperparameters, deliberately constrained to reduce overfitting
rf_params = {
    "n_estimators": 50,          # reduced number of trees
    "max_depth": 5,              # cap on tree depth
    "min_samples_split": 10,     # a node needs at least 10 samples to be split
    "min_samples_leaf": 5,       # every leaf keeps at least 5 samples
    "class_weight": "balanced",  # handle class imbalance
    "random_state": 42,
}

# Build the classifier and fit it on the training partition
rf_model = RandomForestClassifier(**rf_params).fit(X_train, y_train)

# Score the fitted model on the held-out test partition
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# 5-fold cross-validated accuracy, computed on the training partition only
cv_scores = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='accuracy')

# Print results
print(f"Updated Random Forest Accuracy: {accuracy:.4f}")
print("\nUpdated Classification Report:\n", report)
print(f"\nUpdated Cross-Validation Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

Updated Random Forest Accuracy: 0.8891

Updated Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.85      0.92      4940
           1       0.69      1.00      0.82      1631

    accuracy                           0.89      6571
   macro avg       0.85      0.93      0.87      6571
weighted avg       0.92      0.89      0.89      6571


Updated Cross-Validation Accuracy: 0.9012 ± 0.0027
