In [None]:
pip install pandas numpy scikit-learn joblib


In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_squared_error
import joblib

# Load the dataset
df = pd.read_csv("../data/cleaned_flood_data.csv")  # Ensure the correct file path

# Encode categorical variables (District, River)
label_encoders = {}
for col in ["District", "River"]:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Save encoders for later use

# Define input features (Include Month_sin and Month_cos)
X = df[["River", "River Level", "Rainfall (mm)", "District", "Month_sin", "Month_cos"]]

# Classification targets
y_flood_risk = df["Flood Risk"].fillna(df["Flood Risk"].mode()[0]).astype(int)
y_flood_occurred = df["Flood Occurred"].fillna(df["Flood Occurred"].mode()[0]).astype(int)

# Regression targets
y_area_affected = df["Area affected in (m.ha)"].fillna(df["Area affected in (m.ha)"].median())
y_population_affected = df["Population affected in (million)"].fillna(df["Population affected in (million)"].median())
y_damage_crops = df["Damage to Crops"].fillna(df["Damage to Crops"].median())
y_damage_houses = df["Damage to Houses"].fillna(df["Damage to Houses"].median())

# Split data into training and testing sets
X_train, X_test, y_train_risk, y_test_risk = train_test_split(X, y_flood_risk, test_size=0.2, random_state=42)
_, _, y_train_occurred, y_test_occurred = train_test_split(X, y_flood_occurred, test_size=0.2, random_state=42)
_, _, y_train_area, y_test_area = train_test_split(X, y_area_affected, test_size=0.2, random_state=42)
_, _, y_train_pop, y_test_pop = train_test_split(X, y_population_affected, test_size=0.2, random_state=42)
_, _, y_train_crops, y_test_crops = train_test_split(X, y_damage_crops, test_size=0.2, random_state=42)
_, _, y_train_houses, y_test_houses = train_test_split(X, y_damage_houses, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train classification models
clf_risk = RandomForestClassifier(n_estimators=100, random_state=42)
clf_risk.fit(X_train_scaled, y_train_risk)

clf_occurred = RandomForestClassifier(n_estimators=100, random_state=42)
clf_occurred.fit(X_train_scaled, y_train_occurred)

# Train regression models
reg_area = RandomForestRegressor(n_estimators=100, random_state=42)
reg_area.fit(X_train_scaled, y_train_area)

reg_pop = RandomForestRegressor(n_estimators=100, random_state=42)
reg_pop.fit(X_train_scaled, y_train_pop)

reg_crops = RandomForestRegressor(n_estimators=100, random_state=42)
reg_crops.fit(X_train_scaled, y_train_crops)

reg_houses = RandomForestRegressor(n_estimators=100, random_state=42)
reg_houses.fit(X_train_scaled, y_train_houses)

# Predictions
y_pred_risk = clf_risk.predict(X_test_scaled)
y_pred_occurred = clf_occurred.predict(X_test_scaled)
y_pred_area = reg_area.predict(X_test_scaled)
y_pred_pop = reg_pop.predict(X_test_scaled)
y_pred_crops = reg_crops.predict(X_test_scaled)
y_pred_houses = reg_houses.predict(X_test_scaled)

# Model evaluation
print("Flood Risk Accuracy:", accuracy_score(y_test_risk, y_pred_risk))
print("Flood Occurred Accuracy:", accuracy_score(y_test_occurred, y_pred_occurred))
print("Area Affected RMSE:", np.sqrt(mean_squared_error(y_test_area, y_pred_area)))
print("Population Affected RMSE:", np.sqrt(mean_squared_error(y_test_pop, y_pred_pop)))
print("Damage to Crops RMSE:", np.sqrt(mean_squared_error(y_test_crops, y_pred_crops)))
print("Damage to Houses RMSE:", np.sqrt(mean_squared_error(y_test_houses, y_pred_houses)))

# Save models
joblib.dump(clf_risk, "flood_risk_model.pkl")
joblib.dump(reg_area, "area_affected_model.pkl")
joblib.dump(reg_pop, "population_affected_model.pkl")
joblib.dump(reg_crops, "damage_crops_model.pkl")
joblib.dump(reg_houses, "damage_houses_model.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(label_encoders, "label_encoders.pkl")  

print("Models saved successfully!")


Flood Risk Accuracy: 0.7716195816260514
Flood Occurred Accuracy: 0.8000862626698296
Area Affected RMSE: 0.8828201711439362
Population Affected RMSE: 1.0437836455695564
Damage to Crops RMSE: 1.020561167224183
Damage to Houses RMSE: 1.0217082227446002
Models saved successfully!


In [20]:
print("Unique classes in y_train_risk:", np.unique(y_train_risk))


Unique classes in y_train_risk: [1]


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import accuracy_score, mean_squared_error
import joblib

# Load dataset
df = pd.read_csv("../data/scaled_flood_data.csv")

# Drop 'Year' column if present
df.drop(columns=["Year"], inplace=True, errors="ignore")

# Encode categorical variables
label_encoder_district = LabelEncoder()
label_encoder_river = LabelEncoder()

df["District"] = label_encoder_district.fit_transform(df["District"])
df["River"] = label_encoder_river.fit_transform(df["River"])

# Feature Engineering (Interaction Terms)
df["Rainfall × River Level"] = df["Rainfall (mm)"] * df["River Level"]
df["Rainfall × Month_sin"] = df["Rainfall (mm)"] * df["Month_sin"]
df["River Level × Month_cos"] = df["River Level"] * df["Month_cos"]

# Define feature columns
X = df[[
    "River", "River Level", "Rainfall (mm)", "District", "Month_sin", "Month_cos",
    "Rainfall × River Level", "Rainfall × Month_sin", "River Level × Month_cos"
]]

# Define target variables
y_flood_risk = df["Flood Risk"].fillna(df["Flood Risk"].mode()[0]).astype(int)
y_area_affected = df["Area affected in (m.ha)"].fillna(df["Area affected in (m.ha)"].median())
y_population_affected = df["Population affected in (million)"].fillna(df["Population affected in (million)"].median())
y_damage_crops = df["Damage to Crops"].fillna(df["Damage to Crops"].median())
y_damage_houses = df["Damage to Houses"].fillna(df["Damage to Houses"].median())

# Split data (consistent splitting)
X_train, X_test, y_train_risk, y_test_risk = train_test_split(X, y_flood_risk, test_size=0.2, random_state=42)
_, _, y_train_area, y_test_area = train_test_split(X, y_area_affected, test_size=0.2, random_state=42)
_, _, y_train_pop, y_test_pop = train_test_split(X, y_population_affected, test_size=0.2, random_state=42)
_, _, y_train_crops, y_test_crops = train_test_split(X, y_damage_crops, test_size=0.2, random_state=42)
_, _, y_train_houses, y_test_houses = train_test_split(X, y_damage_houses, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train classification model (Flood Risk)
clf_risk = XGBClassifier(n_estimators=200, learning_rate=0.07, max_depth=6, random_state=42)
clf_risk.fit(X_train_scaled, y_train_risk)

# Train regression models
reg_area = XGBRegressor(n_estimators=200, learning_rate=0.07, max_depth=6, random_state=42)
reg_area.fit(X_train_scaled, y_train_area)

reg_pop = XGBRegressor(n_estimators=200, learning_rate=0.07, max_depth=6, random_state=42)
reg_pop.fit(X_train_scaled, y_train_pop)

reg_crops = XGBRegressor(n_estimators=200, learning_rate=0.07, max_depth=6, random_state=42)
reg_crops.fit(X_train_scaled, y_train_crops)

reg_houses = XGBRegressor(n_estimators=200, learning_rate=0.07, max_depth=6, random_state=42)
reg_houses.fit(X_train_scaled, y_train_houses)

# Model evaluation
print("Flood Risk Accuracy:", accuracy_score(y_test_risk, clf_risk.predict(X_test_scaled)))
print("Area Affected RMSE:", np.sqrt(mean_squared_error(y_test_area, reg_area.predict(X_test_scaled))))
print("Population Affected RMSE:", np.sqrt(mean_squared_error(y_test_pop, reg_pop.predict(X_test_scaled))))
print("Damage to Crops RMSE:", np.sqrt(mean_squared_error(y_test_crops, reg_crops.predict(X_test_scaled))))
print("Damage to Houses RMSE:", np.sqrt(mean_squared_error(y_test_houses, reg_houses.predict(X_test_scaled))))

# Save models and encoders
joblib.dump(clf_risk, "xgb_flood_risk_model.pkl")
joblib.dump(reg_area, "xgb_area_affected_model.pkl")
joblib.dump(reg_pop, "xgb_population_affected_model.pkl")
joblib.dump(reg_crops, "xgb_damage_crops_model.pkl")
joblib.dump(reg_houses, "xgb_damage_houses_model.pkl")
joblib.dump(scaler, "xgb_scaler.pkl")
joblib.dump(label_encoder_district, "label_encoder_district.pkl")
joblib.dump(label_encoder_river, "label_encoder_river.pkl")

print("✅ Models and encoders saved successfully!")


Flood Risk Accuracy: 0.7948561862730178
Area Affected RMSE: 0.9314020927503911
Population Affected RMSE: 0.9033535327320438
Damage to Crops RMSE: 0.8852296741132138
Damage to Houses RMSE: 0.9328805196294667
✅ Models and encoders saved successfully!


In [48]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import accuracy_score, mean_squared_error
import joblib

# Load dataset
df = pd.read_csv("../data/cleaned_flood_data.csv")

# Define features and targets
features = ['District', 'Rainfall (mm)', 'River', 'River Level',  'Month_sin', 'Month_cos']
targets = {
    'classification': ['Flood Risk'],
    'regression': ['Area affected in (m.ha)', 'Population affected in (million)', 
                   'Damage to Crops', 'Damage to Houses']
}

# Prepare features
X = df[features]

# Save feature names for later use
feature_names = X.columns.tolist()

# Handling missing values
imputer = SimpleImputer(strategy="median")  
X = pd.DataFrame(imputer.fit_transform(X), columns=feature_names)

# Standardizing features
scaler_X = StandardScaler()
X_scaled = pd.DataFrame(scaler_X.fit_transform(X), columns=feature_names)

# Prepare targets
y_flood_risk = df["Flood Risk"].fillna(df["Flood Risk"].mode()[0]).astype(int)

y_area = df["Area affected in (m.ha)"].fillna(df["Area affected in (m.ha)"].median())
y_pop = df["Population affected in (million)"].fillna(df["Population affected in (million)"].median())
y_crops = df["Damage to Crops"].fillna(df["Damage to Crops"].median())
y_houses = df["Damage to Houses"].fillna(df["Damage to Houses"].median())

# Standardize regression targets
scaler_area = StandardScaler()
scaler_pop = StandardScaler()
scaler_crops = StandardScaler()
scaler_houses = StandardScaler()

y_area_scaled = scaler_area.fit_transform(y_area.values.reshape(-1, 1))
y_pop_scaled = scaler_pop.fit_transform(y_pop.values.reshape(-1, 1))
y_crops_scaled = scaler_crops.fit_transform(y_crops.values.reshape(-1, 1))
y_houses_scaled = scaler_houses.fit_transform(y_houses.values.reshape(-1, 1))

# Time-based splitting
train_size = int(0.8 * len(df))
X_train, X_test = X_scaled.iloc[:train_size], X_scaled.iloc[train_size:]

# Split targets
y_train_flood_risk, y_test_flood_risk = y_flood_risk.iloc[:train_size], y_flood_risk.iloc[train_size:]
y_train_area, y_test_area = y_area.iloc[:train_size], y_area.iloc[train_size:]
y_train_pop, y_test_pop = y_pop.iloc[:train_size], y_pop.iloc[train_size:]
y_train_crops, y_test_crops = y_crops.iloc[:train_size], y_crops.iloc[train_size:]
y_train_houses, y_test_houses = y_houses.iloc[:train_size], y_houses.iloc[train_size:]

# Train models
models = {
    'flood_risk': XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=5),
    'area': XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5),
    'population': XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5),
    'crops': XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5),
    'houses': XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5)
}

models['flood_risk'].fit(X_train, y_train_flood_risk)
models['area'].fit(X_train, y_train_area)
models['population'].fit(X_train, y_train_pop)
models['crops'].fit(X_train, y_train_crops)
models['houses'].fit(X_train, y_train_houses)

# Save models
for name, model in models.items():
    joblib.dump(model, f"xgb_{name}.pkl")

# Save scalers and imputer
joblib.dump(scaler_X, "scaler_X.pkl")
joblib.dump(imputer, "imputer.pkl")
joblib.dump(scaler_area, "scaler_area.pkl")
joblib.dump(scaler_pop, "scaler_pop.pkl")
joblib.dump(scaler_crops, "scaler_crops.pkl")
joblib.dump(scaler_houses, "scaler_houses.pkl")

# Save feature names
joblib.dump(feature_names, "feature_names.pkl")

['feature_names.pkl']

In [53]:
import pandas as pd
import joblib
import numpy as np

# Load models
models = {
    'flood_risk': joblib.load("xgb_flood_risk.pkl"),
    
    'area': joblib.load("xgb_area.pkl"),
    'population': joblib.load("xgb_population.pkl"),
    'crops': joblib.load("xgb_crops.pkl"),
    'houses': joblib.load("xgb_houses.pkl")
}

# Load scalers and imputer
scaler_X = joblib.load("scaler_X.pkl")
imputer = joblib.load("imputer.pkl")
scaler_area = joblib.load("scaler_area.pkl")
scaler_pop = joblib.load("scaler_pop.pkl")
scaler_crops = joblib.load("scaler_crops.pkl")
scaler_houses = joblib.load("scaler_houses.pkl")

# Load feature names
feature_names = joblib.load("feature_names.pkl")

# Define new data sample (example values)
X_new = pd.DataFrame([{
    'District': 34,
    'Rainfall (mm)': 5.5,
    'River': 4,
    'River Level': 3.5,
    'Month_sin': 0.5,  
    'Month_cos': 0.866  
}])

# Ensure all features are present and in correct order
for feature in feature_names:
    if feature not in X_new.columns:
        X_new[feature] = 0
X_new = X_new[feature_names]

# Preprocess new data
X_new_imputed = pd.DataFrame(imputer.transform(X_new), columns=feature_names)
X_new_scaled = pd.DataFrame(scaler_X.transform(X_new_imputed), columns=feature_names)

# Make predictions
predictions = {
    'Flood Risk': models['flood_risk'].predict(X_new_scaled)[0],
    'Area affected': scaler_area.inverse_transform(
        models['area'].predict(X_new_scaled).reshape(-1, 1))[0][0],
    'Population affected': scaler_pop.inverse_transform(
        models['population'].predict(X_new_scaled).reshape(-1, 1))[0][0],
    'Damage to Crops': scaler_crops.inverse_transform(
        models['crops'].predict(X_new_scaled).reshape(-1, 1))[0][0],
    'Damage to Houses': scaler_houses.inverse_transform(
        models['houses'].predict(X_new_scaled).reshape(-1, 1))[0][0]
}

# Display results
print("Prediction Results:")
for target, value in predictions.items():
    if target in ['Flood Risk']:
        print(f"{target}: {value} (0=No, 1=Yes, 2=Severe)")
    else:
        print(f"{target}: {value:.2f}")

Prediction Results:
Flood Risk: 1 (0=No, 1=Yes, 2=Severe)
Area affected: 0.51
Population affected: -0.04
Damage to Crops: -0.00
Damage to Houses: -0.21
