In [1]:
import numpy as np
import pandas as pd
import joblib
from sklearn.preprocessing import LabelEncoder, StandardScaler
import os

In [2]:
# Read the raw CSV and, in the same chain, remove the "Year" column.
# errors="ignore" turns the drop into a no-op when that column is absent.
df = pd.read_csv("../data/11.csv").drop(columns=["Year"], errors="ignore")

# Leave the preview as the final expression so the notebook renders it.
df.head()

Unnamed: 0,Year-Month,District,Rainfall (mm),River,River Level,Flood Risk,Flood Occurred,Area affected in (m.ha),Population affected in (million),Damage to Crops,Damage to Houses
0,1981-06,0,29.0,3,67.2175,0,No,-0.221896,-0.218068,-0.221629,-0.230728
1,1989-06,0,12.6,3,67.0945,0,No,-0.226377,-0.226563,-0.221941,-0.202502
2,1989-07,0,171.400001,3,68.2636,0,No,-0.223359,-0.223544,-0.218925,-0.202502
3,1989-08,0,114.199998,3,67.61575,0,No,-0.224452,-0.224637,-0.220017,-0.202502
4,1989-09,0,450.2,3,71.35,2,Yes,-0.176487,-0.176668,-0.172093,-0.184149


In [3]:
# Interaction feature: element-wise product of rainfall and river level.
df["Rainfall × River Level"] = df["Rainfall (mm)"].mul(df["River Level"])

In [4]:
# One fitted LabelEncoder per categorical feature, keyed by column name.
label_encoders = {}

for col in ("River", "District"):
    # Register the encoder first, then fit it and overwrite the column with
    # the integer codes it produces.
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])

# Persist the whole dictionary (relative path, i.e. the working directory).
# Kept as the final expression so the cell still echoes the written filename.
joblib.dump(label_encoders, "label_encoders.pkl")

['label_encoders.pkl']

In [5]:
def _fill_with_median(values):
    """Return `values` with missing entries replaced by the median of `values`."""
    return values.fillna(values.median())


# Classification target: missing entries take the most frequent class
# (first mode), then the column is cast to int.
flood_risk = df["Flood Risk"]
y_flood_risk = flood_risk.fillna(flood_risk.mode()[0]).astype(int)

# Regression targets: each one is median-imputed independently.
y_area_affected = _fill_with_median(df["Area affected in (m.ha)"])
y_population_affected = _fill_with_median(df["Population affected in (million)"])
y_damage_crops = _fill_with_median(df["Damage to Crops"])
y_damage_houses = _fill_with_median(df["Damage to Houses"])

In [6]:
def _as_column(target):
    """Return the target's values as an (n, 1) array; fit_transform needs 2-D input."""
    return target.values.reshape(-1, 1)


# Each regression target gets its own scaler, fitted on that target alone.
scaler_area = StandardScaler()
y_area_affected_scaled = scaler_area.fit_transform(_as_column(y_area_affected))

scaler_population = StandardScaler()
y_population_affected_scaled = scaler_population.fit_transform(_as_column(y_population_affected))

scaler_crops = StandardScaler()
y_damage_crops_scaled = scaler_crops.fit_transform(_as_column(y_damage_crops))

scaler_houses = StandardScaler()
y_damage_houses_scaled = scaler_houses.fit_transform(_as_column(y_damage_houses))

# Persist every fitted scaler (relative paths, i.e. the working directory).
# The houses scaler is dumped last so the cell echoes the same filename.
joblib.dump(scaler_area, "scaler_area.pkl")
joblib.dump(scaler_population, "scaler_population.pkl")
joblib.dump(scaler_crops, "scaler_crops.pkl")
joblib.dump(scaler_houses, "scaler_houses.pkl")

['scaler_houses.pkl']

In [7]:
# Model inputs, in the column order the saved feature frame will carry.
feature_columns = [
    "River",
    "River Level",
    "Rainfall (mm)",
    "District",
    "Rainfall × River Level",
]
X = df[feature_columns]

# Bundle features and targets into a single tuple; whoever loads the pickle
# must unpack it in exactly this order.
processed_bundle = (
    X,
    y_flood_risk,
    y_area_affected_scaled,
    y_population_affected_scaled,
    y_damage_crops_scaled,
    y_damage_houses_scaled,
)
joblib.dump(processed_bundle, "processed_data.pkl")

print("Preprocessing Complete. Data Saved.")

Preprocessing Complete. Data Saved.


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import json

file_path = "../data/imputed_flood_data.csv"
df = pd.read_csv(file_path)

# Split "Year-Month" on "-" into two integer columns, then drop the combined one.
df[['Year', 'Month']] = df['Year-Month'].str.split('-', expand=True)
df['Year'] = df['Year'].astype(int)
df['Month'] = df['Month'].astype(int)
df = df.drop(columns=['Year-Month'])

# Categorical columns = every remaining object-dtype column.
cat_cols = df.select_dtypes(include=['object']).columns.tolist()
cat_cols = [col for col in cat_cols if col not in ['Year', 'Month']]

label_enc_cols = ['Flood Risk']
# NOTE: despite the name, these columns are label-encoded below (integer codes),
# not one-hot encoded.
one_hot_enc_cols = [col for col in cat_cols if col not in label_enc_cols]

# Encode every categorical column with its own encoder and record the
# class -> code mapping for all of them. The 'Flood Risk' mapping used to be
# discarded, which left its encoded values impossible to decode.
mappings = {}
for col in label_enc_cols + one_hot_enc_cols:
    le = LabelEncoder()  # fresh encoder per column; never reuse a fitted one
    df[col] = le.fit_transform(df[col])
    # The integer code assigned to classes_[i] is i.
    mappings[col] = {str(label): code for code, label in enumerate(le.classes_)}


df.to_csv("../data/encoded_flood_data.csv", index=False)

mapping_file = "../data/category_mappings.json"
with open(mapping_file, "w") as f:
    json.dump(mappings, f, indent=4)

print(f"Encoding Complete. New dataset shape: {df.shape}")
print(f"Category mappings saved to {mapping_file}")


Encoding Complete. New dataset shape: (27376, 12)
Category mappings saved to ../data/category_mappings.json


In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler

file_path = "../data/encoded_flood_data.csv"
df = pd.read_csv(file_path)

# Cyclical encoding of the month: sine/cosine of the month's angle on a
# 12-step circle.
month_angle = 2 * np.pi * df['Month'] / 12
df['Month_sin'] = np.sin(month_angle)
df['Month_cos'] = np.cos(month_angle)

# The raw month is replaced by the two cyclical columns.
df = df.drop(columns=['Month'])


features_to_scale = ['Rainfall (mm)', 'River Level', 'Area affected in (m.ha)',
                     'Population affected in (million)', 'Damage to Crops', 'Damage to Houses']

# Standardise the listed columns with a single scaler fitted on the full frame.
scaler = StandardScaler()
df[features_to_scale] = scaler.fit_transform(df[features_to_scale])


df.to_csv("../data/scaled_flood_data.csv", index=False)

# Persist the fitted scaler. Without it the scaled columns written above can
# never be mapped back to their original units (scaler.inverse_transform), and
# new data cannot be scaled consistently.
scaler_file = "../data/feature_scaler.pkl"
joblib.dump(scaler, scaler_file)

print(f"Feature Scaling Complete. New dataset shape: {df.shape}")
print(f"Scaler saved to {scaler_file}")

Feature Scaling Complete. New dataset shape: (27376, 13)
