In [15]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import json  # To save mappings

# Load the imputed dataset produced by the previous pipeline step.
file_path = "../data/imputed_flood_data.csv"
df = pd.read_csv(file_path)

# Split 'Year-Month' on '-' into separate integer 'Year' and 'Month' columns.
df[['Year', 'Month']] = df['Year-Month'].str.split('-', expand=True)
df['Year'] = df['Year'].astype(int)
df['Month'] = df['Month'].astype(int)

# The combined column is no longer needed. Reassign instead of mutating in
# place so that re-running the cell from the load step stays predictable.
df = df.drop(columns=['Year-Month'])

# Identify categorical (object-dtype) columns, explicitly excluding Year & Month.
cat_cols = [
    col
    for col in df.select_dtypes(include=['object']).columns
    if col not in ('Year', 'Month')
]

# Columns treated as ordered categories.
label_enc_cols = ['Flood Risk']
# Remaining nominal categories (e.g. districts). NOTE: despite the variable
# name, these are label-encoded below, not one-hot encoded; the name is kept
# in case later cells refer to it.
one_hot_enc_cols = [col for col in cat_cols if col not in label_enc_cols]

# Label-encode every categorical column and record its category -> code
# mapping. The ordered columns are included in `mappings` as well; previously
# only the nominal columns were recorded, so the 'Flood Risk' encoding could
# not be reversed from the saved JSON.
#
# NOTE(review): LabelEncoder assigns codes by the sorted order of the labels,
# which is not necessarily the semantic order of an ordinal column such as
# 'Flood Risk'. Check the saved mapping before relying on the numeric order.
mappings = {}
for col in label_enc_cols + one_hot_enc_cols:
    le = LabelEncoder()  # fresh encoder per column so classes never leak across columns
    df[col] = le.fit_transform(df[col])

    # le.classes_ is indexed by the assigned code, so enumerate() yields
    # exactly the (code, category) pairs that fit_transform used.
    mappings[col] = {str(category): code for code, category in enumerate(le.classes_)}

# Save processed dataset
df.to_csv("../data/encoded_flood_data.csv", index=False)

# Save mappings as JSON file
mapping_file = "../data/category_mappings.json"
with open(mapping_file, "w") as f:
    json.dump(mappings, f, indent=4)

print(f"Encoding Complete. New dataset shape: {df.shape}")
print(f"Category mappings saved to {mapping_file}")


Encoding Complete. New dataset shape: (27376, 12)
Category mappings saved to ../data/category_mappings.json


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Read the label-encoded dataset written by the encoding step.
file_path = "../data/encoded_flood_data.csv"
df = pd.read_csv(file_path)

# Represent 'Month' as a point on the unit circle (period 12) so the encoding
# is cyclical, then discard the raw 'Month' column in the same chain.
month_angle = 2 * np.pi * df['Month'] / 12
df = (
    df
    .assign(Month_sin=np.sin(month_angle), Month_cos=np.cos(month_angle))
    .drop(columns=['Month'])
)

# Continuous numerical features that are standardized below.
features_to_scale = [
    'Rainfall (mm)',
    'River Level',
    'Area affected in (m.ha)',
    'Population affected in (million)',
    'Damage to Crops',
    'Damage to Houses',
]

# Fit a StandardScaler on these columns and overwrite them with the result.
scaler = StandardScaler()
df[features_to_scale] = scaler.fit_transform(df[features_to_scale])

# Persist the scaled dataset for the next step.
df.to_csv("../data/scaled_flood_data.csv", index=False)

print(f"Feature Scaling Complete. New dataset shape: {df.shape}")


Feature Scaling Complete. New dataset shape: (27376, 13)


In [6]:
import pandas as pd
import numpy as np

# Read the scaled dataset written by the feature-scaling step.
file_path = "../data/scaled_flood_data.csv"
df = pd.read_csv(file_path)
n_rows = len(df)

# Seed the global NumPy RNG so the synthetic adjustments are reproducible.
# The draw order (uniform first, then normal) determines the generated values.
np.random.seed(42)
rain_weight = np.random.uniform(0.05, 0.15, n_rows)  # per-row rainfall coefficient
level_noise = np.random.normal(1.5, 0.5, n_rows)     # per-row additive noise

# Adjust 'River Level': add a randomly weighted share of rainfall, then the
# noise term, then floor the result at 0.
# NOTE(review): the input file holds standard-scaled values for these columns,
# so the 0 floor is applied in standardized units - confirm this is intended.
df['River Level'] = (
    df['River Level'] + rain_weight * df['Rainfall (mm)'] + level_noise
).clip(lower=0)

# Override the flood label at the extremes. Order matters: the low-rainfall
# rule is applied last, so it wins for rows that satisfy both conditions.
river_cutoff = df['River Level'].quantile(0.8)
rain_cutoff = df['Rainfall (mm)'].quantile(0.2)

high_river = df['River Level'] > river_cutoff
low_rain = df['Rainfall (mm)'] < rain_cutoff

df.loc[high_river, 'Flood Occurred'] = 1  # top 20% river level -> flood
df.loc[low_rain, 'Flood Occurred'] = 0    # bottom 20% rainfall -> no flood

# Persist the modified dataset.
df.to_csv("../data/balanced_flood_data.csv", index=False)

print("✅ Dataset modified: Increased River Level importance (~10%) and balanced flood dependence.")


✅ Dataset modified: Increased River Level importance (~10%) and balanced flood dependence.
