In [13]:
import pandas as pd

def preprocess_flood_data(file_path, output_path):
    # Load the dataset
    df = pd.read_csv(file_path)
    
    # Identify input features and output labels
    input_features = ['Year-Month', 'District', 'Rainfall (mm)', 'River', 'River Level']
    output_labels = ['Flood Risk', 'Flood Occurred', 'Area affected in (m.ha)', 'Population affected in (million)', 'Damage to Crops', 'Damage to Houses']
    
    # Keep only the necessary columns
    df = df[input_features + output_labels]
    
    # Handle missing values
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col].fillna("Unknown", inplace=True)  # Fill categorical values with 'Unknown'
        else:
            df[col].fillna(0, inplace=True)  # Fill numerical values with 0
    
    # Ensure consistent formatting
    for col in df.select_dtypes(include=['number']).columns:
        df[col] = df[col].astype(float)  # Convert numerical columns to float
    
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = df[col].astype(str)  # Convert categorical columns to string
    
    # Save the processed dataset
    df.to_csv(output_path, index=False)
    
    return df

# Example usage
input_file = "../data/modified_flood_data.csv"
output_file = "../data/processed_flood_data.csv"
df_processed = preprocess_flood_data(input_file, output_file)

# Display first few rows
print(df_processed.head())


  Year-Month District  Rainfall (mm)    River  River Level Flood Risk  \
0    1981-06   Araria      29.000000  Unknown          0.0       Safe   
1    1989-06   Araria      12.600000  Unknown          0.0       Safe   
2    1989-07   Araria     171.400001  Unknown          0.0       Safe   
3    1989-08   Araria     114.199998  Unknown          0.0       Safe   
4    1989-09   Araria     450.200000  Unknown          0.0     DANGER   

  Flood Occurred  Area affected in (m.ha)  Population affected in (million)  \
0             No                      1.3                               7.0   
1             No                      0.0                               0.0   
2             No                      0.0                               0.0   
3             No                      0.0                               0.0   
4            Yes                      0.0                               0.0   

   Damage to Crops Damage to Houses  
0              0.6             72.1  
1         

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna("Unknown", inplace=True)  # Fill categorical values with 'Unknown'
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(0, inplace=True)  # Fill numerical values with 0


In [3]:
import pandas as pd

# Load dataset
file_path = "../data/processed_flood_data.csv"
df = pd.read_csv(file_path)

# Convert "Unknown" to NaN for processing
df.replace("Unknown", pd.NA, inplace=True)

# Separate numerical and categorical columns
num_cols = df.select_dtypes(include=['number']).columns
cat_cols = df.select_dtypes(include=['object']).columns

# Impute numerical columns with median
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Impute categorical columns with mode (most frequent value)
for col in cat_cols:
    df[col] = df[col].fillna(df[col].mode()[0])  # mode()[0] gets the most frequent value

# Save cleaned dataset
df.to_csv("../data/imputed_flood_data.csv", index=False)

# Display confirmation
print(f"Imputation complete. New dataset size: {df.shape}")


Imputation complete. New dataset size: (27376, 11)
