# Flood Prediction using Machine Learning (Colab Version)

This notebook demonstrates the process of building a machine learning model to predict floods based on weather data.
It is optimized for the Google Colab environment.

In [None]:
# Install necessary libraries
!pip install pickle-mixin
!pip install seaborn
!pip install scikit-learn
!pip install pandas
!pip install openpyxl

In [None]:
# Upload Dataset
# files.upload() opens Colab's file picker; its return value is a mapping
# keyed by the name of each uploaded file.
from google.colab import files

expected_name = 'flood dataset.xlsx'
print("Please upload 'flood dataset.xlsx'")
uploaded = files.upload()

# The loading cell opens the workbook by this exact name, so point out a
# mismatch (or a cancelled upload) right away instead of failing later.
if expected_name not in uploaded:
    received = list(uploaded) or 'no files'
    print(f"Warning: expected '{expected_name}' but received {received}.")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load Dataset
# Assumes the workbook was uploaded to the current working directory.
data_path = 'flood dataset.xlsx'
try:
    df = pd.read_excel(data_path)
except FileNotFoundError as err:
    # Re-raise instead of only printing: every later cell needs `df`, so
    # carrying on would just fail further down with a NameError.
    raise FileNotFoundError(
        f"'{data_path}' not found. Please upload it using the cell above."
    ) from err

print("Dataset loaded successfully.")
display(df.head())

In [None]:
# Print a structural summary of the loaded frame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Data Visualization
# Correlation Heatmap
# Correlation is only defined for numeric columns. Selecting them explicitly
# keeps this cell working when the sheet contains text or date columns
# (recent pandas versions raise on those instead of silently dropping them).
numeric_df = df.select_dtypes(include=np.number)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Outlier Detection using Boxplots
# One panel per column; the last column is skipped as the target.
boxplot_cols = list(df.columns[:-1])  # Exclude target

# Size the grid from the number of columns instead of assuming it fits 3x4.
n_cols = 4
n_rows = max(1, -(-len(boxplot_cols) // n_cols))  # ceiling division
fig, axes = plt.subplots(
    n_rows, n_cols, figsize=(15, 10 * n_rows / 3), squeeze=False
)
flat_axes = axes.ravel()

for ax, col in zip(flat_axes, boxplot_cols):
    # Pass the series by keyword: how seaborn treats a bare positional
    # argument has differed between releases, which changes the orientation.
    sns.boxplot(y=df[col], ax=ax)
    ax.set_title(col)

# Hide the panels left over when the columns do not fill the grid.
for ax in flat_axes[len(boxplot_cols):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

In [None]:
# Handling Missing Values
# Fill gaps in each numeric column with that column's mean, then report the
# per-column count of values that are still missing.
numeric_cols = df.select_dtypes(include=np.number).columns
column_means = df[numeric_cols].mean()
df[numeric_cols] = df[numeric_cols].fillna(column_means)

remaining_nulls = df.isnull().sum()
print("Missing values after handling:")
print(remaining_nulls)

In [None]:
# Feature Selection
# Predictor columns are listed one per line; the label is the 'flood' column.
feature_columns = [
    'Temp',
    'Humidity',
    'Cloud Cover',
    'ANNUAL',
    'Jan-Feb',
    'Mar-May',
    'Jun-Sep',
    'Oct-Dec',
    'avgjune',
    'sub',
]
X = df[feature_columns]
y = df['flood']

In [None]:
# Train Test Split, then Feature Scaling
# Split BEFORE scaling so the scaler's mean/std are learned from the training
# rows only. Fitting it on the full data would leak test-set statistics into
# training and make the held-out scores optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # fit on training rows only
X_test = scaler.transform(X_test_raw)        # reuse the training statistics

# Kept so any later cell that still expects the full scaled matrix keeps working.
X_scaled = scaler.transform(X)

In [None]:
# Model Building & Comparison
# Each candidate is fitted on the training split and scored on the test split.
# NOTE: the winner is chosen by test accuracy, so the reported best score is
# an optimistic estimate; a separate validation split or cross-validation
# would be needed for an unbiased one.
models = {
    "Logistic Regression": LogisticRegression(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    # Seeded so repeated runs build the same tree and the ranking is reproducible.
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
}

accuracies = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    accuracies[name] = acc
    print(f"{name} Accuracy: {acc:.4f}")

# max() returns the first highest-scoring name, so ties resolve in dict order
# and a model is always selected, even if every accuracy is 0.
best_model_name = max(accuracies, key=accuracies.get)
best_accuracy = accuracies[best_model_name]
best_model_obj = models[best_model_name]

print(f"\nBest Model: {best_model_name} with Accuracy: {best_accuracy:.4f}")

In [None]:
# Evaluation of Best Model
# Predict on the held-out split with the selected model, compute both
# summaries, then print them.
y_pred = best_model_obj.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("\nConfusion Matrix:\n", cm)
print("\nClassification Report:\n", report)

In [None]:
# Saving the Best Model and Scaler
# Each object is pickled to its own file, written in the order listed.
artifacts = {
    'floods.save': best_model_obj,
    'transform.save': scaler,
}
for filename, artifact in artifacts.items():
    with open(filename, 'wb') as f:
        pickle.dump(artifact, f)

print("Model saved as floods.save")
print("Scaler saved as transform.save")

In [None]:
# Download the saved model and scaler
# Request a download for each saved file, model first.
from google.colab import files

for saved_file in ('floods.save', 'transform.save'):
    files.download(saved_file)