# AI-Based Water Quality Monitoring System - Model Training

This notebook trains a Random Forest Classifier to predict water potability from two features: pH (the `ph` column) and total dissolved solids, TDS (the `Solids` column).

In [None]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

## 1. Load Data
Make sure `water_potability.csv` is in the `Dataset` directory.

In [None]:
# Load the water-potability CSV into `df`; every later cell depends on it.
try:
    df = pd.read_csv('Dataset/water_potability.csv')
except FileNotFoundError:
    print("Error: 'Dataset/water_potability.csv' not found. Please download it from Kaggle and place it in the Dataset directory.")
    # Re-raise so execution stops here. Otherwise the following cells would
    # fail with a confusing NameError, or silently reuse a stale `df` left
    # over from an earlier run of the same kernel.
    raise
else:
    # Report success (and preview the first rows) only once the read worked.
    print("Dataset loaded successfully.")
    print(df.head())

## 2. Data Preprocessing
We will only use the `ph` and `Solids` columns as features, as requested. Missing values in these two columns are filled with the column mean.

In [None]:
# Select relevant features
features = ['ph', 'Solids']
target = 'Potability'

# Check that every required column exists. Fail loudly if one is missing:
# the later cells need `df_selected`, so merely printing a message here would
# only postpone the failure to a less helpful NameError.
missing_columns = [col for col in features + [target] if col not in df.columns]
if missing_columns:
    raise KeyError(f"Required columns not found in the dataset: {missing_columns}")

# Work on a copy so the imputation below never modifies the raw `df`.
df_selected = df[features + [target]].copy()

# Handle missing values: Fill with mean
print("\nMissing values before imputation:")
print(df_selected.isnull().sum())

# Fill every feature column with its own mean in one step; the list of
# columns comes from `features`, so it is not repeated by hand.
# The target column is intentionally left untouched.
# NOTE: the means are computed over all rows, i.e. before the train/test
# split done later in this notebook, so test-set statistics leak into the
# imputed training values. Impute after splitting if that matters.
df_selected[features] = df_selected[features].fillna(df_selected[features].mean())

print("\nMissing values after imputation:")
print(df_selected.isnull().sum())

## 3. Train-Test Split

In [None]:
# Separate the predictor columns (X) from the label column (y).
X, y = df_selected[features], df_selected[target]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Report how many rows ended up in each partition.
n_train, n_test = len(X_train), len(X_test)
print(f"Training samples: {n_train}")
print(f"Testing samples: {n_test}")

## 4. Model Training (Random Forest)

In [None]:
# Build a 100-tree random forest with a fixed seed and fit it on the training
# partition; `fit` returns the estimator itself, so it can be chained.
model = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
).fit(X_train, y_train)
print("Model training completed.")

## 5. Model Evaluation

In [None]:
# Predict labels for the held-out test rows.
y_pred = model.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Confusion Matrix
print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Plot the confusion matrix as a heatmap. matplotlib and seaborn are imported
# unconditionally at the top of the notebook, so this guard is for plotting
# failures at runtime rather than for missing packages. The text version of
# the matrix has already been printed above either way.
try:
    plt.figure(figsize=(6, 4))
    # NOTE(review): the tick labels assume a 2x2 matrix whose sorted classes
    # are 0 = Unsafe, 1 = Safe - confirm against the dataset's encoding.
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Unsafe', 'Safe'], yticklabels=['Unsafe', 'Safe'])
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.show()
except Exception as e:
    print("Could not plot confusion matrix (matplotlib/seaborn might be missing or error).")
    # Show the actual cause instead of discarding it, so the failure can be diagnosed.
    print(f"Reason: {type(e).__name__}: {e}")

## 6. Save Model

In [None]:
# Serialize the trained classifier to disk in binary mode; the context manager
# closes the file even if pickling fails.
with open('water_model.pkl', mode='wb') as model_file:
    pickle.dump(model, model_file)

print("Model saved as 'water_model.pkl'.")