In [None]:
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, r2_score
from datetime import datetime

## 📄 Step 1: Load and Prepare Dataset

We load a CSV file containing water quality data from 2000 to 2021.
We convert the date column to a proper datetime format, sort the values, and generate useful time-based features.

In [None]:
# Load and prepare dataset:
# read the semicolon-separated CSV, parse the day.month.year date strings,
# then order the rows by station id and sampling date.
df = (
    pd.read_csv('PB_All_2000_2021.csv', sep=';')
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format='%d.%m.%Y'))
    .sort_values(by=['id', 'date'])
)

In [None]:
# Feature engineering: derive calendar features from the parsed sampling date.
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['dayofyear'] = df['date'].dt.dayofyear
df['weekofyear'] = df['date'].dt.isocalendar().week

# Forward-fill gaps *within each station only*. The frame is sorted by
# ['id', 'date'], so a plain df.ffill() would copy the last readings of one
# station into the leading gaps of the next station. Leading gaps of a station
# are intentionally left missing here.
# dropna=False keeps rows whose id is missing in their own group instead of
# discarding their values.
fill_cols = [name for name in df.columns if name != 'id']
df[fill_cols] = df.groupby('id', dropna=False)[fill_cols].ffill()

## 🧪 Step 2: Select Features and Target Variables

We define the independent variables (`features`) and the outputs we want to predict (`targets`).

In [None]:
# Model inputs: station id, three measured parameters, and the calendar features.
features = [
    'id', 'NH4', 'BSK5', 'Suspended',
    'year', 'month', 'dayofyear', 'weekofyear',
]
# Model outputs: the parameters predicted jointly.
targets = ['O2', 'NO3', 'NO2', 'SO4', 'PO4', 'CL']

# Keep only rows where every input and every output is present.
required_columns = [*features, *targets]
df = df.dropna(subset=required_columns)

In [None]:
# Split the cleaned frame into the input matrix and the multi-column target.
X, y = df[features], df[targets]

## 🧠 Step 3: Model Training

We use a Random Forest Regressor within a MultiOutputRegressor to predict multiple water quality parameters.

In [None]:
# Model training
# Streamlit re-executes the whole script on every widget interaction, so the
# fit is wrapped in a cached function: the forests are trained once per distinct
# (X, y) input and reused on later reruns.
# show_spinner=False keeps the cached call from rendering anything before
# st.set_page_config runs further down.
@st.cache_resource(show_spinner=False)
def train_model(X, y):
    """Fit the multi-output random forest and score it on a held-out split.

    Parameters
    ----------
    X : DataFrame of input features.
    y : DataFrame of target columns.

    Returns
    -------
    tuple
        (model, X_train, X_test, y_train, y_test, y_pred) where ``y_pred`` is
        the model's prediction for ``X_test``. The returned objects are shared
        across reruns and must not be mutated by callers.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
    model.fit(X_train, y_train)
    return model, X_train, X_test, y_train, y_test, model.predict(X_test)


model, X_train, X_test, y_train, y_test, y_pred = train_model(X, y)

In [None]:
# Streamlit UI: configure the page (wide layout, browser tab title), then
# render the dashboard heading.
st.set_page_config(
    layout="wide",
    page_title="Water Quality Prediction",
)
st.title("🌊 Water Quality Prediction Dashboard")

In [None]:
# Dataset Preview: show the first 50 rows of the prepared frame.
st.subheader("📋 Dataset Preview")
preview_rows = df.iloc[:50]
st.dataframe(preview_rows)

## 📈 Step 4: Model Evaluation

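We compute the Mean Squared Error (MSE) and R² score on the held-out 20% test split. The overall figures are uniform averages across all six targets, and the R² of each individual target is listed beneath them.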

In [None]:
# Evaluation Metrics: overall scores on the held-out test split.
st.subheader("📈 Model Performance")
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
for metric_line in (
    f"**Mean Squared Error:** {mse:.2f}",
    f"**R² Score (Overall):** {r2:.4f}",
):
    st.write(metric_line)

In [None]:
# One R² bullet per target. Column j of y_pred corresponds to targets[j], so
# iterating the transposed prediction array walks the targets in order.
for col, col_pred in zip(targets, y_pred.T):
    col_r2 = r2_score(y_test[col], col_pred)
    st.write(f"- {col}: R² = {col_r2:.4f}")

## 📊 Step 5: Feature Importance

We show which features contributed most to the prediction of the first target (O₂) using a bar chart.

In [None]:
# Feature Importance: the first fitted sub-estimator corresponds to the first
# entry of `targets` (O2). Rank its importances from highest to lowest.
st.subheader("🔍 Feature Importance (based on O2)")
first_target_estimator = model.estimators_[0]
importances = first_target_estimator.feature_importances_
feat_imp = pd.Series(data=importances, index=X.columns).sort_values(ascending=False)

In [None]:
# Horizontal bar chart of the ranked importances.
fig, ax = plt.subplots(figsize=(4, 2))
sns.barplot(x=feat_imp, y=feat_imp.index, ax=ax)
# Label the axes so the chart can be read on its own.
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
st.pyplot(fig)
# pyplot keeps every figure it creates registered until it is closed. Because
# the script is re-executed on each interaction, release the figure once it
# has been rendered.
plt.close(fig)

In [None]:
# Custom Prediction Input: section heading rendered above the input widgets.
st.subheader("🧪 Predict Water Quality From Custom Input")

In [None]:
# Two-column input form: measured values on the left, sampling period on the right.
col1, col2 = st.columns(2)
with col1:
    id_val = st.number_input("Location ID", 1, 22, value=1)
    # Concentrations cannot be negative, so clamp the inputs at zero rather
    # than letting the model score physically impossible values.
    nh4 = st.number_input("NH4", min_value=0.0, value=0.5)
    bsk5 = st.number_input("BSK5", min_value=0.0, value=3.0)
    suspended = st.number_input("Suspended Solids", min_value=0.0, value=10.0)
with col2:
    # All bounds are ints, so the widget returns an int usable as a calendar year.
    year = st.number_input("Year", min_value=2000, max_value=2025, value=2024)
    month = st.slider("Month", 1, 12, 6)

In [None]:
# Calculate derived features: the form only asks for year and month, so the
# 15th of the chosen month stands in as the sampling date.
sample_date = datetime(year=year, month=month, day=15)
# Day of year = days elapsed since 1 January of the same year, counted from 1.
dayofyear = (sample_date - datetime(year, 1, 1)).days + 1
# ISO calendar tuple is (year, week, weekday); the week number is the second field.
weekofyear = sample_date.isocalendar()[1]

## 🔮 Step 6: Custom Prediction from New Input

Here you can manually enter values for a new sample and get predicted water quality parameters.

In [None]:
# Make prediction on input: runs only on the rerun triggered by the button click.
if st.button("🔮 Predict Water Quality"):
    # Value order must match the order of `features` used for training.
    feature_row = [id_val, nh4, bsk5, suspended, year, month, dayofyear, weekofyear]
    input_data = pd.DataFrame([feature_row], columns=features)
    # predict() returns one row per input sample; there is exactly one here.
    predictions = model.predict(input_data)
    result_df = pd.DataFrame(predictions[:1], columns=targets)
    st.write("### 🎯 Prediction Result")
    st.dataframe(result_df.style.format("{:.2f}"))