Loads all necessary libraries for data manipulation (pandas, numpy), visualization (matplotlib, seaborn), modeling (sklearn), model saving (joblib), and date handling (datetime).

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, r2_score
from datetime import datetime


Loads the water quality dataset.

Converts the date column to datetime format.

Sorts the data by station id and, within each station, chronologically by date.

In [None]:
# Load the semicolon-separated measurements, parse the day.month.year date
# strings into real datetimes, and order the rows by station, then by date.
df = (
    pd.read_csv('PB_All_2000_2021.csv', sep=';')
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format='%d.%m.%Y'))
    .sort_values(by=['id', 'date'])
)


Creates new time-related features that can help with prediction (like year, month, week, etc.).

Uses forward fill (ffill) to handle any missing values in the dataset.

In [None]:
# Derive calendar features from the parsed measurement date.
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['dayofyear'] = df['date'].dt.dayofyear
df['weekofyear'] = df['date'].dt.isocalendar().week

# Forward-fill gaps within each station only. A plain df.ffill() runs down the
# whole frame, so when rows are grouped by station the last reading of one
# station would be copied into the leading gaps of the next station.
# Gaps at the very start of a station's series stay missing.
# NOTE(review): assumes rows are already ordered by date within each station.
fill_cols = [col for col in df.columns if col != 'id']
df[fill_cols] = df.groupby('id')[fill_cols].ffill()


Defines:

features: independent input variables.

targets: pollutant levels (what we want to predict).

Drops rows where any of the features or targets are missing.

Splits the dataset into X (inputs) and y (outputs).

In [None]:
# Model inputs: station id, three measured indicators, and the calendar fields.
features = [
    'id',
    'NH4', 'BSK5', 'Suspended',
    'year', 'month', 'dayofyear', 'weekofyear',
]
# Model outputs: the pollutant columns predicted jointly.
targets = ['O2', 'NO3', 'NO2', 'SO4', 'PO4', 'CL']

# Keep only rows where every input and every output is present.
df = df.dropna(subset=[*features, *targets])
X, y = df[features], df[targets]


Splits the data into training and testing sets (80% train, 20% test).

Trains a MultiOutput Random Forest model to predict multiple pollutants at once.

y_pred contains the predictions on the test set.

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# One random forest is fitted per target column by the multi-output wrapper.
model = MultiOutputRegressor(
    RandomForestRegressor(n_estimators=100, random_state=42)
)
model.fit(X_train, y_train)

# Predictions for the held-out rows, one column per target.
y_pred = model.predict(X_test)


Calculates the overall Mean Squared Error (MSE) and R² score on the test set, each averaged uniformly across the six target pollutants, to measure accuracy.

Also prints individual R² scores for each pollutant (O2, NO3, etc.).

In [None]:
# Aggregate scores over all target columns at once.
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")
print(f"R² Score (Overall): {r2:.4f}")

# Per-pollutant R²: the columns of y_pred follow the order of `targets`,
# so walk the transposed prediction matrix alongside the target names.
for col, col_pred in zip(targets, y_pred.T):
    print(f"- {col}: R² = {r2_score(y_test[col], col_pred):.4f}")


Visualizes feature importance for the first target variable (O2).

Helps understand which inputs (e.g., NH4, month, year) most influence the prediction.

In [None]:
# The first fitted estimator belongs to the first target column (O2).
importances = model.estimators_[0].feature_importances_
feat_imp = pd.Series(importances, index=X.columns).sort_values(ascending=False)

# Horizontal bar chart, most important feature on top.
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(x=feat_imp.values, y=feat_imp.index, ax=ax)
ax.set_title("Feature Importance for O2 Prediction")
ax.set_xlabel("Importance Score")
ax.set_ylabel("Features")
fig.tight_layout()
plt.show()


Creates a hypothetical input row (station 22 on 15 June 2024, with example NH4, BSK5 and Suspended values) and predicts pollutant levels for that location and time.

In [None]:
# Scenario to predict: a given station in the middle of a given month.
station_id = 22
year_input = 2024
month_input = 6
sample_date = datetime(year_input, month_input, 15)

# Calendar fields derived the same way as the training features.
dayofyear = (sample_date - datetime(year_input, 1, 1)).days + 1
weekofyear = sample_date.isocalendar()[1]

# Build the single-row input by column name so that each value is tied to its
# feature explicitly; `columns=features` fixes the column order.
sample_row = {
    'id': station_id,
    'NH4': 0.5,
    'BSK5': 3.0,
    'Suspended': 10.0,
    'year': year_input,
    'month': month_input,
    'dayofyear': dayofyear,
    'weekofyear': weekofyear,
}
input_data = pd.DataFrame([sample_row], columns=features)

predicted_pollutants = model.predict(input_data)[0]
print(f"\nPredicted pollutant levels for station '{station_id}' in {year_input}:")
for idx, p in enumerate(targets):
    val = predicted_pollutants[idx]
    print(f"  {p}: {val:.2f}")


Saves the trained model and its input column structure for future use — helpful when deploying or reusing the model.



In [None]:
# Persist the fitted model together with the exact input column order, so a
# later consumer can rebuild inputs in the layout the model was trained on.
for artifact, path in (
    (model, 'pollution_model.pkl'),
    (list(X.columns), 'model_columns.pkl'),
):
    joblib.dump(artifact, path)
print('\nModel and column structure saved successfully!')


Allows prediction for any custom user input (e.g., a different location, month, etc.).

Displays predicted pollutant levels for that custom scenario.

In [None]:
# User-editable scenario: station, measured indicators, and the target month.
custom_input = dict(
    id=1,
    NH4=0.6,
    BSK5=3.5,
    Suspended=15.0,
    year=2025,
    month=5,
)

# The 15th of the month is used as the representative day for the calendar
# features.
sample_date = datetime(year=custom_input['year'], month=custom_input['month'], day=15)
custom_input.update(
    dayofyear=int(sample_date.strftime('%j')),
    weekofyear=sample_date.isocalendar()[1],
)

# Arrange the columns in the order of `features` before predicting.
input_df = pd.DataFrame([custom_input], columns=features)
prediction = model.predict(input_df)[0]

# Present the prediction as a one-row table labelled with the target names.
custom_result = pd.DataFrame([prediction], columns=targets)
print("\nCustom Prediction Result:", custom_result.round(2), sep="\n")
