In [7]:
# model_training.ipynb

import os
import sys

sys.path.append(os.path.abspath('../src'))
from models import train_model
from evaluate import evaluate_model

# Pipeline locations, relative to the notebook's working directory.
PROCESSED_PATH = '../data/processed/water_quality_cleaned.csv'  # cleaned input dataset
MODEL_PATH = '../results/reports/model.pkl'                     # passed to train_model (presumably the save target -- confirm in src/models.py)
RESULTS_DIR = '../results/figures'                              # passed to evaluate_model (presumably the figure output dir -- confirm in src/evaluate.py)

# train_model hands back the fitted model together with the held-out test
# split; the three values are unpacked into notebook-level names and then
# forwarded, in the same order, to evaluate_model.
fit_outputs = train_model(PROCESSED_PATH, MODEL_PATH)
model, X_test, y_test = fit_outputs
evaluate_model(*fit_outputs, RESULTS_DIR)



Model Evaluation Metrics:
R² Score: 0.8306
MAE: 0.0839
MSE: 0.0156
RMSE: 0.1248


In [None]:
# Cell 1 - Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Cell 2 - Paths
# PROCESSED_PATH is the cleaned CSV read below; MODEL_SAVE_PATH is where the
# fitted model is written at the end of this cell. Both are relative paths,
# so they resolve against the notebook's working directory.
PROCESSED_PATH = '../data/processed/water_quality_cleaned.csv'
MODEL_SAVE_PATH = '../results/reports/model.pkl'

# Cell 3 - Load data
df = pd.read_csv(PROCESSED_PATH)

# Cell 4 - Inspect data
# A bare `df.head()` is only rendered when it is the LAST expression of a
# cell. Here more code follows in the same cell, so the preview was silently
# discarded; display() (provided by the IPython kernel) renders it explicitly.
display(df.head())

# Cell 5 - Drop identifier/text columns that are not model inputs
# Columns missing from the file are skipped, so this is safe to re-run.
non_features = ['Water Control Zone', 'Station', 'Dates', 'Sample No']  # update if not present
df = df.drop(columns=[col for col in non_features if col in df.columns])

# Cell 6 - Clean and split features/target

# Only numeric columns can be fed to the regressor; any remaining text
# columns are left out of X by this selection.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

# Make sure 'pH' is in the list and is used as the target
if 'pH' not in numeric_cols:
    raise ValueError("'pH' column is not numeric or not found in dataset.")

# Drop rows with missing values, but only look at the columns the model
# actually uses (numeric features + target). A NaN in an unused text column
# is no reason to throw away an otherwise complete training row.
df = df.dropna(subset=numeric_cols)
if df.empty:
    # Fail here with a clear message instead of an unrelated error later
    # from train_test_split.
    raise ValueError("No rows remain after dropping rows with missing values.")

# Define features and target (build a new list rather than mutating in place)
numeric_cols = [col for col in numeric_cols if col != 'pH']
X = df[numeric_cols]
y = df['pH']


# Cell 7 - Train/test split
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Cell 8 - Train model
# 100-tree random forest, seeded; fit() returns the estimator itself, so the
# construction and training can be chained.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Cell 9 - Evaluate model
# Score the forest on the held-out split: predictions first, then both
# metrics computed from the same (y_test, y_pred) pair.
y_pred = rf_model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

# Report to three decimal places.
print(f"Mean Squared Error: {mse:.3f}")
print(f"R² Score: {r2:.3f}")

# Cell 10 - Save model
# Create the target directory if needed (no error when it already exists),
# then serialize the fitted forest with joblib.
model_dir = os.path.dirname(MODEL_SAVE_PATH)
os.makedirs(model_dir, exist_ok=True)
joblib.dump(rf_model, MODEL_SAVE_PATH)

print(f"Model saved to {MODEL_SAVE_PATH}")

# Cell 11 - Optional: Plot actual vs predicted
# Scatter of held-out targets against model predictions, drawn on an
# explicitly created Axes rather than through the implicit pyplot state.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_pred, alpha=0.7, ax=ax)
ax.set_xlabel('Actual pH')
ax.set_ylabel('Predicted pH')
ax.set_title('Actual vs Predicted pH')
ax.grid(True)
fig.tight_layout()
plt.show()


ValueError: could not convert string to float: 'Middle Water'