In [4]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
import pickle

# Train a Random Forest water-potability classifier and persist it together
# with the fitted scaler.  Pipeline: load CSV -> median-impute -> add ratio
# features -> stratified split -> standardize -> SMOTE -> fit -> evaluate
# -> pickle.

# Load the dataset
data = pd.read_csv("water_potability.csv")

# Handle missing values: fill each listed column with its own median.
# A single DataFrame-level fillna assigned back to `data` replaces the
# previous `data[col].fillna(..., inplace=True)` calls.  Those are chained
# assignments: they emit a FutureWarning on pandas 2.x and, with
# Copy-on-Write enabled, only modify a temporary and leave `data` untouched.
# NOTE: the medians are computed over all rows, i.e. before the train/test
# split below, so test rows contribute to the imputation values.
median_imputed_columns = ["ph", "Sulfate", "Trihalomethanes"]
column_medians = data[median_imputed_columns].median()
data = data.fillna(column_medians)

# Feature engineering: three ratio features.  The small constant added to
# each denominator avoids division by zero.
ratio_epsilon = 1e-5
data['Hardness_by_Conductivity'] = data['Hardness'] / (data['Conductivity'] + ratio_epsilon)
data['Organic_Carbon_Ratio'] = data['Organic_carbon'] / (data['Solids'] + ratio_epsilon)
data['Chloramines_per_Turbidity'] = data['Chloramines'] / (data['Turbidity'] + ratio_epsilon)

# Split features and target
X = data.drop(columns=["Potability"])
y = data["Potability"]

# Split into training and testing sets (80/20), stratified on the target so
# both splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale the features.  The scaler is fitted on the training split only and
# then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Handle class imbalance with SMOTE.  Only the training split is resampled;
# the test split is left as-is for evaluation.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)

# Train Random Forest
random_forest = RandomForestClassifier(random_state=42, n_estimators=100)
random_forest.fit(X_train_balanced, y_train_balanced)

# Test model performance on the untouched, scaled test split
y_pred_rf = random_forest.predict(X_test_scaled)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {rf_accuracy * 100:.2f}%")

# Save the Random Forest model and scaler in a single .pkl file.
# The scaler was fitted on the columns of X, which include the three
# engineered ratio features, so any consumer must rebuild those columns in
# the same order before calling scaler.transform.
# NOTE: pickle files execute code when loaded; only load this file from a
# trusted location.
with open("model_and_scaler.pkl", "wb") as file:
    pickle.dump({"model": random_forest, "scaler": scaler}, file)

print("Model and scaler saved as 'model_and_scaler.pkl'")


Random Forest Accuracy: 64.18%
Model and scaler saved as 'model_and_scaler.pkl'
