In [2]:
import pandas as pd 
import numpy as np

In [3]:
df = pd.read_csv("D:/water_potability/WaterPotability/src/models/model_metrics.csv")

In [4]:
df

Unnamed: 0,model,accuracy,precision,recall,f1_score
0,LogisticRegression,0.628049,0.394445,0.628049,0.484562
1,DecisionTreeClassifier,0.573171,0.580383,0.573171,0.576264
2,RandomForestClassifier,0.670732,0.65677,0.670732,0.642212
3,GaussianNB,0.631098,0.600287,0.631098,0.583681
4,GradientBoostingClassifier,0.669207,0.658014,0.669207,0.629237
5,AdaBoostClassifier,0.63872,0.662579,0.63872,0.520626
6,SVC,0.692073,0.690187,0.692073,0.657145
7,KNeighborsClassifier,0.631098,0.619428,0.631098,0.622673


In [5]:
df.sort_values(by="f1_score")

Unnamed: 0,model,accuracy,precision,recall,f1_score
0,LogisticRegression,0.628049,0.394445,0.628049,0.484562
5,AdaBoostClassifier,0.63872,0.662579,0.63872,0.520626
1,DecisionTreeClassifier,0.573171,0.580383,0.573171,0.576264
3,GaussianNB,0.631098,0.600287,0.631098,0.583681
7,KNeighborsClassifier,0.631098,0.619428,0.631098,0.622673
4,GradientBoostingClassifier,0.669207,0.658014,0.669207,0.629237
2,RandomForestClassifier,0.670732,0.65677,0.670732,0.642212
6,SVC,0.692073,0.690187,0.692073,0.657145


<b>
<ul> <li>1. SVC with 65% of F1 score</li>
<li>2. Random Forest Classifier with 64.2 f1 score</li>
</ul</b>

In [2]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
# Hyperparameter Tuning for SVC and Random Forest Classifier

param_grids={
    'Support Vector Machine': {'C': [0.1, 1, 10], 'gamma': [0.1, 0.01, 0.001,"scale"], 'kernel': ['rbf',"linear"]},
    'Random Forest': {'n_estimators': [50, 100, 150], 'max_depth': [None, 5, 10,15],'min_samples_split': [2, 5]}

}

best_estimators={}

models = {
    'Support Vector Machine': SVC(),
    'Random Forest': RandomForestClassifier(),
}


In [3]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
for model_name, model in models.items():
    if model_name in param_grids:  # Check if the model requires tuning
        param_grid = param_grids[model_name]
        
        # Perform Grid Search or Randomized Search based on the size of the parameter grid
        if len(param_grid) < 6:
            # Perform Grid Search
            grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
        else:
            # Perform Randomized Search
            grid_search = RandomizedSearchCV(model, param_grid, n_iter=10, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
        

In [6]:
from data.load_data import load_data
data = load_data("D:/water_potability/WaterPotability/data/processed/processed_data.csv")

Data successfully loaded from D:/water_potability/WaterPotability/data/processed/processed_data.csv


In [7]:
X = data.drop('Potability', axis=1) 
y = data['Potability']

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
grid_search.fit(X_train, y_train)
best_estimators[model_name] = grid_search.best_estimator_


Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [10]:
print(f"Best parameters for {model_name}: {grid_search.best_params_}")
print(f"Training Accuracy: {grid_search.best_score_}")

Best parameters for Random Forest: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 150}
Training Accuracy: 0.6767175572519084


In [11]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
for model_name, model in models.items():
    if model_name in param_grids:  # Check if the model requires tuning
        param_grid = param_grids[model_name]
        
        # Perform Grid Search or Randomized Search based on the size of the parameter grid
        if len(param_grid) < 6:
            # Perform Grid Search
            grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
        else:
            # Perform Randomized Search
            grid_search = RandomizedSearchCV(model, param_grid, n_iter=10, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
        
        grid_search.fit(X_train, y_train)
        best_estimators[model_name] = grid_search.best_estimator_

        print(f"Best parameters for {model_name}: {grid_search.best_params_}")
        print(f"Training Accuracy: {grid_search.best_score_}")

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best parameters for Support Vector Machine: {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
Training Accuracy: 0.6725190839694657
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best parameters for Random Forest: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
Training Accuracy: 0.6790076335877864


# SVC will be chosen as the best model