#### **Model Training**

In [253]:
#importing necessary libraries
import numpy as np
import pandas as pd 
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score,precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.simplefilter(action = "ignore") 

In [254]:
#reading csv file
df = pd.read_csv("../Datasets/diabetes_processed_data.csv")

In [255]:
df.head()

Unnamed: 0,Pregnancies,BloodPressure,SkinThickness,Insulin,DiabetesPedigreeFunction,Age,Outcome,NewBMI_Obesity 1,NewBMI_Obesity 2,NewBMI_Obesity 3,NewBMI_Overweight,NewBMI_Underweight,NewGlucose_Normal,NewGlucose_Overweight,NewGlucose_Secret
0,6.0,72.0,35.0,0.0,0.627,50,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,66.0,29.0,0.0,0.351,31,0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,8.0,64.0,0.0,0.0,0.672,32,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.0,66.0,23.0,94.0,0.167,21,0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,0.0,40.0,35.0,168.0,1.2,33,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


Dependent and Independent Features

In [256]:
X = df.drop("Outcome", axis=1)
y = df["Outcome"]

In [257]:
X.head()

Unnamed: 0,Pregnancies,BloodPressure,SkinThickness,Insulin,DiabetesPedigreeFunction,Age,NewBMI_Obesity 1,NewBMI_Obesity 2,NewBMI_Obesity 3,NewBMI_Overweight,NewBMI_Underweight,NewGlucose_Normal,NewGlucose_Overweight,NewGlucose_Secret
0,6.0,72.0,35.0,0.0,0.627,50,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,66.0,29.0,0.0,0.351,31,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,8.0,64.0,0.0,0.0,0.672,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.0,66.0,23.0,94.0,0.167,21,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,0.0,40.0,35.0,168.0,1.2,33,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [258]:
y.head()

0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64

In [259]:
X.shape, y.shape

((768, 14), (768,))

Train Test Split

In [260]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [261]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((614, 14), (154, 14), (614,), (154,))

**Feature Scaling Or Standardization**

In [262]:
scaler=StandardScaler()
X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)

In [263]:
def evaluate_model(true, predicted):
    accuracy = accuracy_score(true, predicted)
    recall = recall_score(true, predicted, average="weighted")
    precision = precision_score(true, predicted, average="weighted")
    cm = confusion_matrix(true, predicted)
    return accuracy,precision, recall, cm

In [264]:
models = {
    "Logistic Classifier": LogisticRegression(),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "Random Forest Classifier": RandomForestClassifier(),
    "Support Vector Classifier": SVC()
}
model_list = []
accuracy_list =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_accuracy , model_train_precision, model_train_recall, model_train_cm = evaluate_model(y_train, y_train_pred)

    model_test_accuracy , model_test_precision, model_test_recall, model_test_cm = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Accuracy Score: {:.4f}".format(model_train_accuracy))
    print("- Precision Score: {:.4f}".format(model_train_precision))
    print("- Recall Score: {:.4f}".format(model_train_recall))
    print("- Confusion Matrix:",(model_train_cm))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Accuracy Score: {:.4f}".format(model_test_accuracy))
    print("- Precision Score: {:.4f}".format(model_test_precision))
    print("- Recall Score: {:.4f}".format(model_test_recall))
    print("- Confusion Matrix:",(model_test_cm))
    accuracy_list.append(model_test_accuracy)

    print('='*35)
    print('\n')

Logistic Classifier
Model performance for Training set
- Accuracy Score: 0.7704
- Precision Score: 0.7660
- Recall Score: 0.7704
- Confusion Matrix: [[342  59]
 [ 82 131]]
----------------------------------
Model performance for Test set
- Accuracy Score: 0.7338
- Precision Score: 0.7373
- Recall Score: 0.7338
- Confusion Matrix: [[77 22]
 [19 36]]


K-Neighbors Classifier
Model performance for Training set
- Accuracy Score: 0.8241
- Precision Score: 0.8221
- Recall Score: 0.8241
- Confusion Matrix: [[354  47]
 [ 61 152]]
----------------------------------
Model performance for Test set
- Accuracy Score: 0.7468
- Precision Score: 0.7440
- Recall Score: 0.7468
- Confusion Matrix: [[81 18]
 [21 34]]


Decision Tree Classifier
Model performance for Training set
- Accuracy Score: 1.0000
- Precision Score: 1.0000
- Recall Score: 1.0000
- Confusion Matrix: [[401   0]
 [  0 213]]
----------------------------------
Model performance for Test set
- Accuracy Score: 0.7078
- Precision Score: 0.70

In [265]:
for i in range(len(model_list)):
    print(model_list[i], accuracy_list[i])

Logistic Classifier 0.7337662337662337
K-Neighbors Classifier 0.7467532467532467
Decision Tree Classifier 0.7077922077922078
Random Forest Classifier 0.7792207792207793
Support Vector Classifier 0.7597402597402597


**Hyperparameter Tuning**

Logistic Regression

In [266]:
params_log = {
        "penalty" : ["l1", "l2", "elasticnet"],
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['liblinear', 'lbfgs', "sag", "saga"]
    }
grid_log = GridSearchCV(estimator=LogisticRegression(),
                    param_grid=params_log,
                    n_jobs=-1,
                    scoring="accuracy",
                    verbose=2)
grid_log.fit(X_train, y_train)
best_params = grid_log.best_params_
best_score = grid_log.best_score_
y_pred_log = grid_log.predict(X_test)
score = accuracy_score(y_test, y_pred_log)

print("Best Params:", best_params)
print("Best Score:", best_score)
print("Accuracy Score:", score)

Fitting 5 folds for each of 60 candidates, totalling 300 fits
Best Params: {'C': 0.1, 'penalty': 'l1', 'solver': 'saga'}
Best Score: 0.7655071304811408
Accuracy Score: 0.7467532467532467


K-Neighbors Classifier

In [267]:
# Define the parameter grid
param_knn = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan'], 
    "algorithm" : ["auto", "ball_tree", "kd_tree"]
}

grid_knn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_knn,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

grid_knn.fit(X_train, y_train)
best_knn = grid_knn.best_estimator_

print("Best Params for KNN:", grid_knn.best_params_)
print("Best CV Score:", grid_knn.best_score_)

y_pred_knn = best_knn.predict(X_test)
score = accuracy_score(y_test, y_pred_knn)
print("Test Accuracy:", score)

Fitting 5 folds for each of 60 candidates, totalling 300 fits
Best Params for KNN: {'algorithm': 'auto', 'metric': 'euclidean', 'n_neighbors': 11, 'weights': 'uniform'}
Best CV Score: 0.7720111955217913
Test Accuracy: 0.7662337662337663


Decision Tree Classifier

In [268]:
param_dt = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': [None, 5, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_dt = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_dt,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

grid_dt.fit(X_train, y_train)
best_dt = grid_dt.best_estimator_

print("Best Params for Decision Tree:", grid_dt.best_params_)
print("Best CV Score:", grid_dt.best_score_)

y_pred_dt = best_dt.predict(X_test)
score = accuracy_score(y_test, y_pred_dt)
print("Test Accuracy:", score)

Fitting 5 folds for each of 135 candidates, totalling 675 fits
Best Params for Decision Tree: {'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 10}
Best CV Score: 0.732906837265094
Test Accuracy: 0.7532467532467533


Random Forest

In [269]:
param_rf = {
    'n_estimators': [50, 100, 200],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

grid_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_rf,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

grid_rf.fit(X_train, y_train)
best_rf = grid_rf.best_estimator_

print("Best Params for Random Forest:", grid_rf.best_params_)
print("Best CV Score:", grid_rf.best_score_)

y_pred_rf = best_rf.predict(X_test)
score = accuracy_score(y_test, y_pred_rf)
print("Test Accuracy:", score)


Fitting 5 folds for each of 648 candidates, totalling 3240 fits
Best Params for Random Forest: {'bootstrap': False, 'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Best CV Score: 0.7687458349993337
Test Accuracy: 0.7467532467532467


In [270]:
# Define parameter grid for SVC
param_svc = {
    'C': [0.1, 1, 10, 100],        # Regularization strength
    'kernel': ['linear', 'rbf', 'poly'],  # Kernel types
    'gamma': ['scale', 'auto']     # Kernel coefficient
}

# Initialize the SVC model
svc = SVC(random_state=42)

# Initialize GridSearchCV
grid_svc = GridSearchCV(
    estimator=svc,
    param_grid=param_svc,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

# Fit on the training data (already scaled)
grid_svc.fit(X_train, y_train)
best_svc = grid_svc.best_estimator_

print("Best Params for SVC:", grid_svc.best_params_)
print("Best CV Score:", grid_svc.best_score_)

y_pred_svc = best_rf.predict(X_test)
score = accuracy_score(y_test, y_pred_svc)
print("Test Accuracy:", score)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Params for SVC: {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
Best CV Score: 0.7622684259629482
Test Accuracy: 0.7467532467532467


In [272]:
models = {
    "KNN": best_knn,
    "Decision Tree": best_dt,
    "Random Forest": best_rf,
    "SVC": best_svc
}

for name, model in models.items():
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"{name:15s} - > Test Accuracy: {acc:.4f}")


KNN             - > Test Accuracy: 0.7662
Decision Tree   - > Test Accuracy: 0.7532
Random Forest   - > Test Accuracy: 0.7468
SVC             - > Test Accuracy: 0.7597


##### **Best Model**

with I conclude that KNN as best model and I am using this model to save into a pickel file and will be used for further development

**Pickeling**

In [273]:
import pickle
with open("scaler.pkl", "wb") as file:
    pickle.dump(scaler, file)


In [274]:
best_model = grid_knn.best_estimator_

In [275]:
with open("best_model.pkl", "wb") as file:
    pickle.dump(best_model, file)