In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

In [3]:
data = pd.read_csv('/kaggle/input/diabetes-data/diabetes.csv')
print(data.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [4]:
X = data.drop(columns=['Outcome'], axis=1)
y = data['Outcome']

#### Split the Data into Training and Test Sets

In [5]:
train, val_train, test, val_test = train_test_split(X, y, test_size=0.5, random_state=355)
X_train, X_test, y_train, y_test = train_test_split(train, test, test_size=0.2, random_state=355)

### Model 1: Implementation of KNeighborsClassifier

In [6]:
knn = KNeighborsClassifier()
knn.fit(X_train,y_train)
knn.score(X_test, y_test)

0.7402597402597403

### model 2: Implementation of SVC classifier

In [10]:
svc = SVC()
svc.fit(X_train, y_train)
svc.score(X_test, y_test)

0.7402597402597403

### Step 6: Generate Predictions for the Validation Set

In [11]:
predict_val1 = knn.predict(val_train)
predict_val2 = svc.predict(val_train)
predict_val = np.column_stack((predict_val1, predict_val2))
print(predict_val)

[[0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [1 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [0 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [0 0]
 [0 0]
 [0 1]
 [1 1]
 [0 1]
 [0 0]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [1 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]

### Step 7: Generate Predictions for the Test Set

In [12]:
predict_test1 = knn.predict(X_test)
predict_test2 = svc.predict(X_test)
predict_test = np.column_stack((predict_test1, predict_test2))
print(predict_test)

[[1 0]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [0 0]
 [1 0]
 [0 1]
 [1 1]
 [1 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 0]
 [1 0]
 [0 0]
 [0 0]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [0 0]
 [0 0]
 [1 0]
 [0 0]
 [1 0]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]]


### Step 8: Train the Meta-Model (Random Forest Classifier)

In [13]:
rand_clf = RandomForestClassifier()
rand_clf.fit(predict_val, val_test)
print("Random Forest Test Score:", rand_clf.score(predict_test, y_test))

Random Forest Test Score: 0.7402597402597403


### Step 9: Hyperparameter Tuning using GridSearchCV

In [14]:
grid_param = {
    "n_estimators": [90, 100, 115],
    'criterion': ['gini', 'entropy'],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'min_samples_split': [4, 5, 6, 7, 8],
    'max_features': ['auto', 'log2']
}

grid_search = GridSearchCV(estimator=rand_clf, param_grid=grid_param, cv=5, n_jobs=-1, verbose=3)
grid_search.fit(predict_val, val_test)
print("Best Parameters:", grid_search.best_params_)

Fitting 5 folds for each of 300 candidates, totalling 1500 fits
[CV 1/5] END criterion=gini, max_features=auto, min_samples_leaf=1, min_samples_split=4, n_estimators=90;, score=nan total time=   0.0s
[CV 5/5] END criterion=gini, max_features=auto, min_samples_leaf=1, min_samples_split=4, n_estimators=90;, score=nan total time=   0.0s
[CV 2/5] END criterion=gini, max_features=auto, min_samples_leaf=1, min_samples_split=4, n_estimators=100;, score=nan total time=   0.0s
[CV 4/5] END criterion=gini, max_features=auto, min_samples_leaf=1, min_samples_split=4, n_estimators=100;, score=nan total time=   0.0s
[CV 3/5] END criterion=gini, max_features=auto, min_samples_leaf=1, min_samples_split=4, n_estimators=115;, score=nan total time=   0.0s
[CV 2/5] END criterion=gini, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=90;, score=nan total time=   0.0s
[CV 1/5] END criterion=gini, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=100;, score=nan

750 fits failed out of a total of 1500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
401 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.12/dist-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.12/dist-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.12/dist-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParameterError(
s

### Step 10: Train the Meta-Model with Tuned Hyperparameters

In [18]:
best_params = grid_search.best_params_
rand_clf_hyperparameter_tuning = RandomForestClassifier(**best_params)
rand_clf_hyperparameter_tuning.fit(predict_val, val_test)
print("Tuned Random Forest Test Score:", rand_clf_hyperparameter_tuning.score(predict_test, y_test))

Tuned Random Forest Test Score: 0.7402597402597403

[CV 2/5] END criterion=entropy, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=100;, score=0.740 total time=   0.3s
[CV 3/5] END criterion=entropy, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=100;, score=0.805 total time=   0.3s
[CV 4/5] END criterion=entropy, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=100;, score=0.701 total time=   0.3s
[CV 5/5] END criterion=entropy, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=100;, score=0.789 total time=   0.3s
[CV 4/5] END criterion=entropy, max_features=log2, min_samples_leaf=5, min_samples_split=4, n_estimators=90;, score=0.701 total time=   0.3s
[CV 5/5] END criterion=entropy, max_features=log2, min_samples_leaf=5, min_samples_split=4, n_estimators=90;, score=0.789 total time=   0.3s
[CV 1/5] END criterion=entropy, max_features=log2, min_samples_leaf=5, min_samples_split=4, n_esti