###### Importing required libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

**Loading Dataset**

In [2]:
# Path is relative to the notebook's working directory.
DATA_PATH = 'diabetes_prediction_dataset.csv'

# Read the raw CSV into a DataFrame and preview its first rows.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [3]:
# (rows, columns) of the data as loaded, before any cleaning.
df.shape

(100000, 9)

**Identifying null values**

In [4]:
# Count the missing values in each column.
df.isnull().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

Great — the data contains no null values.

**Identifying Duplicates**

In [5]:
# Count rows that are exact repeats (all columns equal) of an earlier row.
df.duplicated().sum()

3854

In [6]:
# Keep the first occurrence of each fully identical row and discard the repeats.
df = df.drop_duplicates(keep='first')

In [7]:
# Re-check (rows, columns) now that duplicate rows have been dropped.
df.shape

(96146, 9)

Great — the duplicates are removed: 3,854 rows were dropped, leaving 96,146.

**Handling Numeric and Categorical Features**  

In [8]:
# Hand-written column groups, left commented out for reference only: the
# numeric_features / categorical_features lists actually used by the pipeline
# are derived from the column dtypes in a later cell.
#numeric_features = ['age','hypertension', 'heart_disease','bmi', 'HbA1c_level','blood_glucose_level']
#categorical_features = ['gender','smoking_history']

**Separating features (X) and target variable (y)**

In [9]:
# 'diabetes' is the label to predict; every remaining column is a predictor.
y = df['diabetes']
X = df.drop(columns='diabetes')

**Splitting the Data into train and test set**

In [10]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [11]:
# Sanity check: training features and training labels must have the same row count.
X_train.shape, y_train.shape

((67302, 8), (67302,))

In [12]:
# Sanity check: test features and test labels must have the same row count.
X_test.shape, y_test.shape

((28844, 8), (28844,))

**Transforming and Normalizing Data**

In [13]:
# Split the predictor columns into two groups by dtype: numeric columns are
# scaled later, all remaining (non-numeric) columns are one-hot encoded.
numeric_features = list(X_train.select_dtypes(include='number').columns)
categorical_features = list(X_train.select_dtypes(exclude='number').columns)

In [15]:
# --- Preprocessing -----------------------------------------------------------
# Numeric columns: StandardScaler centres each column and scales it to unit
# variance, using statistics learned from the training data only.
numeric_transformer = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
    ]
)

# Categorical columns: one-hot encode. handle_unknown='ignore' makes a category
# that was not seen during fit encode as all zeros instead of raising.
categorical_transformer = Pipeline(
    steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Route each column group to its transformer. Every predictor column belongs to
# exactly one of the two dtype-derived lists, so none is left unhandled.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

# --- Candidate models --------------------------------------------------------
# The tree-based estimators are randomised; pin their seed (same value as the
# train/test split) so the reported metrics are reproducible across runs.
classifiers = [
    ('K-NN', KNeighborsClassifier()),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Logistic Regression', LogisticRegression()),
    ('SVM', SVC()),
]

# --- Fit and evaluate --------------------------------------------------------
# Each classifier is wrapped in its own Pipeline so the preprocessing is fitted
# on the training split only and then re-applied unchanged to the test split.
for name, classifier in classifiers:
    model = Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('classifier', classifier),
        ]
    )

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # The positive class is a small minority of the test rows, so accuracy
    # alone is optimistic; the per-class precision/recall in the report is the
    # more informative comparison.
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.2f}")
    print(f"Classification Report for {name}:\n{report}\n")


K-NN Accuracy: 0.96
Classification Report for K-NN:
              precision    recall  f1-score   support

           0       0.96      0.99      0.98     26267
           1       0.88      0.62      0.73      2577

    accuracy                           0.96     28844
   macro avg       0.92      0.80      0.85     28844
weighted avg       0.96      0.96      0.96     28844


Decision Tree Accuracy: 0.95
Classification Report for Decision Tree:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97     26267
           1       0.71      0.74      0.73      2577

    accuracy                           0.95     28844
   macro avg       0.84      0.86      0.85     28844
weighted avg       0.95      0.95      0.95     28844


Random Forest Accuracy: 0.97
Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98     26267
           1       0.94      0.69      0.80

**Great!!**