In [1]:

import pandas as pd



# Dataset Basic Info

In [2]:
df = pd.read_csv('diabetes_prediction_dataset.csv')
rows , col =  df.shape
print(f"Number of Rows : {rows} \nNumber of Columns : {col}")

Number of Rows : 100000 
Number of Columns : 9


In [3]:
df.columns

Index(['gender', 'age', 'hypertension', 'heart_disease', 'smoking_history',
       'bmi', 'HbA1c_level', 'blood_glucose_level', 'diabetes'],
      dtype='object')

In [4]:
df.isnull().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

In [5]:
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


# Data Preprocessing|

In [6]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Separate features and target
X = df.drop('diabetes', axis=1)
y = df['diabetes']

# Define categorical and numerical columns
categorical_cols = [cname for cname in X.columns if X[cname].dtype == "object"]
numerical_cols = [cname for cname in X.columns if X[cname].dtype in ['int64', 'float64']]

# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)],
    remainder='drop')

# Fit the preprocessor to obtain transformed column names
X_preprocessed = preprocessor.fit_transform(X)
column_names = (numerical_cols + 
                list(preprocessor.named_transformers_['cat'].named_steps['onehot']
                     .get_feature_names_out(categorical_cols)))


# Class Imbalance 

In [7]:
from sklearn.utils import resample

# Re-create DataFrame with new column names for easier manipulation
X_preprocessed_df = pd.DataFrame(X_preprocessed, columns=column_names)

# Add the target variable back to the dataframe
X_preprocessed_df['diabetes'] = y

# Separate majority and minority classes
df_majority = X_preprocessed_df[X_preprocessed_df.diabetes==0]
df_minority = X_preprocessed_df[X_preprocessed_df.diabetes==1]

# Downsample majority class
df_majority_downsampled = resample(df_majority, 
                                   replace=False,    # sample without replacement
                                   n_samples=8500,  # to match minority class
                                   random_state=123) # reproducible results

# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Display new class counts
print(df_downsampled.diabetes.value_counts())

# Removing the target variable from features dataset for selection
X_res = df_downsampled.drop('diabetes', axis=1)
y_res = df_downsampled['diabetes']


diabetes
0    8500
1    8500
Name: count, dtype: int64


# Feature Selection

In [8]:
from sklearn.feature_selection import SelectKBest, f_classif

# Apply SelectKBest class to extract top k best features
bestfeatures = SelectKBest(score_func=f_classif, k=4)
bestfeatures.fit(X_res, y_res)

# Get the boolean mask of the selected features
mask = bestfeatures.get_support()  # list of booleans
selected_features = []  # The list of your K best features

for bool, feature in zip(mask, X_res.columns):
    if bool:
        selected_features.append(feature)

print("Selected features:", selected_features)

Selected features: ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']


# Dataset Split

In [9]:
from sklearn.model_selection import train_test_split

# Assuming 'selected_features' is the list of the names of the selected features
# Refine X_res to only include selected features
X_final = X_res[selected_features]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_final, y_res, test_size=0.2, random_state=42, stratify=y_res)


# Support Vector Machine

In [10]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Initialize the Support Vector Classifier
model = SVC(kernel='linear', C=1.0, random_state=42)  

# Train the model
model.fit(X_train, y_train)


In [11]:
# Predict on the testing data
y_pred = model.predict(X_test)

# Print the classification report and confusion matrix
# print("Classification Report:\n", classification_report(y_test, y_pred))
# print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))


Accuracy Score: 0.8785294117647059


# Random Forest Classifier

In [12]:
from sklearn.ensemble import RandomForestClassifier
# Initialize the RandomForestClassifier
rf_clf = RandomForestClassifier(n_estimators=100,  # Number of trees
                                random_state=42,   # Random state for reproducibility
                                max_features=4, # Number of features to consider at every split
                                max_depth=None,    # Maximum number of levels in tree
                                min_samples_split=2, # Minimum number of samples required to split a node
                                min_samples_leaf=1) # Minimum number of samples required at each leaf node
rf_clf.fit(X_train, y_train)


In [13]:
# Predictions
y_pred = rf_clf.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2%}")
# print("Classification Report:")
# print(classification_report(y_test, y_pred))


Accuracy: 88.88%


In [16]:
from sklearn.neural_network import MLPClassifier
nn_clf = MLPClassifier(hidden_layer_sizes=(100,),  # One hidden layer with 100 neurons
                       activation='relu',         # Activation function for the neurons
                       solver='adam',             # Solver for weight optimization
                       max_iter=300,              # Maximum number of iterations
                       random_state=42,           # Random state for reproducibility
                       verbose=True)              # Print progress messages to stdout
nn_clf.fit(X_train, y_train)
y_pred = nn_clf.predict(X_test)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2%}")


Iteration 1, loss = 0.53162018
Iteration 2, loss = 0.32610707
Iteration 3, loss = 0.27176704
Iteration 4, loss = 0.25653328
Iteration 5, loss = 0.25129625
Iteration 6, loss = 0.24837691
Iteration 7, loss = 0.24590758
Iteration 8, loss = 0.24394162
Iteration 9, loss = 0.24174729
Iteration 10, loss = 0.23954097
Iteration 11, loss = 0.23741508
Iteration 12, loss = 0.23509714
Iteration 13, loss = 0.23308127
Iteration 14, loss = 0.23084170
Iteration 15, loss = 0.22860683
Iteration 16, loss = 0.22652007
Iteration 17, loss = 0.22482899
Iteration 18, loss = 0.22288194
Iteration 19, loss = 0.22117262
Iteration 20, loss = 0.21928932
Iteration 21, loss = 0.21780186
Iteration 22, loss = 0.21610014
Iteration 23, loss = 0.21436828
Iteration 24, loss = 0.21315971
Iteration 25, loss = 0.21170381
Iteration 26, loss = 0.21038940
Iteration 27, loss = 0.20918615
Iteration 28, loss = 0.20821246
Iteration 29, loss = 0.20715463
Iteration 30, loss = 0.20623894
Iteration 31, loss = 0.20519994
Iteration 32, los

Accuracy: 89.82%
