In [150]:
import pandas as pd
import numpy as np


In [151]:
df = pd.read_csv('diabetes_prediction_dataset.csv')

In [152]:
df.CLASS.unique()

array(['N', 'N ', 'P', 'Y', 'Y '], dtype=object)

In [153]:
df['CLASS']=df['CLASS'].str.strip()
df['CLASS'].unique()

array(['N', 'P', 'Y'], dtype=object)

In [154]:
df['Gender']=df['Gender'].map({'f':'F', 'F':'F', 'M':'M' })

In [155]:
df['Gender'].value_counts()

Gender
M    565
F    435
Name: count, dtype: int64

In [156]:
df

Unnamed: 0,ID,No_Pation,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI,CLASS
0,502,17975,F,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
1,735,34221,M,26,4.5,62,4.9,3.7,1.4,1.1,2.1,0.6,23.0,N
2,420,47975,F,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
3,680,87656,F,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
4,504,34223,M,33,7.1,46,4.9,4.9,1.0,0.8,2.0,0.4,21.0,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,200,454317,M,71,11.0,97,7.0,7.5,1.7,1.2,1.8,0.6,30.0,Y
996,671,876534,M,31,3.0,60,12.3,4.1,2.2,0.7,2.4,15.4,37.2,Y
997,669,87654,M,30,7.1,81,6.7,4.1,1.1,1.2,2.4,8.1,27.4,Y
998,99,24004,M,38,5.8,59,6.7,5.3,2.0,1.6,2.9,14.0,40.5,Y


In [157]:
from sklearn.model_selection import train_test_split
X=df.drop('CLASS', axis=1)
y=df['CLASS'].map({'N':0, 'P':1, 'Y':2})
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.75, random_state=42, shuffle=True)

In [195]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, roc_auc_score, confusion_matrix

Apply Standard Scaling to all numeric columns, and One-hot Encoding to all categorical columns.


In [159]:
numeric_cols = X.select_dtypes(exclude='object').columns
categoric_cols = X.select_dtypes(include='object').columns
categoric_cols
numeric_cols

Index(['ID', 'No_Pation', 'AGE', 'Urea', 'Cr', 'HbA1c', 'Chol', 'TG', 'HDL',
       'LDL', 'VLDL', 'BMI'],
      dtype='object')

In [160]:
encode_ct = ColumnTransformer([
    ("ohe", OneHotEncoder(sparse_output=False, drop='first'), categoric_cols)
], remainder='passthrough', verbose_feature_names_out=False)
scale_ct = ColumnTransformer([
    ("ss", StandardScaler(), numeric_cols)
], remainder='passthrough', verbose_feature_names_out=False)
pipe = Pipeline([
    ("encoder", encode_ct),
    ("scale", scale_ct)
]).set_output(transform='pandas')
pipe
X_tf=pipe.fit_transform(X)

In [161]:
X_tf.shape, y.shape

((1000, 13), (1000,))

In [162]:
X_train, X_test, y_train, y_test=train_test_split(X_tf,y, train_size=0.75, shuffle=True, stratify=y, random_state=42 )

In [163]:
knn = KNeighborsClassifier(n_neighbors=7, weights='distance', metric='euclidean')
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.69      0.73        26
           1       0.83      0.38      0.53        13
           2       0.95      1.00      0.97       211

    accuracy                           0.93       250
   macro avg       0.86      0.69      0.74       250
weighted avg       0.93      0.93      0.92       250



In [164]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(activation='relu', solver='adam', learning_rate_init=0.001, max_iter=50, random_state=1729, early_stopping=True, hidden_layer_sizes=(32,16))
mlp.fit(X_train, y_train)
print(mlp.loss_)

0.24560424771504447


In [165]:
rfc = RandomForestClassifier(n_estimators=150, max_depth=12, max_features='sqrt', random_state=1729)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
y_prob = rfc.predict_proba(X_test)
print(roc_auc_score(y_test, y_prob, multi_class='ovo'))

0.9996658160165269


In [166]:
gbc = GradientBoostingClassifier(loss='log_loss', random_state=1729)
param_grid = {
    'n_estimators': [50,100,200],
    'max_depth': [2,3],
    'learning_rate': [0.01,0.05,0.1]
}
grid = GridSearchCV(gbc, param_grid)
grid.fit(X_train, y_train)
grid.best_score_

np.float64(0.9880000000000001)

In [167]:
svm = SVC(kernel='linear', C=0.5, random_state=1729)
svm.fit(X_train, y_train)
y_pred=svm.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[ 23,   0,   3],
       [  2,   3,   8],
       [  4,   0, 207]])

In [168]:
dtc = DecisionTreeClassifier(criterion='entropy', splitter='random', min_samples_split=6, random_state=1729)
dtc.fit(X_train, y_train)
y_pred=dtc.predict(X_test)
accuracy_score(y_test, y_pred)

0.96

In [171]:
pca  = PCA(n_components=0.95)
pca.fit_transform(X_train)
pca.transform(X_test)
n_com = pca.n_components_
print(n_com)

11


In [194]:
lr = LogisticRegression(penalty='l1', solver='liblinear', C=0.1, random_state=1729)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
precision_score(y_test, y_pred, pos_label=1, average='macro')

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


0.5650907577885266