In [33]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns


In [34]:
data = pd.read_csv('diabetes.csv')
data.head()  # Shows first 5 rows


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [35]:
# Check for missing values
data.isnull().sum()

# Replace zeros with median
cols = ['Glucose','BloodPressure','SkinThickness','Insulin','BMI']
for col in cols:
    data[col] = data[col].replace(0, data[col].median())

# Separate features and target
X = data.drop('Outcome', axis=1)
y = data['Outcome']

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [36]:
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_scaled, y)

# Check new class distribution
import numpy as np
np.bincount(y_res)


array([500, 500])

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)


In [38]:
# Logistic Regression
lr = LogisticRegression()
lr.fit(X_train, y_train)

# Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# XGBoost
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb.fit(X_train, y_train)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [39]:
models = {'Logistic Regression': lr, 'Random Forest': rf, 'XGBoost': xgb}

for name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"{name}:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))
    print("ROC-AUC:", roc_auc_score(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("-------------------------")


Logistic Regression:
Accuracy: 0.735
Precision: 0.7222222222222222
Recall: 0.7722772277227723
ROC-AUC: 0.7346234623462348
Confusion Matrix:
 [[69 30]
 [23 78]]
-------------------------
Random Forest:
Accuracy: 0.78
Precision: 0.7567567567567568
Recall: 0.8316831683168316
ROC-AUC: 0.7794779477947795
Confusion Matrix:
 [[72 27]
 [17 84]]
-------------------------
XGBoost:
Accuracy: 0.785
Precision: 0.7589285714285714
Recall: 0.8415841584158416
ROC-AUC: 0.7844284428442844
Confusion Matrix:
 [[72 27]
 [16 85]]
-------------------------


In [40]:
print(X.columns.tolist())


['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']


In [41]:
new_patient = [[120, 70, 30, 100, 25, 0, 33, 25]]  # Glucose, BP, Skin, Insulin, BMI, Pedigree, Age
new_patient_scaled = scaler.transform(new_patient)
prediction = xgb.predict(new_patient_scaled)
if prediction[0]==1:
    print("High risk of diabetes")
else:
    print("Low risk of diabetes")


Low risk of diabetes




In [42]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_auc_score

models = {'Logistic Regression': lr, 'Random Forest': rf, 'XGBoost': xgb}

for name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"{name}:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))
    print("ROC-AUC:", roc_auc_score(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("-------------------------")


Logistic Regression:
Accuracy: 0.735
Precision: 0.7222222222222222
Recall: 0.7722772277227723
ROC-AUC: 0.7346234623462348
Confusion Matrix:
 [[69 30]
 [23 78]]
-------------------------
Random Forest:
Accuracy: 0.78
Precision: 0.7567567567567568
Recall: 0.8316831683168316
ROC-AUC: 0.7794779477947795
Confusion Matrix:
 [[72 27]
 [17 84]]
-------------------------
XGBoost:
Accuracy: 0.785
Precision: 0.7589285714285714
Recall: 0.8415841584158416
ROC-AUC: 0.7844284428442844
Confusion Matrix:
 [[72 27]
 [16 85]]
-------------------------
