In [104]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from collections import Counter
from sklearn.model_selection import GridSearchCV


In [105]:
df = pd.read_csv("diabetes.csv")
print(df.head())
print("Shape:", df.shape)

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  
Shape: (768, 9)


In [106]:
print("Class Distribution:", Counter(df["Outcome"]))

Class Distribution: Counter({0: 500, 1: 268})


In [107]:
X = df.drop("Outcome", axis=1)
y = df["Outcome"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, stratify=y, random_state=42
)


In [108]:
weight = (y_train == 0).sum() / (y_train == 1).sum()

In [109]:
import xgboost as xgb
from xgboost import XGBClassifier

model_xgb = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=4,
    eval_metric='logloss',
    scale_pos_weight=500 / 268,  
    random_state=42
)

model_xgb.fit(X_train, y_train)


In [110]:
y_pred_xgb = model_xgb.predict(X_test)
print("🎯 XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))

🎯 XGBoost Accuracy: 0.7402597402597403
              precision    recall  f1-score   support

           0       0.82      0.77      0.79       100
           1       0.62      0.69      0.65        54

    accuracy                           0.74       154
   macro avg       0.72      0.73      0.72       154
weighted avg       0.75      0.74      0.74       154



In [111]:
model_ada = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=3),
    n_estimators=200,
    learning_rate=0.1,
    algorithm='SAMME'
)

model_ada.fit(X_train, y_train)
y_pred_ada = model_ada.predict(X_test)

print("🌿 AdaBoost Accuracy:", accuracy_score(y_test, y_pred_ada))
print(classification_report(y_test, y_pred_ada))


🌿 AdaBoost Accuracy: 0.7727272727272727
              precision    recall  f1-score   support

           0       0.80      0.86      0.83       100
           1       0.70      0.61      0.65        54

    accuracy                           0.77       154
   macro avg       0.75      0.74      0.74       154
weighted avg       0.77      0.77      0.77       154



In [112]:
print(type(model_xgb))
print(type(model_ada))


<class 'xgboost.sklearn.XGBClassifier'>
<class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>


In [113]:
print(y_train.shape)
print(np.unique(y_train))
print(type(y_train))


(614,)
[0 1]
<class 'pandas.core.series.Series'>


In [114]:
print(X_train.shape)
print(type(X_train))
print(X_train[:5]) 

(614, 8)
<class 'numpy.ndarray'>
[[-0.84488505 -0.96691063 -0.36733675 -0.53547548 -0.31952605 -0.6082704
   0.3265464  -0.78628618]
 [ 0.3429808   0.1597866   0.45982725  0.40544544 -0.50186686 -0.30366421
  -0.09929033  0.57511787]
 [-0.54791859 -0.49745345 -0.57412775  1.22091023  0.12330164  0.36900779
  -0.74559573 -0.70119842]
 [-0.84488505  0.7857295  -0.67752325 -1.28821221 -0.69289057 -0.29097229
   0.2782245  -0.36084741]
 [-1.14185152 -0.81042491 -0.26394125  1.15818217  0.2188135   1.60012447
  -0.319759   -0.95646168]]


In [115]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, classification_report
voting_clf = VotingClassifier(
    estimators=[
        ('xgb', model_xgb),
        ('ada', model_ada)
    ],
    voting='hard',
    weights=[1, 2]  
)

voting_clf.fit(X_train, y_train)
y_pred = voting_clf.predict(X_test)

print("✅ Voting Ensemble Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


✅ Voting Ensemble Accuracy: 0.7727272727272727
              precision    recall  f1-score   support

           0       0.80      0.86      0.83       100
           1       0.70      0.61      0.65        54

    accuracy                           0.77       154
   macro avg       0.75      0.74      0.74       154
weighted avg       0.77      0.77      0.77       154

