# Import Dataset

In [142]:
# Import all things here
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score, f1_score
import joblib

In [143]:
# Read the raw CSV into a DataFrame and preview the first rows.
# The path is relative, so the file must sit in the notebook's working directory.
dataset_path = 'diabetes indicator dataset.csv'
df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,3.0,5.0,30.0,0.0,1.0,4.0,6.0,8.0
1,0.0,1.0,1.0,1.0,26.0,1.0,1.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,1.0,12.0,6.0,8.0
2,0.0,0.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,10.0,0.0,1.0,13.0,6.0,8.0
3,0.0,1.0,1.0,1.0,28.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,3.0,0.0,3.0,0.0,1.0,11.0,6.0,8.0
4,0.0,0.0,0.0,1.0,29.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,8.0,5.0,8.0


In [144]:
# Print the column names, dtypes and non-null counts to sanity-check the loaded schema.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70692 entries, 0 to 70691
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Diabetes_binary       70692 non-null  float64
 1   HighBP                70692 non-null  float64
 2   HighChol              70692 non-null  float64
 3   CholCheck             70692 non-null  float64
 4   BMI                   70692 non-null  float64
 5   Smoker                70692 non-null  float64
 6   Stroke                70692 non-null  float64
 7   HeartDiseaseorAttack  70692 non-null  float64
 8   PhysActivity          70692 non-null  float64
 9   Fruits                70692 non-null  float64
 10  Veggies               70692 non-null  float64
 11  HvyAlcoholConsump     70692 non-null  float64
 12  AnyHealthcare         70692 non-null  float64
 13  NoDocbcCost           70692 non-null  float64
 14  GenHlth               70692 non-null  float64
 15  MentHlth           

# Data Prep

In [145]:
# Count rows that are exact copies of an earlier row (all columns equal).
n_duplicates = df.duplicated().sum()
print(f"Jumlah duplikasi data: {n_duplicates}")

Jumlah duplikasi data: 1635


In [146]:
# Remove exact duplicate rows and rebuild a contiguous 0..n-1 index.
# Rebinding `df` (rather than mutating in place) keeps the step a single chained expression.
df = df.drop_duplicates().reset_index(drop=True)

# Re-count to confirm that no duplicates remain.
n_duplicates_left = df.duplicated().sum()
print(f"Jumlah duplikasi data setelah dihapus: {n_duplicates_left}")

Jumlah duplikasi data setelah dihapus: 0


In [147]:
# Total number of missing cells across the whole frame (per-column counts, then summed).
n_missing = df.isna().sum().sum()
print(f"Jumlah missing values: {n_missing}")

Jumlah missing values: 0


In [148]:
# Separate the predictors from the label column.
target_col = 'Diabetes_binary'
X = df.drop(columns=[target_col])
y = df[target_col]

# Hold out 20% of the rows for testing; stratify on the label so both splits
# keep the same class proportions, and fix the seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Data Modeling

In [149]:
# Base estimator for the hyper-parameter search.
# It is bound to its own name rather than `xgb`: rebinding `xgb` would overwrite
# the xgboost module alias from the import cell, and running this cell a second
# time would then fail with AttributeError on `xgb.XGBClassifier`.
xgb_estimator = xgb.XGBClassifier(
    objective='binary:logistic',
    random_state=42,
    eval_metric='logloss'
)

# Candidate values the randomized search samples from (passed as
# `param_distributions` below).
param_grid_xgb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'gamma': [0, 0.1, 0.2],        # minimum loss reduction required to make a split
    'reg_alpha': [0, 0.005, 0.01]  # L1 regularization strength
}

# Shuffled, stratified 5-fold cross-validation with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Randomized search: 25 sampled candidates x 5 folds, ranked by macro-averaged F1.
# n_jobs=-1 uses all available cores; verbose=1 prints the fit summary.
xgb_clf = RandomizedSearchCV(
    estimator=xgb_estimator,
    param_distributions=param_grid_xgb,
    n_iter=25,
    cv=skf,
    scoring='f1_macro',
    random_state=42,
    n_jobs=-1,
    verbose=1
)

# Run the search on the training split only; the test split stays untouched.
xgb_clf.fit(X_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


In [150]:
# Keep the best estimator found by the search for evaluation and export.
model_xgb = xgb_clf.best_estimator_

# Report the winning hyper-parameters and their cross-validated score.
best_params = xgb_clf.best_params_
best_cv_score = xgb_clf.best_score_
print("Best Parameters:", best_params)
print("Best F1 Score (Macro):", best_cv_score)

Best Parameters: {'subsample': 0.7, 'reg_alpha': 0.005, 'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.05, 'gamma': 0, 'colsample_bytree': 0.8}
Best F1 Score (Macro): 0.7495990082586085


In [151]:
# Hard class predictions on the held-out test split, used by the metrics below.
y_pred = model_xgb.predict(X_test)
# Per-class probability estimates for the same rows.
y_pred_proba = model_xgb.predict_proba(X_test)

In [152]:
# Compute the evaluation metrics first, then print them together.
# NOTE(review): the "Validation Accuracy" label below is computed against y_test,
# i.e. the held-out test split, not a separate validation set.
report_text = classification_report(y_test, y_pred)
macro_f1 = f1_score(y_test, y_pred, average='macro')
accuracy = accuracy_score(y_test, y_pred)

print("Classification Report:\n", report_text)
print("F1 Score (Macro):", macro_f1)
print("Validation Accuracy:", accuracy)

Classification Report:
               precision    recall  f1-score   support

         0.0       0.77      0.70      0.73      6792
         1.0       0.73      0.80      0.77      7020

    accuracy                           0.75     13812
   macro avg       0.75      0.75      0.75     13812
weighted avg       0.75      0.75      0.75     13812

F1 Score (Macro): 0.7489634579559415
Validation Accuracy: 0.7500724008108891


In [153]:
# Persist the tuned estimator to disk; joblib.dump returns the list of files
# written, which is shown as the cell output.
model_path = 'diabetes_model.joblib'
joblib.dump(model_xgb, model_path)

['diabetes_model.joblib']