# Random Forest

In [14]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_predict
from sklearn.pipeline import Pipeline
import numpy as np

# Baseline experiment: Random Forest without resampling, scored on the
# out-of-fold predictions of a 5-fold stratified cross-validation.

# Load dataset
url = 'https://raw.githubusercontent.com/adeeeehidayat/diabetes-health-indicators-dataset/main/diabetes_012_health_indicators_BRFSS2015.csv'
df = pd.read_csv(url)

# Data preprocessing: drop rows with missing values and exact duplicates, then
# merge target values 1 and 2 into a single positive label (1) and rename the
# target column to reflect that it is now binary.
df = df.dropna()
df = df.drop_duplicates()
df['Diabetes_012'] = df['Diabetes_012'].replace({1: 1, 2: 1})
df = df.rename(columns={'Diabetes_012': 'Diabetes'})

# Split features and target
X = df.drop(columns=['Diabetes'])
y = df['Diabetes']

# Random Forest model
rf_model = RandomForestClassifier(random_state=42, n_estimators=100, n_jobs=-1)

# Scaling is a pipeline step so MinMaxScaler is re-fitted on the training part
# of every fold. Fitting it once on the full data (as before) lets statistics
# of the validation rows influence preprocessing and makes this baseline use a
# different protocol than the resampling pipelines it is compared against.
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),  # Scaling
    ('rf', rf_model)  # Random Forest model
])

# Stratified K-Fold Cross Validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# One prediction per row, each made by a model that did not train on that row
y_pred_cv = cross_val_predict(pipeline, X, y, cv=skf)

# Evaluation metrics (precision/recall/F1 refer to the positive label 1)
accuracy_cv = accuracy_score(y, y_pred_cv)
precision_cv = precision_score(y, y_pred_cv)
recall_cv = recall_score(y, y_pred_cv)
f1_cv = f1_score(y, y_pred_cv)

# Confusion matrix; for two classes ravel() yields tn, fp, fn, tp
cm_cv = confusion_matrix(y, y_pred_cv)
tn_cv, fp_cv, fn_cv, tp_cv = cm_cv.ravel()
specificity_cv = tn_cv / (tn_cv + fp_cv)  # true-negative rate

# Print evaluation results
print("=== Evaluasi Model dengan Cross-Validation ===")
print(f"Accuracy    : {accuracy_cv:.4f}")
print(f"Precision   : {precision_cv:.4f}")
print(f"Recall      : {recall_cv:.4f}")
print(f"F1 Score    : {f1_cv:.4f}")
print(f"Specificity : {specificity_cv:.4f}")

print("\n=== Classification Report (Cross-Validation) ===")
print(classification_report(y, y_pred_cv))

print("\n=== Confusion Matrix (Cross-Validation) ===")
print(cm_cv)


=== Evaluasi Model dengan Cross-Validation ===
Accuracy    : 0.8253
Precision   : 0.4873
Recall      : 0.2058
F1 Score    : 0.2894
Specificity : 0.9547

=== Classification Report (Cross-Validation) ===
              precision    recall  f1-score   support

         0.0       0.85      0.95      0.90    190055
         1.0       0.49      0.21      0.29     39726

    accuracy                           0.83    229781
   macro avg       0.67      0.58      0.59    229781
weighted avg       0.79      0.83      0.79    229781


=== Confusion Matrix (Cross-Validation) ===
[[181453   8602]
 [ 31550   8176]]


# Random Forest + SMOTE

In [19]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

# Load dataset and preprocess it in one chain: remove rows with missing
# values, remove duplicated rows and give the target column its binary name.
url = 'https://raw.githubusercontent.com/adeeeehidayat/diabetes-health-indicators-dataset/main/diabetes_012_health_indicators_BRFSS2015.csv'
df = (
    pd.read_csv(url)
    .dropna()
    .drop_duplicates()
    .rename(columns={'Diabetes_012': 'Diabetes'})
)
# Target values 1 and 2 both become the positive label 1
df['Diabetes'] = df['Diabetes'].replace({1: 1, 2: 1})

# Target first, then every remaining column as a feature
y = df['Diabetes']
X = df.drop(columns=['Diabetes'])

# Pipeline: scaling -> SMOTE oversampling -> Random Forest
pipeline = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('smote', SMOTE(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42, n_estimators=100, n_jobs=-1)),
])

# Out-of-fold predictions from a shuffled, seeded 5-fold stratified split
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(pipeline, X, y, cv=skf, n_jobs=-1)

# Confusion matrix and the specificity (true-negative rate) derived from it
cm_cv = confusion_matrix(y, y_pred_cv)
tn_cv, fp_cv, fn_cv, tp_cv = cm_cv.ravel()
specificity_cv = tn_cv / (tn_cv + fp_cv)

# Remaining metrics, all computed from the same predictions
accuracy_cv, precision_cv, recall_cv, f1_cv = (
    scorer(y, y_pred_cv)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report: summary table, per-class report, raw confusion matrix
print("\n=== Evaluasi Model dengan Cross-Validation dan SMOTE ===")
for label, value in (
    ("Accuracy    ", accuracy_cv),
    ("Precision   ", precision_cv),
    ("Recall      ", recall_cv),
    ("F1 Score    ", f1_cv),
    ("Specificity ", specificity_cv),
):
    print(f"{label}: {value:.4f}")

print("\n=== Classification Report (Cross-Validation) ===")
print(classification_report(y, y_pred_cv))

print("\n=== Confusion Matrix (Cross-Validation) ===")
print(cm_cv)



=== Evaluasi Model dengan Cross-Validation dan SMOTE ===
Accuracy    : 0.8149
Precision   : 0.4490
Recall      : 0.3102
F1 Score    : 0.3669
Specificity : 0.9204

=== Classification Report (Cross-Validation) ===
              precision    recall  f1-score   support

         0.0       0.86      0.92      0.89    190055
         1.0       0.45      0.31      0.37     39726

    accuracy                           0.81    229781
   macro avg       0.66      0.62      0.63    229781
weighted avg       0.79      0.81      0.80    229781


=== Confusion Matrix (Cross-Validation) ===
[[174931  15124]
 [ 27402  12324]]


# Random Forest + ENN

In [20]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import EditedNearestNeighbours

# Load dataset
url = 'https://raw.githubusercontent.com/adeeeehidayat/diabetes-health-indicators-dataset/main/diabetes_012_health_indicators_BRFSS2015.csv'
df = pd.read_csv(url)

# Data preprocessing: drop rows with missing values and exact duplicates, then
# merge target values 1 and 2 into a single positive label (1).
df = df.dropna()
df = df.drop_duplicates()
df['Diabetes_012'] = df['Diabetes_012'].replace({1: 1, 2: 1})
df = df.rename(columns={'Diabetes_012': 'Diabetes'})

# Split features and target
X = df.drop(columns=['Diabetes'])
y = df['Diabetes']

# Define pipeline; every step is re-fitted on the training part of each fold
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),  # Scaling
    ('enn', EditedNearestNeighbours()),  # Undersampling
    ('rf', RandomForestClassifier(random_state=42, n_estimators=100, n_jobs=-1))  # Random Forest model
])

# Perform stratified cross-validation with cross_val_predict
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(pipeline, X, y, cv=skf, n_jobs=-1)

# Evaluation metrics (precision/recall/F1 refer to the positive label 1)
accuracy_cv = accuracy_score(y, y_pred_cv)
precision_cv = precision_score(y, y_pred_cv)
recall_cv = recall_score(y, y_pred_cv)
f1_cv = f1_score(y, y_pred_cv)

# Confusion matrix; for two classes ravel() yields tn, fp, fn, tp
cm_cv = confusion_matrix(y, y_pred_cv)
tn_cv, fp_cv, fn_cv, tp_cv = cm_cv.ravel()
specificity_cv = tn_cv / (tn_cv + fp_cv)  # true-negative rate

# Print evaluation results. The header names ENN, the sampler actually used in
# this cell (it previously said SMOTE, copied from the SMOTE experiment).
print("\n=== Evaluasi Model dengan Cross-Validation dan ENN ===")
print(f"Accuracy    : {accuracy_cv:.4f}")
print(f"Precision   : {precision_cv:.4f}")
print(f"Recall      : {recall_cv:.4f}")
print(f"F1 Score    : {f1_cv:.4f}")
print(f"Specificity : {specificity_cv:.4f}")

print("\n=== Classification Report (Cross-Validation) ===")
print(classification_report(y, y_pred_cv))

print("\n=== Confusion Matrix (Cross-Validation) ===")
print(cm_cv)



=== Evaluasi Model dengan Cross-Validation dan SMOTE ===
Accuracy    : 0.7731
Precision   : 0.3948
Recall      : 0.5864
F1 Score    : 0.4719
Specificity : 0.8121

=== Classification Report (Cross-Validation) ===
              precision    recall  f1-score   support

         0.0       0.90      0.81      0.86    190055
         1.0       0.39      0.59      0.47     39726

    accuracy                           0.77    229781
   macro avg       0.65      0.70      0.66    229781
weighted avg       0.82      0.77      0.79    229781


=== Confusion Matrix (Cross-Validation) ===
[[154339  35716]
 [ 16429  23297]]


# Random Forest + SMOTE-ENN

In [21]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTEENN

# Load dataset
url = 'https://raw.githubusercontent.com/adeeeehidayat/diabetes-health-indicators-dataset/main/diabetes_012_health_indicators_BRFSS2015.csv'
df = pd.read_csv(url)

# Data preprocessing: drop rows with missing values and exact duplicates, then
# merge target values 1 and 2 into a single positive label (1).
df = df.dropna()
df = df.drop_duplicates()
df['Diabetes_012'] = df['Diabetes_012'].replace({1: 1, 2: 1})
df = df.rename(columns={'Diabetes_012': 'Diabetes'})

# Split features and target
X = df.drop(columns=['Diabetes'])
y = df['Diabetes']

# Define pipeline; every step is re-fitted on the training part of each fold
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),  # Scaling
    ('smoteenn', SMOTEENN(random_state=42)),  # Hybrid
    ('rf', RandomForestClassifier(random_state=42, n_estimators=100, n_jobs=-1))  # Random Forest model
])

# Perform stratified cross-validation with cross_val_predict
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(pipeline, X, y, cv=skf, n_jobs=-1)

# Evaluation metrics (precision/recall/F1 refer to the positive label 1)
accuracy_cv = accuracy_score(y, y_pred_cv)
precision_cv = precision_score(y, y_pred_cv)
recall_cv = recall_score(y, y_pred_cv)
f1_cv = f1_score(y, y_pred_cv)

# Confusion matrix; for two classes ravel() yields tn, fp, fn, tp
cm_cv = confusion_matrix(y, y_pred_cv)
tn_cv, fp_cv, fn_cv, tp_cv = cm_cv.ravel()
specificity_cv = tn_cv / (tn_cv + fp_cv)  # true-negative rate

# Print evaluation results. The header names SMOTE-ENN, the sampler actually
# used in this cell (it previously said SMOTE, copied from another experiment).
print("\n=== Evaluasi Model dengan Cross-Validation dan SMOTE-ENN ===")
print(f"Accuracy    : {accuracy_cv:.4f}")
print(f"Precision   : {precision_cv:.4f}")
print(f"Recall      : {recall_cv:.4f}")
print(f"F1 Score    : {f1_cv:.4f}")
print(f"Specificity : {specificity_cv:.4f}")

print("\n=== Classification Report (Cross-Validation) ===")
print(classification_report(y, y_pred_cv))

print("\n=== Confusion Matrix (Cross-Validation) ===")
print(cm_cv)



=== Evaluasi Model dengan Cross-Validation dan SMOTE ===
Accuracy    : 0.7576
Precision   : 0.3804
Recall      : 0.6396
F1 Score    : 0.4771
Specificity : 0.7823

=== Classification Report (Cross-Validation) ===
              precision    recall  f1-score   support

         0.0       0.91      0.78      0.84    190055
         1.0       0.38      0.64      0.48     39726

    accuracy                           0.76    229781
   macro avg       0.65      0.71      0.66    229781
weighted avg       0.82      0.76      0.78    229781


=== Confusion Matrix (Cross-Validation) ===
[[148671  41384]
 [ 14316  25410]]


# Random Forest + Parameter Tuning

In [22]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_predict
from sklearn.pipeline import Pipeline
import numpy as np

# Random Forest with hand-picked depth/leaf constraints and no resampling,
# scored on the out-of-fold predictions of a 5-fold stratified CV.

# Load dataset
url = 'https://raw.githubusercontent.com/adeeeehidayat/diabetes-health-indicators-dataset/main/diabetes_012_health_indicators_BRFSS2015.csv'
df = pd.read_csv(url)

# Data preprocessing: drop rows with missing values and exact duplicates, then
# merge target values 1 and 2 into a single positive label (1).
df = df.dropna()
df = df.drop_duplicates()
df['Diabetes_012'] = df['Diabetes_012'].replace({1: 1, 2: 1})
df = df.rename(columns={'Diabetes_012': 'Diabetes'})

# Split features and target
X = df.drop(columns=['Diabetes'])
y = df['Diabetes']

# Random Forest model with the tuned tree-size parameters
rf_model = RandomForestClassifier(random_state=42, n_estimators=100, n_jobs=-1, max_depth=30, min_samples_split=10, min_samples_leaf=6)

# Scaling is a pipeline step so MinMaxScaler is re-fitted on the training part
# of every fold rather than once on the full data, which would let validation
# rows influence preprocessing and diverge from the resampling pipelines.
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),  # Scaling
    ('rf', rf_model)  # Random Forest model
])

# Stratified K-Fold Cross Validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# One prediction per row, each made by a model that did not train on that row
y_pred_cv = cross_val_predict(pipeline, X, y, cv=skf)

# Evaluation metrics (precision/recall/F1 refer to the positive label 1)
accuracy_cv = accuracy_score(y, y_pred_cv)
precision_cv = precision_score(y, y_pred_cv)
recall_cv = recall_score(y, y_pred_cv)
f1_cv = f1_score(y, y_pred_cv)

# Confusion matrix; for two classes ravel() yields tn, fp, fn, tp
cm_cv = confusion_matrix(y, y_pred_cv)
tn_cv, fp_cv, fn_cv, tp_cv = cm_cv.ravel()
specificity_cv = tn_cv / (tn_cv + fp_cv)  # true-negative rate

# Print evaluation results
print("=== Evaluasi Model dengan Cross-Validation ===")
print(f"Accuracy    : {accuracy_cv:.4f}")
print(f"Precision   : {precision_cv:.4f}")
print(f"Recall      : {recall_cv:.4f}")
print(f"F1 Score    : {f1_cv:.4f}")
print(f"Specificity : {specificity_cv:.4f}")

print("\n=== Classification Report (Cross-Validation) ===")
print(classification_report(y, y_pred_cv))

print("\n=== Confusion Matrix (Cross-Validation) ===")
print(cm_cv)


=== Evaluasi Model dengan Cross-Validation ===
Accuracy    : 0.8365
Precision   : 0.5941
Recall      : 0.1716
F1 Score    : 0.2663
Specificity : 0.9755

=== Classification Report (Cross-Validation) ===
              precision    recall  f1-score   support

         0.0       0.85      0.98      0.91    190055
         1.0       0.59      0.17      0.27     39726

    accuracy                           0.84    229781
   macro avg       0.72      0.57      0.59    229781
weighted avg       0.81      0.84      0.80    229781


=== Confusion Matrix (Cross-Validation) ===
[[185399   4656]
 [ 32910   6816]]


# Random Forest + SMOTE + Parameter Tuning

In [23]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

# Load dataset and preprocess it in one chain: remove rows with missing
# values, remove duplicated rows and give the target column its binary name.
url = 'https://raw.githubusercontent.com/adeeeehidayat/diabetes-health-indicators-dataset/main/diabetes_012_health_indicators_BRFSS2015.csv'
df = (
    pd.read_csv(url)
    .dropna()
    .drop_duplicates()
    .rename(columns={'Diabetes_012': 'Diabetes'})
)
# Target values 1 and 2 both become the positive label 1
df['Diabetes'] = df['Diabetes'].replace({1: 1, 2: 1})

# Target first, then every remaining column as a feature
y = df['Diabetes']
X = df.drop(columns=['Diabetes'])

# Pipeline: scaling -> SMOTE oversampling -> size-constrained Random Forest
pipeline = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('smote', SMOTE(random_state=42)),
    ('rf', RandomForestClassifier(
        random_state=42,
        n_estimators=100,
        n_jobs=-1,
        max_depth=30,
        min_samples_split=10,
        min_samples_leaf=6,
    )),
])

# Out-of-fold predictions from a shuffled, seeded 5-fold stratified split
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(pipeline, X, y, cv=skf, n_jobs=-1)

# Confusion matrix and the specificity (true-negative rate) derived from it
cm_cv = confusion_matrix(y, y_pred_cv)
tn_cv, fp_cv, fn_cv, tp_cv = cm_cv.ravel()
specificity_cv = tn_cv / (tn_cv + fp_cv)

# Remaining metrics, all computed from the same predictions
accuracy_cv, precision_cv, recall_cv, f1_cv = (
    scorer(y, y_pred_cv)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report: summary table, per-class report, raw confusion matrix
print("\n=== Evaluasi Model dengan Cross-Validation dan SMOTE ===")
for label, value in (
    ("Accuracy    ", accuracy_cv),
    ("Precision   ", precision_cv),
    ("Recall      ", recall_cv),
    ("F1 Score    ", f1_cv),
    ("Specificity ", specificity_cv),
):
    print(f"{label}: {value:.4f}")

print("\n=== Classification Report (Cross-Validation) ===")
print(classification_report(y, y_pred_cv))

print("\n=== Confusion Matrix (Cross-Validation) ===")
print(cm_cv)



=== Evaluasi Model dengan Cross-Validation dan SMOTE ===
Accuracy    : 0.8149
Precision   : 0.4614
Recall      : 0.4237
F1 Score    : 0.4417
Specificity : 0.8966

=== Classification Report (Cross-Validation) ===
              precision    recall  f1-score   support

         0.0       0.88      0.90      0.89    190055
         1.0       0.46      0.42      0.44     39726

    accuracy                           0.81    229781
   macro avg       0.67      0.66      0.67    229781
weighted avg       0.81      0.81      0.81    229781


=== Confusion Matrix (Cross-Validation) ===
[[170409  19646]
 [ 22896  16830]]


# Random Forest + ENN + Parameter Tuning

In [24]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import EditedNearestNeighbours

# Load dataset
url = 'https://raw.githubusercontent.com/adeeeehidayat/diabetes-health-indicators-dataset/main/diabetes_012_health_indicators_BRFSS2015.csv'
df = pd.read_csv(url)

# Data preprocessing: drop rows with missing values and exact duplicates, then
# merge target values 1 and 2 into a single positive label (1).
df = df.dropna()
df = df.drop_duplicates()
df['Diabetes_012'] = df['Diabetes_012'].replace({1: 1, 2: 1})
df = df.rename(columns={'Diabetes_012': 'Diabetes'})

# Split features and target
X = df.drop(columns=['Diabetes'])
y = df['Diabetes']

# Define pipeline; every step is re-fitted on the training part of each fold
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),  # Scaling
    ('enn', EditedNearestNeighbours()),  # Undersampling
    ('rf', RandomForestClassifier(random_state=42, n_estimators=100, n_jobs=-1, max_depth=30, min_samples_split=10, min_samples_leaf=6))  # Random Forest model
])

# Perform stratified cross-validation with cross_val_predict
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(pipeline, X, y, cv=skf, n_jobs=-1)

# Evaluation metrics (precision/recall/F1 refer to the positive label 1)
accuracy_cv = accuracy_score(y, y_pred_cv)
precision_cv = precision_score(y, y_pred_cv)
recall_cv = recall_score(y, y_pred_cv)
f1_cv = f1_score(y, y_pred_cv)

# Confusion matrix; for two classes ravel() yields tn, fp, fn, tp
cm_cv = confusion_matrix(y, y_pred_cv)
tn_cv, fp_cv, fn_cv, tp_cv = cm_cv.ravel()
specificity_cv = tn_cv / (tn_cv + fp_cv)  # true-negative rate

# Print evaluation results. The header names ENN, the sampler actually used in
# this cell (it previously said SMOTE, copied from the SMOTE experiment).
print("\n=== Evaluasi Model dengan Cross-Validation dan ENN ===")
print(f"Accuracy    : {accuracy_cv:.4f}")
print(f"Precision   : {precision_cv:.4f}")
print(f"Recall      : {recall_cv:.4f}")
print(f"F1 Score    : {f1_cv:.4f}")
print(f"Specificity : {specificity_cv:.4f}")

print("\n=== Classification Report (Cross-Validation) ===")
print(classification_report(y, y_pred_cv))

print("\n=== Confusion Matrix (Cross-Validation) ===")
print(cm_cv)



=== Evaluasi Model dengan Cross-Validation dan SMOTE ===
Accuracy    : 0.7891
Precision   : 0.4194
Recall      : 0.5719
F1 Score    : 0.4839
Specificity : 0.8345

=== Classification Report (Cross-Validation) ===
              precision    recall  f1-score   support

         0.0       0.90      0.83      0.87    190055
         1.0       0.42      0.57      0.48     39726

    accuracy                           0.79    229781
   macro avg       0.66      0.70      0.68    229781
weighted avg       0.82      0.79      0.80    229781


=== Confusion Matrix (Cross-Validation) ===
[[158607  31448]
 [ 17007  22719]]


# Random Forest + SMOTE-ENN + Parameter Tuning

In [14]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTEENN

# Load dataset
url = 'https://raw.githubusercontent.com/adeeeehidayat/diabetes-health-indicators-dataset/main/diabetes_012_health_indicators_BRFSS2015.csv'
df = pd.read_csv(url)

# Data preprocessing: drop rows with missing values and exact duplicates, then
# merge target values 1 and 2 into a single positive label (1).
df = df.dropna()
df = df.drop_duplicates()
df['Diabetes_012'] = df['Diabetes_012'].replace({1: 1, 2: 1})
df = df.rename(columns={'Diabetes_012': 'Diabetes'})

# Split features and target
X = df.drop(columns=['Diabetes'])
y = df['Diabetes']

# Define pipeline; every step is re-fitted on the training part of each fold
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),  # Scaling
    ('smoteenn', SMOTEENN(random_state=42)),  # Hybrid
    ('rf', RandomForestClassifier(random_state=42, n_estimators=100, n_jobs=-1, max_depth=30, min_samples_split=10, min_samples_leaf=6))  # Random Forest model
])

# Perform stratified cross-validation with cross_val_predict
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(pipeline, X, y, cv=skf, n_jobs=-1)

# Evaluation metrics (precision/recall/F1 refer to the positive label 1)
accuracy_cv = accuracy_score(y, y_pred_cv)
precision_cv = precision_score(y, y_pred_cv)
recall_cv = recall_score(y, y_pred_cv)
f1_cv = f1_score(y, y_pred_cv)

# Confusion matrix; for two classes ravel() yields tn, fp, fn, tp
cm_cv = confusion_matrix(y, y_pred_cv)
tn_cv, fp_cv, fn_cv, tp_cv = cm_cv.ravel()
specificity_cv = tn_cv / (tn_cv + fp_cv)  # true-negative rate

# Print evaluation results. The header names SMOTE-ENN, the sampler actually
# used in this cell (it previously said SMOTE, copied from another experiment).
print("\n=== Evaluasi Model dengan Cross-Validation dan SMOTE-ENN ===")
print(f"Accuracy    : {accuracy_cv:.4f}")
print(f"Precision   : {precision_cv:.4f}")
print(f"Recall      : {recall_cv:.4f}")
print(f"F1 Score    : {f1_cv:.4f}")
print(f"Specificity : {specificity_cv:.4f}")

print("\n=== Classification Report (Cross-Validation) ===")
print(classification_report(y, y_pred_cv))

print("\n=== Confusion Matrix (Cross-Validation) ===")
print(cm_cv)



=== Evaluasi Model dengan Cross-Validation dan SMOTE ===
Accuracy    : 0.7380
Precision   : 0.3660
Recall      : 0.7043
F1 Score    : 0.4817
Specificity : 0.7450

=== Classification Report (Cross-Validation) ===
              precision    recall  f1-score   support

         0.0       0.92      0.75      0.82    190055
         1.0       0.37      0.70      0.48     39726

    accuracy                           0.74    229781
   macro avg       0.64      0.72      0.65    229781
weighted avg       0.83      0.74      0.77    229781


=== Confusion Matrix (Cross-Validation) ===
[[141596  48459]
 [ 11748  27978]]


# Hyperparameter Tuning Random Search

In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_predict, RandomizedSearchCV
from sklearn.pipeline import Pipeline
import numpy as np
from scipy.stats import randint

# Randomized hyperparameter search for the Random Forest (optimising recall),
# followed by an out-of-fold evaluation of the winning configuration.

# Load dataset
url = 'https://raw.githubusercontent.com/adeeeehidayat/diabetes-health-indicators-dataset/main/diabetes_012_health_indicators_BRFSS2015.csv'
df = pd.read_csv(url)

# Data preprocessing: drop rows with missing values and exact duplicates, then
# merge target values 1 and 2 into a single positive label (1).
df = df.dropna()
df = df.drop_duplicates()
df['Diabetes_012'] = df['Diabetes_012'].replace({1: 1, 2: 1})
df = df.rename(columns={'Diabetes_012': 'Diabetes'})

# Split features and target
X = df.drop(columns=['Diabetes'])
y = df['Diabetes']

# Stratified K-Fold Cross Validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define Random Forest model (n_estimators is overridden by the search below)
rf_model = RandomForestClassifier(random_state=42, n_estimators=100, n_jobs=-1)

# Scaling is a pipeline step so MinMaxScaler is re-fitted on the training part
# of every fold, both inside the search and in the final evaluation, instead
# of once on the full data before any split is made.
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),  # Scaling
    ('rf', rf_model)  # Random Forest model
])

# Parameter distributions; the 'rf__' prefix routes each entry to the forest
# step. scipy's randint(low, high) excludes `high`.
param_dist = {
    'rf__n_estimators': randint(100, 1000),  # integers 100..999
    'rf__max_depth': [None, 10, 20, 30, 40, 50],
    'rf__min_samples_split': randint(2, 20),  # integers 2..19
    'rf__min_samples_leaf': randint(1, 20),  # integers 1..19
    'rf__bootstrap': [True, False]
}

# Set up RandomizedSearchCV with a specified number of iterations
random_search = RandomizedSearchCV(pipeline, param_distributions=param_dist,
                                   n_iter=10, cv=skf, scoring='recall',
                                   n_jobs=-1, verbose=1, random_state=42)

# Fit RandomizedSearchCV
random_search.fit(X, y)

# Best pipeline found by the search, and the forest inside it
best_pipeline = random_search.best_estimator_
best_rf_model = best_pipeline.named_steps['rf']

# Perform cross-validation with the best configuration.
# NOTE: this reuses the folds the search selected on, so the scores below are
# optimistic relative to an evaluation on data the search never saw.
y_pred_cv = cross_val_predict(best_pipeline, X, y, cv=skf)

# Evaluation metrics (precision/recall/F1 refer to the positive label 1)
accuracy_cv = accuracy_score(y, y_pred_cv)
precision_cv = precision_score(y, y_pred_cv)
recall_cv = recall_score(y, y_pred_cv)
f1_cv = f1_score(y, y_pred_cv)

# Confusion matrix; for two classes ravel() yields tn, fp, fn, tp
cm_cv = confusion_matrix(y, y_pred_cv)
tn_cv, fp_cv, fn_cv, tp_cv = cm_cv.ravel()
specificity_cv = tn_cv / (tn_cv + fp_cv)  # true-negative rate

# Print evaluation results
print("=== Evaluasi Model dengan RandomizedSearchCV ===")
print(f"Accuracy    : {accuracy_cv:.4f}")
print(f"Precision   : {precision_cv:.4f}")
print(f"Recall      : {recall_cv:.4f}")
print(f"F1 Score    : {f1_cv:.4f}")
print(f"Specificity : {specificity_cv:.4f}")

print("\n=== Classification Report (Cross-Validation) ===")
print(classification_report(y, y_pred_cv))

print("\n=== Confusion Matrix (Cross-Validation) ===")
print(cm_cv)

# Strip the pipeline step prefix so the printed names are plain
# RandomForestClassifier arguments, as before.
best_params = {name.split('__', 1)[1]: value
               for name, value in random_search.best_params_.items()}

print("\n=== Best Hyperparameters from RandomizedSearchCV ===")
print(best_params)


Fitting 5 folds for each of 10 candidates, totalling 50 fits




=== Evaluasi Model dengan RandomizedSearchCV ===
Accuracy    : 0.8349
Precision   : 0.5666
Recall      : 0.1913
F1 Score    : 0.2861
Specificity : 0.9694

=== Classification Report (Cross-Validation) ===
              precision    recall  f1-score   support

         0.0       0.85      0.97      0.91    190055
         1.0       0.57      0.19      0.29     39726

    accuracy                           0.83    229781
   macro avg       0.71      0.58      0.60    229781
weighted avg       0.80      0.83      0.80    229781


=== Confusion Matrix (Cross-Validation) ===
[[184241   5814]
 [ 32125   7601]]

=== Best Hyperparameters from RandomizedSearchCV ===
{'bootstrap': False, 'max_depth': None, 'min_samples_leaf': 3, 'min_samples_split': 6, 'n_estimators': 918}


# Random Forest + Best Parameter Random Search

In [9]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_predict
from sklearn.pipeline import Pipeline
import numpy as np

# Random Forest configured with the best parameters reported by the random
# search, no resampling, scored on out-of-fold predictions of a 5-fold CV.

# Load dataset
url = 'https://raw.githubusercontent.com/adeeeehidayat/diabetes-health-indicators-dataset/main/diabetes_012_health_indicators_BRFSS2015.csv'
df = pd.read_csv(url)

# Data preprocessing: drop rows with missing values and exact duplicates, then
# merge target values 1 and 2 into a single positive label (1).
df = df.dropna()
df = df.drop_duplicates()
df['Diabetes_012'] = df['Diabetes_012'].replace({1: 1, 2: 1})
df = df.rename(columns={'Diabetes_012': 'Diabetes'})

# Split features and target
X = df.drop(columns=['Diabetes'])
y = df['Diabetes']

# Random Forest model. random_state is fixed (as in the search that produced
# these parameters) so repeated runs of this cell give the same metrics.
rf_model = RandomForestClassifier(random_state=42, bootstrap=False, max_depth=None, min_samples_leaf=3, min_samples_split=6, n_estimators=918, n_jobs=-1)

# Scaling is a pipeline step so MinMaxScaler is re-fitted on the training part
# of every fold rather than once on the full data before the split.
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),  # Scaling
    ('rf', rf_model)  # Random Forest model
])

# Stratified K-Fold Cross Validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# One prediction per row, each made by a model that did not train on that row
y_pred_cv = cross_val_predict(pipeline, X, y, cv=skf, n_jobs=-1)

# Evaluation metrics (precision/recall/F1 refer to the positive label 1)
accuracy_cv = accuracy_score(y, y_pred_cv)
precision_cv = precision_score(y, y_pred_cv)
recall_cv = recall_score(y, y_pred_cv)
f1_cv = f1_score(y, y_pred_cv)

# Confusion matrix; for two classes ravel() yields tn, fp, fn, tp
cm_cv = confusion_matrix(y, y_pred_cv)
tn_cv, fp_cv, fn_cv, tp_cv = cm_cv.ravel()
specificity_cv = tn_cv / (tn_cv + fp_cv)  # true-negative rate

# Print evaluation results
print("=== Evaluasi Model dengan Cross-Validation ===")
print(f"Accuracy    : {accuracy_cv:.4f}")
print(f"Precision   : {precision_cv:.4f}")
print(f"Recall      : {recall_cv:.4f}")
print(f"F1 Score    : {f1_cv:.4f}")
print(f"Specificity : {specificity_cv:.4f}")

print("\n=== Classification Report (Cross-Validation) ===")
print(classification_report(y, y_pred_cv))

print("\n=== Confusion Matrix (Cross-Validation) ===")
print(cm_cv)


=== Evaluasi Model dengan Cross-Validation ===
Accuracy    : 0.8351
Precision   : 0.5679
Recall      : 0.1927
F1 Score    : 0.2878
Specificity : 0.9694

=== Classification Report (Cross-Validation) ===
              precision    recall  f1-score   support

         0.0       0.85      0.97      0.91    190055
         1.0       0.57      0.19      0.29     39726

    accuracy                           0.84    229781
   macro avg       0.71      0.58      0.60    229781
weighted avg       0.80      0.84      0.80    229781


=== Confusion Matrix (Cross-Validation) ===
[[184231   5824]
 [ 32071   7655]]


# Random Forest + SMOTE + Best Parameter Random Search

In [10]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

# Load dataset
url = 'https://raw.githubusercontent.com/adeeeehidayat/diabetes-health-indicators-dataset/main/diabetes_012_health_indicators_BRFSS2015.csv'
df = pd.read_csv(url)

# Data preprocessing: drop rows with missing values and exact duplicates, then
# merge target values 1 and 2 into a single positive label (1).
df = df.dropna()
df = df.drop_duplicates()
df['Diabetes_012'] = df['Diabetes_012'].replace({1: 1, 2: 1})
df = df.rename(columns={'Diabetes_012': 'Diabetes'})

# Split features and target
X = df.drop(columns=['Diabetes'])
y = df['Diabetes']

# Define pipeline; every step is re-fitted on the training part of each fold.
# The forest is seeded like SMOTE and the fold split, so re-running the cell
# reproduces the same metrics (it was previously unseeded).
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),  # Scaling
    ('smote', SMOTE(random_state=42)),  # Oversampling
    ('rf', RandomForestClassifier(random_state=42, bootstrap=False, max_depth=None, min_samples_leaf=3, min_samples_split=6, n_estimators=918, n_jobs=-1))  # Random Forest model
])

# Perform stratified cross-validation with cross_val_predict
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(pipeline, X, y, cv=skf, n_jobs=-1)

# Evaluation metrics (precision/recall/F1 refer to the positive label 1)
accuracy_cv = accuracy_score(y, y_pred_cv)
precision_cv = precision_score(y, y_pred_cv)
recall_cv = recall_score(y, y_pred_cv)
f1_cv = f1_score(y, y_pred_cv)

# Confusion matrix; for two classes ravel() yields tn, fp, fn, tp
cm_cv = confusion_matrix(y, y_pred_cv)
tn_cv, fp_cv, fn_cv, tp_cv = cm_cv.ravel()
specificity_cv = tn_cv / (tn_cv + fp_cv)  # true-negative rate

# Print evaluation results
print("\n=== Evaluasi Model dengan Cross-Validation dan SMOTE ===")
print(f"Accuracy    : {accuracy_cv:.4f}")
print(f"Precision   : {precision_cv:.4f}")
print(f"Recall      : {recall_cv:.4f}")
print(f"F1 Score    : {f1_cv:.4f}")
print(f"Specificity : {specificity_cv:.4f}")

print("\n=== Classification Report (Cross-Validation) ===")
print(classification_report(y, y_pred_cv))

print("\n=== Confusion Matrix (Cross-Validation) ===")
print(cm_cv)



=== Evaluasi Model dengan Cross-Validation dan SMOTE ===
Accuracy    : 0.8224
Precision   : 0.4806
Recall      : 0.3364
F1 Score    : 0.3958
Specificity : 0.9240

=== Classification Report (Cross-Validation) ===
              precision    recall  f1-score   support

         0.0       0.87      0.92      0.90    190055
         1.0       0.48      0.34      0.40     39726

    accuracy                           0.82    229781
   macro avg       0.68      0.63      0.65    229781
weighted avg       0.80      0.82      0.81    229781


=== Confusion Matrix (Cross-Validation) ===
[[175612  14443]
 [ 26362  13364]]


# Random Forest + ENN + Best Parameter Random Search

In [11]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import EditedNearestNeighbours

# Load dataset (BRFSS 2015 diabetes health indicators CSV)
url = 'https://raw.githubusercontent.com/adeeeehidayat/diabetes-health-indicators-dataset/main/diabetes_012_health_indicators_BRFSS2015.csv'
df = pd.read_csv(url)

# Data preprocessing: drop incomplete and duplicated rows, then collapse
# the three-level target into a binary one (classes 1 and 2 both become 1).
df = df.dropna()
df = df.drop_duplicates()
df['Diabetes_012'] = df['Diabetes_012'].replace({1: 1, 2: 1})
df = df.rename(columns={'Diabetes_012': 'Diabetes'})

# Split features and target
X = df.drop(columns=['Diabetes'])
y = df['Diabetes']

# Define pipeline. Passing the whole pipeline to cross_val_predict means the
# scaler and the resampler are fitted on each training fold, not on all data.
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),  # Scaling
    ('enn', EditedNearestNeighbours()),  # Undersampling (ENN removes samples, it does not create any)
    ('rf', RandomForestClassifier(
        # Hyperparameters presumably come from the random search named in the
        # section title -- TODO confirm against the search cell.
        bootstrap=False,
        max_depth=None,
        min_samples_leaf=3,
        min_samples_split=6,
        n_estimators=918,
        n_jobs=-1,
        # Fixed seed so reruns give the same forest, matching the baseline
        # Random Forest cell which also uses random_state=42.
        random_state=42,
    ))  # Random Forest model
])

# Perform stratified cross-validation with cross_val_predict:
# every row receives exactly one out-of-fold prediction.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(pipeline, X, y, cv=skf, n_jobs=-1)

# Evaluation metrics (positive class = 1)
accuracy_cv = accuracy_score(y, y_pred_cv)
precision_cv = precision_score(y, y_pred_cv)
recall_cv = recall_score(y, y_pred_cv)
f1_cv = f1_score(y, y_pred_cv)

# Confusion matrix; specificity is the true-negative rate TN / (TN + FP)
cm_cv = confusion_matrix(y, y_pred_cv)
tn_cv, fp_cv, fn_cv, tp_cv = cm_cv.ravel()
specificity_cv = tn_cv / (tn_cv + fp_cv)

# Print evaluation results (banner names the resampler actually used: ENN, not SMOTE)
print("\n=== Evaluasi Model dengan Cross-Validation dan Edited Nearest Neighbours (ENN) ===")
print(f"Accuracy    : {accuracy_cv:.4f}")
print(f"Precision   : {precision_cv:.4f}")
print(f"Recall      : {recall_cv:.4f}")
print(f"F1 Score    : {f1_cv:.4f}")
print(f"Specificity : {specificity_cv:.4f}")

print("\n=== Classification Report (Cross-Validation) ===")
print(classification_report(y, y_pred_cv))

print("\n=== Confusion Matrix (Cross-Validation) ===")
print(cm_cv)



=== Evaluasi Model dengan Cross-Validation dan SMOTE ===
Accuracy    : 0.7829
Precision   : 0.4101
Recall      : 0.5835
F1 Score    : 0.4817
Specificity : 0.8246

=== Classification Report (Cross-Validation) ===
              precision    recall  f1-score   support

         0.0       0.90      0.82      0.86    190055
         1.0       0.41      0.58      0.48     39726

    accuracy                           0.78    229781
   macro avg       0.66      0.70      0.67    229781
weighted avg       0.82      0.78      0.80    229781


=== Confusion Matrix (Cross-Validation) ===
[[156711  33344]
 [ 16544  23182]]


# Random Forest + SMOTE-ENN + Best Parameter Random Search

In [15]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTEENN

# Load dataset (BRFSS 2015 diabetes health indicators CSV)
url = 'https://raw.githubusercontent.com/adeeeehidayat/diabetes-health-indicators-dataset/main/diabetes_012_health_indicators_BRFSS2015.csv'
df = pd.read_csv(url)

# Data preprocessing: drop incomplete and duplicated rows, then collapse
# the three-level target into a binary one (classes 1 and 2 both become 1).
df = df.dropna()
df = df.drop_duplicates()
df['Diabetes_012'] = df['Diabetes_012'].replace({1: 1, 2: 1})
df = df.rename(columns={'Diabetes_012': 'Diabetes'})

# Split features and target
X = df.drop(columns=['Diabetes'])
y = df['Diabetes']

# Define pipeline. Passing the whole pipeline to cross_val_predict means the
# scaler and the resampler are fitted on each training fold, not on all data.
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),  # Scaling
    ('smoteenn', SMOTEENN(random_state=42)),  # Hybrid sampling (SMOTE oversampling + ENN cleaning)
    ('rf', RandomForestClassifier(
        # Hyperparameters presumably come from the random search named in the
        # section title -- TODO confirm against the search cell.
        bootstrap=False,
        max_depth=None,
        min_samples_leaf=3,
        min_samples_split=6,
        n_estimators=918,
        n_jobs=-1,
        # Fixed seed so reruns give the same forest, matching the seeded
        # resampler and CV splitter used in this cell.
        random_state=42,
    ))  # Random Forest model
])

# Perform stratified cross-validation with cross_val_predict:
# every row receives exactly one out-of-fold prediction.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(pipeline, X, y, cv=skf, n_jobs=-1)

# Evaluation metrics (positive class = 1)
accuracy_cv = accuracy_score(y, y_pred_cv)
precision_cv = precision_score(y, y_pred_cv)
recall_cv = recall_score(y, y_pred_cv)
f1_cv = f1_score(y, y_pred_cv)

# Confusion matrix; specificity is the true-negative rate TN / (TN + FP)
cm_cv = confusion_matrix(y, y_pred_cv)
tn_cv, fp_cv, fn_cv, tp_cv = cm_cv.ravel()
specificity_cv = tn_cv / (tn_cv + fp_cv)

# Print evaluation results (banner names the resampler actually used: SMOTE-ENN)
print("\n=== Evaluasi Model dengan Cross-Validation dan SMOTE-ENN ===")
print(f"Accuracy    : {accuracy_cv:.4f}")
print(f"Precision   : {precision_cv:.4f}")
print(f"Recall      : {recall_cv:.4f}")
print(f"F1 Score    : {f1_cv:.4f}")
print(f"Specificity : {specificity_cv:.4f}")

print("\n=== Classification Report (Cross-Validation) ===")
print(classification_report(y, y_pred_cv))

print("\n=== Confusion Matrix (Cross-Validation) ===")
print(cm_cv)



=== Evaluasi Model dengan Cross-Validation dan SMOTE ===
Accuracy    : 0.7543
Precision   : 0.3792
Recall      : 0.6612
F1 Score    : 0.4820
Specificity : 0.7738

=== Classification Report (Cross-Validation) ===
              precision    recall  f1-score   support

         0.0       0.92      0.77      0.84    190055
         1.0       0.38      0.66      0.48     39726

    accuracy                           0.75    229781
   macro avg       0.65      0.72      0.66    229781
weighted avg       0.82      0.75      0.78    229781


=== Confusion Matrix (Cross-Validation) ===
[[147060  42995]
 [ 13461  26265]]


# XGB

In [26]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_predict
from xgboost import XGBClassifier
import numpy as np

# Imported here because this cell's import block does not provide a Pipeline.
from sklearn.pipeline import Pipeline

# Load dataset (BRFSS 2015 diabetes health indicators CSV)
url = 'https://raw.githubusercontent.com/adeeeehidayat/diabetes-health-indicators-dataset/main/diabetes_012_health_indicators_BRFSS2015.csv'
df = pd.read_csv(url)

# Data preprocessing: drop incomplete and duplicated rows, then collapse
# the three-level target into a binary one (classes 1 and 2 both become 1).
df = df.dropna()
df = df.drop_duplicates()
df['Diabetes_012'] = df['Diabetes_012'].replace({1: 1, 2: 1})
df = df.rename(columns={'Diabetes_012': 'Diabetes'})

# Split features and target
X = df.drop(columns=['Diabetes'])
y = df['Diabetes']

# Stratified K-Fold Cross Validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# XGBoost model with default hyperparameters apart from the seed and tree count
xgb_model = XGBClassifier(random_state=42, n_estimators=100, n_jobs=-1)

# Scale inside the pipeline instead of calling fit_transform on the full
# feature matrix up front: the scaler is then fitted on each training fold
# only, so the held-out fold never influences preprocessing (no leakage).
# This also matches how the resampling cells in this notebook are built.
scaler = MinMaxScaler()
pipeline = Pipeline([
    ('scaler', scaler),  # Scaling
    ('xgb', xgb_model),  # XGBoost model
])

# Out-of-fold predictions: every row is predicted by a model that did not see it
y_pred_cv = cross_val_predict(pipeline, X, y, cv=skf)

# Evaluation metrics (positive class = 1)
accuracy_cv = accuracy_score(y, y_pred_cv)
precision_cv = precision_score(y, y_pred_cv)
recall_cv = recall_score(y, y_pred_cv)
f1_cv = f1_score(y, y_pred_cv)

# Confusion matrix; specificity is the true-negative rate TN / (TN + FP)
cm_cv = confusion_matrix(y, y_pred_cv)
tn_cv, fp_cv, fn_cv, tp_cv = cm_cv.ravel()
specificity_cv = tn_cv / (tn_cv + fp_cv)

# Print evaluation results
print("=== Evaluasi Model dengan Cross-Validation ===")
print(f"Accuracy    : {accuracy_cv:.4f}")
print(f"Precision   : {precision_cv:.4f}")
print(f"Recall      : {recall_cv:.4f}")
print(f"F1 Score    : {f1_cv:.4f}")
print(f"Specificity : {specificity_cv:.4f}")

print("\n=== Classification Report (Cross-Validation) ===")
print(classification_report(y, y_pred_cv))

print("\n=== Confusion Matrix (Cross-Validation) ===")
print(cm_cv)


Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.


=== Evaluasi Model dengan Cross-Validation ===
Accuracy    : 0.8355
Precision   : 0.5656
Recall      : 0.2102
F1 Score    : 0.3065
Specificity : 0.9663

=== Classification Report (Cross-Validation) ===
              precision    recall  f1-score   support

         0.0       0.85      0.97      0.91    190055
         1.0       0.57      0.21      0.31     39726

    accuracy                           0.84    229781
   macro avg       0.71      0.59      0.61    229781
weighted avg       0.80      0.84      0.80    229781


=== Confusion Matrix (Cross-Validation) ===
[[183641   6414]
 [ 31376   8350]]


# XGB + SMOTE

In [27]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

# Source CSV (BRFSS 2015 diabetes health indicators)
url = 'https://raw.githubusercontent.com/adeeeehidayat/diabetes-health-indicators-dataset/main/diabetes_012_health_indicators_BRFSS2015.csv'

# Read and clean in a single chain: remove rows with missing values, remove
# duplicate rows, fold target classes 1 and 2 into one positive class (1),
# and give the target column its binary name.
df = (
    pd.read_csv(url)
    .dropna()
    .drop_duplicates()
    .assign(Diabetes_012=lambda frame: frame['Diabetes_012'].replace({1: 1, 2: 1}))
    .rename(columns={'Diabetes_012': 'Diabetes'})
)

# Feature matrix and target vector
X = df.drop(columns=['Diabetes'])
y = df['Diabetes']

# Scaling -> SMOTE oversampling -> XGBoost, evaluated as one estimator
pipeline = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('smote', SMOTE(random_state=42)),
    ('xgb', XGBClassifier(random_state=42, n_estimators=100, n_jobs=-1)),
])

# Out-of-fold predictions from a shuffled, seeded 5-fold stratified split
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(pipeline, X, y, cv=skf, n_jobs=-1)

# Confusion matrix first; its 2x2 layout is [[TN, FP], [FN, TP]]
cm_cv = confusion_matrix(y, y_pred_cv)
(tn_cv, fp_cv), (fn_cv, tp_cv) = cm_cv
specificity_cv = tn_cv / (tn_cv + fp_cv)

# Remaining scalar metrics for the positive class
accuracy_cv = accuracy_score(y, y_pred_cv)
precision_cv = precision_score(y, y_pred_cv)
recall_cv = recall_score(y, y_pred_cv)
f1_cv = f1_score(y, y_pred_cv)

# Summary table: labels are left-padded to a common 12-character width
print("\n=== Evaluasi Model dengan Cross-Validation dan SMOTE ===")
for metric_name, metric_value in (
    ("Accuracy", accuracy_cv),
    ("Precision", precision_cv),
    ("Recall", recall_cv),
    ("F1 Score", f1_cv),
    ("Specificity", specificity_cv),
):
    print(f"{metric_name:<12}: {metric_value:.4f}")

print("\n=== Classification Report (Cross-Validation) ===")
print(classification_report(y, y_pred_cv))

print("\n=== Confusion Matrix (Cross-Validation) ===")
print(cm_cv)


Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.



=== Evaluasi Model dengan Cross-Validation dan SMOTE ===
Accuracy    : 0.8355
Precision   : 0.5585
Recall      : 0.2322
F1 Score    : 0.3280
Specificity : 0.9616

=== Classification Report (Cross-Validation) ===
              precision    recall  f1-score   support

         0.0       0.86      0.96      0.91    190055
         1.0       0.56      0.23      0.33     39726

    accuracy                           0.84    229781
   macro avg       0.71      0.60      0.62    229781
weighted avg       0.81      0.84      0.81    229781


=== Confusion Matrix (Cross-Validation) ===
[[182762   7293]
 [ 30501   9225]]


# XGB + ENN

In [28]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import EditedNearestNeighbours

# Source CSV (BRFSS 2015 diabetes health indicators)
url = 'https://raw.githubusercontent.com/adeeeehidayat/diabetes-health-indicators-dataset/main/diabetes_012_health_indicators_BRFSS2015.csv'

# Load, then drop rows with missing values and exact duplicate rows
df = pd.read_csv(url).dropna().drop_duplicates()

# Binarise the target (classes 1 and 2 both map to 1) and rename it
df = df.assign(
    Diabetes_012=df['Diabetes_012'].replace({1: 1, 2: 1})
).rename(columns={'Diabetes_012': 'Diabetes'})

# Target vector and feature matrix
y = df['Diabetes']
X = df.drop(columns=['Diabetes'])

# Scaling -> ENN undersampling -> XGBoost, evaluated as one estimator
pipeline = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('enn', EditedNearestNeighbours()),
    ('xgb', XGBClassifier(random_state=42, n_estimators=100, n_jobs=-1)),
])

# Out-of-fold predictions from a shuffled, seeded 5-fold stratified split
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(pipeline, X, y, cv=skf, n_jobs=-1)

# Scalar metrics for the positive class
accuracy_cv = accuracy_score(y, y_pred_cv)
precision_cv = precision_score(y, y_pred_cv)
recall_cv = recall_score(y, y_pred_cv)
f1_cv = f1_score(y, y_pred_cv)

# Confusion matrix flattened row by row: TN, FP, FN, TP
cm_cv = confusion_matrix(y, y_pred_cv)
tn_cv, fp_cv, fn_cv, tp_cv = cm_cv.flatten()
specificity_cv = tn_cv / (tn_cv + fp_cv)

# Emit the summary block as one newline-joined print
summary_lines = [
    "\n=== Evaluasi Model dengan Cross-Validation dan Edited Nearest Neighbours (ENN) ===",
    f"Accuracy    : {accuracy_cv:.4f}",
    f"Precision   : {precision_cv:.4f}",
    f"Recall      : {recall_cv:.4f}",
    f"F1 Score    : {f1_cv:.4f}",
    f"Specificity : {specificity_cv:.4f}",
]
print("\n".join(summary_lines))

print("\n=== Classification Report (Cross-Validation) ===")
print(classification_report(y, y_pred_cv))

print("\n=== Confusion Matrix (Cross-Validation) ===")
print(cm_cv)


Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.



=== Evaluasi Model dengan Cross-Validation dan Edited Nearest Neighbours (ENN) ===
Accuracy    : 0.7844
Precision   : 0.4136
Recall      : 0.5914
F1 Score    : 0.4868
Specificity : 0.8248

=== Classification Report (Cross-Validation) ===
              precision    recall  f1-score   support

         0.0       0.91      0.82      0.86    190055
         1.0       0.41      0.59      0.49     39726

    accuracy                           0.78    229781
   macro avg       0.66      0.71      0.68    229781
weighted avg       0.82      0.78      0.80    229781


=== Confusion Matrix (Cross-Validation) ===
[[156752  33303]
 [ 16234  23492]]


# XGB + SMOTE-ENN

In [29]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTEENN

# Load dataset (BRFSS 2015 diabetes health indicators CSV)
url = 'https://raw.githubusercontent.com/adeeeehidayat/diabetes-health-indicators-dataset/main/diabetes_012_health_indicators_BRFSS2015.csv'
df = pd.read_csv(url)

# Data preprocessing: drop incomplete and duplicated rows, then collapse
# the three-level target into a binary one (classes 1 and 2 both become 1).
df = df.dropna()
df = df.drop_duplicates()
df['Diabetes_012'] = df['Diabetes_012'].replace({1: 1, 2: 1})
df = df.rename(columns={'Diabetes_012': 'Diabetes'})

# Split features and target
X = df.drop(columns=['Diabetes'])
y = df['Diabetes']

# Define pipeline with XGBoost. Passing the whole pipeline to
# cross_val_predict means scaling and resampling are fitted per training fold.
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),  # Scaling
    ('smoteenn', SMOTEENN(random_state=42)),  # Hybrid sampling
    ('xgb', XGBClassifier(random_state=42, n_estimators=100, n_jobs=-1))  # XGBoost model
])

# Perform stratified cross-validation with cross_val_predict:
# every row receives exactly one out-of-fold prediction.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(pipeline, X, y, cv=skf, n_jobs=-1)

# Evaluation metrics (positive class = 1)
accuracy_cv = accuracy_score(y, y_pred_cv)
precision_cv = precision_score(y, y_pred_cv)
recall_cv = recall_score(y, y_pred_cv)
f1_cv = f1_score(y, y_pred_cv)

# Confusion matrix; specificity is the true-negative rate TN / (TN + FP)
cm_cv = confusion_matrix(y, y_pred_cv)
tn_cv, fp_cv, fn_cv, tp_cv = cm_cv.ravel()
specificity_cv = tn_cv / (tn_cv + fp_cv)

# Print evaluation results. The banner says SMOTE-ENN so this output cannot be
# confused with the plain SMOTE experiment, which prints "dan SMOTE".
print("\n=== Evaluasi Model dengan Cross-Validation dan SMOTE-ENN ===")
print(f"Accuracy    : {accuracy_cv:.4f}")
print(f"Precision   : {precision_cv:.4f}")
print(f"Recall      : {recall_cv:.4f}")
print(f"F1 Score    : {f1_cv:.4f}")
print(f"Specificity : {specificity_cv:.4f}")

print("\n=== Classification Report (Cross-Validation) ===")
print(classification_report(y, y_pred_cv))

print("\n=== Confusion Matrix (Cross-Validation) ===")
print(cm_cv)


Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.
Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.



=== Evaluasi Model dengan Cross-Validation dan SMOTE ===
Accuracy    : 0.7819
Precision   : 0.4105
Recall      : 0.5996
F1 Score    : 0.4874
Specificity : 0.8200

=== Classification Report (Cross-Validation) ===
              precision    recall  f1-score   support

         0.0       0.91      0.82      0.86    190055
         1.0       0.41      0.60      0.49     39726

    accuracy                           0.78    229781
   macro avg       0.66      0.71      0.67    229781
weighted avg       0.82      0.78      0.80    229781


=== Confusion Matrix (Cross-Validation) ===
[[155852  34203]
 [ 15907  23819]]


Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.
Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.
Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.
