# Random Forest

In [14]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_predict
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

# Baseline experiment: Random Forest evaluated with stratified 5-fold
# cross-validation, without any resampling of the training data.

# Load dataset (BRFSS 2015 health indicators, target column 'Diabetes_012')
url = 'https://raw.githubusercontent.com/adeeeehidayat/diabetes-health-indicators-dataset/main/diabetes_012_health_indicators_BRFSS2015.csv'
df = pd.read_csv(url)

# Data preprocessing: drop rows with missing values and exact duplicate rows,
# then map target values 1 and 2 onto a single positive label so the task
# becomes binary (0 vs. 1).
df = df.dropna()
df = df.drop_duplicates()
df['Diabetes_012'] = df['Diabetes_012'].replace({1: 1, 2: 1})
df = df.rename(columns={'Diabetes_012': 'Diabetes'})

# Split features and target
X = df.drop(columns=['Diabetes'])
y = df['Diabetes']

# Scaling is part of the cross-validated estimator instead of being applied to
# the full dataset up front: cross_val_predict then re-fits MinMaxScaler on the
# training part of every fold, so no statistics from the held-out fold leak
# into preprocessing. cross_val_predict works on clones, so `scaler` and
# `rf_model` themselves stay unfitted.
scaler = MinMaxScaler()
rf_model = RandomForestClassifier(random_state=42, n_estimators=100, n_jobs=-1)
pipeline = make_pipeline(scaler, rf_model)

# Stratified K-Fold Cross Validation (fixed seed for reproducible folds)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Out-of-fold predictions: every row is predicted by a model that did not see it
y_pred_cv = cross_val_predict(pipeline, X, y, cv=skf)

# Evaluation metrics (precision/recall/F1 refer to the positive class, label 1)
accuracy_cv = accuracy_score(y, y_pred_cv)
precision_cv = precision_score(y, y_pred_cv)
recall_cv = recall_score(y, y_pred_cv)
f1_cv = f1_score(y, y_pred_cv)

# Confusion matrix; for a binary problem ravel() yields tn, fp, fn, tp
cm_cv = confusion_matrix(y, y_pred_cv)
tn_cv, fp_cv, fn_cv, tp_cv = cm_cv.ravel()
# Specificity = true-negative rate; sklearn has no dedicated helper for it
specificity_cv = tn_cv / (tn_cv + fp_cv)

# Print evaluation results
print("=== Evaluasi Model dengan Cross-Validation ===")
print(f"Accuracy    : {accuracy_cv:.4f}")
print(f"Precision   : {precision_cv:.4f}")
print(f"Recall      : {recall_cv:.4f}")
print(f"F1 Score    : {f1_cv:.4f}")
print(f"Specificity : {specificity_cv:.4f}")

print("\n=== Classification Report (Cross-Validation) ===")
print(classification_report(y, y_pred_cv))

print("\n=== Confusion Matrix (Cross-Validation) ===")
print(cm_cv)


=== Evaluasi Model dengan Cross-Validation ===
Accuracy    : 0.8253
Precision   : 0.4873
Recall      : 0.2058
F1 Score    : 0.2894
Specificity : 0.9547

=== Classification Report (Cross-Validation) ===
              precision    recall  f1-score   support

         0.0       0.85      0.95      0.90    190055
         1.0       0.49      0.21      0.29     39726

    accuracy                           0.83    229781
   macro avg       0.67      0.58      0.59    229781
weighted avg       0.79      0.83      0.79    229781


=== Confusion Matrix (Cross-Validation) ===
[[181453   8602]
 [ 31550   8176]]


In [19]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

# SMOTE experiment: Random Forest with oversampling, evaluated with
# stratified 5-fold cross-validation.

# Fetch the BRFSS 2015 table and clean it in a single chain: remove rows with
# missing values, remove exact duplicates, fold target values 1 and 2 into one
# positive label, and rename the target column to 'Diabetes'.
url = 'https://raw.githubusercontent.com/adeeeehidayat/diabetes-health-indicators-dataset/main/diabetes_012_health_indicators_BRFSS2015.csv'
df = (
    pd.read_csv(url)
    .dropna()
    .drop_duplicates()
    .assign(Diabetes_012=lambda frame: frame['Diabetes_012'].replace({1: 1, 2: 1}))
    .rename(columns={'Diabetes_012': 'Diabetes'})
)

# Target vector, then the feature matrix (every column except the target)
y = df['Diabetes']
X = df.drop(columns=['Diabetes'])

# Scaling -> SMOTE oversampling -> Random Forest, bundled into one estimator
# so the whole chain is re-fitted inside each cross-validation fold
pipeline = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('smote', SMOTE(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42, n_estimators=100, n_jobs=-1)),
])

# Out-of-fold predictions from seeded, shuffled, stratified 5-fold CV
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(pipeline, X, y, cv=skf, n_jobs=-1)

# Confusion matrix rows are the true classes: [tn, fp] then [fn, tp]
cm_cv = confusion_matrix(y, y_pred_cv)
(tn_cv, fp_cv), (fn_cv, tp_cv) = cm_cv
specificity_cv = tn_cv / (tn_cv + fp_cv)

# Remaining scalar metrics, all computed on the same out-of-fold predictions
accuracy_cv, precision_cv, recall_cv, f1_cv = (
    metric(y, y_pred_cv)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Summary table: label left-aligned to 12 characters, value with 4 decimals
print("\n=== Evaluasi Model dengan Cross-Validation dan SMOTE ===")
print("\n".join(
    f"{label:<12}: {score:.4f}"
    for label, score in (
        ("Accuracy", accuracy_cv),
        ("Precision", precision_cv),
        ("Recall", recall_cv),
        ("F1 Score", f1_cv),
        ("Specificity", specificity_cv),
    )
))

print("\n=== Classification Report (Cross-Validation) ===")
print(classification_report(y, y_pred_cv))

print("\n=== Confusion Matrix (Cross-Validation) ===")
print(cm_cv)



=== Evaluasi Model dengan Cross-Validation dan SMOTE ===
Accuracy    : 0.8149
Precision   : 0.4490
Recall      : 0.3102
F1 Score    : 0.3669
Specificity : 0.9204

=== Classification Report (Cross-Validation) ===
              precision    recall  f1-score   support

         0.0       0.86      0.92      0.89    190055
         1.0       0.45      0.31      0.37     39726

    accuracy                           0.81    229781
   macro avg       0.66      0.62      0.63    229781
weighted avg       0.79      0.81      0.80    229781


=== Confusion Matrix (Cross-Validation) ===
[[174931  15124]
 [ 27402  12324]]


In [20]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import EditedNearestNeighbours

# ENN experiment: Random Forest with Edited Nearest Neighbours undersampling,
# evaluated with stratified 5-fold cross-validation.

# Load dataset (BRFSS 2015 health indicators, target column 'Diabetes_012')
url = 'https://raw.githubusercontent.com/adeeeehidayat/diabetes-health-indicators-dataset/main/diabetes_012_health_indicators_BRFSS2015.csv'
df = pd.read_csv(url)

# Data preprocessing: drop rows with missing values and exact duplicate rows,
# then map target values 1 and 2 onto a single positive label (binary task).
df = df.dropna()
df = df.drop_duplicates()
df['Diabetes_012'] = df['Diabetes_012'].replace({1: 1, 2: 1})
df = df.rename(columns={'Diabetes_012': 'Diabetes'})

# Split features and target
X = df.drop(columns=['Diabetes'])
y = df['Diabetes']

# Define pipeline: the whole chain is passed to cross_val_predict, so scaling
# and resampling are re-fitted inside every fold.
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),  # Scaling
    ('enn', EditedNearestNeighbours()),  # Undersampling
    ('rf', RandomForestClassifier(random_state=42, n_estimators=100, n_jobs=-1))  # Random Forest model
])

# Perform stratified cross-validation with cross_val_predict
# (out-of-fold predictions for every row, folds seeded for reproducibility)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(pipeline, X, y, cv=skf, n_jobs=-1)

# Evaluation metrics (precision/recall/F1 refer to the positive class, label 1)
accuracy_cv = accuracy_score(y, y_pred_cv)
precision_cv = precision_score(y, y_pred_cv)
recall_cv = recall_score(y, y_pred_cv)
f1_cv = f1_score(y, y_pred_cv)

# Confusion matrix; for a binary problem ravel() yields tn, fp, fn, tp
cm_cv = confusion_matrix(y, y_pred_cv)
tn_cv, fp_cv, fn_cv, tp_cv = cm_cv.ravel()
specificity_cv = tn_cv / (tn_cv + fp_cv)

# Print evaluation results.
# The header names the resampler actually used in this cell (ENN); it
# previously said "SMOTE", a leftover from the cell it was copied from.
print("\n=== Evaluasi Model dengan Cross-Validation dan ENN ===")
print(f"Accuracy    : {accuracy_cv:.4f}")
print(f"Precision   : {precision_cv:.4f}")
print(f"Recall      : {recall_cv:.4f}")
print(f"F1 Score    : {f1_cv:.4f}")
print(f"Specificity : {specificity_cv:.4f}")

print("\n=== Classification Report (Cross-Validation) ===")
print(classification_report(y, y_pred_cv))

print("\n=== Confusion Matrix (Cross-Validation) ===")
print(cm_cv)



=== Evaluasi Model dengan Cross-Validation dan SMOTE ===
Accuracy    : 0.7731
Precision   : 0.3948
Recall      : 0.5864
F1 Score    : 0.4719
Specificity : 0.8121

=== Classification Report (Cross-Validation) ===
              precision    recall  f1-score   support

         0.0       0.90      0.81      0.86    190055
         1.0       0.39      0.59      0.47     39726

    accuracy                           0.77    229781
   macro avg       0.65      0.70      0.66    229781
weighted avg       0.82      0.77      0.79    229781


=== Confusion Matrix (Cross-Validation) ===
[[154339  35716]
 [ 16429  23297]]


In [21]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTEENN

# SMOTEENN experiment: Random Forest with combined over-/undersampling,
# evaluated with stratified 5-fold cross-validation.

# Load dataset (BRFSS 2015 health indicators, target column 'Diabetes_012')
url = 'https://raw.githubusercontent.com/adeeeehidayat/diabetes-health-indicators-dataset/main/diabetes_012_health_indicators_BRFSS2015.csv'
df = pd.read_csv(url)

# Data preprocessing: drop rows with missing values and exact duplicate rows,
# then map target values 1 and 2 onto a single positive label (binary task).
df = df.dropna()
df = df.drop_duplicates()
df['Diabetes_012'] = df['Diabetes_012'].replace({1: 1, 2: 1})
df = df.rename(columns={'Diabetes_012': 'Diabetes'})

# Split features and target
X = df.drop(columns=['Diabetes'])
y = df['Diabetes']

# Define pipeline: the whole chain is passed to cross_val_predict, so scaling
# and resampling are re-fitted inside every fold.
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),  # Scaling
    ('smoteenn', SMOTEENN(random_state=42)),  # Hybrid
    ('rf', RandomForestClassifier(random_state=42, n_estimators=100, n_jobs=-1))  # Random Forest model
])

# Perform stratified cross-validation with cross_val_predict
# (out-of-fold predictions for every row, folds seeded for reproducibility)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(pipeline, X, y, cv=skf, n_jobs=-1)

# Evaluation metrics (precision/recall/F1 refer to the positive class, label 1)
accuracy_cv = accuracy_score(y, y_pred_cv)
precision_cv = precision_score(y, y_pred_cv)
recall_cv = recall_score(y, y_pred_cv)
f1_cv = f1_score(y, y_pred_cv)

# Confusion matrix; for a binary problem ravel() yields tn, fp, fn, tp
cm_cv = confusion_matrix(y, y_pred_cv)
tn_cv, fp_cv, fn_cv, tp_cv = cm_cv.ravel()
specificity_cv = tn_cv / (tn_cv + fp_cv)

# Print evaluation results.
# The header names the resampler actually used in this cell (SMOTEENN); it
# previously said "SMOTE", a leftover from the cell it was copied from.
print("\n=== Evaluasi Model dengan Cross-Validation dan SMOTEENN ===")
print(f"Accuracy    : {accuracy_cv:.4f}")
print(f"Precision   : {precision_cv:.4f}")
print(f"Recall      : {recall_cv:.4f}")
print(f"F1 Score    : {f1_cv:.4f}")
print(f"Specificity : {specificity_cv:.4f}")

print("\n=== Classification Report (Cross-Validation) ===")
print(classification_report(y, y_pred_cv))

print("\n=== Confusion Matrix (Cross-Validation) ===")
print(cm_cv)



=== Evaluasi Model dengan Cross-Validation dan SMOTE ===
Accuracy    : 0.7576
Precision   : 0.3804
Recall      : 0.6396
F1 Score    : 0.4771
Specificity : 0.7823

=== Classification Report (Cross-Validation) ===
              precision    recall  f1-score   support

         0.0       0.91      0.78      0.84    190055
         1.0       0.38      0.64      0.48     39726

    accuracy                           0.76    229781
   macro avg       0.65      0.71      0.66    229781
weighted avg       0.82      0.76      0.78    229781


=== Confusion Matrix (Cross-Validation) ===
[[148671  41384]
 [ 14316  25410]]
