In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from scipy.stats import chi2_contingency

In [2]:
import pandas as pd
# Load the raw dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="diabetes_dataset.csv")
# Preview the first five rows (the default row count for head()).
df.head(n=5)

Unnamed: 0,age,gender,ethnicity,education_level,income_level,employment_status,smoking_status,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,...,hdl_cholesterol,ldl_cholesterol,triglycerides,glucose_fasting,glucose_postprandial,insulin_level,hba1c,diabetes_risk_score,diabetes_stage,diagnosed_diabetes
0,58,Male,Asian,Highschool,Lower-Middle,Employed,Never,0,215,5.7,...,41,160,145,136,236,6.36,8.18,29.6,Type 2,1
1,48,Female,White,Highschool,Middle,Employed,Former,1,143,6.7,...,55,50,30,93,150,2.0,5.63,23.0,No Diabetes,0
2,60,Male,Hispanic,Highschool,Middle,Unemployed,Never,1,57,6.4,...,66,99,36,118,195,5.07,7.51,44.7,Type 2,1
3,74,Female,Black,Highschool,Low,Retired,Never,0,49,3.4,...,50,79,140,139,253,5.28,9.03,38.2,Type 2,1
4,46,Male,White,Graduate,Middle,Retired,Never,1,109,7.2,...,52,125,160,137,184,12.74,7.2,23.5,Type 2,1


In [3]:
# Remove exact duplicate rows (keeping the first occurrence) and rebuild a
# clean 0..n-1 index so later positional logic is not affected by gaps.
df = (
    df
    .drop_duplicates(keep="first")
    .reset_index(drop=True)
)

In [4]:
# Text-like columns: anything stored as object or categorical dtype.
cat_cols = df.select_dtypes(include=['object','category']).columns

# Normalise each text column: cast to str, trim surrounding whitespace,
# then convert to Title Case so spelling variants collapse to one label.
# NOTE(review): astype(str) may turn missing values into the literal text
# "nan" (title-cased to "Nan") - confirm these columns have no missing data.
for col in cat_cols:
    as_text = df[col].astype(str)
    trimmed = as_text.str.strip()
    df[col] = trimmed.str.title()

In [5]:
# Split the column names by dtype: text-like columns vs. 64-bit numerics.
cat_cols = df.select_dtypes(include=['object', 'category']).columns
num_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Display both groups as the cell output (numeric first, then categorical).
(num_cols, cat_cols)

(Index(['age', 'alcohol_consumption_per_week',
        'physical_activity_minutes_per_week', 'diet_score',
        'sleep_hours_per_day', 'screen_time_hours_per_day',
        'family_history_diabetes', 'hypertension_history',
        'cardiovascular_history', 'bmi', 'waist_to_hip_ratio', 'systolic_bp',
        'diastolic_bp', 'heart_rate', 'cholesterol_total', 'hdl_cholesterol',
        'ldl_cholesterol', 'triglycerides', 'glucose_fasting',
        'glucose_postprandial', 'insulin_level', 'hba1c', 'diabetes_risk_score',
        'diagnosed_diabetes'],
       dtype='object'),
 Index(['gender', 'ethnicity', 'education_level', 'income_level',
        'employment_status', 'smoking_status', 'diabetes_stage'],
       dtype='object'))

In [6]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, LabelEncoder

In [7]:
# Work on an independent, de-duplicated copy so `df` itself is left untouched.
df_clean = (
    df
    .copy()
    .drop_duplicates()
    .reset_index(drop=True)
)

# Tidy up the categorical data: cast to str, strip whitespace, Title Case.
cat_cols = df_clean.select_dtypes(include=['object','category']).columns
for col in cat_cols:
    text_values = df_clean[col].astype(str)
    df_clean[col] = text_values.str.strip().str.title()

In [8]:
# 70/30 train/test split, stratified on the binary target so both parts keep
# the same diagnosed / not-diagnosed proportions. Fixed seed for repeatability.
split_target = df_clean['diagnosed_diabetes']
df_train, df_test = train_test_split(
    df_clean, test_size=0.30, random_state=42, stratify=split_target
)

In [9]:
# Text-like columns still present in the training split.
cat_cols = df_train.select_dtypes(include=['object','category']).columns

# One encoder per column, fitted on the training split only and then applied
# to both splits; the fitted encoders are kept in a dict keyed by column name.
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder().fit(df_train[col])
    df_train[col] = le.transform(df_train[col])
    df_test[col] = le.transform(df_test[col])
    label_encoders[col] = le

In [10]:
# Numeric columns to scale (the target column is excluded)
num_cols_task1 = [
    'hba1c', 'glucose_fasting', 'glucose_postprandial',
    'insulin_level', 'triglycerides', 'age', 'bmi',
]

from sklearn.preprocessing import RobustScaler

# Fit the scaler on the training split only, then apply the same fitted
# transformation to both splits so no test-set statistics leak into it.
scaler = RobustScaler().fit(df_train[num_cols_task1])

df_train[num_cols_task1] = scaler.transform(df_train[num_cols_task1])
df_test[num_cols_task1] = scaler.transform(df_test[num_cols_task1])

In [11]:
# Predictors used for Task 1 (binary target: diagnosed_diabetes).
features_task1 = [
    'hba1c',
    'glucose_fasting',
    'glucose_postprandial',
    'insulin_level',
    'triglycerides',
    'family_history_diabetes',
    'hypertension_history',
    'age',
    'bmi'
]

X1_train = df_train[features_task1]
y1_train = df_train['diagnosed_diabetes']

# Balanced test set: 1500 rows per target class, then shuffled.
# Each class is sampled explicitly instead of via groupby(...).apply(...):
# apply operating on the grouping column is deprecated in pandas, and once
# that column is excluded from the groups, df_test1['diagnosed_diabetes']
# below would no longer exist. Sampling each group with the same seed and
# concatenating in group order selects exactly the same rows as before.
df_test1 = (
    pd.concat(
        [
            class_rows.sample(1500, random_state=42)
            for _, class_rows in df_test.groupby('diagnosed_diabetes')
        ]
    )
    .sample(frac=1, random_state=42)
)

X1_test = df_test1[features_task1]
y1_test = df_test1['diagnosed_diabetes']

  .apply(lambda x: x.sample(1500, random_state=42))


In [12]:
# Task 2 works only on training rows flagged as diagnosed.
df_train2 = df_train[df_train['diagnosed_diabetes'] == 1]

# Size of the rarest stage; every stage is down-sampled to this many rows.
min_count = df_train2['diabetes_stage'].value_counts().min()

# Sample each stage explicitly instead of via groupby(...).apply(...):
# apply operating on the grouping column is deprecated in pandas, and once
# that column is excluded from the groups, the 'diabetes_stage' lookup below
# would fail. Same seed per group and same group order, so the selected rows
# are unchanged.
df_train2_bal = pd.concat(
    [
        stage_rows.sample(min_count, random_state=42)
        for _, stage_rows in df_train2.groupby('diabetes_stage')
    ]
)

# Predictors used for Task 2 (target: encoded diabetes_stage).
features_task2 = [
    'hba1c',
    'glucose_fasting',
    'glucose_postprandial',
    'insulin_level',
    'bmi',
    'age'
]

X2_train = df_train2_bal[features_task2]
y2_train = df_train2_bal['diabetes_stage']


  .apply(lambda x: x.sample(min_count, random_state=42))


In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report,
    confusion_matrix
)
import pandas as pd


In [14]:
def evaluate_binary_model(name, y_true, y_pred, y_prob=None):
    """Summarise binary-classification metrics as percentages.

    Args:
        name: Label stored under the 'Model' key.
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        y_prob: Optional scores passed to roc_auc_score together with
            y_true; when None, no AUC is computed.

    Returns:
        dict with the keys 'Model', 'Accuracy (%)', 'Precision (%)',
        'Recall (%)', 'F1-Score (%)' and 'AUC (%)'. Each metric is multiplied
        by 100 and rounded to two decimals; 'AUC (%)' is None when y_prob is
        None.
    """
    def as_percent(score):
        # Convert a 0-1 score into a percentage with two decimals.
        return round(score * 100, 2)

    results = {
        'Model': name,
        'Accuracy (%)': as_percent(accuracy_score(y_true, y_pred)),
        'Precision (%)': as_percent(precision_score(y_true, y_pred)),
        'Recall (%)': as_percent(recall_score(y_true, y_pred)),
        'F1-Score (%)': as_percent(f1_score(y_true, y_pred)),
    }

    # AUC needs scores rather than hard labels, so it is only filled in
    # when the caller supplied them.
    results['AUC (%)'] = (
        None if y_prob is None else as_percent(roc_auc_score(y_true, y_prob))
    )
    return results


In [15]:
from sklearn.ensemble import RandomForestClassifier

# Training matrix and target for Task 1 (binary: diagnosed_diabetes).
y1_train = df_train['diagnosed_diabetes']
X1_train = df_train[features_task1]

# 200-tree forest with a fixed seed, using all available cores.
rf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
rf.fit(X1_train, y1_train)

# Hard predictions plus column 1 of predict_proba on the balanced test set
# (presumably the probability of class 1 - depends on rf.classes_ ordering).
y_pred_rf = rf.predict(X1_test)
y_prob_rf = rf.predict_proba(X1_test)[:, 1]

rf_results = evaluate_binary_model("Random Forest", y1_test, y_pred_rf, y_prob_rf)

print("=== Random Forest ===")
print(classification_report(y1_test, y_pred_rf))
print(confusion_matrix(y1_test, y_pred_rf))


=== Random Forest ===
              precision    recall  f1-score   support

           0       0.89      1.00      0.94      1500
           1       1.00      0.88      0.94      1500

    accuracy                           0.94      3000
   macro avg       0.95      0.94      0.94      3000
weighted avg       0.95      0.94      0.94      3000

[[1498    2]
 [ 178 1322]]


In [16]:
# Keep only the training rows flagged as diagnosed; .copy() gives an
# independent frame rather than a slice of df_train.
is_diagnosed = df_train['diagnosed_diabetes'] == 1
df_train2 = df_train.loc[is_diagnosed].copy()

# Report the subset size and how many rows fall in each encoded stage.
print("Jumlah data Task 2:", len(df_train2))
print("Distribusi stage (encoded):")
stage_counts = df_train2['diabetes_stage'].value_counts()
print(stage_counts)

Jumlah data Task 2: 41999
Distribusi stage (encoded):
diabetes_stage
4    41838
0      111
3       50
Name: count, dtype: int64


In [17]:
# Map each encoded stage back to its text label using the encoder that was
# fitted on the training split. LabelEncoder assigns codes in sorted order of
# the class names, so "No Diabetes" comes before "Pre-Diabetes"; the earlier
# hand-written dict had those two codes swapped. Reading classes_ directly
# keeps the mapping in sync with the actual encoding.
stage_mapping = {
    code: str(stage_name)
    for code, stage_name in enumerate(label_encoders['diabetes_stage'].classes_)
}

# Show the stage distribution of the diagnosed training rows by label.
print(
    df_train2['diabetes_stage']
    .map(stage_mapping)
    .value_counts()
)

diabetes_stage
Type 2         41838
Gestational      111
Type 1            50
Name: count, dtype: int64


In [18]:
# Size of the rarest stage; every stage is down-sampled to this many rows.
min_count = df_train2['diabetes_stage'].value_counts().min()

# Sample each stage explicitly instead of via groupby(...).apply(...):
# apply operating on the grouping column is deprecated in pandas, and once
# that column is excluded from the groups, 'diabetes_stage' would be missing
# from the balanced frame. Same seed per group and same group order, so the
# selected rows are unchanged.
df_train2_bal = pd.concat(
    [
        stage_rows.sample(min_count, random_state=42)
        for _, stage_rows in df_train2.groupby('diabetes_stage')
    ]
).reset_index(drop=True)

print("Setelah balancing:")
print(df_train2_bal['diabetes_stage'].map(stage_mapping).value_counts())

Setelah balancing:
diabetes_stage
Gestational    50
Type 1         50
Type 2         50
Name: count, dtype: int64


  .apply(lambda x: x.sample(min_count, random_state=42))


In [19]:
# Predictors used for Task 2 (target: encoded diabetes_stage).
features_task2 = [
    'hba1c', 'glucose_fasting', 'glucose_postprandial',
    'insulin_level', 'bmi', 'age',
]

# Feature matrix and target taken from the class-balanced training subset.
y2_train = df_train2_bal['diabetes_stage']
X2_train = df_train2_bal.loc[:, features_task2]

In [20]:
from sklearn.ensemble import RandomForestClassifier

# 500-tree forest for the stage classifier, fixed seed, all available cores.
rf_stage = RandomForestClassifier(n_estimators=500, random_state=42, n_jobs=-1)

# Fit on the balanced Task 2 data; the fitted estimator is the cell output.
rf_stage.fit(X2_train, y2_train)

In [21]:
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix
)

# 5-fold stratified CV with shuffling and a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Cross-validated predictions for every row of the balanced Task 2 data.
y2_pred_cv = cross_val_predict(estimator=rf_stage, X=X2_train, y=y2_train, cv=cv)

In [22]:
# Only the three stage codes used in this report, with their display names.
stage_mapping = {0: "Gestational", 3: "Type 1", 4: "Type 2"}

# Order follows the labels in y (ascending code), names aligned to it.
labels_sorted = sorted(stage_mapping)
target_names = [stage_mapping[code] for code in labels_sorted]

print("\nClassification Report:")
report_text = classification_report(
    y2_train,
    y2_pred_cv,
    labels=labels_sorted,
    target_names=target_names,
)
print(report_text)


Classification Report:
              precision    recall  f1-score   support

 Gestational       0.46      0.42      0.44        50
      Type 1       0.57      0.68      0.62        50
      Type 2       0.80      0.70      0.74        50

    accuracy                           0.60       150
   macro avg       0.61      0.60      0.60       150
weighted avg       0.61      0.60      0.60       150



In [23]:
# Headline cross-validated metrics for Task 2, as percentages (2 decimals).
print("=== TASK 2 - RANDOM FOREST (3 CLASS) ===")
print("Accuracy  :", round(accuracy_score(y2_train, y2_pred_cv) * 100, 2))

# The remaining metrics are macro-averaged across the stage classes.
for metric_label, metric_fn in (
    ("Precision :", precision_score),
    ("Recall    :", recall_score),
    ("F1-Score  :", f1_score),
):
    print(metric_label, round(metric_fn(y2_train, y2_pred_cv, average='macro') * 100, 2))

=== TASK 2 - RANDOM FOREST (3 CLASS) ===
Accuracy  : 60.0
Precision : 60.62
Recall    : 60.0
F1-Score  : 60.01


In [24]:
import joblib

# Persist the Task 1 artefacts to the working directory: the fitted forest,
# the fitted scaler and the dict of per-column label encoders.
joblib.dump(value=rf, filename="rf_diabetes_task1.pkl")
joblib.dump(value=scaler, filename="scaler_task1.pkl")
joblib.dump(value=label_encoders, filename="label_encoders_task1.pkl")

['label_encoders_task1.pkl']