In [None]:
!pip install xgboost lightgbm catboost

# =======================================
# IMPORTS
# =======================================

import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# =======================================
# LOAD & PREPARE DATA
# =======================================

URL = "https://raw.githubusercontent.com/abieniek03/Diabetes-Prediction-Expert-System/refs/heads/main/diabetes.csv"
df = pd.read_csv(URL)

# Columns where zero is biologically impossible -> treat as missing
zero_as_nan = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
df[zero_as_nan] = df[zero_as_nan].replace(0, np.nan)

# Features and label (missing values are still NaN at this point)
X = df.drop("Outcome", axis=1)
y = df["Outcome"]

# Train-test split. This happens BEFORE imputation so that no statistic
# computed from the test rows can influence the training data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# Fill missing values with medians estimated on the training rows only,
# then apply those same medians to the test rows.
train_medians = X_train[zero_as_nan].median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Keep the full frames NaN-free as well, using the same training medians,
# so any later code that reads `df` or `X` still sees complete data.
X = X.fillna(train_medians)
df = df.fillna(train_medians)

# =======================================
# EXPERT SYSTEM
# =======================================

# Rule table: (label written to the log, points added to the risk score,
# condition evaluated column-wise on the whole feature frame).
_EXPERT_RULES = (
    ("R1: Glucose > 150", 2, lambda d: d["Glucose"] > 150),
    ("R2: BMI > 35", 2, lambda d: d["BMI"] > 35),
    ("R3: Insulin > 200", 1, lambda d: d["Insulin"] > 200),
    ("R4: Pedigree > 0.8", 1, lambda d: d["DiabetesPedigreeFunction"] > 0.8),
    ("R5: Age > 50 + BP > 80", 1,
     lambda d: (d["Age"] > 50) & (d["BloodPressure"] > 80)),
    ("R6: SkinThickness > 32 + BMI > 32", 1,
     lambda d: (d["SkinThickness"] > 32) & (d["BMI"] > 32)),
)


def expert_system_predict(X_data, threshold=2):
    """Classify each row of *X_data* with a weighted rule-based score.

    Every rule in ``_EXPERT_RULES`` that holds for a row adds its weight to
    that row's risk score; the row is predicted positive (1) when the score
    reaches *threshold*, otherwise negative (0).

    Args:
        X_data: DataFrame containing the columns ``Glucose``, ``BMI``,
            ``Insulin``, ``DiabetesPedigreeFunction``, ``Age``,
            ``BloodPressure`` and ``SkinThickness``.
        threshold: Minimum risk score for a positive decision. Defaults to
            2, the value that was previously hard-coded.

    Returns:
        A tuple ``(predictions, logs)``. ``predictions`` is an integer NumPy
        array of 0/1 decisions, one per row (integer-typed even when
        *X_data* has no rows). ``logs`` is a list with one
        ``(decision, score, triggered_rules)`` tuple per row, where
        ``triggered_rules`` lists the labels of the rules that fired, in
        rule order.

    Raises:
        KeyError: If a column required by a rule is missing from *X_data*.
    """
    n_rows = len(X_data)
    scores = np.zeros(n_rows, dtype=int)
    fired_masks = []

    # Evaluate each rule once for all rows instead of once per row.
    for label, weight, condition in _EXPERT_RULES:
        # NaN comparisons evaluate to False, so a missing value never
        # triggers a rule.
        mask = np.asarray(condition(X_data), dtype=bool)
        scores[mask] += weight
        fired_masks.append((label, mask))

    predictions = (scores >= threshold).astype(int)

    # Plain Python ints in the log keep it easy to print and compare.
    logs = [
        (
            int(predictions[pos]),
            int(scores[pos]),
            [label for label, mask in fired_masks if mask[pos]],
        )
        for pos in range(n_rows)
    ]

    return predictions, logs


# =======================================
# EXPERT SYSTEM RESULTS
# =======================================

# Run the rule-based classifier on the held-out patients; the per-patient
# log is kept for the detailed report further down.
y_pred_expert, logs_expert = expert_system_predict(X_test)

print(
    "\n==============================",
    " EXPERT SYSTEM",
    "==============================",
    sep="\n",
)
expert_accuracy = accuracy_score(y_test, y_pred_expert)
print(f"Accuracy: {expert_accuracy:.4f}")
expert_report = classification_report(
    y_test,
    y_pred_expert,
    target_names=['Healthy', 'Diabetes'],
)
print(expert_report)

# =======================================
# ML MODELS
# =======================================

# Gradient-boosting baselines, all trained on the same split and reported
# in the same format as the expert system above.
xgb = XGBClassifier(random_state=42, eval_metric="logloss")
# verbose=-1 silences LightGBM's per-fit [Info] lines, matching the quiet
# CatBoost configuration below.
lgbm = LGBMClassifier(random_state=42, verbose=-1)
catb = CatBoostClassifier(random_state=42, verbose=0)

ml_predictions = {}
for model_title, model in (
    ("XGBOOST", xgb),
    ("LIGHTGBM", lgbm),
    ("CATBOOST", catb),
):
    model.fit(X_train, y_train)
    model_pred = model.predict(X_test)
    ml_predictions[model_title] = model_pred

    print("\n==============================")
    print(f" {model_title}")
    print("==============================")
    # Labelled, 4-decimal accuracy so the output lines up with the
    # expert-system section.
    print(f"Accuracy: {accuracy_score(y_test, model_pred):.4f}")
    print(classification_report(y_test, model_pred,
                                target_names=['Healthy', 'Diabetes']))

# Keep the original per-model prediction names available to later code.
y_pred_xgb = ml_predictions["XGBOOST"]
y_pred_lgbm = ml_predictions["LIGHTGBM"]
y_pred_catb = ml_predictions["CATBOOST"]

# =======================================
# DETAILED EXPERT SYSTEM REPORT
# =======================================

print("\n===== EXPERT SYSTEM REPORT =====")

# One section per entry of logs_expert, numbered by its position in the log.
for patient_no, log_entry in enumerate(logs_expert):
    verdict, risk, fired = log_entry
    verdict_label = 'DIABETES' if verdict == 1 else 'HEALTHY'

    print("\n---------------------------------")
    print(f"PATIENT {patient_no}")
    print(f"Decision: {verdict_label}")
    print(f"Risk score: {risk}")
    print("Triggered rules:")
    if not fired:
        print(" - None")
    else:
        for fired_rule in fired:
            print(" -", fired_rule)

# Rule statistics: how many log entries triggered each rule.
rule_counter = Counter(
    fired_rule
    for _, _, fired_rules in logs_expert
    for fired_rule in fired_rules
)

# =======================================
# EXPERT SYSTEM SUMMARY
# =======================================

# Headline counts of the expert system's decisions.
positive = y_pred_expert.sum()
total = len(y_pred_expert)
negative = total - positive

print(
    "\n==============================",
    " EXPERT SYSTEM SUMMARY",
    "==============================",
    f"Total patients: {total}",
    f"Predicted diabetes: {positive}",
    f"Predicted healthy: {negative}",
    sep="\n",
)

print(
    "\n==============================",
    " RULE ACTIVATION STATISTICS",
    "==============================",
    sep="\n",
)

# Rules listed from most to least frequently triggered.
for rule_label, times_fired in rule_counter.most_common():
    print(f"{rule_label}: {times_fired} times")

# Most decisive rule (most frequently triggered)
if not rule_counter:
    print("No rules were triggered.")
else:
    most_decisive_rule, count = rule_counter.most_common(1)[0]
    print(
        "\n==============================",
        " MOST DECISIVE RULE",
        "==============================",
        f"{most_decisive_rule}",
        f"Triggered: {count} times",
        sep="\n",
    )


 EXPERT SYSTEM
Accuracy: 0.6302
              precision    recall  f1-score   support

     Healthy       0.75      0.66      0.70       125
    Diabetes       0.48      0.58      0.52        67

    accuracy                           0.63       192
   macro avg       0.61      0.62      0.61       192
weighted avg       0.65      0.63      0.64       192


 XGBOOST
0.75
              precision    recall  f1-score   support

     Healthy       0.79      0.84      0.81       125
    Diabetes       0.66      0.58      0.62        67

    accuracy                           0.75       192
   macro avg       0.73      0.71      0.72       192
weighted avg       0.74      0.75      0.75       192

[LightGBM] [Info] Number of positive: 201, number of negative: 375
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000071 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 637
[LightGBM] [Info] Number of data poi