In [None]:
# %pip install pandas numpy scikit-learn matplotlib imbalanced-learn

In [None]:
"""
Algorithms:
    1. SVM
    2. Random Forest
    3. Naive Bayes
    4. KNN
    5. Decision Tree

Outputs:
    - Dataset description
    - Train/Test split
    - Accuracy, Precision, Recall, F1
    - Confusion matrices
    - Bar chart comparison
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, ConfusionMatrixDisplay)

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

from imblearn.over_sampling import SMOTE  


In [None]:
# Path of the input CSV; a relative path is resolved against the current working directory.
DATA_PATH = "diabetes_prediction_dataset.csv"

df = pd.read_csv(DATA_PATH)

# Dataset description (listed under "Outputs" in the notebook header):
# overall size, then per-column dtype and non-null counts.
print("Dataset shape:", df.shape)
df.info()

# Last expression of the cell, so the first rows are rendered as the cell output.
df.head()

In [None]:


# -----------------------------
# 2) Feature / Target Split
# -----------------------------

# Label column; every other column of `df` is kept as a feature.
TARGET = "diabetes"
y = df[TARGET]
X = df.drop(columns=[TARGET])

# Columns treated as categorical and one-hot encoded in step 3.
categorical_cols = ["gender", "smoking_history"]

# Columns that are standardized later on; get_dummies leaves their names untouched.
numeric_cols = [
    "age",
    "bmi",
    "HbA1c_level",
    "blood_glucose_level",
    "hypertension",
    "heart_disease",
]

# -----------------------------
# 3) One-hot encode the categorical columns
#    (plain get_dummies, no ColumnTransformer)
# -----------------------------

# drop_first=True drops the first level of every encoded column.
X_encoded = pd.get_dummies(X, drop_first=True, columns=categorical_cols)
print("\nColumns after one-hot encoding:", X_encoded.columns, sep="\n")

# -----------------------------
# 4) Train/Test Split (e.g. 80/20)
# -----------------------------

# 20% of the rows are held out for testing. Stratifying on y keeps the class
# ratio the same in both partitions, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    random_state=42,
    stratify=y,
    test_size=0.2,
)

print("\n=== Train/Test Split ===")
for caption, frame in (("Train size:", X_train), ("Test size:", X_test)):
    print(caption, frame.shape)
# Relative class frequencies of the training labels.
print("Train class distribution:\n", y_train.value_counts(normalize=True))

# -----------------------------
# 5) Scaling (only the original numeric columns)
# -----------------------------

# The scaler is fitted on the training rows only and then applied to both
# partitions, so the test set does not influence the mean/std it uses.
scaler = StandardScaler()
scaler.fit(X_train[numeric_cols])

# Note: numeric_cols still holds the column names from before get_dummies,
# but those columns exist in X_encoded under the same names.
# Copies are used so that X_train / X_test keep their unscaled values.
X_test_scaled = X_test.copy()
X_train_scaled = X_train.copy()

X_train_scaled[numeric_cols] = scaler.transform(X_train[numeric_cols])
X_test_scaled[numeric_cols] = scaler.transform(X_test[numeric_cols])

# -----------------------------
# 6) SMOTE (training set only)
# -----------------------------

# Toggle for oversampling; the test set is never resampled.
use_smote = True
if not use_smote:
    # Pass the scaled training data through unchanged.
    X_train_res, y_train_res = X_train_scaled, y_train
else:
    # NOTE(review): SMOTE builds synthetic rows by interpolating between
    # neighbours, so the one-hot / binary columns of those rows may not be
    # exact 0/1 values - SMOTENC might be a better fit; confirm.
    smote = SMOTE(random_state=42)
    X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)
    print("\nAfter SMOTE class distribution (train):")
    print(y_train_res.value_counts())

# -----------------------------
# 7) Define Models (no Pipeline)
# -----------------------------

# Display name -> unfitted estimator. Every model is trained on the same
# (optionally SMOTE-resampled) training data and scored on the same test set.
models = {
    # probability=True is deliberately NOT set: it makes fit() run an extra
    # internal 5-fold cross-validation to calibrate predict_proba, which this
    # notebook never calls. predict() does not depend on it, so the metrics are
    # unchanged while the SVM fit (and its reported train time) gets much cheaper.
    # random_state is kept for reproducibility should probability estimates be
    # re-enabled; per the scikit-learn docs it is ignored otherwise.
    "SVM": SVC(kernel="rbf", random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "Naive Bayes": GaussianNB(),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Decision Tree": DecisionTreeClassifier(random_state=42)
}

# Filled in by the evaluation loop: model name -> dict of metrics.
results = {}

# -----------------------------
# 8) Train + Evaluate Each Model
# -----------------------------

for name, model in models.items():
    print(f"\n=== Training {name} ===")

    # time.perf_counter() is a monotonic, high-resolution clock intended for
    # measuring durations; time.time() is wall-clock time and can jump when
    # the system clock is adjusted.
    start = time.perf_counter()

    # Fit the model directly (no Pipeline) on the resampled training data.
    model.fit(X_train_res, y_train_res)

    end = time.perf_counter()
    train_time = end - start

    # Predict on the scaled test set (never resampled).
    y_pred = model.predict(X_test_scaled)

    # Precision / recall / F1 use scikit-learn's default binary averaging.
    acc  = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec  = recall_score(y_test, y_pred)
    f1   = f1_score(y_test, y_pred)

    # One row of the final comparison table.
    results[name] = {
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1": f1,
        "Train Time (s)": train_time
    }

    # ------------------------
    # Confusion Matrix
    # ------------------------
    disp = ConfusionMatrixDisplay.from_predictions(
        y_test, y_pred, cmap="Blues"
    )
    disp.ax_.set_title(f"Confusion Matrix - {name}")
    plt.show()

# -----------------------------
# 9) Show Results Table
# -----------------------------

# `results` maps model name -> metrics dict; transposing puts one model per
# row and one metric per column.
df_results = pd.DataFrame(results).transpose()
print("\n\n=== FINAL RESULTS ===\n", df_results, sep="\n")

# -----------------------------
# 10) Bar Chart Comparison
# -----------------------------

# Create the figure/axes explicitly and hand the axes to pandas. Calling
# plt.figure(...) and then DataFrame.plot() without `ax` makes pandas open a
# second figure: the first one stays blank and the requested size is lost.
fig, ax = plt.subplots(figsize=(10, 6))

# One bar group per model; "Train Time (s)" is left out because it is not a 0-1 score.
df_results[["Accuracy", "Precision", "Recall", "F1"]].plot(kind="bar", ax=ax, rot=45)

ax.set_title("Algorithm Performance Comparison")
ax.set_ylabel("Score")
ax.grid(axis="y", linestyle="--", alpha=0.7)
fig.tight_layout()
plt.show()