In [None]:
# =========================
# Decision Tree on Diabetes Dataset
# =========================

# 1) Imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    confusion_matrix,
    classification_report,
    precision_score,
    recall_score,
    f1_score
)
from sklearn.preprocessing import StandardScaler


In [None]:

# 2) Load data
# The CSV is looked up relative to the working directory; edit csv_path if it lives elsewhere.
csv_path = Path("diabetes_dataset.csv")
df = pd.read_csv(csv_path)

# Size of the frame followed by its first three rows
print("Shape:", df.shape)
print(df.head(n=3))

# 3) Basic checks
# Number of NaN entries in each column (header and table emitted by one print call)
print("\nMissing values per column:", df.isnull().sum(), sep="\n")

# Row count per value of the target column
print("\nOutcome distribution:", df.loc[:, 'Outcome'].value_counts(), sep="\n")

In [None]:

# 4) Handle likely invalid zeros in medical features
# A literal 0 in these measurements is treated as "not recorded" rather than a real reading.
zero_as_missing_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Work on a copy so the raw frame `df` stays untouched.
df_clean = df.copy()
for col in zero_as_missing_cols:
    # Column with every 0 turned into NaN
    observed = df_clean[col].replace(0, np.nan)
    # Median of the remaining values (NaN is skipped); robust to outliers
    median_val = observed.median()
    # Fill the gaps and write the column back
    # NOTE(review): the median is taken over all rows of df, i.e. before any train/test split
    df_clean[col] = observed.fillna(median_val)

In [None]:
# 5) Split features and target
X = df_clean.drop(columns=['Outcome'])
y = df_clean['Outcome'].astype(int)

# 6) Train/test split (stratified to preserve class ratio)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=41, stratify=y
)

# Independent copies: the re-imputation below writes into the splits and must not
# touch X or raise chained-assignment warnings.
X_train, X_test = X_train.copy(), X_test.copy()

# Avoid train/test leakage: df_clean was imputed with medians computed over ALL rows,
# so statistics of the held-out rows leaked into the training features. Re-fill every
# cell that was missing in the raw frame (0 or NaN) with the median of the observed
# TRAINING values only, and apply that same value to the test rows.
for col in zero_as_missing_cols:
    was_missing = df[col].isna() | df[col].eq(0)
    if not was_missing.any():
        continue  # nothing was imputed in this column
    train_missing = was_missing.loc[X_train.index]
    test_missing = was_missing.loc[X_test.index]
    train_median = X_train.loc[~train_missing, col].median()
    X_train.loc[train_missing, col] = train_median
    X_test.loc[test_missing, col] = train_median

print("\nTrain shape:", X_train.shape, "Test shape:", X_test.shape)

In [None]:
# 7) Optional scaling
# Decision trees split on thresholds, so standardising is not required for them;
# drop this cell (and feed the raw splits downstream) if you prefer unscaled features.
# The scaler learns mean/std from the training split only and is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:



# 8) Baseline Decision Tree (quick start)
# Default hyperparameters with a fixed seed, fitted on the scaled training split.
baseline_clf = DecisionTreeClassifier(random_state=41).fit(X_train_scaled, y_train)
baseline_pred = baseline_clf.predict(X_test_scaled)

print("\n=== Baseline Decision Tree ===")
# One (label, score) pair per line; precision/recall/F1 refer to the positive class (1)
for metric_label, metric_value in [
    ("Accuracy:", accuracy_score(y_test, baseline_pred)),
    ("Balanced Accuracy:", balanced_accuracy_score(y_test, baseline_pred)),
    ("Precision (pos=1):", precision_score(y_test, baseline_pred, zero_division=0)),
    ("Recall (pos=1):", recall_score(y_test, baseline_pred, zero_division=0)),
    ("F1 (pos=1):", f1_score(y_test, baseline_pred, zero_division=0)),
]:
    print(metric_label, f"{metric_value:.4f}")


In [None]:

# 9) Hyperparameter tuning with GridSearchCV for better generalization
param_grid = {
    # 'log_loss' is intentionally not listed: scikit-learn documents "log_loss" and
    # "entropy" as the same Shannon-information-gain criterion, so including both only
    # re-fits identical trees and inflates the search by 50%.
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 3, 4, 5, 6, 8, 10],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': [None, 'sqrt', 'log2']
}

# Shuffled, stratified 5-fold CV with a fixed seed so the search is reproducible
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=41)
grid = GridSearchCV(
    DecisionTreeClassifier(random_state=41),
    param_grid=param_grid,
    scoring='balanced_accuracy',  # model selection metric
    cv=cv,
    n_jobs=-1,                    # use all available cores
    verbose=0
)
grid.fit(X_train_scaled, y_train)

# best_estimator_ is the winning configuration refitted on the full training split
best_clf = grid.best_estimator_
print("\n=== Best CV Model ===")
print("Best params:", grid.best_params_)
# ':.4f' (no space flag) so the value is formatted like every other metric in the notebook
print("Best CV balanced accuracy:", f"{grid.best_score_:.4f}")

In [None]:
# 10) Evaluate the tuned model
# Predictions of the grid-search winner on the scaled held-out split
y_pred = best_clf.predict(X_test_scaled)

print("\n=== Test Performance (Tuned Model) ===")
# One (label, score) pair per line; precision/recall/F1 refer to the positive class (1)
for metric_label, metric_value in [
    ("Accuracy:", accuracy_score(y_test, y_pred)),
    ("Balanced Accuracy:", balanced_accuracy_score(y_test, y_pred)),
    ("Precision (pos=1):", precision_score(y_test, y_pred, zero_division=0)),
    ("Recall (pos=1):", recall_score(y_test, y_pred, zero_division=0)),
    ("F1 (pos=1):", f1_score(y_test, y_pred, zero_division=0)),
]:
    print(metric_label, f"{metric_value:.4f}")

# Per-class breakdown with human-readable class names
print("\nClassification Report:\n")
print(
    classification_report(
        y_test,
        y_pred,
        target_names=['No Diabetes (0)','Diabetes (1)'],
        digits=4,
        zero_division=0,
    )
)


In [None]:
# 11) Confusion matrix
# labels=[0, 1] pins the row/column order, so the result is always laid out as
# [[TN, FP], [FN, TP]].
cm = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[0, 1])
print("Confusion Matrix (rows=true, cols=pred) [0=No, 1=Yes]:\n", cm)

In [None]:
# Extract TN, FP, FN, TP
# Unpack the 2x2 matrix row by row: first row is actual 0, second row is actual 1.
(TN, FP), (FN, TP) = cm
print(f"TN: {TN}, FP: {FP}, FN: {FN}, TP: {TP}")

In [None]:
# Plot heatmap
# Explicit figure/axes handles instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(4, 3))
sns.heatmap(
    cm,
    annot=True,   # write the count inside every cell
    fmt='d',      # counts are integers
    cmap='Blues',
    cbar=False,
    xticklabels=['No (0)','Yes (1)'],
    yticklabels=['No (0)','Yes (1)'],
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
fig.tight_layout()
plt.show()

In [None]:
# 12) Feature importances
# Importances of the tuned tree, labelled with the feature column names, largest first
importances = (
    pd.Series(best_clf.feature_importances_, index=X.columns)
    .sort_values(ascending=False)
)
print("\nFeature Importances:", importances, sep="\n")

# Horizontal bar chart, one bar per feature
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=importances.values, y=importances.index, color='teal', ax=ax)
ax.set_title('Decision Tree Feature Importances')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
fig.tight_layout()
plt.show()


In [None]:

# 13) Optional: visualize the tree (small trees only; may be large)
# If the tree is too big, set smaller max_depth in best_clf or plot the baseline_clf with smaller depth.
# This stays best-effort: a plotting failure is reported but does not stop the notebook.
fig = None
try:
    fig, ax = plt.subplots(figsize=(16, 10))
    plot_tree(
        best_clf,
        # Pass a plain list: scikit-learn releases that validate this parameter as
        # "list or None" reject a pandas Index, which previously made this cell
        # silently fall into the except branch.
        feature_names=list(X.columns),
        class_names=['No (0)','Yes (1)'],
        filled=True,
        max_depth=3,  # limit depth in visualization for readability
        ax=ax,
    )
    ax.set_title("Decision Tree (Top Levels)")
    plt.show()
except Exception as e:
    # Do not leave a blank, half-built figure behind when plotting fails
    if fig is not None:
        plt.close(fig)
    print("Tree plotting skipped:", e)

In [None]:
# 14) Build a production-ready pipeline and save it
import joblib
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer

# Columns where 0 means missing
zero_as_missing_cols = ['Glucose','BloodPressure','SkinThickness','Insulin','BMI']
all_features = X.columns.tolist()

def zeros_to_nan(df):
    """Return a copy of `df` with 0 replaced by NaN in the zero-as-missing columns.

    Columns from `zero_as_missing_cols` that are absent from `df` are skipped.
    The input must be a pandas DataFrame (column names are used); it is not modified.

    NOTE: joblib/pickle stores this function by reference (module + name), so the
    code that loads the saved pipeline must have `zeros_to_nan` and
    `zero_as_missing_cols` defined under the same names.
    """
    df = df.copy()
    for c in zero_as_missing_cols:
        if c in df.columns:
            df[c] = df[c].replace(0, np.nan)
    return df

# Preprocess: 0->NaN, median impute, scale (kept because the model was tuned on scaled features)
preprocess = Pipeline(steps=[
    ('zero_to_nan', FunctionTransformer(zeros_to_nan, validate=False)),
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())  # keep only if you used scaling in training
])

prod_pipe = Pipeline(steps=[
    ('prep', preprocess),
    # clone() yields an unfitted estimator with the tuned hyperparameters. Passing
    # best_clf itself would make prod_pipe.fit() refit that object in place and
    # silently change the model the evaluation cells above refer to.
    ('clf', clone(best_clf))
])

# Fit on the RAW training rows (zeros still present) so that the pipeline's own
# zero->NaN, imputation and scaling steps learn their statistics from training data only.
# X_train comes from the already-imputed frame, on which those steps would see no missing markers.
X_train_raw = df.loc[X_train.index, all_features]
prod_pipe.fit(X_train_raw, y_train)

# Sanity check: the artifact must work end-to-end on raw, un-imputed held-out rows
X_test_raw = df.loc[X_test.index, all_features]
print("Pipeline balanced accuracy on raw held-out rows:",
      f"{balanced_accuracy_score(y_test, prod_pipe.predict(X_test_raw)):.4f}")

# Save the pipeline
joblib.dump(prod_pipe, 'diabetes_decision_tree_pipeline.joblib')
print("Saved pipeline to diabetes_decision_tree_pipeline.joblib")
