In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
import joblib

In [18]:
# Read the raw dataset from the CSV file in the current working directory.
df = pd.read_csv(filepath_or_buffer="diabetes.csv")

In [19]:
def categorize_bmi(bmi):
    """Map a numeric BMI value to a weight-category label.

    Categories use contiguous half-open intervals so every value lands in
    exactly one bucket:

    * ``bmi < 18.5``        -> "Underweight"
    * ``18.5 <= bmi < 25``  -> "Normal weight"
    * ``25 <= bmi < 30``    -> "Overweight"
    * ``bmi >= 30``         -> "Obese"

    The upper bounds are 25 and 30 rather than 24.9 and 29.9: with the latter,
    values such as 24.9, 24.95 or 29.9 matched no intermediate branch and fell
    through to "Obese".

    Parameters
    ----------
    bmi : float
        Body-mass index value.

    Returns
    -------
    str
        One of "Underweight", "Normal weight", "Overweight", "Obese".

    Notes
    -----
    NaN compares False against every threshold and is therefore labelled
    "Obese" (unchanged from the previous behaviour); handle missing values
    upstream if that is not wanted.
    """
    if bmi < 18.5:
        return "Underweight"
    if bmi < 25:
        return "Normal weight"
    if bmi < 30:
        return "Overweight"
    return "Obese"

In [20]:
# Derive the categorical BMI feature on the full frame so that both splits
# carry the column.
df["BMI_category"] = df["BMI"].map(categorize_bmi)

# Hold out 20% of the rows for validation, stratified on the target so both
# splits keep the same class proportions; the seed makes the split repeatable.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["Outcome"],
)

# Separate the feature columns from the "Outcome" target for each split.
X_train, y_train = train_df.drop("Outcome", axis=1), train_df["Outcome"]
X_val, y_val = val_df.drop("Outcome", axis=1), val_df["Outcome"]

In [21]:
# Columns that are standardised, and columns that are one-hot encoded later.
numeric_features = [
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
]
categorical_features = ["BMI_category"]

# Learn the scaling statistics from the training split only, then apply the
# same fitted transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train[numeric_features])
X_train[numeric_features] = scaler.transform(X_train[numeric_features])
X_val[numeric_features] = scaler.transform(X_val[numeric_features])

In [22]:
# One-hot encode the categorical column(s). drop="first" omits one indicator
# column per feature, and sparse_output=False returns a dense array.
encoder = OneHotEncoder(drop="first", sparse_output=False)

# Fit on the training split only, then encode both splits with the fitted encoder.
encoder.fit(X_train[categorical_features])
X_train_encoded = encoder.transform(X_train[categorical_features])
X_val_encoded = encoder.transform(X_val[categorical_features])

In [23]:
# Names of the indicator columns produced by the fitted encoder.
encoded_columns = encoder.get_feature_names_out(categorical_features)

# Wrap each encoded array in a DataFrame that reuses the row index of the
# split it came from, so it can be aligned with the numeric features.
X_train_encoded_df, X_val_encoded_df = (
    pd.DataFrame(encoded, columns=encoded_columns, index=frame.index)
    for encoded, frame in ((X_train_encoded, X_train), (X_val_encoded, X_val))
)

In [24]:
# Final design matrices: scaled numeric columns followed by the one-hot
# indicator columns, placed side by side on the shared row index.
X_train = pd.concat(
    (X_train[numeric_features], X_train_encoded_df),
    axis="columns",
)
X_val = pd.concat(
    (X_val[numeric_features], X_val_encoded_df),
    axis="columns",
)

In [25]:
# Small search over the number of neighbours: each candidate is fitted on the
# training split and scored by F1 on the validation split; the best fitted
# model, its score and its k are kept.
best_knn_model = None
best_knn_f1 = 0
best_k = 0

for k in [3, 5, 7]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_val)
    f1 = f1_score(y_val, y_pred)

    # The `is None` guard guarantees that a model is always selected, even if
    # every candidate scores F1 == 0 (previously best_knn_model stayed None
    # and could later be pickled as None). Ties still keep the earliest k.
    if best_knn_model is None or f1 > best_knn_f1:
        best_knn_f1 = f1
        best_knn_model = knn
        best_k = k

print(f"Best KNN model: k={best_k} with F1 score={best_knn_f1:.4f}")


Best KNN model: k=7 with F1 score=0.5743


In [26]:
# Small search over tree depth: each candidate is fitted on the training
# split and scored by F1 on the validation split; the best fitted model, its
# score and its depth are kept. random_state makes tree fitting repeatable.
best_dt_model = None
best_dt_f1 = 0
best_depth = 0

for depth in [3, 5, 7]:
    dt = DecisionTreeClassifier(max_depth=depth, random_state=42)
    dt.fit(X_train, y_train)
    y_pred = dt.predict(X_val)
    f1 = f1_score(y_val, y_pred)

    # The `is None` guard guarantees that a model is always selected, even if
    # every candidate scores F1 == 0 (previously best_dt_model stayed None
    # and could later be pickled as None). Ties still keep the earliest depth.
    if best_dt_model is None or f1 > best_dt_f1:
        best_dt_f1 = f1
        best_dt_model = dt
        best_depth = depth

print(f"Best Decision Tree model: max_depth={best_depth} with F1 score={best_dt_f1:.4f}")


Best Decision Tree model: max_depth=5 with F1 score=0.7222


In [27]:
# Keep whichever family scored the higher validation F1, together with the
# file name it will be saved under; on a tie the decision tree is chosen.
if best_knn_f1 > best_dt_f1:
    best_model, model_name = best_knn_model, "knn_model.pkl"
else:
    best_model, model_name = best_dt_model, "decision_tree_model.pkl"


In [28]:
# Persist the selected model together with the fitted scaler and encoder, so
# the same preprocessing can be re-applied when predicting later.
joblib.dump(value=best_model, filename=model_name)
joblib.dump(value=scaler, filename="scaler.pkl")
joblib.dump(value=encoder, filename="encoder.pkl")
print(f"Best model saved as {model_name}")

Best model saved as decision_tree_model.pkl


In [29]:
def predict_diabetes(sample):
    """Predict the diabetes label for one raw (unscaled, unencoded) sample.

    The saved model, scaler and encoder are loaded from disk, and the same
    preprocessing used for training is applied: BMI category derivation,
    scaling of the numeric features and one-hot encoding of the categorical
    features.

    Parameters
    ----------
    sample : pandas.Series or mapping
        One record holding at least the columns in ``numeric_features`` with
        their original, unscaled values. Extra keys are ignored, and any
        existing "BMI_category" entry is recomputed from "BMI". Do not pass
        rows that were already scaled or encoded: they would be transformed
        a second time.

    Returns
    -------
    str
        "Diabetic" if the model predicts class 1, otherwise "Not Diabetic".

    Raises
    ------
    KeyError
        If a required feature column is missing from ``sample``.
    """
    # These artifacts are pickles written by this notebook; joblib.load must
    # only be used on trusted files.
    model = joblib.load(model_name)
    scaler = joblib.load("scaler.pkl")
    encoder = joblib.load("encoder.pkl")

    # Accept a pandas Series (as before) or any plain mapping.
    record = sample.to_dict() if hasattr(sample, "to_dict") else dict(sample)
    sample_df = pd.DataFrame([record])
    sample_df["BMI_category"] = sample_df["BMI"].apply(categorize_bmi)

    # Keep everything as named DataFrames: the scaler, encoder and model were
    # fitted on DataFrames, so passing bare arrays triggers sklearn's
    # feature-name warnings and silently relies on column position.
    numeric_scaled = pd.DataFrame(
        scaler.transform(sample_df[numeric_features]),
        columns=numeric_features,
        index=sample_df.index,
    )
    categorical_encoded = pd.DataFrame(
        encoder.transform(sample_df[categorical_features]),
        columns=encoder.get_feature_names_out(categorical_features),
        index=sample_df.index,
    )

    # Same column layout as the training matrix: numeric first, then one-hot.
    final_sample = pd.concat([numeric_scaled, categorical_encoded], axis=1)
    prediction = model.predict(final_sample)
    return "Diabetic" if prediction[0] == 1 else "Not Diabetic"

In [30]:
# Spot-check the saved pipeline on a few validation rows.
# predict_diabetes() derives BMI_category and applies the scaler and encoder
# itself, so it must receive raw rows from val_df. X_val already holds scaled
# and one-hot encoded features; feeding those rows in would categorise a
# z-scored BMI and scale every numeric feature a second time.
sample_indices = val_df.sample(5, random_state=42).index
for idx in sample_indices:
    sample = val_df.loc[idx]
    result = predict_diabetes(sample)
    print(f"Sample {idx}: {result}")


Sample 116: Not Diabetic
Sample 402: Not Diabetic
Sample 425: Not Diabetic
Sample 736: Not Diabetic
Sample 330: Not Diabetic


