In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
import joblib  # For saving models and transformers


In [30]:
df = pd.read_csv("diabetes.csv")  # Load the dataset


In [41]:
def categorize_bmi(bmi):
    if bmi < 18.5:
        return "Underweight"
    elif 18.5 <= bmi < 24.9:
        return "Normal weight"
    elif 25 <= bmi < 29.9:
        return "Overweight"
    else:
        return "Obese"

df["BMI_category"] = df["BMI"].apply(categorize_bmi)


In [32]:
X = df.drop(columns=["Outcome"])  # Features
y = df["Outcome"]  # Target

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [33]:
numeric_features = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "Age"]
scaler = StandardScaler()
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_val[numeric_features] = scaler.transform(X_val[numeric_features])


In [34]:
categorical_features = ["BMI_category"]
encoder = OneHotEncoder(drop="first", sparse=False)
X_train_encoded = encoder.fit_transform(X_train[categorical_features])
X_val_encoded = encoder.transform(X_val[categorical_features])

# Convert encoded features into DataFrame
encoded_train_df = pd.DataFrame(X_train_encoded, columns=encoder.get_feature_names_out(categorical_features))
encoded_val_df = pd.DataFrame(X_val_encoded, columns=encoder.get_feature_names_out(categorical_features))

# Drop original categorical column and add encoded columns
X_train = X_train.drop(columns=categorical_features).reset_index(drop=True)
X_val = X_val.drop(columns=categorical_features).reset_index(drop=True)

X_train = pd.concat([X_train, encoded_train_df], axis=1)
X_val = pd.concat([X_val, encoded_val_df], axis=1)




In [35]:
best_knn, best_k, best_knn_score = None, None, 0

for k in [3, 5, 7]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_val)
    f1 = f1_score(y_val, y_pred)
    
    if f1 > best_knn_score:
        best_knn, best_k, best_knn_score = knn, k, f1

print(f"Best KNN Model: k={best_k}, F1 Score={best_knn_score:.4f}")


Best KNN Model: k=7, F1 Score=0.5577


In [36]:
best_tree, best_depth, best_tree_score = None, None, 0

for depth in [3, 5, 7]:
    tree = DecisionTreeClassifier(max_depth=depth, random_state=42)
    tree.fit(X_train, y_train)
    y_pred = tree.predict(X_val)
    f1 = f1_score(y_val, y_pred)

    if f1 > best_tree_score:
        best_tree, best_depth, best_tree_score = tree, depth, f1

print(f"Best Decision Tree: max_depth={best_depth}, F1 Score={best_tree_score:.4f}")


Best Decision Tree: max_depth=5, F1 Score=0.7222


In [37]:
# Choose best model
best_model = best_knn if best_knn_score > best_tree_score else best_tree
model_name = "knn_model.pkl" if best_knn_score > best_tree_score else "tree_model.pkl"

# Save model, scaler, and encoder
joblib.dump(best_model, model_name)
joblib.dump(scaler, "scaler.pkl")
joblib.dump(encoder, "encoder.pkl")

print(f"Saved best model: {model_name}")


Saved best model: tree_model.pkl


In [44]:
def predict(sample):
    # Load preprocessor and model
    preprocessor = joblib.load("preprocessor.pkl")
    model = joblib.load("best_model.pkl")

    # Convert sample to DataFrame
    sample_df = pd.DataFrame([sample])

    # **Recompute BMI Category**
    sample_df["BMI_category"] = sample_df["BMI"].apply(categorize_bmi)

    # Apply preprocessing
    sample_transformed = preprocessor.transform(sample_df)

    # Predict
    prediction = model.predict(sample_transformed)
    return "Diabetic" if prediction[0] == 1 else "Non-Diabetic"


In [None]:
def predict(sample):
    # Load saved model and transformers
    model = joblib.load("best_model.pkl")
    scaler = joblib.load("scaler.pkl")
    encoder = joblib.load("encoder.pkl")
    trained_columns = joblib.load("trained_feature_columns.pkl")  # Ensure feature consistency

    # Convert sample to DataFrame
    sample_df = pd.DataFrame([sample])

    # Apply standard scaling on numeric features
    sample_df[numeric_features] = scaler.transform(sample_df[numeric_features])

    # Apply one-hot encoding
    encoded_sample = encoder.transform(sample_df[categorical_features])
    encoded_sample_df = pd.DataFrame(
        encoded_sample, columns=encoder.get_feature_names_out(categorical_features)
    )

    # Drop original categorical columns and concatenate encoded features
    sample_df = sample_df.drop(columns=categorical_features).reset_index(drop=True)
    sample_df = pd.concat([sample_df, encoded_sample_df], axis=1)

    # Ensure test data has the same feature set as training data
    for col in trained_columns:
        if col not in sample_df.columns:
            sample_df[col] = 0  # Add missing columns with default value (0 for one-hot encoding)

    # Ensure correct order and remove extra columns
    sample_df = sample_df[trained_columns]

    # Convert to NumPy array to remove feature names
    sample_array = sample_df.to_numpy()

    # Predict class
    prediction = model.predict(sample_array)
    return "Diabetic" if prediction[0] == 1 else "Non-Diabetic"

# Save feature names during training (Ensure you do this when training the model)
joblib.dump(X_train.columns.tolist(), "trained_feature_columns.pkl")

# Test inference with 5 samples
for i in range(5):
    sample = X_val.iloc[i].to_dict()
    print(f"Sample {i+1}: {predict(sample)}")


In [46]:
import pandas as pd
import numpy as np
import joblib  # For saving models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score

df = pd.read_csv("diabetes.csv")


def categorize_bmi(bmi):
    if bmi < 18.5:
        return "Underweight"
    elif 18.5 <= bmi < 24.9:
        return "Normal"
    elif 25 <= bmi < 29.9:
        return "Overweight"
    else:
        return "Obese"

df["BMI_category"] = df["BMI"].apply(categorize_bmi)


X = df.drop(columns=["Outcome"])
y = df["Outcome"]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

num_features = ["Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"]
cat_features = ["BMI_category"]

preprocessor = ColumnTransformer([
    ("num", StandardScaler(), num_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features)
])

X_train_transformed = preprocessor.fit_transform(X_train)
X_val_transformed = preprocessor.transform(X_val)

# Save the preprocessor for inference
joblib.dump(preprocessor, "preprocessor.pkl")


best_knn, best_k, best_f1_knn = None, None, 0
for k in [3, 5, 7]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_transformed, y_train)
    y_pred = knn.predict(X_val_transformed)
    f1 = f1_score(y_val, y_pred)
    if f1 > best_f1_knn:
        best_knn, best_k, best_f1_knn = knn, k, f1

print(f"Best KNN model: k={best_k}, F1 Score={best_f1_knn:.3f}")

Best KNN model: k=5, F1 Score=0.588


In [27]:

best_dt, best_depth, best_f1_dt = None, None, 0
for depth in [3, 5, 7]:
    dt = DecisionTreeClassifier(max_depth=depth, random_state=42)
    dt.fit(X_train_transformed, y_train)
    y_pred = dt.predict(X_val_transformed)
    f1 = f1_score(y_val, y_pred)
    if f1 > best_f1_dt:
        best_dt, best_depth, best_f1_dt = dt, depth, f1

print(f"Best Decision Tree model: max_depth={best_depth}, F1 Score={best_f1_dt:.3f}")

best_model = best_knn if best_f1_knn > best_f1_dt else best_dt
joblib.dump(best_model, "best_model.pkl")


def predict(sample):
    # Load preprocessor and model
    preprocessor = joblib.load("preprocessor.pkl")
    model = joblib.load("best_model.pkl")

    # Convert sample to DataFrame & Apply preprocessing
    sample_df = pd.DataFrame([sample])
    sample_transformed = preprocessor.transform(sample_df)

    # Predict
    prediction = model.predict(sample_transformed)
    return "Diabetic" if prediction[0] == 1 else "Non-Diabetic"

for i in range(5):
    sample = X_val.iloc[i].to_dict()
    print(f"Sample {i+1}: {predict(sample)}")

Best Decision Tree model: max_depth=5, F1 Score=0.704
Sample 1: Diabetic
Sample 2: Non-Diabetic
Sample 3: Non-Diabetic
Sample 4: Diabetic
Sample 5: Non-Diabetic
