In [13]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import joblib
import os

In [14]:
# ─────────────────────────────────────────
# 1. Load Data
# ─────────────────────────────────────────
def load_data(csv_path: str) -> pd.DataFrame:
    """Read the dataset from a CSV file and print a short preview of it.

    Parameters
    ----------
    csv_path : str
        Location of the CSV file, passed straight to ``pandas.read_csv``.

    Returns
    -------
    pd.DataFrame
        The parsed file, returned as read (no cleaning is applied here).

    Notes
    -----
    Prints the row/column counts followed by the first rows (``df.head()``).
    """
    df = pd.read_csv(csv_path)
    n_rows, n_cols = df.shape
    # The check mark is spelled as an escape sequence so the message no longer
    # depends on the encoding the file is saved/read with (it used to be emitted
    # as a double-encoded, garbled sequence).
    print(f"\u2705 Data loaded: {n_rows} rows, {n_cols} columns")
    print(df.head())
    return df

In [15]:
# ─────────────────────────────────────────
# 2. Preprocess Data
# ─────────────────────────────────────────
def preprocess(df: pd.DataFrame):
    """Encode the categorical columns and split the frame into features and target.

    Parameters
    ----------
    df : pd.DataFrame
        Raw dataset. It must contain the eight feature columns listed in
        ``feature_cols`` below plus ``diabetes``. Column names are matched
        case-insensitively. The caller's frame is not modified.

    Returns
    -------
    X : pd.DataFrame
        The feature columns, in ``feature_cols`` order, with ``gender`` and
        ``smoking_history`` replaced by numeric codes.
    y : pd.Series
        The ``diabetes`` column.
    feature_cols : list of str
        The lower-cased feature names (for example ``hba1c_level``).

    Raises
    ------
    KeyError
        If a required column is missing.
    ValueError
        If ``gender`` holds a value outside Female / Male / Other (including
        missing values).
    """
    df = df.copy()

    # Normalise the header first so that every column lookup below is
    # case-insensitive; doing it after the lookups made it ineffective for
    # "gender" and "smoking_history".
    df.columns = [str(c).lower() for c in df.columns]

    # Features and target
    feature_cols = [
        "gender", "age", "hypertension", "heart_disease",
        "smoking_history", "bmi", "HbA1c_level", "blood_glucose_level"
    ]
    feature_cols = [c.lower() for c in feature_cols]

    missing = [c for c in feature_cols + ["diabetes"] if c not in df.columns]
    if missing:
        raise KeyError(f"Missing required column(s): {missing}")

    # Encode gender with a fixed table. A freshly fitted label encoder assigns
    # codes from whichever categories happen to be present, so a frame without
    # e.g. "Female" rows would silently shift Male to 0; a fixed table keeps
    # the codes identical for every input.
    gender_map = {"Female": 0, "Male": 1, "Other": 2}
    gender_codes = df["gender"].map(gender_map)
    unknown = gender_codes.isna()
    if unknown.any():
        bad = sorted(df.loc[unknown, "gender"].astype(str).unique())
        raise ValueError(
            f"Unexpected value(s) in 'gender' column: {bad}; "
            f"expected one of {sorted(gender_map)}"
        )
    df["gender"] = gender_codes.astype("int64")

    # Encode smoking_history; anything not in the table (including missing
    # values) falls back to 1, the code used for "No Info".
    smoking_map = {
        "never": 0,
        "No Info": 1,
        "former": 2,
        "current": 3,
        "ever": 4,
        "not current": 5,
    }
    df["smoking_history"] = df["smoking_history"].map(smoking_map).fillna(1)

    X = df[feature_cols]
    y = df["diabetes"]

    # The chart emoji is written as an escape so the output does not depend on
    # the encoding of the source file.
    print(f"\n\U0001F4CA Target distribution:\n{y.value_counts()}")
    print(f"\nDiabetes positive rate: {y.mean()*100:.2f}%")

    return X, y, feature_cols


In [16]:
# ─────────────────────────────────────────
# 3. Train Model
# ─────────────────────────────────────────
def train(X, y, *, test_size: float = 0.2, random_state: int = 42):
    """Fit a scaled logistic-regression classifier and print its hold-out metrics.

    Parameters
    ----------
    X : feature matrix
        Passed to ``train_test_split`` and then to the scaler.
    y : target vector
        Also used as the stratification key for the split.
    test_size : float, keyword-only, default 0.2
        Fraction of the data held out for evaluation.
    random_state : int, keyword-only, default 42
        Seed given to both the split and the classifier.

    Returns
    -------
    model : LogisticRegression
        The classifier fitted on the scaled training split.
    scaler : StandardScaler
        The scaler fitted on the training split only; apply it to any data
        before calling ``model.predict``.

    Notes
    -----
    Prints a classification report, the ROC AUC and the confusion matrix, all
    computed on the held-out split.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Fit the scaler on the training split only, then reuse its statistics for
    # the test split so no information from the hold-out data leaks in.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model = LogisticRegression(
        max_iter=1000,
        class_weight="balanced",       # Because the dataset is imbalanced
        random_state=random_state,
        C=1.0,
    )
    model.fit(X_train_scaled, y_train)

    # Evaluation
    y_pred = model.predict(X_test_scaled)
    y_prob = model.predict_proba(X_test_scaled)[:, 1]  # second column of predict_proba

    # The chart emoji is written as an escape so the output does not depend on
    # the encoding of the source file.
    print("\n\U0001F4C8 Model Evaluation Report:")
    print(classification_report(y_test, y_pred, target_names=["Non-Diabetic", "Diabetic"]))
    print(f"AUC-ROC: {roc_auc_score(y_test, y_prob):.4f}")

    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    return model, scaler


In [None]:
# ─────────────────────────────────────────
# 4. Save Model & Artifacts
# ─────────────────────────────────────────
def save_model(model, scaler, feature_cols, output_dir: str = "../diabetes/models"):
    """Persist the model, scaler and feature list as joblib files.

    Parameters
    ----------
    model, scaler, feature_cols
        Objects to serialise; each is written with ``joblib.dump``.
    output_dir : str, default "../diabetes/models"
        Target directory. It is created if it does not exist. A relative path
        is resolved against the current working directory.

    Notes
    -----
    Writes ``diabetes_model.pkl``, ``diabetes_scaler.pkl`` and
    ``diabetes_features.pkl`` into ``output_dir`` and prints a confirmation
    listing them.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Single source of truth for the file names, so what is printed always
    # matches what was written.
    artifacts = {
        "diabetes_model.pkl": model,
        "diabetes_scaler.pkl": scaler,
        "diabetes_features.pkl": feature_cols,
    }
    for filename, obj in artifacts.items():
        joblib.dump(obj, os.path.join(output_dir, filename))

    # The check mark is written as an escape so the message does not depend on
    # the encoding of the source file.
    print(f"\n\u2705 Model artifacts saved to: {output_dir}/")
    for filename in artifacts:
        print(f"   - {filename}")


In [19]:
# ─────────────────────────────────────────
# Main Execution
# ─────────────────────────────────────────
if __name__ == "__main__":
    # Not used in this cell; kept because it binds ``sys`` at module scope and
    # other cells may rely on it — TODO confirm and drop if unused.
    import sys

    # The dataset location can be overridden without editing the notebook by
    # setting the DIABETES_CSV_PATH environment variable; when it is unset the
    # original machine-specific path is used unchanged.
    csv_path = os.environ.get(
        "DIABETES_CSV_PATH",
        r"D:\University-Master\Term-3\Cloud Computing\PRO\Medical-Management-Panel\data\diabetes_prediction_dataset.csv",
    )

    # Pipeline: load -> encode/split columns -> fit and evaluate -> persist.
    df = load_data(csv_path)
    X, y, feature_cols = preprocess(df)
    model, scaler = train(X, y)
    save_model(model, scaler, feature_cols)

âœ… Data loaded: 100000 rows, 9 columns
   gender   age  hypertension  heart_disease smoking_history    bmi  \
0  Female  80.0             0              1           never  25.19   
1  Female  54.0             0              0         No Info  27.32   
2    Male  28.0             0              0           never  27.32   
3  Female  36.0             0              0         current  23.45   
4    Male  76.0             1              1         current  20.14   

   HbA1c_level  blood_glucose_level  diabetes  
0          6.6                  140         0  
1          6.6                   80         0  
2          5.7                  158         0  
3          5.0                  155         0  
4          4.8                  155         0  

ðŸ“Š Target distribution:
diabetes
0    91500
1     8500
Name: count, dtype: int64

Diabetes positive rate: 8.50%

ðŸ“ˆ Model Evaluation Report:
              precision    recall  f1-score   support

Non-Diabetic       0.99      0.89      0.93 