# Import Libraries 

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import cross_val_score
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK

  from pandas.core import (


## Load the Dataset

In [2]:
# Read the dataset from diabetes.csv (path is relative to the working directory).
df = pd.read_csv("diabetes.csv")
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


## Data Preprocessing 

In [3]:
# List the available column names before choosing the model inputs.
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [8]:
# Restrict the frame to seven predictors plus the 'Outcome' target.
# 'Pregnancies' is the only loaded column that is not carried forward.
# NOTE(review): the reason for excluding 'Pregnancies' is not recorded — confirm it is intentional.
df = df.loc[
    :,
    [
        'Glucose',
        'BloodPressure',
        'SkinThickness',
        'Insulin',
        'BMI',
        'DiabetesPedigreeFunction',
        'Age',
        'Outcome',
    ],
]
df


Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,148,72,35,0,33.6,0.627,50,1
1,85,66,29,0,26.6,0.351,31,0
2,183,64,0,0,23.3,0.672,32,1
3,89,66,23,94,28.1,0.167,21,0
4,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...
763,101,76,48,180,32.9,0.171,63,0
764,122,70,27,0,36.8,0.340,27,0
765,121,72,23,112,26.2,0.245,30,0
766,126,60,0,0,30.1,0.349,47,1


In [9]:
# Confirm the frame's dimensions as (rows, columns) after the column selection.
df.shape

(768, 8)

In [10]:
# Count NaN entries per column.
# NOTE(review): isna() only detects NaN; the 0 entries visible in the preview
# (e.g. Insulin, SkinThickness) are not counted — check whether those zeros
# stand in for missing measurements.
df.isna().sum()

Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [11]:
# Inspect each column's dtype to verify everything is numeric before modelling.
df.dtypes

Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object

In [13]:
# Separate the target column from the remaining feature columns.
y = df['Outcome']
X = df.drop(columns=['Outcome'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Model Training

In [14]:
## Baseline training + MLflow logging (with signature)
import mlflow.sklearn
from mlflow.models.signature import infer_signature
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier

# Candidate baselines, keyed by the name used for the MLflow run.
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42),
    "GradientBoosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
    "KNeighbors": KNeighborsClassifier(n_neighbors=5)
}

# One MLflow run per baseline: fit, score on the hold-out set, log the model.
for name, model in models.items():
    with mlflow.start_run(run_name=f"{name}_baseline") as run:
        model.fit(X_train, y_train)

        # Hard predictions always; positive-class probabilities only when the
        # estimator exposes predict_proba.
        preds = model.predict(X_test)
        if hasattr(model, "predict_proba"):
            probs = model.predict_proba(X_test)[:, 1]
        else:
            probs = None

        acc = accuracy_score(y_test, preds)
        mlflow.log_metric("accuracy", acc)

        # AUC needs scores rather than labels, so it is skipped without probabilities.
        if probs is not None:
            auc = roc_auc_score(y_test, probs)
            mlflow.log_metric("auc", auc)

        # The signature records the input columns and prediction output alongside the model.
        signature = infer_signature(X_train, model.predict(X_train))
        mlflow.sklearn.log_model(
            sk_model=model,
            artifact_path="model",
            signature=signature,
        )

        print(f"{name} model uri: runs:/{run.info.run_id}/model")



LogisticRegression model uri: runs:/c81b1c96edd649ae99c87b536157ff74/model




GradientBoosting model uri: runs:/519bd3679a184e93a35cf2d155e735c6/model
KNeighbors model uri: runs:/aaa76677ff4d470eb098b594009f64b6/model


In [18]:
# The logged runs and models can be browsed in the MLflow UI at http://127.0.0.1:5000

In [19]:
## Hyperopt tuning (log each candidate to MLflow) and register best

def gb_objective(params):
    """Hyperopt objective for tuning a GradientBoostingClassifier.

    Evaluates one sampled configuration with 5-fold cross-validated ROC AUC on
    the training split and records it as a nested MLflow run.

    Parameters
    ----------
    params : dict
        Sampled values for 'n_estimators', 'max_depth', 'learning_rate' and
        'subsample'.

    Returns
    -------
    dict
        {'loss': negative mean CV AUC, 'status': STATUS_OK}. The AUC is
        negated because Hyperopt minimises the loss.
    """
    with mlflow.start_run(nested=True):
        # hp.quniform samples arrive as floats (e.g. 170.0). Cast once and use
        # the same values for both the estimator and the MLflow record, so the
        # logged parameters match the model that was actually evaluated.
        hyperparams = {
            'n_estimators': int(params['n_estimators']),
            'max_depth': int(params['max_depth']),
            'learning_rate': float(params['learning_rate']),
            'subsample': float(params['subsample']),
        }

        clf = GradientBoostingClassifier(random_state=42, **hyperparams)

        # 5-fold CV on the training set only; the test split stays untouched.
        cv_auc = float(
            cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc').mean()
        )

        # Log the evaluated configuration and its score to MLflow.
        mlflow.log_params(hyperparams)
        mlflow.log_metric('cv_auc', cv_auc)

        return {'loss': -cv_auc, 'status': STATUS_OK}

In [20]:
# Hyperopt search space for the gradient-boosting tuner.
space = dict(
    # Quantised samples: step 10 for tree count, step 1 for depth.
    n_estimators=hp.quniform('n_estimators', 50, 300, 10),
    max_depth=hp.quniform('max_depth', 2, 10, 1),
    # Log-uniform between exp(-3) and exp(0), i.e. roughly 0.05 to 1.
    learning_rate=hp.loguniform('learning_rate', -3, 0),
    subsample=hp.uniform('subsample', 0.6, 1.0),
)

In [21]:
trials = Trials()
with mlflow.start_run(run_name='GB_Hyperopt') as parent_run:
    # Search the space with TPE; gb_objective logs every candidate as a nested
    # run. The fixed rstate makes the sequence of sampled candidates repeatable
    # across Restart & Run All.
    # NOTE(review): np.random.default_rng assumes a hyperopt release whose fmin
    # accepts a numpy Generator (older releases took RandomState) — confirm
    # against the installed version.
    best = fmin(fn=gb_objective,
                space=space,
                algo=tpe.suggest,
                max_evals=50,
                trials=trials,
                rstate=np.random.default_rng(42))

    print("Best hyperparameters:", best)

    # fmin reports quniform values as floats, so cast before handing them to
    # scikit-learn.
    best_params = {
        'n_estimators': int(best['n_estimators']),
        'max_depth': int(best['max_depth']),
        'learning_rate': float(best['learning_rate']),
        'subsample': float(best['subsample']),
    }

    # Record the winning configuration on the parent run, so the run that owns
    # the registered model also documents how that model was configured.
    mlflow.log_params(best_params)

    # Train the final model with the best params on the full training split.
    best_clf = GradientBoostingClassifier(random_state=42, **best_params)
    best_clf.fit(X_train, y_train)

    # Evaluate once on the held-out test split.
    preds = best_clf.predict(X_test)
    probs = best_clf.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, preds)
    auc = roc_auc_score(y_test, probs)

    mlflow.log_metric('test_accuracy', acc)
    mlflow.log_metric('test_auc', auc)

    # Log the final model with its signature.
    signature = infer_signature(X_train, best_clf.predict(X_train))
    mlflow.sklearn.log_model(best_clf, artifact_path='model', signature=signature)

    model_uri = f"runs:/{parent_run.info.run_id}/model"
    print("Final Gradient Boosting model uri:", model_uri)

    # Register the model in the Model Registry (the tracking backend must support it).
    model_details = mlflow.register_model(model_uri, "best_diabtets_predictive_model")
    print('Registered model version:', model_details.version)

100%|███████████████████████████████████████████████| 50/50 [02:36<00:00,  3.12s/trial, best loss: -0.8479654238956563]
Best hyperparameters: {'learning_rate': 0.05031273980804025, 'max_depth': 2.0, 'n_estimators': 170.0, 'subsample': 0.9165317734238454}




Final Gradient Boosting model uri: runs:/c4f2a970ae254cf99b72df1aae53e24b/model


Successfully registered model 'best_diabtets_predictive_model'.


Registered model version: 1


Created version '1' of model 'best_diabtets_predictive_model'.


# Serve the Model Locally

In [30]:
import subprocess
import time
import urllib.request

# Start the MLflow model server for version 1 of the registered model.
# Output goes to a log file rather than subprocess.PIPE: nothing in this
# notebook reads the pipes, so a chatty server could fill the pipe buffer and
# block. The parent's file handle can be closed right after launch because the
# child keeps its own copy.
with open("mlflow_serve.log", "w") as server_log:
    process = subprocess.Popen(
        ["mlflow", "models", "serve",
         "-m", "models:/best_diabtets_predictive_model/1",
         "-p", "1234",
         "--no-conda"],
        stdout=server_log,
        stderr=subprocess.STDOUT
    )

# Poll the server's /ping endpoint instead of sleeping a fixed time, so a slow
# start is tolerated and a crashed server is reported immediately.
startup_deadline = time.monotonic() + 60
while True:
    if process.poll() is not None:
        raise RuntimeError(
            f"MLflow server exited during startup (code {process.returncode}); "
            "see mlflow_serve.log"
        )
    try:
        with urllib.request.urlopen("http://127.0.0.1:1234/ping", timeout=2) as reply:
            if reply.status == 200:
                break
    except OSError:
        # Connection refused / timed out: the server is not listening yet.
        pass
    if time.monotonic() > startup_deadline:
        process.terminate()
        raise TimeoutError(
            "MLflow server did not become ready within 60 seconds; see mlflow_serve.log"
        )
    time.sleep(1)

print("MLflow server should be running now.")

MLflow server should be running now.


# Getting predictions

In [31]:
import requests
import pandas as pd

# A single example patient, using the same feature columns the model was trained on.
data = pd.DataFrame([{
    "Glucose": 120,
    "BloodPressure": 70,
    "SkinThickness": 20,
    "Insulin": 79,
    "BMI": 28.0,
    "DiabetesPedigreeFunction": 0.5,
    "Age": 45
}])

# The scoring endpoint expects the rows wrapped under the "inputs" key.
payload = {"inputs": data.to_dict(orient="records")}

# json= serialises the payload and sets the JSON Content-Type header.
# The timeout prevents the cell from hanging forever if the server is wedged.
response = requests.post(
    "http://127.0.0.1:1234/invocations",
    json=payload,
    timeout=30
)

# Surface HTTP errors (bad schema, server failure) directly instead of as a
# confusing JSON-decoding failure.
response.raise_for_status()

print(response.json())

{'predictions': [0]}


In [32]:

# Stop the model server and wait for it to exit, so the child process is reaped
# rather than left behind; escalate to kill if it ignores the terminate request.
process.terminate()
try:
    process.wait(timeout=10)
except subprocess.TimeoutExpired:
    process.kill()
    process.wait()