### Import Packages ###

In [0]:
# Comment code below after running the mlflow update
%pip install "mlflow-skinny[databricks]>=2.4.1"
dbutils.library.restartPython()

[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m
[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m


In [0]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings("ignore")
import mlflow
import mlflow.sklearn
from mlflow import MlflowClient
from mlflow.models.signature import infer_signature
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

### Define Catalog and Schema in Unity Catalog ###

In [0]:
catalog = "data_science"
schema = "models"

### Load Kaggle Data Set ###

https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset

In [0]:
data = pd.read_csv('/Workspace/Users/awnish.choudhary@anthology.ai/healthcare-dataset-stroke-data.csv')

### Clean the Data ###

In [0]:
# Convert string columns to numeric values in data
data['gender'] = data['gender'].map({'Male': 0, 'Female': 1, 'Other': 2})
data['ever_married'] = data['ever_married'].map({'No': 0, 'Yes': 1})
data['work_type'] = data['work_type'].map({'Private': 0, 'Self-employed': 1, 'Govt_job': 2, 'children': 3, 'Never_worked': 4})
data['Residence_type'] = data['Residence_type'].map({'Urban': 0, 'Rural': 1})
data['smoking_status'] = data['smoking_status'].map({'Unknown': 0, 'never smoked': 1, 'formerly smoked': 2, 'smokes': 3})

data

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,0,67.0,0,1,1,0,0,228.69,36.6,2,1
1,51676,1,61.0,0,0,1,1,1,202.21,,1,1
2,31112,0,80.0,0,1,1,0,1,105.92,32.5,1,1
3,60182,1,49.0,0,0,1,0,0,171.23,34.4,3,1
4,1665,1,79.0,1,0,1,1,1,174.12,24.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,1,80.0,1,0,1,0,0,83.75,,1,0
5106,44873,1,81.0,0,0,1,1,0,125.20,40.0,1,0
5107,19723,1,35.0,0,0,1,1,1,82.99,30.6,1,0
5108,37544,0,51.0,0,0,1,0,1,166.29,25.6,2,0


In [0]:
# Drop rows with missing values
data.dropna(inplace=True)

In [0]:
X = data.drop(columns=["stroke"])
y = data["stroke"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

### Model training ###

In [0]:
# Train a random forest model
numTrees = 100
maxDepth = 7
model = RandomForestClassifier(n_estimators=numTrees, max_depth=maxDepth)
model.fit(X_train, y_train)
model_name="random_forest_model"
# Predict and calculate F1 score
y_pred = model.predict(X_test)
f1 = f1_score(y_test, y_pred, average='macro')
# Calculate model accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy = {accuracy}")

Uploading artifacts:   0%|          | 0/3 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Test Accuracy = 0.96673455532926


### Add model signature which is required to register the model ###

In [0]:
# Infer model signature
sample = X_train.head()
signature = infer_signature(sample, model.predict(sample))

### Log and Register model to MLflow ###

In [0]:
with mlflow.start_run():
    mlflow.sklearn.log_model(model, model_name, signature=signature)
    uri = mlflow.get_artifact_uri(model_name)
    # Log metrics
    mlflow.log_metric("f1_score", f1)
    mlflow.log_metric("accuracy_score", accuracy)
    mlflow.set_registry_uri("databricks-uc")
    # Register Model
    mlflow.register_model(
        model_uri=uri,
        name=f"{catalog}.{schema}.{model_name}"
    )

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Successfully registered model 'data_science.models.random_forest_model'.


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Created version '1' of model 'data_science.models.random_forest_model'.


### Create "Champion" alias for latest version of the model currently in production  ###

In [0]:
# Initialize the MLflow client
client = MlflowClient()

# Search for all versions of the model and fetch the latest one
model_version_infos = client.search_model_versions(f"name='{catalog}.{schema}.{model_name}'")
new_model_version = max(model_version_info.version for model_version_info in model_version_infos)

# Set the alias for the latest model version
client.set_registered_model_alias(
    name=f"{catalog}.{schema}.{model_name}",
    alias="Champion",
    version=new_model_version
)

### Load Registered Model for inference ###

In [0]:
model_version_uri = 'models:/'+f"{catalog}.{schema}.{model_name}@Champion"
champion_version = mlflow.sklearn.load_model(model_version_uri)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

In [0]:
champion_version.predict(X_test)

array([0, 0, 0, ..., 0, 0, 0])