# MLFlow Databricks Integration

In [1]:
import mlflow
from mlflow.models import infer_signature
import pandas as pd
import json
from tempfile import TemporaryDirectory
from pathlib import Path

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.pipeline import Pipeline

In a Databricks workspace it is not necessary to set the tracking URI yourself - it is set for you as an environment variable. Furthermore, the experiment name has to be in the form of a path in your workspace: `/Users/<USER_NAME>/<EXPERIMENT_NAME>`

[MLFlow Tracking Documentation](https://docs.databricks.com/en/mlflow/tracking.html)

MLFlow is built into Databricks natively, and all of its features are accessible from the sidebar:

![Databricks Integration](image/MachineLearningSidebar.png)


You can get an overview of your experiments:

![Experiment Tab](image/Experiments.png)

All your models are versioned in the Models tab. You can also set stages for your model versions (Staging, Production, Archived) and add tags to a specific version of the model.

![Models Tab](image/models.png)

What is more, you can also deploy your models directly on a serving instance from the `Serving` tab.

Here is a code example for training the breast cancer model on Databricks:

In [None]:

def train(
    model: type,
    model_name: str,
    params: dict,
    *,
    experiment_name: str = "/Users/florian.krempl@alexanderthamm.com/final-project-demo",
    test_size: float = 0.3,
    random_state: int = 109,
):
    """Train a classifier on the breast cancer dataset and track the run with MLflow.

    The function loads the scikit-learn breast cancer dataset, opens an MLflow
    run in ``experiment_name`` and, inside that run:

    * logs a ``dataset.json`` artifact listing the feature and target names,
    * logs ``params`` as run parameters,
    * fits ``model(**params)`` on the training split,
    * logs accuracy, precision and recall measured on the test split,
    * logs the fitted estimator under ``sklearn-model`` and passes
      ``model_name`` as ``registered_model_name``.

    Args:
        model: Estimator class (or any factory callable) that is invoked as
            ``model(**params)`` and returns an object with ``fit`` and
            ``predict``. It must NOT be an already-constructed instance.
        model_name: Name passed to ``mlflow.sklearn.log_model`` as
            ``registered_model_name``.
        params: Keyword arguments for the estimator constructor; also logged
            to the run via ``mlflow.log_params``.
        experiment_name: MLflow experiment to log to. On Databricks this has
            to be a workspace path such as
            ``/Users/<USER_NAME>/<EXPERIMENT_NAME>``. The default keeps the
            path this notebook was originally written with; pass your own.
        test_size: Forwarded to ``train_test_split`` (default ``0.3``).
        random_state: Seed forwarded to ``train_test_split`` so the split is
            repeatable (default ``109``). It does not seed the estimator;
            put an estimator seed into ``params`` if one is needed.

    Returns:
        The fitted estimator.
    """
    # Load dataset
    print("Loading data...")
    cancer = datasets.load_breast_cancer()

    experiment = mlflow.set_experiment(experiment_name)
    with mlflow.start_run(experiment_id=experiment.experiment_id):
        # Log the dataset description. The upload has to happen while the
        # temporary directory still exists, hence inside the `with` block.
        with TemporaryDirectory() as temp_dir:
            file_path = Path(temp_dir) / "dataset.json"
            dataset_config = {
                "features": list(cancer.feature_names),
                "target": list(cancer.target_names),
            }
            with file_path.open("w") as outfile:
                json.dump(dataset_config, outfile)

            mlflow.log_artifact(file_path)

        mlflow.log_params(params)

        # Split dataset into training set and test set
        X_train, X_test, y_train, y_test = train_test_split(
            cancer.data,
            cancer.target,
            test_size=test_size,
            random_state=random_state,
        )

        # Instantiate the estimator supplied by the caller
        pipe = model(**params)

        # Train the model using the training set
        print("Model training...")
        pipe.fit(X_train, y_train)

        # Predict the response for the test dataset
        print("Model evaluation...")
        y_pred = pipe.predict(X_test)

        # The signature is inferred from a DataFrame so that it carries the
        # feature names as column names.
        # NOTE(review): the estimator itself is fitted on the raw
        # `cancer.data` split, not on this DataFrame - confirm that is intended.
        X_test_df = pd.DataFrame(X_test, columns=cancer.feature_names)
        signature = infer_signature(X_test_df, y_pred)

        accuracy = metrics.accuracy_score(y_test, y_pred)
        precision = metrics.precision_score(y_test, y_pred)
        recall = metrics.recall_score(y_test, y_pred)

        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.sklearn.log_model(
            pipe,
            artifact_path="sklearn-model",
            registered_model_name=model_name,
            signature=signature,
        )

        return pipe



In [None]:

from sklearn.ensemble import RandomForestClassifier

# Estimator class, not an instance: train() builds it via model(**params).
model = RandomForestClassifier
# Constructor arguments for the estimator; train() also logs them to the run.
params = dict(n_estimators=10, max_depth=5)

pipe = train(
    model=model,
    model_name="RandomForestBreastCancerModel",
    params=params,
)
print(pipe)


Loading data...


2024/08/29 12:40:14 INFO mlflow.tracking.fluent: Experiment with name '/Users/florian.krempl@alexanderthamm.com/final-project-demo' does not exist. Creating a new experiment.


Model training...
Model evaluation...




Uploading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

Successfully registered model 'RandomForestBreastCancerModel'.
2024/08/29 12:40:33 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: RandomForestBreastCancerModel, version 1
Created version '1' of model 'RandomForestBreastCancerModel'.


RandomForestClassifier(max_depth=5, n_estimators=10)
