In [None]:
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
import sklearn.model_selection

## This is a Databricks Connect enabled environment

In [None]:
import socket

from pyspark.sql import SparkSession
import spark_sklearn

# Reuse the active Spark session, creating one when none exists yet.
spark = SparkSession.builder.getOrCreate()

# Report the Spark version, then collect the hostname seen while mapping over
# four elements -- presumably to confirm that tasks run on the remote cluster
# rather than on this machine (verify against the cluster setup).
print(spark.version)
spark.sparkContext.range(4).map(lambda _: socket.gethostname()).collect()

## Define a cross validation function to switch between local and remote cross validation

In [None]:
def cross_val(classifier, x, y, *args, remote=False, **kwargs):
    """Fit a grid search over ``classifier`` either locally or through Spark.

    Parameters
    ----------
    classifier
        Estimator handed to the grid search.
    x, y
        Data passed unchanged to the grid search's ``fit`` method.
    *args, **kwargs
        Forwarded to the selected ``GridSearchCV`` constructor
        (for example ``param_grid``).
    remote : bool, keyword-only
        When truthy, ``spark_sklearn.GridSearchCV`` is built with the
        notebook's ``spark.sparkContext``; otherwise
        ``sklearn.model_selection.GridSearchCV`` is used.

    Returns
    -------
    Whatever the chosen grid search's ``fit(x, y)`` call returns.
    """
    if not remote:
        local_search = sklearn.model_selection.GridSearchCV(classifier, *args, **kwargs)
        return local_search.fit(x, y)

    spark_search = spark_sklearn.GridSearchCV(
        spark.sparkContext, classifier, *args, **kwargs
    )
    return spark_search.fit(x, y)

## Local cross validation

In [None]:
# Load the scikit-learn digits dataset and pull out its data and target arrays.
digits = datasets.load_digits()

X = digits.data
y = digits.target

In [None]:
%%time

# Search grid for the local run. min_samples_split, min_samples_leaf and
# bootstrap are left out of this grid.
param_grid = {
    "max_depth": [3, None],
    "max_features": [1, 3, 10],
    "criterion": ["gini", "entropy"],
    "n_estimators": [10, 20, 40, 80],
}

# remote=False makes cross_val use the plain scikit-learn grid search.
cv = cross_val(RandomForestClassifier(), X, y, remote=False, param_grid=param_grid)

## Remote cross validation

In [None]:
%%time

# Search grid for the Spark-backed run.
param_grid = {
    "max_depth": [3, None],
    "max_features": [1, 3, 10],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 3, 10],
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy"],
    "n_estimators": [10, 20, 40, 80],
}

# remote=True makes cross_val use the spark_sklearn grid search.
cv = cross_val(RandomForestClassifier(), X, y, remote=True, param_grid=param_grid)

In [None]:
import mlflow
import mlflow.sklearn
import os
import pandas as pd
import datetime
import tempfile
import subprocess

def mlflow_track(clf, name):
    """Log the best candidate of a fitted grid search to MLflow.

    Inside a single MLflow run this records:

    * the ``cv`` setting of the search (as parameter ``folds``),
    * the best candidate's value for every key of ``clf.param_grid``,
    * the rank / mean / std score metrics that exist in ``clf.cv_results_``,
    * the best estimator, logged as a scikit-learn model under ``"model"``,
    * the whole ``cv_results_`` table, sorted by ``rank_test_score``, as a CSV
      artifact under ``"cv_results"``.

    Parameters
    ----------
    clf
        Fitted grid-search object exposing ``best_index_``, ``cv_results_``,
        ``cv``, ``param_grid`` (a dict) and ``best_estimator_``.
    name : str
        Prefix of the CSV artifact's file name.

    Notes
    -----
    ``mean_train_score`` / ``std_train_score`` are only present in
    ``cv_results_`` when the search computed training scores (scikit-learn's
    ``return_train_score=True``). Metrics that are absent are skipped
    instead of raising ``KeyError``.
    """
    # Same text as isoformat() without microseconds and with ":" replaced by ".".
    timestamp = datetime.datetime.now().strftime("%Y-%m-%dT%H.%M.%S")

    results = clf.cv_results_
    i = clf.best_index_
    num_runs = len(results["rank_test_score"])
    run_name = "run %d (best run of %d):" % (i, num_runs)

    metric_names = (
        "rank_test_score",
        "mean_train_score",
        "std_train_score",
        "mean_test_score",
        "std_test_score",
    )

    with mlflow.start_run(run_name=run_name):
        mlflow.log_param("folds", clf.cv)
        print("Logging parameters")
        for param in clf.param_grid:
            mlflow.log_param(param, results["param_%s" % param][i])

        print("Logging metrics")
        for metric in metric_names:
            if metric in results:
                mlflow.log_metric(metric, results[metric][i])
            else:
                print("Skipping metric %s (not in cv_results_)" % metric)

        print("Logging model")
        mlflow.sklearn.log_model(clf.best_estimator_, "model")

        print("Logging CV results matrix")
        filename = "%s-%s-cv_results.csv" % (name, timestamp)
        # The context manager removes the scratch directory once the artifact
        # has been handed to MLflow, so nothing is left behind on disk.
        with tempfile.TemporaryDirectory() as tempdir:
            csv = os.path.join(tempdir, filename)
            pd.DataFrame(results).sort_values(by="rank_test_score").to_csv(csv, index=False)
            mlflow.log_artifact(csv, "cv_results")
    print("Done")

In [None]:
# Point MLflow at Databricks; "westeu" is presumably a locally configured
# Databricks profile name (confirm against the local Databricks config).
mlflow.set_tracking_uri("databricks://westeu")

experiment = "/Shared/experiments/digits-spark-sklearn"
mlflow.set_experiment(experiment)

# Log whichever grid search is currently bound to `cv`.
mlflow_track(cv, "digits")