# MLflow Train Wine Quality Notebook
This is a Quick Start notebook.
* It is based on [train.py](https://github.com/databricks/mlflow/blob/master/example/tutorial/train.py) from [MLflow's tutorial](https://mlflow.org/docs/latest/tutorial.html). 
* It creates runs in the experiment "sklearn_wine".


In [16]:
import mlflow

# Point the MLflow client at the tracking server address used by this notebook.
mlflow.set_tracking_uri("http://localhost:5000")
print("MLflow Version:",mlflow.version.VERSION)
# Last expression of the cell: echoes the tracking URI now in effect.
mlflow.tracking.get_tracking_uri()

MLflow Version: 1.2.0


'http://localhost:5000'

In [2]:
def now():
    """Return the current UTC time formatted as 'YYYY-MM-DD HH:MM:SS'.

    The clock reading is rounded to the nearest whole second before it is
    formatted.
    """
    import time

    rounded_epoch = int(time.time() + 0.5)
    utc_fields = time.gmtime(rounded_epoch)
    return time.strftime("%Y-%m-%d %H:%M:%S", utc_fields)

In [3]:
# Record when the notebook run began (UTC, as produced by now()).
print("Run Start:",now())

Run Start: 2019-08-25 18:06:23


In [4]:
# Name of the MLflow experiment that the runs of this notebook are recorded under.
experiment_name = "sklearn_wine"
# Local CSV that is read for training; it is downloaded from data_url when missing.
data_path = "../../data/wine-quality/wine-quality-white.csv"
# NOTE(review): the local file name says "white" while the URL serves "wine-quality.csv";
# confirm both refer to the same dataset.
data_url = "https://raw.githubusercontent.com/mlflow/mlflow/master/examples/sklearn_elasticnet_wine/wine-quality.csv"
# Used both as the MLflow run name and as the value of the "run_origin" tag.
run_origin = "jupyter"

In [5]:
# Select the experiment that subsequent runs will be logged to.
mlflow.set_experiment(experiment_name)
# Look the experiment up by name so its ID can be shown alongside the name.
mlflow_client = mlflow.tracking.MlflowClient()
experiment_id = mlflow_client.get_experiment_by_name(experiment_name).experiment_id
print("experiment_id:",experiment_id)
print("experiment_name:",experiment_name)

MLflow Version: 1.2.0
experiment_id: 2
experiment_name: sklearn_wine


In [6]:
import os
import requests

# Fetch the dataset once; later runs reuse the local copy.
if not os.path.exists(data_path):
    print("Downloading {} to {}".format(data_url,data_path))
    # A timeout keeps the cell from hanging forever on a stalled connection.
    rsp = requests.get(data_url, timeout=60)
    # Fail loudly on HTTP errors. Without this an error page would be saved as
    # the CSV and every later run would report that the file already exists.
    rsp.raise_for_status()
    # Create the target directory if needed so open() works on a fresh checkout.
    data_dir = os.path.dirname(data_path)
    if data_dir:
        os.makedirs(data_dir, exist_ok=True)
    # Write the raw bytes so the file is stored exactly as served
    # (no re-encoding and no platform newline translation).
    with open(data_path, 'wb') as f:
        f.write(rsp.content)
else:
    print("File {} already exists".format(data_path))

File ../../data/wine-quality/wine-quality-white.csv already exists


#### Write your ML code based on the `train.py` code
This tutorial is based on MLflow's example [train.py](https://github.com/databricks/mlflow/blob/master/example/tutorial/train.py), which uses an external [wine-quality.csv](https://github.com/databricks/mlflow/blob/master/example/tutorial/wine-quality.csv) dataset to predict wine quality.

In [7]:
# The data set used in this example is from http://archive.ics.uci.edu/ml/datasets/Wine+Quality
# P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis.
# Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009.

import sys
import os
import platform
import warnings

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

import mlflow
import mlflow.sklearn

In [8]:
def eval_metrics(actual, pred):
    """Compute regression metrics for a set of predictions.

    Args:
        actual: Ground-truth target values.
        pred: Predicted target values.

    Returns:
        Tuple ``(rmse, mae, r2)``: root mean squared error, mean absolute
        error and R^2 score, in that order.
    """
    mse = mean_squared_error(actual, pred)
    return np.sqrt(mse), mean_absolute_error(actual, pred), r2_score(actual, pred)

In [9]:
import numpy as np
import matplotlib.pyplot as plt

def create_plot_file(y_test_set, y_predicted, plot_file):
    """Save an actual-vs-predicted scatter plot to ``plot_file``.

    The plot shows each prediction against its true value together with a
    dashed diagonal spanning the min..max of the true values.

    Args:
        y_test_set: True target values; must provide ``.min()`` and ``.max()``.
        y_predicted: Predicted values, aligned with ``y_test_set``.
        plot_file: Destination path handed to ``Figure.savefig``.

    Side effects:
        Stores the figure in the module-level global ``image`` and closes the
        figure in pyplot after it has been saved.
    """
    global image

    lowest, highest = y_test_set.min(), y_test_set.max()
    diagonal = [lowest, highest]

    fig, ax = plt.subplots()
    ax.scatter(y_test_set, y_predicted, edgecolors=(0, 0, 0))
    # Reference line: points on it are perfect predictions.
    ax.plot(diagonal, diagonal, 'k--', lw=4)
    ax.set(xlabel='Actual', ylabel='Predicted', title="Ground Truth vs Predicted")

    image = fig
    fig.savefig(plot_file)
    plt.close(fig)

In [10]:
data = pd.read_csv(data_path)
# Seed the split so that Restart & Run All reproduces the same train/test
# partition (and therefore comparable metrics) on every run.
# The frames are named *_df so they do not collide with the train() function
# defined in a later cell, which would otherwise be clobbered on re-running this cell.
train_df, test_df = train_test_split(data, random_state=42)
# The predicted column is "quality" which is a scalar from [3, 9]
train_x = train_df.drop(["quality"], axis=1)
test_x = test_df.drop(["quality"], axis=1)
# Targets are kept as single-column DataFrames (double brackets).
train_y = train_df[["quality"]]
test_y = test_df[["quality"]]

In [11]:
def train(max_depth, max_leaf_nodes):
    """Train a decision-tree regressor and record the run in MLflow.

    Fits a ``DecisionTreeRegressor`` on the notebook-level ``train_x`` /
    ``train_y``, predicts on ``test_x``, scores against ``test_y`` and logs
    parameters, metrics, tags, the fitted model and a prediction plot to a
    new MLflow run named after ``run_origin``.

    Args:
        max_depth: Passed to ``DecisionTreeRegressor(max_depth=...)``.
        max_leaf_nodes: Passed to ``DecisionTreeRegressor(max_leaf_nodes=...)``.

    Returns:
        Tuple ``(experiment_id, run_id)`` identifying the MLflow run created.
    """
    with mlflow.start_run(run_name=run_origin) as run:
        # run_id is the current attribute name; run_uuid is its deprecated alias.
        run_id = run.info.run_id
        experiment_id = run.info.experiment_id
        print("MLflow:")
        print("  run_id:",run_id)
        print("  experiment_id:",experiment_id)

        # Create model
        dt = DecisionTreeRegressor(max_depth=max_depth, max_leaf_nodes=max_leaf_nodes)
        print("Model:",dt)

        # Fit and predict
        dt.fit(train_x, train_y)
        predictions = dt.predict(test_x)

        # MLflow params
        print("Parameters:")
        print("  max_depth:",max_depth)
        print("  max_leaf_nodes:",max_leaf_nodes)
        mlflow.log_param("max_depth", max_depth)
        mlflow.log_param("max_leaf_nodes", max_leaf_nodes)

        # MLflow metrics (computed by the shared helper instead of duplicating it here)
        rmse, mae, r2 = eval_metrics(test_y, predictions)
        print("Metrics:")
        print("  rmse:",rmse)
        print("  mae:",mae)
        print("  r2:",r2)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mae", mae)

        # MLflow tags
        mlflow.set_tag("mlflow.runName",run_origin) # mlflow CLI picks this up
        mlflow.set_tag("data_path", data_path)
        mlflow.set_tag("exp_id", experiment_id)
        mlflow.set_tag("exp_name", experiment_name)
        mlflow.set_tag("run_origin", run_origin)
        mlflow.set_tag("platform", platform.system())

        # MLflow log model
        mlflow.sklearn.log_model(dt, "sklearn-model")

        # MLflow log plot file artifact
        # The plot is written to the current working directory before upload.
        create_plot_file(test_y, predictions, "plot.png")
        mlflow.log_artifact("plot.png")

    return (experiment_id,run_id)

In [12]:
# Shallow tree: depth capped at 2, at most 16 leaves.
train(max_depth=2, max_leaf_nodes=16)

MLflow:
  run_id: b9abaffea0e3461680a751aa366ea772
  experiment_id: 2
Model: DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
                      max_leaf_nodes=16, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')
Parameters:
  max_depth: 2
  max_leaf_nodes: 16
Metrics:
  rmse: 0.7579987076338718
  mae: 0.6121287106846993
  r2: 0.2564761417250073


('2', 'b9abaffea0e3461680a751aa366ea772')

In [13]:
# Record when the notebook run finished (UTC, as produced by now()).
print("Run End:",now())

Run End: 2019-08-25 18:06:24
