## Configs

In [1]:
# private key for the storage account 
import os 
import pandas as pd 

os.environ['GOOGLE_APPLICATION_CREDENTIALS']='./keyfile.json' # Specify the gcs authentification file path 


In [2]:
# Install packages
!pip install mlflow
!pip install google-cloud 
!pip install google-cloud-storage
!pip install xgboost

Collecting mlflow
  Downloading mlflow-1.19.0-py3-none-any.whl (14.4 MB)
[K     |████████████████████████████████| 14.4 MB 8.3 MB/s eta 0:00:01
Collecting alembic<=1.4.1
  Using cached alembic-1.4.1-py2.py3-none-any.whl
Collecting databricks-cli>=0.8.7
  Using cached databricks_cli-0.14.3-py3-none-any.whl
Collecting docker>=4.0.0
  Using cached docker-5.0.0-py2.py3-none-any.whl (146 kB)
Collecting gitpython>=2.1.0
  Downloading GitPython-3.1.18-py3-none-any.whl (170 kB)
[K     |████████████████████████████████| 170 kB 20.9 MB/s eta 0:00:01
Collecting querystring-parser
  Using cached querystring_parser-1.2.4-py2.py3-none-any.whl (7.9 kB)
Collecting gunicorn
  Using cached gunicorn-20.1.0-py3-none-any.whl (79 kB)
Collecting Flask
  Using cached Flask-2.0.1-py3-none-any.whl (94 kB)
Collecting prometheus-flask-exporter
  Using cached prometheus_flask_exporter-0.18.2-py3-none-any.whl
Collecting sqlparse>=0.3.1
  Using cached sqlparse-0.4.1-py3-none-any.whl (42 kB)
Collecting tabulate>=0.

In [8]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import xgboost as xgb
import mlflow
import mlflow.xgboost

mlflow.set_tracking_uri('http://loadbalancerip') # Change to Mlflow URI  (loadbalancer ip)
print("MLflow Version:", mlflow.__version__)
print("MLflow Tracking URI:", mlflow.get_tracking_uri())
print("XGBoost version:",xgb.__version__)
client = mlflow.tracking.MlflowClient()



MLflow Version: 1.19.0
MLflow Tracking URI: http://loadbalancerip
XGBoost version: 1.4.2


## Train and register a model

In [4]:
def build_data(data_path):
    data = pd.read_csv(data_path)
    train, test = train_test_split(data, test_size=0.30, random_state=2019)

    # The predicted column is "quality" which is a scalar from [3, 9]
    X_train = train.drop(["quality"], axis=1)
    X_test = test.drop(["quality"], axis=1)
    y_train = train["quality"]
    y_test = test["quality"]

    return X_train, X_test, y_train, y_test 


In [5]:
def train(data_path, max_depth, min_child_weight, estimators, model_name):
    X_train, X_test, y_train, y_test = build_data(data_path)
    with mlflow.start_run() as run:
        run_id = run.info.run_uuid
        experiment_id = run.info.experiment_id
        print("MLflow:")
        print("  run_id:", run_id)
        print("  experiment_id:", experiment_id)
        print("  experiment_name:", client.get_experiment(experiment_id).name)

        # MLflow params
        print("Parameters:")
        print("  max_depth:", max_depth)
        print("  min_child_weight:", min_child_weight)
        print("  estimators:", estimators)
        
        mlflow.log_param("max_depth", max_depth)
        mlflow.log_param("min_child_weight", min_child_weight)
        mlflow.log_param("estimators", estimators)

        # Create and fit model
        model = xgb.XGBRegressor(
                 max_depth=max_depth,
                 min_child_weight=min_child_weight,
                 random_state=42) 
        model.fit(X_train, y_train)
        
        # MLflow metrics
        predictions = model.predict(X_test)
        print("predictions:",predictions)
        rmse = np.sqrt(mean_squared_error(y_test, predictions))
        mae = mean_absolute_error(y_test, predictions)
        r2 = r2_score(y_test, predictions)
        
        print("Metrics:")
        print("  rmse:", rmse)
        print("  mae:", mae)
        print("  r2:", r2)
        
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mae", mae)

        # Log model
        mlflow.xgboost.log_model(model, "xgboost-model", registered_model_name = model_name)


In [6]:
data_path = 'https://raw.githubusercontent.com/amesar/mlflow-examples/master/data/train/wine-quality-white.csv'
experiment_name = 'test_xgboost'
model_name = 'xgb_0'
max_depth = 10
min_child_weight = 1
estimators = 100
train(data_path, max_depth , min_child_weight, estimators, model_name)


MLflow:
  run_id: b66effdfef4e477ebacb3859a32b424a
  experiment_id: 0
  experiment_name: Exp1
Parameters:
  max_depth: 10
  min_child_weight: 1
  estimators: 100
predictions: [5.930445  6.9174986 6.943309  ... 6.4364724 6.987052  5.6387153]
Metrics:
  rmse: 0.6605505923590334
  mae: 0.44089301443424356
  r2: 0.44297057516776106


Successfully registered model 'xgb_0'.
2021/07/16 11:28:41 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: xgb_0, version 1
Created version '1' of model 'xgb_0'.


## Load the model from mlflow and make predictions

In [7]:
# Predict on a Pandas DataFrame.
import pandas as pd
test_samples =pd.read_csv(data_path).head(5).drop(columns=['quality'])

loaded_model = mlflow.pyfunc.load_model("runs:/a0928931dff54a829b881be2a3e41d00/xgboost-model")
loaded_model.predict(test_samples)

array([6.0004535, 5.994977 , 5.629604 , 5.9799094, 5.9799094],
      dtype=float32)