# Week Summary - MLFlow

# Problem

When testing multiple models, things can get messy
* Rewriting code cells for testing different models
* Keeping track of best parameters through a spreadsheet

In [1]:
import pickle

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris, load_diabetes
from sklearn.metrics import accuracy_score, mean_squared_error, make_scorer

from sklearn.model_selection import KFold, cross_val_score, train_test_split

In [2]:
X, y = load_iris(return_X_y=True)

# 1st test
lr = LogisticRegression(C=0.1, random_state=42).fit(X, y)
y_pred = lr.predict(X)
print(f'Accuracy Logistic Regression: {accuracy_score(y, y_pred)}')

# 2st test
dt = DecisionTreeClassifier(min_samples_split=4, random_state=42).fit(X, y)
y_pred = dt.predict(X)
print(f'Accuracy Decision Tree: {accuracy_score(y, y_pred)}')

# 3 test
# variable overwritten
dt = DecisionTreeClassifier(min_samples_split=2, max_depth=5, random_state=42).fit(X, y)
y_pred = dt.predict(X)
print(f'Accuracy Decision Tree: {accuracy_score(y, y_pred)}')


Accuracy Logistic Regression: 0.96
Accuracy Decision Tree: 0.9866666666666667
Accuracy Decision Tree: 1.0


We want a better to keep track of the experiments tried ang globally check all the combinations and their effects in the final performance.

**Solution** - track experiments

The code below will start a MLFlow experiment locally

In [3]:
import mlflow 

mlflow.set_experiment('experiment_1')

2024/05/27 11:32:49 INFO mlflow.tracking.fluent: Experiment with name 'experiment_1' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///Users/caiomiyashiro/repo/Personal/MLOpsZoomcamp/02-experiment-tracking/mlruns/239494832745782215', creation_time=1716777169074, experiment_id='239494832745782215', last_update_time=1716777169074, lifecycle_stage='active', name='experiment_1', tags={}>

In order to see the results in an UI, we can start a MLFlow UI page

``` bash
pip install mlflow
mlflow ui --backend-store-uri sqlite:///mlflow.db
``` 

You will be able to access the MLFlow UX at http://127.0.0.1:5000/

## First example with simple logging

In [5]:
X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

with mlflow.start_run():
    mlflow.set_tag('developer', 'caio')

    alpha = 0.1
    lr = LogisticRegression(C=alpha, random_state=42)
    mlflow.log_param('alpha', alpha)

    kf = KFold(n_splits=3, shuffle=True, random_state=1)
    rmse_scorer = make_scorer(mean_squared_error)
    cv_results = cross_val_score(lr, X_train, y_train, cv=kf, scoring=rmse_scorer)
    print("Average cross-validation RMSE:", cv_results.mean())
    mlflow.log_metric('rmse',cv_results.mean())
    
    lr.fit(X_train, y_train)

    # y_test_predicted = lr.predict(y_test)
    # test_rmse = mean_squared_error(y_test, y_test_predicted)

    with open('models/lasso.bin', 'wb') as f_out:
        pickle.dump(lr, f_out)
    mlflow.log_artifact(local_path="models/lasso.bin", artifact_path="models_pickle")

Average cross-validation RMSE: 7476.9177832749265


## More complex example with grid tuning and hyperopt

Hyperopt is a faster way to search hiperparameter space (compared to grid search)

After running them, select all runs at mlflow and click "compare"

In [11]:
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment('experiment_hypteropt_fine_tuning')

2024/05/27 12:10:53 INFO mlflow.tracking.fluent: Experiment with name 'experiment_hypteropt_fine_tuning' does not exist. Creating a new experiment.


<Experiment: artifact_location='/Users/caiomiyashiro/repo/Personal/MLOpsZoomcamp/02-experiment-tracking/mlruns/1', creation_time=1716779453213, experiment_id='1', last_update_time=1716779453213, lifecycle_stage='active', name='experiment_hypteropt_fine_tuning', tags={}>

In [7]:
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_test, label=y_test)

In [13]:
def objective(params):
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=500,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50
        )
        y_pred = booster.predict(valid)
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        mlflow.log_metric("rmse", rmse)
        mlflow.xgboost.log_model(booster, 'models_mlflow')

    return {'loss': rmse, 'status': STATUS_OK}

In [1]:
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'reg:squarederror',
    'seed': 42
}

best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=20,
    trials=Trials()
)

In [2]:
params = {
    'learning_rate': 0.09585355369315604,
    'max_depth': 30,
    'min_child_weight': 1.060597050922164,
    'objective': 'reg:linear',
    'reg_alpha': 0.018060244040060163,
    'reg_lambda': 0.011658731377413597,
    'seed': 42
}

with mlflow.start_run():
    mlflow.set_tag("model", "xgboost")
    mlflow.log_params(params)
    booster = xgb.train(
        params=params,
        dtrain=train,
        num_boost_round=1000,
        evals=[(valid, 'validation')],
        early_stopping_rounds=50
    )
    y_pred = booster.predict(valid)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    mlflow.log_metric("rmse", rmse)
    mlflow.xgboost.log_model(booster, 'models_mlflow')

## Loading the model from MLFlow

We check the MLFlow UI and check the run that has achieved the smaller RMSE. We're going to load the model in 2 ways:

1. Loading from a specific run
2. loading from model registry

In [15]:
# "model_mlflow" is the folder defined during model logging
logged_model = 'runs:/fe14c14ff3364b3e89576e1014dd2e1e/models_mlflow' 


# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

# OR

xgboost_model = mlflow.xgboost.load_model(logged_model)

# Setting up MLFlow for different scenarios

## Show the different parts of MLFlow
1. Tracking server
2. Backend server
3. Artifact server


## Instatiate remote MLFlow in Azure

1. Start VM. Save public key and public IP address to add to alias. Change key permissions to only be accessible by owner
  
```bash
chmod 600 /Users/caiomiyashiro/.ssh/vm-mlops_key.pem
```
  
2. Start PostgreSQL Database and allow VM's IP to connect
3. Start Storage Container and allow VM's IP to connect

## Connect to VM using bash

1. Writing alias at .ssh/config

``` bash
Host mlops-zoomcamp
   HostName 20.18.227.5
   User azureuser
   IdentityFile ~/.ssh/vm-mlops_key.pem
   StrictHostKeyChecking no
```  

2. Connect to VM using: 
```bash
ssh mlops-zoomcamp
```

3. Install and configure anaconda
```bash
wget https://repo.anaconda.com/archive/Anaconda3-2024.02-1-Linux-x86_64.sh
bash Anaconda3-2024.02-1-Linux-x86_64.sh
# could install docker and docker container

conda create --name mlflow-tracking
conda install jupyter scikit-learn  pandas  seaborn  xgboost azure-storage-blob
pip3 install mlflow psycopg2-binary hyperopt azure-identity
# conda install -c conda-forge psycopg2-binary will update other packages and create a memory error!
# conda install mlflow might create error Unable to display MLflow UI - landing page (index.html) not found.
```

4. Install and configure Azure credentials
```bash
brew update && brew install azure-cli  # for mac or the equivalent for windows
az login  
# in case of new IP address, you might need to login with "az login --tenant 76529cbb-482f-4415-b366-251e1c034e34"
```

5. Initiate MLFlow with postgresSQL and blobstorage
```bash 
# Setting one of this combination and we DON'T NEED to login to azure
export AZURE_STORAGE_CONNECTION_STRING="<get from storage container>"
export AZURE_STORAGE_ACCOUNT="<>"
export AZURE_STORAGE_KEY="<>"

mlflow server -h 0.0.0.0 -p 5000 --backend-store-uri postgresql://azureuser:<password>@<hostname>:5432/postgres --default-artifact-root wasbs://<blob container>@<storage account>.blob.core.windows.net

# in case of errors, you can test the database connection from the VM
psql -h mlopspostgress.postgres.database.azure.com -U azureuser -d postgres -p 5432

az storage blob list --account-name mlopsdata37 --container-name mlops-blob --auth-mode login
```


# Locally

1. Install azure client so we can do browser login
```bash
brew update && brew install azure-cli  # for mac or the equivalent for windows
az login                               # login using browser
# in case of new IP address, you might need to login with "az login --tenant 76529cbb-482f-4415-b366-251e1c034e34"
```

In [24]:
import os # set access to storage container
os.environ["AZURE_STORAGE_CONNECTION_STRING"] = "<get from storage account>"

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score

mlflow.set_tracking_uri('http://20.18.227.5:5000/')
mlflow.set_experiment("finally")

with mlflow.start_run():

    X, y = load_iris(return_X_y=True)

    params = {"C": 0.1, "random_state": 42}
    mlflow.log_params(params)

    lr = LogisticRegression(**params).fit(X, y)
    y_pred = lr.predict(X)
    mlflow.log_metric("accuracy", accuracy_score(y, y_pred))

    mlflow.sklearn.log_model(lr, artifact_path="models")
    print(f"default artifacts URI: '{mlflow.get_artifact_uri()}'")



default artifacts URI: 'wasbs://mlops-blob@mlopsdata37.blob.core.windows.net/1/e732d7eca2c446b2bae55a91ee4237f0/artifacts'
