# Helper functions

> Helper: module containing helper functions for MLflow, DataFrame display, and timestamps

In [1]:
#| default_exp helper

In [3]:
#| hide
from nbdev.showdoc import *

In [4]:
#| hide
import nbdev; nbdev.nbdev_export()

In [1]:
#| export
from loguru import logger
import os
from pathlib import Path
from fastcore.basics import patch_to, patch

import mlflow
from mlflow import MlflowClient
from mlflow.entities import ViewType

import pandas as pd

from typing import Union, List

from datetime import datetime

## MLflow

In [None]:
#| export
def setup_mlflow(tracking_uri: str, # MLflow server tracking uri
                 experiment_id: Union[str, int], # Experiment name (str) or experiment ID (int)
) -> None:
    """Point MLflow at `tracking_uri` and activate an experiment.

    A string is treated as the experiment *name* (unchanged behaviour).
    An integer is treated as the experiment *ID* and forwarded through the
    `experiment_id` keyword, because the first positional argument of
    `mlflow.set_experiment` is the experiment name.
    """

    mlflow.set_tracking_uri(tracking_uri)

    # `bool` is a subclass of `int`; do not mistake True/False for an ID.
    is_numeric_id = isinstance(experiment_id, int) and not isinstance(experiment_id, bool)
    if is_numeric_id:
        mlflow.set_experiment(experiment_id=str(experiment_id))
    else:
        mlflow.set_experiment(experiment_id)
    logger.info(f"Set MLflow experiment to {experiment_id}.")

In [None]:
# setup_mlflow("#####", experiment_id="<experiment name>")  # tracking URI removed for privacy

In [15]:
#| export
def log_artifact(file: str, # path of the file to save
                 artifact_path: str, # artifact path in mlflow
                 run_id:str = None # run id to save the artifact. will create new run_id if None
                ):
    "Log the local `file` as an MLflow artifact under `artifact_path`, inside the run given by `run_id`."
    # The run is opened first and used as a context manager so the upload
    # happens while that run is the one entered by this function.
    run_context = mlflow.start_run(run_id)
    with run_context:
        mlflow.log_artifact(file, artifact_path)

Logs a serialized artifact to MLflow

In [16]:
#| export
def search_runs(
    filter_string: str = "", #  Filter query string, defaults to searching all runs.
    experiment_ids: Union[List[int], List[str], int, str] = 1,  # List of experiment IDs, or a single int or string id.
    max_results: int = None, # Maximum number of runs desired. `None` leaves MLflow's own default in place.
    order: Union[str, List[str]] = None # Column (or list of columns) to order by (e.g., "metrics.rmse"). Can contain optional DESC or ASC value. Default value ASC.
) -> List[str]:
    """wrapper function for [mlflow.client.MlflowClient.search_runs](https://mlflow.org/docs/latest/python_api/mlflow.client.html#mlflow.client.MlflowClient.search_runs)

    Searches active runs only and returns their run IDs (each one is also logged).
    NOTE(review): only the result of a single `search_runs` call is read; if MLflow
    pages the results, later pages are not fetched — confirm whether that matters.
    """

    # Normalise to a list of string IDs. Calling `str()` on a list would
    # produce the literal text "[1, 2]", which is not an experiment ID.
    if isinstance(experiment_ids, (list, tuple, set)):
        experiment_id_list = [str(exp_id) for exp_id in experiment_ids]
    else:
        experiment_id_list = [str(experiment_ids)]

    search_kwargs = {
        "experiment_ids": experiment_id_list,
        "filter_string": filter_string,
        "run_view_type": ViewType.ACTIVE_ONLY,
    }
    # Only forward optional arguments that were actually given, so MLflow's
    # defaults apply instead of an explicit `None`.
    if max_results is not None:
        search_kwargs["max_results"] = max_results
    if order is not None:
        # `order_by` expects a list of clauses; accept a single string too.
        search_kwargs["order_by"] = [order] if isinstance(order, str) else list(order)

    results = MlflowClient().search_runs(**search_kwargs)

    run_ids = []
    for no, run in enumerate(results):
        logger.info(f"Run {no}: {run.info.run_id}")
        run_ids.append(run.info.run_id)
    return run_ids

Searches MLflow runs 

In [17]:
# Example: list the run IDs of experiment 1, passing an empty filter string.
search_runs("", experiment_ids = 1)

[32m2023-10-24 10:09:24.202[0m | [1mINFO    [0m | [36m__main__[0m:[36msearch_runs[0m:[36m21[0m - [1mRun 0: 389a9e1293034591bdc31dc7b7dc63ee[0m
[32m2023-10-24 10:09:24.204[0m | [1mINFO    [0m | [36m__main__[0m:[36msearch_runs[0m:[36m21[0m - [1mRun 1: 5249f70572e144cf97499fe236a38da4[0m
[32m2023-10-24 10:09:24.205[0m | [1mINFO    [0m | [36m__main__[0m:[36msearch_runs[0m:[36m21[0m - [1mRun 2: a60cdd3d5e794eff9e2a75228988099b[0m
[32m2023-10-24 10:09:24.206[0m | [1mINFO    [0m | [36m__main__[0m:[36msearch_runs[0m:[36m21[0m - [1mRun 3: 3063b5ba5b2c44b4b148bdc84734e657[0m
[32m2023-10-24 10:09:24.206[0m | [1mINFO    [0m | [36m__main__[0m:[36msearch_runs[0m:[36m21[0m - [1mRun 4: af072dcf20924719ba0a50fddf5bf809[0m
[32m2023-10-24 10:09:24.207[0m | [1mINFO    [0m | [36m__main__[0m:[36msearch_runs[0m:[36m21[0m - [1mRun 5: fcb9be9381a44e528eb2e2b30a3b6c04[0m
[32m2023-10-24 10:09:24.208[0m | [1mINFO    [0m | [36m__main__[0

['389a9e1293034591bdc31dc7b7dc63ee',
 '5249f70572e144cf97499fe236a38da4',
 'a60cdd3d5e794eff9e2a75228988099b',
 '3063b5ba5b2c44b4b148bdc84734e657',
 'af072dcf20924719ba0a50fddf5bf809',
 'fcb9be9381a44e528eb2e2b30a3b6c04',
 '0c6e7c0be2964efb9cf08607efbf81c9',
 '2b26a11cde204b26abd93886207f72a8',
 '4a98276740ad4df697cca3152fd6cab9',
 '063c3f91e6bc47a1b2e3cc7e73407ce7',
 'ac4c922139184a95b68ea7fe00652644',
 'b10f8529e3004025a7fa4fd9566d4870']

In [18]:
#| export
def get_mlflow_model(run_id:str, # Run ID where artifact is stored
                     artifact_name:str, # name of the artifact
                     artifact_path:str="models" # Path of the artifact within the run
                    ) -> str:
    """Retrieves MLflow artifact path.

    Downloads the run's `artifact_path` artifacts via
    `mlflow.artifacts.download_artifacts` and returns the returned local
    location joined with `artifact_name`, as a string. The existence of
    `artifact_name` inside that location is not checked here.
    """

    local_root = Path(mlflow.artifacts.download_artifacts(run_id=run_id, artifact_path=artifact_path))

    return str(local_root / artifact_name)

In [19]:
# Example: resolve the local path of a model artifact, using the default "models" artifact path.
run_id = "73ab5c472b574813a18be82f5395c546"
model_name = "setfit_preliminary_sampling.pkl"
int2label_name = "int2labeldict.pkl"  # NOTE(review): not used anywhere in the visible notebook — confirm whether it is still needed

get_mlflow_model(run_id, model_name)

'/var/mlflow/mlruns/5/73ab5c472b574813a18be82f5395c546/artifacts/models/setfit_preliminary_sampling.pkl'

Retrieves the local path of an artifact stored in an MLflow run

In [20]:
# Example: the same lookup for another run, with every argument passed by keyword.
run_id = "0c6e7c0be2964efb9cf08607efbf81c9"
get_mlflow_model(run_id = run_id, artifact_name = "topic_model", artifact_path = "models")

'/var/mlflow/mlruns/1/0c6e7c0be2964efb9cf08607efbf81c9/artifacts/models/topic_model'

In [21]:
#| export
def register_model(run_id: str, # Run ID from MLflow where model resides (eg `73ab5c472b574813a18be82f5395c546`)
                   model_path: str, # Path of the model (eg `setfit_preliminary.pkl`)
                   model_name: str, # Name of the model
                  ):
    "Register the artifact at `model_path` of run `run_id` with MLflow under the name `model_name`."
    # The model is addressed with a `runs:/<run_id>/<path>` URI.
    model_uri = f"runs:/{run_id}/{model_path}"
    mlflow.register_model(model_uri, model_name)

In [23]:
#| export
#TODO
def get_latest_registered_model(name: str, # Name of the registered model,
                                only_run_id: bool = True # returns only `run_id` rather than path to model artifact
                               ):
    """Return the run ID (default) or the artifact source path of the newest version of a registered model.

    Raises `IndexError` when the registry returns no versions for `name`.
    When `only_run_id` is False, the version's `source` is returned with any
    "file://" occurrences stripped.
    """
    versions = MlflowClient().get_latest_versions(name)
    if not versions:
        raise IndexError(f"No registered versions found for model '{name}'.")

    # Per the MLflow docs `get_latest_versions` yields the latest version for
    # each stage, so the first element is not necessarily the newest overall;
    # pick the highest version number explicitly.
    newest = max(versions, key=lambda model_version: int(model_version.version))

    if only_run_id:
        return newest.run_id
    return newest.source.replace("file://", "")
    

## DataFrame

In [26]:
#| export
def view_df(df: "pd.DataFrame", # Pandas DataFrame to be viewed
            min_rows: int = 60, # minimum row
            max_colswidth: int = 500, # maximum width of the column
            max_cols: int = None # maximum columns
           ):
    "Display `df` in Jupyter with temporarily widened pandas display options. With `max_cols=None` every column is shown."

    # Option name / value pairs in the flat form `pd.option_context` expects.
    temporary_options = (
        'display.max_columns', max_cols,
        'display.min_rows', min_rows,
        'display.max_colwidth', max_colswidth,
    )

    # The options apply only inside the `with` block.
    # `display` is not imported in this notebook — presumably the IPython built-in.
    with pd.option_context(*temporary_options):
        display(df)

## Others

In [8]:
#| export
def get_now(fmt: str = "%Y%m%d_%H%M%S") -> str:
    " Returns the current local time formatted with `fmt` (default format: 'yyyymmdd_HHMMSS')"
    return datetime.now().strftime(fmt)

In [9]:
# Example: timestamp string for the current moment.
get_now()

'20231028_021425'

In [10]:
#| export
def get_today(fmt="%Y-%m-%d"):
    " Formats the current local date/time with `fmt` (default format: 'yyyy-mm-dd')"
    current_moment = datetime.today()
    return current_moment.strftime(fmt)

In [27]:
#| hide
import nbdev; nbdev.nbdev_export()