In [1]:
from datetime import datetime
from typing import List, Optional
from pydantic import BaseModel


class User(BaseModel):
    id: int
    name = 'John Doe'
    signup_ts: Optional[datetime] = None
    friends: List[int] = []


external_data = {
    'id': '123',
    'signup_ts': '2019-06-01 12:22',
    'friends': [1, 2, '3'],
}
user = User(**external_data)
print(user.dict())

{'id': 123, 'signup_ts': datetime.datetime(2019, 6, 1, 12, 22), 'friends': [1, 2, 3], 'name': 'John Doe'}


In [2]:
config = User.parse_obj(external_data)

In [10]:
import os
from pathlib import Path, PurePath
from typing import Dict, List, Optional
from urllib.parse import urlparse

import mlflow
from kedro.config import MissingConfigException
from kedro.framework.session import KedroSession, get_current_session
from kedro.framework.startup import _is_project
from mlflow.entities import Experiment
from mlflow.tracking.client import MlflowClient
from pydantic import BaseModel, PrivateAttr, StrictBool, validator
from typing_extensions import Literal


class MlflowServerOptions(BaseModel):
    # mutable default is ok for pydantic : https://stackoverflow.com/questions/63793662/how-to-give-a-pydantic-list-field-a-default-value
    mlflow_tracking_uri: str = "mlruns"
    stores_environment_variables: Dict[str, str] = {}
    credentials: Optional[str] = None
    _mlflow_client: MlflowClient = PrivateAttr()

    class Config:
        extra = "forbid"


class DisableTrackingOptions(BaseModel):
    # mutable default is ok for pydantic : https://stackoverflow.com/questions/63793662/how-to-give-a-pydantic-list-field-a-default-value
    pipelines: List[str] = []

    class Config:
        extra = "forbid"


class ExperimentOptions(BaseModel):
    name: str = "Default"
    restore_if_deleted: StrictBool = True
    _experiment: Experiment = PrivateAttr()
    # do not create _experiment immediately to avoid creating
    # a database connection when creating the object
    # it will be instantiated on setup() call

    class Config:
        extra = "forbid"


class RunOptions(BaseModel):
    id: Optional[str] = None
    name: Optional[str] = None
    nested: StrictBool = True

    class Config:
        extra = "forbid"


class DictParamsOptions(BaseModel):
    flatten: StrictBool = False
    recursive: StrictBool = True
    sep: str = "."

    class Config:
        extra = "forbid"


class MlflowParamsOptions(BaseModel):
    dict_params: DictParamsOptions = DictParamsOptions()
    long_params_strategy: Literal["fail", "truncate", "tag"] = "fail"

    class Config:
        extra = "forbid"


class MlflowTrackingOptions(BaseModel):
    # mutable default is ok for pydantic : https://stackoverflow.com/questions/63793662/how-to-give-a-pydantic-list-field-a-default-value
    disable_tracking: DisableTrackingOptions = DisableTrackingOptions()
    experiment: ExperimentOptions = ExperimentOptions()
    run: RunOptions = RunOptions()
    params: MlflowParamsOptions = MlflowParamsOptions()

    class Config:
        extra = "forbid"


class UiOptions(BaseModel):

    port: str = "5000"
    host: str = "127.0.0.1"

    class Config:
        extra = "forbid"


class KedroMlflowConfig(BaseModel):
    project_path: Path  # if str, will be converted
    server: MlflowServerOptions = MlflowServerOptions()
    tracking: MlflowTrackingOptions = MlflowTrackingOptions()
    ui: UiOptions = UiOptions()

    class Config:
        # force triggering type control when setting value instead of init
        validate_assignment = True
        # raise an error if an unknown key is passed to the constructor
        extra = "forbid"

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.server.mlflow_tracking_uri = self._validate_uri(
            self.server.mlflow_tracking_uri
        )
        # init after validating the uri, else mlflow creates a mlruns folder at the root
        self.server._mlflow_client = MlflowClient(
            tracking_uri=self.server.mlflow_tracking_uri
        )

    def setup(self, session: KedroSession = None):
        """Setup all the mlflow configuration"""

        self._export_credentials(session)

        # we set the configuration now: it takes priority
        # if it has already be set in export_credentials
        mlflow.set_tracking_uri(self.server.mlflow_tracking_uri)

        self._set_experiment()

    def _export_credentials(self, session: KedroSession = None):
        session = session or get_current_session()
        context = session.load_context()
        conf_creds = context._get_config_credentials()
        mlflow_creds = conf_creds.get(self.server.credentials, {})
        for key, value in mlflow_creds.items():
            os.environ[key] = value

    def _set_experiment(self):
        """Best effort to get the experiment associated
        to the configuration

        Returns:
            mlflow.entities.Experiment -- [description]
        """

        # we retrieve the experiment manually to check if it exsits
        mlflow_experiment = self.server._mlflow_client.get_experiment_by_name(
            name=self.tracking.experiment.name
        )
        # Deal with two side case when retrieving the experiment
        if mlflow_experiment is not None:
            if (
                self.tracking.experiment.restore_if_deleted
                and mlflow_experiment.lifecycle_stage == "deleted"
            ):
                # the experiment was created, then deleted : we have to restore it manually before setting it as the active one
                self.server._mlflow_client.restore_experiment(
                    mlflow_experiment.experiment_id
                )

        # this creates the experiment if it does not exists
        # and creates a global variable with the experiment
        # but returns nothing
        mlflow.set_experiment(experiment_name=self.tracking.experiment.name)

        # we do not use "experiment" variable directly but we fetch again from the database
        # because if it did not exists at all, it was created by previous command
        self.tracking.experiment._experiment = (
            self.server._mlflow_client.get_experiment_by_name(
                name=self.tracking.experiment.name
            )
        )

    def _validate_uri(self, uri):
        """Format the uri provided to match mlflow expectations.

        Arguments:
            uri {Union[None, str]} -- A valid filepath for mlflow uri

        Returns:
            str -- A valid mlflow_tracking_uri
        """

        # this is a special reserved keyword for mlflow which should not be converted to a path
        # se: https://mlflow.org/docs/latest/tracking.html#where-runs-are-recorded
        if uri == "databricks":
            return uri

        # if no tracking uri is provided, we register the runs locally at the root of the project
        pathlib_uri = PurePath(uri)

        if pathlib_uri.is_absolute():
            valid_uri = pathlib_uri.as_uri()
        else:
            parsed = urlparse(uri)
            if parsed.scheme == "":
                # if it is a local relative path, make it absolute
                # .resolve() does not work well on windows
                # .absolute is undocumented and have known bugs
                # Path.cwd() / uri is the recommend way by core developpers.
                # See : https://discuss.python.org/t/pathlib-absolute-vs-resolve/2573/6
                valid_uri = (self.project_path / uri).as_uri()
            else:
                # else assume it is an uri
                valid_uri = uri

        return valid_uri

    @validator("project_path")
    def _is_kedro_project(cls, folder_path):
        return folder_path



In [26]:
mlflow_config = ''' 
# SERVER CONFIGURATION -------------------

# `mlflow_tracking_uri` is the path where the runs will be recorded.
# For more informations, see https://www.mlflow.org/docs/latest/tracking.html#where-runs-are-recorded
# kedro-mlflow accepts relative path from the project root.
# For instance, default `mlruns` will create a mlruns folder
# at the root of the project

# All credentials needed for mlflow must be stored in credentials .yml as a dict
# they will be exported as environment variable
# If you want to set some credentials,  e.g. AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY
# > in `credentials.yml`:
# your_mlflow_credentials:
#   AWS_ACCESS_KEY_ID: 132456
#   AWS_SECRET_ACCESS_KEY: 132456
# > in this file `mlflow.yml`:
# credentials: mlflow_credentials

server:
  mlflow_tracking_uri: mlruns
  stores_environment_variables: {} # any valid mlflow variables either for backend store or artifact store (e.g. MLFLOW_S3_ENDPOINT_URL), see: https://www.mlflow.org/docs/latest/tracking.html#amazon-s3-and-s3-compatible-storage
  credentials: null  # must be a valid key in credentials.yml which refers to a dict of sensituve mlflow environment variables (password, tokens...)

tracking:
  # You can specify a list of pipeline names for which tracking will be disabled
  # Running "kedro run --pipeline=<pipeline_name>" will not log parameters
  # in a new mlflow run

  disable_tracking:
    pipelines: []

  experiment:
    name: kedro_spark
    restore_if_deleted: True  # if the experiment`name` was previously deleted experiment, should we restore it?

  run:
    id: null # if `id` is None, a new run will be created
    name:  "test_run" # if `name` is None, pipeline name will be used for the run name
    nested: True  # if `nested` is False, you won't be able to launch sub-runs inside your nodes

  params:
    dict_params:
      flatten: False  # if True, parameter which are dictionary will be splitted in multiple parameters when logged in mlflow, one for each key.
      recursive: True  # Should the dictionary flattening be applied recursively (i.e for nested dictionaries)? Not use if `flatten_dict_params` is False.
      sep: "." # In case of recursive flattening, what separator should be used between the keys? E.g. {hyperaparam1: {p1:1, p2:2}} will be logged as hyperaparam1.p1 and hyperaparam1.p2 in mlflow.
    long_params_strategy: fail # One of ["fail", "tag", "truncate" ] If a parameter is above mlflow limit (currently 250), what should kedro-mlflow do? -> fail, set as a tag instead of a parameter, or truncate it to its 250 first letters?


# UI-RELATED PARAMETERS -----------------

ui:
  port: "5000" # the port to use for the ui. Use mlflow default with 5000.
  host: "127.0.0.1"  # the host to use for the ui. Use mlflow efault of "127.0.0.1".
'''

In [27]:
import yaml
conf = yaml.load(mlflow_config, Loader=yaml.SafeLoader)
conf['project_path'] = '/storage/projects'

In [29]:
conf_obj = KedroMlflowConfig.parse_obj(conf)

In [30]:
conf_obj.setup()

RuntimeError: There is no active Kedro session.