In [1]:
import sys

sys.path.append("/home/jupyter-aamir09/mlops2_with_dagster/")



from mlops2_with_dagster import model_pipeline, features_pipeline
from pathlib import Path
from joblib import dump, load
from hamilton import driver, base
from mlops2_with_dagster.utils import get_project_dir, printse

from sklearn.preprocessing import (
    StandardScaler,
    LabelEncoder
)
from ortho.utils.logger import Logger
from ortho.ortho.decorators.task import task
from typing import Tuple

import pandas as pd 
import mlflow as mf
from ortho.ortho.callbacks.mlflow import MlFlowCallBack
from ortho.ortho.callbacks.logger import LoggerCallBack
import os


# Notebook-wide logger obtained from the project's Logger wrapper.
logger = Logger().logger()


# Location handed to the training payload as "artifact_path" (see Train.fit).
# NOTE(review): the directory name is spelled "mlflow_artfacts" — possibly a typo
# for "artifacts"; confirm against the real directory on disk before renaming.
# NOTE(review): both paths below are absolute and user-specific; consider deriving
# them from the project directory so the notebook runs on other machines.
ARTIFACT_PATH = "/home/jupyter-aamir09/mlops2_with_dagster/artifacts/mlflow_artfacts"
# Folder passed to LoggerCallBack when the trainer is constructed.
LOGGER_FOLDER_PATH = "/home/jupyter-aamir09/mlops2_with_dagster/notebooks"
        
        

None


2023-08-30 13:15:24.602208: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-30 13:15:24.634739: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-30 13:15:24.635542: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-08-30 13:15:25,111 - DEBUG - Falling back to TensorFlow client; we recommended you install the Cloud TPU client directly with pip install cloud-tpu-client.
2023-08-30 13:15:25,290 - DEBUG - Creating converter from 7 to 5
2023-08-30 13:15:25,291 - DEBUG - Creating converter from 5 to 7
2023-08-30 13:15:25,292 - DEBUG - Creating converter from 7 to 5
2023-08-30 13:15:25,292 - DEBUG - Creating converter

In [2]:
# Resolve the project directory from the project name; later cells build the
# data and warehouse file locations on top of `project_dir`.
project = 'mlops2_with_dagster'
project_dir = get_project_dir(project)
# Echo the resolved directory so it is visible in the notebook output.
printse(f'project_dir: {project_dir}')

>>>> project_dir: /home/jupyter-aamir09/mlops2_with_dagster


In [3]:
# CSV locations under the project directory (requires `project_dir` to support
# the `/` operator, e.g. a pathlib.Path).
# NOTE(review): neither name is referenced by the cells visible in this notebook
# section — confirm they are still needed.
train_data : Path = project_dir/"data/train.csv"
test_data : Path = project_dir/"data/test.csv"

In [None]:
# Parquet inputs for training, built as plain strings under <project_dir>/warehouse.
# The next cell reads both files with pandas.
# NOTE(review): this cell has no execution count in the saved notebook although the
# following cell depends on these names — confirm it runs on a fresh kernel
# (Restart & Run All).
train_features_file: str = f"{project_dir}/warehouse/featurestore_train.parquet"
target_file: str = f"{project_dir}/warehouse/target.parquet"

In [4]:
# parameters
# Load the feature matrix and the target table from parquet; both frames are
# passed to the trainer further down (`transformed_data=train_features`,
# `target=target`).
# NOTE(review): presumably this is the dagstermill/papermill "parameters" cell whose
# values are replaced when the notebook runs as an op — confirm the cell tag.
train_features = pd.read_parquet(train_features_file)
target = pd.read_parquet(target_file)

In [5]:
class Train:
    """Training step that runs the Hamilton ``model_pipeline`` module.

    The constructor records the callbacks and the MLflow experiment/run names,
    exports those names through the ``MLFLOW_EXPERIMENT_NAME`` and
    ``MLFLOW_RUN_NAME`` environment variables, and prepares two configuration
    dictionaries (``config`` and ``config_model``).  ``fit`` performs the actual
    training and is wrapped by the project's ``task`` decorator.
    """

    def __init__(self, callbacks,  experiment_name, run_name, load_from_artifact=False):
        """Store the run settings and build the configuration dictionaries.

        Args:
            callbacks: Callback objects kept on ``self.callbacks``
                (presumably consumed by the ``task`` decorator — TODO confirm).
            experiment_name: Value exported as ``MLFLOW_EXPERIMENT_NAME``.
            run_name: Value exported as ``MLFLOW_RUN_NAME``.
            load_from_artifact: Stored on the instance; not read anywhere in
                this class.
        """
        self.load_from_artifact = load_from_artifact
        self.callbacks = callbacks
        self.experiment_name = experiment_name
        self.run_name = run_name

        # Side effect: process-wide environment variables are overwritten.
        os.environ["MLFLOW_EXPERIMENT_NAME"] = self.experiment_name
        os.environ["MLFLOW_RUN_NAME"] = self.run_name

        # Column roles of the dataset.
        self.index_col = 'passengerid'
        self.target_col = "survived"
        self.cat_cols = ["sex", "cabin", "embarked"]

        # Feature-related configuration (not used by `fit` itself).
        self.config = dict(
            index_column=self.index_col,
            target_column=self.target_col,
            categorical_columns=self.cat_cols,
        )

        # Configuration handed to the Hamilton driver and reported as run params.
        self.config_model = dict(
            index_column=self.index_col,
            target_column=self.target_col,
            random_state=42,
            max_depth=None,
            validation_size_fraction=0.33,
            t=0.5,
        )

    @task(build_on_previous_run=True, end_mlflow_run=False)
    def fit(self, transformed_data=None, target=None):
        """Execute the model pipeline and collect its outputs.

        Args:
            transformed_data: Passed to the driver as ``final_feature_matrix``.
            target: Object whose ``.target`` attribute is passed to the driver
                as ``target``.

        Returns:
            A ``(data, payload)`` pair: ``data`` maps ``"training_outputs"`` to
            the driver result for the nodes ``fit_clf``, ``train_predictions``
            and ``valid_predictions``; ``payload`` carries ``"artifact_path"``
            and ``"params"`` (the model configuration).

        NOTE(review): the parameter names and ``None`` defaults look load-bearing
        for the ``task`` decorator (it appears to restore arguments by name from
        a previous run) — confirm before renaming or changing defaults.
        """
        dict_adapter = base.SimplePythonGraphAdapter(base.DictResult())
        hamilton_driver = driver.Driver(
            self.config_model,
            model_pipeline,
            adapter=dict_adapter,
        )
        driver_inputs = {
            "final_feature_matrix": transformed_data,
            "target": target.target,
        }
        requested_nodes = ['fit_clf', 'train_predictions', 'valid_predictions']

        training_outputs = hamilton_driver.execute(requested_nodes, inputs=driver_inputs)

        return (
            {"training_outputs": training_outputs},
            {"artifact_path": ARTIFACT_PATH, "params": self.config_model},
        )

    def run(self, *args, **kwargs):
        """Entry point: forward every argument unchanged to ``fit``."""
        return self.fit(*args, **kwargs)
        
        
        
        
    

In [6]:
import dagstermill
import pickle

# Callbacks attached to the "fit" kernel: file logging first, then MLflow tracking.
fit_callbacks = [
    LoggerCallBack(LOGGER_FOLDER_PATH, kernel_names=["fit"]),
    MlFlowCallBack(kernel_names=["fit"]),
]
trainer = Train(
    callbacks=fit_callbacks,
    experiment_name="Mlflow_with_Dagster",
    run_name="BadamBhum",
)

# TODO: an `mf.end_run()` call used to sit here and is currently disabled;
# document why it was needed before re-enabling or deleting it.
# mf.end_run()

# The first element of the run result is the data dict; pull the training outputs
# out of it and hand them to dagstermill as this notebook's result.
run_result = trainer.run(transformed_data=train_features, target=target)
output = run_result[0]["training_outputs"]
dagstermill.yield_result(output, output_name="training_outputs")

2023-08-30 13:15:25,701 - DEBUG - Starting new HTTP connection (1): 127.0.0.1:5002
2023-08-30 13:15:25,706 - DEBUG - http://127.0.0.1:5002 "GET /api/2.0/mlflow/experiments/get-by-name?experiment_name=Mlflow_with_Dagster HTTP/1.1" 200 241
2023-08-30 13:15:25,708 - DEBUG - Resetting dropped connection: 127.0.0.1
2023-08-30 13:15:25,712 - DEBUG - http://127.0.0.1:5002 "GET /api/2.0/mlflow/runs/get?run_uuid=1f0a6e058efb4c5d9e9fbecc36928d78&run_id=1f0a6e058efb4c5d9e9fbecc36928d78 HTTP/1.1" 200 1260
2023-08-30 13:15:25,714 - DEBUG - Resetting dropped connection: 127.0.0.1
2023-08-30 13:15:25,718 - DEBUG - http://127.0.0.1:5002 "POST /api/2.0/mlflow/runs/update HTTP/1.1" 200 423
2023-08-30 13:15:25,720 - DEBUG - Resetting dropped connection: 127.0.0.1
2023-08-30 13:15:25,724 - DEBUG - http://127.0.0.1:5002 "GET /api/2.0/mlflow/runs/get?run_uuid=1f0a6e058efb4c5d9e9fbecc36928d78&run_id=1f0a6e058efb4c5d9e9fbecc36928d78 HTTP/1.1" 200 1260
2023-08-30 13:15:25,724 - INFO - Successfuly initiated run

['self', 'transformed_data', 'target']
/tmp/tmp2aoartol/transformed_data.pickle


2023-08-30 13:15:25,981 - INFO - Succesfully loaded file with name transformed_data.pickle
  response[fname.replace(".pickle", "")] = pickle.load(open(client.download_artifacts(mlflow_run_id, fname), "rb"))
2023-08-30 13:15:25,984 - DEBUG - Resetting dropped connection: 127.0.0.1
2023-08-30 13:15:25,986 - DEBUG - http://127.0.0.1:5002 "GET /api/2.0/mlflow-artifacts/artifacts?path=1%2F1f0a6e058efb4c5d9e9fbecc36928d78%2Fartifacts%2Ftransformed_data.pickle HTTP/1.1" 200 2
2023-08-30 13:15:25,989 - DEBUG - Resetting dropped connection: 127.0.0.1
2023-08-30 13:15:25,998 - DEBUG - http://127.0.0.1:5002 "GET /api/2.0/mlflow-artifacts/artifacts/1/1f0a6e058efb4c5d9e9fbecc36928d78/artifacts/transformed_data.pickle HTTP/1.1" 200 None
  print(client.download_artifacts(mlflow_run_id, fname))
2023-08-30 13:15:26,115 - DEBUG - Resetting dropped connection: 127.0.0.1
2023-08-30 13:15:26,117 - DEBUG - http://127.0.0.1:5002 "GET /api/2.0/mlflow-artifacts/artifacts?path=1%2F1f0a6e058efb4c5d9e9fbecc36928d

/tmp/tmpme_mwud4/target.pickle
{'transformed_data':              pclass    age   fare  cabin_category  sex_category  \
passengerid                                                       
0                 1   0.00  27.14               2             1   
1                 3   0.00  13.35               8             1   
2                 3   0.33  71.29               8             1   
3                 3  19.00  13.04               8             1   
4                 3  25.00   7.76               8             1   
...             ...    ...    ...             ...           ...   
99995             2  62.00  14.86               3             0   
99996             2  66.00  11.15               8             1   
99997             3  37.00   9.95               8             1   
99998             3  51.00  30.92               8             1   
99999             3  55.00  13.96               8             1   

             embarked_category  family  
passengerid                        

2023-08-30 13:15:26,438 - DEBUG - Succeed in sending telemetry consisting of [b'{"api_key": "phc_mZg8bkn3yvMxqvZKRlMlxjekFU5DFDdcdAsijJ2EH5e", "event": "os_hamilton_run_start", "properties": {"os_type": "posix", "os_version": "Linux-5.4.0-124-generic-x86_64-with-glibc2.31", "python_version": "3.10.12/CPython", "distinct_id": "e4e272c1-5845-4b12-8bd8-09d84d585dbc", "hamilton_version": [1, 27, 2], "telemetry_version": "0.0.1", "number_of_nodes": 22, "number_of_modules": 1, "number_of_config_items": 6, "decorators_used": {"parameterize_sources": 4, "extract_columns": 2, "extract_fields": 1}, "graph_adapter_used": "hamilton.base.SimplePythonGraphAdapter", "result_builder_used": "hamilton.base.DictResult", "driver_run_id": "1f5e43de-c8c5-4ae0-b944-8387d81c3b2f", "error": null, "graph_executor_class": "DefaultGraphExecutor"}}'].
2023-08-30 13:15:31,154 - DEBUG - Computing t.
2023-08-30 13:15:31,155 - DEBUG - Computing train_predictions.
2023-08-30 13:15:32,405 - DEBUG - Computing X_valid.
20

{'fit_clf': RandomForestClassifier(random_state=42),
 'train_predictions': (array([0.2   , 0.76  , 0.17  , ..., 0.6775, 0.84  , 0.83  ]),
  array([0, 1, 0, ..., 1, 1, 1])),
 'valid_predictions': (array([0.62, 0.01, 0.01, ..., 0.38, 0.28, 0.64]),
  array([1, 0, 0, ..., 0, 0, 1]))}