# Training the Model

In [1]:
from mlops2_with_dagster.utils import get_project_dir, printse
from pathlib import Path
# Resolve the project directory from the package name and echo it so the
# notebook output records which checkout was used.
project = "mlops2_with_dagster"
project_dir = get_project_dir(project)
printse(f'project_dir: {project_dir}')

>>>> project_dir: /Users/rahul/Websites/mlops2_with_dagster


In [2]:
import pandas as pd
from joblib import dump, load
from hamilton import driver, base

In [3]:
# Column roles for the dataset.
index_col = "passengerid"   # row identifier column
target_col = "survived"     # label column
cat_cols = [
    "sex",
    "cabin",
    "embarked",
]

# Bundle the column roles into one configuration mapping.
config = dict(
    index_column=index_col,
    target_column=target_col,
    categorical_columns=cat_cols,
)

In [4]:
# Locations of the parquet inputs inside the project's warehouse directory:
# the target values and the training feature matrix.
target_file: str = f"{project_dir}/warehouse/target.parquet"
train_features_file: str = f"{project_dir}/warehouse/featurestore_train.parquet"

In [5]:
# parameters
# NOTE(review): the marker above presumably flags this as the cell whose values
# are injected when the notebook runs under dagstermill — confirm the cell tag.
# Load the training inputs from their parquet files.
target = pd.read_parquet(path=target_file)
train_features = pd.read_parquet(path=train_features_file)

## Model Training Pipeline

In [6]:
from mlops2_with_dagster import model_pipeline, features_pipeline

In [7]:
# Configuration handed to the Hamilton driver for the model pipeline.
config_model = dict(
    index_column=index_col,
    target_column=target_col,
    random_state=42,                  # fixed seed for reproducible runs
    max_depth=None,
    validation_size_fraction=0.33,
    t=0.5,                            # NOTE(review): presumably the decision threshold — confirm in model_pipeline
)

In [9]:
# Build a Hamilton driver over the model pipeline module; the DictResult
# adapter makes execute() return its outputs as a plain dict.
training_adapter = base.SimplePythonGraphAdapter(base.DictResult())
training_dr = driver.Driver(
    config_model,
    model_pipeline,
    adapter=training_adapter,
)

# Runtime inputs for the pipeline: the feature matrix and the target column.
dtraining = {
    "final_feature_matrix": train_features,
    "target": target.target,
}

# Optional: render the execution graph for the requested outputs (disabled).
# training_dr.visualize_execution(['fit_clf', 'train_predictions', 'valid_predictions'],
#                                        f"{project_dir}/artifacts/training.dot",
#                                        {}, 
#                                        inputs = dtraining
# )

![Execution graph of the model training pipeline](./training.dot.pdf)

In [10]:
# Run the pipeline for the fitted classifier plus the train/validation
# predictions; the result is a dict keyed by these output names.
requested_outputs = ['fit_clf', 'train_predictions', 'valid_predictions']
rfdict = training_dr.execute(requested_outputs, inputs=dtraining)

In [11]:
# Display the pipeline outputs returned by the driver, keyed by output name.
rfdict

{'fit_clf': RandomForestClassifier(random_state=42),
 'train_predictions': (array([0.4 , 0.98, 0.96, ..., 0.74, 0.84, 0.  ]),
  array([0, 1, 1, ..., 1, 1, 0])),
 'valid_predictions': (array([0.06, 0.17, 0.  , ..., 0.08, 0.57, 0.05]),
  array([0, 0, 0, ..., 0, 1, 0]))}

In [12]:
# dump(rfdict, f"{project_dir}/models/rf.joblib")

['/Users/rahul/Websites/mlops2_with_dagster/models/rf.joblib']

In [13]:
import dagstermill

# Yield the training outputs dict back through dagstermill under the output
# name "training_outputs".
# NOTE(review): this name presumably has to match the output declared where the
# notebook is wired into Dagster — confirm against the job definition.
dagstermill.yield_result(rfdict, output_name="training_outputs")

{'fit_clf': RandomForestClassifier(random_state=42),
 'train_predictions': (array([0.4 , 0.98, 0.96, ..., 0.74, 0.84, 0.  ]),
  array([0, 1, 1, ..., 1, 1, 0])),
 'valid_predictions': (array([0.06, 0.17, 0.  , ..., 0.08, 0.57, 0.05]),
  array([0, 0, 0, ..., 0, 1, 0]))}