# Pipelines: Metaflow model training

## Install dependencies

In [1]:
pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## Set username

In [2]:
# Pin the user name that workflows started from this notebook will pick up
# from the process environment.
import os

os.environ.update(USERNAME="pablo")

In [3]:
%%writefile metaflow_trainingflow.py
from metaflow import FlowSpec, Parameter, step
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pickle


class TrainingFlow(FlowSpec):
    """Train, evaluate and persist a random forest classifier on the iris dataset.

    Steps run in this order:
    start -> ingest_data -> split_data -> train -> show_metrics ->
    register_model -> end.
    """

    # Define the parameters for the flow
    max_depth = Parameter('max_depth', default=2, help='Max depth of the random forest classifier')
    n_estimators = Parameter('n_estimators', default=100, help='Number of estimators for the random forest classifier')
    random_state = Parameter('random_state', default=0, help='Random state for the random forest classifier')

    @step
    def start(self):
        """Entry point of the flow; does no work and hands over to ``ingest_data``."""
        self.next(self.ingest_data)

    @step
    def ingest_data(self):
        """Load the iris dataset and keep its features in ``self.X`` and labels in ``self.y``."""
        from sklearn.datasets import load_iris

        # Load the iris dataset
        iris = load_iris()

        #pylint: disable=no-member
        self.X = iris.data
        self.y = iris.target
        #pylint: enable=no-member

        self.next(self.split_data)

    @step
    def split_data(self):
        """Split the data into train and test sets, holding out 20% for testing.

        The ``random_state`` flow parameter seeds the split.
        """
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.2, random_state=self.random_state)

        self.next(self.train)

    @step
    def train(self):
        """Fit a ``RandomForestClassifier`` on the training split and keep it in ``self.model``.

        The classifier is configured from the ``max_depth``, ``n_estimators``
        and ``random_state`` flow parameters.
        """
        self.model = RandomForestClassifier(
            max_depth=self.max_depth,
            n_estimators=self.n_estimators,
            random_state=self.random_state,
        )

        self.model.fit(self.X_train, self.y_train)

        self.next(self.show_metrics)

    @step
    def show_metrics(self):
        """Predict on the test split and print the accuracy and classification report.

        The predictions are kept in ``self.y_pred``.
        """
        self.y_pred = self.model.predict(self.X_test)

        # Compute and print metrics
        accuracy = accuracy_score(self.y_test, self.y_pred)
        report = classification_report(self.y_test, self.y_pred)

        print(f"Accuracy: {accuracy:.2f}")
        print("Classification Report:")
        print(report)

        self.next(self.register_model)

    @step
    def register_model(self):
        """Pickle the trained model to ``random_forest_model.pkl`` in the current working directory."""
        # Imported locally because `os` is not among this file's module-level
        # imports; without it the path construction below raises NameError.
        import os

        # Save the model in a pickle file in local storage
        model_path = os.path.join(os.getcwd(), "random_forest_model.pkl")
        with open(model_path, "wb") as f:
            pickle.dump(self.model, f)

        print(f"Model saved to: {model_path}")
        self.next(self.end)

    @step
    def end(self):
        """Final step of the flow; performs no work."""
        pass

# Script entry point: instantiating the flow is the only thing this file does
# when executed directly (this notebook invokes it with the `run` command).
if __name__ == '__main__':
    TrainingFlow()

Overwriting metaflow_trainingflow.py


In [4]:
!python metaflow_trainingflow.py run --max_depth 2 --n_estimators 100 --random_state 0

Traceback (most recent call last):
  File "c:\Users\Master\AppData\Local\Programs\Python\Python311\Lib\site-packages\metaflow\extension_support\plugins.py", line 99, in get_plugin
    plugin_module = importlib.import_module(path)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Master\AppData\Local\Programs\Python\Python311\Lib\importlib\__init__.py", line 126, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<frozen importlib._bootstrap>", line 1206, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1178, in _find_and_load
  File "<frozen importlib._bootstrap>", line 1149, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 690, in _load_unlocked
  File "<frozen importlib._bootstrap_external>", line 940, in exec_module
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "c:\Users\Master\AppData\Local\P