# Pipelines: Metaflow model training

## Install dependencies

In [5]:
!pip install -r requirements.txt



## Set username

In [6]:
# Set the username for workflows. The Metaflow run further down reports this
# value in its banner ("... for user:pau"), so it must be set before running.
import os
os.environ["USERNAME"] = "pau"

In [7]:
%%writefile metaflow_trainingflow.py
from metaflow import FlowSpec, Parameter, step
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import pickle
import os

class TrainingFlow(FlowSpec):
    """Train, evaluate and persist a random forest classifier on the Iris dataset.

    The steps run linearly:
    start -> ingest_data -> split_data -> train -> show_metrics ->
    register_model -> end.

    Values assigned to ``self`` inside a step (``X``, ``y``, the train/test
    splits and ``rfc``) are carried over to the following steps.
    """

    # Define the parameters for the flow (overridable on the command line,
    # e.g. ``run --max_depth 3 --test_size 0.25``).
    max_depth = Parameter('max_depth', default=2, help='Max depth of the random forest classifier')
    n_estimators = Parameter('n_estimators', default=100, help='Number of estimators for the random forest classifier')
    random_state = Parameter('random_state', default=0, help='Random state for the random forest classifier')
    # Previously hard-coded to 0.8, which left only 20% of the samples for
    # training; the conventional 80/20 train/test split is now the default.
    test_size = Parameter('test_size', default=0.2, help='Fraction of the dataset held out for testing')

    @step
    def start(self):
        """Entry point of the flow; hands over to data ingestion."""
        self.next(self.ingest_data)

    @step
    def ingest_data(self):
        """Load the Iris dataset and expose features/labels as ``X`` and ``y``."""
        # Imported here because only this step needs the dataset loader.
        from sklearn.datasets import load_iris

        # Load the iris dataset
        iris = load_iris()

        #pylint: disable=no-member
        self.X = iris.data
        self.y = iris.target
        #pylint: enable=no-member

        self.next(self.split_data)

    @step
    def split_data(self):
        """Split ``X``/``y`` into train and test subsets.

        ``test_size`` controls the held-out fraction and ``random_state``
        seeds the shuffle so the split is reproducible.
        """
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X,
            self.y,
            test_size=self.test_size,
            random_state=self.random_state,
        )
        self.next(self.train)

    @step
    def train(self):
        """Fit a ``RandomForestClassifier`` on the training subset."""
        self.rfc = RandomForestClassifier(
            max_depth=self.max_depth,
            n_estimators=self.n_estimators,
            random_state=self.random_state,
        )
        self.rfc.fit(self.X_train, self.y_train)
        self.next(self.show_metrics)

    @step
    def show_metrics(self):
        """Print accuracy and the confusion matrix computed on the test subset."""
        y_pred = self.rfc.predict(self.X_test)

        accuracy = accuracy_score(self.y_test, y_pred)
        print(f"Accuracy: {accuracy:.2f}")

        conf_matrix = confusion_matrix(self.y_test, y_pred)
        print("Confusion Matrix: ")
        print(conf_matrix)

        self.next(self.register_model)

    @step
    def register_model(self):
        """Save the trained model as a pickle file at ``model/model.pkl``."""
        model_path = os.path.join("model", "model.pkl")
        # Create the target directory on first run; no-op if it already exists.
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        with open(model_path, 'wb') as f:
            pickle.dump(self.rfc, f)

        self.next(self.end)

    @step
    def end(self):
        """Final step; only reports that the flow completed."""
        print("Flow finished")

# Script entry point: instantiating the flow is what runs it when this file is
# invoked from the command line (e.g. `python metaflow_trainingflow.py run`).
if __name__ == '__main__':
    TrainingFlow()

Overwriting metaflow_trainingflow.py


In [8]:
!python metaflow_trainingflow.py run --max_depth 2 --n_estimators 100 --random_state 0

[35m[1mMetaflow 2.13.9[0m[35m[22m executing [0m[31m[1mTrainingFlow[0m[35m[22m[0m[35m[22m for [0m[31m[1muser:pau[0m[35m[22m[K[0m[35m[22m[0m
[35m[22mValidating your flow...[K[0m[35m[22m[0m
[32m[1m    The graph looks good![K[0m[32m[1m[0m
[35m[22mRunning pylint...[K[0m[35m[22m[0m
[32m[22m    Pylint not found, so extra checks are disabled.[K[0m[32m[22m[0m
[35m2025-04-12 16:04:57.203 [0m[1mWorkflow starting (run-id 1744473897201488):[0m
[35m2025-04-12 16:04:57.294 [0m[32m[1744473897201488/start/1 (pid 5784)] [0m[1mTask is starting.[0m
[35m2025-04-12 16:05:03.724 [0m[32m[1744473897201488/start/1 (pid 5784)] [0m[1mTask finished successfully.[0m
[35m2025-04-12 16:05:03.745 [0m[32m[1744473897201488/ingest_data/2 (pid 5819)] [0m[1mTask is starting.[0m
[35m2025-04-12 16:05:08.995 [0m[32m[1744473897201488/ingest_data/2 (pid 5819)] [0m[1mTask finished successfully.[0m
[35m2025-04-12 16:05:09.002 [0m[32m[1744473897201