# Pipelines: Metaflow model training

## Install dependencies

In [1]:
!pip3 install -r requirements.txt



## Set username

In [2]:
# Export the workflow username through the process environment; commands
# launched from this kernel (for example `!` shell lines) inherit it.
# NOTE(review): the recorded run output reports a different user than the
# value set here - confirm which variable the workflow tool actually reads.
import os

os.environ.update({"USERNAME": "cokecancook"})

In [3]:
%%writefile metaflow_trainingflow.py
from metaflow import FlowSpec, Parameter, step
from sklearn.model_selection import train_test_split


class TrainingFlow(FlowSpec):
    """Train, evaluate and persist a random forest classifier on the iris dataset.

    The steps run linearly:
    start -> ingest_data -> split_data -> train -> show_metrics ->
    register_model -> end.

    Values assigned to ``self`` inside a step (``X``, ``y``, the train/test
    splits, ``model``, ``accuracy``) are carried over to the following steps.
    """

    # Flow parameters. Each one can be overridden on the command line as
    # `--<name> <value>`; the types are declared explicitly so the values reach
    # the steps already converted.
    max_depth = Parameter('max_depth', default=2, type=int, help='Max depth of the random forest classifier')
    n_estimators = Parameter('n_estimators', default=100, type=int, help='Number of estimators for the random forest classifier')
    # Note: random_state seeds both the train/test split and the classifier.
    random_state = Parameter('random_state', default=0, type=int, help='Random state for the random forest classifier')
    # Previously hard-coded values, exposed as parameters with the same defaults.
    test_size = Parameter('test_size', default=0.2, type=float, help='Fraction of the dataset held out for testing')
    model_filename = Parameter('model_filename', default='random_forest_model.pkl', type=str, help='File the trained model is pickled to')

    @step
    def start(self):
        """Entry point of the flow; hands over to data ingestion."""
        self.next(self.ingest_data)

    @step
    def ingest_data(self):
        """Load the iris dataset and store features and labels as ``X`` and ``y``."""
        from sklearn.datasets import load_iris

        iris = load_iris()

        #pylint: disable=no-member
        self.X = iris.data
        self.y = iris.target
        #pylint: enable=no-member

        self.next(self.split_data)

    @step
    def split_data(self):
        """Split ``X``/``y`` into train and test sets.

        The held-out fraction comes from the ``test_size`` parameter and the
        shuffle is seeded with ``random_state``.
        """
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=self.test_size, random_state=self.random_state
        )

        self.next(self.train)

    @step
    def train(self):
        """Fit a ``RandomForestClassifier`` on the training split and store it as ``model``."""
        from sklearn.ensemble import RandomForestClassifier

        clf = RandomForestClassifier(
            max_depth=self.max_depth,
            n_estimators=self.n_estimators,
            random_state=self.random_state,
        )
        clf.fit(self.X_train, self.y_train)
        self.model = clf

        self.next(self.show_metrics)

    @step
    def show_metrics(self):
        """Score the model on the test split, print the accuracy and store it as ``accuracy``."""
        from sklearn.metrics import accuracy_score

        predictions = self.model.predict(self.X_test)
        accuracy = accuracy_score(self.y_test, predictions)
        print("Accuracy:", accuracy)
        self.accuracy = accuracy

        self.next(self.register_model)

    @step
    def register_model(self):
        """Pickle the trained model to the file named by ``model_filename``.

        A relative filename is resolved against the working directory of the
        process executing this step.
        """
        import pickle

        with open(self.model_filename, "wb") as f:
            pickle.dump(self.model, f)

        print("Model saved as", self.model_filename)
        self.next(self.end)

    @step
    def end(self):
        """Final step; only reports that the flow finished."""
        print("Flow completed.")
    
# Script entry point: when the file is executed directly, instantiating the
# flow is what runs it (the command-line arguments are handled from there).
if __name__ == "__main__":
    TrainingFlow()

Writing metaflow_trainingflow.py


In [4]:
!python metaflow_trainingflow.py run --max_depth 2 --n_estimators 100 --random_state 0

[35m[1mMetaflow 2.13.9[0m[35m[22m executing [0m[31m[1mTrainingFlow[0m[35m[22m[0m[35m[22m for [0m[31m[1muser:cokestuyck[0m[35m[22m[K[0m[35m[22m[0m
[22mCreating local datastore in current directory (/Users/cokestuyck/Documents/GitHub/EDEM_MDA2425/ALUMNOS/MIA/COKE_STUYCK/MLOPS/03-pipelines/.metaflow)[K[0m[22m[0m
[35m[22mValidating your flow...[K[0m[35m[22m[0m
[32m[1m    The graph looks good![K[0m[32m[1m[0m
[35m[22mRunning pylint...[K[0m[35m[22m[0m
[32m[22m    Pylint not found, so extra checks are disabled.[K[0m[32m[22m[0m
[35m2025-04-20 00:51:51.590 [0m[1mWorkflow starting (run-id 1745103111589740):[0m
[35m2025-04-20 00:51:51.605 [0m[32m[1745103111589740/start/1 (pid 8898)] [0m[1mTask is starting.[0m
[35m2025-04-20 00:51:52.591 [0m[32m[1745103111589740/start/1 (pid 8898)] [0m[1mTask finished successfully.[0m
[35m2025-04-20 00:51:52.596 [0m[32m[1745103111589740/ingest_data/2 (pid 8900)] [0m[1mTask is starting.[

Metaflow 2.13.9 executing TrainingFlow for user:cokestuyck
Creating local datastore in current directory (/Users/cokestuyck/Documents/GitHub/EDEM_MDA2425/ALUMNOS/MIA/COKE_STUYCK/MLOPS/03-pipelines/.metaflow)
Validating your flow...
    The graph looks good!
Running pylint...
    Pylint not found, so extra checks are disabled.
2025-04-20 00:51:51.590 Workflow starting (run-id 1745103111589740):
2025-04-20 00:51:51.605 [1745103111589740/start/1 (pid 8898)] Task is starting.
2025-04-20 00:51:52.591 [1745103111589740/start/1 (pid 8898)] Task finished successfully.
2025-04-20 00:51:52.596 [1745103111589740/ingest_data/2 (pid 8900)] Task is starting.
2025-04-20 00:51:53.637 [1745103111589740/ingest_data/2 (pid 8900)] Task finished successfully.
2025-04-20 00:51:53.642 [1745103111589740/split_data/3 (pid 8903)] Task is starting.
2025-04-20 00:51:54.539 [1745103111589740/split_data/3 (pid 8903)] Task finished successfully.
2025-04-20 00:51:54.546 [1745103111589740/train/4 (pid 8905)] Task is starting.
2025-04-20 00:51:55.765 [1745103111589740/train/4 (pid 8905)] Task finished successfully.
2025-04-20 00:51:55.771 [1745103111589740/show_metrics/5 (pid 8907)] Task is starting.
2025-04-20 00:51:56.610 [1745103111589740/show_metrics/5 (pid 8907)] Accuracy: 1.0
2025-04-20 00:51:56.731 [1745103111589740/show_metrics/5 (pid 8907)] Task finished successfully.
2025-04-20 00:51:56.736 [1745103111589740/register_model/6 (pid 8909)] Task is starting.
2025-04-20 00:51:57.638 [1745103111589740/register_model/6 (pid 8909)] Model saved as random_forest_model.pkl
2025-04-20 00:51:57.760 [1745103111589740/register_model/6 (pid 8909)] Task finished successfully.
2025-04-20 00:51:57.765 [1745103111589740/end/7 (pid 8911)] Task is starting.
2025-04-20 00:51:58.589 [1745103111589740/end/7 (pid 8911)] Flow completed.
2025-04-20 00:51:58.722 [1745103111589740/end/7 (pid 8911)] Task finished successfully.
2025-04-20 00:51:58.724 Done!

![Metaflow Logs](metaflow-screenshot.png "Metaflow Logs")

