# Pipelines: Metaflow model training

##  Install dependencies

In [None]:
# Optional: start a Jupyter environment in Docker before opening this notebook:
#   docker run -p 8888:8888 quay.io/jupyter/scipy-notebook

In [1]:
# Install dependencies from requirements.txt. The file must exist in the working
# directory; the recorded run below failed because it was missing.
%pip install -r requirements.txt

[31mERROR: Could not open requirements file: [Errno 2] No such file or directory: 'requirements.txt'[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [3]:
# Install exact, pinned package versions explicitly.
%pip install metaflow==2.13.9 pandas==2.2.3 numpy==1.26.4 scikit-learn==1.6.1 optuna==4.2.0

Collecting metaflow==2.13.9
  Downloading metaflow-2.13.9-py2.py3-none-any.whl.metadata (6.1 kB)
Collecting numpy==1.26.4
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting optuna==4.2.0
  Downloading optuna-4.2.0-py3-none-any.whl.metadata (17 kB)
Collecting boto3 (from metaflow==2.13.9)
  Downloading boto3-1.37.16-py3-none-any.whl.metadata (6.7 kB)
Collecting colorlog (from optuna==4.2.0)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting botocore<1.38.0,>=1.37.16 (from boto3->metaflow==2.13.9)
  Downloading botocore-1.37.16-py3-none-any.whl.metadata (5.7 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3->metaflow==2.13.9)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.12.0,>=0.11.0 (from boto3->metaflow==2.13.9)
  Downloading s3transfer-0.11.4-py3-none-any.whl.metadata (1.7 kB)
Downloading metaflow-2.13.9-py2.py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━

In [11]:
# Use the same scikit-learn pin as the main install cell so that re-running this
# cell can never pull a different version.
%pip install scikit-learn==1.6.1

Note: you may need to restart the kernel to use updated packages.


## Set username

In [4]:
# Identify the user for the workflow runs launched from this notebook
# (the flow run further down reports it as "user:pablo").
import os
os.environ.update(USERNAME="pablo")

In [12]:
%%writefile metaflow_trainingflow.py
from metaflow import FlowSpec, Parameter, step

class TrainingFlow(FlowSpec):
    """Train, evaluate and save a random forest classifier on the iris dataset.

    The steps run one after another:
    start -> ingest_data -> split_data -> train -> show_metrics ->
    register_model -> end.
    """

    # Define the parameters for the flow
    max_depth = Parameter('max_depth', default=2, help='Max depth of the random forest classifier')
    n_estimators = Parameter('n_estimators', default=100, help='Number of estimators for the random forest classifier')
    # random_state seeds both the train/test split and the classifier
    random_state = Parameter('random_state', default=0, help='Random state for the random forest classifier')
    # Formerly hard-coded values, now exposed as parameters; the defaults
    # reproduce the previous behaviour exactly.
    test_size = Parameter('test_size', default=0.2, help='Fraction of the data held out as the test set')
    model_path = Parameter('model_path', default='model.pkl', help='Local file the trained model is pickled to')

    @step
    def start(self):
        """Entry point of the flow; performs no work of its own."""
        self.next(self.ingest_data)

    @step
    def ingest_data(self):
        """Load the iris dataset and keep features and labels as ``X`` and ``y``."""
        from sklearn.datasets import load_iris

        # Load the iris dataset
        iris = load_iris()

        #pylint: disable=no-member
        self.X = iris.data
        self.y = iris.target
        #pylint: enable=no-member

        self.next(self.split_data)

    @step
    def split_data(self):
        """Split ``X``/``y`` into train and test sets.

        The held-out fraction comes from ``test_size`` and the shuffle is
        seeded with ``random_state``.
        """
        from sklearn.model_selection import train_test_split

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X,
            self.y,
            test_size=self.test_size,
            random_state=self.random_state,
        )

        self.next(self.train)

    @step
    def train(self):
        """Fit a random forest classifier on the training split and keep it as ``rf``."""
        from sklearn.ensemble import RandomForestClassifier

        # Create the random forest classifier from the flow parameters
        self.rf = RandomForestClassifier(
            max_depth=self.max_depth,
            n_estimators=self.n_estimators,
            random_state=self.random_state,
        )
        # Train the model
        self.rf.fit(self.X_train, self.y_train)

        self.next(self.show_metrics)

    @step
    def show_metrics(self):
        """Evaluate the model on the test split; print and store the metrics."""
        from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

        y_pred = self.rf.predict(self.X_test)

        # Assign the metrics to self so they are kept with the run instead of
        # only appearing in the task log.
        self.accuracy = accuracy_score(self.y_test, y_pred)
        self.classification_report = classification_report(self.y_test, y_pred)
        self.confusion_matrix = confusion_matrix(self.y_test, y_pred)

        print(f'Accuracy: {self.accuracy}')
        print(f'Report:\n{self.classification_report}')
        print(f'Confusion matrix:\n{self.confusion_matrix}')

        self.next(self.register_model)

    @step
    def register_model(self):
        """Pickle the trained model to ``model_path`` on local storage."""
        import pickle

        with open(self.model_path, 'wb') as f:
            pickle.dump(self.rf, f)

        self.next(self.end)

    @step
    def end(self):
        """Final step of the flow; nothing left to do."""
        pass
    
if __name__ == '__main__':
    # Instantiating the flow hands control to Metaflow's command line
    # (for example the `run` command used to execute this script).
    TrainingFlow()

Overwriting metaflow_trainingflow.py


In [14]:
# Execute the flow, passing each parameter explicitly (these values equal the flow's defaults).
!python metaflow_trainingflow.py run --max_depth 2 --n_estimators 100 --random_state 0

[35m[1mMetaflow 2.13.9[0m[35m[22m executing [0m[31m[1mTrainingFlow[0m[35m[22m[0m[35m[22m for [0m[31m[1muser:pablo[0m[35m[22m[K[0m[35m[22m[0m
[35m[22mValidating your flow...[K[0m[35m[22m[0m
[32m[1m    The graph looks good![K[0m[32m[1m[0m
[35m[22mRunning pylint...[K[0m[35m[22m[0m
[32m[22m    Pylint not found, so extra checks are disabled.[K[0m[32m[22m[0m
[35m2025-03-20 18:59:11.409 [0m[1mWorkflow starting (run-id 1742497151406177):[0m
[35m2025-03-20 18:59:11.416 [0m[32m[1742497151406177/start/1 (pid 3712)] [0m[1mTask is starting.[0m
[35m2025-03-20 18:59:11.622 [0m[32m[1742497151406177/start/1 (pid 3712)] [0m[1mTask finished successfully.[0m
[35m2025-03-20 18:59:11.625 [0m[32m[1742497151406177/ingest_data/2 (pid 3714)] [0m[1mTask is starting.[0m
[35m2025-03-20 18:59:12.693 [0m[32m[1742497151406177/ingest_data/2 (pid 3714)] [0m[1mTask finished successfully.[0m
[35m2025-03-20 18:59:12.698 [0m[32m[17424971514