# Pipelines: Metaflow model training

## Install dependencies

In [2]:
!pip install -r requirements.txt

Collecting metaflow==2.13.9 (from -r requirements.txt (line 1))
  Downloading metaflow-2.13.9-py2.py3-none-any.whl.metadata (6.1 kB)
Collecting pandas==2.2.3 (from -r requirements.txt (line 2))
  Downloading pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Collecting optuna==4.2.0 (from -r requirements.txt (line 5))
  Downloading optuna-4.2.0-py3-none-any.whl.metadata (17 kB)
Collecting boto3 (from metaflow==2.13.9->-r requirements.txt (line 1))
  Downloading boto3-1.37.9-py3-none-any.whl.metadata (6.6 kB)
Collecting alembic>=1.5.0 (from optuna==4.2.0->-r requirements.txt (line 5))
  Downloading alembic-1.15.1-py3-none-any.whl.metadata (7.2 kB)
Collecting colorlog (from optuna==4.2.0->-r requirements.txt (line 5))
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna==4.2.0->

## Set username

In [3]:
# Export the user name through the process environment so that commands
# launched later from this notebook (such as the flow run) inherit it.
import os

os.environ.update({"USERNAME": "eduardo"})

In [12]:
%%writefile metaflow_trainingflow.py
from metaflow import FlowSpec, Parameter, step
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import pickle
import os


class TrainingFlow(FlowSpec):
    """Train, evaluate and persist a random-forest classifier on the iris dataset.

    The steps run linearly:
    start -> ingest_data -> split_data -> train -> show_metrics
    -> register_model -> end
    """

    # Flow parameters; each can be overridden on the command line,
    # e.g. ``run --max_depth 3``.
    max_depth = Parameter('max_depth', default=2, help='Max depth of the random forest classifier')
    n_estimators = Parameter('n_estimators', default=100, help='Number of estimators for the random forest classifier')
    random_state = Parameter('random_state', default=0, help='Random state for the random forest classifier')

    @step
    def start(self):
        """Entry point of the flow; immediately hands over to data ingestion."""
        self.next(self.ingest_data)

    @step
    def ingest_data(self):
        """Load the iris dataset and keep features and labels as artifacts."""
        from sklearn.datasets import load_iris

        iris = load_iris()

        #pylint: disable=no-member
        self.X = iris.data
        self.y = iris.target
        #pylint: enable=no-member

        self.next(self.split_data)

    @step
    def split_data(self):
        """Split the data into an 80/20 stratified train/test partition."""
        # The split seed is fixed (42) independently of the classifier's
        # ``random_state`` parameter, so the partition is the same on every run.
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X,
            self.y,
            test_size=0.2,
            random_state=42,
            stratify=self.y,
        )

        self.next(self.train)

    @step
    def train(self):
        """Fit a RandomForestClassifier configured from the flow parameters."""
        self.model = RandomForestClassifier(
            n_estimators=self.n_estimators,
            max_depth=self.max_depth,
            random_state=self.random_state
        )

        # Train on the training partition only; the test partition is kept
        # for evaluation in the next step.
        self.model.fit(self.X_train, self.y_train)

        self.next(self.show_metrics)

    @step
    def show_metrics(self):
        """Evaluate the model on the test partition, then print and store the metrics.

        The scores are kept in ``self.test_metrics`` so they are saved with the
        run instead of being computed and thrown away.
        """
        y_pred = self.model.predict(self.X_test)

        # Macro averaging gives every class the same weight; switch to
        # 'weighted' if the classes are imbalanced.
        self.test_metrics = {
            'accuracy': float(accuracy_score(self.y_test, y_pred)),
            'precision_macro': float(precision_score(self.y_test, y_pred, average='macro')),
            'recall_macro': float(recall_score(self.y_test, y_pred, average='macro')),
            'f1_macro': float(f1_score(self.y_test, y_pred, average='macro')),
        }

        for metric_name, metric_value in self.test_metrics.items():
            print(f"{metric_name}: {metric_value:.4f}")
        print(classification_report(self.y_test, y_pred))

        self.next(self.register_model)

    @step
    def register_model(self):
        """Serialize the trained model to a pickle file in local storage."""
        model_filename = "trained_model.pkl"

        with open(model_filename, "wb") as file:
            pickle.dump(self.model, file)

        # Record where the file was written so the location is kept with the run.
        self.model_path = os.path.abspath(model_filename)
        print(f"Model saved to {self.model_path}")

        self.next(self.end)

    @step
    def end(self):
        """Final step; nothing left to do."""
        pass

if __name__ == "__main__":
    # Instantiating the flow is what runs it when this file is executed as a
    # script (e.g. ``python metaflow_trainingflow.py run``).
    TrainingFlow()

Writing metaflow_trainingflow.py


In [13]:
# Run the flow script written by the previous cell, passing each of the three
# flow parameters explicitly on the command line.
!python metaflow_trainingflow.py run --max_depth 2 --n_estimators 100 --random_state 0

[35m[1mMetaflow 2.13.9[0m[35m[22m executing [0m[31m[1mTrainingFlow[0m[35m[22m[0m[35m[22m for [0m[31m[1muser:eduardo[0m[35m[22m[K[0m[35m[22m[0m
[35m[22mValidating your flow...[K[0m[35m[22m[0m
[32m[1m    The graph looks good![K[0m[32m[1m[0m
[35m[22mRunning pylint...[K[0m[35m[22m[0m
[32m[22m    Pylint not found, so extra checks are disabled.[K[0m[32m[22m[0m
[35m2025-03-10 12:23:21.666 [0m[1mWorkflow starting (run-id 1741609401665815):[0m
[35m2025-03-10 12:23:21.674 [0m[32m[1741609401665815/start/1 (pid 5124)] [0m[1mTask is starting.[0m
[35m2025-03-10 12:23:23.618 [0m[32m[1741609401665815/start/1 (pid 5124)] [0m[1mTask finished successfully.[0m
[35m2025-03-10 12:23:23.623 [0m[32m[1741609401665815/ingest_data/2 (pid 5139)] [0m[1mTask is starting.[0m
[35m2025-03-10 12:23:25.553 [0m[32m[1741609401665815/ingest_data/2 (pid 5139)] [0m[1mTask finished successfully.[0m
[35m2025-03-10 12:23:25.558 [0m[32m[174160940