<font color="#CA3532"><h1 align="left">Master Data Analytics. EDEM.</h1></font>
<font color="#6E6E6E"><h2 align="left">Herramientas MLOps.</h2></font>
<font color="#6E6E6E"><h2 align="left">Tarea 1. Pipeline entrenamiento de modelos.</h2></font>
#### Daniel Ruiz Riquelme
Reference: [Metaflow documentation — Creating Flows: Basics](https://docs.metaflow.org/metaflow/basics)

##  Install dependencies

In [1]:
!pip install metaflow
!pip install scikit-learn
!pip install pandas

Collecting metaflow
  Downloading metaflow-2.11.16-py2.py3-none-any.whl (1.3 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.3 MB[0m [31m5.7 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.3/1.3 MB[0m [31m20.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
Collecting boto3 (from metaflow)
  Downloading boto3-1.34.110-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting botocore<1.35.0,>=1.34.110 (from boto3->metaflow)
  Downloading botocore-1.34.110-py3-none-any.whl (12.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m5

## Set username

In [2]:
# The flow run further down is attributed to the user name found in the environment,
# so export it here before anything is launched.
import os

os.environ.update({"USERNAME": "daniel"})

In [6]:
%%writefile metaflow_trainingflow.py
from metaflow import FlowSpec, Parameter, step
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

class TrainingFlow(FlowSpec):
    """Train a random forest classifier on the iris dataset and save it to disk.

    The steps run linearly:
    start -> ingest_data -> split_data -> train -> show_metrics -> register_model -> end.

    Values assigned to ``self`` inside a step (``X``, ``y``, the train/test splits,
    ``clf``, ``accuracy``) are carried over to the following steps.
    """

    # Hyperparameters forwarded to RandomForestClassifier in the `train` step.
    max_depth = Parameter('max_depth', default=2, help='Max depth of the random forest classifier')
    n_estimators = Parameter('n_estimators', default=100, help='Number of estimators for the random forest classifier')
    # Used as the seed of both the train/test split and the classifier.
    random_state = Parameter('random_state', default=0, help='Random state for the train/test split and the random forest classifier')
    # Previously hard-coded values, exposed as parameters with the same defaults.
    test_size = Parameter('test_size', default=0.2, help='Fraction of the dataset held out for testing')
    model_path = Parameter('model_path', default='model.pkl', help='Path of the pickle file the trained model is written to')

    @step
    def start(self):
        """Entry point of the flow; does no work and moves on to data ingestion."""
        self.next(self.ingest_data)

    @step
    def ingest_data(self):
        """Load the iris dataset and keep features and labels as ``X`` and ``y``."""
        iris = load_iris()

        # pylint: disable=no-member
        self.X = iris.data
        self.y = iris.target
        # pylint: enable=no-member

        self.next(self.split_data)

    @step
    def split_data(self):
        """Split ``X``/``y`` into train and test sets using ``test_size`` and ``random_state``."""
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X,
            self.y,
            test_size=self.test_size,
            random_state=self.random_state,
        )
        self.next(self.train)

    @step
    def train(self):
        """Fit a RandomForestClassifier on the training split and keep it as ``clf``."""
        self.clf = RandomForestClassifier(
            max_depth=self.max_depth,
            n_estimators=self.n_estimators,
            random_state=self.random_state,
        )
        self.clf.fit(self.X_train, self.y_train)
        self.next(self.show_metrics)

    @step
    def show_metrics(self):
        """Evaluate ``clf`` on the test split; print the accuracy and keep it as ``accuracy``."""
        y_pred = self.clf.predict(self.X_test)
        # Assigning to self (instead of only printing) keeps the metric with the run,
        # so it can be inspected after the flow has finished.
        self.accuracy = accuracy_score(self.y_test, y_pred)
        print(self.accuracy)
        self.next(self.register_model)

    @step
    def register_model(self):
        """Serialize the trained model with pickle to ``model_path`` on local storage."""
        import pickle
        with open(self.model_path, 'wb') as f:
            pickle.dump(self.clf, f)
        self.next(self.end)

    @step
    def end(self):
        """Final step of the flow; nothing left to do."""

# Script entry point: the flow is only instantiated when this file is executed
# directly (e.g. `python metaflow_trainingflow.py run ...`), not when it is imported.
if __name__ == '__main__':
    TrainingFlow()


Overwriting metaflow_trainingflow.py


In [7]:
# Execute the flow written by the previous cell via its `run` command, passing the
# model hyperparameters and the seed explicitly (the values equal the declared defaults).
!python metaflow_trainingflow.py run --max_depth 2 --n_estimators 100 --random_state 0

[35m[1mMetaflow 2.11.16[0m[35m[22m executing [0m[31m[1mTrainingFlow[0m[35m[22m[0m[35m[22m for [0m[31m[1muser:daniel[0m[35m[22m[K[0m[35m[22m[0m
[35m[22mValidating your flow...[K[0m[35m[22m[0m
[32m[1m    The graph looks good![K[0m[32m[1m[0m
[35m[22mRunning pylint...[K[0m[35m[22m[0m
[32m[22m    Pylint not found, so extra checks are disabled.[K[0m[32m[22m[0m
[35m2024-05-22 14:46:36.751 [0m[1mWorkflow starting (run-id 1716389196750093):[0m
[35m2024-05-22 14:46:36.759 [0m[32m[1716389196750093/start/1 (pid 4212)] [0m[1mTask is starting.[0m
[35m2024-05-22 14:46:38.360 [0m[32m[1716389196750093/start/1 (pid 4212)] [0m[1mTask finished successfully.[0m
[35m2024-05-22 14:46:38.364 [0m[32m[1716389196750093/ingest_data/2 (pid 4243)] [0m[1mTask is starting.[0m
[35m2024-05-22 14:46:39.927 [0m[32m[1716389196750093/ingest_data/2 (pid 4243)] [0m[1mTask finished successfully.[0m
[35m2024-05-22 14:46:39.931 [0m[32m[171638919