# MLOps Workflow con MLflow

## 1. Importar librerías y configuración de MLflow

In [1]:
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Point MLflow tracking at a SQLite database file (mlflow.db, resolved
# relative to the notebook's working directory) instead of the default store.
mlflow.set_tracking_uri("sqlite:///mlflow.db")


* 'schema_extra' has been renamed to 'json_schema_extra'


## 2. Cargar y preprocesar la data 

Este paso es hipotético: para efectos prácticos, cargamos las bases de train y test que ya se preprocesaron en la notebook e2e_construccion.ipynb.

In [2]:
# Read the pre-split feature/target tables, in the same order as before:
# train features, test features, train target, test target.
X_train, X_test, y_train, y_test = (
    pd.read_csv(csv_name)
    for csv_name in ('X_train.csv', 'X_test.csv', 'y_train.csv', 'y_test.csv')
)

## 3. Entrenar el modelo con MLflow

In [3]:
mlflow.set_experiment("Repayment_plan_acceptance")

with mlflow.start_run():
    # Hyperparameters of the random forest; logged to MLflow below.
    n_estimators = 100
    max_depth = None

    rf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
    # y_train comes from pd.read_csv, so it is a DataFrame rather than a 1-D
    # vector; passing it as-is makes scikit-learn emit a column-vector warning.
    # Flatten it explicitly (assumes y_train.csv holds a single target column).
    rf.fit(X_train, np.ravel(y_train))

    y_pred = rf.predict(X_test)

    # Weighted averages: each class contributes proportionally to its support.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Log hyperparameters
    mlflow.log_params({"n_estimators": n_estimators, "max_depth": max_depth})

    # Log evaluation metrics; "f1_score" is the key the registration cell
    # sorts runs by.
    mlflow.log_metrics({
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
    })

    # Log the model together with its input/output schema. The artifact path
    # must stay "random_forest_model": the registration cell builds the
    # runs:/ URI from it.
    signature = infer_signature(X_test, y_pred)
    mlflow.sklearn.log_model(rf, "random_forest_model", signature=signature)

    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")

print("Modelo entrenado y registrado en MLflow.")

  return fit_method(estimator, *args, **kwargs)


Accuracy: 0.9043182186672432
Precision: 0.9051112454236506
Recall: 0.9043182186672432
F1 Score: 0.9041625339127102
Modelo entrenado y registrado en MLflow.


## 4. Definir el pipeline del modelo para inferencia

- Extraer los datos de las diferentes fuentes de datos.
- Preprocesar y transformar los datos numéricos y categóricos; recordar que primero se hace una transformación logarítmica y luego una transformación de normalización.



In [24]:
from sklearn.base import BaseEstimator, TransformerMixin


# Lookup tables read by DataExtractor (demographic + historic) and the
# out-of-time evaluation base, loaded in that order.
df_demografico, df_historic, df_evaluation = (
    pd.read_csv(csv_path)
    for csv_path in (
        'processed_files/demografica.csv',
        'processed_files/historic.csv',
        'files/prueba_op_base_pivot_var_rpta_alt_enmascarado_oot.csv',
    )
)

class DataExtractor(BaseEstimator, TransformerMixin):
    """Build the model's feature table from the demographic and historic tables.

    For each input row, identified by its ``nit_enmascarado`` and
    ``num_oblig_enmascarado`` values, the transformer looks up the
    module-level ``df_demografico`` and ``df_historic`` frames and derives:

    * ``tot_patrimonio``, ``total_ing``: median over the demographic rows
      matching ``nit_enmascarado``.
    * ``min_mora``, ``max_mora``: min / max of ``dias_mora`` over the historic
      rows matching both identifiers.
    * ``valor_cuota_mes``: median over the same historic rows.
    * ``producto``, ``marca_pago``: most frequent value over the same rows.

    The result is a DataFrame (not a bare array) because the next pipeline
    step selects columns by name, and because packing numbers and strings
    into a single NumPy array would coerce every value to a string.
    """

    # Output column order; the names are the ones the ColumnTransformer
    # selects through numeric_features / categorical_features.
    OUTPUT_COLUMNS = [
        'tot_patrimonio', 'total_ing', 'min_mora', 'max_mora',
        'valor_cuota_mes', 'producto', 'marca_pago',
    ]

    @staticmethod
    def _first_mode(values):
        """Return the first mode of ``values``, or NaN when there is none.

        An empty selection has no mode; returning NaN mirrors what
        ``median`` / ``min`` / ``max`` already yield for empty selections
        instead of raising on ``mode()[0]``.
        """
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else np.nan

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned, returns ``self``."""
        return self

    def transform(self, X):
        """Return one feature row per row of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain the columns ``nit_enmascarado`` and
            ``num_oblig_enmascarado``.

        Returns
        -------
        pandas.DataFrame
            Columns ``OUTPUT_COLUMNS``, indexed like ``X``. Identifiers with
            no matching rows produce NaN in the affected columns.
        """
        rows = []
        for nit, oblig in zip(X['nit_enmascarado'], X['num_oblig_enmascarado']):
            # Build each selection once and reuse it for every aggregate.
            demografico = df_demografico[df_demografico['nit_enmascarado'] == nit]
            historico = df_historic[
                (df_historic['nit_enmascarado'] == nit)
                & (df_historic['num_oblig_enmascarado'] == oblig)
            ]

            rows.append([
                demografico['tot_patrimonio'].median(),
                demografico['total_ing'].median(),
                historico['dias_mora'].min(),
                historico['dias_mora'].max(),
                historico['valor_cuota_mes'].median(),
                self._first_mode(historico['producto']),
                self._first_mode(historico['marca_pago']),
            ])

        return pd.DataFrame(rows, columns=self.OUTPUT_COLUMNS, index=X.index)

def log_trans(valor):
    """Return ``log(1 + valor)``, element-wise for array-like input.

    Uses ``np.log1p`` instead of ``np.log(valor + 1)``: the result is the same
    mathematically but stays accurate when ``valor`` is very close to zero,
    where ``valor + 1`` would round to exactly 1 and the log would collapse
    to 0.

    Parameters
    ----------
    valor : scalar, numpy.ndarray, pandas.Series or pandas.DataFrame
        Values greater than -1.

    Returns
    -------
    Same container type as ``valor`` with the transformed values.
    """
    return np.log1p(valor)

# Column names produced by the data-extraction step, split by how they are
# preprocessed.
numeric_features = ['tot_patrimonio', 'total_ing', 'min_mora', 'max_mora', 'valor_cuota_mes']
categorical_features = ['producto', 'marca_pago']

# Numeric columns: log transform first, then scaling to [0, 1].
# Categorical columns: one-hot encoding; categories not seen during fit are
# encoded as all zeros instead of raising.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('log', FunctionTransformer(log_trans, validate=False)),
            ('scaler', MinMaxScaler())
        ]), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# End-to-end inference pipeline: raw identifiers -> extracted features ->
# preprocessed matrix -> classifier. The classifier is seeded with the same
# random_state as the training cell so that refitting is reproducible.
model = Pipeline([
    ('data_extraction', DataExtractor()),
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [18]:
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType

# Name under which the best model is registered. The notebook used
# `model_name` without ever defining it, which raises NameError on a fresh
# kernel. NOTE(review): adjust the value to the registry's naming convention.
model_name = "repayment_plan_acceptance_rf"

client = MlflowClient()

# MlflowClient.list_experiments was removed in MLflow 2.x; search_experiments
# is its replacement.
experiments = client.search_experiments(view_type=ViewType.ACTIVE_ONLY)
experiment_ids = [experiment.experiment_id for experiment in experiments]

# Query every active experiment in a single call so the f1_score ordering is
# global. Searching experiment by experiment and concatenating the results
# only guarantees that the first element is the best run of the first
# experiment, not the best run overall.
all_runs = []
if experiment_ids:
    all_runs = client.search_runs(
        experiment_ids=experiment_ids,
        filter_string="",
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=500,
        order_by=["metrics.f1_score DESC"]
    )

# Runs that never logged f1_score cannot be ranked; drop them instead of
# failing with KeyError further down.
scored_runs = [run for run in all_runs if "f1_score" in run.data.metrics]

if scored_runs:
    best_run = max(scored_runs, key=lambda run: run.data.metrics["f1_score"])
    print(f"El mejor run tiene un f1_score de: {best_run.data.metrics['f1_score']}")

    # Register the model logged by the best run; the artifact path matches the
    # one used by mlflow.sklearn.log_model in the training cell.
    model_uri = f"runs:/{best_run.info.run_id}/random_forest_model"
    model_version = mlflow.register_model(model_uri, model_name)

    # Promote the new version to the "Production" stage.
    client.transition_model_version_stage(
        name=model_name,
        version=model_version.version,
        stage="Production"
    )

    print(f"Modelo promovido a producción: {model_name} v{model_version.version}")
else:
    print("No se encontraron experimentos con métricas de f1_score.")