In [1]:
import json
import pickle
import logging
import os
import warnings

import joblib
import mlflow
import numpy as np
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt


from imblearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeRegressor


logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
warnings.filterwarnings("ignore")


In [2]:
def read_csv(path) -> pd.DataFrame:
    return pd.read_csv(path)

def data_split(df: pd.DataFrame):
    """This function do splitting of data"""
    X = df[:, :-1]
    y = df[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

    return X_train, X_val, X_test, y_train, y_val, y_test
   
def save_pickle(data, filename) -> None:
    """
    This function saves the data in a pickle file. Args:
        data (object): data to save, filename (str): filename
    Returns:
        None
    """
    filepath = os.path.join("data_processed", f"{filename}.pkl")
    with open(filepath, 'wb') as file:
        pickle.dump(data, file)

def data_transform(X_train, X_val, X_test, y_train, y_val, y_test):
    """This function do data transformation"""
    scaler= MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)
    save_pickle(scaler, "scaler")
    save_pickle((X_train, y_train), "train")
    save_pickle((X_val, y_val), "val")
    save_pickle((X_test, y_test), "test")
    #logger.info("data saved successfully in pickle files")
    return X_train, X_val, X_test

In [5]:
bostonHousing_Data=read_csv('https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv')
print(bostonHousing_Data.shape)
bostonHousing_Data.head()


(506, 14)


Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [4]:
# Check for missing values
print(bostonHousing_Data.isnull().sum())

crim       0
zn         0
indus      0
chas       0
nox        0
rm         0
age        0
dis        0
rad        0
tax        0
ptratio    0
b          0
lstat      0
medv       0
dtype: int64


In [5]:
X_train, X_val, X_test, y_train, y_val, y_test = data_split(bostonHousing_Data.values)
X_train, X_val, X_test = data_transform(X_train, X_val, X_test, y_train, y_val, y_test)
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_val shape: {X_val.shape}, y_val shape: {y_val.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")


X_train shape: (404, 13), y_train shape: (404,)
X_val shape: (51, 13), y_val shape: (51,)
X_test shape: (51, 13), y_test shape: (51,)


In [13]:
from mlflow.tracking import MlflowClient

client = MlflowClient()

# Lista todos los experimentos, incluso los eliminados (si están)
experiments = client.search_experiments(view_type=2)  # 2 = ALL

for exp in experiments:
    print(f"ID: {exp.experiment_id}, Name: {exp.name}, Deleted: {exp.lifecycle_stage}")

ID: 4, Name: BostonHousing_v1, Deleted: deleted
ID: 3, Name: BostonHousing_basseline, Deleted: deleted
ID: 2, Name: BostonHousing_01, Deleted: deleted
ID: 1, Name: BostonHousing_baseline, Deleted: deleted


In [14]:
mlflow.set_tracking_uri("sqlite:///mlflow.db") #For run this is imporant to activate mlflow tracking server:  mlflow ui --backend-store-uri sqlite:///backend.db
mlflow.set_experiment("BostonHousing")

def display_regression_report(
    model: object,
    name_model: str,
    developer: str,
    X_train: pd.DataFrame,
    X_val: pd.DataFrame,
    y_train: pd.Series,
    y_val: pd.Series,
    use_cv = False
):
    """This function display the regression report
     Returns:
      metric (list): list with the metrics"""
    
    # star experiment in mlflow
    with mlflow.start_run(run_name=name_model):
        mlflow.log_param("model", name_model)
        mlflow.log_param("developer", developer)
        # empty list to store the metrics and then tracking them in mlflow
        metric = []
        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        mse_train = round(mean_squared_error(y_train, y_train_pred), 2)
        mse_val = round(mean_squared_error(y_val, y_val_pred), 2)
        r2_train = round(r2_score(y_train, y_train_pred), 2)
        r2_val = round(r2_score(y_val, y_val_pred), 2)
        logger.info("Mean Squared Error Train:", mse_train)
        logger.info("Mean Squared Error Validation:", mse_val)
        logger.info("R2 Score Train:", r2_train)
        logger.info("R2 Score Validation:", r2_val)
        
        
        # adding the metrics to the list
        metric.extend([mse_train, mse_val, r2_train, r2_val])

        mlflow.log_metric("mse_train", mse_train)
        mlflow.log_metric("mse_val", mse_val)
        mlflow.log_metric("r2_train", r2_train)
        mlflow.log_metric("r2_val", r2_val)
        # Log the model parameters
        mlflow.log_params(model.get_params())
        # Log the model      
        mlflow.sklearn.log_model(model, f"model_{name_model}")

      
        return metric



2025/06/22 13:15:04 INFO mlflow.tracking.fluent: Experiment with name 'BostonHousing' does not exist. Creating a new experiment.


**RIDGE REGRESSION MODEL**

In [15]:
ridge = Ridge(alpha=0.5, fit_intercept=True, random_state=42)
ridge.fit(X_train, y_train)
display_regression_report(model=ridge, 
                              name_model="RidgeRegression", 
                              developer="YominJ", 
                              X_train=X_train, 
                              X_val=X_val, 
                              y_train=y_train, 
                              y_val=y_val,
                              use_cv=False)



[21.75, 20.99, 0.75, 0.75]

**DESCISION TREE REGRESSOR**

In [16]:
dtr = DecisionTreeRegressor(random_state=42, max_depth=10, min_samples_split=5)
dtr.fit(X_train,y_train)
display_regression_report(model=dtr,
                              name_model="DecisionTreeRegressor",
                              developer="YominJ",
                               X_train=X_train, 
                              X_val=X_val, 
                              y_train=y_train, 
                              y_val=y_val,
                              use_cv=False)



[2.16, 10.83, 0.98, 0.87]

**RANDOM FOREST REGRESSOR**

In [26]:
rfr= RandomForestRegressor(random_state=42, n_estimators=100, max_depth=9, min_samples_split=3)
rfr.fit(X_train,y_train)
display_regression_report(model=rfr,
                            name_model="RandomForestRegressor",
                            developer="YominJ",
                            X_train=X_train,
                            X_val=X_val,
                            y_train=y_train,
                            y_val=y_val,
                            use_cv=False)



[2.59, 5.39, 0.97, 0.93]

**kNN REGRESSOR**

In [18]:
knnr = KNeighborsRegressor(n_neighbors=5, weights='distance', algorithm='auto', p=2, metric='minkowski', n_jobs=-1)
knnr.fit(X_train, y_train)
display_regression_report(model=knnr,
                              name_model="KNeighborsRegressor",
                              developer="YominJ",
                              X_train=X_train,
                              X_val=X_val,
                              y_train=y_train,
                              y_val=y_val,
                              use_cv=False)



[0.0, 15.12, 1.0, 0.82]

ejecutar mlflow ui --backend-store-uri sqlite:///mlflow.db para visualizar los resultados

**HACER INFERENCIAS CON EL MODELO**

In [20]:
import mlflow


logged_model = "runs:/432888a8b30140588b16689d7093f85e/model_DecisionTreeRegressor"  # Replace <RUN_ID> with the actual run ID from your MLflow experiment

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]
Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 819.10it/s] 


In [21]:
loaded_model

mlflow.pyfunc.loaded_model:
  artifact_path: /home/yominjaramillom/SalvaHealth/mlruns/5/models/m-53f552080a6942cf9680bd82d55e31c3/artifacts
  flavor: mlflow.sklearn
  run_id: 432888a8b30140588b16689d7093f85e

In [22]:
rf_model = mlflow.sklearn.load_model(logged_model)

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]


Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 1503.98it/s] 


In [23]:
rf_model.predict(X_test)

array([33.175     , 21.02      , 14.23333333, 22.01428571, 10.5       ,
       21.05      , 21.02      , 22.01428571, 17.54285714, 16.75      ,
       33.2       , 17.96666667, 15.2       , 19.175     , 44.75      ,
       21.81428571, 13.56666667, 15.2       , 22.01428571, 33.2       ,
       11.36666667, 11.36666667, 29.625     , 21.26666667, 17.54285714,
       23.06      , 26.325     , 10.5       , 14.1       , 44.475     ,
       22.925     , 17.86666667, 15.975     , 19.48823529, 20.48571429,
       33.175     , 17.54285714, 23.625     , 16.25      , 24.325     ,
       23.625     , 19.48823529, 21.02      , 14.65      , 16.13333333,
       23.625     , 25.175     , 24.325     , 19.48823529, 19.175     ,
       35.5       ])

In [25]:
print(mean_squared_error(y_test, rf_model.predict(X_test)))
print(r2_score(y_test, rf_model.predict(X_test)))

11.056515514483374
0.8253113279938339


**CONECTARSE AL CLIENTE MLFLOW**

In [27]:
from mlflow.tracking import MlflowClient
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

In [30]:
runs = mlflow.search_runs()
# Extrae los IDs únicos de los experimentos
experiment_ids = runs['experiment_id'].unique()
print("Unique Experiment IDs:")
for exp_id in experiment_ids:
    exp = client.get_experiment(exp_id)
    print(f"Experiment ID: {exp.experiment_id}, Name: {exp.name}, Lifecycle Stage: {exp.lifecycle_stage}")  
    

Unique Experiment IDs:
Experiment ID: 5, Name: BostonHousing, Lifecycle Stage: active


In [33]:
runs = client.search_runs(experiment_ids=["5"])
for run in runs:
    print(run.info)

<RunInfo: artifact_uri='/home/yominjaramillom/SalvaHealth/mlruns/5/5eefa68c6dc44b67b899c7917c47388d/artifacts', end_time=1750618094381, experiment_id='5', lifecycle_stage='active', run_id='5eefa68c6dc44b67b899c7917c47388d', run_name='RandomForestRegressor', start_time=1750618091412, status='FINISHED', user_id='yominjaramillom'>
<RunInfo: artifact_uri='/home/yominjaramillom/SalvaHealth/mlruns/5/03abee0f79d5472696c74ca74ae8300e/artifacts', end_time=1750616117955, experiment_id='5', lifecycle_stage='active', run_id='03abee0f79d5472696c74ca74ae8300e', run_name='KNeighborsRegressor', start_time=1750616115624, status='FINISHED', user_id='yominjaramillom'>
<RunInfo: artifact_uri='/home/yominjaramillom/SalvaHealth/mlruns/5/8b41b001e12c40dcacb93548217b1f48/artifacts', end_time=1750616115598, experiment_id='5', lifecycle_stage='active', run_id='8b41b001e12c40dcacb93548217b1f48', run_name='RandomForestRegressor', start_time=1750616113329, status='FINISHED', user_id='yominjaramillom'>
<RunInfo: ar

**PROMOCION DEL MODELO**

In [37]:
runs= client.search_runs(
    experiment_ids='5',
    filter_string="", #se puede usar un tag en especial o algún parámetro de interés ej: 'tags.model = valor que usaste
    run_view_type=mlflow.entities.ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.r2_score DESC"]

)
for run in runs:
    print(f"run id: {run.info.run_id}, r2_score: {run.data.metrics['r2_val']}, mse: {run.data.metrics['mse_val']}, params: {run.data.params}")

run id: 5eefa68c6dc44b67b899c7917c47388d, r2_score: 0.93, mse: 5.39, params: {'model': 'RandomForestRegressor', 'developer': 'YominJ', 'bootstrap': 'True', 'ccp_alpha': '0.0', 'criterion': 'squared_error', 'max_depth': '9', 'max_features': '1.0', 'max_leaf_nodes': 'None', 'max_samples': 'None', 'min_impurity_decrease': '0.0', 'min_samples_leaf': '1', 'min_samples_split': '3', 'min_weight_fraction_leaf': '0.0', 'monotonic_cst': 'None', 'n_estimators': '100', 'n_jobs': 'None', 'oob_score': 'False', 'random_state': '42', 'verbose': '0', 'warm_start': 'False'}
run id: 03abee0f79d5472696c74ca74ae8300e, r2_score: 0.82, mse: 15.12, params: {'model': 'KNeighborsRegressor', 'developer': 'YominJ', 'algorithm': 'auto', 'leaf_size': '30', 'metric': 'minkowski', 'metric_params': 'None', 'n_jobs': '-1', 'n_neighbors': '5', 'p': '2', 'weights': 'distance'}
run id: 8b41b001e12c40dcacb93548217b1f48, r2_score: 0.94, mse: 5.05, params: {'model': 'RandomForestRegressor', 'developer': 'YominJ', 'bootstrap'

In [42]:
model_name = "RandomForestRegressor"
latest_versions = client.get_latest_versions(name=model_name)
for version in latest_versions:
    print(f" version: {version.version} , actual stage: {version._current_stage}")

 version: 1 , actual stage: None
 version: 2 , actual stage: Staging


In [43]:
client.transition_model_version_stage(
    name=model_name,
    version=2,
    stage="Production"
)

<ModelVersion: aliases=[], creation_timestamp=1750618210228, current_stage='Production', deployment_job_state=None, description='', last_updated_timestamp=1750627826278, metrics=None, model_id=None, name='RandomForestRegressor', params=None, run_id='', run_link='', source='/home/yominjaramillom/SalvaHealth/mlruns/5/models/m-8dbb06df4e4148eb8bc9866cbcbf3825/artifacts', status='READY', status_message=None, tags={}, user_id=None, version=2>

In [44]:
client.update_model_version(
    name=model_name, 
    version=2,
    description=f"The model version {2} was transitioned to Production on {datetime.today().date()}"
)

<ModelVersion: aliases=[], creation_timestamp=1750618210228, current_stage='Production', deployment_job_state=None, description='The model version 2 was transitioned to Production on 2025-06-22', last_updated_timestamp=1750627828448, metrics=None, model_id=None, name='RandomForestRegressor', params=None, run_id='', run_link='', source='/home/yominjaramillom/SalvaHealth/mlruns/5/models/m-8dbb06df4e4148eb8bc9866cbcbf3825/artifacts', status='READY', status_message=None, tags={}, user_id=None, version=2>

**PRUEBA DEL MODELO EN PRODUCCION**

In [46]:
import scipy

In [49]:
def testint_model_from_mlflow(model_name: str, stage:str, X_test: scipy.sparse._csr.csr_matrix, Y_test: pd.Series):
    """this function tests a model from mlflow
    Args:
        model_name (str): name of the model
        stage (str): stage of the model
        X_test (scipy.sparse._csr.csr_matrix): test data
        Y_test (scipy.sparse._csr.csr_matrix): test target
    Returns:
        float: rmse of the model
    
    """
    model_uri = f"models:/{model_name}/{stage}"
    model = mlflow.pyfunc.load_model(model_uri)
    y_pred = model.predict(X_test)
    RFr2_score = round(r2_score(Y_test, y_pred), 2)
    return {"r2_score": RFr2_score}

testint_model_from_mlflow(model_name= "RandomForestRegressor", stage="Production", X_test=X_test, Y_test=y_test)

{'r2_score': 0.84}