In [1]:
import mlflow
import pandas as pd
import numpy as np
import xgboost as xgb
import os
from sklearn.metrics import root_mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model
import json


In [2]:
import mlflow
from mlflow.exceptions import MlflowException

# Set tracking URI
mlflow.set_tracking_uri("sqlite:///mlflow.db")

# Get the experiment details
experiment_name = "boston-housing-experiment"
experiment = mlflow.get_experiment_by_name(experiment_name)

if experiment.lifecycle_stage == "deleted":
    client = mlflow.tracking.MlflowClient()
    client.restore_experiment(experiment.experiment_id)
    print(f"Experiment '{experiment_name}' has been restored.")
else:
    print(f"Experiment '{experiment_name}' is already active.")


Experiment 'boston-housing-experiment' is already active.


In [10]:
# def preprocessing(x_train, x_test, y_train, y_test):
#     # Apply log transformation to the target variable
#     y_train = np.log1p(y_train)
#     y_test = np.log1p(y_test)

#     # Check skewness and apply log transformation if necessary
#     for col in x_train.columns:
#         if np.abs(x_train[col].skew()) > 0.3:
#             x_train[col] = np.log1p(x_train[col])
#             x_test[col] = np.log1p(x_test[col])

#     # Fit the scaler on the training data and transform both train and test data
#     min_max_scaler = MinMaxScaler()
#     x_train_scaled = pd.DataFrame(data=min_max_scaler.fit_transform(x_train), columns=x_train.columns)
#     x_test_scaled = pd.DataFrame(data=min_max_scaler.transform(x_test), columns=x_test.columns)

#     return x_train_scaled, x_test_scaled, y_train, y_test

# def read_dataframe(filename="data/housing.csv"):

#     df = pd.read_csv(filename, header=None, delimiter=r"\s+", names=column_names)
#     # Load the dataset
#     column_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
#     # Split the dataset into train and test sets
#     train_df, test_df = train_test_split(df, test_size=0.2, random_state=42) 
#     # Separate features and target for train and test sets
#     x_train = train_df.loc[:, ['LSTAT', 'INDUS', 'NOX', 'PTRATIO', 'RM', 'TAX', 'DIS', 'AGE']]
#     y_train = train_df['MEDV']
#     x_test = test_df.loc[:, ['LSTAT', 'INDUS', 'NOX', 'PTRATIO', 'RM', 'TAX', 'DIS', 'AGE']]
#     y_test = test_df['MEDV']
#     # Apply preprocessing
#     x_train_scaled, x_test_scaled, y_train, y_test = preprocessing(x_train, x_test, y_train, y_test)
#     return x_train_scaled, x_test_scaled, y_train, y_test


So with these analsis, we may try predict MEDV with 'LSTAT', 'INDUS', 'NOX', 'PTRATIO', 'RM', 'TAX', 'DIS', 'AGE' features. Let's try to remove the skewness of the data trough log transformation.

Let's try Linear, Ridge Regression on dataset first.

In [3]:
def load_data(url="https://raw.githubusercontent.com/arunv22/zoomcamp_mlops_project/main/data/housing.csv"):
    column_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
    df = pd.read_csv(url, header=None, delimiter=r"\s+", names=column_names)
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
    
    x_train = train_df.loc[:, ['LSTAT', 'INDUS', 'NOX', 'PTRATIO', 'RM', 'TAX', 'DIS', 'AGE']]
    y_train = train_df['MEDV']
    x_test = test_df.loc[:, ['LSTAT', 'INDUS', 'NOX', 'PTRATIO', 'RM', 'TAX', 'DIS', 'AGE']]
    y_test = test_df['MEDV']
    
    return x_train, x_test, y_train, y_test



In [4]:
# Define a custom transformer for applying log transformation
class LogTransformer(FunctionTransformer):
    def __init__(self):
        super().__init__(func=np.log1p, validate=False)
    
    def inverse_transform(self, X):
        return np.expm1(X)


In [5]:
def create_pipeline(model):
    return Pipeline([
        ('preprocessor', ColumnTransformer(
            transformers=[
                ('log', LogTransformer(), ['LSTAT', 'INDUS', 'NOX', 'PTRATIO', 'RM', 'TAX', 'DIS', 'AGE']),
                ('scaler', MinMaxScaler(), ['LSTAT', 'INDUS', 'NOX', 'PTRATIO', 'RM', 'TAX', 'DIS', 'AGE'])
            ],
            remainder='passthrough'
        )),
        ('model', model)
    ])


In [6]:
def train_and_log_models(x_train, y_train, x_test, y_test):
    mlflow.set_experiment("boston-housing-experiment")
    
    models = {
        'Linear Regression': linear_model.LinearRegression(),
        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
        'Ridge Regression': linear_model.Ridge(alpha=0.1),
        'XGBoost': xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42)
    }
    
    for name, model in models.items():
        with mlflow.start_run() as run:
            mlflow.set_tag("developer", "arun")
            mlflow.log_param("train-data-path", "./data/housing.csv")
            
            if name == 'XGBoost':
                model.fit(x_train, np.log1p(y_train))
                y_pred_log = model.predict(x_test)
            else:
                pipeline = create_pipeline(model)
                pipeline.fit(x_train, np.log1p(y_train))
                y_pred_log = pipeline.predict(x_test)
                
            y_pred = np.expm1(y_pred_log)
            y_test_original = np.expm1(np.log1p(y_test))
            
            rmse = root_mean_squared_error(y_test_original, y_pred)
            mlflow.log_metric("rmse", rmse)
            
            # Log the model using mlflow.sklearn or mlflow.xgboost
            model_name = name.replace(' ', '_')
            if name == 'XGBoost':
                mlflow.xgboost.log_model(model, artifact_path=f"models/{model_name}")
            else:
                mlflow.sklearn.log_model(pipeline, artifact_path=f"models/{model_name}")
            
            print(f"{name} logged with RMSE: {rmse:.2f}")
            print(f"Model artifact path: runs:/{run.info.run_id}/models/{model_name}")


In [7]:
def get_best_run_uri(experiment_name, metric_name, metric_goal='minimize'):
    mlflow.set_tracking_uri("sqlite:///mlflow.db")
    mlflow.set_experiment(experiment_name)
    experiment = mlflow.get_experiment_by_name(experiment_name)
    if not experiment:
        raise ValueError("Experiment not found.")
    
    # Search for runs in the experiment
    runs = mlflow.search_runs(
        experiment_ids=[experiment.experiment_id],
        filter_string='',
        order_by=[f"metrics.{metric_name} {'asc' if metric_goal == 'minimize' else 'desc'}"]
    )

    if not runs.empty:
        best_run = runs.iloc[0]
        best_run_id = best_run.run_id
        best_artifact_path = json.loads(best_run["tags.mlflow.log-model.history"])[0]["artifact_path"]
        logged_model_uri = f"runs:/{best_run_id}/{best_artifact_path}"
        return best_run_id, logged_model_uri
    else:
        raise ValueError("No runs found for the specified experiment.")

In [37]:
def test_model(logged_model_uri, x_test, y_test):
    try:
        # Load model
        loaded_model = mlflow.pyfunc.load_model(logged_model_uri)
        
        # Predict on test data
        y_pred = loaded_model.predict(pd.DataFrame(x_test))
        print(f'x_test first row:{x_test.tail(1)}')
        print("******")
        print(y_pred)
        print("******")
        print(y_test)
        print("******")
        
        # Calculate RMSE on original scale
        rmse = np.sqrt(root_mean_squared_error(y_test, y_pred))
        print(f"RMSE for best model selected: {rmse:.2f}")
        
        return {"rmse": rmse}
    except Exception as e:
        print(f"Error loading model from {logged_model_uri}: {e}")




In [30]:
logged_model_uri

'runs:/ec53532a97e74b2eb5c4a1eeb6834be2/models/Random_Forest'

In [9]:
# Load data
x_train, x_test, y_train, y_test = load_data()

# Train and log models
train_and_log_models(x_train, y_train, x_test, y_test)

# Fetch the best model URI and test
experiment_name = "boston-housing-experiment"
metric_name = "rmse"
run_id, logged_model_uri = get_best_run_uri(experiment_name, metric_name)

print(f"Best Model Run ID: {run_id}")
print(f"Best Model URI: {logged_model_uri}")



Linear Regression logged with RMSE: 4.10
Model artifact path: runs:/e15c55606382490aa45da34c4a340f9d/models/Linear_Regression




Random Forest logged with RMSE: 3.00
Model artifact path: runs:/ec53532a97e74b2eb5c4a1eeb6834be2/models/Random_Forest




Ridge Regression logged with RMSE: 4.25
Model artifact path: runs:/e0f8657d73134008afa58e3c8cc1ca71/models/Ridge_Regression




XGBoost logged with RMSE: 3.54
Model artifact path: runs:/08f4a9caa97848c7b0c51fddaf3c958a/models/XGBoost
Best Model Run ID: ec53532a97e74b2eb5c4a1eeb6834be2
Best Model URI: runs:/ec53532a97e74b2eb5c4a1eeb6834be2/models/Random_Forest
Error loading model from runs:/ec53532a97e74b2eb5c4a1eeb6834be2/models/Random_Forest: name 'root_mean' is not defined
None


In [38]:
# file:///C:/Users/arunv/zoomcamp_mlops_project/mlruns/1/ec53532a97e74b2eb5c4a1eeb6834be2/artifacts/models/Random_Forest
# Test the best model
results = test_model(logged_model_uri, x_test, y_test)
print(results)


x_test first row:    LSTAT  INDUS    NOX  PTRATIO     RM    TAX     DIS   AGE
75   8.94  12.83  0.437     18.7  6.286  398.0  4.5026  45.0
******
[3.17314089 3.45087576 2.76276047 3.18285768 2.72964882 3.09453619
 2.97822443 2.77680446 3.07097259 3.08663772 3.01184286 2.96690352
 2.26903896 3.11557722 2.99929202 3.32994157 2.98072454 2.25853412
 3.86508423 2.74378273 3.23815603 3.23133083 2.73762392 3.16863007
 2.68233033 2.77344311 3.10625546 2.78095851 3.02356069 3.07795693
 3.00068813 3.19294452 3.51837611 3.01598395 2.66784189 2.79470007
 3.57011756 3.00797106 3.09308345 3.23245108 2.95264735 3.42401718
 3.86734815 2.97430415 3.16155045 2.72921829 2.79336388 3.23264316
 2.89529214 3.37191871 3.09570134 3.58748767 2.8446581  3.28735454
 3.81640617 3.12360743 2.78962417 3.50025094 3.16277102 3.00820203
 3.2668612  3.55551292 3.37580004 2.98958591 3.3075351  2.90779616
 2.67966415 3.18317764 3.37355151 2.84567092 3.05998591 3.32217376
 2.48530019 3.09525834 3.13935336 2.104712   3.046

In [20]:
def register_and_promote_model(run_id, model_name, stage="Production"):
    model_uri = f"runs:/{run_id}/models/{model_name}"
    registered_model_name = model_name.replace("_", "-").lower()

    # Register the model
    model_version = mlflow.register_model(model_uri, registered_model_name)

    # Transition model to specified stage
    client = mlflow.tracking.MlflowClient()
    client.transition_model_version_stage(
        name=registered_model_name,
        version=model_version.version,
        stage=stage
    )
    
    return registered_model_name, model_version.version

In [21]:
# Register and promote the best model
model_name = logged_model_uri.split('/')[-1]
registered_model_name, model_version = register_and_promote_model(run_id, model_name)
print(f"Registered Model Name: {registered_model_name}")
print(f"Model Version: {model_version}")


Registered Model Name: random-forest
Model Version: 5


Registered model 'random-forest' already exists. Creating a new version of this model...
Created version '5' of model 'random-forest'.
  client.transition_model_version_stage(


In [26]:
model_name

'Random_Forest'

In [25]:
# file:///C:/Users/arunv/zoomcamp_mlops_project/mlruns/1/ec53532a97e74b2eb5c4a1eeb6834be2/artifacts/models/Random_Forest/MLmodel
def download_models(run_id, model_names, output_dir="models"):
    os.makedirs(output_dir, exist_ok=True)
    client = mlflow.tracking.MlflowClient()
    
    for model_name in model_names:
        artifact_path = f"models/{model_name}/"
        local_path = os.path.join(output_dir, model_name)
        print(f'local path:{local_path}')
        os.makedirs(local_path, exist_ok=True)
        client.download_artifacts(run_id, artifact_path, local_path)
        print(f"Downloaded {model_name} to local path: /n{local_path}")


# # Download all models trained
download_models(run_id, model_name)


local path:models\R


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

MlflowException: The following failures occurred while downloading one or more artifacts from file:///C:/Users/arunv/zoomcamp_mlops_project/mlruns/1/ec53532a97e74b2eb5c4a1eeb6834be2/artifacts:
##### File models/R/ #####
[Errno 2] No such file or directory: 'C:\\Users\\arunv\\zoomcamp_mlops_project\\mlruns\\1\\ec53532a97e74b2eb5c4a1eeb6834be2\\artifacts\\models\\R'

In [29]:
run_id

'ec53532a97e74b2eb5c4a1eeb6834be2'

In [49]:
x_test.iloc[-1,:].to_dict()

{'LSTAT': 8.94,
 'INDUS': 12.83,
 'NOX': 0.437,
 'PTRATIO': 18.7,
 'RM': 6.286,
 'TAX': 398.0,
 'DIS': 4.5026,
 'AGE': 45.0}

173    23.6
Name: MEDV, dtype: float64

In [76]:
# client.transition_model_version_stage(
#     name = 'boston_housing_models',
#     version = 1,
#     stage='Production',
#     archive_existing_versions=True
# )
    

  client.transition_model_version_stage(


<ModelVersion: aliases=[], creation_timestamp=1720605968257, current_stage='Production', description='', last_updated_timestamp=1720610422305, name='boston_housing_xgboost', run_id='3732dd5781ce4d129b4a4436be05de19', run_link='', source='file:///C:/Projects/zoomcamp_mlops_project/mlruns/1/3732dd5781ce4d129b4a4436be05de19/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=2>

In [5]:
# import mlflow
# from mlflow.tracking import MlflowClient
# MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
# # Initialize the MLflow client
# client = MlflowClient(tracking_uri = MLFLOW_TRACKING_URI)

# # List all registered models using search_registered_models
# models = client.search_registered_models()

# # Print the registered models and their versions
# for model in models:
#     print(f"Model Name: {model.name}")
#     for version in model.latest_versions:
#         print(f"  Version: {version.version}, Stage: {version.current_stage}")


Model Name: boston_housing_models
  Version: 4, Stage: None
  Version: 3, Stage: Production


In [6]:
# # List all experiments
# experiments = mlflow.search_experiments()

# # Print out the experiments
# for experiment in experiments:
#     print(f"Experiment ID: {experiment.experiment_id}, Name: {experiment.name}")

Experiment ID: 1, Name: boston-housing-experiment
Experiment ID: 0, Name: Default


In [50]:
from mlflow.entities import ViewType
from mlflow.tracking import MlflowClient

# Initialize the MLflow client
client = MlflowClient()

# Define the experiment ID
experiment_id = '1'

# Search runs
runs = client.search_runs(
    experiment_ids=[experiment_id],  # Note: experiment_ids should be a list
    filter_string="",  # No filter, get all runs
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.rmse ASC"]  # Note: order_by should be a list
)

# Print the runs with their RMSE values
for run in runs:
    print(f"run id: {run.info.run_id}, rmse: {run.data.metrics['rmse']:.4f}")


MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"

client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)


run id: ec53532a97e74b2eb5c4a1eeb6834be2, rmse: 3.0005
run id: 08f4a9caa97848c7b0c51fddaf3c958a, rmse: 3.5355
run id: e15c55606382490aa45da34c4a340f9d, rmse: 4.0983
run id: e0f8657d73134008afa58e3c8cc1ca71, rmse: 4.2489


In [65]:
from mlflow.tracking import MlflowClient
import os
import shutil

# Define the MLflow tracking URI
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

# Replace with your run ID and model name
run_id = 'ec53532a97e74b2eb5c4a1eeb6834be2'
model_name = 'Random_Forest'

# Define the artifact path and local path
artifact_path = f"models/{model_name}"  # Ensure this matches your actual artifact structure
temp_download_path = 'temp_model_download'  # Temporary directory for downloading artifacts
final_model_path = 'model.pkl'  # Final destination for the .pkl file

# Ensure the temporary download path exists
os.makedirs(temp_download_path, exist_ok=True)
print(f"Temporary download path created: {temp_download_path}")

# Print artifact path
print(f"Artifact path: {artifact_path}")

# Download the model artifacts to the temporary directory
try:
    client.download_artifacts(run_id, artifact_path, temp_download_path)
    print(f"Model artifacts downloaded to {temp_download_path}")
except Exception as e:
    print(f"Failed to download model artifacts: {e}")
    exit(1)

# Find and move the .pkl file to the final destination
pkl_file_path = None
for root, dirs, files in os.walk(temp_download_path):
    for file in files:
        if file.endswith('.pkl'):
            pkl_file_path = os.path.join(root, file)
            break
    if pkl_file_path:
        break

if pkl_file_path:
    shutil.move(pkl_file_path, final_model_path)
    print(f"Model .pkl file moved to {final_model_path}")
else:
    print("No .pkl file found in the downloaded artifacts")

# Clean up temporary directory
shutil.rmtree(temp_download_path)
print("Temporary download path cleaned up")



Temporary download path created: temp_model_download
Artifact path: models/Random_Forest


Downloading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

Model artifacts downloaded to temp_model_download
Model .pkl file moved to model.pkl
Temporary download path cleaned up
