In [297]:
import mlflow
from mlflow.models import infer_signature
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import pandas as pd

# dabl is needed further down for the SimpleRegressor baseline.
import dabl

# Select the MLflow experiment that the runs started in this notebook belong to.
mlflow.set_experiment("Housing Prices Prediction")

# Read the raw housing table, then separate the regression target ('price')
# from the remaining predictor columns. data_raw itself is left unmodified.
data_raw = pd.read_csv('data/Housing.csv')
data = data_raw.drop(columns='price')
target = data_raw['price']

# Preview the first rows as the cell's displayed output.
data_raw.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


Data Processing 1 - one-hot encoding (OHE), keep all features

In [298]:
# Optional alternative (disabled): let dabl clean the frame before encoding.
#cleaned_data = dabl.clean(data)
#cleaned_data_encoded = pd.get_dummies(cleaned_data)

# One-hot encode the categorical features; every category value becomes its
# own indicator column (e.g. mainroad -> mainroad_no / mainroad_yes).
cleaned_data_encoded = pd.get_dummies(data)

# A bare expression is only rendered when it is the last statement of a cell,
# so the preview has to go through display() to actually show up here.
display(cleaned_data_encoded.head())

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    cleaned_data_encoded,
    target,
    test_size=0.2,
    random_state=42,
)

Data Processing 2 - check for multicollinearity, remove features with high VIF values

In [224]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split

# Calculate the variance inflation factor (VIF) for each feature.
#
# The design matrix is built once, outside the loop, and cast to float:
# DataFrame.values on a frame that mixes integer columns with boolean dummy
# columns (the get_dummies default on pandas >= 2.0) is an object array, which
# the regression inside variance_inflation_factor may fail on.
VIF_THRESHOLD = 10  # features with a VIF above this value are discarded
vif_design = cleaned_data_encoded.astype(float).to_numpy()

vif_data = pd.DataFrame({
    "feature": cleaned_data_encoded.columns,
    "VIF": [
        variance_inflation_factor(vif_design, col_idx)
        for col_idx in range(vif_design.shape[1])
    ],
})

# Display VIF
print(vif_data)


# Remove features whose VIF exceeds the threshold.
# NOTE: indicator columns that together cover every category of a variable
# (e.g. mainroad_no / mainroad_yes) are perfectly collinear, so they get an
# infinite VIF and are all dropped by this rule.
high_vif_features = vif_data[vif_data["VIF"] > VIF_THRESHOLD]
cleaned_data_encoded_selected = cleaned_data_encoded.drop(columns=high_vif_features["feature"])
print("Selected Features: ", cleaned_data_encoded_selected.columns)


                            feature       VIF
0                              area  1.325250
1                          bedrooms  1.369477
2                         bathrooms  1.286621
3                           stories  1.478055
4                           parking  1.212837
5                       mainroad_no       inf
6                      mainroad_yes       inf
7                      guestroom_no       inf
8                     guestroom_yes       inf
9                       basement_no       inf
10                     basement_yes       inf
11               hotwaterheating_no       inf
12              hotwaterheating_yes       inf
13               airconditioning_no       inf
14              airconditioning_yes       inf
15                      prefarea_no       inf
16                     prefarea_yes       inf
17       furnishingstatus_furnished       inf
18  furnishingstatus_semi-furnished       inf
19     furnishingstatus_unfurnished       inf
Selected Features:  Index(['area',

  vif = 1. / (1. - r_squared_i)


In [225]:
# Optional: switch the experiments to the VIF-filtered feature set
# ("Data Processing 2"). Uncomment the code lines below to overwrite the
# X_train / X_test / y_train / y_test produced by the earlier split.

# Adjust the size of the target based on the length of cleaned_data_encoded_selected.
# NOTE(review): cleaned_data_encoded_selected is built by dropping columns only,
# so it appears to keep the same number of rows as target and this slice looks
# like a no-op -- confirm before relying on it.
#target2 = target[:len(cleaned_data_encoded_selected)]

# train test split
#X_train, X_test, y_train, y_test = train_test_split(cleaned_data_encoded_selected, target2, test_size=0.2, random_state=42)

In [277]:
# Baseline: ordinary linear regression
from sklearn.linear_model import LinearRegression

# Build the estimator and fit it on the training split
# (fit() returns the estimator itself, so the calls can be chained).
lr_model = LinearRegression().fit(X_train, y_train)

# Score on the held-out split: RMSE is the square root of the mean squared error.
lr_pred = lr_model.predict(X_test)
lr_mse = metrics.mean_squared_error(y_test, lr_pred)
rmse = np.sqrt(lr_mse)  # read again by the MLflow logging cell for this model

print("The Root Mean Squared Error for the linear regression model:", rmse)



The Root Mean Squared Error for the linear regression model: 1324506.960091438


In [229]:

# Record the linear-regression experiment as one MLflow run
with mlflow.start_run():
    # Tag describing which training data was used.
    # NOTE(review): the value is hard-coded -- confirm it matches the split
    # that actually produced X_train.
    mlflow.set_tag("Training Info", "Dataset 2")

    # Hyperparameters of the estimator and the RMSE computed in the training cell
    mlflow.log_params(lr_model.get_params())
    mlflow.log_metric("rmse", rmse)

    # Derive the model signature from the training features and the
    # predictions the model makes on them
    train_predictions = lr_model.predict(X_train)
    signature = infer_signature(X_train, train_predictions)

    # Log the fitted model under the run and register it by name
    model_info = mlflow.sklearn.log_model(
        sk_model=lr_model,
        artifact_path="Linear_Regression_Model",
        registered_model_name="Linear Regression Model",
        signature=signature,
        input_example=X_train.head(),
    )

Registered model 'Linear Regression Model' already exists. Creating a new version of this model...
Created version '4' of model 'Linear Regression Model'.


Simple Dabl Model

In [230]:
# Fit dabl's SimpleRegressor on the training split
simple_regressor = dabl.SimpleRegressor().fit(X_train, y_train)

# Predict the target variable for the held-out split
pred = simple_regressor.predict(X_test)

# Compute the Root Mean Squared Error (also read by the MLflow logging cell for this model)
rmse = np.sqrt(metrics.mean_squared_error(y_test, pred))

# The message previously described this score as belonging to a tuned random
# forest; it is the dabl SimpleRegressor that is being evaluated here.
print("The Root Mean Squared Error for the dabl simple regressor:", rmse)

Running DummyRegressor()
r2: -0.018 neg_mean_squared_error: -3105569656113.664
=== new best DummyRegressor() (using r2):
r2: -0.018 neg_mean_squared_error: -3105569656113.664

Running DecisionTreeRegressor(max_depth=1)
r2: 0.311 neg_mean_squared_error: -2085951184459.935
=== new best DecisionTreeRegressor(max_depth=1) (using r2):
r2: 0.311 neg_mean_squared_error: -2085951184459.935

Running DecisionTreeRegressor(max_leaf_nodes=8)
r2: 0.540 neg_mean_squared_error: -1387749905514.373
=== new best DecisionTreeRegressor(max_leaf_nodes=8) (using r2):
r2: 0.540 neg_mean_squared_error: -1387749905514.373

Running DecisionTreeRegressor(max_leaf_nodes=16)
r2: 0.531 neg_mean_squared_error: -1415293931339.369
Running DecisionTreeRegressor(max_leaf_nodes=32)
r2: 0.458 neg_mean_squared_error: -1606663096588.816
Running DecisionTreeRegressor(max_depth=5)
r2: 0.524 neg_mean_squared_error: -1439531650928.209
Running Ridge(alpha=10)
r2: 0.518 neg_mean_squared_error: -1476361045154.512
Running Lasso(alp

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [231]:
# Record the dabl SimpleRegressor experiment as one MLflow run
# (no hyperparameters are logged for this model, only the metric and the model).
with mlflow.start_run():

    # Tag describing which training data was used
    mlflow.set_tag("Training Info", "Dataset 2")

    # RMSE computed in the training cell
    mlflow.log_metric("rmse", rmse)

    # Derive the model signature from the training features and the
    # predictions the regressor makes on them
    train_predictions = simple_regressor.predict(X_train)
    signature = infer_signature(X_train, train_predictions)

    # Log the fitted regressor under the run and register it by name
    model_info = mlflow.sklearn.log_model(
        sk_model=simple_regressor,
        artifact_path="Dabl_simple_regressor",
        registered_model_name="Dabl simple regressor",
        signature=signature,
        input_example=X_train.head(),
    )

Registered model 'Dabl simple regressor' already exists. Creating a new version of this model...
Created version '9' of model 'Dabl simple regressor'.


Random Forest Regressor

In [232]:
# Random forest with default hyperparameters; only the seed is fixed so the
# fitted forest is reproducible. fit() returns the estimator itself.
rf_model = RandomForestRegressor(random_state=0).fit(X_train, y_train)

# Predictions for the held-out split
rf_pred = rf_model.predict(X_test)

# RMSE is the square root of the mean squared error
# (rmse is read again by the MLflow logging cell for this model).
rf_mse = metrics.mean_squared_error(y_test, rf_pred)
rmse = np.sqrt(rf_mse)

print("The Root Mean Squared Error for the random forest regression model:", rmse)

The Root Mean Squared Error for the random forest regression model: 1625515.8711564096


In [233]:
# Record the random-forest experiment as one MLflow run
with mlflow.start_run():
    # Tag describing which training data was used
    mlflow.set_tag("Training Info", "Dataset 2")

    # Hyperparameters of the estimator and the RMSE computed in the training cell
    mlflow.log_params(rf_model.get_params())
    mlflow.log_metric("rmse", rmse)

    # Derive the model signature from the training features and the
    # predictions the forest makes on them
    signature = infer_signature(X_train, rf_model.predict(X_train))

    # Feature importances, most important first
    feature_importance = pd.DataFrame(
        {'feature': X_train.columns, 'importance': rf_model.feature_importances_}
    ).sort_values(by='importance', ascending=False)

    # Draw the horizontal bar chart on an explicit figure/axes pair,
    # write it to disk, then release the figure
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x='importance', y='feature', data=feature_importance, ax=ax)
    ax.set_title('Feature Importance')
    ax.set_xlabel('Importance')
    ax.set_ylabel('Feature')
    fig.tight_layout()
    fig.savefig("feature_importance.png")
    plt.close(fig)

    # Attach the saved plot to the run as an artifact
    mlflow.log_artifact("feature_importance.png")

    # Log the fitted model under the run and register it by name
    model_info = mlflow.sklearn.log_model(
        sk_model=rf_model,
        artifact_path="Random_Forest_Model",
        registered_model_name="Random Forest Model",
        signature=signature,
        input_example=X_train.head(),
    )

Registered model 'Random Forest Model' already exists. Creating a new version of this model...
Created version '5' of model 'Random Forest Model'.


LightGBM (Light Gradient Boosting Machine) Model

In [234]:
import lightgbm as lgb

# LightGBM regressor with default hyperparameters; only the seed is fixed.
lgb_model = lgb.LGBMRegressor(random_state=0)
lgb_model.fit(X_train, y_train)

# Predictions for the held-out split
lgb_pred = lgb_model.predict(X_test)

# RMSE is the square root of the mean squared error
# (rmse is read again by the MLflow logging cell for this model).
lgb_mse = metrics.mean_squared_error(y_test, lgb_pred)
rmse = np.sqrt(lgb_mse)

print("The Root Mean Squared Error for the LGB model:", rmse)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001107 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 123
[LightGBM] [Info] Number of data points in the train set: 436, number of used features: 5
[LightGBM] [Info] Start training from score 4706527.385321
The Root Mean Squared Error for the LGB model: 1509663.1888506506


In [235]:
# Record the LightGBM experiment as one MLflow run
with mlflow.start_run():
    # Tag describing which training data was used
    mlflow.set_tag("Training Info", "Dataset 2")

    # Hyperparameters of the estimator and the RMSE computed in the training cell
    mlflow.log_params(lgb_model.get_params())
    mlflow.log_metric("rmse", rmse)

    # Derive the model signature from the training features and the
    # predictions the model makes on them
    signature = infer_signature(X_train, lgb_model.predict(X_train))

    # Feature importances as reported by the fitted model, largest first
    importance_table = pd.DataFrame(
        {'feature': X_train.columns, 'importance': lgb_model.feature_importances_}
    )
    feature_importance = importance_table.sort_values(by='importance', ascending=False)

    # Draw the horizontal bar chart on an explicit figure/axes pair,
    # write it to disk, then release the figure
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x='importance', y='feature', data=feature_importance, ax=ax)
    ax.set_title('Feature Importance')
    ax.set_xlabel('Importance')
    ax.set_ylabel('Feature')
    fig.tight_layout()
    fig.savefig("feature_importance.png")
    plt.close(fig)

    # Attach the saved plot to the run as an artifact
    mlflow.log_artifact("feature_importance.png")

    # Log the fitted model under the run and register it by name
    model_info = mlflow.sklearn.log_model(
        sk_model=lgb_model,
        artifact_path="LGB_Model",
        registered_model_name="LGB_Model",
        signature=signature,
        input_example=X_train.head(),
    )

Registered model 'LGB_Model' already exists. Creating a new version of this model...
Created version '6' of model 'LGB_Model'.


Ridge Regression Model

In [236]:
from sklearn.linear_model import Ridge

# Ridge regression with default hyperparameters (only random_state is set),
# fitted on the training split. fit() returns the estimator itself.
ridge = Ridge(random_state=0).fit(X_train, y_train)

# Predictions for the held-out split
ridge_pred = ridge.predict(X_test)

# RMSE is the square root of the mean squared error
# (rmse is read again by the MLflow logging cell for this model).
ridge_mse = metrics.mean_squared_error(y_test, ridge_pred)
rmse = np.sqrt(ridge_mse)

print("The Root Mean Squared Error for the Ridge Regression model:", rmse)

The Root Mean Squared Error for the Ridge Regression model: 1514047.544478292


In [237]:
# Record the ridge-regression experiment as one MLflow run
with mlflow.start_run():
    # Tag describing which training data was used
    mlflow.set_tag("Training Info", "Dataset 2")

    # Hyperparameters of the estimator and the RMSE computed in the training cell
    mlflow.log_params(ridge.get_params())
    mlflow.log_metric("rmse", rmse)

    # Derive the model signature from the training features and the
    # predictions the model makes on them
    train_predictions = ridge.predict(X_train)
    signature = infer_signature(X_train, train_predictions)

    # Log the fitted model under the run and register it by name
    model_info = mlflow.sklearn.log_model(
        sk_model=ridge,
        artifact_path="Ridge_Regression_Model",
        registered_model_name="Ridge_Regression_Model",
        signature=signature,
        input_example=X_train.head(),
    )

Registered model 'Ridge_Regression_Model' already exists. Creating a new version of this model...
Created version '5' of model 'Ridge_Regression_Model'.


In [305]:
# Launch the MLflow tracking UI from the notebook via a shell command.
# The server runs in the foreground, so this cell keeps executing (and blocks
# the kernel) until it is interrupted.
!mlflow ui

[2024-02-08 12:10:53 -0500] [26124] [INFO] Starting gunicorn 21.2.0
[2024-02-08 12:10:53 -0500] [26124] [INFO] Listening at: http://127.0.0.1:5000 (26124)
[2024-02-08 12:10:53 -0500] [26124] [INFO] Using worker: sync
[2024-02-08 12:10:53 -0500] [26125] [INFO] Booting worker with pid: 26125
[2024-02-08 12:10:53 -0500] [26126] [INFO] Booting worker with pid: 26126
[2024-02-08 12:10:53 -0500] [26127] [INFO] Booting worker with pid: 26127
[2024-02-08 12:10:53 -0500] [26128] [INFO] Booting worker with pid: 26128
^C
[2024-02-08 12:11:45 -0500] [26124] [INFO] Handling signal: int
[2024-02-08 12:11:45 -0500] [26125] [INFO] Worker exiting (pid: 26125)
[2024-02-08 12:11:45 -0500] [26126] [INFO] Worker exiting (pid: 26126)
[2024-02-08 12:11:45 -0500] [26127] [INFO] Worker exiting (pid: 26127)
[2024-02-08 12:11:45 -0500] [26128] [INFO] Worker exiting (pid: 26128)
