In [2]:

# tracking with mlflow
import os
from mlflow.models import infer_signature
import pandas as pd
#from urlib.parse import urlparse
import mlflow
#from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from dotenv import load_dotenv
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.compose import make_column_selector, make_column_transformer
import numpy as np
import mlflow.sklearn
import joblib
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [None]:
# Preprocessing pipeline
#
# Every feature selected below is numeric, so a single numeric pipeline is
# applied: median imputation -> natural log (to reduce skew) -> standardisation.

# Individual steps, named so the pipeline reads top-down
median_imputer = SimpleImputer(strategy="median")
# NOTE(review): np.log gives -inf for 0 and NaN for negative values; confirm
# that every numeric feature is strictly positive before relying on this step.
log_transform = FunctionTransformer(np.log, feature_names_out="one-to-one")
standard_scaler = StandardScaler()

# pipeline for the log transformation to handle skewed features
log_pipeline = make_pipeline(median_imputer, log_transform, standard_scaler)

# Route every numeric-dtype column through the log pipeline. No `remainder`
# is passed, so make_column_transformer keeps its default for other columns.
numeric_columns = make_column_selector(dtype_include=np.number)
preprocessing = make_column_transformer((log_pipeline, numeric_columns))

In [None]:
# Load the training split; the trailing tuple is displayed as the cell output
data = pd.read_csv(filepath_or_buffer="../artifacts/ml_training_data.csv")
(data.columns, data.shape)

In [None]:
# Load the held-out test split; the trailing tuple is displayed as the cell output
test_data = pd.read_csv(filepath_or_buffer="../artifacts/ml_test_data.csv")
(test_data.columns, test_data.shape)

In [None]:
# Split both frames into a feature matrix and the `calories` target.
#
# X_train / y_train are defined here because every later cell (preprocessing
# check, cross-validation, baseline, hyperparameter search, MLflow logging)
# reads them; without these assignments the notebook fails with a NameError
# on a fresh kernel.
# NOTE(review): assumes the training CSV carries the same `calories` target
# column as the test CSV - confirm against the artifact.
X_train = data.drop(columns=["calories"])
y_train = data.calories

X_test = test_data.drop(columns=["calories"])
y_test = test_data.calories

In [None]:
# Sanity-check the preprocessing step on its own: fit it on the training
# features, transform them, then display the generated output column names.
processor = preprocessing.fit(X=X_train)
X_train_processed = processor.transform(X=X_train)
processor.get_feature_names_out()


In [None]:
# Model pipeline: preprocessing followed by a random forest regressor with
# default hyperparameters (only the seed is fixed for reproducibility).
# Earlier training identified the random forest as the best model.
forest = RandomForestRegressor(random_state=42)
rf = make_pipeline(preprocessing, forest)

In [None]:
# 5-fold cross-validation of the default random forest pipeline
from sklearn.model_selection import cross_val_score

# scikit-learn reports this scorer negated (higher is better), so the sign is
# flipped afterwards to obtain positive error values.
neg_mae_scores = cross_val_score(
    rf, X_train, y_train, scoring="neg_mean_absolute_error", cv=5
)
# NOTE(review): despite the name, these are mean absolute errors (see the
# `scoring` argument above), not RMSEs. The name is kept because a later cell
# reads `rf_rmses`.
rf_rmses = -neg_mae_scores

In [None]:
# Summary statistics (count, mean, std, quartiles) of the per-fold
# cross-validation scores
pd.Series(data=rf_rmses).describe()

In [None]:
# Baseline: a plain linear regression behind the same preprocessing step,
# used as a reference point for the random forest.
from sklearn.linear_model import LinearRegression

linear_model = LinearRegression()
lgr = make_pipeline(preprocessing, linear_model)
lgr.fit(X_train, y_train)

In [None]:
# Baseline model: predict on the training features and display the
# training-set mean squared error
x_train_preds = lgr.predict(X_train)
mean_squared_error(y_true=y_train, y_pred=x_train_preds)

In [None]:
# Training-set mean absolute error of the linear baseline
mean_absolute_error(y_true=y_train, y_pred=x_train_preds)

In [None]:
# Training-set R² of the linear baseline
r2_score(y_true=y_train, y_pred=x_train_preds)

In [None]:
# Hyperparameter grid for an exhaustive search over the random forest settings

# Candidate values keyed by the bare RandomForestRegressor parameter name
rf_grid_values = {
    "n_estimators": [100, 200, 500],       # Number of trees in the forest
    "max_features": ["sqrt", "log2", 0.5], # Number of features to consider at each split
    "max_depth": [10, 20, 30, None],       # Maximum depth of the tree
    "min_samples_split": [2, 5, 10],       # Minimum number of samples required to split
    "min_samples_leaf": [1, 2, 4],         # Minimum number of samples required at a leaf node
    "bootstrap": [True, False],            # Whether bootstrap samples are used
}

# Pipeline parameters are addressed as "<step name>__<parameter>"; the step
# name is the one make_pipeline generated for the regressor.
param_grid = {
    f"randomforestregressor__{name}": values
    for name, values in rf_grid_values.items()
}

grid_search = GridSearchCV(
    rf,
    param_grid,
    scoring="neg_mean_absolute_error",
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# The grid search is only constructed, never fitted: fitting took too long,
# so the notebook uses a randomized search to save computation time.
#grid_search.fit(X_train, y_train)


In [None]:
from scipy.stats import randint

# Randomized hyperparameter search: samples a fixed number of candidates
# instead of evaluating every combination.

# Sampling distributions keyed by the bare RandomForestRegressor parameter
# name; scipy's randint(low, high) draws integers from [low, high).
rf_search_space = {
    "n_estimators": randint(100, 600),
    "max_features": ["sqrt", "log2", 0.5],
    "max_depth": [10, 20, 30, None],
    "min_samples_split": randint(2, 11),
    "min_samples_leaf": randint(1, 5),
    "bootstrap": [True, False],
}

# Prefix each key with the pipeline step name generated by make_pipeline
param_distributions = {
    f"randomforestregressor__{name}": space
    for name, space in rf_search_space.items()
}

random_search = RandomizedSearchCV(
    rf,
    param_distributions,
    n_iter=25,  # try 25 random combinations
    cv=5,
    scoring="r2",
    n_jobs=-1,
    random_state=42,
    verbose=2,
)

random_search.fit(X_train, y_train)

# Report the winning configuration and its mean cross-validated R²
print("Best parameters:", random_search.best_params_)
print("Best R² score:", random_search.best_score_)


In [None]:
# Evaluate the tuned pipeline on the training set
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_train)

# R², MAE and RMSE of the training predictions
r2 = r2_score(y_train, y_pred)
mae = mean_absolute_error(y_train, y_pred)
rmse = np.sqrt(mean_squared_error(y_train, y_pred))

print(
    f"R² Score: {r2:.4f}",
    f"MAE: {mae:.4f}",
    f"RMSE: {rmse:.4f}",
    sep="\n",
)

In [None]:
# Evaluate the tuned pipeline on the held-out test set
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)

# R², MAE and RMSE of the test predictions
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

for report_line in (
    f"R² Score: {r2:.4f}",
    f"MAE: {mae:.4f}",
    f"RMSE: {rmse:.4f}",
):
    print(report_line)


Comparing the baseline linear regression model with the tuned random forest regressor, we see a significant improvement in all metrics: the R² score has increased, while both MAE and RMSE have decreased, indicating that hyperparameter tuning has effectively improved the model's predictive capability.

* The model also does not appear to overfit the training data, as it performs reasonably well on the held-out test set.


In [None]:
# Persist the tuned pipeline locally and record the run in MLflow.
# Optional: this cell can be skipped when only training/evaluating. It reads
# the MLFLOW_TRACKING_URI environment variable to locate the tracking server.

# Load variables from a local .env file (if any) into the process environment
# so the os.getenv lookup below can see them.
load_dotenv()

# The tracking URI must be configured BEFORE the experiment is selected:
# set_experiment resolves the experiment against whichever tracking store is
# active at the time of the call.
tracking_uri = os.getenv("MLFLOW_TRACKING_URI")
if tracking_uri:
    mlflow.set_tracking_uri(tracking_uri)
# When the variable is unset, MLflow keeps its currently configured store.
mlflow.set_experiment("RandomForest_Regression_Pipeline")

# === The trained model ===
best_model = random_search.best_estimator_  # or grid_search.best_estimator_
best_params = random_search.best_params_     # or grid_search.best_params_
# Mean cross-validated score of the best candidate (the search scores with R²)
best_score = random_search.best_score_

# Recompute the test predictions here instead of reusing whatever `y_pred`
# an earlier cell left behind, so the logged metrics are always test metrics.
y_test_pred = best_model.predict(X_test)
mae = mean_absolute_error(y_test, y_test_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

# === Save locally ===
local_model_path = "../models/best_random_forest_pipeline.joblib"
# Create the directory the model is actually written to (the parent of
# local_model_path), not a `models` folder relative to the working directory.
os.makedirs(os.path.dirname(local_model_path), exist_ok=True)
joblib.dump(best_model, local_model_path)
print(f" Model saved locally at: {local_model_path}")

# === Log to MLflow on DagsHub ===
# Schema of the model's inputs and outputs, inferred from the training
# features and the model's own predictions on them.
signature = infer_signature(X_train, best_model.predict(X_train))
# A single-row frame stored with the model as a usage example
input_example = X_train.iloc[[0]]

with mlflow.start_run():
    # Log best hyperparameters
    mlflow.log_params(best_params)

    # Log performance metrics: `r2_score` is the cross-validated R² from the
    # search; `mae` and `rmse` are computed on the test set above.
    mlflow.log_metric("r2_score", best_score)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("rmse", rmse)

    # Log and register the fitted pipeline together with its signature
    mlflow.sklearn.log_model(
        sk_model=best_model,
        artifact_path="model",
        registered_model_name="RandomForest_Regression_Pipeline_v1",
        input_example=input_example,
        signature=signature,
    )

print(" Model and params logged successfully to DagsHub MLflow!")
