In [1]:
import pandas as pd
import mlflow
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from commons.data_loading import (load_raw_data,
                                  transformed_employee_performance,
                                  feature_engineered_employee_performance)
from commons.commons import get_features

In [2]:
def save_scaler(scaler, name="standard_scaler",
                filename='scaler'):
    """Save and log the StandardScaler to MLflow."""
    mlflow.set_tracking_uri("http://localhost:5000")
    mlflow.set_experiment("hw3-first_experiment")

    with mlflow.start_run(run_name=name):
        joblib.dump(scaler, f"{filename}.pkl")
        mlflow.log_artifact(f"{filename}.pkl")

        mlflow.sklearn.log_model(scaler, name)

In [3]:
def get_data():
    data_df = load_raw_data()[:2000]
    data_df = transformed_employee_performance(data_df=data_df)
    new_df, X_scaler, y_scaler = feature_engineered_employee_performance(data_df=data_df,
                                                    return_scaler=True)
    save_scaler(X_scaler, 'x_standard_scaler',
                'x_scaler')
    save_scaler(y_scaler, 'y_standard_scaler',
                'y_scaler')
    return new_df, X_scaler, y_scaler

In [4]:
new_df, X_scaler, y_scaler = get_data()
new_df.head()

Found




🏃 View run x_standard_scaler at: http://localhost:5000/#/experiments/1/runs/db2b9390cb4e44f1a41b27d711e0d1fb
🧪 View experiment at: http://localhost:5000/#/experiments/1




🏃 View run y_standard_scaler at: http://localhost:5000/#/experiments/1/runs/adcbf0af14a840e2aa09045d9787f9f9
🧪 View experiment at: http://localhost:5000/#/experiments/1


Unnamed: 0,Employee_ID,Age,Years_At_Company,Performance_Score,Monthly_Salary,Work_Hours_Per_Week,Projects_Handled,Overtime_Hours,Sick_Days,Remote_Work_Frequency,...,Gender_Other,Job_Title_Consultant,Job_Title_Developer,Job_Title_Engineer,Job_Title_Manager,Job_Title_Specialist,Job_Title_Technician,Education_Level_High School,Education_Level_Master,Education_Level_PhD
0,1,1.304487,-0.905767,1.459238,6750.0,-1.338554,0.579189,0.857861,-1.129268,-1.420325,...,False,False,False,False,False,True,False,True,False,False
1,2,-1.07296,-1.611744,1.459238,7500.0,-1.225629,0.719795,-0.179665,1.637134,1.394988,...,False,False,True,False,False,False,False,True,False,False
2,3,1.304487,1.212161,0.026506,5850.0,-0.886855,0.227676,-0.98663,-0.898735,-0.012669,...,False,False,False,False,False,True,False,True,False,False
3,4,0.664405,0.859173,-0.689861,4800.0,0.807017,-0.967472,1.549546,1.176067,1.394988,...,False,False,False,False,False,False,False,False,False,False
4,5,-0.432878,-0.552779,-0.689861,4800.0,-0.77393,-0.897169,1.664826,1.4066,1.394988,...,False,False,False,False,False,False,False,False,False,False


In [5]:
numeric_columns = get_features()
newer_df = y_scaler.inverse_transform(
    new_df['Employee_Satisfaction_Score'].values.reshape(-1, 1)
)
pd.DataFrame(newer_df, columns=['Employee_Satisfaction_Score']).head()

Unnamed: 0,Employee_Satisfaction_Score
0,2.63
1,1.72
2,3.17
3,1.86
4,1.25


In [None]:
def trained_regression_models(new_df: pd.DataFrame):
    """
    Train multiple regression models with hyperparameter tuning and log them to MLflow.

    Parameters
    ----------
    new_df : pd.DataFrame
        Feature-engineered dataset.

    Returns
    -------
    dict
        Dictionary containing trained models and test data.
    """
    mlflow.set_tracking_uri('http://localhost:5000')
    mlflow.set_experiment("hw3-first_experiment")

    X = new_df.drop(columns=["Employee_Satisfaction_Score"])
    y = new_df["Employee_Satisfaction_Score"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    models = {
        "linear_regression": (LinearRegression(), {}),
        "rf_regressor": (RandomForestRegressor(random_state=42), {
            "n_estimators": [50, 100, 200],
            "max_depth": [10, 20, None]
        }),
        "gboost_regressor": (GradientBoostingRegressor(random_state=42), {
            "n_estimators": [50, 100, 200],
            "learning_rate": [0.01, 0.1, 0.2]
        })
    }

    trained_models = {}
    for name, (model, param_grid) in models.items():
        with mlflow.start_run(run_name=name):
            if param_grid:
                search = GridSearchCV(model, param_grid,
                                      scoring="neg_mean_squared_error",
                                      cv=3)
                search.fit(X_train, y_train)
                best_model = search.best_estimator_
                mlflow.log_params(search.best_params_)
            else:
                best_model = model.fit(X_train, y_train)


            y_pred = best_model.predict(X_test)
            mse = mean_squared_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            mlflow.log_metric("mse", mse)
            mlflow.log_metric("r2", r2)
            mlflow.sklearn.log_model(best_model, name,
                                     registered_model_name=name)

            
            trained_models[name] = best_model
            print(f"✅ Trained and logged {name} model. MSE: {mse:.4f}, R²: {r2:.4f}")

    return {
        "models": trained_models,
        "X_test": X_test,
        "y_test": y_test
    }

In [None]:
def main(new_df):
    trained_models = trained_regression_models(new_df=new_df)
    return trained_models

In [None]:
main(new_df=new_df)

Found


Registered model 'linear_regression' already exists. Creating a new version of this model...
2025/02/21 16:35:37 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: linear_regression, version 2
Created version '2' of model 'linear_regression'.


✅ Trained and logged linear_regression model. MSE: 1.0190, R²: -0.0337
🏃 View run linear_regression at: http://localhost:5000/#/experiments/1/runs/fad4eeea96c2482aa2d89f1e4299776d
🧪 View experiment at: http://localhost:5000/#/experiments/1


Registered model 'rf_regressor' already exists. Creating a new version of this model...
2025/02/21 16:36:09 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: rf_regressor, version 2
Created version '2' of model 'rf_regressor'.


✅ Trained and logged rf_regressor model. MSE: 1.0282, R²: -0.0430
🏃 View run rf_regressor at: http://localhost:5000/#/experiments/1/runs/ab6f3632f35f41c9b9a52bd41c8be8b4
🧪 View experiment at: http://localhost:5000/#/experiments/1


Registered model 'gboost_regressor' already exists. Creating a new version of this model...
2025/02/21 16:36:22 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: gboost_regressor, version 2


✅ Trained and logged gboost_regressor model. MSE: 1.0005, R²: -0.0149
🏃 View run gboost_regressor at: http://localhost:5000/#/experiments/1/runs/e1a301e9f09542a8adb07d0c81cb69d5
🧪 View experiment at: http://localhost:5000/#/experiments/1


Created version '2' of model 'gboost_regressor'.


{'models': {'linear_regression': LinearRegression(),
  'rf_regressor': RandomForestRegressor(max_depth=10, n_estimators=200, random_state=42),
  'gboost_regressor': GradientBoostingRegressor(learning_rate=0.01, n_estimators=50, random_state=42)},
 'X_test':       Employee_ID       Age  Years_At_Company  Performance_Score  \
 1860         1861 -0.981520         -0.199791           0.026506   
 353           354  0.298644         -1.258756          -0.689861   
 1333         1334  0.298644         -0.552779           1.459238   
 905           906 -0.067117         -0.552779          -1.406227   
 1289         1290  0.024323         -0.552779           0.742872   
 ...           ...       ...               ...                ...   
 965           966  1.670248         -1.258756           1.459238   
 1284         1285  1.761689          0.153197          -1.406227   
 1739         1740 -0.981520         -0.905767           0.026506   
 261           262 -0.798639         -0.199791       