In [None]:

import optuna
import mlflow
import mlflow.sklearn
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

from commons.data_loading import (
    load_raw_data,
    transformed_employee_performance,
    feature_engineered_employee_performance
)
from commons.commons import get_features

# MLflow configuration: runs from this notebook are sent to the tracking
# server on localhost:5000 and grouped under a single experiment name.
TRACKING_URI = "http://localhost:5000"
EXPERIMENT_NAME = "Optuna_Hyperparameter_Tuning"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)


In [2]:

def get_data(n_rows=2000):
    """Load, transform and feature-engineer the employee-performance data.

    Args:
        n_rows: Upper bound of the slice applied to the result of
            ``load_raw_data()`` before any transformation. Defaults to 2000,
            the value that used to be hard-coded; pass ``None`` to keep
            everything ``load_raw_data()`` returns.

    Returns:
        A ``(new_df, X_scaler, y_scaler)`` tuple, unpacked from
        ``feature_engineered_employee_performance(..., return_scaler=True)``.
    """
    # Truncate first so the (potentially expensive) transforms only see the
    # requested subset.
    raw_df = load_raw_data()[:n_rows]
    transformed_df = transformed_employee_performance(data_df=raw_df)
    new_df, X_scaler, y_scaler = feature_engineered_employee_performance(
        data_df=transformed_df, return_scaler=True
    )
    return new_df, X_scaler, y_scaler

# Build the engineered frame (plus the scalers returned alongside it) and
# split it into the feature matrix and target used for modelling.
data_df, X_scaler, y_scaler = get_data()
X, y = get_features(data_df)

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


NameError: name 'load_raw_data' is not defined

In [None]:

def objective(trial):
    """Optuna objective: fit one candidate regressor and return its MSE.

    Each call opens a nested MLflow run, samples a model family
    ('RandomForest' or 'GradientBoosting') together with that family's
    hyperparameters from ``trial``, fits the model on the module-level
    ``X_train``/``y_train``, scores it on ``X_test``/``y_test`` and logs the
    sampled parameters plus the ``mse`` and ``r2_score`` metrics to MLflow.

    Args:
        trial: The Optuna trial object used to draw hyperparameter suggestions.

    Returns:
        The mean squared error of the fitted model on ``X_test``/``y_test``;
        this is the value handed back to Optuna.

    Raises:
        ValueError: If the sampled ``model_type`` is not one of the supported
            families.
    """
    with mlflow.start_run(nested=True):
        model_type = trial.suggest_categorical('model_type', ['RandomForest', 'GradientBoosting'])

        # Sample each family's search space into a single dict so the same
        # values are used for both model construction and MLflow logging.
        if model_type == 'RandomForest':
            params = {
                'n_estimators': trial.suggest_int('n_estimators', 50, 200),
                'max_depth': trial.suggest_int('max_depth', 2, 32, log=True),
                'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
                'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
            }
            model = RandomForestRegressor(random_state=42, **params)
        elif model_type == 'GradientBoosting':
            params = {
                # suggest_float(..., log=True) is the supported replacement for
                # the deprecated suggest_loguniform and samples the same
                # log-uniform distribution.
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
                'n_estimators': trial.suggest_int('n_estimators', 50, 200),
                'max_depth': trial.suggest_int('max_depth', 2, 32, log=True),
            }
            model = GradientBoostingRegressor(random_state=42, **params)
        else:
            # Fail loudly instead of hitting an unbound `model` further down
            # if the categorical choices and the branches ever drift apart.
            raise ValueError(f"Unsupported model_type: {model_type!r}")

        mlflow.log_params({'model_type': model_type, **params})

        model.fit(X_train, y_train)
        # NOTE(review): the tuning metric is computed on X_test/y_test, so the
        # test split is being used for model selection. Consider a separate
        # validation split or cross-validation on the training data.
        predictions = model.predict(X_test)
        mse = mean_squared_error(y_test, predictions)
        r2 = r2_score(y_test, predictions)

        mlflow.log_metric('mse', mse)
        mlflow.log_metric('r2_score', r2)

        return mse
