In [11]:
import pandas as pd
import mlflow
import dagshub
from sklearn.model_selection import cross_val_score, KFold, cross_validate
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import TargetEncoder
from feature_engine.encoding import CountFrequencyEncoder, MeanEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
import optuna

In [None]:
# importing data 

train = pd.read_parquet('C:/Users/aksha/OneDrive/Desktop/AutoNexusMlOps/data/Exp/train.parquet')
validation = pd.read_parquet('C:/Users/aksha/OneDrive/Desktop/AutoNexusMlOps/data/Exp/validation.parquet')

In [7]:
df = pd.concat([train,validation])

xtrain = df.drop(columns='Price')
ytrain = df['Price'].copy()

In [None]:
def objective(trial):
    """
    Creating an objective function for HP in Optuna
    """
    # --- choose encoder ---
    encoder_type = trial.suggest_categorical(
        "encoder_type", ["Freq", "Count", "Target"]
    )

    # --- choose algorithm ---
    model_type = trial.suggest_categorical(
        "model", ["RandomForest", "XGBoost", "DecisionTree"]
    )

    # --- define transformer dynamically ---
    if encoder_type == "Freq":
        transformer = ColumnTransformer([
            ('ordinal_encoding',
             OrdinalEncoder(categories=[['New', 'Certified', 'Used']]),
             ['Stock_Type']),
            ('frequency',
             CountFrequencyEncoder(encoding_method='frequency'),
             ['Brand_Name', 'Model_Name', 'Exterior_Color',
              'Interior_Color', 'Drivetrain', 'Fuel_Type',
              'Cylinder_Config', 'City', 'STATE'])
        ], remainder='passthrough')

    elif encoder_type == "Count":
        transformer = ColumnTransformer([
            ('ordinal_encoding',
             OrdinalEncoder(categories=[['New', 'Certified', 'Used']]),
             ['Stock_Type']),
            ('count',
             CountFrequencyEncoder(encoding_method='count'),
             ['Brand_Name', 'Model_Name', 'Exterior_Color',
              'Interior_Color', 'Drivetrain', 'Fuel_Type',
              'Cylinder_Config', 'City', 'STATE'])
        ], remainder='passthrough')

    else:
        transformer = ColumnTransformer([
            ('ordinal_encoding',
             OrdinalEncoder(categories=[['New', 'Certified', 'Used']]),
             ['Stock_Type']),
            ('target',
             TargetEncoder(),
             ['Brand_Name', 'Model_Name', 'Exterior_Color',
              'Interior_Color', 'Drivetrain', 'Fuel_Type',
              'Cylinder_Config', 'City', 'STATE'])
        ], remainder='passthrough')

    # --- model selection and hyperparameters ---
    if model_type == "RandomForest":
        model = RandomForestRegressor(
            n_estimators=trial.suggest_int("rf_n_estimators", 100, 800, step=50),
            max_depth=trial.suggest_int("rf_max_depth", 5, 50),
            min_samples_split=trial.suggest_int("rf_min_samples_split", 2, 10),
            min_samples_leaf=trial.suggest_int("rf_min_samples_leaf", 1, 5),
            max_features=trial.suggest_categorical("rf_max_features", ["sqrt", "log2", None]),
            bootstrap=trial.suggest_categorical("rf_bootstrap", [True, False]),
            n_jobs=-1,
            random_state=42
        )

    elif model_type == "XGBoost":
        model = XGBRegressor(
            n_estimators=trial.suggest_int("xgb_n_estimators", 100, 1000, step=50),
            learning_rate=trial.suggest_float("xgb_learning_rate", 0.005, 0.3, log=True),
            max_depth=trial.suggest_int("xgb_max_depth", 3, 15),
            min_child_weight=trial.suggest_float("xgb_min_child_weight", 1.0, 10.0),
            subsample=trial.suggest_float("xgb_subsample", 0.5, 1.0),
            colsample_bytree=trial.suggest_float("xgb_colsample_bytree", 0.5, 1.0),
            gamma=trial.suggest_float("xgb_gamma", 0.0, 5.0),
            reg_alpha=trial.suggest_float("xgb_reg_alpha", 1e-5, 10.0, log=True),
            reg_lambda=trial.suggest_float("xgb_reg_lambda", 1e-5, 10.0, log=True),
            random_state=42,
            n_jobs=-1
        )

    else:  # DecisionTree
        model = DecisionTreeRegressor(
            max_depth=trial.suggest_int("dt_max_depth", 3, 50),
            min_samples_split=trial.suggest_int("dt_min_samples_split", 2, 10),
            min_samples_leaf=trial.suggest_int("dt_min_samples_leaf", 1, 5),
            max_features=trial.suggest_categorical("dt_max_features", ["sqrt", "log2", None]),
            criterion=trial.suggest_categorical("dt_criterion", ["squared_error", "friedman_mse"]),
            splitter=trial.suggest_categorical("dt_splitter", ["best", "random"]),
            random_state=42
        )

    # --- build pipeline ---
    pipe = Pipeline([
        ('Transformer', transformer),
        ('model', model)
    ])

    # --- cross-validation ---
    cv = KFold(n_splits=3, shuffle=True, random_state=42)
    score = cross_val_score(pipe, xtrain, ytrain, cv=cv, scoring='r2', n_jobs=-1).mean()

    return score

In [17]:
study = optuna.create_study(
    study_name='In Search of The Champion',
    direction='maximize',
    storage='sqlite:///../reports/autonexus_optuna.db',
    load_if_exists=True
)

study.optimize(objective,n_trials=100)

[I 2025-10-31 19:33:19,533] A new study created in RDB with name: In Search of The Champion
[I 2025-10-31 19:33:33,470] Trial 0 finished with value: 0.8823638235740129 and parameters: {'encoder_type': 'Freq', 'model': 'XGBoost', 'xgb_n_estimators': 550, 'xgb_learning_rate': 0.07110984732756406, 'xgb_max_depth': 3, 'xgb_min_child_weight': 4.4158717242792145, 'xgb_subsample': 0.6144921878273514, 'xgb_colsample_bytree': 0.5531618154276072, 'xgb_gamma': 2.7122352385815898, 'xgb_reg_alpha': 0.0005245638390492034, 'xgb_reg_lambda': 1.2377152232690528e-05}. Best is trial 0 with value: 0.8823638235740129.
[I 2025-10-31 19:34:38,032] Trial 1 finished with value: 0.8807216034731314 and parameters: {'encoder_type': 'Freq', 'model': 'RandomForest', 'rf_n_estimators': 500, 'rf_max_depth': 10, 'rf_min_samples_split': 6, 'rf_min_samples_leaf': 3, 'rf_max_features': 'sqrt', 'rf_bootstrap': False}. Best is trial 0 with value: 0.8823638235740129.
[I 2025-10-31 19:34:59,478] Trial 2 finished with value: 

ValueError: 
All the 3 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
joblib.externals.loky.process_executor._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "c:\Users\aksha\OneDrive\Desktop\AutoNexusMlOps\ANenv\Lib\site-packages\joblib\_utils.py", line 72, in __call__
    return self.func(**kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "c:\Users\aksha\OneDrive\Desktop\AutoNexusMlOps\ANenv\Lib\site-packages\joblib\parallel.py", line 607, in __call__
    return [func(*args, **kwargs) for func, args, kwargs in self.items]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\aksha\OneDrive\Desktop\AutoNexusMlOps\ANenv\Lib\site-packages\joblib\parallel.py", line 607, in <listcomp>
    return [func(*args, **kwargs) for func, args, kwargs in self.items]
            ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\aksha\OneDrive\Desktop\AutoNexusMlOps\ANenv\Lib\site-packages\sklearn\utils\parallel.py", line 147, in __call__
    return self.function(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\aksha\OneDrive\Desktop\AutoNexusMlOps\ANenv\Lib\site-packages\sklearn\ensemble\_forest.py", line 196, in _parallel_build_trees
    tree._fit(
  File "c:\Users\aksha\OneDrive\Desktop\AutoNexusMlOps\ANenv\Lib\site-packages\sklearn\tree\_classes.py", line 472, in _fit
    builder.build(self.tree_, X, y, sample_weight, missing_values_in_feature_mask)
  File "sklearn/tree/_tree.pyx", line 141, in sklearn.tree._tree.DepthFirstTreeBuilder.build
  File "sklearn/tree/_tree.pyx", line 256, in sklearn.tree._tree.DepthFirstTreeBuilder.build
  File "sklearn/tree/_tree.pyx", line 911, in sklearn.tree._tree.Tree._add_node
  File "sklearn/tree/_tree.pyx", line 879, in sklearn.tree._tree.Tree._resize_c
  File "sklearn/tree/_utils.pyx", line 29, in sklearn.tree._utils.safe_realloc
MemoryError: could not allocate 8388608 bytes
"""

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "c:\Users\aksha\OneDrive\Desktop\AutoNexusMlOps\ANenv\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\aksha\OneDrive\Desktop\AutoNexusMlOps\ANenv\Lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\aksha\OneDrive\Desktop\AutoNexusMlOps\ANenv\Lib\site-packages\sklearn\pipeline.py", line 663, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "c:\Users\aksha\OneDrive\Desktop\AutoNexusMlOps\ANenv\Lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\aksha\OneDrive\Desktop\AutoNexusMlOps\ANenv\Lib\site-packages\sklearn\ensemble\_forest.py", line 486, in fit
    trees = Parallel(
            ^^^^^^^^^
  File "c:\Users\aksha\OneDrive\Desktop\AutoNexusMlOps\ANenv\Lib\site-packages\sklearn\utils\parallel.py", line 82, in __call__
    return super().__call__(iterable_with_config_and_warning_filters)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\aksha\OneDrive\Desktop\AutoNexusMlOps\ANenv\Lib\site-packages\joblib\parallel.py", line 2072, in __call__
    return output if self.return_generator else list(output)
                                                ^^^^^^^^^^^^
  File "c:\Users\aksha\OneDrive\Desktop\AutoNexusMlOps\ANenv\Lib\site-packages\joblib\parallel.py", line 1682, in _get_outputs
    yield from self._retrieve()
  File "c:\Users\aksha\OneDrive\Desktop\AutoNexusMlOps\ANenv\Lib\site-packages\joblib\parallel.py", line 1784, in _retrieve
    self._raise_error_fast()
  File "c:\Users\aksha\OneDrive\Desktop\AutoNexusMlOps\ANenv\Lib\site-packages\joblib\parallel.py", line 1859, in _raise_error_fast
    error_job.get_result(self.timeout)
  File "c:\Users\aksha\OneDrive\Desktop\AutoNexusMlOps\ANenv\Lib\site-packages\joblib\parallel.py", line 758, in get_result
    return self._return_or_raise()
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\aksha\OneDrive\Desktop\AutoNexusMlOps\ANenv\Lib\site-packages\joblib\parallel.py", line 773, in _return_or_raise
    raise self._result
MemoryError: could not allocate 8388608 bytes


In [None]:
# tracking the records

mlflow.set_tracking_uri('https://dagshub.com/akshatsharma2407/AutoNexusMlOps.mlflow')
dagshub.init(repo_owner='akshatsharma2407', repo_name='AutoNexusMlOps', mlflow=True)

In [20]:
mlflow.set_experiment(experiment_name='In Search of Champion')
for trial in study.trials:
    with mlflow.start_run(run_name=f"trial_{trial.number}"):
        mlflow.log_params(trial.params)
        
        if trial.value:
            mlflow.log_metric("objective", trial.value)
        
        mlflow.set_tag("state", trial.state.name)
        mlflow.set_tag("trial_number", trial.number)

2025/11/01 00:28:58 INFO mlflow.tracking.fluent: Experiment with name 'In Search of Champion' does not exist. Creating a new experiment.


🏃 View run trial_0 at: https://dagshub.com/akshatsharma2407/AutoNexusMlOps.mlflow/#/experiments/5/runs/7cf18f74389340d198f9f1bc34049b4e
🧪 View experiment at: https://dagshub.com/akshatsharma2407/AutoNexusMlOps.mlflow/#/experiments/5
🏃 View run trial_1 at: https://dagshub.com/akshatsharma2407/AutoNexusMlOps.mlflow/#/experiments/5/runs/1199a876e310494186856faa569a84b9
🧪 View experiment at: https://dagshub.com/akshatsharma2407/AutoNexusMlOps.mlflow/#/experiments/5
🏃 View run trial_2 at: https://dagshub.com/akshatsharma2407/AutoNexusMlOps.mlflow/#/experiments/5/runs/871e41388df7407ab07fc249350676db
🧪 View experiment at: https://dagshub.com/akshatsharma2407/AutoNexusMlOps.mlflow/#/experiments/5
🏃 View run trial_3 at: https://dagshub.com/akshatsharma2407/AutoNexusMlOps.mlflow/#/experiments/5/runs/2ff787b520ab45729014025fee963e52
🧪 View experiment at: https://dagshub.com/akshatsharma2407/AutoNexusMlOps.mlflow/#/experiments/5
🏃 View run trial_4 at: https://dagshub.com/akshatsharma2407/AutoNexu