In [1]:
import pandas as pd
import mlflow
import dagshub
from sklearn.model_selection import cross_val_score, KFold, cross_validate
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from feature_engine.encoding import CountFrequencyEncoder, MeanEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
import optuna

In [2]:
# Load the experiment train/test splits from local parquet files.
# NOTE(review): these are absolute, machine-specific paths — the notebook will
# only run as-is on the original author's machine.
train = pd.read_parquet(
    'C:/Users/aksha/OneDrive/Desktop/AutoNexusMlOps/data/Exp/train.parquet'
)
test = pd.read_parquet(
    'C:/Users/aksha/OneDrive/Desktop/AutoNexusMlOps/data/Exp/test.parquet'
)

# Split the training frame into the regression target ('Price') and the
# remaining feature columns.
ytrain = train['Price'].copy()
xtrain = train.drop(columns=['Price'])

In [None]:
# String-valued columns that receive the trial-selected categorical encoding.
_CATEGORICAL_COLUMNS = ['Brand_Name', 'Model_Name', 'Exterior_Color',
                        'Interior_Color', 'Drivetrain', 'Fuel_Type',
                        'Cylinder_Config', 'City', 'STATE']


def _build_transformer(encoder_type):
    """Build the preprocessing ColumnTransformer for one encoder choice.

    Parameters
    ----------
    encoder_type : str
        "Freq" selects a frequency encoder and "Count" a count encoder
        (both ``CountFrequencyEncoder``); any other value selects
        ``MeanEncoder`` (target encoding).

    Returns
    -------
    ColumnTransformer
        Ordinal-encodes 'Stock_Type' with the fixed order
        New < Certified < Used, applies the selected encoder to
        ``_CATEGORICAL_COLUMNS`` and passes every other column through.
    """
    if encoder_type == "Freq":
        step_name = 'frequency'
        encoder = CountFrequencyEncoder(encoding_method='frequency')
    elif encoder_type == "Count":
        step_name = 'Count'
        encoder = CountFrequencyEncoder(encoding_method='count')
    else:
        # NOTE(review): the logged Target + XGBoost trial scored R2 of about
        # -1.47; presumably the default MeanEncoder overfits the
        # high-cardinality columns and/or emits NaN for categories unseen in
        # a fold — confirm, and consider its smoothing/unseen options.
        step_name = 'target'
        encoder = MeanEncoder()

    return ColumnTransformer([
        ('ordinal_encoding',
         OrdinalEncoder(categories=[['New', 'Certified', 'Used']]),
         ['Stock_Type']),
        (step_name, encoder, list(_CATEGORICAL_COLUMNS)),
    ], remainder='passthrough')


def objective(trial):
    """Optuna objective: mean 3-fold CV R2 of an encoder + tree-model pipeline.

    The trial chooses a categorical encoder ("Freq", "Count" or "Target") and
    a regressor ("RandomForest" or "XGBoost") together with that regressor's
    hyperparameters. The resulting pipeline is scored on the notebook-level
    ``xtrain`` / ``ytrain`` with shuffled 3-fold cross-validation.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the search space.

    Returns
    -------
    float
        Mean R2 across the three folds (higher is better).
    """
    # Parameter names, choices and ranges are deliberately unchanged: the
    # study is persisted and resumed, so earlier trials were sampled from
    # exactly these distributions.
    encoder_type = trial.suggest_categorical("encoder_type", ["Freq", "Count", "Target"])
    transformer = _build_transformer(encoder_type)

    model_type = trial.suggest_categorical("model", ["RandomForest", "XGBoost"])

    if model_type == "RandomForest":
        model = RandomForestRegressor(
            n_estimators=trial.suggest_int("rf_n_estimators", 100, 500),
            max_depth=trial.suggest_categorical("rf_max_depth", [None, 5, 30]),
            random_state=42
        )
    else:
        model = XGBRegressor(
            n_estimators=trial.suggest_int("xgb_n_estimators", 100, 500),
            learning_rate=trial.suggest_float("xgb_learning_rate", 0.01, 0.2),
            max_depth=trial.suggest_categorical("xgb_max_depth", [None, 3, 10]),
            random_state=42
        )

    # Encoders live inside the pipeline so they are re-fitted on each training
    # fold rather than on the full data.
    pipe = Pipeline([
        ('Transformer', transformer),
        ('model', model)
    ])

    # Fixed seed keeps the fold assignment identical across trials.
    cv = KFold(n_splits=3, shuffle=True, random_state=42)
    score = cross_val_score(pipe, xtrain, ytrain, cv=cv, scoring='r2', n_jobs=-1).mean()

    return score

In [20]:
# Create the study, or resume it if one with this name already exists in the
# SQLite storage; the objective's return value is maximised.
study = optuna.create_study(
    storage='sqlite:///../reports/autonexus_optuna.db',
    study_name='Tree Algo with Different Encoding for String Columns',
    load_if_exists=True,
    direction='maximize',
)

# Each execution of this cell appends 20 further trials to the stored study.
study.optimize(objective, n_trials=20)

[I 2025-10-31 12:26:23,824] Using an existing study with name 'Tree Algo with Different Encoding for String Columns' instead of creating a new one.
[I 2025-10-31 12:26:41,083] Trial 2 finished with value: 0.9145226751563134 and parameters: {'encoder_type': 'Freq', 'model': 'XGBoost', 'xgb_n_estimators': 310, 'xgb_learning_rate': 0.15248928208194867, 'xgb_max_depth': 10}. Best is trial 2 with value: 0.9145226751563134.
[I 2025-10-31 12:28:28,675] Trial 3 finished with value: 0.9079404680469069 and parameters: {'encoder_type': 'Count', 'model': 'RandomForest', 'rf_n_estimators': 108, 'rf_max_depth': 30}. Best is trial 2 with value: 0.9145226751563134.
[I 2025-10-31 12:28:42,082] Trial 4 finished with value: -1.4697926161880754 and parameters: {'encoder_type': 'Target', 'model': 'XGBoost', 'xgb_n_estimators': 420, 'xgb_learning_rate': 0.03749064615071, 'xgb_max_depth': None}. Best is trial 2 with value: 0.9145226751563134.
[I 2025-10-31 12:28:47,945] Trial 5 finished with value: 0.8662709

In [4]:
# Point MLflow at the repository's tracking endpoint on dagshub.com, then
# initialise the DagsHub client with its MLflow integration switched on.
mlflow.set_tracking_uri(
    'https://dagshub.com/akshatsharma2407/AutoNexusMlOps.mlflow'
)
dagshub.init(
    repo_owner='akshatsharma2407',
    repo_name='AutoNexusMlOps',
    mlflow=True,
)

In [40]:
# Mirror every Optuna trial of `study` into MLflow as its own run: sampled
# parameters, objective value (when the trial has one) and trial state.
mlflow.set_experiment(experiment_name='Tree_Algo_Exp')
for trial in study.trials:
    with mlflow.start_run(run_name=f"trial_{trial.number}"):
        mlflow.log_params(trial.params)

        # Test against None explicitly: a truthiness check would also skip a
        # legitimate objective value of exactly 0.0. Trials without a value
        # simply get no "objective" metric.
        if trial.value is not None:
            mlflow.log_metric("objective", trial.value)

        mlflow.set_tag("state", trial.state.name)
        mlflow.set_tag("trial_number", trial.number)

🏃 View run trial_0 at: https://dagshub.com/akshatsharma2407/AutoNexusMlOps.mlflow/#/experiments/2/runs/d1c1fc1908964912b842170db33f0e18
🧪 View experiment at: https://dagshub.com/akshatsharma2407/AutoNexusMlOps.mlflow/#/experiments/2
🏃 View run trial_1 at: https://dagshub.com/akshatsharma2407/AutoNexusMlOps.mlflow/#/experiments/2/runs/e7421fca87404c71b5a9dad0f09ed8d9
🧪 View experiment at: https://dagshub.com/akshatsharma2407/AutoNexusMlOps.mlflow/#/experiments/2
🏃 View run trial_2 at: https://dagshub.com/akshatsharma2407/AutoNexusMlOps.mlflow/#/experiments/2/runs/0fef05cbbc0f4d2a96a8100cb9d0b0bb
🧪 View experiment at: https://dagshub.com/akshatsharma2407/AutoNexusMlOps.mlflow/#/experiments/2
🏃 View run trial_3 at: https://dagshub.com/akshatsharma2407/AutoNexusMlOps.mlflow/#/experiments/2/runs/81417db2d67e4283985fabd724b8a6cb
🧪 View experiment at: https://dagshub.com/akshatsharma2407/AutoNexusMlOps.mlflow/#/experiments/2
🏃 View run trial_4 at: https://dagshub.com/akshatsharma2407/AutoNexu

In [None]:
# Evaluate the chosen RandomForest configuration (target/mean encoding) with
# 5-fold CV and log metrics, the fitted pipeline and its parameters to MLflow.
mlflow.set_experiment(experiment_name='Tree_Algo_Exp')
with mlflow.start_run(run_name='RandomForest_BestModel'):
    rf_trf = ColumnTransformer([
                ('ordinal_encoding',
                OrdinalEncoder(categories=[['New', 'Certified', 'Used']]),
                ['Stock_Type']),
                ('target',
                MeanEncoder(),
                ['Brand_Name', 'Model_Name', 'Exterior_Color',
                'Interior_Color', 'Drivetrain', 'Fuel_Type',
                'Cylinder_Config', 'City', 'STATE'])
            ], remainder='passthrough')

    # Seeded like the models in the Optuna objective so the run is reproducible.
    rf = RandomForestRegressor(n_estimators=236, max_depth=30, random_state=42)

    pipe_rf = Pipeline(
        [
            ('transformer', rf_trf),
            ('random_forest',rf)
        ]
    )

    scoring = {
        'r2': 'r2',
        'mae': 'neg_mean_absolute_error',
        'mse': 'neg_mean_squared_error'
    }

    cv = KFold(n_splits=5, shuffle=True, random_state=42)
    score = cross_validate(pipe_rf, xtrain, ytrain, cv=cv, n_jobs=-1, verbose=245, scoring=scoring)

    # Average each cross_validate column (fit/score times and test_* scores).
    cv_means = pd.DataFrame(score).mean()
    # The 'neg_*' scorers return negated errors; flip the sign so the values
    # logged as test_mae / test_mse are the actual (positive) errors.
    cv_means[['test_mae', 'test_mse']] = -cv_means[['test_mae', 'test_mse']]
    mlflow.log_metrics(cv_means.to_dict())

    # cross_validate only fits clones of the estimator, so pipe_rf itself is
    # still unfitted here. Fit it on the full training data so the logged
    # model can actually predict.
    pipe_rf.fit(xtrain, ytrain)
    mlflow.sklearn.log_model(pipe_rf, 'model')

    mlflow.log_params(pipe_rf.get_params())

In [7]:
# Evaluate the best XGBoost configuration found by the study (frequency
# encoding) with 5-fold CV and log metrics, the fitted pipeline and its
# parameters to MLflow.
mlflow.set_experiment(experiment_name='Tree_Algo_Exp')
with mlflow.start_run(run_name='XGBoost_BestModel'):
    xg_trf = ColumnTransformer([
                ('ordinal_encoding',
                OrdinalEncoder(categories=[['New', 'Certified', 'Used']]),
                ['Stock_Type']),
                # Step name now matches the encoder actually used (it was
                # labelled 'target' although it is a frequency encoder).
                ('frequency',
                CountFrequencyEncoder(encoding_method='frequency'),
                ['Brand_Name', 'Model_Name', 'Exterior_Color',
                'Interior_Color', 'Drivetrain', 'Fuel_Type',
                'Cylinder_Config', 'City', 'STATE'])
            ], remainder='passthrough')

    # Hyperparameters of study trial 2 (Freq + XGBoost, R2 of about 0.9145).
    # The tuned learning rate was previously omitted, so the "best model"
    # silently used the library default; random_state matches the objective.
    xg = XGBRegressor(
        n_estimators=310,
        learning_rate=0.15248928208194867,
        max_depth=10,
        random_state=42,
    )

    pipe_xg = Pipeline(
        [
            ('transformer', xg_trf),
            # Renamed from the copy-pasted 'random_forest' so logged parameter
            # keys identify the real estimator.
            ('xgboost', xg)
        ]
    )

    scoring = {
        'r2': 'r2',
        'mae': 'neg_mean_absolute_error',
        'mse': 'neg_mean_squared_error'
    }

    cv = KFold(n_splits=5, shuffle=True, random_state=42)
    score = cross_validate(pipe_xg, xtrain, ytrain, cv=cv, n_jobs=-1, verbose=20, scoring=scoring)

    # Average each cross_validate column (fit/score times and test_* scores).
    cv_means = pd.DataFrame(score).mean()
    # The 'neg_*' scorers return negated errors; flip the sign so the values
    # logged as test_mae / test_mse are the actual (positive) errors.
    cv_means[['test_mae', 'test_mse']] = -cv_means[['test_mae', 'test_mse']]
    mlflow.log_metrics(cv_means.to_dict())

    # cross_validate only fits clones of the estimator, so pipe_xg itself is
    # still unfitted here. Fit it on the full training data so the logged
    # model can actually predict.
    pipe_xg.fit(xtrain, ytrain)
    mlflow.sklearn.log_model(pipe_xg, 'model')

    mlflow.log_params(pipe_xg.get_params())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   32.7s
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   32.9s remaining:   49.5s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   33.1s remaining:   22.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   33.1s finished


🏃 View run XGBoost_BestModel at: https://dagshub.com/akshatsharma2407/AutoNexusMlOps.mlflow/#/experiments/2/runs/3851971f95d946c781f17f5abe690093
🧪 View experiment at: https://dagshub.com/akshatsharma2407/AutoNexusMlOps.mlflow/#/experiments/2
