In [1]:
import os
from pathlib import Path

import dagshub
import mlflow
import optuna
import pandas as pd
from feature_engine.encoding import CountFrequencyEncoder, MeanEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import cross_val_score, KFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [2]:
# Directory holding the experiment parquet splits. Set the AUTONEXUS_DATA_DIR
# environment variable to run the notebook on another machine; the fallback is
# the path this notebook originally hard-coded, so existing runs are unchanged.
DATA_DIR = Path(os.environ.get(
    'AUTONEXUS_DATA_DIR',
    'C:/Users/aksha/OneDrive/Desktop/AutoNexusMlOps/data/Exp',
))

# Load the two splits from DATA_DIR.
train = pd.read_parquet(DATA_DIR / 'train.parquet')
validation = pd.read_parquet(DATA_DIR / 'validation.parquet')

In [7]:
# Pool both splits into one frame; the objective cross-validates on xtrain/ytrain.
# ignore_index=True rebuilds a unique 0..n-1 index: a plain concat keeps each
# frame's own labels, which can leave duplicate index labels in the result.
df = pd.concat([train, validation], ignore_index=True)

# Separate the predictors from the 'Price' target.
xtrain = df.drop(columns='Price')
ytrain = df['Price'].copy()

In [8]:
# Columns handed to the trial-selected encoder (frequency, count or target mean).
_ENCODED_COLUMNS = ['Brand_Name', 'Model_Name', 'Exterior_Color',
                    'Interior_Color', 'Drivetrain', 'Fuel_Type',
                    'Cylinder_Config', 'City', 'STATE']


def _build_transformer(encoder_type):
    """Build the ColumnTransformer for one ``encoder_type`` choice.

    'Stock_Type' is always ordinal-encoded with the order New, Certified, Used.
    The columns in ``_ENCODED_COLUMNS`` get a CountFrequencyEncoder in
    'frequency' mode for "Freq", in 'count' mode for "Count", and a MeanEncoder
    for any other value (i.e. "Target"). All remaining columns pass through.
    """
    if encoder_type == "Freq":
        step_name = 'frequency'
        encoder = CountFrequencyEncoder(encoding_method='frequency')
    elif encoder_type == "Count":
        step_name = 'count'
        encoder = CountFrequencyEncoder(encoding_method='count')
    else:
        step_name = 'target'
        encoder = MeanEncoder()

    return ColumnTransformer([
        ('ordinal_encoding',
         OrdinalEncoder(categories=[['New', 'Certified', 'Used']]),
         ['Stock_Type']),
        # Pass a fresh list so each transformer owns its own column list.
        (step_name, encoder, list(_ENCODED_COLUMNS)),
    ], remainder='passthrough')


def _build_model(trial, model_type):
    """Sample hyperparameters from ``trial`` and return the matching regressor.

    Parameter names are prefixed per algorithm (``rf_``, ``xgb_``, ``gb_``,
    ``dt_``). Any ``model_type`` other than "RandomForest", "XGBoost" or
    "GradientBoosting" yields a DecisionTreeRegressor.
    """
    if model_type == "RandomForest":
        return RandomForestRegressor(
            n_estimators=trial.suggest_int("rf_n_estimators", 100, 800, step=50),
            max_depth=trial.suggest_int("rf_max_depth", 5, 50),
            min_samples_split=trial.suggest_int("rf_min_samples_split", 2, 10),
            min_samples_leaf=trial.suggest_int("rf_min_samples_leaf", 1, 5),
            max_features=trial.suggest_categorical("rf_max_features", ["sqrt", "log2", None]),
            bootstrap=trial.suggest_categorical("rf_bootstrap", [True, False]),
            n_jobs=-1,
            random_state=42
        )

    if model_type == "XGBoost":
        return XGBRegressor(
            n_estimators=trial.suggest_int("xgb_n_estimators", 100, 1000, step=50),
            learning_rate=trial.suggest_float("xgb_learning_rate", 0.005, 0.3, log=True),
            max_depth=trial.suggest_int("xgb_max_depth", 3, 15),
            min_child_weight=trial.suggest_float("xgb_min_child_weight", 1.0, 10.0),
            subsample=trial.suggest_float("xgb_subsample", 0.5, 1.0),
            colsample_bytree=trial.suggest_float("xgb_colsample_bytree", 0.5, 1.0),
            gamma=trial.suggest_float("xgb_gamma", 0.0, 5.0),
            reg_alpha=trial.suggest_float("xgb_reg_alpha", 1e-5, 10.0, log=True),
            reg_lambda=trial.suggest_float("xgb_reg_lambda", 1e-5, 10.0, log=True),
            random_state=42,
            n_jobs=-1
        )

    if model_type == "GradientBoosting":
        return GradientBoostingRegressor(
            n_estimators=trial.suggest_int("gb_n_estimators", 100, 1000, step=50),
            learning_rate=trial.suggest_float("gb_learning_rate", 0.01, 0.3, log=True),
            max_depth=trial.suggest_int("gb_max_depth", 3, 10),
            min_samples_split=trial.suggest_int("gb_min_samples_split", 2, 10),
            min_samples_leaf=trial.suggest_int("gb_min_samples_leaf", 1, 5),
            subsample=trial.suggest_float("gb_subsample", 0.6, 1.0),
            max_features=trial.suggest_categorical("gb_max_features", ["sqrt", "log2", None]),
            random_state=42
        )

    # DecisionTree
    return DecisionTreeRegressor(
        max_depth=trial.suggest_int("dt_max_depth", 3, 50),
        min_samples_split=trial.suggest_int("dt_min_samples_split", 2, 10),
        min_samples_leaf=trial.suggest_int("dt_min_samples_leaf", 1, 5),
        max_features=trial.suggest_categorical("dt_max_features", ["sqrt", "log2", None]),
        criterion=trial.suggest_categorical("dt_criterion", ["squared_error", "friedman_mse"]),
        splitter=trial.suggest_categorical("dt_splitter", ["best", "random"]),
        random_state=42
    )


def objective(trial):
    """Optuna objective: mean cross-validated R^2 of one sampled pipeline.

    The trial picks an encoder ("Freq", "Count" or "Target") and an algorithm
    ("RandomForest", "XGBoost", "GradientBoosting" or "DecisionTree"), then the
    algorithm's hyperparameters. The resulting pipeline is scored with a
    shuffled 3-fold KFold (random_state=42) on the module-level ``xtrain`` and
    ``ytrain``.

    Parameters
    ----------
    trial : optuna trial object used to sample every choice.

    Returns
    -------
    The mean of the three per-fold R^2 scores.
    """
    # Sample the encoder first and the algorithm second, then the model's
    # hyperparameters, keeping the suggestion order stable across trials.
    encoder_type = trial.suggest_categorical(
        "encoder_type", ["Freq", "Count", "Target"]
    )
    model_type = trial.suggest_categorical(
        "model", ["RandomForest", "XGBoost", "GradientBoosting", "DecisionTree"]
    )

    transformer = _build_transformer(encoder_type)
    model = _build_model(trial, model_type)

    # --- build pipeline ---
    pipe = Pipeline([
        ('Transformer', transformer),
        ('model', model)
    ])

    # --- cross-validation ---
    # NOTE(review): RandomForest/XGBoost and cross_val_score both request
    # n_jobs=-1; confirm the nested parallelism does not oversubscribe CPUs.
    cv = KFold(n_splits=3, shuffle=True, random_state=42)
    score = cross_val_score(pipe, xtrain, ytrain, cv=cv, scoring='r2', n_jobs=-1).mean()

    return score

In [None]:
# Identity and location of the persisted Optuna study.
study_label = 'In Search of Champion'
study_storage = 'sqlite:///../reports/autonexus_optuna.db'

# load_if_exists=True reuses a study of this name if the storage already has one.
study = optuna.create_study(
    storage=study_storage,
    study_name=study_label,
    load_if_exists=True,
    direction='maximize',
)

# Run 100 trials of the objective against the study.
study.optimize(objective, n_trials=100)

[I 2025-10-31 18:37:49,628] Using an existing study with name 'In Search of Champion' instead of creating a new one.


In [None]:
# Point MLflow at the tracking server URL for this repository.
tracking_uri = 'https://dagshub.com/akshatsharma2407/AutoNexusMlOps.mlflow'
mlflow.set_tracking_uri(tracking_uri)

# Initialise the DagsHub client for the same repository with MLflow enabled.
dagshub_settings = {
    'repo_owner': 'akshatsharma2407',
    'repo_name': 'AutoNexusMlOps',
    'mlflow': True,
}
dagshub.init(**dagshub_settings)

In [None]:
mlflow.set_experiment(experiment_name='In Search of Champion')

# Mirror every trial of the study into MLflow as its own run.
for trial in study.trials:
    with mlflow.start_run(run_name=f"trial_{trial.number}"):
        mlflow.log_params(trial.params)

        # Compare against None instead of relying on truthiness: an objective
        # of exactly 0.0 is falsy and would otherwise be silently skipped.
        # Trials that produced no objective value are still left without a metric.
        if trial.value is not None:
            mlflow.log_metric("objective", trial.value)

        # Record the trial's state and number as tags on the run.
        mlflow.set_tag("state", trial.state.name)
        mlflow.set_tag("trial_number", trial.number)