In [63]:
import numpy as np
import pandas as pd
import optuna
import dagshub
import mlflow
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import StandardScaler, RobustScaler, MaxAbsScaler, MinMaxScaler
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import OrdinalEncoder, TargetEncoder
from feature_engine.encoding import MeanEncoder
from category_encoders import BinaryEncoder
from feature_engine.encoding import MeanEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [8]:
train = pd.read_parquet('C:/Users/aksha/OneDrive/Desktop/AutoNexusMlOps/data/Exp/train.parquet')
xtrain = train.drop(columns=['Price'])
ytrain = train['Price'].copy()

In [9]:
pd.set_option('display.max_columns', None)

In [10]:
xtrain.sample()

Unnamed: 0,Model_Year,Mileage,Brand_Name,Model_Name,Stock_Type,Exterior_Color,Interior_Color,Drivetrain,Km/L,Fuel_Type,Accidents_Or_Damage,Clean_Title,One_Owner_Vehicle,Personal_Use_Only,Level2_Charging,Dc_Fast_Charging,Battery_Capacity,Expected_Range,Gear_Spec,Engine_Size,Cylinder_Config,Valves,Km/L_e_City,Km/L_e_Hwy,City,STATE
155603,2020,91380,Toyota,Tacoma SR5,Used,white,gray,4WD,8.0,Gasoline,False,True,False,True,0.0,0.0,0.0,0.0,6,3.5,V6,24,0.0,0.0,garnett,kansas


In [60]:
def objective(trial):
    encoder_type = trial.suggest_categorical("encoder_type", ["Binary", "Target"])

    if encoder_type == "Binary":
        encoder = ColumnTransformer([
            ('ordinal_encoding',
             OrdinalEncoder(categories=[['New', 'Certified', 'Used']]),
             ['Stock_Type']),
            ('BinaryEncoder',
             BinaryEncoder(),
             ['Brand_Name', 'Model_Name', 'Exterior_Color',
              'Interior_Color', 'Drivetrain', 'Fuel_Type',
              'Cylinder_Config', 'City', 'STATE'])
        ],remainder='passthrough')
    
    else:
        encoder = ColumnTransformer([
            ('ordinal_encoding',
             OrdinalEncoder(categories=[['New', 'Certified', 'Used']]),
             ['Stock_Type']),
            ('TargetEncoder',
             TargetEncoder(),
             ['Brand_Name', 'Model_Name', 'Exterior_Color',
              'Interior_Color', 'Drivetrain', 'Fuel_Type',
              'Cylinder_Config', 'City', 'STATE'])
        ], remainder='passthrough')

    model_type = trial.suggest_categorical("model", ["Linear", "Ridge", "Lasso", "ElasticNet"])

    if model_type == "Linear":
        model = LinearRegression()

    elif model_type == "Ridge":
        model = Ridge(
            alpha=trial.suggest_float("ridge_alpha", 1e-3, 10.0, log=True),
            solver=trial.suggest_categorical("ridge_solver",  ["svd", "cholesky", "lsqr"])
        )

    elif model_type == "Lasso":
        model = Lasso(
            alpha=trial.suggest_float("lasso_alpha", 1e-3, 10.0, log=True),
            max_iter=trial.suggest_int("lasso_max_iter", 1000, 10000),
            tol=trial.suggest_float("lasso_tol", 1e-5, 1e-2, log=True),
            selection=trial.suggest_categorical("lasso_selection", ["cyclic", "random"])
        )

    else:  
        model = ElasticNet(
            alpha=trial.suggest_float("elastic_alpha", 1e-3, 10.0, log=True),
            l1_ratio=trial.suggest_float("elastic_l1_ratio", 0.0, 1.0),
            max_iter=trial.suggest_int("elastic_max_iter", 1000, 10000),
            tol=trial.suggest_float("elastic_tol", 1e-5, 1e-2, log=True),
            selection=trial.suggest_categorical("elastic_selection", ["cyclic", "random"])
        )
    
    scaler = ColumnTransformer(
        [
            ('std', StandardScaler(), ['Km/L']),
            ('norm', MinMaxScaler(), ['Model_Year','Valves']),
            ('robust', RobustScaler(),['Mileage']),
            ('maxabs', MaxAbsScaler(), ['Level2_Charging','Dc_Fast_Charging','Battery_Capacity','Gear_Spec','Km/L_e_City','Km/L_e_Hwy'])
        ],
        remainder='passthrough'
    )
    
    cols_for_encoding = ['Brand_Name', 'Model_Name', 'Exterior_Color',
                'Interior_Color', 'Drivetrain', 'Fuel_Type',
                'Cylinder_Config', 'City', 'STATE', 'Stock_Type']
    
    cols_for_scaling = ['Model_Year','Valves','Km/L','Mileage','Level2_Charging','Dc_Fast_Charging','Battery_Capacity','Gear_Spec','Km/L_e_City','Km/L_e_Hwy']
    
    transformer = ColumnTransformer([
        ('encoder',encoder,cols_for_encoding),
        ('scaler', scaler,cols_for_scaling),
    ])

    pipe = Pipeline(
        [
            ('transformer',transformer),
            ('model',model)
        ]
    )

    cv = KFold(n_splits=3, shuffle=True, random_state=42)
    score = cross_val_score(pipe, xtrain, ytrain, cv=cv, scoring='r2', n_jobs=-1).mean()
    mlflow.log_metric("mean_r2",score)

    return score

In [64]:
mlflow.set_tracking_uri('https://dagshub.com/akshatsharma2407/AutoNexusMlOps.mlflow')
dagshub.init(repo_owner='akshatsharma2407', repo_name='AutoNexusMlOps', mlflow=True)

In [None]:
study = optuna.create_study(
      study_name='Linear Algo with Different Encoding & Scaling technique',
      direction='maximize',
      storage='sqlite:///../reports/autonexus_optuna.db',
      load_if_exists=True
    )

study.optimize(objective,n_trials=25)

[I 2025-10-31 17:14:27,574] Using an existing study with name 'Linear Algo with Different Encoding & Scaling technique' instead of creating a new one.
[I 2025-10-31 17:14:35,149] Trial 7 finished with value: 0.8032984278058315 and parameters: {'encoder_type': 'Target', 'model': 'Linear'}. Best is trial 7 with value: 0.8032984278058315.
[I 2025-10-31 17:14:46,349] Trial 8 finished with value: 0.4987450555182276 and parameters: {'encoder_type': 'Binary', 'model': 'Linear'}. Best is trial 7 with value: 0.8032984278058315.
[I 2025-10-31 17:14:53,637] Trial 9 finished with value: 0.750360077726881 and parameters: {'encoder_type': 'Target', 'model': 'Ridge', 'ridge_alpha': 0.20654566547132155, 'ridge_solver': 'lsqr'}. Best is trial 7 with value: 0.8032984278058315.
[I 2025-10-31 17:14:56,059] Trial 10 finished with value: 0.8030610174928235 and parameters: {'encoder_type': 'Target', 'model': 'Linear'}. Best is trial 7 with value: 0.8032984278058315.
[I 2025-10-31 17:15:00,233] Trial 11 finis

🏃 View run adventurous-doe-477 at: https://dagshub.com/akshatsharma2407/AutoNexusMlOps.mlflow/#/experiments/3/runs/83463fda178f4ef8bf156223b2e187c2
🧪 View experiment at: https://dagshub.com/akshatsharma2407/AutoNexusMlOps.mlflow/#/experiments/3


In [68]:
mlflow.set_experiment(experiment_name='Linear Algos with Different Encoding & Scaling technique')
for trial in study.trials:
    with mlflow.start_run(run_name=f"trial_{trial.number}"):
        mlflow.log_params(trial.params)
        
        if trial.value:
            mlflow.log_metric("objective", trial.value)
        
        mlflow.set_tag("state", trial.state.name)
        mlflow.set_tag("trial_number", trial.number)

🏃 View run trial_0 at: https://dagshub.com/akshatsharma2407/AutoNexusMlOps.mlflow/#/experiments/3/runs/6825aa41c73e449ba17f72af34b51c3b
🧪 View experiment at: https://dagshub.com/akshatsharma2407/AutoNexusMlOps.mlflow/#/experiments/3
🏃 View run trial_1 at: https://dagshub.com/akshatsharma2407/AutoNexusMlOps.mlflow/#/experiments/3/runs/754d35ca9c8a411f8fd40193343fb61e
🧪 View experiment at: https://dagshub.com/akshatsharma2407/AutoNexusMlOps.mlflow/#/experiments/3
🏃 View run trial_2 at: https://dagshub.com/akshatsharma2407/AutoNexusMlOps.mlflow/#/experiments/3/runs/5da0c88a4cf4453196649ff3027833d0
🧪 View experiment at: https://dagshub.com/akshatsharma2407/AutoNexusMlOps.mlflow/#/experiments/3
🏃 View run trial_3 at: https://dagshub.com/akshatsharma2407/AutoNexusMlOps.mlflow/#/experiments/3/runs/9fdb170a2cf441709726921e0f95b97f
🧪 View experiment at: https://dagshub.com/akshatsharma2407/AutoNexusMlOps.mlflow/#/experiments/3
🏃 View run trial_4 at: https://dagshub.com/akshatsharma2407/AutoNexu