In [1]:
import pandas as pd
import numpy as np
from lazypredict.Supervised import LazyRegressor
import warnings


from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

import optuna

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, ElasticNet, GammaRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgbm
import catboost as cb



warnings.filterwarnings('ignore')



RS=42
TARGET='price'
N_SPLITS=5


data = './Data/used_car_fe'
df = pd.read_csv(data)
df.head()

Unnamed: 0,year,make,model,trim,mileage,location_state,location_city,num_accidents,num_owners,usage,...,int_color_Black,int_color_Blue,int_color_Brown,int_color_Gray,int_color_Orange,int_color_Red,int_color_Silver,int_color_Unknown,int_color_White,int_color_Yellow
0,1997,BMW,Z3,Roadster 1.9L,89677,CA,Corona,0.0,3,Personal use,...,0,0,0,0,0,0,0,1,0,0
1,1997,BMW,Z3,Roadster 1.9L,76790,IL,McCook,0.0,4,Personal use,...,1,0,0,0,0,0,0,0,0,0
2,1997,Mercedes-Benz,SL,SL 320 Roadster,134000,ND,Dickinson,0.0,3,Personal use,...,0,0,0,0,0,0,0,0,0,0
3,1997,Mercedes-Benz,SL,SL 500 Roadster,156753,IL,Chicago,0.0,9,Personal use,...,0,0,0,0,0,0,0,1,0,0
4,1997,Mercedes-Benz,SL,SL 500 Roadster,76923,AZ,Tempe,0.0,8,Personal use,...,0,0,0,0,0,0,0,1,0,0


In [2]:
trim_counts = df.trim.value_counts()
trim_counts = trim_counts[trim_counts >= 25]  # filter trims with counts >= 25
df_filtered = df[df.trim.isin(trim_counts.index)]  # filter rows with remaining trimm

In [3]:
df_filtered = pd.get_dummies(df_filtered, columns=['make','usage','mileage_binned'])
df = df_filtered

In [4]:
LE = LabelEncoder()

In [5]:
df['trim'] = LE.fit_transform(df['trim'])
df['model'] = LE.fit_transform(df['model'])
df['location_state'] = LE.fit_transform(df['location_state'])
df['location_city'] = LE.fit_transform(df['location_city'])

In [6]:
temp = df[TARGET]
df = df.drop([TARGET], axis=1)
df[TARGET]= temp
df.shape

(27592, 47)

In [7]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [8]:



def rmse(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

# Lets make some inital modeling and define some Quality of Life improving functions.
def print_model_locations(model_list):
    print("-"*80)
    for i,model in enumerate(model_list):
        print(model)
        print(f"{model.__class__.__name__:50} at index {i}")
        print("-"*80)
        
        
        
# K Fold Model Evaluator
def evaluate_model_kf(model, train, target, n_splits=N_SPLITS,  random_state=RS):
    n= 0
    RMSES = []
    kf = KFold(
        n_splits=n_splits, random_state=random_state
    )
    for trn_idx, test_idx in kf.split(train,target):
        X_tr,X_val=train.iloc[trn_idx],train.iloc[test_idx]
        y_tr,y_val=target.iloc[trn_idx],target.iloc[test_idx]
        model.fit(X_tr,y_tr,eval_set=[(X_val,y_val)],early_stopping_rounds=100,verbose=False)
        RMSES.append(rmse(y_val, model.predict(X_val)))
        print(f"fold: {n+1} ==> RMSE: {RMSES[n]}")
        n+=1
    return np.mean(RMSES)


# Model Evaluator
def model_evaluator(model_list):
    print("-"*80)
    for reg in model_list:
        reg_name=reg.__class__.__name__
        score = evaluate_model_kf(reg,X, y, n_splits=N_SPLITS, random_state=RS)
        print(f"Fitting Baseline {reg_name} done")
        print(f"RMSES: {score}")
        print("-"*80)

In [9]:


reg_list=[
    xgb.XGBRegressor(tree_method='gpu_hist'),
    lgbm.LGBMRegressor(metric='rmse'),
    cb.CatBoostRegressor(verbose=0)
]
print_model_locations(reg_list)

--------------------------------------------------------------------------------
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, ...)
XGBRegressor                                       at index 0
--------------------------------------------------------------------------------
LGBMRegressor(metric='rmse')
LGBMRe

In [11]:
%%time
model_evaluator(reg_list)

--------------------------------------------------------------------------------
fold: 1 ==> RMSE: 0.2781232912789587
fold: 2 ==> RMSE: 0.17751371012458025
fold: 3 ==> RMSE: 0.12652306318754816
fold: 4 ==> RMSE: 0.12578896506191303
fold: 5 ==> RMSE: 0.1450093276589908
Fitting Baseline XGBRegressor done
RMSES: 0.17059167146239818
--------------------------------------------------------------------------------
fold: 1 ==> RMSE: 0.3216394140998013
fold: 2 ==> RMSE: 0.18397193889967212
fold: 3 ==> RMSE: 0.15348546392484258
fold: 4 ==> RMSE: 0.13919539561991634
fold: 5 ==> RMSE: 0.1775525027337844
Fitting Baseline LGBMRegressor done
RMSES: 0.19516894305560337
--------------------------------------------------------------------------------
fold: 1 ==> RMSE: 0.31861852713565003
fold: 2 ==> RMSE: 0.1450250116480755
fold: 3 ==> RMSE: 0.10864511641394556
fold: 4 ==> RMSE: 0.12760023250417346
fold: 5 ==> RMSE: 0.1399621931447064
Fitting Baseline CatBoostRegressor done
RMSES: 0.16797021616931018
-

In [12]:
# Declare optuna objective
def objective(trial,data=X,target=y):
    
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.15,random_state=RS)
    param = {
        'tree_method':'gpu_hist',  # this parameter means using the GPU when training our model to speedup the training process
        'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008,0.01,0.012,0.014,0.016,0.018, 0.02]),
        'n_estimators': trial.suggest_categorical('n_estimators', [6000]),
        'max_depth': trial.suggest_categorical('max_depth', [5,7,9,11,13,15,17]),
        'random_state': trial.suggest_categorical('random_state', [RS]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
    }
    model = xgb.XGBRegressor(**param)  
    
    model.fit(train_x,train_y,eval_set=[(test_x,test_y)],early_stopping_rounds=100,verbose=False)
    
    preds = model.predict(test_x)
    
    RMSE = rmse(test_y, preds)
    
    return RMSE

In [13]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30, gc_after_trial=True)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
print(f"Best RMSE: {study.best_value}")

[32m[I 2023-03-20 00:48:41,606][0m A new study created in memory with name: no-name-c4aab506-d566-469a-8e2f-662536020672[0m
[32m[I 2023-03-20 00:48:48,800][0m Trial 0 finished with value: 0.09671211332825916 and parameters: {'lambda': 1.0663419530723752, 'alpha': 0.6059350528571593, 'colsample_bytree': 0.6, 'subsample': 0.7, 'learning_rate': 0.02, 'n_estimators': 6000, 'max_depth': 11, 'random_state': 42, 'min_child_weight': 37}. Best is trial 0 with value: 0.09671211332825916.[0m
[32m[I 2023-03-20 00:49:02,640][0m Trial 1 finished with value: 0.10965333696247077 and parameters: {'lambda': 0.10038736715669085, 'alpha': 9.42574995917851, 'colsample_bytree': 0.9, 'subsample': 0.6, 'learning_rate': 0.008, 'n_estimators': 6000, 'max_depth': 11, 'random_state': 42, 'min_child_weight': 16}. Best is trial 0 with value: 0.09671211332825916.[0m
[32m[I 2023-03-20 00:49:20,904][0m Trial 2 finished with value: 0.10504389921882995 and parameters: {'lambda': 0.007063889571092111, 'alpha':

Number of finished trials: 30
Best trial: {'lambda': 0.03886751006294093, 'alpha': 0.21057654805358594, 'colsample_bytree': 0.6, 'subsample': 1.0, 'learning_rate': 0.018, 'n_estimators': 6000, 'max_depth': 17, 'random_state': 42, 'min_child_weight': 25}
Best RMSE: 0.09400678594181001
