In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler,LabelEncoder
import optuna
from sklearn.metrics import mean_squared_error
import xgboost as xgb

cat_types = ["model", "brand", "ext_col", "int_col", "accident", 
             "clean_title", "body_style",
             'engine','fuel_type']
df = pd.read_csv('cars_train_enriched_acc_noassumption.csv')
df['miles_per_year'] = df['milage']
df['miles_per_year'] = df.apply(lambda x: x['miles_per_year'] / x['age'] if x['age']>0 else 0, axis=1)
df = df.astype({col: "category" for col in cat_types})

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
df

Unnamed: 0,brand,model,model_year,fuel_type,engine,ext_col,int_col,accident,clean_title,price,body_style,age,reliability,adjusted_msrp,miles_per_year
0,mini,coopersbase,2007,gasoline,1720hp16l4cylinderenginegasolinefuel,yellow,gray,nonereported,yes,4200,hatchback,17,0.215360,38563.525691,12529.411765
1,lincoln,lsv8,2002,gasoline,2520hp39l8cylinderenginegasolinefuel,silver,beige,atleast1accidentordamagereported,yes,4999,sedan,22,-0.432958,65723.566907,6511.363636
2,chevrolet,silverado2500lt,2002,e85flexfuel,3200hp53l8cylinderengineflexfuelcapability,blue,gray,nonereported,yes,13900,truck,22,0.461481,47927.808775,6215.045455
3,genesis,g9050ultimate,2017,gasoline,4200hp50l8cylinderenginegasolinefuel,black,black,nonereported,yes,45000,sedan,7,0.781159,87971.248994,2785.714286
4,mercedesbenz,metrisbase,2021,gasoline,2080hp20l4cylinderenginegasolinefuel,black,beige,nonereported,yes,97500,cargovan,3,-0.469551,36466.517539,2462.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188528,cadillac,escaladeesvplatinum,2017,gasoline,4200hp62l8cylinderenginegasolinefuel,white,beige,nonereported,yes,27500,suv,7,0.105448,106920.384453,7000.000000
188529,mercedesbenz,amgc43amgc434matic,2018,gasoline,3850hp30lv6cylinderenginegasolinefuel,white,black,atleast1accidentordamagereported,yes,30000,sedan,6,-0.469551,66911.638768,4766.666667
188530,mercedesbenz,amgglc63base4matic,2021,gasoline,4690hp40l8cylinderenginegasolinefuel,white,black,nonereported,yes,86900,suv,3,-0.469551,87535.906230,4550.000000
188531,audi,s530tprestige,2022,gasoline,30l,white,black,nonereported,no,84900,coupe,2,-0.933362,65937.618342,6947.500000


In [7]:
import xgboost as xgb

y = df['price']
X = df.drop(['price'], axis=1)
feature_importances = []
threshold_opt_cats = ["model", "ext_col", "accident", 
             "clean_title", "body_style",
             'engine','fuel_type']
def objective(trial):
    # Suggest hyperparameters for tuning
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'booster': 'gbtree',
        'n_estimators': trial.suggest_int('n_estimators', 50, 1200),
        'eta':0.3, #trial.suggest_float('eta', 0.0001, 0.5, log = True),  # learning rate
        'max_depth': 7, #trial.suggest_int('max_depth', 3, 8),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-4, 10, log = True),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.05, 1.0),
        'lambda': trial.suggest_float('lambda', 0.1, 50, log = True),
        'alpha': trial.suggest_float('alpha', 1e-4, 10, log = True),
        'tree_method': 'hist',  
        'device':'cuda'
    }
    threshold = {cat:trial.suggest_int(f'{cat}_threshold', 1, 1000) for cat in cat_types}
    threshold['brand'] = 1
    threshold['int_col'] = 1
    
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.15, random_state=1219)
    

    for cat in cat_types:
        value_counts = X_train[cat].value_counts().to_dict()
        X_train[cat] = X_train[cat].apply(lambda x: x if value_counts[x] > threshold[cat] else "unknown")
        X_valid[cat] = X_valid[cat].apply(lambda x: x if (x in value_counts) and (value_counts[x] > threshold[cat]) else "unknown")

    X_train = X_train.astype({col: "category" for col in cat_types})
    X_valid = X_valid.astype({col: "category" for col in cat_types})
    
    dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
    dvalid = xgb.DMatrix(X_valid, label=y_valid, enable_categorical=True)
    
    # Train the model
    model = xgb.train(params, dtrain, evals=[(dvalid, 'validation')], num_boost_round=1500, early_stopping_rounds=35, verbose_eval=False)
    feature_importances.append(model.get_score(importance_type='gain'))  # get feature importance
    # Predict on the validation set
    y_pred_valid = model.predict(dvalid)
    
    # Calculate RMSE on the validation set
    rmse = mean_squared_error(y_valid, y_pred_valid, squared=False)
    
    return rmse



study = optuna.create_study(sampler = optuna.samplers.GPSampler(), direction='minimize')
study.optimize(objective, n_trials=50)


best_params = study.best_params
print(f"Best hyperparameters: {best_params}")

print("Parameter importance:\n", optuna.importance.get_param_importances(study))


  study = optuna.create_study(sampler = optuna.samplers.GPSampler(), direction='minimize')
[I 2024-09-26 18:55:17,720] A new study created in memory with name: no-name-57760fa9-6131-4b83-9176-b4c04a7856ae
Parameters: { "n_estimators" } are not used.

[I 2024-09-26 18:55:18,882] Trial 0 finished with value: 83323.4427389309 and parameters: {'n_estimators': 1188, 'min_child_weight': 2.063692996459721, 'subsample': 0.32836061630749247, 'colsample_bytree': 0.7487972365209301, 'lambda': 8.383793665032995, 'alpha': 0.00022390733975486123, 'model_threshold': 435, 'brand_threshold': 119, 'ext_col_threshold': 366, 'int_col_threshold': 626, 'accident_threshold': 85, 'clean_title_threshold': 582, 'body_style_threshold': 461, 'engine_threshold': 196, 'fuel_type_threshold': 790}. Best is trial 0 with value: 83323.4427389309.
Parameters: { "n_estimators" } are not used.

[I 2024-09-26 18:55:20,024] Trial 1 finished with value: 82091.09269780238 and parameters: {'n_estimators': 1027, 'min_child_weigh

NameError: name 'cat' is not defined

In [9]:
f_importances = {feature: sum([fi[feature] for fi in feature_importances if feature in fi])/100 for feature in feature_importances[0]}
total = sum(f_importances.values())
f_importances_standard = {feature: f_importances[feature]/total for feature in f_importances}
print("Feature importance: \n", f_importances_standard)

Feature importance: 
 {'brand': 0.07200130657595478, 'model': 0.03599074756328742, 'model_year': 0.10793240908349448, 'fuel_type': 0.022526260745851383, 'engine': 0.057558793656742706, 'ext_col': 0.02189535043553052, 'int_col': 0.07208678545350937, 'accident': 0.08620985953568445, 'clean_title': 0.33842225132696374, 'body_style': 0.028619161087551467, 'age': 0.03034336200104892, 'reliability': 0.011735095806003293, 'adjusted_msrp': 0.06584348721771022, 'miles_per_year': 0.048835129510667215}


In [8]:
optuna.visualization.plot_param_importances(study)

In [2]:
import xgboost as xgb

drop_cols = ['id', 'price_diff', 'adjusted_price_diff', 'transmission', 'full_name', 'brand_model']
y = df['price']
X = df.drop(['price'], axis=1).drop(drop_cols, axis=1)
feature_importances = []
threshold_opt_cats = ["model", "ext_col", "accident", 
             "clean_title", "body_style",
             'engine','fuel_type', 'int_col', 'brand']
def objective(trial):
    # Suggest hyperparameters for tuning
    params = {
        'include_mileage': trial.suggest_int('include_mileage', 0, 1),
        'include_msrp': trial.suggest_int('include_msrp', 0, 1),
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'booster': 'gbtree',
        'n_estimators': trial.suggest_int('n_estimators', 50, 1200),
        'eta':trial.suggest_float('eta', 0.0001, 0.5, log = True),  # learning rate
        'max_depth': trial.suggest_int('max_depth', 4, 8),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-5, 100, log = True),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.05, 1.0),
        'lambda': trial.suggest_float('lambda', 0.1, 1000, log = True),
        'alpha': trial.suggest_float('alpha', 1e-4, 100, log = True),
        'tree_method': 'hist',  
        'device':'cpu'
    }
    threshold = {cat:trial.suggest_int(f'{cat}_threshold', 1, 1000) for cat in threshold_opt_cats}

    
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.15, random_state=1219)
    

    for cat in cat_types:
        value_counts = X_train[cat].value_counts().to_dict()
        X_train[cat] = X_train[cat].apply(lambda x: x if value_counts[x] > threshold[cat] else "unknown")
        X_valid[cat] = X_valid[cat].apply(lambda x: x if (x in value_counts) and (value_counts[x] > threshold[cat]) else "unknown")

    X_train = X_train.astype({col: "category" for col in cat_types})
    X_valid = X_valid.astype({col: "category" for col in cat_types})

    if params['include_mileage'] == 1:
        X_train.drop(['milage'], axis=1, inplace=True)
        X_valid.drop(['milage'], axis=1, inplace=True)
    
    if params['include_msrp'] == 1:
        X_train.drop(['msrp'], axis=1, inplace=True)
        X_valid.drop(['msrp'], axis=1, inplace=True)
    

    dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
    dvalid = xgb.DMatrix(X_valid, label=y_valid, enable_categorical=True)
    
    # Train the model
    model = xgb.train(params, dtrain, evals=[(dvalid, 'validation')], num_boost_round=1500, early_stopping_rounds=35, verbose_eval=False)
    feature_importances.append(model.get_score(importance_type='gain'))  # get feature importance
    # Predict on the validation set
    y_pred_valid = model.predict(dvalid)
    
    # Calculate RMSE on the validation set
    rmse = mean_squared_error(y_valid, y_pred_valid, squared=False)
    
    return rmse



study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=250)


best_params = study.best_params
print(f"Best hyperparameters: {best_params}")

print("Parameter importance:\n", optuna.importance.get_param_importances(study))


[I 2024-09-26 19:39:27,649] A new study created in memory with name: no-name-659e1cfa-489a-4d51-a0aa-cd09d5af9b73
Parameters: { "include_mileage", "include_msrp", "n_estimators" } are not used.

[I 2024-09-26 19:39:43,762] Trial 0 finished with value: 80961.05602314687 and parameters: {'include_mileage': 0, 'include_msrp': 1, 'n_estimators': 583, 'eta': 0.004934075484535698, 'max_depth': 7, 'min_child_weight': 9.076982728045543e-05, 'subsample': 0.46898673043348904, 'colsample_bytree': 0.4798163301945596, 'lambda': 1.207834293296568, 'alpha': 0.0007443984556959671, 'model_threshold': 286, 'ext_col_threshold': 959, 'accident_threshold': 428, 'clean_title_threshold': 911, 'body_style_threshold': 842, 'engine_threshold': 67, 'fuel_type_threshold': 327, 'int_col_threshold': 177, 'brand_threshold': 911}. Best is trial 0 with value: 80961.05602314687.
Parameters: { "include_mileage", "include_msrp", "n_estimators" } are not used.

[I 2024-09-26 19:40:10,959] Trial 1 finished with value: 8240

KeyboardInterrupt: 

In [None]:
def objective(trial):
    # Suggest hyperparameters for tuning
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'booster': 'gbtree',
        'n_estimators': trial.suggest_int('n_estimators', 50, 1200),
        'eta': trial.suggest_float('eta', 0.0001, 0.5, log = True),  # learning rate
        'max_depth': 7,
        'min_child_weight': 6.375,
        'subsample': 0.15,
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.05, 1.0),
        'lambda': trial.suggest_float('lambda', 0.1, 50, log = True),
        'alpha': trial.suggest_float('alpha', 1e-4, 10, log = True),
        'tree_method': 'hist',  
        'device':'cpu'
    }
    threshold = trial.suggest_int('threshold', 1, 500)

    y = df['price']
    X = df.drop(['price'], axis=1)
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.15, random_state=1219)
    

    for cat in cat_types:
        value_counts = X_train[cat].value_counts().to_dict()
        X_train[cat] = X_train[cat].apply(lambda x: x if value_counts[x] > threshold else "unknown")
        X_valid[cat] = X_valid[cat].apply(lambda x: x if (x in value_counts) and (value_counts[x] > threshold) else "unknown")

    X_train = X_train.astype({col: "category" for col in cat_types})
    X_valid = X_valid.astype({col: "category" for col in cat_types})
    dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
    dvalid = xgb.DMatrix(X_valid, label=y_valid, enable_categorical=True)
    
    # Train the model
    model = xgb.train(params, dtrain, evals=[(dvalid, 'validation')], num_boost_round=params['n_estimators'], early_stopping_rounds=35, verbose_eval=False)
    
    # Predict on the validation set
    y_pred_valid = model.predict(dvalid)
    
    # Calculate RMSE on the validation set
    rmse = mean_squared_error(y_valid, y_pred_valid, squared=False)
    
    return rmse



study = optuna.create_study(sampler = optuna.samplers.GPSampler(), direction='minimize')
study.optimize(objective, n_trials=250)


best_params = study.best_params
print(f"Best hyperparameters: {best_params}")



[I 2024-09-22 20:48:21,679] Trial 63 finished with value: 81080.7867679816 and parameters: {'n_estimators': 1200, 'eta': 0.0044651579940966895, 'colsample_bytree': 1.0, 'lambda': 0.10000000000000002, 'alpha': 0.00010000000000000009, 'threshold': 354}. Best is trial 57 with value: 80727.7368290466.

Parameters: { "n_estimators" } are not used.


[I 2024-09-22 20:48:27,428] Trial 64 finished with value: 80949.60388861551 and parameters: {'n_estimators': 665, 'eta': 0.032942769716334114, 'colsample_bytree': 1.0, 'lambda': 49.99999999999999, 'alpha': 0.00010000000000000009, 'threshold': 277}. Best is trial 57 with value: 80727.7368290466.

Parameters: { "n_estimators" } are not used.


[I 2024-09-22 20:48:46,210] Trial 65 finished with value: 80879.47822443333 and parameters: {'n_estimators': 1200, 'eta': 0.008446401126167765, 'colsample_bytree': 0.594887052589133, 'lambda': 49.99999999999999, 'alpha': 9.999999999999993, 'threshold': 500}. Best is trial 57 with value: 80727.7368290466.



KeyboardInterrupt: 

In [12]:
import xgboost as xgb
best_params = {'include_mileage': 0, 'include_msrp': 1, 'n_estimators': 994, 'eta': 0.015373037895620294, 'max_depth': 5, 'min_child_weight': 0.11357559673815384, 'subsample': 0.9793735367721236, 'colsample_bytree': 0.3377000630669105, 'lambda': 43.146286704054816, 'alpha': 46.88655118854743, 'model_threshold': 573, 'ext_col_threshold': 454, 'accident_threshold': 94, 'clean_title_threshold': 1, 'body_style_threshold': 485, 'engine_threshold': 56, 'fuel_type_threshold': 741, 'int_col_threshold': 886, 'brand_threshold': 909}
best_params['objective'] = 'reg:squarederror'
best_params['eval_metric'] = 'rmse'
best_params['device'] = 'cpu'

drop_cols = ['id', 'price_diff', 'adjusted_price_diff', 'transmission', 'full_name', 'brand_model', 'msrp']
y = df['price']
X = df.drop(['price'], axis=1).drop(drop_cols, axis=1)


drop_cols_test = ['transmission', 'full_name', 'brand_model', 'msrp']
dt = pd.read_csv('cars_test_enriched_acc_noassumption.csv')
dt['miles_per_year'] = dt['milage']
dt['miles_per_year'] = dt.apply(lambda x: x['miles_per_year'] / x['age'] if x['age']>0 else 0, axis=1)
dt = dt.astype({col: "category" for col in cat_types})
dt.drop(drop_cols_test, axis=1, inplace=True)

for cat in cat_types:
        value_counts = X[cat].value_counts().to_dict()
        X[cat] = X[cat].apply(lambda x: x if value_counts[x] > best_params[f'{cat}_threshold'] else "unknown")
        dt[cat] = dt[cat].apply(lambda x: x if (x in value_counts) and (value_counts[x] > best_params[f'{cat}_threshold']) else "unknown")

X = X.astype({col: "category" for col in cat_types})

dtrain = xgb.DMatrix(X, label=y, enable_categorical=True)
model = xgb.train(best_params, dtrain, num_boost_round=best_params['n_estimators'])

Parameters: { "accident_threshold", "body_style_threshold", "brand_threshold", "clean_title_threshold", "engine_threshold", "ext_col_threshold", "fuel_type_threshold", "include_mileage", "include_msrp", "int_col_threshold", "model_threshold", "n_estimators" } are not used.



In [15]:

ids = dt.pop('id')
dt = dt.astype({col: "category" for col in cat_types})
dtest = xgb.DMatrix(dt, enable_categorical=True)  
pred = model.predict(dtest)



# Create a submission DataFrame
submission_df = pd.DataFrame({
    'id': ids,
    'price': pred
})

# Save the submission DataFrame to a CSV file
submission_df.to_csv('submission11.csv', index=False)
