In [0]:
# !pip install tables --upgrade
# !pip install eli5
# !pip install xgboost
# !pip install hyperopt

In [3]:
cd 'drive/My Drive/Colab Notebooks/matrix/matrix_two/dw_matrix_car/'

/content/drive/My Drive/Colab Notebooks/matrix/matrix_two/dw_matrix_car


In [0]:
import re
import pandas as pd
import numpy as np

import xgboost as xgb

from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import cross_val_score, KFold

from hyperopt import hp, fmin, tpe, STATUS_OK

import eli5
from eli5.sklearn import PermutationImportance

#Wczytanie danych

In [6]:
df = pd.read_hdf('data/car.h5')
df.shape

(106494, 155)

In [7]:
# Skupiam się jedynie na samochodach używanych
df = df[df['param_stan'] == 'Używane']
df.shape

(96920, 155)

#Feature Engineering

Druga iteracja

In [0]:
def extract_number(text):
  if text:
    return int(re.findall('[0-9]+', str(text).replace(' ', '').replace(',', ''))[0])
  else:
    return -1

In [0]:
df['param_rok-produkcji'] = df['param_rok-produkcji'].map(lambda x: extract_number(x))

In [0]:
df['param_moc'] = df['param_moc'].map(lambda x: extract_number(x))

In [0]:
# moc > 1000 jest dzielona przez 100
df['param_moc'] = df['param_moc'].map(lambda x: int(x/100) if x > 1000 else x)

In [0]:
df['param_pojemność-skokowa'] = df['param_pojemność-skokowa'].map(lambda x: extract_number(x))

In [0]:
# pojemność-skokowa > 10000 jest dzielona przez 100
df['param_pojemność-skokowa'] = df['param_pojemność-skokowa'].map(lambda x: int(x/100) if x > 10000 else x)

In [0]:
# zamiana euro na pln
df['price_value_pln'] = df.apply(lambda x: x.price_value if x.price_currency == 'EUR' else x.price_value, axis = 1)

In [0]:
SUFFIX_CAT = '__cat'
for feat in df.columns:
  if isinstance(df[feat].iloc[0], list): continue

  factorized_values = df[feat].factorize()[0]
  if SUFFIX_CAT in feat:
    df[feat] = factorized_values
  else:
    df[feat + SUFFIX_CAT] = factorized_values

In [16]:
cat_feats = [x for x in df.columns if SUFFIX_CAT in x]
cat_feats = [x for x in cat_feats if 'price' not in x]

len(cat_feats)

151

In [17]:
feats = [x for x in cat_feats if x not in ['param_rok-produkcji__cat', 'param_moc__cat', 'param_pojemność-skokowa__cat']]
feats += ['param_rok-produkcji', 'param_moc', 'param_pojemność-skokowa']
len(feats)

151

#XGBoost

In [0]:
def run_model(data, model, feats):
  X = data[feats].values
  y = data['price_value_pln'].values

  scores = cross_val_score(model, X, y, cv=3, scoring='neg_mean_absolute_error')
  return np.mean(scores), np.std(scores)

In [19]:
feats_top = [
'param_rok-produkcji',
'param_moc',
'param_pojemność-skokowa',
'param_marka-pojazdu__cat',
'param_napęd__cat',
'param_rodzaj-paliwa__cat',
'param_typ__cat',
'param_uszkodzony__cat',
'param_kod-silnika__cat',
'seller_name__cat',
'param_wersja__cat',
'param_faktura-vat__cat',
'param_model-pojazdu__cat',
'feature_czujniki-parkowania-przednie__cat',
'param_skrzynia-biegów__cat',
'param_bezwypadkowy__cat',
'param_kraj-pochodzenia__cat',
'feature_kamera-cofania__cat',
'feature_asystent-pasa-ruchu__cat',
'feature_system-start-stop__cat',
'feature_regulowane-zawieszenie__cat',
'param_liczba-miejsc__cat',
'feature_światła-led__cat',
'seller_address__cat',
'feature_nawigacja-gps__cat',
]

xgb_params = {
    'max_depth': 5,
    'n_estimators': 50,
    'learning_rate': 0.1,
    'seed': 0,
    'objective': 'reg:squarederror',
    'tree_method': 'gpu_hist',
}

model = xgb.XGBRegressor(**xgb_params)
run_model(df, model, feats_top)

(-8017.078059361083, 64.18769375618292)

# Hyperopt

In [20]:
def obj_func(params):
  print("Training with params: ")
  print(params)

  mean_mae,score_std = run_model(df, xgb.XGBRegressor(**params), feats_top)

  print('MAE: {}'.format(np.abs(mean_mae)))
  print('---------------- \n')

  return {'loss': np.abs(mean_mae), 'status': STATUS_OK}

# space
xgb_reg_params = {
    'learning_rate': hp.choice('learning_rate', np.arange(0.05, 0.31, 0.05)),
    'max_depth': hp.choice('max_depth', np.arange(5,16,1, dtype=int)),
    'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
    'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
    'n_estimators': hp.choice('n_estimators', np.arange(60,140,1, dtype=int)),
    'objective': 'reg:squarederror',
    'tree_method': 'gpu_hist',
    'seed': 0,
}

# run
best = fmin(obj_func, xgb_reg_params, algo=tpe.suggest, max_evals=30, return_argmin=False)

print('---------------- \n')

best

Training with params: 
{'colsample_bytree': 0.6000000000000001, 'learning_rate': 0.25, 'max_depth': 12, 'n_estimators': 114, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.6000000000000001, 'tree_method': 'gpu_hist'}
MAE: 7992.9620936197825
---------------- 

Training with params: 
{'colsample_bytree': 0.6000000000000001, 'learning_rate': 0.3, 'max_depth': 6, 'n_estimators': 71, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.8500000000000001, 'tree_method': 'gpu_hist'}
MAE: 7561.473382126637
---------------- 

Training with params: 
{'colsample_bytree': 0.9, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 104, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.8, 'tree_method': 'gpu_hist'}
MAE: 6882.390526942246
---------------- 

Training with params: 
{'colsample_bytree': 0.55, 'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 61, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.8, 'tree_method': 'gpu_hist'}
MAE: 9376.056787191328


{'colsample_bytree': 1.0,
 'learning_rate': 0.05,
 'max_depth': 13,
 'n_estimators': 124,
 'objective': 'reg:squarederror',
 'seed': 0,
 'subsample': 0.6000000000000001,
 'tree_method': 'gpu_hist'}

# Final

In [21]:
feats_top = [
'param_rok-produkcji',
'param_moc',
'param_pojemność-skokowa',
'param_marka-pojazdu__cat',
'param_rodzaj-paliwa__cat',
'param_napęd__cat',
'param_uszkodzony__cat',
'param_typ__cat',
'param_kod-silnika__cat',
'seller_name__cat',
'param_wersja__cat',
'param_faktura-vat__cat',
'feature_czujniki-parkowania-przednie__cat',
'param_model-pojazdu__cat',
'param_skrzynia-biegów__cat',
'param_bezwypadkowy__cat',
'feature_kamera-cofania__cat',
'param_kraj-pochodzenia__cat',
'feature_asystent-pasa-ruchu__cat',
'feature_łopatki-zmiany-biegów__cat',
'seller_address__cat',
'feature_system-start-stop__cat',
'param_liczba-miejsc__cat',
'feature_regulowane-zawieszenie__cat',
'feature_nawigacja-gps__cat',
]

xgb_params = {
 'colsample_bytree': 1.0,
 'learning_rate': 0.05,
 'max_depth': 13,
 'n_estimators': 124,
 'objective': 'reg:squarederror',
 'seed': 0,
 'subsample': 0.6000000000000001,
 'tree_method': 'gpu_hist'}

model = xgb.XGBRegressor(**xgb_params)
run_model(df, model, feats_top)

(-6502.608094013108, 91.81094865928064)