<a href="https://colab.research.google.com/github/aranjaka/dw_matrix_car/blob/master/day5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install --upgrade tables
!pip install eli5
!pip install xgboost
!pip install hyperopt

Requirement already up-to-date: tables in /usr/local/lib/python3.6/dist-packages (3.6.1)


In [0]:
import pandas as pd
import numpy as np

import xgboost as xgb

from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import cross_val_score

from hyperopt import hp, fmin, tpe, STATUS_OK

import eli5
from eli5.sklearn import PermutationImportance

In [0]:
!pwd

/content


In [0]:
cd '/content/drive/My Drive/Colab Notebooks/matrix/matrix_two/dw_matrix_car'

/content/drive/My Drive/Colab Notebooks/matrix/matrix_two/dw_matrix_car


In [0]:
df = pd.read_hdf('data/car.h5')
df.shape

(106494, 155)

# Feature engineering

In [0]:
SUFFIX_CAT = '_cat'
for feat in df.columns:
  if isinstance(df[feat][0], list): continue

  factorized_values = df[feat].factorize()[0]
  if SUFFIX_CAT in feat:
    df[feat] = factorized_values
  else:
    df[feat + SUFFIX_CAT] = factorized_values

In [0]:
df['param_rok-produkcji'] = df['param_rok-produkcji'].map(lambda x: -1 if str(x) == 'None' else int(x) )
df['param_moc'] = df['param_moc'].map(lambda x: -1 if str(x) == 'None' else int(str(x).replace(' ', '').replace('KM','')) )
df['param_pojemność-skokowa'] = df['param_pojemność-skokowa'].map(lambda x: -1 if str(x) == 'None' else int(str(x).replace(' ', '').replace('cm3','')) )

feats = ['param_napęd_cat','param_stan_cat','param_rok-produkcji','param_faktura-vat_cat','param_moc','param_skrzynia-biegów_cat','param_marka-pojazdu_cat','feature_kamera-cofania_cat','param_typ_cat','param_pojemność-skokowa','seller_name_cat','param_wersja_cat','feature_wspomaganie-kierownicy_cat','param_model-pojazdu_cat','feature_system-start-stop_cat','param_kod-silnika_cat','feature_asystent-pasa-ruchu_cat','feature_łopatki-zmiany-biegów_cat','feature_światła-led_cat','feature_czujniki-parkowania-przednie_cat']

In [0]:
def run_model(model, feats):
  X = df[feats].values
  y = df['price_value'].values

  scores = cross_val_score(model, X, y, cv=3, scoring='neg_mean_absolute_error')
  return np.mean(scores), np.std(scores)

In [0]:
xgb_params = {
    'max_depth': 5,
    'n_estimators': 50,
    'learning_rate': 0.1,
    'seed': 0
}


run_model(xgb.XGBRegressor(**xgb_params), feats )



(-9610.522516153344, 90.20631125485501)

# Hyperopt

In [0]:
def obj_func(params):
  print("Training with params: ")
  print(params)

  mean_mae, score_std = run_model(xgb.XGBRegressor(**params), feats)

  return {'loss': np.abs(mean_mae), 'status': STATUS_OK }

# space
xgb_reg_params = {
     'learning_rate': hp.choice('learning_rate', np.arange(0.05, 0.31, 0.05)),
     'max_depth': hp.choice('max_depth', np.arange(5, 16, 1, dtype=int)),
     'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
     'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
     'objective': 'reg:linear',
     'n_estimators': 100,
     'seed': 0,
  }

# run
best = fmin(obj_func, xgb_reg_params, algo=tpe.suggest, max_evals=25)

best

Training with params: 
{'colsample_bytree': 0.55, 'learning_rate': 0.2, 'max_depth': 14, 'n_estimators': 100, 'objective': 'reg:linear', 'seed': 0, 'subsample': 0.75}
Training with params: 
{'colsample_bytree': 1.0, 'learning_rate': 0.15000000000000002, 'max_depth': 15, 'n_estimators': 100, 'objective': 'reg:linear', 'seed': 0, 'subsample': 0.8}
Training with params: 
{'colsample_bytree': 0.9, 'learning_rate': 0.05, 'max_depth': 8, 'n_estimators': 100, 'objective': 'reg:linear', 'seed': 0, 'subsample': 0.65}
Training with params: 
{'colsample_bytree': 0.6000000000000001, 'learning_rate': 0.25, 'max_depth': 13, 'n_estimators': 100, 'objective': 'reg:linear', 'seed': 0, 'subsample': 0.7000000000000001}
Training with params: 
{'colsample_bytree': 0.55, 'learning_rate': 0.05, 'max_depth': 11, 'n_estimators': 100, 'objective': 'reg:linear', 'seed': 0, 'subsample': 0.8500000000000001}
Training with params: 
{'colsample_bytree': 0.9500000000000001, 'learning_rate': 0.05, 'max_depth': 6, 'n_es

{'colsample_bytree': 0.75,
 'learning_rate': 1,
 'max_depth': 10,
 'subsample': 1.0}

In [0]:
!git add day4.ipynb

In [0]:
!git config --global user.email "aranjaka@gmail.com"
!git config --global user.name "Marcin"
!git commit -m "day_4 xgboost"

[master 98c8085] day_4 xgboost
 1 file changed, 1 insertion(+)
 create mode 100644 day4.ipynb


In [0]:
!git push -u origin master

Counting objects: 1   Counting objects: 3, done.
Delta compression using up to 2 threads.
Compressing objects:  33% (1/3)   Compressing objects:  66% (2/3)   Compressing objects: 100% (3/3)   Compressing objects: 100% (3/3), done.
Writing objects:  33% (1/3)   Writing objects:  66% (2/3)   Writing objects: 100% (3/3)   Writing objects: 100% (3/3), 4.98 KiB | 1.66 MiB/s, done.
Total 3 (delta 1), reused 0 (delta 0)
remote: Resolving deltas:   0% (0/1)[Kremote: Resolving deltas: 100% (1/1)[Kremote: Resolving deltas: 100% (1/1), completed with 1 local object.[K
To https://github.com/aranjaka/dw_matrix_car.git
   55609ce..98c8085  master -> master
Branch 'master' set up to track remote branch 'master' from 'origin'.
