In [0]:
# !pip install tables --upgrade
# !pip install eli5
# !pip install xgboost

In [1]:
cd 'drive/My Drive/Colab Notebooks/matrix/matrix_two/dw_matrix_car/'

/content/drive/My Drive/Colab Notebooks/matrix/matrix_two/dw_matrix_car


In [0]:
import re
import pandas as pd
import numpy as np

from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

import xgboost as xgb

from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import cross_val_score, KFold

import eli5
from eli5.sklearn import PermutationImportance

#Wczytanie danych

In [53]:
df = pd.read_hdf('data/car.h5')
df.shape

(106494, 155)

In [54]:
# Skupiam się jedynie na samochodach używanych
df = df[df['param_stan'] == 'Używane']
df.shape

(96920, 155)

#Feature Engineering

Druga iteracja

In [0]:
def extract_number(text):
  if text:
    return int(re.findall('[0-9]+', str(text).replace(' ', '').replace(',', ''))[0])
  else:
    return -1

In [0]:
df['param_rok-produkcji'] = df['param_rok-produkcji'].map(lambda x: extract_number(x))

In [0]:
df['param_moc'] = df['param_moc'].map(lambda x: extract_number(x))

In [0]:
# moc > 1000 jest dzielona przez 100
df['param_moc'] = df['param_moc'].map(lambda x: int(x/100) if x > 1000 else x)

In [0]:
df['param_pojemność-skokowa'] = df['param_pojemność-skokowa'].map(lambda x: extract_number(x))

In [0]:
# pojemność-skokowa > 10000 jest dzielona przez 100
df['param_pojemność-skokowa'] = df['param_pojemność-skokowa'].map(lambda x: int(x/100) if x > 10000 else x)

In [0]:
# zamiana euro na pln
df['price_value_pln'] = df.apply(lambda x: x.price_value if x.price_currency == 'EUR' else x.price_value, axis = 1)

In [0]:
SUFFIX_CAT = '__cat'
for feat in df.columns:
  if isinstance(df[feat].iloc[0], list): continue

  factorized_values = df[feat].factorize()[0]
  if SUFFIX_CAT in feat:
    df[feat] = factorized_values
  else:
    df[feat + SUFFIX_CAT] = factorized_values

In [63]:
cat_feats = [x for x in df.columns if SUFFIX_CAT in x]
cat_feats = [x for x in cat_feats if 'price' not in x]

len(cat_feats)

151

In [64]:
feats = [x for x in cat_feats if x not in ['param_rok-produkcji__cat', 'param_moc__cat', 'param_pojemność-skokowa__cat']]
feats += ['param_rok-produkcji', 'param_moc', 'param_pojemność-skokowa']
len(feats)

151

#Model

In [0]:
def run_model(data, model, feats):
  X = data[feats].values
  y = data['price_value_pln'].values

  scores = cross_val_score(model, X, y, cv=3, scoring='neg_mean_absolute_error')
  return np.mean(scores), np.std(scores)

## DecisionTree

In [66]:
model = DecisionTreeRegressor(max_depth=5)
run_model(df, model, feats)

(-12667.415296227156, 36.782121169561286)

##Random Forest

In [67]:
model = RandomForestRegressor(max_depth=5, n_estimators=50, random_state=0)
run_model(df, model, feats)

(-11602.172584545202, 153.77708738604215)

##XGBoost

In [68]:
xgb_params = {
    'max_depth': 5,
    'n_estimators': 50,
    'learning_rate': 0.1,
    'seed': 0,
}

model = xgb.XGBRegressor(**xgb_params)
run_model(df, model, feats)



(-8082.2847899288245, 95.07786098405595)

In [69]:
X = df[feats].values
y = df['price_value_pln'].values
model = xgb.XGBRegressor(**xgb_params)
model.fit(X, y)

imp = PermutationImportance(model, random_state=0).fit(X, y)
eli5.show_weights(imp, feature_names=feats, top=25)



Weight,Feature
0.6031  ± 0.0084,param_rok-produkcji
0.3113  ± 0.0052,param_moc
0.0378  ± 0.0008,param_pojemność-skokowa
0.0220  ± 0.0010,param_marka-pojazdu__cat
0.0142  ± 0.0010,param_napęd__cat
0.0139  ± 0.0011,param_rodzaj-paliwa__cat
0.0128  ± 0.0002,param_typ__cat
0.0127  ± 0.0004,param_uszkodzony__cat
0.0122  ± 0.0003,param_kod-silnika__cat
0.0095  ± 0.0003,seller_name__cat


In [0]:
feats_top = [
'param_rok-produkcji',
'param_moc',
'param_pojemność-skokowa',
'param_marka-pojazdu__cat',
'param_napęd__cat',
'param_rodzaj-paliwa__cat',
'param_typ__cat',
'param_uszkodzony__cat',
'param_kod-silnika__cat',
'seller_name__cat',
'param_wersja__cat',
'param_faktura-vat__cat',
'param_model-pojazdu__cat',
'feature_czujniki-parkowania-przednie__cat',
'param_skrzynia-biegów__cat',
'param_bezwypadkowy__cat',
'param_kraj-pochodzenia__cat',
'feature_kamera-cofania__cat',
'feature_asystent-pasa-ruchu__cat',
'feature_system-start-stop__cat',
'feature_regulowane-zawieszenie__cat',
'param_liczba-miejsc__cat',
'feature_światła-led__cat',
'seller_address__cat',
'feature_nawigacja-gps__cat',
]

In [71]:
model = xgb.XGBRegressor(**xgb_params)
run_model(df, model, feats_top)



(-8077.081254752223, 83.34106719099744)