In [0]:
# !pip install tables --upgrade
# !pip install eli5

In [283]:
cd 'drive/My Drive/Colab Notebooks/matrix/matrix_two/dw_matrix_car/'

[Errno 2] No such file or directory: 'drive/My Drive/Colab Notebooks/matrix/matrix_two/dw_matrix_car/'
/content/drive/My Drive/Colab Notebooks/matrix/matrix_two/dw_matrix_car


In [0]:
import pandas as pd
import numpy as np

from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import cross_val_score

import eli5
from eli5.sklearn import PermutationImportance

# Wczytywanie danych

In [285]:
df = pd.read_hdf('data/car.h5')
df.shape

(106494, 155)

In [286]:
df.columns

Index(['breadcrumb', 'created_at', 'price_currency', 'price_details',
       'price_value', 'seller_address', 'seller_name', 'seller_type',
       'feature_czujniki-parkowania-przednie',
       'feature_poduszka-powietrzna-chroniąca-kolana',
       ...
       'param_pearl', 'param_stan', 'param_wersja', 'param_emisja-co2',
       'param_body-type', 'param_matowy', 'param_bezwypadkowy',
       'param_akryl-(niemetalizowany)', 'param_monthly-payment-value',
       'car_id'],
      dtype='object', length=155)

# Dummy Model

In [287]:
df.select_dtypes(np.number).columns

Index(['price_value', 'car_id'], dtype='object')

In [288]:
feats = ['car_id']
X = df[ feats ].values
y = df['price_value'].values

model = DummyRegressor()
model.fit(X, y)
y_pred = model.predict(X)

mae(y, y_pred)

39465.934630440985

In [289]:
[x for x in df.columns if 'price' in x]

['price_currency', 'price_details', 'price_value']

In [290]:
df['price_currency'].value_counts(normalize=True)*100

PLN    99.80844
EUR     0.19156
Name: price_currency, dtype: float64

In [0]:
df['price_value_pln'] = df.apply(lambda x: x.price_value if x.price_currency == 'EUR' else x.price_value, axis = 1)

In [292]:
# wyrzucam 1% najdroższych samochodów (dopuszczam np. błąd przy wprowadzaniu danych i 90000 robi się 900000)
np.quantile(df['price_value_pln'], 0.99)

339990.0

In [0]:
df = df[df['price_value_pln'] <= 339990]

# Features

In [0]:
SUFFIX_CAT = '__cat'
for feat in df.columns:
  if isinstance(df[feat][0], list): continue

  factorized_values = df[feat].factorize()[0]
  if SUFFIX_CAT in feat:
    df[feat] = factorized_values
  else:
    df[feat + SUFFIX_CAT] = factorized_values

In [295]:
cat_feats = [x for x in df.columns if SUFFIX_CAT in x]
cat_feats = [x for x in cat_feats if 'price' not in x]
len(cat_feats)

151

# Simple Model

In [296]:
X = df[cat_feats].values
y = df['price_value_pln'].values

model = DecisionTreeRegressor(max_depth=5)
scores = cross_val_score(model, X, y, cv=3, scoring='neg_mean_absolute_error')
np.mean(scores), np.std(scores)

(-16248.783340189138, 75.15865682409559)

In [297]:
m = DecisionTreeRegressor(max_depth=5)
m.fit(X, y)

imp = PermutationImportance(m, random_state=0).fit(X, y)
eli5.show_weights(imp, feature_names=cat_feats)

Weight,Feature
0.1737  ± 0.0043,param_skrzynia-biegów__cat
0.1615  ± 0.0020,param_napęd__cat
0.1498  ± 0.0032,param_faktura-vat__cat
0.1175  ± 0.0024,param_rok-produkcji__cat
0.0924  ± 0.0031,param_stan__cat
0.0551  ± 0.0007,feature_światła-led__cat
0.0437  ± 0.0025,feature_kamera-cofania__cat
0.0153  ± 0.0002,feature_bluetooth__cat
0.0047  ± 0.0004,feature_klimatyzacja-manualna__cat
0.0044  ± 0.0003,feature_system-start-stop__cat


Logarytm ze zmniennej zależnej

In [302]:
X = df[cat_feats].values
y = np.log(df['price_value_pln'].values)

m = DecisionTreeRegressor(max_depth=5)
m.fit(X, y)

imp = PermutationImportance(m, random_state=0).fit(X, y)
eli5.show_weights(imp, feature_names=cat_feats)

Weight,Feature
0.1901  ± 0.0028,param_rok-produkcji__cat
0.1598  ± 0.0010,feature_bluetooth__cat
0.1375  ± 0.0014,param_faktura-vat__cat
0.0855  ± 0.0012,param_skrzynia-biegów__cat
0.0438  ± 0.0016,feature_tempomat__cat
0.0349  ± 0.0003,param_stan__cat
0.0238  ± 0.0007,feature_światła-led__cat
0.0230  ± 0.0006,feature_esp-(stabilizacja-toru-jazdy)__cat
0.0213  ± 0.0004,param_napęd__cat
0.0179  ± 0.0007,feature_kamera-cofania__cat


# Feature Engineering

Hipoteza: kolejność przy faktoryzacji może mieć wpływ na wyniki. Patrz zmienna kolor.

In [0]:
# kolejność według średnich cen
alternatywna_kolejnosc_kolorow = {
    'Biały': 0, 'Brązowy': 1, 'Czarny': 2, 'Beżowy': 3, 'Szary': 4,
    'Czerwony': 5, 'Żółty': 6, 'Niebieski': 7, 'Inny kolor': 8, 'Fioletowy': 9,
    'Bordowy': 10, 'Złoty': 11, 'Srebrny': 12, 'Zielony': 13}

In [0]:
df['param_kolor__cat_alt'] = df['param_kolor'].map(lambda x: faktoryzacja_koloru[x] if x else -1)

In [300]:
X = df[['param_kolor__cat']].values
y = df['price_value_pln'].values

model = DecisionTreeRegressor(max_depth=5)
scores = cross_val_score(model, X, y, cv=3, scoring='neg_mean_absolute_error')
np.mean(scores), np.std(scores)

(-32038.861728068572, 224.9243232657796)

In [301]:
X = df[['param_kolor__cat_alt']].values
y = df['price_value_pln'].values

model = DecisionTreeRegressor(max_depth=5)
scores = cross_val_score(model, X, y, cv=3, scoring='neg_mean_absolute_error')
np.mean(scores), np.std(scores)

(-31994.950593254096, 228.56268971976655)