In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
import pandas as pd
import numpy as np

from catboost import CatBoostRegressor, Pool, cv
import xgboost as xgb

from sklearn.model_selection import train_test_split

In [2]:
def mape(y_true, y_pred):
    return np.mean(np.abs((y_pred-y_true)/y_true))

RANDOM_SEED=42

In [3]:
data = pd.read_csv('data_for_stacking.csv')


cat_features_ids = ['bodyType', 'brand', 'color','descr_labels', 'fuelType', 
                    'model_name', 'vehicleTransmission',
                    'pts', 'privod', 'wheel', 'state', 'descr_labels']

for colum in cat_features_ids:
    data[colum] = data[colum].astype('category').cat.codes

data.drop('Unnamed: 0', axis=1, inplace=True, errors='ignore')

In [4]:
sample_submission = pd.read_csv('../kaggle/sample_submission.csv')

In [5]:
data[data['sample']==1].equip_len.unique()

array([  0,   3,  37,  49,  40,  11,   9,   1,  43,  44,  35,  72,  20,
        16,  29,  32,  51,  25,  19,  39,  28,  36,  13,  12,  56,  54,
        18,  61,  65,  41,  57,  53,   7,  63,  38,  58,  75,  67,  83,
        34,  59,  68,  42,   2,  21,   5,  50,  17,  30,  27,  24,  23,
        55,  31,  26,   6,  14,  48,   8,   4,  15,  10,  22,  33,  66,
        64,  47,  45,  71,  52,  46,  78,  60,  89,  76,  62,  74,  77,
        70,  69,  84,  73,  80,  88,  81,  95,  85,  82,  90, 120,  92,
        97,  79,  94,  98,  99,  86,  93, 101, 100,  96,  87, 107,  91,
       104, 102, 106, 105, 103, 115, 110, 111, 118, 108, 113, 109],
      dtype=int64)

In [24]:
X = data.query('sample == 1').drop(['sample'], axis=1)
y = X['price']

X.drop('price', axis=1, inplace=True, errors='ignore')
X_sub = data.query('sample == 0').drop(['sample', 'price'], axis=1)

VAL_SIZE=0.20
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=VAL_SIZE, shuffle=True, random_state=RANDOM_SEED)

In [7]:
y_train = np.log(y_train)
y_test = np.log(y_test)

In [10]:
catboost_params = {
    'iterations': 5000,
    'learning_rate': 0.05775539388456,
    'depth': 12,
    'random_seed': RANDOM_SEED,
    'eval_metric': 'MAPE',
    'custom_metric': ['R2', 'MAE'],
    'l2_leaf_reg': 1,
    "loss_function": "MAPE"
}

train_pool = Pool(
    X_train,
    y_train,
    cat_features=cat_features_ids,
)
test_pool = Pool(
    X_test,
    y_test,
    cat_features=cat_features_ids,
)

model_catb = CatBoostRegressor(iterations = catboost_params['iterations'],
                          learning_rate = catboost_params['learning_rate'],
                          random_seed = RANDOM_SEED,
                          eval_metric=catboost_params['eval_metric'],
                          custom_metric=catboost_params['custom_metric'],
                          l2_leaf_reg=catboost_params['l2_leaf_reg'],
                          depth=catboost_params['depth'],
                          metric_period=catboost_params['depth'],
                          od_type='Iter',
                          od_wait=20,
                          rsm=0.2,
                          devices='GPU'
                         )
model_catb.fit(train_pool,
         eval_set=test_pool,
         verbose_eval=catboost_params['depth']*4,
         use_best_model=True,
         plot=False
         )

0:	learn: 0.0691856	test: 0.0691192	best: 0.0691192 (0)	total: 355ms	remaining: 29m 34s
48:	learn: 0.0134941	test: 0.0137295	best: 0.0137295 (48)	total: 12.6s	remaining: 21m 9s
96:	learn: 0.0110109	test: 0.0115041	best: 0.0115041 (96)	total: 25.3s	remaining: 21m 16s
144:	learn: 0.0102494	test: 0.0109264	best: 0.0109264 (144)	total: 39s	remaining: 21m 45s
192:	learn: 0.0097042	test: 0.0105611	best: 0.0105611 (192)	total: 52.1s	remaining: 21m 37s
240:	learn: 0.0092372	test: 0.0102608	best: 0.0102608 (240)	total: 1m 4s	remaining: 21m 19s
288:	learn: 0.0088500	test: 0.0100297	best: 0.0100297 (288)	total: 1m 17s	remaining: 21m 9s
336:	learn: 0.0084923	test: 0.0098129	best: 0.0098129 (336)	total: 1m 31s	remaining: 21m
384:	learn: 0.0081832	test: 0.0096502	best: 0.0096502 (384)	total: 1m 44s	remaining: 20m 51s
432:	learn: 0.0079372	test: 0.0095245	best: 0.0095245 (432)	total: 1m 57s	remaining: 20m 38s
480:	learn: 0.0076905	test: 0.0093986	best: 0.0093986 (480)	total: 2m 10s	remaining: 20m 26s

<catboost.core.CatBoostRegressor at 0x2b16d526e48>

In [23]:
predict_catb = model_catb.predict(X_test)

predict_catb = np.e ** predict_catb
# y_test = np.e ** y_test

# оцениваем точность
MAPE = f'{(mape(y_test, predict_catb))*100:0.4f}'
print(f"Точность модели по метрике MAPE: {MAPE}%")

Точность модели по метрике MAPE: nan%


In [13]:
model_xgb = xgb.XGBRegressor(base_score=0.5, 
                             booster='gbtree',
                             colsample_bylevel=1,
                             colsample_bynode=1, 
                             colsample_bytree=1, 
                             gamma=0, #0.15340366103115533, #0, 
                             importance_type='gain',  
                             learning_rate=0.08, 
                             max_delta_step=0, 
                             max_depth=13, #12 #6, #7, 
                             min_child_weight=1,  
                             missing=None, 
                             n_estimators=5000,
                             n_jobs=20, 
                             nthread=None, 
                             random_state=0,
                             reg_alpha=0, #1.8400184528746324, #0, 
                             reg_lambda=1, #1.0868061353806249,#1, 
                             scale_pos_weight=1, 
                             seed=RANDOM_SEED,
                             silent=None, 
                             subsample=0.75, 
                             verbosity=0, 
                             objective='reg:squarederror',
                             verbose=True#, eval_metric=mape
                            )

model_xgb.fit(X_train, y_train, eval_metric=mape)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.08, max_delta_step=0, max_depth=13,
             min_child_weight=1, missing=None, monotone_constraints='()',
             n_estimators=5000, n_jobs=20, nthread=20, num_parallel_tree=1,
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=42, silent=None, subsample=0.75, tree_method='exact',
             validate_parameters=1, verbose=True, verbosity=0)

In [27]:
predict_xgb = model_xgb.predict(X_test)

predict_xgb = np.e ** predict_xgb
# y_test = np.e ** y_test

# оцениваем точность
MAPE = f'{(mape(y_test, predict_xgb))*100:0.4f}'
print(f"Точность модели по метрике MAPE: {MAPE}%")

Точность модели по метрике MAPE: 9.3244%


In [25]:
y_test

71443      255000.0
159732     580000.0
111481    1400000.0
107477     670000.0
56306      320000.0
            ...    
153098    1780000.0
209148     950000.0
176389    2850210.0
50169      105000.0
87392     4020000.0
Name: price, Length: 34973, dtype: float64

# FINAL PREDICT

In [28]:
predict_fin = np.mean(np.array([predict_catb, predict_xgb]), axis=0)

# оцениваем точность
MAPE = f'{(mape(y_test, predict_fin))*100:0.4f}'
print(f"Точность модели по метрике MAPE: {MAPE}%")

Точность модели по метрике MAPE: 9.0003%


In [29]:
predict_fin_1k = np.round(predict_fin // 1000) * 1000
MAPE = f'{(mape(y_test, predict_fin_1k))*100:0.4f}'
print(f"Точность модели по метрике MAPE: {MAPE}%")

Точность модели по метрике MAPE: 8.9740%


In [30]:
predict_fin_10k = np.round(predict_fin // 10000) * 10000
MAPE = f'{(mape(y_test, predict_fin_10k))*100:0.4f}'
print(f"Точность модели по метрике MAPE: {MAPE}%")

Точность модели по метрике MAPE: 8.8974%


In [35]:
new_version = 'xgb_13depth'

predict_xgb_sub = model_xgb.predict(X_sub)
predict_xgb_sub = np.e ** predict_xgb_sub

predict_catb_sub = model_catb.predict(X_sub)
predict_catb_sub = np.e ** predict_catb_sub

predict_submission = np.max(np.array([predict_xgb_sub, predict_catb_sub]), axis=0)

# predict_submission = np.e ** predict_submission
predict_submission = predict_submission // 10000 * 10000
sample_submission['price'] = predict_submission
sample_submission.to_csv(f'..\kaggle\submissions_kaggle\submission_{new_version}.csv', index=False)
sample_submission.head(5)

Unnamed: 0,sell_id,price
0,1100575026,620000.0
1,1100549428,1050000.0
2,1100658222,880000.0
3,1100937408,830000.0
4,1101037972,810000.0


In [16]:
np.mean(np.array([predict_xgb_sub, predict_catb_sub]), axis=0)

array([ 606542.03818756, 1045658.76530822,  880418.20330087, ...,
        245842.70040454, 1204897.50550932, 1078203.02652019])

In [17]:
# estimators = [
#     ('catboost', model_catb),
#     ('xgboost', model_xgb)
# ]

# stacker = StackingRegressor(
#     estimators=estimators,
#     final_estimator=RandomForestRegressor(n_estimators=10,
#                                           random_state=RANDOM_SEED),
#     n_jobs=-1,
#     cv=4
# )

# stacker.fit(X_train, y_train).score(X_test, y_test)

In [18]:
# predict = stacker.predict(X_test)

# predict = np.e ** predict
# y_test = np.e ** y_test

# # оцениваем точность
# MAPE = f'{(mape(y_test, predict))*100:0.4f}'
# print(f"Точность модели по метрике MAPE: {MAPE}%")

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
         ..
34681   NaN
34682   NaN
34683   NaN
34684   NaN
34685   NaN
Name: price, Length: 34686, dtype: float64