In [73]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import sys
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

from catboost import CatBoostRegressor

from tqdm.notebook import tqdm

from joblib import Parallel, delayed
from bs4 import BeautifulSoup  as bs
from pprint import pprint  
import requests  
import json
from ast import literal_eval
import json

import matplotlib.pyplot as plt
import seaborn as sns
from pandas_profiling import ProfileReport
from sklearn.preprocessing import StandardScaler

import my_func
import importlib
importlib.reload(my_func)

import huperopt_class
importlib.reload(huperopt_class)

from huperopt_class import HyperOpt

from IPython.display import Audio
from russian_names import RussianNames

from catboost import CatBoostRegressor, Pool, cv

In [75]:
class HyperOpt:
    """Objective functions for hyperopt-driven tuning of CatBoost and XGBoost regressors.

    Every fixed setting and dataset is passed as a keyword argument and kept in
    ``self.kwargs``. Each ``hyperopt_*_score`` method receives one point sampled
    from the search space and returns the loss that ``hyperopt.fmin`` minimises.

    Note: defining this class in the notebook shadows the ``HyperOpt`` name that
    the imports cell brings in from ``huperopt_class``.
    """

    def __init__(self, **kwargs):
        """Store all keyword arguments for later use by the scoring methods."""
        super().__init__()
        self.kwargs = kwargs

    def hyperopt_catb_score(self, params):
        """Return the best cross-validated MAPE of a CatBoost regressor.

        Reads from ``self.kwargs``: ``iterations``, ``X_train``, ``y_train``,
        ``categorical_features_indices`` and, optionally, ``task_type``
        (defaults to ``'GPU'``) and ``ignored_features``.

        Args:
            params: Sampled hyperparameters. Must contain ``'l2_leaf_reg'`` and
                ``'learning_rate'``; ``'max_depth'`` is applied when present.

        Returns:
            The minimum of the ``'test-MAPE-mean'`` values produced by
            ``catboost.cv``.
        """
        catb_params = dict(
            l2_leaf_reg=int(params['l2_leaf_reg']),
            learning_rate=params['learning_rate'],
            iterations=self.kwargs['iterations'],
            eval_metric='MAPE',
            random_seed=42,
            # Overridable so the search can also run on a machine without a GPU.
            task_type=self.kwargs.get('task_type', 'GPU'),
            logging_level='Silent',
        )
        # Forward the sampled depth; without this the 'max_depth' dimension of
        # the search space has no influence on the score.
        if 'max_depth' in params:
            catb_params['depth'] = int(params['max_depth'])
        if 'ignored_features' in self.kwargs:
            catb_params['ignored_features'] = self.kwargs['ignored_features']

        # The regressor is built only to obtain its parameter dict for cv().
        model = CatBoostRegressor(**catb_params)

        cv_pool = Pool(self.kwargs['X_train'], self.kwargs['y_train'],
                       cat_features=self.kwargs['categorical_features_indices'])
        cv_data = cv(cv_pool, model.get_params())
        best_MAPE = np.min(cv_data['test-MAPE-mean'])

        return best_MAPE

    def hyperopt_xgb_score(self, params):
        """Return the mean cross-validated RMSE of an XGBoost regressor.

        Reads from ``self.kwargs``: ``n_estimators``, ``rounds``, ``verbose``,
        ``X_val``, ``y_val``, ``X_train``, ``y_train``, ``cv`` and, optionally,
        ``tree_method`` (``None`` leaves XGBoost's default in place).

        Args:
            params: Sampled hyperparameters with keys ``'learning_rate'``,
                ``'max_depth'``, ``'gamma'``, ``'reg_alpha'`` and
                ``'reg_lambda'``.

        Returns:
            Mean over the folds of the square root of the negated
            ``neg_mean_squared_error`` scores.
        """
        # Imported locally: the notebook's import cell does not bring these
        # two names into scope.
        from sklearn.model_selection import cross_val_score
        from xgboost import XGBRegressor

        # CatBoost-only arguments (l2_leaf_reg, task_type) are not passed:
        # XGBoost's L2 penalty is reg_lambda and its seed argument is
        # random_state.
        model = XGBRegressor(learning_rate=params['learning_rate'],
                             max_depth=int(params['max_depth']),
                             gamma=params['gamma'],
                             reg_alpha=params['reg_alpha'],
                             reg_lambda=params['reg_lambda'],
                             n_estimators=self.kwargs['n_estimators'],
                             objective='reg:squarederror',
                             verbosity=0,
                             random_state=42,
                             tree_method=self.kwargs.get('tree_method'))
        # NOTE(review): how fit-time arguments are passed has changed across
        # scikit-learn / xgboost releases; confirm against the installed versions.
        fit_params = {'early_stopping_rounds': self.kwargs['rounds'],
                      'eval_metric': 'rmse',
                      'verbose': self.kwargs['verbose'],
                      'eval_set': [[self.kwargs['X_val'], self.kwargs['y_val']]]}

        xgb_cv = cross_val_score(model, self.kwargs['X_train'], self.kwargs['y_train'],
                                 cv=self.kwargs['cv'],
                                 scoring='neg_mean_squared_error',
                                 fit_params=fit_params)
        # Scores are negated MSE per fold; convert each to RMSE, then average.
        best_rmse = np.mean([(-fold_score) ** 0.5 for fold_score in xgb_cv])
        print(f'Best RMSE: {best_rmse}', params)
        return best_rmse

In [None]:
# Single seed shared by the train/test split, the models and the hyperopt search.
RANDOM_SEED = 42

In [None]:
def mape(y_true, y_pred):
    """Mean absolute percentage error, returned as a fraction (not multiplied by 100).

    Both inputs are converted to float arrays first, so values are compared by
    position. This keeps pandas from aligning two Series on their index labels
    (which yields NaN when the labels differ) and lets plain lists be passed.

    Args:
        y_true: Ground-truth values (array-like). Zero entries lead to a
            division by zero in the corresponding terms.
        y_pred: Predicted values (array-like) with the same length as ``y_true``.

    Returns:
        The mean of ``|y_pred - y_true| / |y_true|``.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_pred - y_true) / y_true))

In [77]:
# Load the table used for stacking; later cells split it by its `sample` column.
# A forward slash is used as the separator: it works on every OS (Windows
# included) and avoids the invalid "\d" escape sequence a backslash would create.
data = pd.read_csv('data_versions/data_for_stacking.csv')
# Names of the columns handed to CatBoost as categorical features.
cat_features_ids = ['bodyType', 'brand', 'color', 'descr_labels', 'fuelType',
                    'model_name', 'vehicleTransmission',
                    'pts', 'privod', 'wheel', 'state']

In [None]:
# Rows flagged sample == 1 feed training and validation; the flag itself is not a feature.
X = data.query('sample == 1').drop(columns=['sample'])
# pop() returns the target column and removes it from the feature frame in one step.
y = X.pop('price')

# Rows flagged sample == 0 are kept aside; only the flag column is dropped from them.
X_sub = data.query('sample == 0').drop(columns=['sample'])

In [None]:
# Share of the sample == 1 rows held out for evaluation.
VAL_SIZE = 0.20
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=VAL_SIZE,
    shuffle=True,
    random_state=RANDOM_SEED,
)

In [None]:
# Move both target vectors to log space.
# Caution: the names are overwritten, so running this cell twice applies the log twice.
y_train, y_test = (np.log(target) for target in (y_train, y_test))

In [None]:
# Parameter set kept for reference (presumably the result of an earlier hyperopt run — TODO confirm):
# {'l2_leaf_reg': 1.0, 'learning_rate': 0.08343057119918101, 'max_depth': 10.184618939984698}



# CATBOOST
# Settings for the CatBoost model built in the next cell.
ITERATIONS = 5000  # maximum number of boosting iterations
LR         = 0.09399760402267439 # learning rate; other values noted here earlier: 0.08343057119918101, 0.1
depth = 12  # intended tree depth — NOTE(review): check how the model cell consumes this value

In [None]:
# How often (in iterations) metrics are computed and printed. This is its own
# constant so that changing the tree depth does not silently change the
# logging cadence as well.
LOG_PERIOD = 12

# Wrap the splits in CatBoost pools so the categorical columns are declared once.
train_pool = Pool(X_train, y_train, cat_features=cat_features_ids)
test_pool = Pool(X_test, y_test, cat_features=cat_features_ids)

model = CatBoostRegressor(iterations=ITERATIONS,
                          learning_rate=LR,
                          random_seed=RANDOM_SEED,
                          eval_metric='MAPE',
                          custom_metric=['R2', 'MAE'],
                          l2_leaf_reg=1,
                          # Use the notebook-level setting instead of a second hard-coded 12.
                          depth=depth,
                          metric_period=LOG_PERIOD,
                          # Stop when the eval metric has not improved for od_wait iterations.
                          od_type='Iter',
                          od_wait=20,
                          rsm=0.2,
                          # NOTE(review): CatBoost selects the processing unit with
                          # `task_type`; `devices` is documented as a list of GPU ids.
                          # Left unchanged — confirm whether GPU training was intended
                          # (and whether `rsm` is supported there) before switching.
                          devices='GPU'
                         )
model.fit(train_pool,
          eval_set=test_pool,
          verbose_eval=LOG_PERIOD,
          use_best_model=True,
          plot=False
          )

In [None]:
# The model was fit on log(price), so predictions are mapped back with exp().
predict = np.exp(model.predict(X_test))
# y_test stays in log space: overwriting it here would make the cell
# non-idempotent, because every re-run would exponentiate the targets again.
y_test_price = np.exp(y_test)

# evaluate accuracy
MAPE = f'{(mape(y_test_price, predict))*100:0.4f}'
print(f"Точность модели по метрике MAPE: {MAPE}%")

In [76]:
import hyperopt

# Fixed inputs stored on the HyperOpt instance and read by its scoring method.
catb_kwargs = dict(
    iterations=5000,
    X_train=X,
    y_train=y,
    categorical_features_indices=cat_features_ids,
    random_seed=RANDOM_SEED,
    eval_metric='MAPE',
)

# Search space sampled by hyperopt, one entry per tuned hyperparameter.
params_space = dict(
    l2_leaf_reg=hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
    learning_rate=hyperopt.hp.uniform('learning_rate', 1e-3, 1e-1),
    max_depth=hyperopt.hp.uniform('max_depth', 2, 16),
)

catb_hyperopt_inst = HyperOpt(**catb_kwargs)

# Seeded generator so the sequence of sampled points is repeatable.
rstate = np.random.RandomState(RANDOM_SEED)
# Container that collects every evaluated trial.
trials = hyperopt.Trials()

best = hyperopt.fmin(
    catb_hyperopt_inst.hyperopt_catb_score,
    space=params_space,
    algo=hyperopt.tpe.suggest,
    max_evals=2,
    trials=trials,
    rstate=rstate,
)
print(best)

  0%|          | 0/2 [00:02<?, ?trial/s, best loss=?]


KeyboardInterrupt: 

In [None]:
# Audible signal that the cells above have finished running.
freq = 2000  # beep frequency in hertz
dur = 700  # beep duration in milliseconds
try:
    import winsound  # standard-library module that is only available on Windows
except ImportError:
    pass  # not on Windows: skip the beep instead of failing the notebook run
else:
    winsound.Beep(freq, dur)

# XGBoost

In [None]:
# Reload the table from disk for this section.
data = pd.read_csv('data_for_stacking.csv')
# Columns that get replaced by integer category codes below.
cat_features_ids = [
    'bodyType', 'brand', 'color', 'descr_labels', 'fuelType',
    'model_name', 'vehicleTransmission',
    'pts', 'privod', 'wheel', 'state',
]

# Swap every listed column for the integer codes of its pandas categories.
data = data.assign(**{
    name: data[name].astype('category').cat.codes
    for name in cat_features_ids
})

In [None]:
# Rows flagged sample == 1 feed training and validation; the flag itself is not a feature.
X = data.query('sample == 1').drop(columns=['sample'])
# pop() returns the target column and removes it from the feature frame in one step.
y = X.pop('price')
# Rows flagged sample == 0 are kept aside; only the flag column is dropped from them.
X_sub = data.query('sample == 0').drop(columns=['sample'])

# Share of the sample == 1 rows held out for evaluation.
VAL_SIZE = 0.20
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=VAL_SIZE,
    shuffle=True,
    random_state=RANDOM_SEED,
)
# Both target vectors are moved to log space.
y_train, y_test = np.log(y_train), np.log(y_test)

In [None]:
import xgboost as xgb
model_xgb = xgb.XGBRegressor(base_score=0.5, 
                             booster='gbtree',
                             colsample_bylevel=1,
                             colsample_bynode=1, 
                             colsample_bytree=1, 
                             gamma=0,
                             importance_type='gain', 
                             learning_rate=0.08, 
                             max_delta_step=0,
                             max_depth=7, 
                             min_child_weight=1, 
                             missing=None, 
                             n_estimators=100,
                             n_jobs=1, 
                             nthread=None, 
                             random_state=0,
                             reg_alpha=0, 
                             reg_lambda=1, 
                             scale_pos_weight=1, 
                             seed=RANDOM_SEED,
                             silent=None, 
                             subsample=0.75, 
                             verbosity=1, 
                             objective='reg:squarederror', 
                             eval_metric='MAPE')


In [None]:
model.fit(X_train, y_train)

In [None]:
predict = model.predict(X_test)

predict = np.e ** predict
y_test = np.e ** y_test

# оцениваем точность
MAPE = f'{(mape(y_test, predict))*100:0.4f}'
print(f"Точность модели по метрике MAPE: {MAPE}%")

In [None]:
# NOTE(review): this cell sits in the XGBoost section but still tunes CatBoost
# through `hyperopt_catb_score` — confirm that this is intended.
# It relies on `hyperopt` having been imported by an earlier cell.

# Fixed inputs stored on the HyperOpt instance and read by its scoring method.
catb_kwargs = {'iterations': 5000, 'X_train': X, 'y_train': y,
               'categorical_features_indices': cat_features_ids,
               'random_seed': RANDOM_SEED, 'eval_metric': 'MAPE'
               }

# Search space sampled by hyperopt, one entry per tuned hyperparameter.
params_space = {
    'l2_leaf_reg': hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
    'learning_rate': hyperopt.hp.uniform('learning_rate', 1e-3, 1e-1),
    'max_depth': hyperopt.hp.uniform('max_depth', 2, 16)}

# Seeded generator so the sequence of sampled points is repeatable.
rstate = np.random.RandomState(RANDOM_SEED)

catb_hyperopt_inst = HyperOpt(**catb_kwargs)

# Container that collects every evaluated trial (created once; the earlier
# duplicate assignment was redundant).
trials = hyperopt.Trials()
best = hyperopt.fmin(
    catb_hyperopt_inst.hyperopt_catb_score,
    space=params_space,
    algo=hyperopt.tpe.suggest,
    max_evals=2,
    trials=trials,
    rstate=rstate
)
# The stray `git` token after this call was a syntax error and has been removed.
print(best)