In [1]:
# Required libraries
import numpy as np
import pandas as pd
import gc
import random
import re
import typing
import numpy as np

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_absolute_percentage_error, r2_score, mean_squared_error, accuracy_score

import lightgbm
import xgboost
import catboost

In [2]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and convergence
# warnings raised in this process by pandas / scikit-learn are hidden as well.
warnings.simplefilter("ignore")

In [4]:
import sklearn
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer, make_column_selector

from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix, f1_score, make_scorer

# Show estimators as HTML diagrams when they are displayed as a cell result.
sklearn.set_config(display='diagram')

## Read Data

In [5]:
# Load the training table from a path relative to the notebook's working
# directory, then preview the first rows.
train_csv_path = '../data/car_train.csv'
data = pd.read_csv(train_csv_path)
data.head()

Unnamed: 0,car_id,model,car_type,fuel_type,car_rating,year_to_start,riders,year_to_work,target_reg,target_class
0,y13744087j,Kia Rio X-line,economy,petrol,3.78,2015,76163,2021,109.99,another_bug
1,O41613818T,VW Polo VI,economy,petrol,3.9,2015,78218,2021,34.48,electro_bug
2,d-2109686j,Renault Sandero,standart,petrol,6.3,2012,23340,2017,34.93,gear_stick
3,u29695600e,Mercedes-Benz GLC,business,petrol,4.04,2011,1263,2020,32.22,engine_fuel
4,N-8915870N,Renault Sandero,standart,petrol,4.7,2012,26428,2017,27.51,engine_fuel


In [6]:
# Number of distinct car_id values; compare with the row count to see whether
# car_id identifies rows uniquely.
data['car_id'].nunique()

2337

In [7]:
# (rows, columns) of the loaded table.
data.shape

(2337, 10)

### Split into train and validation subsets

In [8]:
# Encode the string labels in `target_class` as integer codes 0..n_classes-1.
# `class_names[i]` is the original label for code `i`; labels are sorted
# ascending, the same order `np.unique` yields.
#
# `pd.factorize` is used instead of `Series.replace(list_of_str, list_of_int)`:
# the latter only produces an integer column through pandas' silent object
# downcasting, which pandas 2.2 deprecates (and this notebook suppresses the
# resulting warning).
#
# The dtype guard makes the cell safe to re-run: once the column is already
# integer-coded it is left alone, so `class_names` keeps the original labels
# instead of being overwritten with the codes.
if not pd.api.types.is_integer_dtype(data['target_class']):
    target_codes, target_labels = pd.factorize(data['target_class'], sort=True)
    class_names = np.asarray(target_labels)
    data['target_class'] = target_codes

In [9]:
# Feature columns handed to the models: everything except the row identifier
# and the two target columns.
non_feature_cols = ['car_id', 'target_reg', 'target_class']
COLS_BEST = data.drop(columns=non_feature_cols).columns

In [10]:
# Hold out 25% of the rows for validation. The split is stratified on the
# encoded class label and seeded so it can be reproduced.
split_features = data.drop(columns=['target_class'])[COLS_BEST]
split_labels = data['target_class']

X_train, X_val, y_train, y_val = train_test_split(
    split_features,
    split_labels,
    test_size=.25,
    stratify=split_labels,
    random_state=42,
)

# Show the resulting (rows, columns) of both feature frames.
X_train.shape, X_val.shape

((1752, 7), (585, 7))

## Define models

### CatBoost

In [75]:
# Hyper-parameters for the CatBoost base learner: a small, shallow multiclass
# model with a fixed seed and silent training.
# Options that were tried and are currently disabled: cat_features, train_dir,
# border_count, l2_leaf_reg, bagging_temperature, rsm,
# auto_class_weights='Balanced', custom_metric.
params_cat = dict(
    n_estimators=100,
    learning_rate=0.03,
    depth=3,
    verbose=False,
    text_features=[],
    loss_function='MultiClass',
    random_state=42,
    use_best_model=False,
)

# Unfitted CatBoost classifier built from the params_cat settings.
cat_model = catboost.CatBoostClassifier(**params_cat)

### LightGBM

In [44]:
# Hyper-parameters for the LightGBM base learner.
# NOTE(review): num_leaves=887 is larger than 2**max_depth (128), so max_depth
# is presumably the binding limit on tree size - confirm this is intended.
# Options that were tried and are currently disabled: min_child_samples,
# min_data_in_leaf, feature_fraction, categorical_feature.
params_lgbm = dict(
    num_leaves=887,
    n_estimators=480,
    max_depth=7,
    learning_rate=0.05348257149091985,
)

In [45]:
# Unfitted LightGBM classifier built from the params_lgbm settings.
lgbm_model = lightgbm.LGBMClassifier(**params_lgbm)

### XGBoost

In [14]:
# Hyper-parameters for the XGBoost base learner ('eta' is XGBoost's name for
# the learning rate). Rows and columns are both subsampled at 70% per tree.
# The objective and eval_metric overrides are disabled, so the classifier's
# defaults apply.
params_xgb = dict(
    eta=0.05,
    max_depth=5,
    subsample=0.7,
    colsample_bytree=0.7,
)

In [15]:
# Unfitted XGBoost classifier built from the params_xgb settings.
xgb_model = xgboost.XGBClassifier(**params_xgb)

## Make pipeline

### Features preprocessing

In [16]:
# Column groups used to route features through the preprocessing branches.
# String-valued columns go to the categorical branch ...
categorical_features = [
    'model',
    'car_type',
    'fuel_type',
]
# ... and numeric columns go to the numerical branch.
numerical_features = [
    'car_rating',
    'year_to_start',
    'riders',
    'year_to_work',
]

In [17]:
# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. Categories not seen during fit are ignored at
# transform time instead of raising.
categorical_steps = [
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [18]:
# Numerical branch: impute missing values with SimpleImputer's default
# strategy, then standardise with StandardScaler.
# (A SelectKBest feature-selection step was tried here and is disabled.)
numerical_steps = [
    ("imputer", SimpleImputer()),
    ("scaler", StandardScaler()),
]
numerical_transformer = Pipeline(steps=numerical_steps)

In [19]:
# Send each column group through its own preprocessing branch.
column_branches = [
    ("numerical", numerical_transformer, numerical_features),
    ("categorical", categorical_transformer, categorical_features),
]
data_transformer = ColumnTransformer(transformers=column_branches)

# `preprocessor` is an alias for the very same ColumnTransformer object
# (no wrapping Pipeline is created).
preprocessor = data_transformer

In [20]:
# Inspect the first configured branch: a (name, transformer, columns) tuple.
data_transformer.transformers[0]

('numerical',
 Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler())]),
 ['car_rating', 'year_to_start', 'riders', 'year_to_work'])

### Build and train the stacking ensemble

In [76]:
# Base learners of the stack. Every learner is wrapped in a pipeline that
# starts with the shared preprocessing step, so the stacker can be fitted
# directly on the raw feature frame.
#
# LinearSVC and RandomForestClassifier are seeded with 42, matching the seed
# already used for the train/validation split and for CatBoost. Without a seed
# their results change from run to run and the reported scores cannot be
# reproduced.
estimators = [
    ("svm", make_pipeline(preprocessor, LinearSVC(verbose=False, random_state=42))),
    ("random_forest", make_pipeline(
        preprocessor,
        RandomForestClassifier(n_jobs=-1, verbose=False, random_state=42),
    )),
    ("xgboost", make_pipeline(preprocessor, xgb_model)),
    ("lightgbm", make_pipeline(preprocessor, lgbm_model)),
    ("catboost", make_pipeline(preprocessor, cat_model)),
]

# Meta-learner: a logistic regression trained on the base learners' outputs.
stacking_classifier = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(verbose=False),
    n_jobs=-1,
    verbose=False,
)

In [77]:
# Fit the stacking ensemble on the training split.
stacking_classifier.fit(X_train, y_train)

  * (last_sum / last_over_new_count - new_sum) ** 2
  * (last_sum / last_over_new_count - new_sum) ** 2


In [78]:
# Validation accuracy of every fitted base pipeline in the stack.
# accuracy_score's signature is (y_true, y_pred). Accuracy itself is symmetric,
# but keeping the documented order means swapping in an asymmetric metric
# (precision, recall, ...) later will not silently give wrong numbers.
for name, model in stacking_classifier.named_estimators_.items():
    val_pred = model.predict(X_val)
    print(name, 'accuracy: ', round(accuracy_score(y_val, val_pred), 4))

svm accuracy:  0.1265
random_forest accuracy:  0.1077
xgboost accuracy:  0.1197
lightgbm accuracy:  0.106
catboost accuracy:  0.1214


In [79]:
# Validation accuracy of the stacked ensemble; arguments follow
# accuracy_score's (y_true, y_pred) order.
ensemble_val_pred = stacking_classifier.predict(X_val)
print('ensemble score:', round(accuracy_score(y_val, ensemble_val_pred), 4))
# In the recorded run the ensemble scores above the best single base model.
# NOTE(review): all scores are low and the margin is small, so confirm the gain
# with cross-validation before relying on it.

ensemble score: 0.135
