In [1]:
# Загрузка библиотек

from IPython.display import display
from ipywidgets import IntProgress
from catboost import CatBoostRegressor
import numpy as np
import pandas as pd
import json
import ast
import re
from itertools import combinations
from statsmodels.stats import weightstats
from scipy import stats
from gensim.models import Word2Vec
from sklearn.manifold import TSNE

import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling
import warnings

import sys
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from nltk.corpus import stopwords
from sklearn.metrics import mean_absolute_error, mean_squared_error, silhouette_score
from tqdm.notebook import tqdm
tqdm.pandas()

warnings.filterwarnings("ignore")

import lazypredict
from lazypredict.Supervised import LazyRegressor
from sklearn.utils import shuffle

  from pandas import Panel


In [2]:
# Фиксация версий пакетов для воспроизводимости экспериментов
!pip freeze > requirements.txt



In [3]:
# Фиксация random seed для воспроизводимости экспериментов
RS = 42

In [4]:
# Функция возвращает целевую метрику
def MAPE(y_true, y_pred):
    return np.mean(np.abs((y_pred-y_true)/y_true))

In [5]:
data = pd.read_csv('alt_data.csv')

In [6]:
data.sample(5)

Unnamed: 0,vendor,Руль,eqp_amount,eur_price,mileage,modelDate,numberOfDoors,parsing_unixtime,productionDate,eng_volume,...,ПТС_Оригинал,ПТС_nan,Привод_задний,Привод_передний,Привод_полный,gen_id_stat,own_time_stat,body_stat,bodyType_stat,color_stat
149990,0,0,0,4012.0,152000.0,2005.0,4.0,1599647912.0,2006.0,1.9,...,1,0,0,1,0,1.0,0.85,0.6,0.55,0.74
52733,0,0,13,4312.0,335000.0,1996.0,4.0,1620203331.0,2001.0,2.2,...,1,0,0,1,0,0.22,0.85,0.8,0.69,0.48
101161,0,0,62,0.0,103000.0,2014.0,5.0,1603217112.0,2018.0,3.0,...,1,0,0,0,1,1.0,0.72,1.2,1.21,0.83
83983,0,0,46,0.0,176000.0,2010.0,4.0,1603238671.0,2010.0,2.0,...,1,0,0,1,0,1.0,0.73,0.6,0.55,1.1
118227,1,0,0,8915.0,200000.0,2007.0,4.0,1599647912.0,2008.0,2.4,...,1,0,0,1,0,1.0,0.85,0.6,0.38,0.83


In [7]:
data.isna().sum()

vendor                                  0
Руль                                    0
eqp_amount                              0
eur_price                               0
mileage                                 0
modelDate                               0
numberOfDoors                           0
parsing_unixtime                        0
productionDate                          0
eng_volume                              0
eng_pwr                                 0
rel_model                               0
eqp_rating                              0
train                                   0
sell_id                                 0
premium_0                               0
premium_1                               0
desc_cluster_0                          0
desc_cluster_1                          0
desc_cluster_2                          0
desc_cluster_3                          0
desc_cluster_4                          0
desc_cluster_5                          0
brand_AUDI                        

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159043 entries, 0 to 159042
Data columns (total 59 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   vendor                                159043 non-null  int64  
 1   Руль                                  159043 non-null  int64  
 2   eqp_amount                            159043 non-null  int64  
 3   eur_price                             159043 non-null  float64
 4   mileage                               159043 non-null  float64
 5   modelDate                             159043 non-null  float64
 6   numberOfDoors                         159043 non-null  float64
 7   parsing_unixtime                      159043 non-null  float64
 8   productionDate                        159043 non-null  float64
 9   eng_volume                            159043 non-null  float64
 10  eng_pwr                               159043 non-null  float64
 11  

Пропусков в данных нет. Тип всех данных: int или float

В качестве базовой модели мы использовали CatBoost и получили MAPE = 0.13379

## Дополнительные модели

Для улучшения результата, блэндинга и стэкинга будут использованы следующие модели:
- Две базовые модели для противоположных эстремальных подходов:
> - Глубокое дерево для максимального заучивания данных
> - Линейная регрессия для отображения общего тренда

- Альтернативные ансамбли:
> - Стандартный бэггинг RandomForest
> - Базовый градиентный бустинг от sklearn

In [9]:
# Функция возвращает метрики для данных и модели (MAPE с кросс-валидацией и без нее)
def alt_metics(data, target, model=CatBoostRegressor(random_seed=RS, silent=True)):
    df = data[data.train == 1].drop(['train', 'sell_id'], axis=1).copy()

    # Разделение переменных
    X = df.drop(target, axis=1).values
    y = df[target].values

    # Создание списков для записи локальных результатов
    local_mape = []

    # Моделирование с 3-ой кросс-валидацией
    SKF = StratifiedKFold(n_splits=3, shuffle=True, random_state=RS)
    for train_index, test_index in SKF.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, np.log(y_train))
        preds = np.exp(model.predict(X_test))
        local_mape.append(MAPE(y_test, preds))

    # Усреднение метрики
    result = []
    result.append(np.mean(local_mape))

    # Добавление мерики без кросс-валидации
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, shuffle=True, random_state=RS)
    model.fit(X_train, np.log(y_train))
    preds = np.exp(model.predict(X_test))
    result.append(MAPE(y_test, preds))

    return result

### Lazy Predict

In [10]:
# Посмотрим, какие еще модели могут представлять интерес
# Обучим LazyRegressor на небольших выборках из датасета

scaler = StandardScaler()

small_data = data[data.train==1].sample(20000).drop(['train','sell_id'], axis=1).copy()

X = small_data.drop('eur_price', axis=1).values
X = scaler.fit_transform(X) 
y = small_data['eur_price'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=RS)

regr=LazyRegressor(verbose=0,predictions=True)

models_r,predictions_r=regr.fit(X_train, X_test, np.log(y_train), np.log(y_test))

models_r

100%|██████████████████████████████████████████████████████████████████████████████████| 42/42 [03:56<00:00,  5.63s/it]


Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LGBMRegressor,0.95,0.95,0.23,0.52
HistGradientBoostingRegressor,0.95,0.95,0.23,3.81
RandomForestRegressor,0.95,0.95,0.23,11.91
ExtraTreesRegressor,0.95,0.95,0.23,10.27
XGBRegressor,0.95,0.95,0.23,2.47
NuSVR,0.95,0.95,0.23,39.99
SVR,0.95,0.95,0.23,16.27
BaggingRegressor,0.95,0.95,0.24,1.27
GradientBoostingRegressor,0.94,0.94,0.25,4.33
MLPRegressor,0.94,0.94,0.26,18.34


In [11]:
small_data = data[data.train==1].sample(20000).drop(['train','sell_id'], axis=1).copy()

X = small_data.drop('eur_price', axis=1).values
X = scaler.fit_transform(X) 
y = small_data['eur_price'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=RS)

regr=LazyRegressor(verbose=0,predictions=True)

models_r,predictions_r=regr.fit(X_train, X_test, np.log(y_train), np.log(y_test))

models_r

100%|██████████████████████████████████████████████████████████████████████████████████| 42/42 [04:06<00:00,  5.86s/it]


Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
XGBRegressor,0.96,0.96,0.23,2.13
LGBMRegressor,0.96,0.96,0.23,0.58
HistGradientBoostingRegressor,0.95,0.96,0.23,3.98
ExtraTreesRegressor,0.95,0.95,0.23,12.45
RandomForestRegressor,0.95,0.95,0.23,12.39
NuSVR,0.95,0.95,0.24,42.26
SVR,0.95,0.95,0.24,18.13
BaggingRegressor,0.95,0.95,0.24,1.38
GradientBoostingRegressor,0.95,0.95,0.25,3.91
MLPRegressor,0.94,0.94,0.26,23.9


Как видно, хорошо бы еще проверить XGBRegressor и ExtraTreesRegressor

### DecisionTree

In [12]:
# Дефолтные параметры
DT = DecisionTreeRegressor(random_state=RS)
alt_results = pd.DataFrame({'DT': alt_metics(data, 'eur_price', model=DT)},
                           index=['CV', 'TST0.3'])
alt_results

Unnamed: 0,DT
CV,0.2
TST0.3,0.2


In [13]:
df = data[data.train == 1].drop(['train', 'sell_id'], axis=1).copy()

X = df.drop('eur_price', axis=1).values
y = df['eur_price'].values

In [14]:
# Подбор параметров

'''X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=RS)

DF = DecisionTreeRegressor(random_state=RS)

splitter = ['best', 'random']
max_features = ['auto', 'sqrt', 'log2', None]
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]

random_grid = {'splitter': splitter,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               }

DF_random = RandomizedSearchCV(estimator = DF, param_distributions = random_grid, 
                               n_iter = 100, cv = 3, verbose=2, random_state=RS, n_jobs=-1)
DF_random.fit(X_train, np.log(y_train))

DF_random.best_params_'''

"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=RS)\n\nDF = DecisionTreeRegressor(random_state=RS)\n\nsplitter = ['best', 'random']\nmax_features = ['auto', 'sqrt', 'log2', None]\nmax_depth = [int(x) for x in np.linspace(10, 110, num = 11)]\nmax_depth.append(None)\nmin_samples_split = [2, 5, 10]\nmin_samples_leaf = [1, 2, 4]\n\nrandom_grid = {'splitter': splitter,\n               'max_features': max_features,\n               'max_depth': max_depth,\n               'min_samples_split': min_samples_split,\n               'min_samples_leaf': min_samples_leaf,\n               }\n\nDF_random = RandomizedSearchCV(estimator = DF, param_distributions = random_grid, \n                               n_iter = 100, cv = 3, verbose=2, random_state=RS, n_jobs=-1)\nDF_random.fit(X_train, np.log(y_train))\n\nDF_random.best_params_"

Лучшие параметры: {'splitter': 'random',
                     'min_samples_split': 10,
                     'min_samples_leaf': 4,
                     'max_features': 'auto',
                     'max_depth': 90}

In [15]:
# Обучим модель с подобранными параметрами
DT = DecisionTreeRegressor(splitter='random', 
                           min_samples_split=10, 
                           min_samples_leaf=4, 
                           max_features='auto', 
                           max_depth=90, 
                           random_state=RS)
alt_results['DT_new'] = alt_metics(data, 'eur_price', model=DT)
alt_results

Unnamed: 0,DT,DT_new
CV,0.2,0.17
TST0.3,0.2,0.17


Метрика улучшилась, но хуже базовой модели

### Linear regression

In [16]:
# Модель на ненормализованных данных
LR = LinearRegression(n_jobs=-1)
alt_results['LR'] = alt_metics(data, 'eur_price', model=LR)

alt_results

Unnamed: 0,DT,DT_new,LR
CV,0.2,0.17,0.24
TST0.3,0.2,0.17,0.23


In [17]:
# Функция возвращает метрики для нормализованных данных и модели (MAPE с кросс-валидацией и без нее)
def norm_alt_metics(data, target, scaler, model=CatBoostRegressor(random_seed=RS, silent=True)):
    df = data[data.train == 1].drop(['train', 'sell_id'], axis=1).copy()

    # Разделение переменных
    X = df.drop(target, axis=1).values
    X = scaler.fit_transform(X)
    y = df[target].values

    # Создание списков для записи локальных результатов
    local_mape = []

    # Моделирование с 3-ой кросс-валидацией
    SKF = StratifiedKFold(n_splits=3, shuffle=True, random_state=RS)
    for train_index, test_index in SKF.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, np.log(y_train))
        preds = np.exp(model.predict(X_test))
        local_mape.append(MAPE(y_test, preds))

    # Усреднение метрики
    result = []
    result.append(np.mean(local_mape))

    # Добавление метрики без кросс-валидации
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, shuffle=True, random_state=RS)
    model.fit(X_train, np.log(y_train))
    preds = np.exp(model.predict(X_test))
    result.append(MAPE(y_test, preds))

    return result

In [18]:
# Построим линейную регрессию, построенную на признаках, нормализованных с помощью StandardScaler
LR = LinearRegression(n_jobs=-1)

alt_results['LR(ss)'] = norm_alt_metics(data, 'eur_price', scaler=RobustScaler(), model=LR)

alt_results

Unnamed: 0,DT,DT_new,LR,LR(ss)
CV,0.2,0.17,0.24,0.24
TST0.3,0.2,0.17,0.23,0.23


In [19]:
# Построим линейную регрессию, построенную на признаках, нормализованных с помощью RobustScaler
from sklearn.preprocessing import RobustScaler

LR = LinearRegression(n_jobs=-1)

alt_results['LR(rs)'] = norm_alt_metics(data, 'eur_price', scaler=RobustScaler(), model=LR)

alt_results

Unnamed: 0,DT,DT_new,LR,LR(ss),LR(rs)
CV,0.2,0.17,0.24,0.24,0.24
TST0.3,0.2,0.17,0.23,0.23,0.23


Линейная регрессия работает плохо, что было ожидаемо

### Random Forest

In [20]:
# Параметры по умолчанию
RF = RandomForestRegressor(random_state=RS)

alt_results['RF'] = alt_metics(data, 'eur_price', model=RF)
alt_results

Unnamed: 0,DT,DT_new,LR,LR(ss),LR(rs),RF
CV,0.2,0.17,0.24,0.24,0.24,0.14
TST0.3,0.2,0.17,0.23,0.23,0.23,0.14


In [21]:
# Подбор параметров
'''X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=RS)

RF = RandomForestRegressor(random_state=RS)

n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

RF_random = RandomizedSearchCV(estimator = RF, param_distributions = random_grid, 
                               n_iter = 100, cv = 3, verbose=2, random_state=RS, n_jobs = -1)
RF_random.fit(X_train, y_train)
# Посмотрим, какие гиперпараметры нам предлагают как оптимальные:
RF_random.best_params_'''

"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=RS)\n\nRF = RandomForestRegressor(random_state=RS)\n\nn_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]\nmax_features = ['auto', 'sqrt']\nmax_depth = [int(x) for x in np.linspace(10, 110, num = 11)]\nmax_depth.append(None)\nmin_samples_split = [2, 5, 10]\nmin_samples_leaf = [1, 2, 4]\nbootstrap = [True, False]\n\nrandom_grid = {'n_estimators': n_estimators,\n               'max_features': max_features,\n               'max_depth': max_depth,\n               'min_samples_split': min_samples_split,\n               'min_samples_leaf': min_samples_leaf,\n               'bootstrap': bootstrap}\n\nRF_random = RandomizedSearchCV(estimator = RF, param_distributions = random_grid, \n                               n_iter = 100, cv = 3, verbose=2, random_state=RS, n_jobs = -1)\nRF_random.fit(X_train, y_train)\n# Посмотрим, какие гиперпараметры нам предлагают как опти

Получили следующие значения: 'n_estimators': 1400, 'min_samples_split': 5, 'min_samples_leaf': 1,
'max_features': 'sqrt', 'max_depth': 80, 'bootstrap': False

In [22]:
# Обучим RF модель на подобранных параметрах
RF = RandomForestRegressor(n_estimators=1400, min_samples_split=5, min_samples_leaf=1, 
                           max_features='sqrt', max_depth=80, bootstrap=False, random_state=RS)

alt_results['RF_new'] = alt_metics(data, 'eur_price', model=RF)
alt_results

Unnamed: 0,DT,DT_new,LR,LR(ss),LR(rs),RF,RF_new
CV,0.2,0.17,0.24,0.24,0.24,0.14,0.14
TST0.3,0.2,0.17,0.23,0.23,0.23,0.14,0.14


### Gradient Boosting

In [23]:
# Параметры по умолчанию
GB = GradientBoostingRegressor(random_state=RS)
alt_results['GB'] =alt_metics(data, 'eur_price', model=GB)
alt_results

Unnamed: 0,DT,DT_new,LR,LR(ss),LR(rs),RF,RF_new,GB
CV,0.2,0.17,0.24,0.24,0.24,0.14,0.14,0.18
TST0.3,0.2,0.17,0.23,0.23,0.23,0.14,0.14,0.18


In [24]:
# Подбор параметров
'''X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=RS)

GB = GradientBoostingRegressor(random_state=RS)

param_grid = {'learning_rate':[0.00001, 0.0001, 0.001, 0.01, 0.1, 1], 
              'n_estimators':[100, 250, 500, 750, 1000]}
gridsearch = GridSearchCV(GB, param_grid, n_jobs=-1, cv=3)
gridsearch.fit(X_train, y_train)
model = gridsearch.best_estimator_

# печатаем параметры
best_parameters = model.get_params()
for param_name in best_parameters:
    print(param_name, best_parameters[param_name], '\n')'''


"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=RS)\n\nGB = GradientBoostingRegressor(random_state=RS)\n\nparam_grid = {'learning_rate':[0.00001, 0.0001, 0.001, 0.01, 0.1, 1], \n              'n_estimators':[100, 250, 500, 750, 1000]}\ngridsearch = GridSearchCV(GB, param_grid, n_jobs=-1, cv=3)\ngridsearch.fit(X_train, y_train)\nmodel = gridsearch.best_estimator_\n\n# печатаем параметры\nbest_parameters = model.get_params()\nfor param_name in best_parameters:\n    print(param_name, best_parameters[param_name], '\n')"

Получили параметры: 

learning_rate 0.1 

n_estimators 750 

In [25]:
# Обучим модель с новыми параметрами
GB = GradientBoostingRegressor(learning_rate=0.1, n_estimators=750, random_state=RS)
alt_results['GB_new'] = alt_metics(data, 'eur_price', model=GB)
alt_results

Unnamed: 0,DT,DT_new,LR,LR(ss),LR(rs),RF,RF_new,GB,GB_new
CV,0.2,0.17,0.24,0.24,0.24,0.14,0.14,0.18,0.15
TST0.3,0.2,0.17,0.23,0.23,0.23,0.14,0.14,0.18,0.15


### ExtraTreesRegressor

In [26]:
from sklearn.ensemble import ExtraTreesRegressor

In [27]:
# Параметры по умолчанию
ETR = ExtraTreesRegressor(random_state=RS)

alt_results['ETR'] = alt_metics(data, 'eur_price', model=ETR)
alt_results

Unnamed: 0,DT,DT_new,LR,LR(ss),LR(rs),RF,RF_new,GB,GB_new,ETR
CV,0.2,0.17,0.24,0.24,0.24,0.14,0.14,0.18,0.15,0.14
TST0.3,0.2,0.17,0.23,0.23,0.23,0.14,0.14,0.18,0.15,0.14


In [28]:
# Подбор параметров
'''X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=RS)

ETR = ExtraTreesRegressor(random_state=RS)

param_grid = {'learning_rate':[0.0001, 0.001, 0.01, 0.1, 1], 
              'n_estimators':[100, 250, 500, 750, 1000],
              'max_depth': [10, 25, 40, 55, 70, 85, 100, None], 
              'min_samples_split': [2, 5, 10], 
              'min_samples_leaf': [1, 2, 4], 
              'max_features': ['auto', 'sqrt']}
gridsearch = GridSearchCV(GB, param_grid, n_jobs=-1, cv=3)
gridsearch.fit(X_train, y_train)
model = gridsearch.best_estimator_

# печатаем параметры
best_parameters = model.get_params()
for param_name in best_parameters:
    print(param_name, best_parameters[param_name], '\n')'''
# Не хватило вычислительной мощности

"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=RS)\n\nETR = ExtraTreesRegressor(random_state=RS)\n\nparam_grid = {'learning_rate':[0.0001, 0.001, 0.01, 0.1, 1], \n              'n_estimators':[100, 250, 500, 750, 1000],\n              'max_depth': [10, 25, 40, 55, 70, 85, 100, None], \n              'min_samples_split': [2, 5, 10], \n              'min_samples_leaf': [1, 2, 4], \n              'max_features': ['auto', 'sqrt']}\ngridsearch = GridSearchCV(GB, param_grid, n_jobs=-1, cv=3)\ngridsearch.fit(X_train, y_train)\nmodel = gridsearch.best_estimator_\n\n# печатаем параметры\nbest_parameters = model.get_params()\nfor param_name in best_parameters:\n    print(param_name, best_parameters[param_name], '\n')"

### HistGradientBoosting

In [29]:
from sklearn.ensemble import HistGradientBoostingRegressor

In [30]:
# Параметры по умолчанию
HGB = HistGradientBoostingRegressor(random_state=RS)

alt_results['HGB'] = alt_metics(data, 'eur_price', model=HGB)
alt_results

Unnamed: 0,DT,DT_new,LR,LR(ss),LR(rs),RF,RF_new,GB,GB_new,ETR,HGB
CV,0.2,0.17,0.24,0.24,0.24,0.14,0.14,0.18,0.15,0.14,0.15
TST0.3,0.2,0.17,0.23,0.23,0.23,0.14,0.14,0.18,0.15,0.14,0.15
