In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sys
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from tqdm.notebook import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import StackingRegressor, RandomForestRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.base import clone
import json

In [None]:
# Single seed passed to the train/validation split and to the seeded models in this notebook.
RANDOM_SEED = 42

In [None]:
def mape(y_true, y_pred):
    """Mean absolute percentage error, returned as a fraction (0.1 means 10%).

    Both inputs are converted to NumPy arrays first, so values are compared
    position by position. Without this, two pandas Series with different
    indexes would be aligned on their labels and yield NaN, and plain Python
    lists would raise a TypeError on subtraction.

    Args:
        y_true: array-like of actual values.
        y_pred: array-like of predicted values, same length as ``y_true``.

    Returns:
        Mean of ``|y_pred - y_true| / |y_true|`` over all elements. Zeros in
        ``y_true`` are not special-cased and produce inf/nan.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_pred - y_true) / y_true))

In [None]:
VERSION    = 20     # appended to the submission file name written in the last cell
VAL_SIZE   = 0.20   # fraction of the training rows held out for validation (20%)

In [None]:
# Read the three input files in one go:
#   train90.csv           - dataset used to train the model (parsed auto.ru)
#   test.csv              - rows to predict for the submission
#   sample_submission.csv - submission template
train, test, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train90.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
# Rows without a price have no target value and cannot be used for training.
train = train.dropna(subset=['price'])

# Feature columns shared by the train and test sets.
columns = ['bodyType','brand','color','fuelType','numberOfDoors','productionDate','vehicleTransmission','engineDisplacement','enginePower']

# Take explicit copies: columns are added to these frames in a later cell, and
# assigning into a plain column selection raises SettingWithCopyWarning.
df_train = train[columns].copy()
df_test = test[columns].copy()

# Target: the price of the remaining training rows.
y = train['price']

In [None]:
# Flag each row with its origin so the combined frame can be split again below.
# .assign returns a new frame, so nothing is written into a slice of train/test.
df_train = df_train.assign(sample=1)  # 1 marks training rows
df_test = df_test.assign(sample=0)    # 0 marks test rows

# Stack the test rows on top of the train rows with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
data = pd.concat([df_test, df_train], sort=False).reset_index(drop=True)

# Encode the listed columns as integer category codes over train+test together,
# so the same value maps to the same code in both sets (missing values become -1).
# NOTE(review): engineDisplacement and enginePower are encoded as categories too;
# if they hold numeric-like strings, the codes follow category sort order rather
# than numeric magnitude - confirm this is intended.
for column in ['bodyType','brand','color','fuelType','vehicleTransmission','engineDisplacement','enginePower']:
    data[column] = data[column].astype('category').cat.codes

# Split back into the training features and the submission features.
X = data.query('sample == 1').drop(['sample'], axis=1)
X_sub = data.query('sample == 0').drop(['sample'], axis=1)

In [None]:
# Shuffle and hold out VAL_SIZE of the training rows for validation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=VAL_SIZE,
    shuffle=True,
    random_state=RANDOM_SEED,
)

# Training features with the target attached by position as a 'price' column.
tmp_train = X_train.assign(price=np.array(y_train))

**Find the best hyperparameters for the base models**

In [None]:
# Disabled: 5-fold grid search over max_depth for the random forest, scored by MAPE.
# The stacking model defined later in this notebook uses max_depth=30.
# param_grid = {'max_depth':[1,5,10,30,100]}
# rfr = RandomForestRegressor(random_state=42)
# rfr2 = GridSearchCV(estimator=rfr, scoring='neg_mean_absolute_percentage_error', n_jobs=-1, cv=5, param_grid= param_grid)
# rfr2.fit(X_train, y_train)

In [None]:
# rfr2.best_params_

In [None]:
# Disabled: 5-fold grid search over leaf_size and n_neighbors for the k-NN regressor, scored by MAPE.
# The stacking model defined later in this notebook uses leaf_size=1, n_neighbors=2.
# param_grid = {'leaf_size':[1,5,10,30,100], 
#               'n_neighbors':[1,2,5,10]}
# knr = KNeighborsRegressor()
# knr2 = GridSearchCV(estimator=knr, scoring='neg_mean_absolute_percentage_error', n_jobs=-1, cv=5, param_grid= param_grid)
# knr2.fit(X_train, y_train)

In [None]:
# knr2.best_params_

In [None]:
# Disabled: 5-fold grid search over the number of CatBoost iterations, scored by MAPE.
# Note that 'dtr' / 'dtr2' here refer to a CatBoostRegressor, not a decision tree.
# The stacking model defined later in this notebook uses iterations=5000.
# param_grid = {'iterations':[100,1000,5000,10000]}
# dtr = CatBoostRegressor(iterations = 5000,
#                           random_seed = RANDOM_SEED,
#                           eval_metric='MAPE',
#                           custom_metric=['R2', 'MAE'],
#                           silent=True)
# dtr2 = GridSearchCV(estimator=dtr, scoring='neg_mean_absolute_percentage_error', n_jobs=-1, cv=5, param_grid= param_grid)
# dtr2.fit(X_train, y_train)

In [None]:
# dtr2.best_params_

In [None]:
# Create stacking model
def get_stacking():
    """Build the (unfitted) stacking ensemble used for price prediction.

    Returns:
        A StackingRegressor with three base models ('knn', 'dtr', 'rfr')
        and a LinearRegression meta-learner, configured with cv=5.
    """
    # Base (level-0) models. The estimator names are kept unchanged because
    # they form parameter paths such as 'dtr__iterations'; note that 'dtr'
    # is a CatBoost model, not a decision tree.
    level0 = [
        ('knn', KNeighborsRegressor(leaf_size=1, n_neighbors=2)),
        # silent=True: CatBoost otherwise logs every boosting iteration, and
        # the stack fits this model several times, which floods the notebook
        # output. Logging does not affect the fitted model.
        ('dtr', CatBoostRegressor(iterations=5000, random_state=RANDOM_SEED, silent=True)),
        ('rfr', RandomForestRegressor(max_depth=30, random_state=RANDOM_SEED)),
    ]
    # Meta (level-1) learner that combines the base models' predictions.
    level1 = LinearRegression()
    return StackingRegressor(estimators=level0, final_estimator=level1, cv=5)

**Build a model**

In [None]:
# Instantiate the stacking ensemble; it is fitted in a later cell.
model = get_stacking()

In [None]:
# model = RandomForestRegressor(max_depth=30, random_state=RANDOM_SEED)

In [None]:
# Train on log(price); the prediction cells map the output back with np.exp.
model.fit(X_train, np.log(y_train))

In [None]:
# The model predicts log(price), so exponentiate to get prices back before scoring.
predict_test = np.exp(model.predict(X_test))

# Report the hold-out MAPE as a percentage with two decimals.
validation_mape_percent = mape(y_test, predict_test) * 100
print(f"Точность модели по метрике MAPE: {validation_mape_percent:0.2f}%")

In [None]:
# Predict for the submission rows and undo the log transform applied to the training target.
predict_submission = np.exp(model.predict(X_sub))

In [None]:
# Create submission file
# Put the predictions into the template's price column and write it to disk.
sample_submission['price'] = predict_submission
# Same file name as before ('submission_2_v<VERSION>.csv'), built with a single
# f-string instead of a placeholder-less f-string plus str() concatenation.
sample_submission.to_csv(f'submission_2_v{VERSION}.csv', index=False)
sample_submission.head(10)