In [5]:
import numpy as np
import pandas as pd

from catboost import CatBoostRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_absolute_percentage_error, root_mean_squared_error

In [46]:
# Load the train and test tables from their CSV files (paths are relative to the notebook).
train_csv, test_csv = '../data/train_data.csv', '../data/test_data.csv'
df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

In [47]:
# Add a pure-noise column to both frames; the feature-selection loop compares
# every candidate feature against it.
# The values come from a locally seeded Generator instead of the unseeded global
# NumPy RNG, so the column is identical after a kernel restart or a re-run of
# this cell and the selection result is reproducible.
rng = np.random.default_rng(42)
df_train['random_feature'] = rng.standard_normal(len(df_train))
df_test['random_feature'] = rng.standard_normal(len(df_test))

In [48]:
def eval_metrics(y_test, y_pred):
    """Score predictions against the true targets with four regression metrics.

    Args:
        y_test: True target values.
        y_pred: Predicted target values.

    Returns:
        Tuple ``(rmse, mae, mape, r2)``.
    """
    # The tuple order below is the return order callers unpack.
    scorers = (
        root_mean_squared_error,
        mean_absolute_error,
        mean_absolute_percentage_error,
        r2_score,
    )
    return tuple(scorer(y_test, y_pred) for scorer in scorers)

In [49]:
def catboost_learn_predict(params, X_train, X_test, y_train, y_test, features:list):
    """Fit a CatBoostRegressor on the given feature subset and return its test MAPE.

    Args:
        params: Mapping with the keys 'iterations', 'learning_rate', 'depth',
            'verbose' and 'random_state'. Any other key is ignored.
        X_train: Training feature frame.
        X_test: Test feature frame.
        y_train: Training target.
        y_test: Test target.
        features: Column names to keep from both feature frames.

    Returns:
        The mean absolute percentage error of the predictions on ``X_test``.
    """
    # Only these hyper-parameters are forwarded to the regressor.
    forwarded = ('iterations', 'learning_rate', 'depth', 'verbose', 'random_state')
    model = CatBoostRegressor(**{name: params[name] for name in forwarded})

    model.fit(X_train[features], y_train)
    predictions = model.predict(X_test[features])

    # eval_metrics returns (rmse, mae, mape, r2); only the MAPE is needed here.
    return eval_metrics(y_test, predictions)[2]

In [50]:
# Split each frame into the feature matrix and the 'price' target.
# The 'text' column is dropped from the features together with the target.
non_feature_columns = ['price', 'text']

X_train, y_train = df_train.drop(columns=non_feature_columns), df_train['price']
X_test, y_test = df_test.drop(columns=non_feature_columns), df_test['price']


In [16]:
# Show the training feature matrix (rendered by the notebook as the cell output).
X_train

Unnamed: 0,is_dealer,year,engine,mileage,power_horse,transmission_AT,transmission_CVT,transmission_MT,car_body_кабриолет,car_body_купе,...,brand_Япония,region_Другое,region_Приволжский,region_Северо-Западный,region_Сибирский,region_Уральский,region_Центральный,region_Южный,price_class_pred,random_feature
0,False,2008,1.6,200000,81,False,False,True,False,False,...,False,False,False,False,False,False,False,True,0,-0.580420
1,False,1997,1.5,34911,68,False,False,True,False,False,...,False,False,False,False,False,False,True,False,3,0.069009
2,False,2017,1.6,180000,102,False,False,True,False,False,...,True,False,False,False,False,False,False,True,5,0.849094
3,False,2010,1.6,165,81,False,False,True,False,False,...,False,False,False,False,False,False,False,True,2,0.989740
4,False,2004,1.8,282000,150,True,False,False,False,False,...,False,False,False,False,False,False,True,False,5,0.374891
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30334,False,2007,1.8,262000,120,False,False,True,False,False,...,True,False,False,False,False,False,False,True,2,1.970765
30335,False,2017,2.0,135580,249,True,False,False,False,False,...,False,False,False,False,False,False,True,False,9,-0.185157
30336,False,1997,3.0,350000,211,True,False,False,False,False,...,False,False,False,False,False,False,False,True,4,0.408747
30337,False,2011,1.6,194000,81,False,False,True,False,False,...,False,False,False,False,False,False,False,True,1,-0.849655


In [62]:
# Hyper-parameters for the CatBoost models trained during feature selection.
# NOTE(review): catboost_learn_predict forwards only iterations, learning_rate,
# depth, verbose and random_state, so 'cat_features' is currently not passed to
# the model.
params = dict(
    iterations=600,
    learning_rate=0.1,
    depth=6,
    cat_features=[
        'is_dealer', 'transmission', 'car_body',
        'wheel_drive', 'fuel_type', 'brand',
        'region',
    ],
    verbose=False,
    random_state=42,
)

In [63]:
def remove_constant_columns(df):
    """Remove constant columns from a DataFrame.

    A column is constant when it holds at most one distinct value, with missing
    values counted as a value of their own. An all-NaN column is therefore
    dropped, while a column that mixes a single value with NaN is kept.

    Args:
        df: DataFrame to filter. It is not modified.

    Returns:
        A new DataFrame without the constant columns, or None if ``df`` is not
        a DataFrame. A frame with no rows is returned as an unchanged copy.
    """
    if not isinstance(df, pd.DataFrame):
        return None

    # With zero rows every column has zero distinct values; keep all columns
    # rather than dropping the whole schema.
    if len(df) == 0:
        return df.copy()

    # dropna=False counts NaN as a distinct value, so "one value plus NaN" is
    # seen as 2 values and "only NaN" as 1.
    distinct_counts = df.nunique(dropna=False)
    constant_columns = distinct_counts[distinct_counts <= 1].index
    return df.drop(columns=constant_columns)

In [64]:
# Drop constant columns from the training features only. X_test is left as is;
# catboost_learn_predict later subsets both frames by the chosen feature names.
X_train = remove_constant_columns(X_train)

In [65]:
# Greedy forward feature selection driven by MAPE.
#
# Each round fits one model per remaining candidate (current selection plus that
# candidate) and keeps the candidate with the lowest MAPE. The search stops as
# soon as the best candidate is no better than the pure-noise 'random_feature'
# column, or improves on the previous round by at most MIN_IMPROVEMENT.
#
# NOTE(review): candidates are scored on X_test / y_test, so the same data drives
# both the selection and the reported metric; consider a separate validation split.
MIN_IMPROVEMENT = 0.001

features_left = [*X_train.columns]
features_in_model = []
# Starting value for "previous best"; the first round passes the improvement
# check whenever its MAPE is below 99.999.
min_metric_prev = 100

while True:

    print(f'Checking {", ".join(features_in_model)}.')

    # Keyed by feature name rather than by metric value, so candidates that
    # reach exactly the same MAPE cannot overwrite each other.
    metric_check = {
        feature: catboost_learn_predict(
            params, X_train, X_test, y_train, y_test, [*features_in_model, feature]
        )
        for feature in features_left
    }

    # Noise baseline for this round. 'random_feature' is never accepted (picking
    # it as the best candidate triggers the stop condition), so it is always
    # still among the candidates.
    random_feature_metric = metric_check['random_feature']

    # On ties the first candidate in features_left order wins.
    min_feature = min(metric_check, key=metric_check.get)
    min_metric = metric_check[min_feature]

    if min_metric >= random_feature_metric or min_metric_prev - min_metric <= MIN_IMPROVEMENT:
        # The best candidate of this round is reported but not added.
        print('Feature selection finished.')
        print(f'Selected {min_feature} metric: {min_metric}.')
        print(f'Previous metric value = {min_metric_prev}')
        print(f'Random feature metric value = {random_feature_metric}')
        print(f'Final result: {", ".join(features_in_model)}.')
        break

    min_metric_prev = min_metric

    features_in_model.append(min_feature)
    print('Selected', min_feature, 'metric:', min_metric)
    features_left.remove(min_feature)

        

Checking .
Selected price_class_pred metric: 0.434296835461523
Checking price_class_pred.
Selected power_horse metric: 0.3320592989599127
Checking price_class_pred, power_horse.
Selected year metric: 0.26427150570492575
Checking price_class_pred, power_horse, year.
Selected engine metric: 0.2547281097159831
Checking price_class_pred, power_horse, year, engine.
Selected brand_Россия metric: 0.2511523914647091
Checking price_class_pred, power_horse, year, engine, brand_Россия.
Selected region_Южный metric: 0.2466289579680923
Checking price_class_pred, power_horse, year, engine, brand_Россия, region_Южный.
Selected brand_Япония metric: 0.24364555927032136
Checking price_class_pred, power_horse, year, engine, brand_Россия, region_Южный, brand_Япония.
Selected mileage metric: 0.2401528284814482
Checking price_class_pred, power_horse, year, engine, brand_Россия, region_Южный, brand_Япония, mileage.
Selected brand_Китай metric: 0.23765040632229353
Checking price_class_pred, power_horse, year,

In [66]:
# Show the features accepted by the selection loop, in the order they were added.
features_in_model

['price_class_pred',
 'power_horse',
 'year',
 'engine',
 'brand_Россия',
 'region_Южный',
 'brand_Япония',
 'mileage',
 'brand_Китай',
 'brand_Южная Корея',
 'wheel_drive_полный',
 'brand_США',
 'wheel_drive_передний']