In [None]:
import numpy as np
import pandas as pd

from catboost import CatBoostRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_absolute_percentage_error, root_mean_squared_error

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the dataset the rest of the notebook works on (path is relative to the working directory).
df_ml = pd.read_csv('../data/ml_data.csv')

In [4]:
# Pure-noise control column: the feature-selection loop uses its score as the
# "no better than random" baseline. Drawn from a seeded generator so the column
# (and everything computed from it) is identical on every Restart & Run All.
df_ml['random_feature'] = np.random.default_rng(42).standard_normal(len(df_ml))

In [5]:
# Pairwise correlations between the numeric columns only; the `random_feature`
# row/column shows what correlation with pure noise looks like for comparison.
df_ml.corr(numeric_only=True)

Unnamed: 0,price,is_dealer,year,engine,mileage,power_horse,random_feature
price,1.0,0.061369,0.552972,0.433304,-0.301735,0.638362,-0.003763
is_dealer,0.061369,1.0,0.073512,0.038897,-0.046068,0.070572,0.002108
year,0.552972,0.073512,1.0,-0.031034,-0.44802,0.16529,-0.005974
engine,0.433304,0.038897,-0.031034,1.0,0.128778,0.826992,0.004544
mileage,-0.301735,-0.046068,-0.44802,0.128778,1.0,0.031077,-0.001602
power_horse,0.638362,0.070572,0.16529,0.826992,0.031077,1.0,-0.000574
random_feature,-0.003763,0.002108,-0.005974,0.004544,-0.001602,-0.000574,1.0


In [6]:
def eval_metrics(y_test, y_pred):
    """Score predictions against the true targets with four regression metrics.

    Args:
        y_test: True target values.
        y_pred: Predicted target values, aligned with ``y_test``.

    Returns:
        Tuple ``(rmse, mae, mape, r2)`` -- note that MAPE comes before R2.
    """
    scores = {
        'rmse': root_mean_squared_error(y_test, y_pred),
        'mae': mean_absolute_error(y_test, y_pred),
        'r2': r2_score(y_test, y_pred),
        'mape': mean_absolute_percentage_error(y_test, y_pred),
    }
    return scores['rmse'], scores['mae'], scores['mape'], scores['r2']

In [8]:
def catboost_learn_predict(params, X_train, X_test, y_train, y_test, features:list):
    """Fit a CatBoostRegressor on a subset of columns and return its test MAPE.

    Args:
        params: Mapping that must contain 'iterations', 'learning_rate', 'depth',
            'verbose' and 'random_state'; any other keys are ignored here.
        X_train: Training features, indexable by a list of column names.
        X_test: Evaluation features, indexable by a list of column names.
        y_train: Training targets.
        y_test: Evaluation targets.
        features: Column names to keep from ``X_train`` / ``X_test``.

    Returns:
        The MAPE value reported by ``eval_metrics`` for the predictions on ``X_test``.
    """
    train_subset = X_train[features]
    test_subset = X_test[features]

    # Only these keys are forwarded to the regressor constructor.
    model_keys = ('iterations', 'learning_rate', 'depth', 'verbose', 'random_state')
    regressor = CatBoostRegressor(**{key: params[key] for key in model_keys})

    regressor.fit(train_subset, y_train)

    predictions = regressor.predict(test_subset)
    _rmse, _mae, mape, _r2 = eval_metrics(y_test, predictions)

    return mape

In [9]:
# Feature matrix: every column except the target (`price`) and `ad_description`.
X = df_ml.drop(columns=['price', 'ad_description'])
# One-hot encode the listed columns so that each category level becomes its own
# candidate feature.
X_numeric = pd.get_dummies(X, columns=[
        'is_dealer', 'transmission', 'car_body',
        'wheel_drive','fuel_type', 'brand',
        'region',
        ])
y = df_ml['price']
# Fixed seed: without it every kernel restart produces a different split, so
# the metrics and any features chosen from them cannot be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    X_numeric, y, test_size=0.25, random_state=42
)
X_train.shape

(30276, 55)

In [27]:
# Hyper-parameters handed to catboost_learn_predict.
# NOTE(review): catboost_learn_predict forwards only iterations, learning_rate,
# depth, verbose and random_state to the regressor; `cat_features` is not read
# there -- confirm whether it is still needed.
params = dict(
    iterations=200,
    learning_rate=0.1,
    depth=6,
    cat_features=[
        'is_dealer', 'transmission', 'car_body',
        'wheel_drive', 'fuel_type', 'brand',
        'region',
    ],
    verbose=False,
    random_state=42,
)

In [None]:
# Greedy forward feature selection driven by MAPE.
# Each round fits one model per remaining candidate (current set + candidate)
# and keeps the candidate with the lowest MAPE. The search stops when the best
# candidate is no better than the pure-noise `random_feature`, or when it
# improves on the previous round by no more than MIN_IMPROVEMENT.
# NOTE(review): candidates are scored on X_test / y_test, so the MAPE reported
# for the final feature set is optimistically biased by the selection itself --
# consider scoring on a separate validation split.
MIN_IMPROVEMENT = 0.001  # smallest MAPE decrease that justifies one more feature

features_left = [*X_train.columns]
features_in_model = []
# No previous round yet: infinity makes the first round always count as an improvement.
min_metric_prev = float('inf')
# Initialised up front so a stale value left in the kernel can never be picked
# up, and so the loop still works when `random_feature` is not among the columns.
random_feature_metric = float('inf')

while features_left:

    print(f'Checking {", ".join(features_in_model)}.')

    # Keyed by feature name (not by score) so that two candidates with the same
    # MAPE cannot overwrite each other.
    metric_by_feature = {}
    for feature in features_left:
        # Build a fresh list per candidate instead of mutating features_in_model.
        candidate_features = [*features_in_model, feature]
        mape = catboost_learn_predict(params, X_train, X_test, y_train, y_test, candidate_features)
        if feature == 'random_feature':
            random_feature_metric = mape
        metric_by_feature[feature] = mape

    min_feature = min(metric_by_feature, key=metric_by_feature.get)
    min_metric = metric_by_feature[min_feature]

    if min_metric >= random_feature_metric or min_metric_prev - min_metric <= MIN_IMPROVEMENT:
        # The best candidate of this round is reported but NOT added to the model.
        print('Feature selection finished.')
        print(f'Selected {min_feature} metric: {min_metric}.')
        print(f'Previous metric value = {min_metric_prev}')
        print(f'Random feature metric value = {random_feature_metric}')
        print(f'Final result: {", ".join(features_in_model)}.')
        break

    min_metric_prev = min_metric

    features_in_model.append(min_feature)
    print('Selected', min_feature, 'metric:', min_metric, '.')
    features_left.remove(min_feature)
else:
    # Every candidate was accepted, so there is nothing left to evaluate.
    print('Feature selection finished.')
    print(f'Final result: {", ".join(features_in_model)}.')

        

In [138]:
# Counts a hard-coded list of feature names.
# NOTE(review): presumably pasted from the output of the feature-selection loop;
# it is not derived from `features_in_model`, so it goes stale if the selection
# is re-run -- confirm and consider `len(features_in_model)` instead.
len(['power_horse', 'year', 'engine', 'brand_Япония', 'region_Южный', 'mileage', 'wheel_drive_полный', 'brand_Россия', 'brand_Китай', 'brand_Южная Корея', 'brand_США', 'brand_Франция', 'transmission_AT', 'car_body_внедорожник', 'wheel_drive_задний', 'brand_Чехия', 'fuel_type_электро', 'car_body_хетчбэк', 'region_Приволжский', 'car_body_лифтбек', 'region_Северо-Западный', 'is_dealer_True', 'brand_Италия', 'fuel_type_газ', 'car_body_купе', 'region_Сибирский', 'brand_Украина', 'brand_Испания', 'car_body_микроавтобус'])

29