In [12]:
# Print every feature next to the importance score the regressor assigned to it.
# NOTE(review): this cell reads `reg_model`, which is only created inside the
# cross-validation loop further down the notebook, so it has to be run after that
# cell; after a full run it is the regressor fitted in the last fold.
importance_pairs = zip(reg_model.feature_names_, reg_model.feature_importances_)
for feature, importance in importance_pairs:
    print(f'{feature} ::: {importance}')

region ::: 11.991139976820293
industry ::: 5.753774440289299
locality_name ::: 9.050816996919382
education_type ::: 1.6193023459517961
citizenship ::: 0.9086198329973372
employement_type ::: 1.1182903850519306
age ::: 0.8487795083888682
gender ::: 3.503135147910313
experience ::: 1.271673335348249
salary_desired ::: 20.001812751671093
relocation_ready ::: 5.178486553883535
travel_ready ::: 5.937543985167039
retraining_ready ::: 0.9385804668319083
is_worldskills_participant ::: 0.0
has_qualifications ::: 0.0
completeness_rate ::: 2.8426932250329715
number_nans ::: 0.00031110342871571307
positions_first ::: 13.061286525871672
positions_second ::: 0.5154726323308979
positions_other ::: 1.0448970347569018
has_drive ::: 0.7338201132350958
has_license_A ::: 0.0
has_license_B ::: 0.0
has_license_C ::: 0.0
has_license_D ::: 0.0
has_license_E ::: 0.0
watch_yes ::: 0.0
flexible_yes ::: 0.0
irregular_yes ::: 0.0
parttime_yes ::: 0.0
fulltime_yes ::: 0.0
shift_yes ::: 0.0
days_between_c_m ::: 6.45

In [1]:
import random
from pathlib import Path

import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold

In [2]:
# Notebook-wide configuration constants.
N_FOLDS = 5  # intended number of cross-validation folds
SEED = 14_300_631  # seed passed to the Python/NumPy RNGs and to the CatBoost models

In [3]:
# Seed both the standard-library and the NumPy global RNGs with the same value
# so that repeated runs draw the same random numbers.
for seed_rng in (random.seed, np.random.seed):
    seed_rng(SEED)

In [4]:
# Read the preprocessed train and test frames from disk.
# Pickle files can execute arbitrary code when loaded, so only point this at
# files you produced yourself or otherwise trust.
train, test = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        '../data/preprocessed/train_final.pkl',
        '../data/preprocessed/test_final.pkl',
    )
)

In [5]:
# Summarise the training frame: index range, number of columns, dtypes and memory usage.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306270 entries, 0 to 306269
Columns: 470 entries, id to position_clean_99
dtypes: bool(17), category(14), datetime64[ns](1), float32(400), float64(15), int64(23)
memory usage: 571.8 MB


In [6]:
# Same summary for the test frame, for comparison with the training frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131259 entries, 0 to 131258
Columns: 469 entries, id to position_clean_99
dtypes: bool(17), category(14), datetime64[ns](1), float32(400), float64(14), int64(23)
memory usage: 245.3 MB


In [7]:
# Remove the columns that are not fed to the models.
# `publish_year` is dropped from the test features right away but kept in `train`:
# the cross-validation cell stratifies its folds on it and drops it per fold.
train = train.drop(columns=['id', 'publish_date'])
X_test = test.drop(columns=['id', 'publish_date', 'publish_year'])

In [8]:
# Two-stage cross-validation. In every fold:
#   1. a classifier estimates P(salary > 0) for each row;
#   2. a regressor, fitted only on rows with salary > 0, estimates the salary value;
#   3. the final prediction is the product of the two.
# Results: `cv_metrics` (one validation score per fold) and `test_predictions`
# (one array of test predictions per fold, averaged later for the submission).
# `mean_squared_error` comes from the notebook's top import cell.

# `skf` was referenced here but never created in any cell, so a fresh
# "Restart & Run All" failed with NameError. Build it from the constants and
# the StratifiedKFold import the notebook already declares.
# NOTE(review): shuffle=True / random_state=SEED is an assumption — confirm it
# matches the splitter used for the original submission.
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

cv_metrics = []
test_predictions = []
for idx, (train_indexes, val_indexes) in enumerate(skf.split(train, train['publish_year'])):
    # split() yields positional indices, so select rows with .iloc; .loc only
    # gave the same rows because `train` has a default RangeIndex.
    fold_train = train.iloc[train_indexes]
    fold_val = train.iloc[val_indexes]

    # `publish_year` is only the stratification label and `salary` is the target.
    X_train = fold_train.drop(['publish_year', 'salary'], axis=1)
    y_train = fold_train['salary']

    X_val = fold_val.drop(['publish_year', 'salary'], axis=1)
    y_val = fold_val['salary']

    cat_features = X_train.select_dtypes('category').columns.tolist()

    # 1st model - zeros classifier (target: is the salary strictly positive?)
    y_clf_train = (y_train > 0).astype('int')
    y_clf_val = (y_val > 0).astype('int')

    clf_model = CatBoostClassifier(
        iterations=2000,
        random_seed=SEED,
        task_type='GPU',
        use_best_model=True,
    )
    print('--------- Train zeros classifier ---------')
    # NOTE(review): the validation fold is also the eval_set that
    # use_best_model selects the iteration on, so the fold score reported
    # below is somewhat optimistic.
    clf_model.fit(
        X_train,
        y_clf_train,
        eval_set=(X_val, y_clf_val),
        cat_features=cat_features,
        verbose_eval=250,
    )
    # Column 1 of predict_proba is the probability of the positive class (salary > 0).
    val_zero_probes = clf_model.predict_proba(X_val)[:, 1]

    # 2nd model - regressor, trained and early-evaluated on positive salaries only
    train_positive = y_train > 0
    val_positive = y_val > 0

    X_reg_train = X_train[train_positive]
    y_reg_train = y_train[train_positive]

    X_reg_val = X_val[val_positive]
    y_reg_val = y_val[val_positive]

    reg_model = CatBoostRegressor(
        iterations=2000,
        random_seed=SEED,
        task_type='GPU',
        use_best_model=True,
    )
    print('--------- Train regressor ---------')
    reg_model.fit(
        X_reg_train,
        y_reg_train,
        eval_set=(X_reg_val, y_reg_val),
        cat_features=cat_features,
        verbose_eval=250,
    )

    # Score on the whole validation fold (zeros included): regression output
    # weighted by the probability that the salary is non-zero.
    y_val_reg = reg_model.predict(X_val)
    y_val_pred = y_val_reg * val_zero_probes
    # NOTE(review): test predictions below are mapped back with expm1, which
    # implies `salary` is stored as log1p(salary); under that assumption this
    # RMSE is the RMSLE. Confirm against the preprocessing step.
    val_rmsle = np.sqrt(mean_squared_error(y_val, y_val_pred))
    cv_metrics.append(val_rmsle)
    print(f'Fold {idx + 1}:: RMSLE = {val_rmsle}')

    test_zero_probes = clf_model.predict_proba(X_test)[:, 1]
    y_test_reg = reg_model.predict(X_test)
    # expm1 is the numerically stable form of exp(x) - 1 for small x.
    test_predictions.append(np.expm1(y_test_reg * test_zero_probes))

print('RMSLE by folds:', cv_metrics)
print(f'Mean CV RMSLE: {np.mean(cv_metrics)}')

--------- Train zeros classifier ---------
Learning rate set to 0.031244
0:	learn: 0.6115056	test: 0.6113204	best: 0.6113204 (0)	total: 56.7ms	remaining: 1m 53s
250:	learn: 0.0218036	test: 0.0136727	best: 0.0136727 (250)	total: 15.2s	remaining: 1m 45s
500:	learn: 0.0210569	test: 0.0132801	best: 0.0132801 (500)	total: 30.2s	remaining: 1m 30s
750:	learn: 0.0206659	test: 0.0131397	best: 0.0131397 (750)	total: 45s	remaining: 1m 14s
1000:	learn: 0.0203477	test: 0.0130441	best: 0.0130439 (999)	total: 59.7s	remaining: 59.6s
1250:	learn: 0.0200425	test: 0.0129669	best: 0.0129669 (1250)	total: 1m 14s	remaining: 44.6s
1500:	learn: 0.0197688	test: 0.0129260	best: 0.0129257 (1497)	total: 1m 29s	remaining: 29.7s
1750:	learn: 0.0194948	test: 0.0128814	best: 0.0128804 (1744)	total: 1m 44s	remaining: 14.9s
1999:	learn: 0.0192504	test: 0.0128673	best: 0.0128662 (1996)	total: 1m 59s	remaining: 0us
bestTest = 0.01286624732
bestIteration = 1996
Shrink model to first 1997 iterations.
--------- Train regres

In [9]:
# Build the submission frame: the test ids plus, for each row, the mean of the
# per-fold test predictions.
submit = test[['id']].assign(salary=np.mean(test_predictions, axis=0))

In [10]:
# Sanity-check the submission: row count and distribution of the predicted salaries.
# NOTE(review): the stored output below reports 295,633 rows while test.info()
# reports 131,259 entries — the saved output looks stale; re-run before trusting it.
submit.describe()

Unnamed: 0,id,salary
count,295633.0,295633.0
mean,218856.382704,37703.42551
std,126137.696962,16307.855962
min,2.0,0.016644
25%,109922.0,25676.76234
50%,218792.0,33435.884884
75%,327955.0,45972.671778
max,437528.0,165853.400653


In [11]:
# Write the submission to CSV; index=False keeps the DataFrame index out of the file.
submit.to_csv('../submits/catboost-2levels-5fold-finalset-2k.csv', index=False)