In [1]:
import random
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from pathlib import Path

In [2]:
# Notebook-wide configuration.
N_FOLDS = 5          # number of cross-validation folds
SEED = 14_300_631    # single seed reused by every RNG and model in the notebook

In [3]:
# Seed both global random number generators (NumPy's legacy global RNG and the
# standard-library one) from the same constant so reruns are repeatable.
np.random.seed(SEED)
random.seed(SEED)

In [4]:
# Legacy path (kept for reference only, not executed): reading the raw
# ';'-separated CSV files. The commented-out preprocessing cells further down
# refer to these raw_* frames.
# raw_data_dir = Path('../data/raw')
# raw_train = pd.read_csv(raw_data_dir / 'train.csv', sep=';')
# raw_test = pd.read_csv(raw_data_dir / 'test.csv', sep=';')
# raw_education = pd.read_csv(raw_data_dir / 'education.csv', sep=';')
# raw_employements = pd.read_csv(raw_data_dir / 'employements.csv', sep=';')
# raw_worldskills = pd.read_csv(raw_data_dir / 'worldskills.csv', sep=';')

# Active path: load the already-cleaned train/test frames from the interim directory.
# NOTE: unpickling can execute arbitrary code -- only load pickle files that were
# produced by this project, never files obtained from an untrusted source.
INTERIM_DIR = Path('../data/interim')
train = pd.read_pickle(INTERIM_DIR / 'train_clean.pkl')
test = pd.read_pickle(INTERIM_DIR / 'test_clean.pkl')
# train_test_clean = pd.read_csv('../data/interim/train_test_clean.csv', parse_dates=['publish_date'], keep_default_na=False)

In [7]:
# Keep only the year of the publication date on the training frame (it is used
# later as the stratification key), then discard the raw date from both frames.
train = (
    train
    .assign(publish_year=train['publish_date'].dt.year)
    .drop(columns='publish_date')
)
test = test.drop(columns='publish_date')

In [7]:
# def preprocess_education(df):
#     df['institution'] = df['institution'].str.lower().str.replace('\"', '')
#     df = df.drop('description', axis=1)
#     return df

In [8]:
# def preprocess_employements(df):
#     df['employer'] = df['employer'].str.lower().str.replace('\"', '').fillna('N/A').astype('category')
#     df['position'] = df['position'].str.lower().str.replace('\"', '').fillna('N/A').astype('category')
#     df['start_date'] = pd.to_datetime(df['start_date'], errors='coerce')
#     df['finish_date'] = pd.to_datetime(df['finish_date'], errors='coerce')
#     df['work_duration'] = df['finish_date'] - df['start_date']
#     df['work_duration'] = df['work_duration'].dt.days
#     df = df.drop(['achievements', 'responsibilities', 'start_date', 'finish_date'], axis=1)
#     return df

In [9]:
# def preprocess_worldskills(df):
#     df['status'] = df['status'].fillna('N/A').astype('category')
#     df['int_name'] = df['int_name'].fillna('N/A').astype('category')
#     df['ru_name'] = df['ru_name'].fillna('N/A').astype('category')
#     df['code'] = df['code'].fillna('N/A').astype('category')
#     df['is_international'] = df['is_international'].fillna('N/A').astype('category')
#     return df

In [10]:
# def postprocess_megred_dataset(df):
#     df['graduation_year'] = df['graduation_year'].fillna(-1).astype('int').astype('category')
#     df['institution'] = df['institution'].fillna('N/A').astype('category')
#     df['employer'] = df['employer'].fillna('N/A').astype('category')
#     df['position'] = df['position'].fillna('N/A').astype('category')
#     return df

In [11]:
# train = preprocess_train_test(raw_train)
# test = preprocess_train_test(raw_test)
# education = preprocess_education(raw_education)
# employements = preprocess_employements(raw_employements)
# # worldskills = preprocess_worldskills(raw_worldskills)

In [12]:
# full_train = pd.merge(train, education, how='left', on='id')
# full_train = pd.merge(full_train, employements, how='left', on='id')
# # full_train = pd.merge(full_train, worldskills, how='left', on='id')
# full_train = postprocess_megred_dataset(full_train)

In [13]:
# full_test = pd.merge(test, education, how='left', on='id')
# full_test = pd.merge(full_test, employements, how='left', on='id')
# # full_test = pd.merge(full_test, worldskills, how='left', on='id')
# full_test = postprocess_megred_dataset(full_test)

In [8]:
# The row identifier is not a feature: remove it from the training frame and
# build the test feature matrix without it. `test` itself keeps its 'id' column.
train = train.drop(columns=['id'])
X_test = test.drop(columns=['id'])

In [9]:
# Shuffled, seeded stratified splitter shared by the cross-validation loop.
skf = StratifiedKFold(
    n_splits=N_FOLDS,
    shuffle=True,
    random_state=SEED,
)

In [10]:
from sklearn.metrics import mean_squared_error

# Two-stage model, cross-validated on folds stratified by publish year:
#   1. a classifier estimates P(salary > 0);
#   2. a regressor, fitted only on rows with salary > 0, estimates the target.
# A row's final prediction is (regressor output) * (classifier probability).
#
# NOTE(review): the exp(...) - 1 inverse applied to the test predictions and the
# "RMSLE" label suggest that `salary` is stored as log1p(salary) -- confirm upstream.
# NOTE(review): both models pick their best iteration (use_best_model) on the same
# fold that is scored afterwards, so the reported CV metric is optimistic.

# Settings shared by the classifier and the regressor.
model_params = dict(
    iterations=2000,
    random_seed=SEED,
    task_type='GPU',
    use_best_model=True,
)
# Columns that are never used as features: the stratification key and the target.
non_feature_cols = ['publish_year', 'salary']

cv_metrics = []        # one validation score per fold
test_predictions = []  # one array of test-set predictions per fold
for idx, (train_indexes, val_indexes) in enumerate(skf.split(train, train['publish_year'])):
    # split() yields POSITIONAL row numbers, so rows must be taken with .iloc.
    # .loc would interpret them as index labels and select the wrong rows (or
    # raise) whenever `train` does not carry a default 0..n-1 index.
    fold_train = train.iloc[train_indexes]
    fold_val = train.iloc[val_indexes]

    X_train = fold_train.drop(non_feature_cols, axis=1)
    y_train = fold_train['salary']

    X_val = fold_val.drop(non_feature_cols, axis=1)
    y_val = fold_val['salary']

    # Every 'category'-dtype column is handed to CatBoost as a categorical feature.
    cat_features = X_train.select_dtypes('category').columns.values

    # 1st model - zeros classifier: the label is 1 when salary > 0 and 0 otherwise.
    y_clf_train = (y_train > 0).astype('int')
    y_clf_val = (y_val > 0).astype('int')

    clf_model = CatBoostClassifier(**model_params)
    print('--------- Train zeros classifier ---------')
    clf_model.fit(
        X_train,
        y_clf_train,
        eval_set=(X_val, y_clf_val),
        cat_features=cat_features,
        verbose_eval=250,
    )
    # Column 1 is the probability of label 1, i.e. P(salary > 0) -- not the
    # probability of a zero, despite the variable name.
    val_zero_probes = clf_model.predict_proba(X_val)[:, 1]

    # 2nd model - regressor: trained and validated on positive-salary rows only.
    train_is_positive = y_train > 0
    val_is_positive = y_val > 0

    X_reg_train = X_train[train_is_positive]
    y_reg_train = y_train[train_is_positive]

    X_reg_val = X_val[val_is_positive]
    y_reg_val = y_val[val_is_positive]

    reg_model = CatBoostRegressor(**model_params)
    print('--------- Train regressor ---------')
    reg_model.fit(
        X_reg_train,
        y_reg_train,
        eval_set=(X_reg_val, y_reg_val),
        cat_features=cat_features,
        verbose_eval=250,
    )

    # Score the combined prediction on the WHOLE validation fold (zeros included).
    y_val_reg = reg_model.predict(X_val)
    y_val_pred = y_val_reg * val_zero_probes
    # RMSE on the stored target scale; reported as RMSLE (see the NOTE at the top).
    val_rmsle = np.sqrt(mean_squared_error(y_val, y_val_pred))
    cv_metrics.append(val_rmsle)
    print(f'Fold {idx + 1}:: RMSLE = {val_rmsle}')

    # Test-set prediction of this fold's model pair, mapped back with exp(x) - 1.
    # np.expm1 computes exactly that, without the precision loss of exp(x) - 1
    # for values of x close to zero.
    test_zero_probes = clf_model.predict_proba(X_test)[:, 1]
    y_test_reg = reg_model.predict(X_test)
    test_predictions.append(np.expm1(y_test_reg * test_zero_probes))

print('RMSLE by folds:', cv_metrics)
print(f'Mean CV RMSLE: {np.mean(cv_metrics)}')

--------- Train zeros classifier ---------
Learning rate set to 0.033482
0:	learn: 0.6164779	test: 0.6161844	best: 0.6161844 (0)	total: 32.9ms	remaining: 1m 5s
250:	learn: 0.0423027	test: 0.0425762	best: 0.0425762 (250)	total: 7.21s	remaining: 50.3s
500:	learn: 0.0414173	test: 0.0421593	best: 0.0421585 (498)	total: 14.4s	remaining: 43.1s
750:	learn: 0.0407736	test: 0.0420138	best: 0.0420138 (750)	total: 21.5s	remaining: 35.8s
1000:	learn: 0.0401758	test: 0.0419619	best: 0.0419500 (978)	total: 28.7s	remaining: 28.6s
1250:	learn: 0.0396179	test: 0.0418936	best: 0.0418934 (1249)	total: 35.9s	remaining: 21.5s
1500:	learn: 0.0390901	test: 0.0419155	best: 0.0418872 (1258)	total: 43.3s	remaining: 14.4s
1750:	learn: 0.0385957	test: 0.0419218	best: 0.0418872 (1258)	total: 50.7s	remaining: 7.21s
1999:	learn: 0.0380507	test: 0.0419529	best: 0.0418872 (1258)	total: 58s	remaining: 0us
bestTest = 0.04188718001
bestIteration = 1258
Shrink model to first 1259 iterations.
--------- Train regressor ----

In [12]:
# Average the per-fold test predictions element-wise and pair the result with
# the test ids to form the submission frame.
mean_test_salary = np.mean(test_predictions, axis=0)
submit = pd.DataFrame({'id': test['id'], 'salary': mean_test_salary})

In [13]:
# Display summary statistics of the submission frame (the id and salary columns).
submit.describe()

Unnamed: 0,id,salary
count,131259.0,131259.0
mean,218912.071987,33945.440579
std,126301.758751,15962.10467
min,2.0,0.177108
25%,109499.0,22794.281759
50%,218986.0,29489.444979
75%,328305.5,40507.385598
max,437528.0,150903.093342


In [19]:
# Write the submission as CSV, without the DataFrame index column.
submit_path = '../submits/catboost-2levels-5fold-cleaned_datas.csv'
submit.to_csv(submit_path, index=False)