In [1]:
import random
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.model_selection import StratifiedKFold
from pathlib import Path

In [2]:
SEED = 14300631
N_FOLDS = 10

In [3]:
random.seed(SEED)
np.random.seed(SEED)

In [4]:
raw_data_dir = Path('../data/raw')
raw_train = pd.read_csv(raw_data_dir / 'train.csv', sep=';', parse_dates=['creation_date', 'modification_date', 'publish_date'])
raw_test = pd.read_csv(raw_data_dir / 'test.csv', sep=';', parse_dates=['creation_date', 'modification_date', 'publish_date'])
raw_education = pd.read_csv(raw_data_dir / 'education.csv', sep=';')
raw_employements = pd.read_csv(raw_data_dir / 'employements.csv', sep=';')
raw_worldskills = pd.read_csv(raw_data_dir / 'worldskills.csv', sep=';')

In [5]:
def filter_experience(x):
    if np.isnan(x) or x > 50:
        return np.nan
    return x

def filter_age(x):
    if np.isnan(x) or x < 14 or x > 83:
        return np.nan
    return x

In [6]:
def preprocess_train_test(df):
    df['publish_year'] = df['publish_date'].dt.year

    all_drive_licences = ['A', 'B', 'C', 'D', 'E']
    for licence_type in all_drive_licences:
        df[f'drive_licences_{licence_type}'] = df['drive_licences'].fillna('').apply(lambda x: licence_type in x)
    

    all_schedules = [
        ('vahta', 'Вахтовый метод'),
        ('gibkiy', 'Гибкий график'),
        ('nenorm', 'Ненормированный рабочий день'),
        ('nepoln', 'Неполный рабочий день'),
        ('poln', 'Полный рабочий день'),
        ('smena', 'Сменный график'),
    ]

    for schedule_label, schedule_type in all_schedules:
        df[f'schedule_{schedule_label}'] = df['schedule'].apply(lambda x: schedule_type in x)
    
    df['experience'] = df['experience'].apply(filter_experience)
    df['age'] = df['age'].apply(filter_age)
    df = df.drop([
        'locality', 'position', 'locality_name','drive_licences',
        'schedule', 'is_worldskills_participant', 'has_qualifications',
        'creation_date', 'modification_date','publish_date',
    ], axis=1)

    if 'salary' in df.columns:
        df = df[df['salary'] > 0]
        df['salary'] = np.log(df['salary'] + 1)
    
    df['salary_desired'] = np.log(df['salary_desired'] + 1)
    
    df['region'] = df['region'].astype('category')
    df['education_type'] = df['education_type'].astype('category')
    df['industry'] = df['industry'].astype('category')
    df['citizenship'] = df['citizenship'].astype('category')
    df['employement_type'] = df['employement_type'].astype('category')
    df['gender'] = df['gender'].astype('category')
    df['relocation_ready'] = df['relocation_ready'].astype('boolean')
    df['travel_ready'] = df['travel_ready'].astype('boolean')
    df['retraining_ready'] = df['retraining_ready'].astype('boolean')
    return df

In [7]:
def preprocess_education(df):
    df['graduation_year'] = df['graduation_year'].astype('category')
    df['institution'] = df['institution'].str.lower().str.replace('\"', '').astype('category')
    df = df.drop('description', axis=1)
    return df

In [8]:
def preprocess_employements(df):
    df['employer'] = df['employer'].str.lower().str.replace('\"', '').astype('category')
    df['position'] = df['position'].str.lower().str.replace('\"', '').astype('category')
    df['start_date'] = pd.to_datetime(df['start_date'], errors='coerce')
    df['finish_date'] = pd.to_datetime(df['finish_date'], errors='coerce')
    df['work_duration'] = df['finish_date'] - df['start_date']
    df['work_duration'] = df['work_duration'].dt.days
    df = df.drop(['achievements', 'responsibilities', 'start_date', 'finish_date'], axis=1)
    return df

In [9]:
def preprocess_worldskills(df):
    df['status'] = df['status'].astype('category')
    df['int_name'] = df['int_name'].astype('category')
    df['ru_name'] = df['ru_name'].astype('category')
    df['code'] = df['code'].astype('category')
    df['is_international'] = df['is_international'].astype('boolean')
    return df

In [10]:
train = preprocess_train_test(raw_train)
test = preprocess_train_test(raw_test)
education = preprocess_education(raw_education)
employements = preprocess_employements(raw_employements)
worldskills = preprocess_worldskills(raw_worldskills)

In [11]:
full_train = pd.merge(train, education, how='left', on='id')
full_train = pd.merge(full_train, employements, how='left', on='id')
full_train = pd.merge(full_train, worldskills, how='left', on='id')

In [12]:
full_test = pd.merge(test, education, how='left', on='id')
full_test = pd.merge(full_test, employements, how='left', on='id')
full_test = pd.merge(full_test, worldskills, how='left', on='id')

In [24]:
X_test = full_test.drop(['id'], axis=1)

In [23]:
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

In [27]:
cv_metrics = []
test_preds = []
for train_indexes, val_indexes in skf.split(full_train, full_train['publish_year']):
    X_train = full_train.loc[train_indexes]
    y_train = X_train['salary']
    X_train = X_train.drop(['id', 'salary'], axis=1)

    X_val = full_train.loc[val_indexes]
    y_val = X_val['salary']
    X_val = X_val.drop(['id', 'salary'], axis=1)

    model = CatBoostRegressor(random_seed=SEED, task_type='GPU')
    model.fit(
        X_train,
        y_train,
        eval_set=(X_val, y_val),
        use_best_model=True,
        cat_features=cat_columns,
    )
    test_pred = model.predict(X_test)
    test_preds.append(exp(test_pred)) 

CatBoostError: Invalid type for cat_feature[non-default value idx=4686,feature_idx=0]=nan : cat_features must be integer or string, real number values and NaN values should be converted to string.