In [1]:
import random
import numpy as np
import pandas as pd
# from catboost import CatBoostRegressor
from sklearn.model_selection import StratifiedKFold
from pathlib import Path

In [2]:
# Seed shared by the `random`/NumPy global generators, the CV splitter and the model.
SEED = 14300631
# Number of cross-validation folds.
N_FOLDS = 5

In [3]:
# Seed the stdlib and NumPy global generators so repeated runs draw the same random numbers.
random.seed(SEED)
np.random.seed(SEED)

In [4]:
# Load the raw tables; every file is a ';'-separated CSV in the same directory.
raw_data_dir = Path('../data/raw')
date_columns = ['creation_date', 'modification_date', 'publish_date']

# train/test share the same three timestamp columns, parsed at read time.
raw_train, raw_test = (
    pd.read_csv(raw_data_dir / file_name, sep=';', parse_dates=date_columns)
    for file_name in ('train.csv', 'test.csv')
)
# The auxiliary tables are read without any date parsing.
raw_education, raw_employements, raw_worldskills = (
    pd.read_csv(raw_data_dir / file_name, sep=';')
    for file_name in ('education.csv', 'employements.csv', 'worldskills.csv')
)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [5]:
def filter_experience(x):
    """Return ``x`` unchanged, or NaN when ``x`` is NaN or greater than 50."""
    is_valid = not (np.isnan(x) or x > 50)
    return x if is_valid else np.nan

def filter_age(x):
    """Return ``x`` unchanged when it lies in [14, 83]; return NaN otherwise (including for NaN)."""
    if not np.isnan(x) and 14 <= x <= 83:
        return x
    return np.nan

In [6]:
def preprocess_train_test(df):
    """Build the model-ready feature frame from a raw train or test table.

    Steps, in order:
      * add ``publish_year`` taken from ``publish_date``;
      * add one 0/1 flag per driving-licence letter (``drive_licences_A`` .. ``_E``) and
        per known schedule phrase (``schedule_vahta`` .. ``schedule_smena``), using a
        plain substring match; missing values yield 0 for every flag;
      * replace out-of-range ``experience`` / ``age`` values with NaN via
        ``filter_experience`` / ``filter_age``;
      * drop the raw text/date columns that were only needed to derive features;
      * apply ``log(1 + x)`` to ``salary`` (only when the column exists) and to
        ``salary_desired``;
      * cast the categorical columns to ``category`` and the readiness flags to the
        nullable ``boolean`` dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table with the columns referenced above. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the engineered features.
    """
    # Work on a copy so the caller's raw frame does not silently gain columns.
    df = df.copy()

    df['publish_year'] = df['publish_date'].dt.year

    # Missing values become '' so the substring tests below simply evaluate to False.
    licences = df['drive_licences'].fillna('').astype(str)
    for licence_type in ['A', 'B', 'C', 'D', 'E']:
        df[f'drive_licences_{licence_type}'] = (
            licences.str.contains(licence_type, regex=False).astype('int64')
        )

    all_schedules = [
        ('vahta', 'Вахтовый метод'),
        ('gibkiy', 'Гибкий график'),
        ('nenorm', 'Ненормированный рабочий день'),
        ('nepoln', 'Неполный рабочий день'),
        ('poln', 'Полный рабочий день'),
        ('smena', 'Сменный график'),
    ]
    # Same NaN guard as for the licences: a missing schedule must not raise.
    schedules = df['schedule'].fillna('').astype(str)
    for schedule_label, schedule_type in all_schedules:
        df[f'schedule_{schedule_label}'] = (
            schedules.str.contains(schedule_type, regex=False).astype('int64')
        )

    df['experience'] = df['experience'].apply(filter_experience)
    df['age'] = df['age'].apply(filter_age)

    df = df.drop(columns=[
        'locality', 'position', 'locality_name', 'drive_licences',
        'schedule', 'is_worldskills_participant', 'has_qualifications',
        'creation_date', 'modification_date', 'publish_date',
    ])

    # The target exists only in the training table. Rows with salary == 0 are kept
    # (log1p maps them to 0).
    if 'salary' in df.columns:
        df['salary'] = np.log1p(df['salary'])

    df['salary_desired'] = np.log1p(df['salary_desired'])

    category_columns = [
        'region', 'education_type', 'industry',
        'citizenship', 'employement_type', 'gender',
    ]
    boolean_columns = ['relocation_ready', 'travel_ready', 'retraining_ready']
    dtype_map = {column: 'category' for column in category_columns}
    dtype_map.update({column: 'boolean' for column in boolean_columns})
    return df.astype(dtype_map)

In [7]:
def preprocess_education(df):
    """Prepare the education table for merging.

    ``graduation_year`` is cast to ``category``; ``institution`` is lower-cased,
    stripped of double quotes and cast to ``category``; ``description`` is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw education table. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the cleaned columns.
    """
    # regex=False: the quote is a literal character, independent of the pandas default.
    institution = df['institution'].str.lower().str.replace('"', '', regex=False)
    return df.drop(columns='description').assign(
        graduation_year=df['graduation_year'].astype('category'),
        institution=institution.astype('category'),
    )

In [8]:
def preprocess_employements(df):
    """Prepare the employment-history table for merging.

    ``employer`` and ``position`` are lower-cased, stripped of double quotes and cast
    to ``category``. ``work_duration`` is the number of days between ``start_date``
    and ``finish_date``; dates that cannot be parsed become NaT, which makes the
    duration NaN. The free-text and raw date columns are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw employment table. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the cleaned columns plus ``work_duration``.
    """
    def _normalise(text_column):
        # regex=False: the quote is a literal character, independent of the pandas default.
        cleaned = text_column.str.lower().str.replace('"', '', regex=False)
        return cleaned.astype('category')

    started = pd.to_datetime(df['start_date'], errors='coerce')
    finished = pd.to_datetime(df['finish_date'], errors='coerce')

    trimmed = df.drop(
        columns=['achievements', 'responsibilities', 'start_date', 'finish_date']
    )
    return trimmed.assign(
        employer=_normalise(df['employer']),
        position=_normalise(df['position']),
        work_duration=(finished - started).dt.days,
    )

In [9]:
def preprocess_worldskills(df):
    """Cast the worldskills table to compact dtypes.

    ``status``, ``int_name``, ``ru_name`` and ``code`` become ``category``;
    ``is_international`` becomes the nullable ``boolean`` dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw worldskills table. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame (not the input object) with the converted columns.
    """
    # A single astype call returns a fresh frame instead of rewriting the caller's columns.
    return df.astype({
        'status': 'category',
        'int_name': 'category',
        'ru_name': 'category',
        'code': 'category',
        'is_international': 'boolean',
    })

In [10]:
# Run every raw table through its preprocessing function.
# train and test share one function; only train carries the `salary` target column.
train = preprocess_train_test(raw_train)
test = preprocess_train_test(raw_test)
education = preprocess_education(raw_education)
employements = preprocess_employements(raw_employements)
worldskills = preprocess_worldskills(raw_worldskills)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [11]:
# Left-join the auxiliary tables onto the training rows by `id`, one table at a time.
# NOTE(review): a left join repeats a training row once per matching auxiliary row —
# confirm whether each table holds at most one row per id.
full_train = train
for aux_table in (education, employements, worldskills):
    full_train = pd.merge(full_train, aux_table, how='left', on='id')

In [12]:
# Left-join the auxiliary tables onto the test rows by `id`, one table at a time.
# NOTE(review): a left join repeats a test row once per matching auxiliary row, which
# would put the same id into the submission several times — confirm the tables are unique per id.
full_test = test
for aux_table in (education, employements, worldskills):
    full_test = pd.merge(full_test, aux_table, how='left', on='id')

In [13]:
# Remove the columns that came from the worldskills table from both merged frames.
new_drop_columns = ['status', 'code', 'is_international', 'int_name', 'ru_name']
full_train, full_test = (
    frame.drop(columns=new_drop_columns) for frame in (full_train, full_test)
)

In [14]:
# Test feature matrix: everything except the row identifier.
X_test = full_test.drop(columns=['id'])

In [15]:
# Shuffled, seeded stratified splitter; the stratification labels are supplied at split time.
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

In [16]:
# Peek at the first three rows of the test feature matrix.
X_test.head(3)

Unnamed: 0,region,industry,education_type,citizenship,employement_type,age,gender,experience,salary_desired,relocation_ready,...,schedule_gibkiy,schedule_nenorm,schedule_nepoln,schedule_poln,schedule_smena,graduation_year,institution,position,employer,work_duration
0,Алтайский край,"Строительство, ремонт, стройматериалы, недвижи...",Высшее,Российская Федерация,Полная занятость,27.0,Женский,0.0,9.238831,False,...,0,0,0,1,0,2014.0,алтайский государственный технический универси...,стажер,комитет по управлению муниципальной собственно...,30.0
1,Свердловская область,"Строительство, ремонт, стройматериалы, недвижи...",,Российская Федерация,Полная занятость,,,0.0,10.944647,,...,0,0,0,1,0,,,,,
2,Ульяновская область,"Транспорт, автобизнес, логистика, склад, ВЭД",Высшее,Российская Федерация,Полная занятость,51.0,Мужской,17.0,10.819798,False,...,0,0,0,1,0,2003.0,сибирский государственный университет путей со...,технический инспектор труда,дорпрофжел на красноярской железной железной д...,6270.0


In [17]:
from category_encoders.cat_boost import CatBoostEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [18]:
# Columns handed to the categorical encoder: everything stored as `category` or nullable `boolean`.
cat_columns = full_test.select_dtypes(include=['category', 'boolean']).columns

In [19]:
# Encoder restricted to the categorical/boolean columns; it is re-fitted on the
# training part of every fold in the cross-validation loop.
cat_encoder = CatBoostEncoder(
    cols=cat_columns,
)

In [20]:
# full_test.info()

In [21]:
from tqdm import tqdm

In [22]:
# Cross-validation stratified by publish year. Per fold: encode the categorical columns
# on the training part, impute missing values with training-part means, fit a random
# forest on the log-transformed salary, score the validation part and predict the test set.
# NOTE(review): if the merged frame holds several rows per id, rows of one id can land in
# both the training and the validation part of a fold — confirm whether that is acceptable.
cv_metrics = []  # validation MSE on the log-salary scale, one entry per finished fold
test_preds = []  # test predictions on the original salary scale, one array per finished fold

fold_splits = skf.split(full_train, full_train['publish_year'])
for fold, (train_indexes, val_indexes) in enumerate(tqdm(fold_splits, total=skf.get_n_splits())):
    print(fold)

    # skf.split yields positional indices, so rows are selected with .iloc; label-based
    # .loc is only equivalent while the frame carries a default RangeIndex.
    X_train = full_train.iloc[train_indexes]
    y_train = X_train['salary']
    X_train = X_train.drop(['id', 'salary'], axis=1)
    X_train = cat_encoder.fit_transform(X_train, y_train)

    X_val = full_train.iloc[val_indexes]
    y_val = X_val['salary']
    X_val = X_val.drop(['id', 'salary'], axis=1)
    X_val = cat_encoder.transform(X_val)

    X_test_temp = cat_encoder.transform(X_test)

    # Impute with statistics of the training part only, applied to all three frames.
    for col in X_train.columns:
        mean_value = X_train[col].mean()
        X_train[col] = X_train[col].fillna(mean_value)
        X_val[col] = X_val[col].fillna(mean_value)
        X_test_temp[col] = X_test_temp[col].fillna(mean_value)

    model = RandomForestRegressor(random_state=SEED, n_jobs=-1)
    model.fit(
        X_train,
        y_train,
    )

    val_pred = model.predict(X_val)
    val_score = mean_squared_error(y_val, val_pred)
    cv_metrics.append(val_score)
    print(val_score)

    test_pred = model.predict(X_test_temp)
    # Undo the log(1 + x) target transform; expm1 is the exact inverse of log1p.
    test_preds.append(np.expm1(test_pred))

  elif pd.api.types.is_categorical(cols):


0
1.064824053746449


  elif pd.api.types.is_categorical(cols):


1
1.062840005175606


  elif pd.api.types.is_categorical(cols):


2
1.0315578529535596


3it [17:31, 345.09s/it]

3


  elif pd.api.types.is_categorical(cols):
3it [22:20, 446.93s/it]


KeyboardInterrupt: 

In [23]:
# Local validation: mean of the per-fold MSE scores (computed on the log-transformed salary).
np.mean(cv_metrics)

1.0530739706252048

In [24]:
# Number of folds that produced test predictions.
len(test_preds)

3

In [25]:
# Shape and values of the per-fold test predictions stacked into one array (one row per fold).
np.array(test_preds).shape, np.array(test_preds)

((3, 131259),
 array([[15115.76679878, 68332.65545287, 33311.09457315, ...,
         42895.71553614, 28364.18904425,  8848.82947777],
        [17658.66638354, 67960.93482596, 36876.35111421, ...,
         41052.33888781, 28914.65730242, 11492.47677591],
        [17507.53474632, 63753.27416453, 35597.69168334, ...,
         38776.77065922, 29636.38775588, 14197.76558563]]))

In [26]:
# Blend the folds: element-wise mean over however many per-fold prediction arrays exist.
test_pred = np.mean(test_preds, axis=0)
test_pred

array([16760.65597621, 66682.28814779, 35261.7124569 , ...,
       40908.27502772, 28971.74470085, 11513.02394644])

In [27]:
# Build the submission frame. `.assign` returns a new DataFrame, so the salary column is
# not written into a slice of `full_test` (which triggers pandas' SettingWithCopyWarning).
new_test = full_test[['id']].assign(salary=test_pred)
new_test.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_test['salary'] = test_pred


Unnamed: 0,id,salary
0,2,16760.655976
1,6,66682.288148
2,7,35261.712457
3,11,24059.456716
4,16,37474.177125


In [28]:
# Write the submission frame to CSV without the index column.
new_test.to_csv('rf_5_fold_with_0_target.csv', index=False)