In [90]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, roc_auc_score
from tqdm import tqdm
import json

pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)
pd.options.mode.chained_assignment = None

In [None]:
from datetime import datetime

def to_datetime(date):
    """
    Converts a numpy datetime64 object to a python datetime object 
    Input:
      date - a np.datetime64 object
    Output:
      DATE - a python datetime object
    """
    timestamp = ((date - np.datetime64('1970-01-01T00:00:00')) / np.timedelta64(1, 's'))
    return datetime.utcfromtimestamp(timestamp)

In [3]:
# Считывание данных

sot = pd.read_csv('sotrudniki.csv', sep = ';')
sot['date'] = pd.to_datetime(sot['date'], format='%Y-%m-%d')

train_target_df = sot[['hash_tab_num', 'date', 'sick']]
train_target_df.head()

Unnamed: 0,hash_tab_num,date,sick
0,0,2015-04-01,0
1,0,2015-05-01,0
2,0,2015-06-01,0
3,0,2015-07-01,0
4,0,2015-08-01,0


---

In [4]:
sot_data = sot[[
    'hash_tab_num','date','category','gender','razryad_fact','work_experience_company',
    'name_fact_lvl5','education','home_to_work_distance'
]]

sot_data['gender'] = sot_data['gender'].map(lambda x: 1 if x == 'мужской' else 0)

In [5]:
# Создание вспомогательно датасета с информацией о количестве сотрудников в подразделении
# по фактическому месту работы

division_count = sot_data[['hash_tab_num','date','name_fact_lvl5']].\
groupby(['name_fact_lvl5','date']).agg('count').reset_index()

division_count.columns = ['name_fact_lvl5', 'date', 'personel_num']

sot_data = pd.merge(sot_data, division_count, how = 'left', on = ['date','name_fact_lvl5'])

In [6]:
# Создание dummy переменных

sot_data.education = sot_data['education']\
.map(lambda x: 'Высшее' if x in ['Высшее образование','Высшее-бакалавриат','Высшее-специалитет'] else(\
'Среднее_профессинальное' if x in ['Ср.профессиональное','Нач.профессиональное'] else 'Начальное_среднее'))

sot_data = pd.get_dummies(sot_data, columns = ['category','education','razryad_fact']).drop('name_fact_lvl5', axis = 1)
sot_data['orig_date'] = sot_data['date'].copy()

In [8]:
sot_data.head()

Unnamed: 0,hash_tab_num,date,gender,work_experience_company,home_to_work_distance,personel_num,category_Рабочие,category_Руководители,category_Служащие,category_Специалисты,education_Высшее,education_Начальное_среднее,education_Среднее_профессинальное,razryad_fact_0,razryad_fact_1,razryad_fact_2,razryad_fact_3,razryad_fact_4,razryad_fact_5,razryad_fact_6,orig_date
0,0,2015-04-01,1,9.0,,,1,0,0,0,0,1,0,0,0,0,1,0,0,0,2015-04-01
1,0,2015-05-01,1,9.0,,,1,0,0,0,0,1,0,0,0,0,1,0,0,0,2015-05-01
2,0,2015-06-01,1,9.0,,,1,0,0,0,0,1,0,0,0,0,1,0,0,0,2015-06-01
3,0,2015-07-01,1,9.0,,,1,0,0,0,0,1,0,0,0,0,1,0,0,0,2015-07-01
4,0,2015-08-01,1,9.0,,,1,0,0,0,0,1,0,0,0,0,1,0,0,0,2015-08-01


In [9]:
submission_extra = sot_data[sot_data['orig_date'] == pd.to_datetime('2019-08-01')]
submission_extra['target'] = 0

---

In [10]:
with open('transformed_data/date_of_birth.json', 'r') as f:
    date_of_birth_dict = json.load(f)
    date_of_birth_dict = {int(k): int(v) for k, v in date_of_birth_dict.items()}


def calc_age(hash_tab_num, calc_date, date_of_birth_dict):
    cur_date = int(calc_date)
    birth_date = date_of_birth_dict[hash_tab_num]
    age = cur_date - birth_date
    return age


with open('transformed_data/relatives_info.json', 'r') as f:
    relatives_dict = json.load(f)
    relatives_dict = {int(k): v for k, v in relatives_dict.items()}


def calc_relatives_bins(hash_tab_num, calc_date, relatives_dict):
    '''
    bins:
        0: 0 - 3: младенец
        1: 4 - 7: ребенок
        2: 8 - 18: школьник
        3: 19 - 35: молодежь :)
        4: 36 - 55(F), 60(M): предпенсионный возраст
        5: 55(F), 60(M) - +++: пенсионер
        6: кол-во родственников мужского рода
        7: кол-во родственников женского рода
    '''
    
    bins = [0] * 8
    if hash_tab_num not in relatives_dict:
        return bins
    
    cur_date = int(calc_date)
    for (sex, birth_date) in relatives_dict[hash_tab_num]:
        if sex == 'M':
            bins[6] += 1
        elif sex == 'F':
            bins[7] += 1
            
        if birth_date < 0:
            continue
            
        age = cur_date - birth_date
        if age < 0:
            continue
        elif age <= 3:
            bins[0] += 1
        elif age <= 7:
            bins[1] += 1
        elif age <= 18:
            bins[2] += 1
        elif age <= 35:
            bins[3] += 1
        else:
            if (sex == 'M' and age >= 60) or (sex == 'F' and age >= 55):
                bins[5] += 1
            else:
                bins[4] += 1
    return bins

In [83]:
def target_date_features(df):
    df['year'] = df['date'].dt.year
    df['age'] = df.apply(lambda x: calc_age(x['hash_tab_num'], x['year'], date_of_birth_dict), axis=1)
    df['is_pensioner'] = (((df['age'] >= 60) & (df['gender'] == 1)) | ((df['age'] >= 55) & (df['gender'] == 0))).astype(int)
    df['relatives'] = df.apply(lambda x: calc_relatives_bins(x['hash_tab_num'], x['year'], relatives_dict), axis=1)
    for i in range(8):
        df[f'relatives_{i}'] = df['relatives'].apply(lambda x: x[i])
    df = df.drop(columns=['year', 'relatives'])
    
    return df

## 1 Month model

In [85]:
months = 1
check_shift = 1
prediction_dates = sorted(sot_data['orig_date'].unique())[:-(months + check_shift)]

In [92]:
for idx, prediction_date in enumerate(prediction_dates):
    
    df_train = sot_data[sot_data['orig_date'] <= prediction_date].copy()
    df_train['date'] = df_train['orig_date'] + pd.DateOffset(months=months)
    df_train = pd.merge(df_train, train_target_df, on=['hash_tab_num', 'date'])
    
    test_date = to_datetime(prediction_date) + relativedelta(months=check_shift)
    df_test = sot_data[sot_data['orig_date'] == test_date].copy()
    df_test['date'] = df_test['orig_date'] + pd.DateOffset(months=months)
    df_test = pd.merge(df_test, train_target_df, on=['hash_tab_num', 'date'])

    df_train = target_date_features(df_train)
    df_test = target_date_features(df_test)
    
    X_train = df_train.drop(columns=['hash_tab_num', 'date', 'orig_date', 'sick']).fillna(-100)
    X_test = df_test.drop(columns=['hash_tab_num', 'date', 'orig_date', 'sick']).fillna(-100)
    
    y_train = df_train['sick']
    y_test = df_test['sick']

    model = RandomForestClassifier()
    model.fit(X_train, y_train)
    preds = model.predict_proba(X_test)[:, 1]
    score = roc_auc_score(y_test, preds)
    
    print(f'{idx + 1}/{len(prediction_dates)}', score)

1/54 0.6511155302039179
2/54 0.6545483767277404
3/54 0.5645284300878453
4/54 0.6877272727272727
5/54 0.6792152168129822
6/54 0.6573453557891447
7/54 0.6981771590206681
8/54 0.6774611881212693
9/54 0.6663067552602436
10/54 0.6923417590872412
11/54 0.6317884228475865
12/54 0.6975479744136461
13/54 0.6481928736341487
14/54 0.668122157088859
15/54 0.6855188712637293
16/54 0.6455491996587887
17/54 0.692638663262938
18/54 0.6808885596264367
19/54 0.6530021836224355
20/54 0.6479178716020821
21/54 0.6611764287365931
22/54 0.6782167895223282
23/54 0.6613863636363637
24/54 0.7037440042052697
25/54 0.6638358250276855
26/54 0.6925992341944315
27/54 0.6506002250016545
28/54 0.7026553220277241
29/54 0.6702271584624526
30/54 0.667213256801962
31/54 0.6490829943623777
32/54 0.6618747623590376
33/54 0.6840862412686617
34/54 0.6590831180744997
35/54 0.65755245814711
36/54 0.7240151363657341
37/54 0.7045324285539281
38/54 0.6770011534996505
39/54 0.6565502022643208
40/54 0.6796352636471648
41/54 0.659354

## 2 Months model

In [93]:
months = 2
check_shift = 1
prediction_dates = sorted(sot_data['orig_date'].unique())[:-(months + check_shift)]

In [94]:
for idx, prediction_date in enumerate(prediction_dates):
    
    df_train = sot_data[sot_data['orig_date'] <= prediction_date].copy()
    df_train['date'] = df_train['orig_date'] + pd.DateOffset(months=months)
    df_train = pd.merge(df_train, train_target_df, on=['hash_tab_num', 'date'])
    
    test_date = to_datetime(prediction_date) + relativedelta(months=check_shift)
    df_test = sot_data[sot_data['orig_date'] == test_date].copy()
    df_test['date'] = df_test['orig_date'] + pd.DateOffset(months=months)
    df_test = pd.merge(df_test, train_target_df, on=['hash_tab_num', 'date'])
    
    
    df_train = target_date_features(df_train)
    df_test = target_date_features(df_test)
    
    X_train = df_train.drop(columns=['hash_tab_num', 'date', 'orig_date', 'sick']).fillna(-100)
    X_test = df_test.drop(columns=['hash_tab_num', 'date', 'orig_date', 'sick']).fillna(-100)
    
    y_train = df_train['sick']
    y_test = df_test['sick']

    model = RandomForestClassifier()
    model.fit(X_train, y_train)
    preds = model.predict_proba(X_test)[:, 1]
    score = roc_auc_score(y_test, preds)
    
    print(f'{idx + 1}/{len(prediction_dates)}', score)


1/53 0.619779524713111
2/53 0.6050254486077091
3/53 0.6221172370806891
4/53 0.6985824558676873
5/53 0.6386934326710817
6/53 0.6854657687991021
7/53 0.6759918463036864
8/53 0.6839781271360217
9/53 0.6867372940869599
10/53 0.6439135184865735
11/53 0.6963285606631499
12/53 0.6422720563390302
13/53 0.6884239360489728
14/53 0.7042754813548336
15/53 0.6433444722918408
16/53 0.6844676533664157
17/53 0.7032806339241281
18/53 0.6399026148010313
19/53 0.6444176681358403
20/53 0.6559602597923512
21/53 0.6909033322178133
22/53 0.6509112313448701
23/53 0.6853677224439936
24/53 0.6644593161560287
25/53 0.6945996496046215
26/53 0.6580794854855014
27/53 0.7156558638955209
28/53 0.6825446651674163
29/53 0.6662282780945673
30/53 0.6601027063862928
31/53 0.6718166146391464
32/53 0.6819306419821884
33/53 0.6600112177489716
34/53 0.6624408738607404
35/53 0.7060436909174653
36/53 0.7059539297643432
37/53 0.6689178256107275
38/53 0.659558178752108
39/53 0.6952584315602626
40/53 0.6683592353023495
41/53 0.696