In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve
from tqdm import tqdm
import json

# Display settings: allow up to 1000 columns / rows when a frame is rendered.
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 1000
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.set_option('mode.chained_assignment', None)

In [2]:
# Load the data

# Employee table; `date` arrives as a YYYY-MM-DD string and is parsed to datetime.
sot = pd.read_csv('sotrudniki.csv', sep=';')
sot = sot.assign(date=pd.to_datetime(sot['date'], format='%Y-%m-%d'))

# The two tables below are not loaded in this notebook:
# rod = pd.read_csv('rodstvenniki.csv', sep = ';')
# ogrv = pd.read_csv('OGRV.csv', sep = ';')

In [3]:
# Keys (employee id, date) plus the `sick` label; merged onto the features later.
train_target_df = sot.loc[:, ['hash_tab_num', 'date', 'sick']]
train_target_df.head(n=5)

Unnamed: 0,hash_tab_num,date,sick
0,0,2015-04-01,0
1,0,2015-05-01,0
2,0,2015-06-01,0
3,0,2015-07-01,0
4,0,2015-08-01,0


---

In [4]:
# Employee attributes used as model features.
# `.copy()` makes `sot_data` an independent frame, so the column assignments
# below are unambiguous and do not rely on the chained-assignment warning
# being silenced.
sot_data = sot[[
    'hash_tab_num', 'date', 'category', 'gender', 'razryad_fact', 'work_experience_company',
    'name_fact_lvl5', 'education', 'home_to_work_distance'
]].copy()

# Encode gender: 1 for 'мужской' (male), 0 for every other value, including missing ones.
sot_data['gender'] = (sot_data['gender'] == 'мужской').astype('int64')

In [5]:
# Helper table: number of employee rows per unit (actual place of work,
# `name_fact_lvl5`) and date, joined back onto every row as `personel_num`.

division_count = (
    sot_data
    .groupby(['name_fact_lvl5', 'date'])['hash_tab_num']
    .count()
    .reset_index(name='personel_num')
)

# Left join keeps every employee row; rows without a matching unit/date get NaN.
sot_data = sot_data.merge(division_count, how='left', on=['date', 'name_fact_lvl5'])

In [6]:
# Build dummy variables

# Collapse the raw education labels into three groups; any label not listed
# here (missing values included) falls into 'Начальное_среднее'.
education_groups = {
    'Высшее образование': 'Высшее',
    'Высшее-бакалавриат': 'Высшее',
    'Высшее-специалитет': 'Высшее',
    'Ср.профессиональное': 'Среднее_профессинальное',
    'Нач.профессиональное': 'Среднее_профессинальное',
}
sot_data['education'] = sot_data['education'].map(education_groups).fillna('Начальное_среднее')

# One-hot encode the categorical columns; the unit name is dropped once it is no longer needed.
sot_data = pd.get_dummies(sot_data, columns=['category', 'education', 'razryad_fact'])
sot_data = sot_data.drop(columns='name_fact_lvl5')

In [7]:
# Keep the observation date under its own name: the training loop derives the
# shifted target dates from `orig_date`.
sot_data = sot_data.assign(orig_date=sot_data['date'].copy())

In [8]:
# Preview the first rows of the assembled feature table.
sot_data.head(n=5)

Unnamed: 0,hash_tab_num,date,gender,work_experience_company,home_to_work_distance,personel_num,category_Рабочие,category_Руководители,category_Служащие,category_Специалисты,education_Высшее,education_Начальное_среднее,education_Среднее_профессинальное,razryad_fact_0,razryad_fact_1,razryad_fact_2,razryad_fact_3,razryad_fact_4,razryad_fact_5,razryad_fact_6,orig_date
0,0,2015-04-01,1,9.0,,,1,0,0,0,0,1,0,0,0,0,1,0,0,0,2015-04-01
1,0,2015-05-01,1,9.0,,,1,0,0,0,0,1,0,0,0,0,1,0,0,0,2015-05-01
2,0,2015-06-01,1,9.0,,,1,0,0,0,0,1,0,0,0,0,1,0,0,0,2015-06-01
3,0,2015-07-01,1,9.0,,,1,0,0,0,0,1,0,0,0,0,1,0,0,0,2015-07-01
4,0,2015-08-01,1,9.0,,,1,0,0,0,0,1,0,0,0,0,1,0,0,0,2015-08-01


In [9]:
# Feature rows observed on 2019-08-01: the snapshot the submission predictions
# are built from. `.copy()` detaches it from `sot_data`, because columns are
# added to and overwritten in `submission_extra` later on.
submission_extra = sot_data[sot_data['orig_date'] == pd.to_datetime('2019-08-01')].copy()
# Placeholder column; the real predictions are filled in by the training loop.
submission_extra['target'] = 0

---

In [10]:
# Employee id -> birth value lookup. JSON object keys are strings, so keys and
# values are both cast to int while loading.
with open('transformed_data/date_of_birth.json', 'r') as f:
    date_of_birth_dict = {int(k): int(v) for k, v in json.load(f).items()}


def calc_age(hash_tab_num, calc_date, date_of_birth_dict):
    """Return `calc_date` minus the birth value stored for an employee.

    Args:
        hash_tab_num: employee id, used as the key into `date_of_birth_dict`.
        calc_date: reference date; converted with `int()` before subtracting.
            Callers in this notebook pass a calendar year, so the birth values
            are presumably years too (verify against the JSON).
        date_of_birth_dict: mapping of employee id to birth value.

    Returns:
        The integer difference `int(calc_date) - date_of_birth_dict[hash_tab_num]`.

    Raises:
        KeyError: if `hash_tab_num` is not a key of `date_of_birth_dict`.
    """
    return int(calc_date) - date_of_birth_dict[hash_tab_num]


# Employee id -> list of relatives. JSON object keys are strings, so the ids
# are cast to int; the values are kept exactly as stored.
with open('transformed_data/relatives_info.json', 'r') as f:
    relatives_dict = {int(k): v for k, v in json.load(f).items()}


def calc_relatives_bins(hash_tab_num, calc_date, relatives_dict):
    '''
    Count an employee's relatives by age group and by sex.

    Returns a list of 8 counters:
        0: age 0 - 3: infant
        1: age 4 - 7: child
        2: age 8 - 18: school age
        3: age 19 - 35: young adult
        4: age 36 and up, below retirement age (55 for 'F', 60 for 'M')
        5: retirement age and above (55+ for 'F', 60+ for 'M')
        6: number of male relatives ('M')
        7: number of female relatives ('F')

    `relatives_dict[hash_tab_num]` is iterated as (sex, birth_date) pairs and
    age is `int(calc_date) - birth_date`. Relatives with a negative birth_date
    or a negative age are counted by sex only. An employee missing from
    `relatives_dict` yields all zeros.
    '''
    counts = [0 for _ in range(8)]
    if hash_tab_num not in relatives_dict:
        return counts

    reference = int(calc_date)
    for sex, born in relatives_dict[hash_tab_num]:
        is_male = sex == 'M'
        is_female = sex == 'F'

        # The per-sex totals include relatives whose birth date is unusable.
        if is_male:
            counts[6] += 1
        elif is_female:
            counts[7] += 1

        # A negative birth value is skipped for the age bins.
        if born < 0:
            continue

        age = reference - born
        if age < 0:
            continue

        # Bins 0..3 are closed on the right: the first upper bound that fits wins.
        for slot, upper in enumerate((3, 7, 18, 35)):
            if age <= upper:
                counts[slot] += 1
                break
        else:
            # Older than 35: split at the sex-specific retirement age. A relative
            # whose sex is neither 'M' nor 'F' always lands in bin 4.
            retired = (is_male and age >= 60) or (is_female and age >= 55)
            counts[5 if retired else 4] += 1

    return counts

In [17]:
# Train one model per forecast horizon (1..12 months ahead) and use it to
# predict the matching month for the submission snapshot.

RANDOM_STATE = 42  # shared seed for the split and the forest, so reruns are reproducible
KEY_COLUMNS = ['hash_tab_num', 'date', 'orig_date']


def _add_horizon_features(frame, months):
    """Return a copy of `frame` with the date-dependent features for one horizon.

    `date` is set to `orig_date` shifted `months` months forward, and `age`,
    `is_pensioner` and `relatives_0..relatives_7` are computed for the calendar
    year of that shifted date. The input frame is left unchanged.
    """
    out = frame.copy()
    out['date'] = out['orig_date'] + pd.DateOffset(months=months)
    years = out['date'].dt.year

    out['age'] = [
        calc_age(tab_num, year, date_of_birth_dict)
        for tab_num, year in zip(out['hash_tab_num'], years)
    ]
    # Retirement threshold: 60 when gender == 1, 55 when gender == 0.
    out['is_pensioner'] = (
        ((out['age'] >= 60) & (out['gender'] == 1))
        | ((out['age'] >= 55) & (out['gender'] == 0))
    ).astype(int)

    relatives_bins = [
        calc_relatives_bins(tab_num, year, relatives_dict)
        for tab_num, year in zip(out['hash_tab_num'], years)
    ]
    for i in range(8):
        out[f'relatives_{i}'] = [bins[i] for bins in relatives_bins]
    return out


def _best_f1_threshold(y_true, scores):
    """Return (threshold, f1) for the probability cut-off that maximises F1."""
    precision, recall, thresholds = precision_recall_curve(y_true, scores)
    # The curve ends with an extra (precision=1, recall=0) point that has no
    # threshold; drop it so indices line up with `thresholds`.
    precision, recall = precision[:-1], recall[:-1]
    denom = precision + recall
    # F1 is defined as 0 where precision + recall == 0 (avoids 0/0 -> NaN).
    f1 = np.divide(2 * precision * recall, denom, out=np.zeros_like(denom), where=denom > 0)
    best = int(np.argmax(f1))
    return thresholds[best], f1[best]


results = []

for months in range(1, 13):
    # Features observed at `orig_date` are matched to the label `months` later.
    train_features = _add_horizon_features(sot_data, months)
    merged_data = pd.merge(train_features, train_target_df, on=['hash_tab_num', 'date'])
    X = merged_data.drop(columns=KEY_COLUMNS + ['sick']).fillna(-100)
    y = merged_data['sick']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_STATE, shuffle=True, stratify=y,
    )

    model = RandomForestClassifier(random_state=RANDOM_STATE)
    model.fit(X_train, y_train)

    # NOTE(review): the threshold is tuned on the same hold-out split whose F1
    # is printed, so the printed score is optimistic.
    th, best_f1 = _best_f1_threshold(y_test, model.predict_proba(X_test)[:, 1])
    print(months, th, best_f1)

    submission_features = _add_horizon_features(submission_extra, months)
    # Select by the training column list so feature order always matches the model.
    X_submission_extra = submission_features[X.columns].fillna(-100)
    submission_features['target'] = (
        model.predict_proba(X_submission_extra)[:, 1] >= th
    ).astype(int)
    results.append(submission_features[['hash_tab_num', 'date', 'target']].copy())

1 0.1608888888888889 0.3240793201133145
2 0.19982068146851423 0.33185279187817257
3 0.16399098124098122 0.3262433052792655
4 0.14 0.3312739040894381
5 0.18833333333333332 0.3436213991769548
6 0.18575937950937949 0.33858134416793634
7 0.15642857142857142 0.3382401046777887
8 0.12 0.33285736950658423
9 0.15158644133644128 0.33137322725700447
10 0.13055194805194806 0.33696563285834036
11 0.16333333333333333 0.334793372690916
12 0.19301932789432782 0.3288078065337293


In [18]:
# Stack the per-horizon prediction frames into one table with a fresh index.
result_df = pd.concat(results, axis=0, ignore_index=True)
result_df.head(n=5)

Unnamed: 0,hash_tab_num,date,target
0,0,2019-09-01,1
1,1,2019-09-01,0
2,2,2019-09-01,0
3,3,2019-09-01,0
4,4,2019-09-01,1


---

In [19]:
# (employee id, date) pairs listed in the check file; its own `target` column is discarded.
check_df = pd.read_csv('submission_check.csv', sep=';')
check_df = check_df.drop('target', axis=1)
check_df['date'] = pd.to_datetime(check_df['date'], format='%Y-%m-%d')
check_df.head(n=5)

Unnamed: 0,hash_tab_num,date
0,0,2019-09-01
1,0,2019-10-01
2,0,2019-11-01
3,0,2019-12-01
4,0,2020-01-01


In [20]:
# Keep only the predictions whose (employee id, date) appears in the check file.
# NOTE(review): this is an inner join, so pairs present in `check_df` but absent
# from `result_df` are dropped silently - confirm the row counts match.
result_df_new = (
    result_df
    .merge(check_df, how='inner', on=['hash_tab_num', 'date'])
    .sort_values(by=['hash_tab_num', 'date'])
)
result_df_new.head(n=5)

Unnamed: 0,hash_tab_num,date,target
0,0,2019-09-01,1
1757,0,2019-10-01,1
3509,0,2019-11-01,1
5265,0,2019-12-01,1
7025,0,2020-01-01,1


In [21]:
# Write the submission as a ';'-separated CSV without the index column.
result_df_new.to_csv(path_or_buf='submission_2.csv', sep=';', index=False)