In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, roc_auc_score, f1_score
from tqdm import tqdm
import json

# Let pandas render up to 1000 columns and 1000 rows when displaying frames.
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 1000
# Switch off the pandas chained-assignment (SettingWithCopy) check.
pd.set_option('mode.chained_assignment', None)
# Silence numpy warnings for division by zero and invalid float operations.
# Kept as the last expression: the cell displays the previous settings.
np.seterr(invalid='ignore', divide='ignore')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [2]:
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

def to_datetime(date):
    """
    Converts a numpy datetime64 object to a naive python datetime object.

    The value is converted with plain epoch arithmetic instead of
    ``datetime.utcfromtimestamp``: that function is deprecated since
    Python 3.12 and can raise for dates before 1970 on platforms whose
    C library rejects negative timestamps.

    Input:
      date - a np.datetime64 object
    Output:
      DATE - a python datetime object (no tzinfo)
    """
    epoch = np.datetime64('1970-01-01T00:00:00')
    # Seconds since the epoch as a float; may be negative for earlier dates.
    seconds = float((date - epoch) / np.timedelta64(1, 's'))
    return datetime(1970, 1, 1) + timedelta(seconds=seconds)

In [3]:
# Load the data

# Read the employee table and parse its 'date' column (format YYYY-MM-DD).
sot = pd.read_csv('sotrudniki.csv', sep=';').assign(
    date=lambda raw: pd.to_datetime(raw['date'], format='%Y-%m-%d')
)

# Employee id, date and the 'sick' column (used as the target in run_training).
target_columns = ['hash_tab_num', 'date', 'sick']
train_target_df = sot[target_columns]
train_target_df.head()

Unnamed: 0,hash_tab_num,date,sick
0,0,2015-04-01,0
1,0,2015-05-01,0
2,0,2015-06-01,0
3,0,2015-07-01,0
4,0,2015-08-01,0


---

In [4]:
# Employee attributes used as model inputs.
# .copy() makes sot_data an independent frame, so the column assignment below
# does not depend on the chained-assignment warning being silenced.
sot_data = sot[[
    'hash_tab_num','date','category','gender','razryad_fact','work_experience_company',
    'name_fact_lvl5','education','home_to_work_distance'
]].copy()

# Encode gender: 1 for 'мужской' (male), 0 for every other value (missing included).
sot_data['gender'] = (sot_data['gender'] == 'мужской').astype('int64')

In [5]:
# Helper table with the number of employees in each division
# (actual place of work, name_fact_lvl5) on each date

division_count = (
    sot_data
    .groupby(['name_fact_lvl5', 'date'], as_index=False)['hash_tab_num']
    .count()
    .rename(columns={'hash_tab_num': 'personel_num'})
)

# Attach the head-count to every employee row; rows without a matching
# (date, division) pair get NaN in 'personel_num'.
sot_data = sot_data.merge(division_count, how='left', on=['date', 'name_fact_lvl5'])

In [6]:
# Build dummy variables

# Collapse the raw education values into three levels; every value that is
# not listed here (missing values included) becomes 'Начальное_среднее'.
education_levels = {
    'Высшее образование': 'Высшее',
    'Высшее-бакалавриат': 'Высшее',
    'Высшее-специалитет': 'Высшее',
    'Ср.профессиональное': 'Среднее_профессинальное',
    'Нач.профессиональное': 'Среднее_профессинальное',
}
sot_data['education'] = sot_data['education'].map(
    lambda raw_value: education_levels.get(raw_value, 'Начальное_среднее')
)

sot_data = pd.get_dummies(sot_data, columns=['category', 'education', 'razryad_fact'])
sot_data = sot_data.drop(columns='name_fact_lvl5')
# Keep the observation date: run_training overwrites 'date' with the target date.
sot_data['orig_date'] = sot_data['date'].copy()

In [7]:
# Preview the first rows of the assembled feature table.
sot_data.head()

Unnamed: 0,hash_tab_num,date,gender,work_experience_company,home_to_work_distance,personel_num,category_Рабочие,category_Руководители,category_Служащие,category_Специалисты,education_Высшее,education_Начальное_среднее,education_Среднее_профессинальное,razryad_fact_0,razryad_fact_1,razryad_fact_2,razryad_fact_3,razryad_fact_4,razryad_fact_5,razryad_fact_6,orig_date
0,0,2015-04-01,1,9.0,,,1,0,0,0,0,1,0,0,0,0,1,0,0,0,2015-04-01
1,0,2015-05-01,1,9.0,,,1,0,0,0,0,1,0,0,0,0,1,0,0,0,2015-05-01
2,0,2015-06-01,1,9.0,,,1,0,0,0,0,1,0,0,0,0,1,0,0,0,2015-06-01
3,0,2015-07-01,1,9.0,,,1,0,0,0,0,1,0,0,0,0,1,0,0,0,2015-07-01
4,0,2015-08-01,1,9.0,,,1,0,0,0,0,1,0,0,0,0,1,0,0,0,2015-08-01


In [8]:
# Rows observed on 2019-08-01; the submission cell builds its predictions from them.
# .copy() gives an independent frame, so columns added or overwritten here and
# in later cells are never written into a slice of sot_data.
submission_extra = sot_data[sot_data['orig_date'] == pd.to_datetime('2019-08-01')].copy()
submission_extra['target'] = 0  # placeholder, overwritten by the submission loop

---

In [9]:
# Mapping hash_tab_num -> birth value, both cast to int (JSON object keys are strings).
# NOTE(review): calc_age subtracts this value from a calendar year, so it is
# presumably a birth year - confirm against the file.
with open('transformed_data/date_of_birth.json', 'r') as f:
    date_of_birth_dict = {int(k): int(v) for k, v in json.load(f).items()}


def calc_age(hash_tab_num, calc_date, date_of_birth_dict):
    """
    Age of an employee at ``calc_date``.

    Input:
      hash_tab_num - employee key in date_of_birth_dict
      calc_date - value convertible with int() (callers in this notebook pass a year)
      date_of_birth_dict - mapping from hash_tab_num to an integer birth value
    Output:
      int(calc_date) minus the stored birth value
    Raises:
      KeyError if hash_tab_num is not present in date_of_birth_dict
    """
    return int(calc_date) - date_of_birth_dict[hash_tab_num]


# Mapping hash_tab_num (cast to int, JSON object keys are strings) -> the
# relatives entry stored in the file; calc_relatives_bins iterates each entry
# as (sex, birth_date) pairs.
with open('transformed_data/relatives_info.json', 'r') as f:
    relatives_dict = {int(k): v for k, v in json.load(f).items()}


def calc_relatives_bins(hash_tab_num, calc_date, relatives_dict):
    '''
    Count an employee's relatives by age group and by sex at ``calc_date``.

    Returned list (8 counters):
        0: age 0 - 3: infant
        1: age 4 - 7: child
        2: age 8 - 18: schoolchild
        3: age 19 - 35: young adult
        4: age 36 and older, below 55 (F) / 60 (M): pre-retirement age
           (also anyone older than 35 whose sex is neither 'M' nor 'F')
        5: age 55+ (F) / 60+ (M): pensioner
        6: number of male relatives
        7: number of female relatives

    Relatives with a negative birth_date, or born after calc_date, are counted
    only in the sex counters (6 / 7). An employee missing from relatives_dict
    yields eight zeros.
    '''
    counts = [0] * 8
    if hash_tab_num not in relatives_dict:
        return counts

    year = int(calc_date)
    for sex, born in relatives_dict[hash_tab_num]:
        is_male = sex == 'M'
        is_female = sex == 'F'
        if is_male:
            counts[6] += 1
        elif is_female:
            counts[7] += 1

        # Unknown birth value or not yet born: no age bin.
        if born < 0:
            continue
        age = year - born
        if age < 0:
            continue

        # Bins 0-3 are defined by their inclusive upper age limit.
        for slot, upper_age in enumerate((3, 7, 18, 35)):
            if age <= upper_age:
                counts[slot] += 1
                break
        else:
            retired = (is_male and age >= 60) or (is_female and age >= 55)
            counts[5 if retired else 4] += 1
    return counts

In [10]:
def target_date_features(df):
    """
    Add the features that depend on the target date stored in ``df['date']``.

    Added columns:
      age - calc_age() for the year of ``date``
      is_pensioner - 1 if age >= 60 with gender == 1, or age >= 55 with
                     gender == 0, otherwise 0
      relatives_0 .. relatives_7 - the counters returned by
                     calc_relatives_bins() for the same year

    The input frame is not modified: the columns are added to a copy, so
    calling this repeatedly on the same frame leaves no helper columns behind.
    Reads the module-level ``date_of_birth_dict`` and ``relatives_dict``.

    Input:
      df - DataFrame with 'hash_tab_num', 'date' (datetime) and 'gender' columns
    Output:
      a new DataFrame: the input columns followed by the columns listed above
    """
    df = df.copy()
    years = df['date'].dt.year

    # Plain comprehensions instead of DataFrame.apply(axis=1): same values,
    # and they also work when the frame has no rows.
    df['age'] = [
        calc_age(tab_num, year, date_of_birth_dict)
        for tab_num, year in zip(df['hash_tab_num'], years)
    ]
    df['is_pensioner'] = (
        ((df['age'] >= 60) & (df['gender'] == 1))
        | ((df['age'] >= 55) & (df['gender'] == 0))
    ).astype(int)

    relatives = [
        calc_relatives_bins(tab_num, year, relatives_dict)
        for tab_num, year in zip(df['hash_tab_num'], years)
    ]
    for i in range(8):
        df[f'relatives_{i}'] = [bins[i] for bins in relatives]

    return df

## Models per forecast horizon (1 to 12 months ahead)

In [11]:
def run_training(months, check_shift, random_state=None):
    """
    Train and evaluate RandomForest models that predict ``sick`` ``months``
    months after the observation date.

    For every prediction date (all observation dates in ``sot_data`` except
    the last ``months + check_shift``) a model is fitted on the rows observed
    up to and including that date and evaluated on the rows observed
    ``check_shift`` months later. The target is taken from
    ``train_target_df`` at ``orig_date + months``.

    For each step the function prints (tab separated): step/total, ROC AUC,
    F1 at the mean of the thresholds found on previous steps (0 on the first
    step) and the F1-optimal threshold of the current step.

    Input:
      months - forecast horizon in months
      check_shift - distance in months between the last training date and the test date
      random_state - optional seed passed to RandomForestClassifier; the
                     default None keeps the previous unseeded behaviour
    Output:
      (last_model, roc_auc_scores, thresholds, f1_scores); f1_scores has one
      entry fewer than the other lists because the first step has no
      previous threshold to evaluate
    """
    prediction_dates = sorted(sot_data['orig_date'].unique())[:-(months + check_shift)]
    non_feature_columns = ['hash_tab_num', 'date', 'orig_date', 'sick']

    roc_auc_scores = []
    thresholds = []
    f1_scores = []
    last_model = None

    for idx, prediction_date in enumerate(prediction_dates):

        # Training rows: everything observed up to the prediction date,
        # joined with the target 'months' months later.
        df_train = sot_data[sot_data['orig_date'] <= prediction_date].copy()
        df_train['date'] = df_train['orig_date'] + pd.DateOffset(months=months)
        df_train = pd.merge(df_train, train_target_df, on=['hash_tab_num', 'date'])

        # Test rows: the single observation date 'check_shift' months later.
        test_date = to_datetime(prediction_date) + relativedelta(months=check_shift)
        df_test = sot_data[sot_data['orig_date'] == test_date].copy()
        df_test['date'] = df_test['orig_date'] + pd.DateOffset(months=months)
        df_test = pd.merge(df_test, train_target_df, on=['hash_tab_num', 'date'])

        df_train = target_date_features(df_train)
        df_test = target_date_features(df_test)

        X_train = df_train.drop(columns=non_feature_columns).fillna(-100)
        X_test = df_test.drop(columns=non_feature_columns).fillna(-100)

        y_train = df_train['sick']
        y_test = df_test['sick']

        model = RandomForestClassifier(random_state=random_state)
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_test)[:, 1]
        roc_auc_score_ = roc_auc_score(y_test, preds)
        roc_auc_scores.append(roc_auc_score_)

        if len(thresholds) > 0:
            th = np.mean(thresholds)
            # '>=' matches how precision_recall_curve defines its thresholds
            # and how the submission cell applies the threshold.
            f1_score_ = f1_score(y_test, (preds >= th).astype(int))
            f1_scores.append(f1_score_)
        else:
            f1_score_ = 0

        p, r, thresholds_ = precision_recall_curve(y_test, preds)
        # p and r end with an extra (precision=1, recall=0) point that has no
        # threshold; drop it so that positions line up with thresholds_.
        p, r = p[:-1], r[:-1]
        denom = p + r
        # F1 is defined as 0 where precision + recall == 0, which keeps NaN
        # out of argmax without shifting positions.
        f1_scores_ = np.divide(2 * p * r, denom, out=np.zeros_like(denom), where=denom > 0)
        th = thresholds_[np.argmax(f1_scores_)]
        thresholds.append(th)

        last_model = model
        print(f'{idx + 1}/{len(prediction_dates)}', roc_auc_score_, f1_score_, th, sep='\t')

    return last_model, roc_auc_scores, thresholds, f1_scores

In [12]:
# Horizon of 1 month (target taken 1 month after the observation date).
# m1 is the tuple returned by run_training: (last_model, roc_auc_scores, thresholds, f1_scores).
m1 = run_training(months=1, check_shift=1)

1/54	0.6732342526209291	0	0.6
2/54	0.6601655416264867	0.22500000000000003	0.14400000000000002
3/54	0.5672982923227323	0.1990950226244344	0.29333333333333333
4/54	0.6896228956228956	0.38490566037735846	0.31
5/54	0.6905799414737963	0.35555555555555557	0.3606666666666667
6/54	0.6558164979875738	0.30094043887147337	0.32312445887445873
7/54	0.6885082832219216	0.33994334277620397	0.24
8/54	0.680344415350186	0.28726287262872624	0.11467735042735047
9/54	0.668553539483772	0.2408376963350785	0.09429150016650015
10/54	0.6913321698662012	0.3172413793103448	0.1556067821067821
11/54	0.6251775496865266	0.27440633245382584	0.25135339660339645
12/54	0.6970741530443023	0.45274725274725275	0.31993478743478737
13/54	0.6518918880578223	0.2874251497005988	0.0402920967920968
14/54	0.6841866070856557	0.3063063063063063	0.3184285714285714
15/54	0.6807785927825789	0.1986754966887417	0.128
16/54	0.6379547393246023	0.23809523809523808	0.3106904761904761
17/54	0.6924089115804131	0.23923444976076555	0.3195881504116

In [13]:
# Horizon of 2 months; m2 = (last_model, roc_auc_scores, thresholds, f1_scores).
m2 = run_training(months=2, check_shift=1)

1/53	0.6227613626095599	0	0.55
2/53	0.5755814963433912	0.20408163265306123	0.33
3/53	0.6207077289211242	0.277511961722488	0.19789285714285715
4/53	0.7061040525313751	0.3909774436090226	0.30085714285714277
5/53	0.6534667388351164	0.28923076923076924	0.2051437728937729
6/53	0.6777191294735154	0.35324675324675325	0.15714321789321783
7/53	0.6759552150041803	0.31266846361185985	0.23523412698412688
8/53	0.673898382319435	0.3043478260869565	0.22910064935064928
9/53	0.6794096992315255	0.332541567695962	0.14
10/53	0.6418070649569702	0.26666666666666666	0.09104166666666666
11/53	0.6914649585531273	0.41201716738197425	0.20455971805971804
12/53	0.6329373477263432	0.30894308943089427	0.3703333333333333
13/53	0.6906340562013611	0.3106796116504854	0.14124242424242425
14/53	0.709373987283069	0.2453222453222453	0.1125
15/53	0.6390870338238759	0.23008849557522124	0.262095238095238
16/53	0.687905715357318	0.26130653266331655	0.28529040404040396
17/53	0.700440225089002	0.2890442890442891	0.182800976800976

In [14]:
# Horizon of 3 months; m3 = (last_model, roc_auc_scores, thresholds, f1_scores).
m3 = run_training(months=3, check_shift=1)

1/52	0.6170044360999135	0	0.63
2/52	0.6447285242677531	0.26277372262773724	0.39645238095238095
3/52	0.6538692889970262	0.2679425837320574	0.32333333333333336
4/52	0.6396188340807175	0.29213483146067415	0.24362698412698403
5/52	0.6707163919569789	0.2965116279069767	0.17527777777777778
6/52	0.6705922682236377	0.3227665706051873	0.2549285714285714
7/52	0.648603717060698	0.303951367781155	0.2677380952380951
8/52	0.6933448610349627	0.3118279569892473	0.18906501831501824
9/52	0.6601632023251637	0.27624309392265195	0.20167576704341406
10/52	0.6988710720085773	0.41588785046728977	0.34
11/52	0.619136926438456	0.29067245119305857	0.16089754689754684
12/52	0.6876319079160985	0.3652561247216035	0.51
13/52	0.6990713904797237	0.2702702702702703	0.2826190476190476
14/52	0.659603301031019	0.2458100558659218	0.255
15/52	0.66319573283859	0.2662721893491124	0.2502896825396825
16/52	0.7066927874620182	0.2958904109589041	0.15773484848484848
17/52	0.6553957735770813	0.2525773195876289	0.4829999999999998
18/

In [15]:
# Horizon of 4 months; m4 = (last_model, roc_auc_scores, thresholds, f1_scores).
m4 = run_training(months=4, check_shift=1)

1/51	0.6265580057526366	0	0.23
2/51	0.6925967549343262	0.35344827586206895	0.18666666666666668
3/51	0.6052494503636057	0.2506527415143603	0.1703333333333333
4/51	0.6730915868846904	0.38287153652392947	0.18
5/51	0.6533406068162926	0.30708661417322836	0.35142857142857137
6/51	0.6505389514943656	0.28635346756152125	0.5007380952380951
7/51	0.7029606820363759	0.3682864450127877	0.2602142857142857
8/51	0.6553025895615974	0.2796833773087072	0.19410714285714284
9/51	0.712557721733428	0.4189189189189189	0.17671428571428574
10/51	0.6402288974558994	0.28048780487804875	0.058416666666666665
11/51	0.6582551509224419	0.31349206349206343	0.22
12/51	0.7075232069459196	0.30939226519337015	0.26933333333333337
13/51	0.6468405056998565	0.26704545454545453	0.22
14/51	0.7091525694222789	0.2653061224489796	0.10677489177489177
15/51	0.711107667407063	0.330316742081448	0.18266666666666664
16/51	0.6765615917728593	0.2756892230576441	0.22833333333333333
17/51	0.651264752239265	0.24154589371980675	0.1162222222222

In [16]:
# Horizon of 5 months; m5 = (last_model, roc_auc_scores, thresholds, f1_scores).
m5 = run_training(months=5, check_shift=1)

1/50	0.663611111111111	0	0.33
2/50	0.598139255702281	0.28444444444444444	0.1375
3/50	0.6428775927548934	0.3464566929133858	0.2576071428571428
4/50	0.6491375999231821	0.3804034582132565	0.24882142857142844
5/50	0.635203846799478	0.26	0.30066666666666664
6/50	0.7035361130097972	0.38694638694638694	0.2748095238095237
7/50	0.6536219947974329	0.32041343669250644	0.23060714285714287
8/50	0.7042606739333186	0.431924882629108	0.19844498556998555
9/50	0.6397975708502024	0.3057324840764331	0.1064047619047619
10/50	0.6888824189095928	0.30654205607476637	0.2621111111111111
11/50	0.7019264691423712	0.22760290556900725	0.16883333333333334
12/50	0.6398055704458143	0.2554517133956386	0.3508333333333333
13/50	0.7121439845715425	0.32415902140672787	0.27416666666666667
14/50	0.7407922477925019	0.35937499999999994	0.1253751526251526
15/50	0.6640708705900688	0.24936386768447835	0.11
16/50	0.6537110072386145	0.260204081632653	0.204
17/50	0.6796770336651351	0.2705314009661836	0.1729262265512266
18/50	0.69570

In [17]:
# Horizon of 6 months; m6 = (last_model, roc_auc_scores, thresholds, f1_scores).
m6 = run_training(months=6, check_shift=1)

1/49	0.570359391011565	0	0.24
2/49	0.6402901554404143	0.4075471698113208	0.2525
3/49	0.6342461441611584	0.2995391705069124	0.31192857142857133
4/49	0.6559348068401164	0.31125827814569534	0.18
5/49	0.6848079975244806	0.3469879518072289	0.313952380952381
6/49	0.6299177558770764	0.2777777777777778	0.27783333333333327
7/49	0.7007292128193767	0.43269230769230776	0.44
8/49	0.6362964227994468	0.3053763440860215	0.28
9/49	0.6788152132084807	0.3057851239669422	0.2951984126984126
10/49	0.7001799437007827	0.2204724409448819	0.17011904761904761
11/49	0.6358224554945866	0.2411764705882353	0.29995598845598836
12/49	0.7158158104870909	0.3472222222222222	0.32
13/49	0.7149080853185847	0.34957020057306587	0.136
14/49	0.6753734509210875	0.29629629629629634	0.13933333333333334
15/49	0.6470457034838527	0.21081081081081082	0.0325
16/49	0.674995327277485	0.2849872773536896	0.053690476190476184
17/49	0.6976081434040573	0.33183856502242154	0.18575901875901868
18/49	0.647419498029595	0.26107226107226106	0.15396

In [18]:
# Horizon of 7 months; m7 = (last_model, roc_auc_scores, thresholds, f1_scores).
m7 = run_training(months=7, check_shift=1)

1/48	0.6057870504430416	0	0.2
2/48	0.6870022775465326	0.37908496732026137	0.355
3/48	0.6102054978827063	0.26785714285714285	0.2650238095238095
4/48	0.689817880794702	0.38461538461538464	0.48797799422799415
5/48	0.6234945725378337	0.25146198830409355	0.19442857142857142
6/48	0.6834614842658074	0.41813602015113344	0.45
7/48	0.6529053186162455	0.30357142857142855	0.16221428571428576
8/48	0.6976711479786423	0.3311258278145695	0.29166666666666663
9/48	0.6875808936825887	0.21714285714285714	0.21478787878787878
10/48	0.6451982368661568	0.2506527415143603	0.28725
11/48	0.7025897535836474	0.2787878787878788	0.31825072150072153
12/48	0.7200913508086205	0.3559870550161812	0.22279761904761905
13/48	0.6541548797736916	0.2558139534883721	0.23289285714285712
14/48	0.6729019762953667	0.23918575063613232	0.059666666666666666
15/48	0.6653918943488537	0.2762148337595908	0.10728571428571428
16/48	0.7097388445626477	0.32835820895522383	0.20688492063492056
17/48	0.6575749614684041	0.22842639593908629	0.095


In [19]:
# Horizon of 8 months; m8 = (last_model, roc_auc_scores, thresholds, f1_scores).
m8 = run_training(months=8, check_shift=1)

1/47	0.6726978596316574	0	0.27
2/47	0.6700325732899023	0.3107569721115538	0.39195238095238105
3/47	0.6387537107208576	0.2710622710622711	0.2595952380952381
4/47	0.6269802220764389	0.2819672131147541	0.18842857142857145
5/47	0.6800628657607004	0.43410852713178294	0.285
6/47	0.631802444360073	0.2894736842105263	0.22
7/47	0.6989761581932088	0.32000000000000006	0.3141666666666667
8/47	0.7113351743625972	0.2554347826086957	0.22121139971139972
9/47	0.6266766485744588	0.24068767908309452	0.37691774891774893
10/47	0.7034346539726616	0.25066666666666665	0.16802175602175598
11/47	0.6939006700283208	0.31884057971014496	0.2872301587301586
12/47	0.6467800619881114	0.2875	0.24150074925074924
13/47	0.6545538759526717	0.25133689839572193	0.4326190476190477
14/47	0.6820715660001375	0.30170316301703165	0.18
15/47	0.6954465919032847	0.3018867924528302	0.19966666666666666
16/47	0.6433542101600557	0.2198391420911528	0.0425
17/47	0.6801951499193069	0.3144963144963145	0.10921428571428571
18/47	0.654220556482

In [20]:
# Horizon of 9 months; m9 = (last_model, roc_auc_scores, thresholds, f1_scores).
m9 = run_training(months=9, check_shift=1)

1/46	0.6313439270756344	0	0.2
2/46	0.6750456871345029	0.3539094650205762	0.3
3/46	0.5805650252525252	0.24022346368715083	0.27
4/46	0.6999698173952412	0.4558404558404559	0.33
5/46	0.6447399817872812	0.29047619047619044	0.34
6/46	0.6570804372502371	0.304147465437788	0.2667738095238095
7/46	0.6925556418748551	0.23566878980891717	0.18160622710622715
8/46	0.6269487750556794	0.2682926829268293	0.3221984126984126
9/46	0.7159342577212874	0.2922636103151863	0.16155647130647124
10/46	0.6966007126781696	0.29516539440203565	0.32888095238095233
11/46	0.6581321022727273	0.2185430463576159	0.125
12/46	0.6619320257140769	0.30487804878048785	0.18732178932178928
13/46	0.685465477918308	0.3096446700507614	0.37449999999999994
14/46	0.7144388791009371	0.32888888888888895	0.0837383984264789
15/46	0.6328243060526526	0.2367758186397985	0.03666666666666667
16/46	0.6800394144144145	0.3225806451612903	0.37
17/46	0.6413727913102674	0.3129251700680272	0.268
18/46	0.7079557056467447	0.30303030303030304	0.135
19/46	

In [21]:
# Horizon of 10 months; m10 = (last_model, roc_auc_scores, thresholds, f1_scores).
m10 = run_training(months=10, check_shift=1)

1/45	0.6195167722261319	0	0.24
2/45	0.5851318551561269	0.28936170212765955	0.3974999999999999
3/45	0.6382009868742635	0.33540372670807456	0.37
4/45	0.6311331212647002	0.30517711171662126	0.10333333333333332
5/45	0.6477873294086165	0.2923433874709977	0.30747402597402584
6/45	0.708486013986014	0.27932960893854747	0.2968015873015873
7/45	0.6218699909753371	0.25082508250825086	0.29354761904761906
8/45	0.7108920431382724	0.27388535031847133	0.19087301587301578
9/45	0.7151552378825107	0.30635838150289013	0.17095743145743147
10/45	0.677640872704442	0.2564102564102564	0.12896177738669995
11/45	0.6401428538299204	0.1971014492753623	0.13429911754911755
12/45	0.6714469985258322	0.30812324929971985	0.13142857142857142
13/45	0.7224215842636896	0.35835351089588374	0.16241666666666668
14/45	0.6573504393498943	0.2823529411764706	0.11161616161616163
15/45	0.6614876983584407	0.2997658079625293	0.415
16/45	0.6423534972376975	0.3194444444444444	0.23708333333333337
17/45	0.6943766937669378	0.31601731601731

In [22]:
# Horizon of 11 months; m11 = (last_model, roc_auc_scores, thresholds, f1_scores).
m11 = run_training(months=11, check_shift=1)

1/44	0.6163723776223776	0	0.52
2/44	0.6969882957465029	0.4594594594594595	0.33
3/44	0.5990588013094938	0.23648648648648646	0.225
4/44	0.6808363970588236	0.3097643097643098	0.24666666666666667
5/44	0.6974566714150047	0.26143790849673204	0.21
6/44	0.6257739938080494	0.2460567823343848	0.40078571428571425
7/44	0.7147883693354642	0.2625482625482625	0.16865476190476192
8/44	0.7043702747556704	0.31715210355987056	0.205
9/44	0.6712014183218372	0.24437299035369778	0.15566666666666668
10/44	0.6463604062360463	0.24927536231884054	0.17666666666666664
11/44	0.6751733795678848	0.26190476190476186	0.14125
12/44	0.7384446900843364	0.39577836411609496	0.245
13/44	0.6667536262130855	0.29156010230179025	0.12955158730158728
14/44	0.6882852178561881	0.3359173126614987	0.17666666666666667
15/44	0.6480680868838762	0.304147465437788	0.4428030303030303
16/44	0.6860221958769678	0.30769230769230765	0.21
17/44	0.6622344566789011	0.30368763557483736	0.255
18/44	0.7150205714545496	0.2925531914893617	0.194893578643

In [23]:
# Horizon of 12 months; m12 = (last_model, roc_auc_scores, thresholds, f1_scores).
m12 = run_training(months=12, check_shift=1)

1/43	0.7092119115656489	0	0.14
2/43	0.6120162195431014	0.303886925795053	0.1225
3/43	0.6002872135173059	0.21348314606741572	0.07083333333333333
4/43	0.7022458724871594	0.2872628726287263	0.195
5/43	0.6320711143695015	0.23908045977011497	0.37399206349206354
6/43	0.716871810465639	0.3231197771587744	0.19833333333333336
7/43	0.7065318204790705	0.3507853403141361	0.3969021256521257
8/43	0.6559477756286267	0.25787965616045844	0.20098160173160168
9/43	0.6313197653023674	0.25388601036269426	0.23104761904761906
10/43	0.6688471903565316	0.27722772277227725	0.1808816738816739
11/43	0.7334491994324238	0.34192037470725994	0.17074012099012098
12/43	0.6898786777586756	0.3291770573566085	0.1980079365079365
13/43	0.6763389068069937	0.32911392405063294	0.26190384615384615
14/43	0.6522445507708664	0.2974828375286041	0.06105011655011655
15/43	0.6873571428571428	0.2933884297520661	0.0975
16/43	0.6707367234392614	0.2862453531598513	0.07666666666666667
17/43	0.727856645692317	0.304950495049505	0.23916666666

In [27]:
# run_training results ordered by horizon: result_models[k - 1] belongs to months=k.
result_models = [m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12]

---

In [44]:
# Predict each of the 12 horizons from the rows in submission_extra, using the
# model and the mean threshold that run_training produced for that horizon.
results = []
non_feature_columns = ['hash_tab_num', 'date', 'orig_date', 'target']

for month in range(1, 13):
    # run_training returns (last_model, roc_auc_scores, thresholds, f1_scores).
    model = result_models[month - 1][0]
    th = np.mean(result_models[month - 1][2])

    # Move 'date' to the target month, then rebuild the date-dependent features.
    submission_extra['date'] = submission_extra['orig_date'] + pd.DateOffset(months=month)
    new_df = target_date_features(submission_extra)
    X = new_df.drop(columns=non_feature_columns).fillna(-100)

    # Threshold the predict_proba column at index 1.
    proba = model.predict_proba(X)[:, 1]
    submission_extra['target'] = (proba >= th).astype(int)
    results.append(submission_extra[['hash_tab_num', 'date', 'target']].copy())

result_df = pd.concat(results, ignore_index=True)
result_df.head()

Unnamed: 0,hash_tab_num,date,target
0,0,2019-09-01,1
1,1,2019-09-01,0
2,2,2019-09-01,0
3,3,2019-09-01,0
4,4,2019-09-01,1


In [45]:
# Read submission_check.csv, discard its 'target' column and parse the dates.
# NOTE(review): presumably the list of (hash_tab_num, date) rows expected in the submission - confirm.
check_df = pd.read_csv('submission_check.csv', sep=';')
check_df = check_df.drop(columns=['target'])
check_df['date'] = pd.to_datetime(check_df['date'], format='%Y-%m-%d')
check_df.head()

Unnamed: 0,hash_tab_num,date
0,0,2019-09-01
1,0,2019-10-01
2,0,2019-11-01
3,0,2019-12-01
4,0,2020-01-01


In [46]:
# Keep only the predictions whose (hash_tab_num, date) pair occurs in check_df
# (inner join), ordered by employee and date.
result_df_new = (
    result_df
    .merge(check_df, on=['hash_tab_num', 'date'])
    .sort_values(['hash_tab_num', 'date'])
)
result_df_new.head()

Unnamed: 0,hash_tab_num,date,target
0,0,2019-09-01,1
1757,0,2019-10-01,1
3509,0,2019-11-01,1
5265,0,2019-12-01,1
7025,0,2020-01-01,1


In [48]:
# Write the filtered predictions as a ';'-separated CSV without the index column.
result_df_new.to_csv('submission_8.csv', sep=';', index=False)

In [None]:
# NOTE(review): unexecuted scratch cell. Unlike the submission loop it does not
# drop 'target', so X would keep that column; it also needs new_df from that loop.
X = new_df.drop(columns=['hash_tab_num', 'date', 'orig_date']).fillna(-100)

In [29]:
# NOTE(review): X_train only exists as a local variable inside run_training,
# so this cell raises NameError at notebook scope (see the output below).
X_train.columns

NameError: name 'X_train' is not defined

In [35]:
# Mean of all per-step thresholds of the 2-month model vs. the mean of its last 10.
np.mean(m2[2]), np.mean(m2[2][-10:])

(0.2097956778116834, 0.22004723054723058)

In [28]:
# Display the frame that target_date_features builds from submission_extra.
target_date_features(submission_extra)

Unnamed: 0,hash_tab_num,date,gender,work_experience_company,home_to_work_distance,personel_num,category_Рабочие,category_Руководители,category_Служащие,category_Специалисты,education_Высшее,education_Начальное_среднее,education_Среднее_профессинальное,razryad_fact_0,razryad_fact_1,razryad_fact_2,razryad_fact_3,razryad_fact_4,razryad_fact_5,razryad_fact_6,orig_date,target,age,is_pensioner,relatives_0,relatives_1,relatives_2,relatives_3,relatives_4,relatives_5,relatives_6,relatives_7
52,0,2019-08-01,1,13.0,,,1,0,0,0,0,1,0,0,0,0,0,0,1,0,2019-08-01,0,34,0,0,0,1,0,0,0,1,0
105,1,2019-08-01,1,14.0,,121.0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,2019-08-01,0,36,0,1,0,2,0,1,0,3,1
158,2,2019-08-01,0,14.0,,,0,1,0,0,1,0,0,1,0,0,0,0,0,0,2019-08-01,0,52,0,0,0,0,0,1,0,1,0
211,3,2019-08-01,1,12.0,,121.0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,2019-08-01,0,43,0,0,0,1,0,0,0,1,0
264,4,2019-08-01,1,12.0,,121.0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,2019-08-01,0,33,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99209,2648,2019-08-01,1,,4953.5,,0,0,0,0,0,1,0,1,0,0,0,0,0,0,2019-08-01,0,39,0,1,1,0,1,0,0,2,1
99210,2649,2019-08-01,1,,,,0,0,0,0,0,0,1,1,0,0,0,0,0,0,2019-08-01,0,37,0,0,0,1,0,0,0,0,1
99211,2650,2019-08-01,1,,,,0,0,0,0,0,1,0,1,0,0,0,0,0,0,2019-08-01,0,25,0,0,0,0,0,0,0,0,0
99212,2651,2019-08-01,1,,2463.8,170.0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,2019-08-01,0,21,0,0,0,0,0,0,0,0,0


In [24]:
# Display submission_extra for inspection.
submission_extra

Unnamed: 0,hash_tab_num,date,gender,work_experience_company,home_to_work_distance,personel_num,category_Рабочие,category_Руководители,category_Служащие,category_Специалисты,education_Высшее,education_Начальное_среднее,education_Среднее_профессинальное,razryad_fact_0,razryad_fact_1,razryad_fact_2,razryad_fact_3,razryad_fact_4,razryad_fact_5,razryad_fact_6,orig_date,target
52,0,2019-08-01,1,13.0,,,1,0,0,0,0,1,0,0,0,0,0,0,1,0,2019-08-01,0
105,1,2019-08-01,1,14.0,,121.0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,2019-08-01,0
158,2,2019-08-01,0,14.0,,,0,1,0,0,1,0,0,1,0,0,0,0,0,0,2019-08-01,0
211,3,2019-08-01,1,12.0,,121.0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,2019-08-01,0
264,4,2019-08-01,1,12.0,,121.0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,2019-08-01,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99209,2648,2019-08-01,1,,4953.5,,0,0,0,0,0,1,0,1,0,0,0,0,0,0,2019-08-01,0
99210,2649,2019-08-01,1,,,,0,0,0,0,0,0,1,1,0,0,0,0,0,0,2019-08-01,0
99211,2650,2019-08-01,1,,,,0,0,0,0,0,1,0,1,0,0,0,0,0,0,2019-08-01,0
99212,2651,2019-08-01,1,,2463.8,170.0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,2019-08-01,0
