In [16]:
import pandas as pd
import numpy as np
import json
from collections import defaultdict

# Let wide/long frames render almost in full (up to 1000 columns and rows).
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 1000

In [2]:
# Load the source tables; all three files are semicolon-delimited.

sot = pd.read_csv('sotrudniki.csv', delimiter=';')
rod = pd.read_csv('rodstvenniki.csv', delimiter=';')
ogrv = pd.read_csv('OGRV.csv', delimiter=';')

## Выделил таргет - человек, дата, таргет (болел или нет)

In [None]:
# Persist the target table: one row per (hash_tab_num, date) with the `sick` label.
sot.loc[:, ['hash_tab_num', 'date', 'sick']].to_csv(
    'transformed_data/train_target_df.csv',
    index=False,
)

## Дата рождения

In [9]:
# Unique (hash_tab_num, date_of_birth) pairs with a fresh 0..n-1 index.
temp_df = (
    sot.loc[:, ['hash_tab_num', 'date_of_birth']]
    .drop_duplicates()
    .reset_index(drop=True)
)
temp_df.head(5)

Unnamed: 0,hash_tab_num,date_of_birth
0,0,1985
1,1,1983
2,2,1967
3,3,1976
4,4,1986


In [15]:
# Map hash_tab_num -> date_of_birth from the two-column frame.
date_of_birth_dict = {tab_num: birth_year for tab_num, birth_year in temp_df.values}
# Equal sizes mean no hash_tab_num appears with two different birth values.
(temp_df.shape, len(date_of_birth_dict))

((2653, 2), 2653)

In [23]:
# Cast keys and values to built-in ints before serializing: entries taken from
# DataFrame.values may be numpy scalars, which the json module cannot encode.
date_of_birth_dict = dict(
    (int(tab_num), int(birth_year))
    for tab_num, birth_year in date_of_birth_dict.items()
)

with open('transformed_data/date_of_birth.json', mode='w') as f:
    json.dump(obj=date_of_birth_dict, fp=f)

## Родственники

In [30]:
# Relation types that the relatives-processing loop labels as male ('M');
# every relation type missing from this list is labelled female ('F').
males = [
    'Сын', 'Муж', 'Отец', 'Брат',
    'Пасынок', 'Внук', 'Отчим',
    # NOTE(review): the next two labels do not name a sex by themselves;
    # they are kept in this list exactly as in the original.
    'Опекаемый (воспитанник)',
    'Другая степень родства, свойства',
    'Племянник',
]

In [64]:
# Group relatives per employee: {hash_tab_num: [[sex, birth_year], ...]}.
relatives_dict = defaultdict(list)

# Each row of `rod` is unpacked positionally as (employee id, relation type, birth year).
for hash_tab_num, rel_type, rel_birth in rod.values:
    try:
        rel_birth = int(rel_birth)
    except ValueError:
        # Birth year that cannot be converted is stored as the sentinel -1.
        rel_birth = -1

    hash_tab_num = int(hash_tab_num)
    # Anything not listed in `males` is labelled female.
    sex = 'F' if rel_type not in males else 'M'
    relatives_dict[hash_tab_num] += [[sex, rel_birth]]

# Sanity check: number of grouped employees vs. distinct ids in the source table.
print(len(relatives_dict), rod['hash_tab_num'].nunique())

1898 1898


In [65]:
# NOTE: JSON object keys are always strings, so the integer hash_tab_num keys
# are written out as strings here.
with open('transformed_data/relatives_info.json', mode='w') as f:
    json.dump(obj=relatives_dict, fp=f)

---

In [66]:
# Reload the relatives mapping from disk; after json.load the keys are strings.
with open('transformed_data/relatives_info.json') as f:
    relatives_dict = json.load(fp=f)


def calc_relatives_bins(hash_tab_num, calc_date, relatives_dict):
    '''
    Count an employee's relatives by age group and by sex.

    Args:
        hash_tab_num: employee identifier used as a key of ``relatives_dict``.
            Both the ``int`` and the ``str`` form are tried, because a mapping
            restored with ``json.load`` has string keys even when it was
            built with integer ones.
        calc_date: date string whose first four characters are the year
            (e.g. ``'2019-08-01'``); ages are computed from the year only.
        relatives_dict: mapping ``hash_tab_num -> [[sex, birth_year], ...]``
            where ``sex`` is ``'M'`` or ``'F'`` and a negative ``birth_year``
            marks an unknown year.

    Returns:
        A list of 8 counters:
            0: age 0 - 3 (infant)
            1: age 4 - 7 (child)
            2: age 8 - 18 (school age)
            3: age 19 - 35 (young adult)
            4: age 36 - 54 (F) / 36 - 59 (M) (pre-retirement)
            5: age 55+ (F) / 60+ (M) (retired)
            6: number of male relatives
            7: number of female relatives
        Relatives with an unknown birth year, or born after the year of
        ``calc_date``, are counted only in the sex counters (6 and 7).
        All zeros are returned when the employee has no entry.
    '''

    bins = [0] * 8

    # Try the key as given, then its str and int forms, so that int ids work
    # against a JSON-loaded dict (str keys) and vice versa.
    candidate_keys = [hash_tab_num, str(hash_tab_num)]
    try:
        candidate_keys.append(int(hash_tab_num))
    except (TypeError, ValueError, OverflowError):
        pass

    relatives = None
    for key in candidate_keys:
        # Membership test (not indexing) so a defaultdict is never mutated.
        if key in relatives_dict:
            relatives = relatives_dict[key]
            break
    if relatives is None:
        return bins

    cur_date = int(calc_date[:4])
    for (sex, birth_date) in relatives:
        if sex == 'M':
            bins[6] += 1
        elif sex == 'F':
            bins[7] += 1

        # Negative birth year is the "unknown" sentinel: no age bin.
        if birth_date < 0:
            continue

        age = cur_date - birth_date
        if age < 0:
            # Born after the calculation year: not counted in any age bin.
            continue
        elif age <= 3:
            bins[0] += 1
        elif age <= 7:
            bins[1] += 1
        elif age <= 18:
            bins[2] += 1
        elif age <= 35:
            bins[3] += 1
        else:
            if (sex == 'M' and age >= 60) or (sex == 'F' and age >= 55):
                bins[5] += 1
            else:
                bins[4] += 1
    return bins