In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Directory holding the raw CSV inputs; a relative path, so it is resolved against
# the working directory the notebook is run from.
DATA_PATH = Path('../data/raw/')

In [3]:
# Load the three raw tables. Every file is ';'-delimited; the employee and OGRV
# tables additionally have their `date` column parsed to datetime on read.
# NOTE(review): the on-disk date format is not visible here — confirm that the
# default parser (no `dayfirst`) interprets it correctly.
employee_df = pd.read_csv(
    DATA_PATH.joinpath('sotrudniki.csv'),
    sep=';',
    parse_dates=['date'],
)
relatives_df = pd.read_csv(
    DATA_PATH.joinpath('rodstvenniki.csv'),
    sep=';',
)
ogrv_df = pd.read_csv(
    DATA_PATH.joinpath('OGRV.csv'),
    sep=';',
    parse_dates=['date'],
)

In [4]:
def preprocess_employee(df):
    """Return a cleaned copy of the employee table.

    The input frame is NOT modified; a new DataFrame is returned, so the raw
    data stays available to the caller.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``gender``, ``married``, ``category``,
        ``name_post_lvl4``, ``name_post_lvl5``, ``prof_post_augment``,
        ``name_fact_lvl4``, ``name_fact_lvl5``, ``prof_fact_augment`` and
        ``education``.

    Returns
    -------
    pandas.DataFrame
        A new frame in which

        * ``gender`` is encoded as 0 ('мужской') / 1 ('женский');
        * ``is_married`` is added, derived from ``married`` (1 for 'жен/зм'
          and 'ГрБрак', 0 for 'хол/нз', 'разв.' and 'вдов.');
        * the categorical text columns are converted to ``category`` dtype.

        Labels that are missing from either mapping become NaN.

    Raises
    ------
    KeyError
        If any of the required columns is absent.
    """
    gender_codes = {'мужской': 0, 'женский': 1}
    married_codes = {'жен/зм': 1, 'ГрБрак': 1, 'хол/нз': 0, 'разв.': 0, 'вдов.': 0}
    category_columns = [
        'category',
        'name_post_lvl4',
        'name_post_lvl5',
        'prof_post_augment',
        'name_fact_lvl4',
        'name_fact_lvl5',
        'prof_fact_augment',
        'education',
    ]

    # `assign` and `astype` both return new frames, so the caller's object is
    # never written to. `married` itself is kept alongside the new flag.
    encoded = df.assign(
        gender=df['gender'].map(gender_codes),
        is_married=df['married'].map(married_codes),
    )
    return encoded.astype({column: 'category' for column in category_columns})

In [5]:
def preprocess_relatives(df):
    """Return a cleaned copy of the relatives table.

    Converts ``rel_type`` to ``category`` dtype and drops fully duplicated
    rows. The input frame is NOT modified (previously its ``rel_type`` column
    was converted in place while the de-duplication only applied to the
    returned copy, leaving the caller's frame half-processed).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``rel_type`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with duplicates removed; the surviving rows keep their
        original index labels.

    Raises
    ------
    KeyError
        If ``rel_type`` is absent.
    """
    typed = df.astype({'rel_type': 'category'})
    return typed.drop_duplicates()

In [6]:
def preprocess_ogrv(df):
    """Return a cleaned copy of the OGRV table.

    Converts the three descriptor columns to ``category`` dtype and parses
    ``number_of_working_hours`` (text with a decimal comma, e.g. ``'7,5'``)
    into a numeric column.

    The function works only on its argument: the earlier version read and
    wrote the global ``ogrv_df`` for the hours column, so it failed or
    modified the wrong frame whenever it was called with any other
    DataFrame. The input frame is NOT modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``graphic_rule_level_1``, ``graphic_rule_level_2``,
        ``work_shift_type`` and ``number_of_working_hours``.

    Returns
    -------
    pandas.DataFrame
        New frame with the converted columns.

    Raises
    ------
    KeyError
        If a required column is absent.
    ValueError
        If an hours value cannot be parsed as a number.
    """
    hours = df['number_of_working_hours']
    # Only text needs the decimal-comma fix; a column that was already parsed
    # as numeric has no `.str` accessor and is passed through unchanged.
    if not pd.api.types.is_numeric_dtype(hours):
        hours = hours.str.replace(',', '.', regex=False)

    cleaned = df.assign(number_of_working_hours=pd.to_numeric(hours))
    return cleaned.astype({
        'graphic_rule_level_2': 'category',
        'graphic_rule_level_1': 'category',
        'work_shift_type': 'category',
    })

In [7]:
# Run each cleaning function on its table and rebind the same name to the result.
# NOTE(review): because the names are rebound, re-running this cell feeds the
# already-processed frames back in (e.g. `gender` would be passed through the
# text-to-code map a second time). Re-run the CSV loading cell first.
employee_df = preprocess_employee(employee_df)
relatives_df = preprocess_relatives(relatives_df)
ogrv_df = preprocess_ogrv(ogrv_df)

In [12]:
# Persist the cleaned tables as pickles for downstream notebooks.
INTERIM_PATH = Path('../data/interim')
# `to_pickle` does not create missing directories, so make sure the target
# exists; otherwise this cell fails on a fresh checkout.
INTERIM_PATH.mkdir(parents=True, exist_ok=True)

employee_df.to_pickle(INTERIM_PATH / 'employee.pkl')
relatives_df.to_pickle(INTERIM_PATH / 'relatives.pkl')
ogrv_df.to_pickle(INTERIM_PATH / 'ogrv.pkl')