In [310]:
import pandas as pd
import numpy as np
import random
import pickle
import scipy
from IPython.display import display

import os

In [311]:
from sklearn import *
from lightgbm import LGBMClassifier

In [312]:
RANDOM_SEED = 42
def seed_everything(seed=42):
    """"
    Seed everything.
    """   
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(RANDOM_SEED)

In [313]:
def save_to_pickle(model, path):
    with open(path, 'wb') as f:
        pickle.dump(model, f)

# LOAD DATA

In [314]:
data = {}
for df_name in ['address', 'application', 'client', 'document', 'phone', 'work']:
    df = pd.read_csv(f'../data/csv_data/{df_name}.csv')
    if df_name == 'client':
        df['sex'] = df['sex'].fillna(df['middlename'].str[-1] == 'а').astype(np.bool_)
    data[df_name] = df
del df

# PREPROCESSING

In [315]:
categories = ['regionname', 'regioncode', 'countrycode', 'street', 'house',
              'opendate', 'appdatetime', 'signdate', 'card_number', 'client_snils', 'client_inn',
              'lastname', 'firstname', 'middlename', 'birthplacetown', 'birthdate',
              'series', 'number', 'issuedate', 'issuercode', 'issuer', 
              'phone_number',
              'title', 'inn']

In [316]:
X_text, y_text = [], []
X_numb, y_numb = [], []
X_text_test, y_text_test = [], []
X_numb_test, y_numb_test = [], []
test_data = {}
for df_name, df in data.items():
    X_train, X_test = model_selection.train_test_split(df, test_size=0.2, shuffle=True)
    test_data[df_name] = X_test
    for col_name, series in X_train.iteritems():
        category = col_name if col_name in categories else 'other'
        if 'date' in category:
            category = 'date'
        elif category in ['client_inn', 'inn']:
            category = 'client_inn'
        target = pd.Series([category] * len(series), name='target')
        if series.dtype == object:
            X_text.append(series)
            y_text.append(target)
        else:
            X_numb.append(series)
            y_numb.append(target)
            
    for col_name, series in X_test.iteritems():
        category = col_name if col_name in categories else 'other'
        if 'date' in category:
            category = 'date'
        elif category in ['client_inn', 'inn']:
            category = 'client_inn'
        target = pd.Series([category] * len(series), name='target')
        if series.dtype == object:
            X_text_test.append(series)
            y_text_test.append(target)
        else:
            X_numb_test.append(series)
            y_numb_test.append(target)
del series, target, col_name, X_train, X_test

In [317]:
for name, df in test_data.items():
    df.to_csv(f'../data/test_data/{name}.csv', index=False)

In [318]:
X_text = pd.concat(X_text)
y_text = pd.concat(y_text)
X_numb = pd.concat(X_numb)
y_numb = pd.concat(y_numb)

In [356]:
X_text_test = pd.concat(X_text_test)
y_text_test = pd.concat(y_text_test)
X_numb_test = pd.concat(X_numb_test)
y_numb_test = pd.concat(y_numb_test)

X_text_test.dropna().to_csv('../data/test_data/X_text_test_raw.csv', index=False)
X_numb_test.dropna().to_csv('../data/test_data/X_numb_test_raw.csv', index=False)
y_text_test.dropna().to_csv('../data/test_data/y_text_test_raw.csv', index=False)
y_numb_test.dropna().to_csv('../data/test_data/y_numb_test_raw.csv', index=False)

In [320]:
text_label_encoder = preprocessing.LabelEncoder()
y_text_lbl = text_label_encoder.fit_transform(y_text)
numb_label_encoder = preprocessing.LabelEncoder()
y_numb_lbl = numb_label_encoder.fit_transform(y_numb)

In [321]:
text_label_encoder.classes_

array(['birthplacetown', 'countrycode', 'date', 'firstname', 'house',
       'issuer', 'lastname', 'middlename', 'other', 'regionname',
       'series', 'street', 'title'], dtype=object)

In [322]:
save_to_pickle(text_label_encoder, '../models/text_label_encoder.pkl')
save_to_pickle(numb_label_encoder, '../models/numb_label_encoder.pkl')

# SENSETIVE DATA CLASSIFICATION

In [323]:
skf = model_selection.StratifiedKFold(shuffle=True)

## TEXT DATA

In [324]:
def squeeze(X):
    return X[:, 0]

In [325]:
text_pipe = pipeline.Pipeline(steps=[
    ('imputer', impute.SimpleImputer(strategy='constant', fill_value='NA')),
    ('reshape', preprocessing.FunctionTransformer(func=squeeze)),
    ('text_feature_extraction', pipeline.FeatureUnion(n_jobs=-1, transformer_list=[
        ('count_vectorizer_char_wb', feature_extraction.text.CountVectorizer(analyzer='char_wb', ngram_range=(1,2))),
        ('count_vectorizer_char', feature_extraction.text.CountVectorizer(analyzer='char', ngram_range=(1,2))),
        ('count_vectorizer_word', feature_extraction.text.CountVectorizer(analyzer='word')),
        ('tf_idf_vectorizer_char_wb', feature_extraction.text.TfidfVectorizer(analyzer='char_wb', ngram_range=(1,2))),
        ('tf_idf_vectorizer_char', feature_extraction.text.TfidfVectorizer(analyzer='char', ngram_range=(1,2))),
        ('tf_idf_vectorizer_word', feature_extraction.text.TfidfVectorizer(analyzer='word')),
    ])),
    ('model', LGBMClassifier())
])

In [326]:
cv_text = model_selection.cross_validate(estimator=text_pipe,
                                         X=X_text.values.reshape(-1,1),
                                         y=y_text_lbl,
                                         scoring=['f1_macro', 'roc_auc_ovr'],
                                         cv=skf,
                                         n_jobs=-1,
                                         verbose=5)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  1.6min remaining:  2.5min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.7min finished


In [352]:
pd.DataFrame(cv_text)

Unnamed: 0,fit_time,score_time,test_f1_macro,test_roc_auc_ovr
0,93.419942,4.325528,0.824121,0.984405
1,93.531802,4.145356,0.80906,0.980976
2,98.740987,1.653193,0.811641,0.982909
3,91.844333,4.135791,0.822104,0.983977
4,92.063936,4.397916,0.824528,0.983755


In [328]:
cv_text_pred = model_selection.cross_val_predict(estimator=text_pipe, 
                                                 X=X_text.values.reshape(-1,1),
                                                 y=y_text_lbl,
                                                 cv=skf)

cv_text_pred = pd.DataFrame({'PRED': text_label_encoder.inverse_transform(cv_text_pred),
                             'TRUE': text_label_encoder.inverse_transform(y_text_lbl)})

In [329]:
cv_text_pred.head()

Unnamed: 0,PRED,TRUE
0,regionname,regionname
1,issuer,regionname
2,series,regionname
3,regionname,regionname
4,regionname,regionname


In [330]:
text_pipe.fit(X_text.values.reshape(-1,1), y_text_lbl)

Pipeline(steps=[('imputer',
                 SimpleImputer(fill_value='NA', strategy='constant')),
                ('reshape',
                 FunctionTransformer(func=<function squeeze at 0x7fbf9d3cb560>)),
                ('text_feature_extraction',
                 FeatureUnion(n_jobs=-1,
                              transformer_list=[('count_vectorizer_char_wb',
                                                 CountVectorizer(analyzer='char_wb',
                                                                 ngram_range=(1,
                                                                              2))),
                                                ('count_vectorizer_char',
                                                 CountVectorizer(analyzer='char',
                                                                 ngram_range=(1,
                                                                              2))),
                                                ('count_ve

In [331]:
save_to_pickle(text_pipe, '../models/text_pipe.pkl')

## NUMERIC DATA

In [332]:
def get_stats(X: np.array) -> np.array:
    if isinstance(X, pd.DataFrame):
        X = X.values
    return np.hstack([X // 10**i for i in range(12)])

statistics_transformer = preprocessing.FunctionTransformer(func=get_stats)

In [333]:
numb_pipe = pipeline.Pipeline(steps=[
    ('imputer', impute.SimpleImputer(fill_value=-1, strategy='constant')),
    ('statistics', statistics_transformer),
    ('model', LGBMClassifier())
])

In [334]:
cv_numb = model_selection.cross_validate(estimator=numb_pipe,
                                         X=X_numb.values.reshape(-1,1),
                                         y=y_numb_lbl,
                                         scoring=['f1_macro', 'roc_auc_ovr'],
                                         n_jobs=-1,
                                         verbose=5)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    5.2s remaining:    7.9s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    5.8s finished


In [353]:
pd.DataFrame(cv_numb)

Unnamed: 0,fit_time,score_time,test_f1_macro,test_roc_auc_ovr
0,3.899663,1.314537,0.753462,0.965469
1,3.897893,1.241724,0.817514,0.972279
2,3.877337,1.334542,0.743969,0.946608
3,4.104791,0.979829,0.847346,0.986803
4,4.110117,1.178703,0.713693,0.937371


In [336]:
cv_numb_pred = model_selection.cross_val_predict(estimator=numb_pipe, 
                                                 X=X_numb.values.reshape(-1,1), 
                                                 y=y_numb_lbl)

cv_numb_pred = pd.DataFrame({'PRED': numb_label_encoder.inverse_transform(cv_numb_pred),
                             'TRUE': numb_label_encoder.inverse_transform(y_numb_lbl)})

In [337]:
cv_numb_pred.head()

Unnamed: 0,PRED,TRUE
0,other,other
1,other,other
2,other,other
3,other,other
4,other,other


In [338]:
numb_pipe.fit(X=X_numb.values.reshape(-1,1), y=y_numb_lbl)

Pipeline(steps=[('imputer', SimpleImputer(fill_value=-1, strategy='constant')),
                ('statistics',
                 FunctionTransformer(func=<function get_stats at 0x7fbf855cd950>)),
                ('model', LGBMClassifier())])

In [339]:
save_to_pickle(numb_pipe, '../models/numb_pipe.pkl')

# REPLACING SENSETIVE DATA

## NAMES

In [340]:
NAMES = (pd.read_csv('../data/csv_data/russian_names.csv', sep=';', usecols=['Name', 'Sex'])
        .replace({'Sex':{'Ж': 1, 'М': 0}})
        .rename(columns={'Name': 'firstname', 'Sex': 'sex'})
        .astype({'sex': np.bool_}))
SURNAMES = pd.read_csv('../data/csv_data/russian_surnames.csv', sep=';', usecols=['Surname', 'Sex'])
SURNAMES['Sex'] = SURNAMES['Surname'].str[-1] == 'а'
SURNAMES = (SURNAMES.astype({'Sex': np.bool_})
                    .rename(columns={'Surname': 'lastname', 'Sex': 'sex'}))
MIDDLENAMES = data['client'][['middlename']].drop_duplicates()
MIDDLENAMES['sex'] = MIDDLENAMES['middlename'].str[-1] == 'а'


display(NAMES.head())
display(SURNAMES.head())
display(MIDDLENAMES.head())

Unnamed: 0,firstname,sex
0,Аалия,True
1,Аанжелла,True
2,Аба,True
3,Абав,True
4,Абам,True


Unnamed: 0,lastname,sex
0,Аалферов,False
1,Аалферова,True
2,Ааль,False
3,Ааман,False
4,Аамана,True


Unnamed: 0,middlename,sex
0,Михайловна,True
1,Валериевич,False
2,Октябрятовна,True
3,Миродарович,False
4,Валентиновна,True


In [341]:
save_to_pickle(NAMES, '../data/catalogs/names.pkl')
save_to_pickle(SURNAMES, '../data/catalogs/surnames.pkl')
save_to_pickle(MIDDLENAMES, '../data/catalogs/middlenames.pkl')

In [342]:
def replace_names(original_names: pd.DataFrame, fake_names: pd.DataFrame, name_col: str) -> pd.Series:
    # я решил ориентироваться на поле sex в исходных данных при генерации имени определенного рода
    # так как сами имена очень грязные, часто по роду не совпадают имя, отчество и фамилия
    
    assert ('sex' in original_names) and ('sex' in fake_names)
    assert set(original_names['sex']) >= set(fake_names['sex'])
    
    for sex in original_names['sex'].unique():
        orig_mask = original_names['sex'] == sex
        fake_mask = fake_names['sex'] == sex
        
        sample_fake_names = fake_names.loc[fake_mask, name_col].sample(sum(orig_mask), replace=True).values
        mapping = {orig_name: np.random.choice(sample_fake_names) for orig_name in original_names.loc[orig_mask, name_col].unique()}
        
        original_names.loc[orig_mask, name_col] = original_names.loc[orig_mask, name_col].map(mapping)
        
    null_mask = original_names[name_col].isnull()
    if sum(null_mask) > 0:
        original_names[name_col] = original_names[name_col].fillna(fake_names[name_col].sample(sum(null_mask)).values)
        
    return original_names[name_col]

##  CITIES

In [343]:
CITIES = pd.read_csv('../data/csv_data/koord_russia.csv', sep=';', encoding='cp1251', usecols=['Город'])['Город']
save_to_pickle(CITIES, '../data/catalogs/cities.pkl')

In [344]:
def replace_geodata(original: pd.Series, fake: pd.Series) -> pd.Series:
    # я решил использовать города и название регионов "как есть". Так как в реальной жизни обычно данные с городом обычно приведены к нормальному виду
    # и нормализация и очистка данных не задача данного контеста
    
    sample_fake_names = fake.sample(len(original), replace=True).values
    mapping = {orig_name: np.random.choice(sample_fake_names) for orig_name in original.unique()}

    original = original.map(mapping)
    return original

## BIRTH DATE

In [345]:
def replace_date(original_dates: pd.Series, date_format='%Y-%m-%d') -> pd.Series:
    # тут необходимо пояснение какие свойства в целом должны оставаться 
    # у измененных данных. Я принял решение добавить случайно +- 90 дней к оригинальной дате рождения
    # чтобы сохранить и некоторые общие свойства распределениея дат и в то же время анонимизировать данные
    
    if not np.issubdtype(original_dates.dtype , np.datetime64):
        original_dates = pd.to_datetime(original_dates, format=date_format)
    return original_dates.apply(lambda x: x + pd.Timedelta(days=np.random.randint(1,90)))
    

In [346]:
test_data.keys()

dict_keys(['address', 'application', 'client', 'document', 'phone', 'work'])

## REGION

In [347]:
REGIONS = pd.read_csv('../data/csv_data/koord_russia.csv', sep=';', encoding='cp1251', usecols=['Регион'])['Регион'].drop_duplicates()
STREETS = data['address']['street'].drop_duplicates().dropna().values
HOUSES = pd.Series(list(range(150)))

In [348]:
save_to_pickle(REGIONS, '../data/catalogs/regions.pkl')
save_to_pickle(STREETS, '../data/catalogs/streets.pkl')
save_to_pickle(HOUSES, '../data/catalogs/houses.pkl')

## СНИЛС

In [349]:
def snils_generator() -> pd.Series:
    def _generate_snils_number() -> np.array:
        return np.random.randint(0, 9, size=9)
    
    def _get_control_number(sum_: int) -> str:
        if sum_ < 100:
            control_number = str(sum_)
        elif 100 <= sum_ <= 101:
            control_number = '00'
        else:
            raise ValueError('sum_ should be <= 101')
        return control_number

    def _check_snils_number(snils_number: np.array) -> bool:
        first_triple = int(''.join(map(str, snils_number[:3])))
        second_triple = int(''.join(map(str, snils_number[3:6])))
        third_triple = int(''.join(map(str, snils_number[6:9])))
        if first_triple > 1:
            return True
        elif second_triple > 1:
            return True
        elif third_triple > 998:
            return True
        else:
            return False
        
    snils_number = _generate_snils_number()
    while not _check_snils_number(snils_number):
        snils_number = _generate_snils_number()
    s = sum([i * n for i, n in zip(list(range(1,10))[::-1], snils_number)])
    
    while s > 101:
        s = s % 101
    control_number = _get_control_number(s)
    return ''.join(map(str, snils_number)) + control_number

## ИНН

In [350]:
def inn_ctrl_summ(nums, type):
    """
    Подсчет контрольной суммы
    """
    inn_ctrl_type = {
        'n2_12': [7, 2, 4, 10, 3, 5, 9, 4, 6, 8],
        'n1_12': [3, 7, 2, 4, 10, 3, 5, 9, 4, 6, 8],
        'n1_10': [2, 4, 10, 3, 5, 9, 4, 6, 8],
    }
    n = 0
    l = inn_ctrl_type[type]
    for i in range(0, len(l)):
        n += nums[i] * l[i]
    return n % 11 % 10


def inn_gen(l=None):
    def rnd(low: int, high: int) -> int:
        return np.random.randint(low, high)
    """
    Генерация ИНН (10 или 12 значный)
    На входе указывается длина номера - 10 или 12.
    Если ничего не указано, будет выбрана случайная длина.
    """
    if not l:
        l = list((10, 12))[rnd(0, 1)]
    if l not in (10, 12):
        return None
    nums = [
        rnd(1, 9) if x == 0
        else rnd(0, 9)
        for x in range(0, 9 if l == 10 else 10)
    ]
    if l == 12:
        n2 = inn_ctrl_summ(nums, 'n2_12')
        nums.append(n2)
        n1 = inn_ctrl_summ(nums, 'n1_12')
        nums.append(n1)
    elif l == 10:
        n1 = inn_ctrl_summ(nums, 'n1_10')
        nums.append(n1)
    return ''.join([str(x) for x in nums])


def inn_check(inn):
    """
    Проверка ИНН на корректность
    В соответствии с алгоритмом, описанным по ссылке:
        https://ru.wikipedia.org/wiki/Контрольное_число
    """
    sinn = str(inn)
    nums = [int(x) for x in sinn]
    if len(sinn) == 10:
        n1 = inn_ctrl_summ(nums, 'n1_10')
        return n1 == nums[-1]
    elif len(sinn) == 12:
        n2 = inn_ctrl_summ(nums, 'n2_12')
        n1 = inn_ctrl_summ(nums, 'n1_12')
        return n2 == nums[-2] and n1 == nums[-1]
    else:
        return False

In [351]:
test_data['document']

Unnamed: 0,app_id,run_id,client_id,document_id,documenttype,series,number,issuedate,issuercode,issuer
1266,2300155167,621180,2000897397,434367,1.0,8921,914923.0,2005-06-21,193051.0,ОВД РОМНЕНСКОГО РАЙОНА АМУРСКОЙ ОБЛ.
2996,2300155421,621519,2000898198,434735,1.0,3830,77174.0,2015-05-01,386804.0,ГУ МВД РОССИИ ПО ВОЛГОГРАДСКОЙ ОБЛ.
2827,2300155167,621393,2000897397,434603,1.0,8921,914923.0,2005-06-21,193051.0,ОВД РОМНЕНСКОГО РАЙОНА АМУРСКОЙ ОБЛ.
2556,2300154808,620754,2000895597,433870,1.0,1237,874531.0,2019-06-07,386802.0,МВД ПО УДМУРТСКОЙ РЕСП.
933,2300154744,620696,2000895122,433802,1.0,9683,641994.0,2019-10-10,931210.0,КАНЕВСКИМ РОВД КРАСНОДАРСКОГО КРАЯ
...,...,...,...,...,...,...,...,...,...,...
2076,2300152876,619707,2000886000,432722,1.0,3830,642075.0,2012-01-22,386808.0,ОТДЕЛЕНИЕМ УФМС РОССИИ ПО ЗАБАЙКАЛЬСКОМУ КРАЮ ...
2759,2300155111,621088,2000897197,434253,1.0,6191,626874.0,2016-09-23,435696.0,2 ОМ БЕЛОВСКОГО УВД КЕМЕРОВСКОЙ ОБЛ.
3064,2300155551,621657,2000898898,434883,1.0,1235,874530.0,2019-06-07,386802.0,МВД ПО УДМУРТСКОЙ РЕСП.
1428,2300155416,621473,2000898030,434686,1.0,6041,471986.0,2010-06-18,386908.0,ОВД СЕРГИЕВСКОГО РАЙОНА САМАРСКОЙ ОБЛ.
