In [1]:
import os
import sys
import pickle
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
sys.path.append('/home/u2/debug_scoring')
# sys.path.append('/home/u2/scoring')
from scoring import Agreements, Statuses, Features

from FeaturesSelection.lib.features_selection import FeaturesSelection
from Sampling.lib.sampling import Sampling as Sampling
from Modeling.lib.modeling import Modeling
from Metrics.lib.metrics import Metrics

# Notebook-level helper instances from the project libraries imported above.
# `features_selection` and `metrics` are read as globals by functions defined
# further down in this notebook (processing_data / unpack_sav and df_cum_sum).
modeling = Modeling(n_jobs=-1, accuracy_threshold=0.6, log=False)
features_selection = FeaturesSelection()
sampling = Sampling()
metrics = Metrics()

# pd.options.display.max_rows = 999

In [2]:
def load_card(name_card,
              id_card = '_v0',
              load_path: str = 'models/',):
    """
    Load the pickled modelling results of a model card.

    The file is read from ``<load_path>/<name_card>/<name_card><id_card>.sav``.

    Parameters
    ----------
    name_card : str
        Card name; used both as the sub-directory name and as the file-name prefix.
    id_card : str
        Suffix that identifies the saved version (default ``'_v0'``).
    load_path : str
        Root directory holding the per-card sub-directories. A trailing
        path separator is optional.

    Returns
    -------
    object
        The object that was pickled into the ``.sav`` file.

    Raises
    ------
    FileNotFoundError
        If the ``.sav`` file does not exist.
    """
    # os.path.join only inserts a separator when one is missing, so both
    # 'models/' and 'models' resolve to the same file (plain string
    # concatenation turned 'models' into 'modelsLR/').
    file_path = os.path.join(load_path, name_card, f'{name_card}{id_card}.sav')
    # SECURITY: pickle.load can execute arbitrary code while deserialising;
    # only open .sav files that come from a trusted source.
    with open(file_path, 'rb') as f:
        return pickle.load(f)

In [3]:
def unpack_sav(info_model):
    """
    Unpack a loaded .sav dictionary and build an evaluation set for its model.

    The evaluation window starts at the stored ``research_end`` and lasts
    ``timedelta(test_period)``. Agreements for that window are requested through
    ``Agreements``, rows with a missing ``nbki_cs__cnt_loans`` are dropped,
    targets and features are built with ``Statuses`` / ``Features`` and the
    merged frame is converted to model inputs by ``processing_data``.

    Parameters
    ----------
    info_model : dict
        Content of the .sav file. Keys read here: ``'model'``,
        ``'research_end'``, ``'all_params'`` and ``'features_importance'``;
        ``processing_data`` additionally reads ``'predict_list'``,
        ``'df_woe'``, ``'mas_predict_to_woe'`` and ``'predict_list_binning'``.

    Returns
    -------
    tuple
        ``(X, y, model, ser_features_importance)`` where ``X`` / ``y`` come
        from ``processing_data`` and ``ser_features_importance`` is the
        ``'_coef'`` column of the importance table indexed by ``'features'``.

    Notes
    -----
    Prints the ``decision_process_type`` (when present), the window bounds and
    the shape of the merged frame.
    """
    model = info_model['model']
    # Values stored next to the model in the .sav file
    research_end = info_model['research_end']
    dict_params = info_model['all_params']
    df_features_importance = info_model['features_importance']

    test_period = dict_params['test_period']
    status = dict_params['status']
    prolong = dict_params['prolong']
    model_type = dict_params['model_type']

    # Evaluation window: begins where the stored research period ended.
    # These two assignments used to be nested (by indentation) under the
    # `if 'decision_process_type' in dict_params:` block, which left
    # `research_start` undefined whenever that key was absent.
    research_start = research_end
    research_end = research_start + timedelta(test_period)

    # Build the Agreements arguments once; `decision_process_type` is passed
    # only when the saved parameters contain it.
    agreements_kwargs = dict(delay=status,
                             research_start=research_start,
                             research_end=research_end,
                             num_online=dict_params['num_online'],
                             num=dict_params['num'],
                             loan_id=dict_params['loan_id_status'],
                             cnt_loans=dict_params['cnt_loans'],
                             prolong=prolong)
    has_decision_process_type = 'decision_process_type' in dict_params
    if has_decision_process_type:
        agreements_kwargs['decision_process_type'] = dict_params['decision_process_type']

    agreements = Agreements(**agreements_kwargs).get()
    if has_decision_process_type:
        print(dict_params['decision_process_type'])

    agreements = agreements.dropna(axis = 0,subset = ['nbki_cs__cnt_loans'])
    get_status = Statuses(dataset=agreements,
                          target=status).get(prolong=prolong)

    f = Features(dataset=agreements,
                 data_type=model_type)
    get_features = f.get()

    df_type = f.get_features_list()

    # Join predictors and targets into a single dataframe
    df = pd.merge(get_status, get_features, left_index=True, right_index=True)
    df = df.drop(['app_id'], axis=1).join(agreements['creation_date'])

    print(research_start)
    print(research_end)
    print(df.shape)

    # Feature selection, binning, dummy encoding and column ordering are the
    # same steps processing_data performs, so delegate instead of duplicating.
    X, y = processing_data(df, df_type, info_model)

    # Series of feature importances (for plotting), indexed by feature name.
    # NOTE(review): the original comment said "absolute values", but `_coef`
    # is the signed column; confirm whether `iv` was intended.
    ser_features_importance = df_features_importance.set_index(
        df_features_importance['features'])['_coef']
    return X,y,model,ser_features_importance

In [4]:
def processing_data(df,df_type,info_model):
    """
    Convert a merged feature/target frame into the inputs expected by the model.

    Steps: keep the selected predictors plus the target, replace the values of
    the binned predictors via ``features_selection.interval_definition``,
    dummy-encode those predictors, add any expected dummy column that did not
    appear (filled with 0) and order the columns as in
    ``info_model['predict_list_binning']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns in ``info_model['predict_list']``
        and the target column. It is not modified.
    df_type
        Not used by this function; kept for call compatibility.
    info_model : dict
        Content of the .sav file. Keys read: ``'all_params'`` (for
        ``'target_name'``), ``'predict_list'``, ``'df_woe'``,
        ``'mas_predict_to_woe'`` and ``'predict_list_binning'``.

    Returns
    -------
    tuple
        ``(X, y)``: ``X`` holds the columns of ``predict_list_binning`` in
        that order, ``y`` is a one-column frame with the target.
    """
    target_name = info_model['all_params']['target_name']
    # Selected features plus the target. The explicit copy makes the
    # assignments below operate on an independent frame (no
    # SettingWithCopyWarning, no ambiguity about the caller's data).
    df = df[info_model['predict_list'] + [target_name]].copy()
    # Binning table stored in the .sav file
    df_woe = info_model['df_woe']
    # Features that have to be binned
    mas_predict_to_woe = info_model['mas_predict_to_woe']
    woe_bins = df_woe[df_woe['column_name'].isin(mas_predict_to_woe)]

    # Replace raw values with their intervals
    df.loc[:, mas_predict_to_woe] = features_selection.interval_definition(
        df.loc[:, mas_predict_to_woe], woe_bins).copy()
    # Dummy-encode the binned features
    df = pd.get_dummies(data=df, columns=mas_predict_to_woe)

    # Some bins may not occur in this data; add their dummy columns as 0.
    # Columns are added one by one because enlarging through
    # `.loc[:, list_of_new_labels]` is not supported by every pandas version.
    expected_columns = info_model['predict_list_binning']
    for column in expected_columns:
        if column not in df.columns:
            df[column] = 0

    # Split into X and y
    y = df[[target_name]]
    X = df.drop([target_name],axis = 1)

    # Same column order as X_train
    X = X[expected_columns]

    return X,y

In [5]:
def get_set_select_type(dict_params,
                        type_selection,
                        decision_process_type,
                        dev,
                        df = None,
                        research_start = None,
                        research_end = None,
                        use_build_data_for_valid = True,
                        set_app_id_as_index = False):
    """
    Select an X / y set according to the given parameters.

    Agreements for the requested period are fetched through ``Agreements``;
    rows with a missing ``nbki_cs__cnt_loans`` are dropped and statuses and
    features are built from them. If ``df`` is not given, the freshly built
    frame is converted with ``processing_data``; otherwise ``df`` is filtered
    to the fetched ``app_id`` values and to ``type_selection``.

    Parameters
    ----------
    dict_params : dict
        Loaded .sav content; ``'all_params'`` and ``'research_end'`` are read
        here, and it is passed to ``processing_data`` when ``df`` is None.
    type_selection
        Value that ``df['type_selection']`` must equal for a row to be kept.
    decision_process_type
        Forwarded to ``Agreements``.
    dev
        Forwarded as ``dev_table_postfix``; ``dev_table`` is True unless this
        is None.
    df : pandas.DataFrame, optional
        Saved sample with (at least) the columns ``app_id``,
        ``type_selection``, ``predict_proba_0``, ``status_predict`` and
        ``status``.
    research_start, research_end : optional
        Period bounds. When ``research_start`` is None it defaults to the
        build start (``use_build_data_for_valid=True``) or to
        ``research_end - timedelta(test_period)`` taken from ``dict_params``;
        ``research_end`` defaults to ``dict_params['research_end']``.
    use_build_data_for_valid : bool
        Whether the default period should start at the build start.
    set_app_id_as_index : bool
        If True, ``app_id`` becomes the index of the returned frames instead
        of being dropped.

    Returns
    -------
    tuple
        ``(X, y)``.

    Notes
    -----
    Prints the period bounds and the shape of the merged frame.
    """
    all_params = dict_params['all_params']

    # Default start date
    if research_start is None:
        if use_build_data_for_valid:
            # include the build sample
            research_start = all_params['research_start']
        else:
            research_start = dict_params['research_end'] - timedelta(all_params['test_period'])
    if research_end is None:
        research_end = dict_params['research_end']

    # The dev table is used whenever a postfix is supplied
    dev_table = dev is not None

    print(research_start)
    print(research_end)
    # NOTE(review): `model_type` is passed as both `add_features` and
    # `model_type`, as in the original call; confirm this is intended.
    agreements = Agreements(delay                 = all_params['status'],
                            research_start        = research_start,
                            research_end          = research_end,
                            num_online            = all_params['num_online'],
                            num                   = all_params['num'],
                            loan_id               = all_params['loan_id_status'],
                            cnt_loans             = all_params['cnt_loans'],
                            prolong               = all_params['prolong'],
                            decision_process_type = decision_process_type,
                            add_features          = all_params['model_type'],
                            model_type            = all_params['model_type'],
                            dev_table             = dev_table,
                            dev_table_postfix     = dev,

                            db_name='scoring3').get()
    agreements = agreements.dropna(axis = 0,subset = ['nbki_cs__cnt_loans'])

    get_status = Statuses(dataset=agreements,
                          target=all_params['status']).get(prolong=all_params['prolong'])

    f = Features(dataset=agreements,
                 data_type=all_params['model_type'])
    get_features = f.get()

    df_type1 = f.get_features_list()

    # Join predictors and targets into a single dataframe
    df1 = pd.merge(get_status, get_features, left_index=True, right_index=True)
    df1 = df1.drop(['app_id'], axis=1).join(agreements['creation_date'])

    print(df1.shape)
    if df is None:
        # Use the dictionary passed by the caller, not the notebook-level
        # `info_model` global, so the result always matches `dict_params`.
        X,y = processing_data(df1,df_type1,dict_params)
        return X,y

    # Keep rows of the requested type whose app_id is among the fetched ones.
    # NOTE(review): the two app_id columns are compared as-is; if their dtypes
    # differ (e.g. str vs int) nothing will match. Confirm against the data.
    select_app_id = df['app_id'].isin(get_status['app_id'])
    select_set = (df['type_selection'] == type_selection)
    df = df[select_app_id & select_set]

    # Drop the service columns; app_id either becomes the index or is dropped
    if set_app_id_as_index:
        df = df.drop(['predict_proba_0','status_predict','type_selection'],axis = 1)
        df = df.set_index('app_id')
    else:
        df = df.drop(['predict_proba_0','status_predict','type_selection','app_id'],axis = 1)

    # Split into X and y
    X = df.drop(['status'],axis = 1)
    y = df[['status']]

    return X,y

In [6]:
def load_data(name_card, id_card):
    """
    Load the sample CSV saved for a model card.

    The file is looked up in ``samples/<name_card>/<id_card>/`` (relative to
    the current working directory). The alphabetically first regular,
    non-hidden file is read, so the choice is deterministic and helper
    entries such as ``.ipynb_checkpoints`` are ignored.

    Parameters
    ----------
    name_card : str
        Card name (sub-directory of ``samples``).
    id_card : str
        Version identifier (sub-directory of the card directory).

    Returns
    -------
    pandas.DataFrame
        CSV content with the first column as index and ``app_id`` read as str.

    Raises
    ------
    FileNotFoundError
        If the directory does not exist or contains no sample file.
    """
    # Directory of the sample
    path = f'samples/{name_card}/{id_card}/'

    # os.listdir returns entries in arbitrary order and includes directories,
    # so filter and sort before picking the file.
    candidates = sorted(
        entry for entry in os.listdir(path)
        if not entry.startswith('.') and os.path.isfile(os.path.join(path, entry))
    )
    if not candidates:
        raise FileNotFoundError(f'No sample file found in {path!r}')

    # Read the data; app_id is kept as str
    data = pd.read_csv(os.path.join(path, candidates[0]),
                       index_col = 0,
                       dtype = {'app_id':str})

    return data

In [7]:
def df_cum_sum(X,y,model,size,inters,treshold,random_state=None):
    """
    Repeatedly sample the data and collect cumulative-sum statistics.

    On each of ``inters`` iterations ``size`` rows are drawn (without
    replacement) from ``X`` joined with ``y['status']``, passed to
    ``metrics.get_cum_sum`` and the resulting table is restricted to rows
    with ``predict_proba >= treshold``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame. It is not modified.
    y : pandas.DataFrame
        Frame with a ``'status'`` column aligned with ``X`` by index.
    model
        Model forwarded to ``metrics.get_cum_sum``.
    size : int
        Number of rows per sample.
    inters : int
        Number of iterations (one result row each).
    treshold : float
        Lower bound for ``predict_proba``.
    random_state : int or numpy.random.RandomState, optional
        Seed (or generator) for reproducible sampling. With the default
        ``None`` the global NumPy random state is used, as before.

    Returns
    -------
    pandas.DataFrame
        One row per iteration with columns ``n_credits`` (sum over the
        filtered rows), ``cum_sum`` and ``n_credit_prt`` (taken from the first
        filtered row). All three are None when no row passes the threshold.
    """
    X_main             = X.copy()
    X_main['status']   = y['status'].copy()

    # A single generator shared by all iterations: reproducible as a whole,
    # while every iteration still draws a different sample.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    mas_result = []
    for _ in range(inters):
        # Draw the sample (sample() already returns a new frame)
        sampled = X_main.sample(n = size, random_state = rng)
        y_sampled = sampled[['status']]
        X_sampled = sampled.drop(['status'],axis = 1)

        # Table produced by metrics.get_cum_sum, restricted to the threshold
        df_cumsum = metrics.get_cum_sum(X_sampled, y_sampled, model)
        df_cumsum = df_cumsum[df_cumsum['predict_proba'] >= treshold]

        # Pick the values; stay None when nothing passes the threshold
        dict_record = {'n_credits':None,
                       'cum_sum':None,
                       'n_credit_prt':None}
        if df_cumsum.shape[0] != 0:
            first_row = df_cumsum.iloc[0]
            dict_record = {'n_credits':df_cumsum['n_credits'].sum(),
                           'cum_sum':first_row['cum_sum'],
                           'n_credit_prt':first_row['n_credit_prt']}
        mas_result.append(dict_record)

    return pd.DataFrame(mas_result)

In [8]:
def model_predict(X,model,treshold = 0.5):
    """
    Return class predictions, optionally using a custom probability threshold.

    Parameters
    ----------
    X
        Data passed unchanged to the model.
    model
        Object exposing ``predict`` and, when ``treshold`` is not None,
        ``predict_proba``.
    treshold : float or None
        If None, ``model.predict(X)`` is returned as is. Otherwise a row is
        labelled 0 when the first column of ``model.predict_proba(X)`` is
        ``>= treshold`` and 1 otherwise.
        NOTE(review): this treats the first probability column as class 0;
        confirm against the model's class order.

    Returns
    -------
    list of int, or whatever ``model.predict`` returns when ``treshold`` is None.
    """
    if treshold is None:
        # Probabilities are not needed here, so predict_proba is not called
        # (saves work and supports models without predict_proba).
        return model.predict(X)

    # Probability of the first class for every row
    predict_proba_0 = np.asarray(model.predict_proba(X))[:, 0]
    return np.where(predict_proba_0 >= treshold, 0, 1).tolist()

In [9]:
def select_data_of_prediction(X,y,model,treshold = 0.5,target_predict = 1):
    """
    Keep only the rows whose predicted label equals ``target_predict``.

    Predictions are obtained from ``model_predict`` with the given
    ``treshold``; the same row mask is applied to both ``X`` and ``y``.

    Returns
    -------
    tuple
        ``(X, y)`` restricted to the matching rows.
    """
    # Predicted label for every row
    labels = model_predict(X = X,
                           model = model,
                           treshold = treshold)

    # Row mask: True where the prediction is the requested label
    keep_rows = []
    for label in labels:
        keep_rows.append(label == target_predict)

    X_selected = X[keep_rows]
    y_selected = y[keep_rows]
    return X_selected,y_selected

In [10]:
# def get_data(params = None,app_ids = None,dev = None):
    
#     if dev is None:
#         dev_table = None
#         dev_table_postfix = None
#         # Проверка на dev
#         if 'dev' in params:
#             dev_table = True
#             dev_table_postfix = params['dev']
#             if dev_table_postfix is None:
#                 dev_table = False
#         else:
#             dev_table = False
#             dev_table_postfix = None 
#     else:
#         dev_table = False
#         dev_table_postfix = None

#     if 
#     agreements = Agreements(delay                 = params['status'],
#                             research_start        = params['research_start'],
#                             research_end          = params['research_end'],
#                             num_online            = params['num_online'],
#                             num                   = params['num'],
#                             loan_id               = params['loan_id_status'],
#                             cnt_loans             = params['cnt_loans'],
#                             prolong               = params['prolong'],
#                             decision_process_type = params['decision_process_type'],
#                             add_features          = params['model_type'],
#                             model_type            = params['model_type'],
#                             dev_table             = dev_table,
#                             dev_table_postfix     = dev_table_postfix,
                            
#                             db_name='scoring3').get()
#     agreements = agreements.dropna(axis = 0,subset = ['nbki_cs__cnt_loans'])
    
#     get_status = Statuses(dataset=agreements,
#                           target=params['status']).get(prolong=params['prolong'])


#     f = Features(dataset=agreements,
#                  data_type=params['model_type'])
#     get_features = f.get()

#     df_type1 = f.get_features_list()

#     # Соединение предикторов и таргетов в единый датарфейм
#     df1 = pd.merge(get_status, get_features, left_index=True, right_index=True)
    
#     return df1

In [11]:
def get_data_app_ids(app_ids = None,status = 3,prolong = True,model_type = ['equifax_cs','nbki_cs'],dev = None):
    """
    Build the merged status/feature frame for specific applications.

    Agreements are requested by ``app_ids`` from the ``'scoring3'`` database
    (from the dev table when ``dev`` is given), rows with a missing
    ``nbki_cs__cnt_loans`` are dropped, and statuses and features are built
    from the remaining rows and merged on the index.

    Parameters
    ----------
    app_ids : optional
        Forwarded to ``Agreements``.
    status
        Target passed to ``Statuses``.
    prolong
        Forwarded to ``Statuses(...).get``.
    model_type
        Forwarded to ``Features`` as ``data_type``.
        NOTE: the default is a list object shared between calls; it is passed
        on unchanged, so it must not be mutated downstream.
    dev : optional
        Dev-table postfix; the dev table is used whenever this is not None.

    Returns
    -------
    tuple
        ``(df, df_type)``: the merged frame and the result of
        ``Features.get_features_list()``.
    """
    use_dev_table = dev is not None

    source = Agreements(app_ids = app_ids,
                        db_name = 'scoring3',
                        dev_table = use_dev_table,
                        dev_table_postfix = dev)
    agreements = source.get().dropna(axis = 0,subset = ['nbki_cs__cnt_loans'])

    # Targets first, then features (same call order as before)
    status_frame = Statuses(dataset=agreements,target=status).get(prolong=prolong)
    feature_builder = Features(dataset=agreements,data_type=model_type)
    feature_frame = feature_builder.get()
    df_type = feature_builder.get_features_list()

    # Join predictors and targets into a single dataframe
    df = pd.merge(status_frame, feature_frame, left_index=True, right_index=True)

    return df,df_type

### Загрузка моделей и данных

### 1 модель

In [34]:
# name_card = 'RF'
name_card_1 = 'LR'

# Model version identifier (suffix of the saved file name)
id_card_1   = '_v1647260486'

# Load the .sav file
info_model = load_card(name_card = name_card_1,id_card = id_card_1)
model = info_model['model']

# Load the data the model was trained on
df_full = load_data(name_card = name_card_1,id_card = id_card_1)

In [35]:
# Fetch the data for the given app_id values
mas_app_id = ['000031942']
df,df_type = get_data_app_ids(mas_app_id,
                               status = 3,
                               prolong = True,
                               model_type = ['equifax_cs','nbki_cs'],
                               dev = 'dev2')

192.168.14.169 scoring3
[INFO]: Ошибка получения контекста...закрытие соединения с БД
192.168.14.169 scoring3
[INFO]: Ошибка получения контекста...закрытие соединения с БД


In [37]:
# Prepare the features for feeding into the model
X,y = processing_data(df = df,
                               df_type = df_type,
                               info_model = info_model)

In [41]:
X

Unnamed: 0,equifax_cs__cnt_delay5_micr,equifax_cs__cnt_request_microcredits_quarter,equifax_cs__max_request_microcredits_day,equifax_cs__mean_request_microcredits_hour,equifax_cs__min_request_microcredits_week,equifax_cs__share_of_microloans
0,0,10,0.0,0.0,16000.0,0.998


In [31]:
# _coef = feature importance
# iv = absolute value of the feature importance
# _pred = source of the feature
info_model['features_importance']


Unnamed: 0,features,_coef,iv,_pred
0,equifax_cs__share_of_microloans,1.0,1.0,equifax_cs
1,equifax_cs__cnt_delay5_micr,0.194135,0.194135,equifax_cs
2,equifax_cs__cnt_request_microcredits_quarter,-0.134268,0.134268,equifax_cs
3,equifax_cs__mean_request_microcredits_hour,-0.000292,0.000292,equifax_cs
4,equifax_cs__max_request_microcredits_day,-0.000138,0.000138,equifax_cs
5,equifax_cs__min_request_microcredits_week,-9.5e-05,9.5e-05,equifax_cs
