In [217]:
import pandas as pd
import numpy as np

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from  lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import precision_recall_curve, auc
from lightgbm import LGBMClassifier


pd.set_option('display.max_columns', None)

In [218]:
def pr_auc(y, pred):
    precision, recall, _ = precision_recall_curve(y, pred)
    return auc(recall, precision)

In [219]:
df1 = pd.read_parquet('./data_features/new_train_data_with_features_amplituda_0.parquet')
df2 = pd.read_parquet('./data_features/new_train_data_with_features_amplituda_1.parquet')
df3 = pd.read_parquet('./data_features/new_train_data_with_features_amplituda_2.parquet')

df = pd.concat([df1, df2, df3])

application_data = pd.read_parquet('./data/train_app_data.parquet')

df.columns = df.columns.str.lower()
application_data.columns = application_data.columns.str.lower()

df = pd.merge(df, application_data, how='outer', on=['applicationid'])

target_data = pd.read_parquet('./data/train_target_data.parquet')
target_data.columns = target_data.columns.str.lower()


df = pd.merge(df, target_data[['applicationid', 'target']], how='outer', indicator=True)
df = df.query("_merge == 'both'")

In [220]:
for col in df.columns:
    if df[col].isna().sum() > df.shape[0] * 0.7:
        df.drop([col], axis=1, inplace=True)

df = df.rename(columns=lambda x: x.strip())
df.columns = df.columns.str.upper()
df['TOTALAMOUNT'] = df['TOTALAMOUNT'].str.replace(' ', '').astype('float64')
df['SUM_CREDIT_KZT'] = df['SUM_CREDIT_KZT'].str.replace(' ', '').astype('float64')
df['DM5DPD1GCVPSUM'] = (df['DM5DPD1GCVPSUM'].str.replace(' ', '').str.replace(',', '.').str.replace('-', '0')).astype('float64')
df['DM5EXPSUM'] = (df['DM5EXPSUM'].str.replace(' ', '').str.replace(',', '.').str.replace('-', '0')).astype('float64')
df['DM5INCSUM'] = (df['DM5INCSUM'].str.replace(' ', '').str.replace(',', '.').str.replace('-', '0')).astype('float64')
df['DM6SCOREN6PD'] = (df['DM6SCOREN6PD'].str.replace(' ', '').str.replace(',', '.').str.replace('-', '0')).astype('float64')
df['DM6SCOREN6'] = (df['DM6SCOREN6'].str.replace(' ', '').str.replace(',', '.').str.replace('-', '0')).astype('float64')
df['FINALKDN'] = (df['FINALKDN'].str.replace(' ', '').str.replace(',', '.').str.replace('-', '0')).astype('float64')
df.columns = df.columns.str.lower()

id_cols = ['applicationid', 'create_date', 'create_datetime', 'data_issue',\
            'vintage', 'product_group', 'regregion', 'company_name', 'spf', \
                'mng_name_login_init', 'mng_name_init', 'regtown', 'birthcountry', \
                    '_merge', 'regcounty', 'target']

id_cols += ['last_event_type', 'second_last_event_type', 'most_common_device_type']

In [221]:
feature_mapping = {
    'gender': {
        'Женский': 0,
        'Мужской': 1},

    'maritalstatus': {
        'Женат/Замужем': 0,
        'Холост/Не замужем': 1,
        'Разведен/Разведена': 2},

    'purpose_loan': {
        'На потребительские цели': 0,
        'На рефинансирование займа и потребительские цели': 1,
        'На рефинансирование займа': 2},

    'opv_reason': {
        'Онлайн СБОЛ': 0,
        'NEW САЙТ': 1},

    'kanal_prodazh': {
        'B-Bank': 0, 
        'QR': 1, 
        'WEB': 2},
        
    'branch': {
        'Алматы': 0,
        'Астана': 1,
        'Шымкент': 2,
        'Караганда': 3,
        'Усть-Каменогорск': 4,
        'Тараз': 5,
        'Актобе': 6,
        'Костанай': 7,
        'Кызылорда': 8,
        'Павлодар': 9,
        'Актау': 10,
        'Уральск': 11,
        'Атырау': 12,
        'Туркестан': 13,
        'Алматинская область': 14,
        'Кокшетау': 15,
        'Петропавловск': 16,
        'Семей': 17},
    'application_isa0auto': {
        'Улица': 0, 
        'Зарплатники': 1},
    'bki': {
        'Хорошая': 0, 
        'Средняя': 1, 
        'Плохая': 2, 
        'Отсутствует': 3},
    'vki': {
        'Отсутствует': 0,
        'Хорошая': 1,
        'Средняя': 2, 
        'Плохая': 3},
        
    'top_event_type': {
        'main_page': 0,
        'enter_pin': 1,
        'session_end': 2,
        'initial_transfer_internal': 3}

    }

for key in feature_mapping:
    df[key] = df[key].map(feature_mapping[key])

In [222]:
audio_pd = pd.read_parquet('data_features/audio_pd.parquet')
audio_pd.columns = audio_pd.columns.str.lower()

In [223]:
df = pd.merge(df, audio_pd, how='left', on=['applicationid'])

In [224]:
df = df.drop_duplicates(subset=['applicationid'], keep='first')

In [225]:
X = df.drop(id_cols, axis=1)
y = df['target']

In [226]:
X.fillna(0, inplace=True)

In [227]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# X_train.fillna(0, inplace=True)
# X_test.fillna(0, inplace=True)

In [228]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
scaler = StandardScaler()

In [229]:
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

In [230]:
model = LGBMClassifier(random_state=42, min_data_in_leaf=700, n_estimators=12)
model.fit(X_train, y_train)



0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.1
,n_estimators,12
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [231]:
pred = model.predict_proba(X_train)[:, 1]
print(f"PRC_AUC: {pr_auc(y_train, pred)}")
print(f"GINI: {roc_auc_score(y_train, pred) * 2 - 1}")


PRC_AUC: 0.5438657754055124
GINI: 0.9030849174234072


In [232]:
pred = model.predict_proba(X_test)[:, 1]
print(f"PRC_AUC: {pr_auc(y_test, pred)}")
print(f"GINI: {roc_auc_score(y_test, pred) * 2 - 1}")


PRC_AUC: 0.5174978079569545
GINI: 0.925735170657993


# test model

In [233]:
df_test1 = pd.read_parquet('./data_features/new_test_with_features_0.parquet')
df_test2 = pd.read_parquet('./data_features/new_test_with_features_1.parquet')

df_test = pd.concat([df_test1, df_test2])

application_data = pd.read_parquet('./data/test_app_data.parquet')

df_test.columns = df_test.columns.str.lower()
application_data.columns = application_data.columns.str.lower()

df_test = pd.merge(df_test, application_data, how='outer', on=['applicationid'])

target_data = pd.read_parquet('./data/test_target_data.parquet')
target_data.columns = target_data.columns.str.lower()


df_test = pd.merge(df_test, target_data[['applicationid', 'target']], how='outer', indicator=True)
df_test = df_test.query("_merge == 'both'")

In [234]:
drop_cols = []
for col in df_test.columns:
    if df_test[col].isna().sum() > df_test.shape[0] * 0.7:
        df_test.drop([col], axis=1, inplace=True)
        drop_cols.append(col)
        
df_test = df_test.rename(columns=lambda x: x.strip())
df_test.columns = df_test.columns.str.upper()
df_test['TOTALAMOUNT'] = df_test['TOTALAMOUNT'].str.replace(' ', '').astype('float64')
df_test['SUM_CREDIT_KZT'] = df_test['SUM_CREDIT_KZT'].str.replace(' ', '').astype('float64')
df_test['DM5DPD1GCVPSUM'] = (df_test['DM5DPD1GCVPSUM'].str.replace(' ', '').str.replace(',', '.').str.replace('-', '0')).astype('float64')
df_test['DM5EXPSUM'] = (df_test['DM5EXPSUM'].str.replace(' ', '').str.replace(',', '.').str.replace('-', '0')).astype('float64')
df_test['DM5INCSUM'] = (df_test['DM5INCSUM'].str.replace(' ', '').str.replace(',', '.').str.replace('-', '0')).astype('float64')
df_test['DM6SCOREN6PD'] = (df_test['DM6SCOREN6PD'].str.replace(' ', '').str.replace(',', '.').str.replace('-', '0')).astype('float64')
df_test['DM6SCOREN6'] = (df_test['DM6SCOREN6'].str.replace(' ', '').str.replace(',', '.').str.replace('-', '0')).astype('float64')
df_test['FINALKDN'] = (df_test['FINALKDN'].str.replace(' ', '').str.replace(',', '.').str.replace('-', '0')).astype('float64')
df_test.columns = df_test.columns.str.lower()

In [235]:
import yaml

In [236]:
df_test = pd.merge(df_test, audio_pd, how='left', on=['applicationid'])

In [237]:
for key in feature_mapping:
    df_test[key] = df_test[key].map(feature_mapping[key])

df_test[X_test.columns] = scaler.transform(df_test[X_test.columns])

In [249]:
with open("feature_mapping.yaml", "w", encoding="utf-8") as file:
    yaml.dump(feature_mapping, file, allow_unicode=True, sort_keys=False)

In [250]:
pickle.dump(scaler, open('scaler.pkl', 'wb'))

In [238]:
pred = model.predict_proba(df_test[X_train.columns])[:, 1]
print(f"PRC_AUC: {pr_auc(df_test['target'], pred)}")
print(f"GINI: {roc_auc_score(df_test['target'], pred) * 2 - 1}")


PRC_AUC: 0.22448406388360292
GINI: 0.7675191038555054


In [245]:
final_cols

array(['cli_age', 'dm5expsum', 'dm5incsum', 'finalkdn', 'session_len_std',
       'time_to_first_event', 'kanal_prodazh', 'median_time_diff_sec',
       'dm5dpd1gcvpsum', 'avg_events_per_minute', 'creditterm_rbl0',
       'branch', 'pd', 'time_span_sec', 'avg_session_len',
       'p95_time_diff_sec', 'weekend_events', 'dm6scoren6pd',
       'max_session_len', 'event_type_count_cl_back', 'avg_time_diff_sec',
       'std_time_diff_sec', 'event_type_count_transfer_initial',
       'sum_credit_kzt', 'event_type_count_new_product_preopen',
       'event_type_count_transfer_new_view_confirm_screen',
       'event_type_count_frequent_operations_click',
       'event_type_count_tab_history_transfers', 'time_to_last_event',
       'max_time_diff_sec',
       'event_type_count_cl_bank_instructions_cellinputactivated',
       'unique_ips', 'avg_lng_change', 'event_type_count_session_start',
       'event_type_count_transfer_view_history_screen',
       'event_type_count_ecp_initial',
       'even

In [239]:
t = {}
t['values'] = model.feature_importances_
t['cols'] = model.feature_name_

In [240]:
final_cols = pd.DataFrame(t).sort_values(by=['values'], ascending=False).iloc[:50]['cols'].values

In [241]:
model = LGBMClassifier(random_state=42, min_data_in_leaf=700, n_estimators=25)
model.fit(X_train[final_cols], y_train)



0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.1
,n_estimators,25
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [242]:
pred = model.predict_proba(X_train[final_cols])[:, 1]
print(f"PRC_AUC: {pr_auc(y_train, pred)}")
print(f"GINI: {roc_auc_score(y_train, pred) * 2 - 1}")


PRC_AUC: 0.6372274358658169
GINI: 0.9431864495450801


In [243]:
pred = model.predict_proba(X_test[final_cols])[:, 1]
print(f"PRC_AUC: {pr_auc(y_test, pred)}")
print(f"GINI: {roc_auc_score(y_test, pred) * 2 - 1}")


PRC_AUC: 0.5788248958922422
GINI: 0.9428653986575399


In [244]:
pred = model.predict_proba(df_test[final_cols])[:, 1]
print(f"PRC_AUC: {pr_auc(df_test['target'], pred)}")
print(f"GINI: {roc_auc_score(df_test['target'], pred) * 2 - 1}")


PRC_AUC: 0.3264613102802751
GINI: 0.7844086488364015


In [128]:
df[list(final_cols) + ['target']].groupby(['target']).mean()

Unnamed: 0_level_0,time_to_first_event,finalkdn,dm5incsum,cli_age,median_time_diff_sec,time_span_sec,kanal_prodazh,session_len_std,dm5dpd1gcvpsum,dm5expsum,creditterm_rbl0,avg_session_len,avg_events_per_minute,sum_credit_kzt,event_type_count_open_notification,branch,avg_time_diff_sec,event_type_count_cl_back,p95_time_diff_sec,event_type_count_frequent_operations_click,totalamount,time_to_last_event,max_time_diff_sec,weekend_events,event_type_count_cl_bank_instructions_cellinputactivated,events_day,event_type_count_transfer_select_new_type,event_type_count_transfer_new_view_confirm_screen,event_type_count_mainscreen_open_new_product_btn,event_type_count_new_product_preopen,avg_lng_change,unique_versions,events_night,event_type_count_cash_loan_uploading,event_type_count_transfer_initial,event_type_count_enter_pin,top_event_type,most_active_hour,event_type_count_payments_open_tab,event_type_count_cl_duration_cellinputactivated,max_session_len,event_type_count_transfer_new_success,event_type_count_open_card,event_type_count_login,event_type_entropy,event_type_count_onboarding_process_success,dm6scoren6pd,events_before_app,events_after_app,std_time_diff_sec
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1
0.0,-1208298.0,0.349385,1537214.0,38.737061,5.155959,6636254.0,0.395988,15.460489,188082.844957,1533309.0,44.312812,12.325845,0.014869,1676918.0,10.595536,4.147401,22689.5141,5.215601,45966.294303,8.094331,2115536.0,5427956.0,1863469.0,85.068713,3.493923,315.865924,19.503043,7.44095,4.446493,3.751868,0.007634,4.883609,113.726784,7.026581,8.229506,42.904178,5776.660711,9.026021,3.886851,3.99845,69.604472,7.780018,10.904521,39.447646,4.994988,1.901363,0.068058,45.167653,397.611405,152847.670226
1.0,-144546.4,0.356515,784853.9,47.394118,7.476215,3461198.0,0.064706,27.389675,185417.127059,780596.7,49.788235,18.709578,0.160398,1739459.0,5.732323,5.311765,14783.275002,2.408602,22798.348022,3.880795,1991382.0,3316651.0,1489496.0,16.680147,1.634615,215.488971,11.59127,9.018519,3.547325,2.942222,0.02132,2.875,47.477941,3.496324,7.731959,20.916667,7257.216216,9.654412,2.537931,2.362445,97.117647,9.231818,8.412245,18.352941,5.138102,2.033835,0.051856,4.165441,262.863971,116954.783312


In [130]:
'pd' in final_cols

False

In [206]:
t = {}
t['values'] = model.feature_importances_
t['cols'] = model.feature_name_

In [207]:
pd.DataFrame(t).sort_values(by=['values'], ascending=False)[:20]

Unnamed: 0,values,cols
0,38,cli_age
7,24,median_time_diff_sec
2,23,dm5incsum
1,22,dm5expsum
10,21,creditterm_rbl0
3,20,finalkdn
6,19,kanal_prodazh
5,19,time_to_first_event
11,19,branch
8,18,dm5dpd1gcvpsum


In [208]:
import pickle

In [209]:
pickle.dump(model, open('model.pkl', 'wb'))