In [34]:
import pandas as pd
import numpy as np

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from  lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import precision_recall_curve, auc
from lightgbm import LGBMClassifier
import yaml

# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)

In [35]:
# Load the notebook configuration; later cells read config['scaler_cols'] and config['final_cols'].
# The encoding is pinned so parsing does not depend on the machine's locale default.
with open("config.yaml", "r", encoding="utf-8") as config_file:
    config = yaml.safe_load(config_file)

In [48]:
# Read the two shards of precomputed test features and stack them row-wise.
# ignore_index avoids carrying duplicate row labels from the two shards.
df_test1 = pd.read_parquet('./data_features/new_test_with_features_0.parquet')
df_test2 = pd.read_parquet('./data_features/new_test_with_features_1.parquet')
df_test = pd.concat([df_test1, df_test2], ignore_index=True)

application_data = pd.read_parquet('./data/test_app_data.parquet')
target_data = pd.read_parquet('./data/test_target_data.parquet')

# Lower-case all column names so the join key is spelled identically in every frame.
for frame in (df_test, application_data, target_data):
    frame.columns = frame.columns.str.lower()

# Outer join: keep applications that appear in either the feature shards or the application data.
df_test = pd.merge(df_test, application_data, how='outer', on=['applicationid'])

# Keep only rows that have a target. An inner join yields the rows the previous
# "outer merge + indicator + filter on 'both'" produced, without leaving a `_merge`
# helper column in the frame.
# NOTE(review): no `on=` is given, so pandas joins on every column name the two frames
# share — presumably only `applicationid`; confirm and make the key explicit.
df_test = pd.merge(df_test, target_data[['applicationid', 'target']], how='inner')

In [49]:
        
def _parse_numeric_text(series, decimal_comma=False):
    """Convert a text column holding formatted numbers to float64.

    Spaces are always removed. When `decimal_comma` is true, commas are
    additionally turned into dots and every hyphen is replaced with "0".
    All replacements are literal (regex=False).
    """
    cleaned = series.str.replace(' ', '', regex=False)
    if decimal_comma:
        # NOTE(review): this replaces *every* hyphen, so a negative value such as "-5"
        # becomes "05" -> 5.0. Presumably "-" marks a missing value in the source
        # data; confirm this matches the preprocessing used at training time.
        cleaned = cleaned.str.replace(',', '.', regex=False).str.replace('-', '0', regex=False)
    return cleaned.astype('float64')


# Columns that only need spaces stripped before the float cast.
SPACE_SEPARATED_COLS = ['TOTALAMOUNT', 'SUM_CREDIT_KZT']
# Columns that also use a decimal comma and hyphen placeholders.
DECIMAL_COMMA_COLS = [
    'DM5DPD1GCVPSUM', 'DM5EXPSUM', 'DM5INCSUM',
    'DM6SCOREN6PD', 'DM6SCOREN6', 'FINALKDN',
]

# Trim stray whitespace from column names, then address columns by upper-case name.
df_test = df_test.rename(columns=lambda x: x.strip())
df_test.columns = df_test.columns.str.upper()

for col in SPACE_SEPARATED_COLS:
    df_test[col] = _parse_numeric_text(df_test[col])
for col in DECIMAL_COMMA_COLS:
    df_test[col] = _parse_numeric_text(df_test[col], decimal_comma=True)

# Back to lower-case names, which the remaining cells use.
df_test.columns = df_test.columns.str.lower()

In [50]:
# Identifier / metadata column names, plus the target and three categorical event/device columns.
# NOTE(review): not referenced in the visible cells — presumably used to exclude
# non-feature columns elsewhere; confirm.
id_cols = [
    'applicationid',
    'create_date',
    'create_datetime',
    'data_issue',
    'vintage',
    'product_group',
    'regregion',
    'company_name',
    'spf',
    'mng_name_login_init',
    'mng_name_init',
    'regtown',
    'birthcountry',
    'regcounty',
    'target',
    'last_event_type',
    'second_last_event_type',
    'most_common_device_type',
]

In [51]:
# Load the per-column value mappings applied to df_test in the next cell.
# The encoding is pinned so non-ASCII keys decode the same way on every machine.
with open("feature_mapping.yaml", "r", encoding="utf-8") as mapping_file:
    feature_mapping = yaml.safe_load(mapping_file)

In [52]:
# Re-encode each mapped column through its lookup table.
# Series.map turns values that are absent from the table into NaN.
# NOTE(review): the columns are overwritten in place, so re-running this cell maps
# already-mapped values again — run it once per fresh load.
for column, value_map in feature_mapping.items():
    df_test[column] = df_test[column].map(value_map)

In [53]:
# Attach the audio-derived columns by application id (left join keeps every df_test row),
# then keep only the first row per application.
audio_pd = pd.read_parquet('data_features/audio_pd.parquet')
audio_pd.columns = audio_pd.columns.str.lower()
df_test = (
    df_test
    .merge(audio_pd, how='left', on=['applicationid'])
    .drop_duplicates(subset=['applicationid'], keep='first')
)

In [54]:
import pickle

In [55]:
# Load the fitted scaler. The context manager closes the file handle, which the
# bare open() call left dangling.
# NOTE: unpickling can execute arbitrary code — only load artifacts from a trusted source.
with open('scaler.pkl', 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)

In [56]:
# Scale the configured columns and wrap the result back into a labelled frame.
# df_test's index has gaps after drop_duplicates, so it is passed through explicitly;
# otherwise X would get a fresh 0..n-1 index and any later index-aligned operation
# between X and df_test would silently mismatch rows.
scaler_cols = config['scaler_cols']
X = pd.DataFrame(
    scaler.transform(df_test[scaler_cols]),
    columns=scaler_cols,
    index=df_test.index,
)

In [57]:
# Load the trained model. The context manager closes the file handle, which the
# bare open() call left dangling.
# NOTE: unpickling can execute arbitrary code — only load artifacts from a trusted source.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [58]:
# Score the model's input columns and store column 1 of predict_proba as FINAL_PD.
# The result is assigned as a plain array, i.e. row by row in df_test's current order.
class_probabilities = model.predict_proba(X[config['final_cols']])
df_test['FINAL_PD'] = class_probabilities[:, 1]

In [59]:
def pr_auc(y, pred):
    """Return the area under the precision-recall curve.

    Args:
        y: true labels, passed straight to `precision_recall_curve`.
        pred: predicted scores for the same rows.

    Returns:
        The value of `auc(recall, precision)` over the curve points; the
        thresholds returned by `precision_recall_curve` are not used.
    """
    curve = precision_recall_curve(y, pred)
    precision_values = curve[0]
    recall_values = curve[1]
    return auc(recall_values, precision_values)

In [60]:
# PR-AUC of the model scores against the test targets (displayed as the cell output).
pr_auc(y=df_test['target'], pred=df_test['FINAL_PD'])

0.2901493363855641

In [61]:
# Show each application id next to its predicted score.
df_test.loc[:, ['applicationid', 'FINAL_PD']]

Unnamed: 0,applicationid,FINAL_PD
0,Д\286\011895433,0.027777
1,Д\286\011895453,0.007144
2,Д\286\011895475,0.006555
3,Д\286\011895518,0.003381
4,Д\286\011895559,0.007509
...,...,...
5801,Д\286\012041434,0.010941
5802,Д\286\012041437,0.082940
5803,Д\286\012041504,0.002474
5804,Д\286\012041576,0.002328
