In [1]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import numpy as np

from category_encoders.target_encoder import TargetEncoder
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgbS

In [2]:
train = pd.read_csv('train_base.csv', sep=';')
test = pd.read_csv('test_base.csv', sep=';') 

In [4]:
bureau = pd.read_csv('bureau_balance.csv')

In [5]:
bureau

Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE,STATUS
0,5715448,0,C
1,5715448,-1,C
2,5715448,-2,C
3,5715448,-3,C
4,5715448,-4,C
...,...,...,...
27299920,5041336,-47,X
27299921,5041336,-48,X
27299922,5041336,-49,X
27299923,5041336,-50,X


In [6]:
agr_train = bureau.loc[bureau.index.isin(list(train.SK_ID_CURR.unique()))]

In [7]:
agr_test = bureau.loc[bureau.index.isin(list(test.SK_ID_CURR.unique()))]

In [None]:
agr_train

In [10]:
without_credit = list(set(train.SK_ID_CURR) - set(agr_train.index))
empty_df = pd.DataFrame(np.nan, index=without_credit, columns=list(agr_train))
empty_df.fillna(9999, inplace=True)
empty_df['dummy_credit_history'] = 0
agr_train['dummy_credit_history'] = 1
features = pd.concat([empty_df, agr_train])
features['SK_ID_CURR'] = features.index
train_df = train.merge(features, on='SK_ID_CURR', how='left')

In [11]:
without_credit = list(set(test.SK_ID_CURR) - set(agr_test.index))
empty_df = pd.DataFrame(np.nan, index=without_credit, columns=list(agr_test))
empty_df.fillna(9999, inplace=True)
empty_df['dummy_credit_history'] = 0
agr_train['dummy_credit_history'] = 1
features = pd.concat([empty_df, agr_test])
features['SK_ID_CURR'] = features.index
test_df = test.merge(features, on='SK_ID_CURR', how='left')

In [12]:
import analytics as als

In [14]:
X = train_df.drop(columns=['SK_ID_CURR', 'Target'])
y = train_df['Target']

In [15]:
feature_str, feature_int = als.get_lists_type_feature(train_df, list(train_df))

In [16]:
feature_str

['STATUS']

In [19]:
kf = StratifiedKFold(n_splits=5,  shuffle=True)

In [17]:
TE_encoder = TargetEncoder(cols=feature_str)

In [20]:
for train_indices, test_indices in kf.split(X, y):
    X.iloc[train_indices] = TE_encoder.fit_transform(X.iloc[train_indices], y.iloc[train_indices])
    X.iloc[test_indices] = TE_encoder.transform(X.iloc[test_indices])

In [21]:
for feature in feature_str:
    X[feature] = X[feature].astype('float64')

In [22]:
for train_indices, test_indices in kf.split(X, y):
    X_train, X_val = X.iloc[train_indices], X.iloc[test_indices]
    y_train, y_val = y.iloc[train_indices], y.iloc[test_indices]
    lgbmc = lgb.LGBMClassifier(mobjective='binary',
                               max_depth=47, learning_rate=0.05,
                               n_estimators=100, class_weight='balanced',
                               subsample=0.8,
                               colsample_byt
                               ree=0.8,
                               random_state=42,
                               importance_type='gain')
    model = lgbmc.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    score_train = roc_auc_score(y_train, y_pred_train)
    print(f'train_score {score_train}')   
    y_pred_val = model.predict(X_val)
    score_val = roc_auc_score(y_val, y_pred_val)
    print(f'test_score {score_val}')

train_score 0.7260375628875688
test_score 0.6844712870951062
train_score 0.7240725286341116
test_score 0.6889541602382048
train_score 0.7249620473926148
test_score 0.6898872691375022
train_score 0.7237493632506684
test_score 0.6879816330856819
train_score 0.7255489227997188
test_score 0.6821590083178551
