In [1]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import numpy as np

from category_encoders.target_encoder import TargetEncoder
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb

In [2]:
train = pd.read_csv('train_base.csv', sep=';')
test = pd.read_csv('test_base.csv', sep=';')

In [3]:
prev = pd.read_csv('previous_application.csv')

In [4]:
prev['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace= True)
prev['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace= True)
prev['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace= True)
prev['DAYS_LAST_DUE'].replace(365243, np.nan, inplace= True)
prev['DAYS_TERMINATION'].replace(365243, np.nan, inplace= True)

In [5]:
prev['APP_CREDIT_PERC'] = prev['AMT_APPLICATION'] / prev['AMT_CREDIT']

In [7]:
# Аргрегирование:
features_min = ['AMT_ANNUITY', 'AMT_APPLICATION', 'AMT_CREDIT', 'APP_CREDIT_PERC', 'AMT_DOWN_PAYMENT', 
               'AMT_GOODS_PRICE', 'HOUR_APPR_PROCESS_START', 'RATE_DOWN_PAYMENT', 'DAYS_DECISION']
features_max = ['AMT_ANNUITY', 'AMT_APPLICATION', 'AMT_CREDIT', 'APP_CREDIT_PERC', 'AMT_DOWN_PAYMENT', 
               'AMT_GOODS_PRICE', 'HOUR_APPR_PROCESS_START', 'RATE_DOWN_PAYMENT', 'DAYS_DECISION']
features_mean = ['AMT_ANNUITY', 'AMT_APPLICATION', 'CNT_PAYMENT',  'AMT_CREDIT', 'APP_CREDIT_PERC', 'AMT_DOWN_PAYMENT', 
               'AMT_GOODS_PRICE', 'HOUR_APPR_PROCESS_START', 'RATE_DOWN_PAYMENT', 'DAYS_DECISION']

In [25]:
import aggregation as agt

In [13]:
agr_min = agt.aggregate_min_value(prev, features_min, 'SK_ID_PREV')
agr_max = agt.aggregate_min_value(prev, features_max, 'SK_ID_PREV')
agr_mean = agt.aggregate_min_value(prev, features_mean, 'SK_ID_PREV')

In [14]:
agr_min.to_csv('agr_min_prev', sep=';')
agr_max.to_csv('agr_max_prev', sep=';')
agr_mean.to_csv('agr_mean_prev', sep=';')

In [6]:
agr_min = pd.read_csv('agr_min_prev', sep=';')
agr_max = pd.read_csv('agr_max_prev', sep=';')
agr_mean = pd.read_csv('agr_mean_prev', sep=';')

In [7]:
def create_data_aggregation(data, data_agr, feature):
    without_credit = list(set(data[feature]) - set(data_agr.index))
    empty_df = pd.DataFrame(np.nan, index=without_credit, columns=list(data_agr))
    empty_df.fillna(9999, inplace=True)
    empty_df['dummy_credit_history'] = 0
    data_agr['dummy_credit_history'] = 1
    features = pd.concat([empty_df, data_agr])
    features[feature] = features.index
    
    return features

In [8]:
with_credit = prev.loc[prev.SK_ID_PREV.isin(list(train.SK_ID_CURR.unique()))]

In [9]:
min_df = create_data_aggregation(with_credit, agr_min, 'SK_ID_CURR')

In [10]:
max_df = create_data_aggregation(with_credit, agr_max, 'SK_ID_CURR')

In [11]:
mean_df = create_data_aggregation(with_credit, agr_mean, 'SK_ID_CURR')

In [12]:
df1 = train.merge(min_df, on='SK_ID_CURR', how='left')

In [13]:
df3 = df1.merge(max_df, on='SK_ID_CURR', how='left')

In [14]:
df4 = df3.merge(mean_df, on='SK_ID_CURR', how='left')

In [15]:
X = df4.drop(columns=['SK_ID_CURR', 'Target'])
y = df4['Target']

In [16]:
kf = StratifiedKFold(n_splits=5,  shuffle=True)

In [18]:
for train_indices, test_indices in kf.split(X, y):
    X_train, X_val = X.iloc[train_indices], X.iloc[test_indices]
    y_train, y_val = y.iloc[train_indices], y.iloc[test_indices]
    lgbmc = lgb.LGBMClassifier(mobjective='binary',
                               max_depth=47, learning_rate=0.05,
                               n_estimators=500, class_weight='balanced',
                               subsample=0.8,
                               colsample_bytree=0.8,
                               random_state=42,
                               importance_type='gain',
                               reg_alpha=100,
                               reg_lambda=80)
    model = lgbmc.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    score_train = roc_auc_score(y_train, y_pred_train)
    print(f'train_score {score_train}')   
    y_pred_val = model.predict(X_val)
    score_val = roc_auc_score(y_val, y_pred_val)
    print(f'test_score {score_val}')

train_score 0.7548411587892369
test_score 0.6909226196991024
train_score 0.7548998919722248
test_score 0.6882762980304117
train_score 0.7576880058006296
test_score 0.6874277879057282
train_score 0.7572579727307217
test_score 0.6819277184446418
train_score 0.7560137043334263
test_score 0.6902022934668497


In [None]:
### 

In [21]:
cash = pd.read_csv('POS_CASH_balance.csv')

In [23]:
with_credit = cash.loc[cash.SK_ID_PREV.isin(list(train.SK_ID_CURR.unique()))]

In [52]:
agr_max.to_csv('agr_max_cash', sep=';')
agr_mean.to_csv('agr_mean_cash', sep=';')

In [42]:
features_max = ['MONTHS_BALANCE', 'SK_DPD', 'CNT_INSTALMENT', 'SK_DPD_DEF', 'CNT_INSTALMENT_FUTURE']
features_mean = ['MONTHS_BALANCE', 'SK_DPD', 'CNT_INSTALMENT', 'SK_DPD_DEF']

In [43]:
agr_max = agt.aggregate_min_value(cash, features_max, 'SK_ID_PREV')
agr_mean = agt.aggregate_min_value(cash, features_mean, 'SK_ID_PREV')

In [44]:
max_df = create_data_aggregation(with_credit, agr_max, 'SK_ID_PREV')

In [45]:
mean_df = create_data_aggregation(with_credit, agr_mean, 'SK_ID_PREV')

In [50]:
df1 = df4.merge(mean_df, on='SK_ID_PREV', how='left')

In [51]:
df2 = df1.merge(max_df, on='SK_ID_PREV', how='left')

In [53]:
X = df2.drop(columns=['SK_ID_CURR', 'Target'])
y = df2['Target']

In [55]:
for train_indices, test_indices in kf.split(X, y):
    X_train, X_val = X.iloc[train_indices], X.iloc[test_indices]
    y_train, y_val = y.iloc[train_indices], y.iloc[test_indices]
    lgbmc = lgb.LGBMClassifier(mobjective='binary',
                               max_depth=50, learning_rate=0.05,
                               n_estimators=300, class_weight='balanced',
                               subsample=0.8,
                               colsample_bytree=0.8,
                               random_state=42,
                               importance_type='gain',
                               reg_alpha=100,
                               reg_lambda=80)
    model = lgbmc.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    score_train = roc_auc_score(y_train, y_pred_train)
    print(f'train_score {score_train}')   
    y_pred_val = model.predict(X_val)
    score_val = roc_auc_score(y_val, y_pred_val)
    print(f'test_score {score_val}')

train_score 0.7368276316274155
test_score 0.6997860867051391
train_score 0.7382359379934623
test_score 0.6926963790607412
train_score 0.7391677515259486
test_score 0.6920094543071849
train_score 0.7380471431517751
test_score 0.6830863128886376
train_score 0.7386283006394263
test_score 0.687491545869337


In [None]:
# Все равно модель неустойчива.